diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/.gitignore b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/.gitignore deleted file mode 100644 index a7c4577149ca74342fc598bc9cda086f61a34a5b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/.gitignore +++ /dev/null @@ -1,132 +0,0 @@ -# JetBrains PyCharm IDE -.idea/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# macOS dir files -.DS_Store - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg - -# Checkpoints -checkpoints - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# dotenv -.env - -# virtualenv -.venv -venv/ -ENV/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ - -# Generated files -/fairseq/temporal_convolution_tbc -/fairseq/modules/*_layer/*_forward.cu -/fairseq/modules/*_layer/*_backward.cu - -# data -data-bin/ - -# reranking -/examples/reranking/rerank_data - -# Cython-generated C++ source files -/fairseq/data/data_utils_fast.cpp -/fairseq/data/token_block_utils_fast.cpp - -# VSCODE -.vscode/ftp-sync.json -.vscode/settings.json - -# Experimental Folder -experimental/* diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/.gitmodules b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/.gitmodules deleted file mode 100644 index 07a55d45d4f0bed755dbfc1f440f214ed43d206a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule "fairseq/model_parallel/megatron"] - path = fairseq/model_parallel/megatron - url = https://github.com/ngoyal2707/Megatron-LM - branch = fairseq diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/CODE_OF_CONDUCT.md b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/CODE_OF_CONDUCT.md deleted file mode 100644 index a0cbeaab7650bf08267fbdbc9bb54e845c88f392..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,77 +0,0 @@ -# Code of Conduct - -## Our Pledge - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to make participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of 
experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. - -## Our Standards - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or - advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic - address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a - professional setting - -## Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. - -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -## Scope - -This Code of Conduct applies within all project spaces, and it also applies when -an individual is representing the project or its community in public spaces. -Examples of representing a project or community include using an official -project e-mail address, posting via an official social media account, or acting -as an appointed representative at an online or offline event. Representation of -a project may be further defined and clarified by project maintainers. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at . All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. 
- -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq - diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/CONTRIBUTING.md b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/CONTRIBUTING.md deleted file mode 100644 index 4d7ca6a98ebdabd7a6770ea616ee355ffb4a41e1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/CONTRIBUTING.md +++ /dev/null @@ -1,28 +0,0 @@ -# Contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq) -We want to make contributing to this project as easy and transparent as -possible. - -## Pull Requests -We actively welcome your pull requests. - -1. Fork the repo and create your branch from `master`. -2. If you've added code that should be tested, add tests. -3. If you've changed APIs, update the documentation. -4. Ensure the test suite passes. -5. Make sure your code lints. -6. If you haven't already, complete the Contributor License Agreement ("CLA"). - -## Contributor License Agreement ("CLA") -In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Facebook's open source projects. - -Complete your CLA here: - -## Issues -We use GitHub issues to track public bugs. Please ensure your description is -clear and has sufficient instructions to be able to reproduce the issue. - -## License -By contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq), -you agree that your contributions will be licensed under the LICENSE file in -the root directory of this source tree. diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/LICENSE b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/LICENSE deleted file mode 100644 index b96dcb0480a0b0be0727976e5202a1e7b23edc3f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) Facebook, Inc. and its affiliates. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/README.md b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/README.md deleted file mode 100644 index 76f04b28458ed9e29ce749f13c29ff7170572590..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/README.md +++ /dev/null @@ -1,132 +0,0 @@ -# MBART: Multilingual Denoising Pre-training for Neural Machine Translation - -# 安装fairseq - -```bash -在工程根目录下执行pip install -e ./ -``` -# 下载预处理模型 -1. 下载mbart.CC25.tar.gz -2. tar -xzvf mbart.CC25.tar.gz -3. 将模型放于工程根目录下,其目录结构如下: -```bash -mbart.cc25 - | -- model.pt - | -- dict.txt - | -- sentence.bpe.model -``` - -# 数据集 -## 方法一. 下载已预处理好的数据集 -1. 下载train_data.tar -2. tar -xvf train_data.tar -3. 将数据集放于工程根目录下,其目录结构如下: -```bash -train_data - | -- en_ro - | -- preprocess.log - | -- dict.en_XX.txt - | -- dict.ro_RO.txt - | -- test.en_XX-ro_RO.ro_RO.bin - | -- test.en_XX-ro_RO.ro_RO.idx - | -- test.en_XX-ro_RO.en_XX.bin - | -- test.en_XX-ro_RO.en_XX.idx - | -- train.en_XX-ro_RO.ro_RO.bin - | -- train.en_XX-ro_RO.ro_RO.idx - | -- train.en_XX-ro_RO.en_XX.bin - | -- train.en_XX-ro_RO.en_XX.idx - | -- valid.en_XX-ro_RO.ro_RO.bin - | -- valid.en_XX-ro_RO.ro_RO.idx - | -- valid.en_XX-ro_RO.en_XX.bin - | -- valid.en_XX-ro_RO.en_XX.idx - -``` - -## 方法二. 下载数据集并自行处理 -### 1. 分词处理 -1. 下载en_ro数据集并放于工程根目录下 -2. 下载并安装SPM [here](https://github.com/google/sentencepiece) -```bash -SPM=/path/to/sentencepiece/build/src/spm_encode -MODEL=sentence.bpe.model -DATA=path_2_data -SRC=en_XX -TGT=ro_RO -TRAIN=train -VALID=valid -TEST=test -${SPM} --model=${MODEL} < ${DATA}/${TRAIN}.${SRC} > ${DATA}/${TRAIN}.spm.${SRC} & -${SPM} --model=${MODEL} < ${DATA}/${TRAIN}.${TGT} > ${DATA}/${TRAIN}.spm.${TGT} & -${SPM} --model=${MODEL} < ${DATA}/${VALID}.${SRC} > ${DATA}/${VALID}.spm.${SRC} & -${SPM} --model=${MODEL} < ${DATA}/${VALID}.${TGT} > ${DATA}/${VALID}.spm.${TGT} & -${SPM} --model=${MODEL} < ${DATA}/${TEST}.${SRC} > ${DATA}/${TEST}.spm.${SRC} & -${SPM} --model=${MODEL} < ${DATA}/${TEST}.${TGT} > ${DATA}/${TEST}.spm.${TGT} & -``` - - -### 2. 数据预处理 - -```bash -DICT=dict.txt -DATA=/path/data/ -DEST=/path/dest/ -NAME=en_ro -TRAIN=train -TEST=test -SRC=en_XX -TGT=ro_RO -VALID=valid -TEST=test -fairseq-preprocess \ - --source-lang ${SRC} \ - --target-lang ${TGT} \ - --trainpref ${DATA}/${TRAIN}.spm \ - --validpref ${DATA}/${VALID}.spm \ - --testpref ${DATA}/${TEST}.spm \ - --destdir ${DEST}/${NAME} \ - --thresholdtgt 0 \ - --thresholdsrc 0 \ - --srcdict ${DICT} \ - --tgtdict ${DICT} \ - --workers 70 - -``` - -# 在数据集上进行fine-tune - -```bash -1. 修改run_8p.sh中PRETRAIN为模型的路径,DATA_PATH为数据集的路径 -2. 执行 bash run_8p.sh -``` -# 在数据集上进行评估 - -1.下载依赖评估包 - -```bash -git clone https://github.com/moses-smt/mosesdecoder.git -git clone https://github.com/rsennrich/wmt16-scripts.git -pip install sacrebleu==1.5.1 -``` - -2.执行评估脚本 - -```bash -1. 修改generate_on_en_ro.sh中DATA_PATH为数据集的路径,BPE_PATH为sentence.bpe.model的路径,SCRIPTS为mosesdecoder/scripts的路径,WMT16_SCRIPTS为wmt16-scripts的路径 -2. 
执行 bash generate_on_en_ro.sh checkpoints/checkpoint_best.pt -``` - -# Docker容器训练 - -1.导入镜像二进制包 - -```bash -docker import ubuntuarmpytorch.tar pytorch:b*** -``` - -2.执行docker_start.sh - -``` -./docker_start.sh pytorch:b*** /path/data /path/mbart -``` - -3.执行正常安装及训练步骤 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/config.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/config.yaml deleted file mode 100644 index 66723e706cfe498e1fd04a2b759e092af0dad2f8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/config.yaml +++ /dev/null @@ -1,7 +0,0 @@ -defaults: - - params: training_params - - task: language_modeling - - model: transformer_lm - - criterion: cross_entropy - - optimizer: adam - - lr_scheduler: inverse_sqrt diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/config_eval_lm.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/config_eval_lm.yaml deleted file mode 100644 index 5a93cb5d92216c483e5a2172bc7d62c69b165f29..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/config_eval_lm.yaml +++ /dev/null @@ -1,7 +0,0 @@ -defaults: - - params: eval_lm_params - - task: language_modeling - - model: transformer_lm - - criterion: cross_entropy - - optimizer: adam - - lr_scheduler: inverse_sqrt diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/criterion/adaptive_loss.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/criterion/adaptive_loss.yaml deleted file mode 100644 index a85a7eed1c94cf81021e32e3dd3cf42fb5a525d8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/criterion/adaptive_loss.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# @package _group_ -sentence_avg: ${params.optimization.sentence_avg} -ddp_backend: ${params.distributed_training.ddp_backend} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/criterion/cross_entropy.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/criterion/cross_entropy.yaml deleted file mode 100644 index a85a7eed1c94cf81021e32e3dd3cf42fb5a525d8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/criterion/cross_entropy.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# @package _group_ -sentence_avg: ${params.optimization.sentence_avg} -ddp_backend: ${params.distributed_training.ddp_backend} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/lr_scheduler/cosine.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/lr_scheduler/cosine.yaml deleted file mode 100644 index 0f91e0d24091ff41458c918821bad3b0103649f9..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/lr_scheduler/cosine.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# @package _group_ -warmup_updates: 0 -warmup_init_lr: -1 -max_lr: 1.0 -t_mult: 1.0 -lr_period_updates: -1 -lr_shrink: 0.1 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/lr_scheduler/inverse_sqrt.yaml 
b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/lr_scheduler/inverse_sqrt.yaml deleted file mode 100644 index 0eac7d88eb9ac6c5e6da9ab2f108b73b00f2b69e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/lr_scheduler/inverse_sqrt.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# @package _group_ -warmup_updates: 4000 -warmup_init_lr: -1 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm.yaml deleted file mode 100644 index 3837ea54e165ab7b3387f26ee814fb015196ffd9..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "relu" -dropout: 0.1 -attention_dropout: 0.0 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 512 -decoder_output_dim: 512 -decoder_input_dim: 512 -decoder_ffn_embed_dim: 2048 -decoder_layers: 6 -decoder_attention_heads: 8 -decoder_normalize_before: true -no_decoder_final_norm: false -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_baevski_gbw.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_baevski_gbw.yaml deleted file mode 100644 index 30b1a4f1e0f5e7f7c2671ff8ec995cc32363f10f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_baevski_gbw.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "relu" -dropout: 0.1 -attention_dropout: 0.1 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 512 -decoder_output_dim: 512 -decoder_input_dim: 512 -decoder_ffn_embed_dim: 4096 -decoder_layers: 12 -decoder_attention_heads: 16 -decoder_normalize_before: true -no_decoder_final_norm: true -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_baevski_wiki103.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_baevski_wiki103.yaml deleted file mode 100644 index 1154cfa660ee5ce6a272cd1a0049eead1e92c117..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_baevski_wiki103.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "relu" -dropout: 0.3 -attention_dropout: 0.1 -activation_dropout: 0.1 -relu_dropout: 0.1 -decoder_embed_dim: 1024 -decoder_output_dim: 1024 -decoder_input_dim: 1024 -decoder_ffn_embed_dim: 4096 -decoder_layers: 16 -decoder_attention_heads: 8 -decoder_normalize_before: true -no_decoder_final_norm: true -adaptive_softmax_cutoff: "20000,60000" -adaptive_softmax_dropout: 0.2 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: true -adaptive_input_factor: 4 -adaptive_input_cutoff: "20000,60000" -tie_adaptive_weights: true -tie_adaptive_proj: true -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_big.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_big.yaml deleted file mode 100644 index 309575310bfc5d9c5cde31563073bef18abc646e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_big.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "relu" -dropout: 0.1 -attention_dropout: 0.0 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 1024 -decoder_output_dim: 1024 -decoder_input_dim: 1024 -decoder_ffn_embed_dim: 4096 -decoder_layers: 12 -decoder_attention_heads: 16 -decoder_normalize_before: true -no_decoder_final_norm: false -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gbw.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gbw.yaml deleted file mode 100644 index 30b1a4f1e0f5e7f7c2671ff8ec995cc32363f10f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gbw.yaml +++ /dev/null 
@@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "relu" -dropout: 0.1 -attention_dropout: 0.1 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 512 -decoder_output_dim: 512 -decoder_input_dim: 512 -decoder_ffn_embed_dim: 4096 -decoder_layers: 12 -decoder_attention_heads: 16 -decoder_normalize_before: true -no_decoder_final_norm: true -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt.yaml deleted file mode 100644 index 2c6cb7be3801115371566932ffc78651c9ac6c0f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "gelu" -dropout: 0.1 -attention_dropout: 0.1 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 768 -decoder_output_dim: 768 -decoder_input_dim: 768 -decoder_ffn_embed_dim: 3072 -decoder_layers: 12 -decoder_attention_heads: 12 -decoder_normalize_before: true -no_decoder_final_norm: false -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_big.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_big.yaml deleted file mode 100644 index a08769a1781abdb13302bf57bf1338bcaf68a0ec..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_big.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "gelu" -dropout: 0.1 -attention_dropout: 0.1 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 1600 -decoder_output_dim: 1600 -decoder_input_dim: 1600 -decoder_ffn_embed_dim: 6400 -decoder_layers: 48 -decoder_attention_heads: 25 -decoder_normalize_before: true -no_decoder_final_norm: false -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false 
-share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_medium.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_medium.yaml deleted file mode 100644 index 64261d793c0f1ae091c9bf5c8c77093a07326137..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_medium.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "gelu" -dropout: 0.1 -attention_dropout: 0.1 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 1280 -decoder_output_dim: 1280 -decoder_input_dim: 1280 -decoder_ffn_embed_dim: 5120 -decoder_layers: 36 -decoder_attention_heads: 20 -decoder_normalize_before: true -no_decoder_final_norm: false -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_small.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_small.yaml deleted file mode 100644 index 702e81f466c82edf40433589d389edbe0a7b96db..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_gpt2_small.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "gelu" -dropout: 0.1 -attention_dropout: 0.1 -activation_dropout: 0.0 -relu_dropout: 0.0 -decoder_embed_dim: 1024 -decoder_output_dim: 1024 -decoder_input_dim: 1024 -decoder_ffn_embed_dim: 4096 -decoder_layers: 24 -decoder_attention_heads: 16 -decoder_normalize_before: true -no_decoder_final_norm: false -adaptive_softmax_cutoff: null -adaptive_softmax_dropout: 0 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: false -adaptive_input_factor: 4 -adaptive_input_cutoff: null -tie_adaptive_weights: false -tie_adaptive_proj: false -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false 
-no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_wiki103.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_wiki103.yaml deleted file mode 100644 index 1154cfa660ee5ce6a272cd1a0049eead1e92c117..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/model/transformer_lm_wiki103.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# @package _group_ -activation_fn: "relu" -dropout: 0.3 -attention_dropout: 0.1 -activation_dropout: 0.1 -relu_dropout: 0.1 -decoder_embed_dim: 1024 -decoder_output_dim: 1024 -decoder_input_dim: 1024 -decoder_ffn_embed_dim: 4096 -decoder_layers: 16 -decoder_attention_heads: 8 -decoder_normalize_before: true -no_decoder_final_norm: true -adaptive_softmax_cutoff: "20000,60000" -adaptive_softmax_dropout: 0.2 -adaptive_softmax_factor: 4 -no_token_positional_embeddings: false -share_decoder_input_output_embed: false -character_embeddings: false -character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]" -character_embedding_dim: 4 -char_embedder_highway_layers: 2 -adaptive_input: true -adaptive_input_factor: 4 -adaptive_input_cutoff: "20000,60000" -tie_adaptive_weights: true -tie_adaptive_proj: true -decoder_learned_pos: false -decoder_layerdrop: 0 -decoder_layers_to_keep: null -layernorm_embedding: false -no_scale_embedding: false -quant_noise_pq: 0 -quant_noise_pq_block_size: 8 -quant_noise_scalar: 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/optimizer/adam.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/optimizer/adam.yaml deleted file mode 100644 index e5264f895e60901a3c68f3300a4c7a9070eeaeff..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/optimizer/adam.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# @package _group_ -adam_betas: "(0.9, 0.999)" -adam_eps: 1.0e-8 -weight_decay: 0 -use_old_adam: false diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/optimizer/nag.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/optimizer/nag.yaml deleted file mode 100644 index 4ab274568658d104bfca51f09046a1f27eb2fd28..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/optimizer/nag.yaml +++ /dev/null @@ -1,3 +0,0 @@ -# @package _group_ -momentum: 0.99 -weight_decay: 0.0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/params/eval_lm_params.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/params/eval_lm_params.yaml deleted file mode 100644 index 6f27055d643c055943add764ad79bbeed23e363d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/params/eval_lm_params.yaml +++ /dev/null @@ -1,105 +0,0 @@ -# @package _group_ -common: - no_progress_bar: false - log_interval: 100 - log_format: null - tensorboard_logdir: null - seed: 1 - cpu: false - fp16: false - memory_efficient_fp16: false - fp16_no_flatten_grads: false - fp16_init_scale: 128 - fp16_scale_window: null - fp16_scale_tolerance: 0.0 - min_loss_scale: 1.0e-4 - threshold_loss_scale: null - 
user_dir: null - empty_cache_freq: 0 - all_gather_list_size: 16384 - model_parallel_size: 1 - checkpoint_suffix: "" - quantization_config_path: null -distributed_training: - distributed_rank: 0 - distributed_backend: "nccl" - distributed_init_method: null - distributed_port: -1 - device_id: 0 - local_rank: 0 - distributed_no_spawn: false - ddp_backend: "c10d" - bucket_cap_mb: 25 - fix_batches_to_gpus: false - find_unused_parameters: false - fast_stat_sync: false - broadcast_buffers: false - distributed_wrapper: "DDP" - slowmo_momentum: null - slowmo_algorithm: "LocalSGD" - localsgd_frequency: 3 -dataset: - num_workers: 1 - skip_invalid_size_inputs_valid_test: false - max_tokens: null - batch_size: ${params.dataset.batch_size} - required_batch_size_multiple: 8 - dataset_impl: null - data_buffer_size: 10 - train_subset: "train" - valid_subset: "valid" - validate_interval: 1 - fixed_validation_seed: null - disable_validation: false - curriculum: 0 - gen_subset: "test" - num_shards: 1 - shard_id: 0 - max_tokens_valid: ${params.dataset.max_tokens} - batch_size_valid: ${params.dataset.batch_size} -optimization: - max_epoch: 0 - max_update: 0 - clip_norm: 25.0 - sentence_avg: false - update_freq: [1] - lr: [0.25] - min_lr: -1.0 - use_bmuf: false -checkpoint: - save_dir: "checkpoints" - restore_file: "checkpoint_last.pt" - reset_dataloader: false - reset_lr_scheduler: false - reset_meters: false - reset_optimizer: false - optimizer_overrides: "{}" - save_interval: 1 - save_interval_updates: 0 - keep_interval_updates: -1 - keep_last_epochs: -1 - keep_best_checkpoints: -1 - no_save: false - no_epoch_checkpoints: false - no_last_checkpoints: false - no_save_optimizer_state: false - best_checkpoint_metric: "loss" - maximize_best_checkpoint_metric: false - patience: -1 -common_eval: - path: null - remove_bpe: null - quiet: false - model_overrides: "{}" - results_path: null -eval_lm: - output_word_probs: false - output_word_stats: false - context_window: 0 -bmuf: - block_lr: 1 - block_momentum: 0.875 - global_sync_iter: 50 - warmup_iterations: 500 - use_nbm: false - average_sync: false diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/params/training_params.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/params/training_params.yaml deleted file mode 100644 index 2ce94f929088427db52a40981d117ce5a6d3d8c0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/params/training_params.yaml +++ /dev/null @@ -1,95 +0,0 @@ -# @package _group_ -common: - no_progress_bar: false - log_interval: 100 - log_format: null - tensorboard_logdir: null - seed: 1 - cpu: false - fp16: false - memory_efficient_fp16: false - fp16_no_flatten_grads: false - fp16_init_scale: 128 - fp16_scale_window: null - fp16_scale_tolerance: 0.0 - min_loss_scale: 1.0e-4 - threshold_loss_scale: null - user_dir: null - empty_cache_freq: 0 - all_gather_list_size: 16384 - model_parallel_size: 1 - checkpoint_suffix: "" - quantization_config_path: null -distributed_training: - distributed_rank: 0 - distributed_backend: "nccl" - distributed_init_method: null - distributed_port: -1 - device_id: 0 - local_rank: 0 - distributed_no_spawn: false - ddp_backend: "c10d" - bucket_cap_mb: 25 - fix_batches_to_gpus: false - find_unused_parameters: false - fast_stat_sync: false - broadcast_buffers: false - distributed_wrapper: "DDP" - slowmo_momentum: null - slowmo_algorithm: "LocalSGD" - localsgd_frequency: 3 
-dataset: - num_workers: 1 - skip_invalid_size_inputs_valid_test: false - max_tokens: null - batch_size: ${params.dataset.batch_size} - required_batch_size_multiple: 8 - dataset_impl: null - data_buffer_size: 10 - train_subset: "train" - valid_subset: "valid" - validate_interval: 1 - fixed_validation_seed: null - disable_validation: false - curriculum: 0 - gen_subset: "test" - num_shards: 1 - shard_id: 0 - max_tokens_valid: ${params.dataset.max_tokens} - batch_size_valid: ${params.dataset.batch_size} -optimization: - max_epoch: 0 - max_update: 0 - clip_norm: 25.0 - sentence_avg: false - update_freq: [1] - lr: [0.25] - min_lr: -1.0 - use_bmuf: false -checkpoint: - save_dir: "checkpoints" - restore_file: "checkpoint_last.pt" - reset_dataloader: false - reset_lr_scheduler: false - reset_meters: false - reset_optimizer: false - optimizer_overrides: "{}" - save_interval: 1 - save_interval_updates: 0 - keep_interval_updates: -1 - keep_last_epochs: -1 - keep_best_checkpoints: -1 - no_save: false - no_epoch_checkpoints: false - no_last_checkpoints: false - no_save_optimizer_state: false - best_checkpoint_metric: "loss" - maximize_best_checkpoint_metric: false - patience: -1 -bmuf: - block_lr: 1 - block_momentum: 0.875 - global_sync_iter: 50 - warmup_iterations: 500 - use_nbm: false - average_sync: false diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/task/language_modeling.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/task/language_modeling.yaml deleted file mode 100644 index 58a2ad1358e705b3fbc7f85e0520062837cf5f96..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/config/task/language_modeling.yaml +++ /dev/null @@ -1,10 +0,0 @@ -# @package _group_ -data: ??? 
-sample_break_mode: "none" -tokens_per_sample: 1024 -output_dictionary_size: -1 -self_target: false -future_target: false -past_target: false -add_bos_token: false -max_target_positions: null diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docker_start.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docker_start.sh deleted file mode 100644 index 55574023389a4de663582452bb7b7197c79d3072..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docker_start.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -docker_image=$1 -data_dir=$2 -model_dir=$3 - -docker run -it --ipc=host \ - --device=/dev/davinci0 \ - --device=/dev/davinci1 \ - --device=/dev/davinci2 \ - --device=/dev/davinci3 \ - --device=/dev/davinci4 \ - --device=/dev/davinci5 \ - --device=/dev/davinci6 \ - --device=/dev/davinci7 \ - --device=/dev/davinci_manager \ - --device=/dev/devmm_svm --device=/dev/hisi_hdc \ - -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ - -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \ - -v ${model_dir}:${model_dir} \ - -v ${data_dir}:${data_dir} \ - -v /var/log/npu/conf/slog/:/var/log/npu/conf/slog/ \ - -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \ - -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \ - /bin/bash \ No newline at end of file diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/Makefile b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/Makefile deleted file mode 100644 index c2f5b1a89cfc9e02d1bb09027d9e1e520ba53d53..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = python -msphinx -SPHINXPROJ = fairseq -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/_static/theme_overrides.css b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/_static/theme_overrides.css deleted file mode 100644 index 2a0764193625e1a6fd66ff8af2ccdd0ad6369188..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/_static/theme_overrides.css +++ /dev/null @@ -1,9 +0,0 @@ -.wy-table-responsive table td kbd { - white-space: nowrap; -} -.wy-table-responsive table td { - white-space: normal !important; -} -.wy-table-responsive { - overflow: visible !important; -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/conf.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/conf.py deleted file mode 100644 index cd43f00ddb8a7aeb4b026bb01e40fe4e1d7cff6b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/conf.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ============================================================================ -# -# fairseq documentation build configuration file, created by -# sphinx-quickstart on Fri Aug 17 21:45:30 2018. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. 
If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import os -import sys - - -# source code directory, relative to this file, for sphinx-autobuild -sys.path.insert(0, os.path.abspath("..")) - -source_suffix = [".rst"] - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.intersphinx", - "sphinx.ext.viewcode", - "sphinx.ext.napoleon", - "sphinxarg.ext", -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The master toctree document. -master_doc = "index" - -# General information about the project. -project = "fairseq" -copyright = "2019, Facebook AI Research (FAIR)" -author = "Facebook AI Research (FAIR)" - -github_doc_root = "https://github.com/pytorch/fairseq/tree/master/docs/" - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = "0.10.2" -# The full version, including alpha/beta/rc tags. -release = "0.10.2" - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" -highlight_language = "python" - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -html_context = { - "css_files": [ - "_static/theme_overrides.css", # override wide tables in RTD theme - ], -} - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# This is required for the alabaster theme -# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars -# html_sidebars = { -# '**': [ -# 'about.html', -# 'navigation.html', -# 'relations.html', # needs 'show_related': True theme option to display -# 'searchbox.html', -# 'donate.html', -# ] -# } - - -# Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = { - "numpy": ("http://docs.scipy.org/doc/numpy/", None), - "python": ("https://docs.python.org/", None), - "torch": ("https://pytorch.org/docs/master/", None), -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/docutils.conf b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/docutils.conf deleted file mode 100644 index 526acffd32d16217160aee917db2b120354f20f0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/docutils.conf +++ /dev/null @@ -1,2 +0,0 @@ -[writers] -option-limit=0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/hydra_integration.md b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/hydra_integration.md deleted file mode 100644 index 9b77dd83511dafbdc9b14b3ec5d33d1caf233f56..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/hydra_integration.md +++ /dev/null @@ -1,113 +0,0 @@ - - -## Hydra - -Hydra is an open-source Python framework that simplifies the development of research and other complex applications. The key feature is the ability to dynamically create a hierarchical configuration by composition and override it through config files and the command line. The name Hydra comes from its ability to run multiple similar jobs - much like a Hydra with multiple heads. - -## Train models with hydra interface - -#### Provide parameters in `.yaml` files -For example, if we'd like to train a language model with transformer, we could provide parameters in yaml files. Note that the modules used (task, model, criterion, optimizer, lr scheduler) in training must be migrated with hydra interface already (See session below). - -- Provide top level choices on which generic parameter file, and which modules to use: `config/config.yaml`, this will look like for example: - -``` -defaults: - - params: training_params - - task: language_modeling - - model: transformer_lm - - criterion: cross_entropy - - optimizer: adam - - lr_scheduler: inverse_sqrt -``` - -- Provide generic parameters common across different training jobs: `config/params/training_params.yaml` -- Provide task parameters: `config/task/language_modeling.yaml` -- Provide model parameters: `config/model/transformer_lm.yaml` -- Provide criterion parameters: `config/criterion/cross_entropy.yaml` -- Provide optimizer parameters: `config/optimizer/adam.yaml` -- Provide lr_scheduler parameters `config/lr_scheduler/inverse_sqrt.yaml` - -#### Command line overriding -`train_hydra.py` is the main entry point for training with hydra interface. 
If we specify all parameters we want in `.yaml` files, then we could simply use command: - -``` -# task.data is requested field marked by `???` in yaml -python fairseq_cli/train_hydra.py \ -task.data=/private/home/abaevski/data/wiki103 \ -``` - -Alternatively, if we need to override certain params from the command line, we could do so as below (note the structure of where each parameter sits) - -``` -python fairseq_cli/train_hydra.py -params=training_params \ -task=language_modeling \ -task.data=/private/home/abaevski/data/wiki103 \ -task.tokens_per_sample=512 \ -task.sample_break_mode=none \ -model=transformer_lm \ -model.share_decoder_input_output_embed=true \ -model.dropout=0.1 \ -optimizer=adam \ -optimizer.adam_betas="'(0.9, 0.98)'" \ -optimizer.weight_decay=0.01 \ -lr_scheduler=inverse_sqrt \ -lr_scheduler.warmup_updates=4000 \ -lr_scheduler.warmup_init_lr=1e-07 \ -criterion=cross_entropy \ -params.common.fp16=true \ -params.common.log_format=json \ -params.common.log_interval=1 \ -params.dataset.max_tokens=1024 \ -params.dataset.num_workers=4 \ -params.optimization.update_freq=[16] \ -params.optimization.max_update=50000 \ -params.optimization.clip_norm=0.0 \ -params.optimization.lr=[0.0005] \ -params.checkpoint.save_dir=/checkpoint/mtian/transformer_wikitext-103-hydra-args-cli \ -params.checkpoint.save_interval_updates=10 -``` - -## Migrate existing/Creating new modules to hydra interface - -In each of the modules we want to migrated/create with hydra interface, fundamentally we need to - -- Provide a dataclass that layouts the parameters used in the module. - -- Modify the builder and/or constructor that previously takes `argparse.Namespace` argument `args`, into taking `omegaconf.DictConfig` config objects. At this moment we allow `Union[omegaconf.DictConfig, argparse.Namespace]` to support compatibility. - -- For `add_args()`, we need to extract argument from the dataclass defined in the same file, and append them into `parser`. This is also to support compatibility. This is simply supported with `gen_parser_from_dataclass` API, see examples files below. - -#### Migrated examples: - -- Task: `fairseq/tasks/language_modeling.py` - -- Model: `fairseq/models/transformer_lm.py` - -- Criterion: `fairseq/criterions/adaptive_loss.py` and `fairseq/criterions/cross_entropy.py` - -- Optimizer: `fairseq/optim/adam.py` and `fairseq/optim/nag.py` - -- LR scheduler: `fairseq/optim/lr_scheduler/cosine_lr_scheduler.py` and `fairseq/optim/lr_scheduler/inverse_square_root_schedule.py` - - -## Interpolate parameters across different places - -## Support of legacy interface -If you still like to pass legacy style arguments in command line, `fairseq_cli/train.py` can support this. Internally it coverted `args` into hydra config objects whenever there are migrated modules aligned. 
- -``` -python fairseq_cli/train.py --task language_modeling \ -/private/home/abaevski/data/wiki103 \ ---save-dir /checkpoint/mtian/transformer_wikitext-103-hydra-args-cli \ ---arch transformer_lm --share-decoder-input-output-embed \ ---dropout 0.1 \ ---optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \ ---lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \ ---tokens-per-sample 512 --sample-break-mode none \ ---max-tokens 1024 --update-freq 16 \ ---fp16 \ ---max-update 50000 --log-format json --log-interval 1 --num-workers 4 \ ---save-interval-updates 10 -``` diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/make.bat b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/make.bat deleted file mode 100644 index baa9d02a79266ed17e0841f08a83931d46583393..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/make.bat +++ /dev/null @@ -1,36 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=python -msphinx -) -set SOURCEDIR=. -set BUILDDIR=_build -set SPHINXPROJ=fairseq - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The Sphinx module was not found. Make sure you have Sphinx installed, - echo.then set the SPHINXBUILD environment variable to point to the full - echo.path of the 'sphinx-build' executable. Alternatively you may add the - echo.Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/requirements.txt b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/requirements.txt deleted file mode 100644 index c734a1f04f1c108d84d3a07643ac93adf6485f13..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/docs/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -sphinx<2.0 -sphinx-argparse diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/env.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/env.sh deleted file mode 100644 index 5e6632b494eea9d09a8166d6829a71dbc0a64cf6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/env.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -export install_path=/usr/local/Ascend - -if [ -d ${install_path}/toolkit ]; then - export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} - export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH - export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH - export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH - export ASCEND_OPP_PATH=${install_path}/opp -else - if [ -d ${install_path}/nnae/latest ];then - export 
LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/nnae/latest - else - export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest - fi -fi - -export SCALAR_TO_HOST_MEM=1 -export BMMV2_ENABLE=1 -# Output Host-side logs to the serial port, 0-off/1-on -export ASCEND_SLOG_PRINT_TO_STDOUT=0 -# Set the default log level, 0-debug/1-info/2-warning/3-error -export ASCEND_GLOBAL_LOG_LEVEL=3 -# Flag to enable Host-side Event logging, 0-off/1-on -export ASCEND_GLOBAL_EVENT_ENABLE=0 -# Whether to enable the task queue, 0-off/1-on -export TASK_QUEUE_ENABLE=1 -# Whether to enable PTCopy, 0-off/1-on -export PTCOPY_ENABLE=1 -# Whether to enable the combined flag, 0-off/1-on -export COMBINED_ENABLE=1 -# Whether special cases need recompilation; no need to modify -export DYNAMIC_OP="ADD#MUL" -# HCCL whitelist switch, 1-off/0-on -export HCCL_WHITELIST_DISABLE=1 -# Set the Device-side log level to error -${install_path}/driver/tools/msnpureport -d 0 -g error -${install_path}/driver/tools/msnpureport -d 1 -g error -${install_path}/driver/tools/msnpureport -d 2 -g error -${install_path}/driver/tools/msnpureport -d 3 -g error -${install_path}/driver/tools/msnpureport -d 4 -g error -${install_path}/driver/tools/msnpureport -d 5 -g error -${install_path}/driver/tools/msnpureport -d 6 -g error -${install_path}/driver/tools/msnpureport -d 7 -g error -# Disable Device-side Event logging -${install_path}/driver/tools/msnpureport -e disable - - -path_lib=$(python3.7 -c """ -import sys -import re -result='' -for index in range(len(sys.path)): - match_sit = re.search('-packages', sys.path[index]) - if match_sit is not None:
- match_lib = re.search('lib', sys.path[index]) - - if match_lib is not None: - end=match_lib.span()[1] - result += sys.path[index][0:end] + ':' - - result+=sys.path[index] + '/torch/lib:' -print(result)""" -) - -echo ${path_lib} - -export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/examples/mbart/README.md b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/examples/mbart/README.md deleted file mode 100644 index 291009aac39d2ac0e18b08953a11e14421b3d320..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/examples/mbart/README.md +++ /dev/null @@ -1,119 +0,0 @@ -# MBART: Multilingual Denoising Pre-training for Neural Machine Translation -[https://arxiv.org/abs/2001.08210] - -## Introduction - -mBART is a sequence-to-sequence denoising auto-encoder pre-trained on large-scale monolingual corpora in many languages using the BART objective. mBART is one of the first methods for pre-training a complete sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches had focused only on the encoder, the decoder, or reconstructing parts of the text. - -## Pre-trained models - -Model | Description | # params | Download ----|---|---|--- -`mbart.CC25` | mBART model with 12 encoder and decoder layers trained on 25 languages' monolingual corpus | 610M | mbart.CC25.tar.gz -`mbart.ft.ro_en` | mBART CC25 model finetuned on the RO-EN language pair | 610M | mbart.cc25.ft.enro.tar.gz - -## Results - - - -Model | en-ro | ro-en ----|---|--- -`Random` | 34.3 | 34.0 -`mbart.cc25` | 37.7 | 37.8 -`mbart.enro.bilingual` | 38.5 | 38.5 - -## BPE data - -Download and extract the pre-trained model, install SentencePiece (SPM), and apply BPE to the data: - -```bash -# download and extract the model -tar -xzvf mbart.CC25.tar.gz -# encode the data with the SentencePiece BPE model -SPM=/path/to/sentencepiece/build/src/spm_encode -MODEL=sentence.bpe.model -${SPM} --model=${MODEL} < ${DATA}/${TRAIN}.${SRC} > ${DATA}/${TRAIN}.spm.${SRC} & -${SPM} --model=${MODEL} < ${DATA}/${TRAIN}.${TGT} > ${DATA}/${TRAIN}.spm.${TGT} & -${SPM} --model=${MODEL} < ${DATA}/${VALID}.${SRC} > ${DATA}/${VALID}.spm.${SRC} & -${SPM} --model=${MODEL} < ${DATA}/${VALID}.${TGT} > ${DATA}/${VALID}.spm.${TGT} & -${SPM} --model=${MODEL} < ${DATA}/${TEST}.${SRC} > ${DATA}/${TEST}.spm.${SRC} & -${SPM} --model=${MODEL} < ${DATA}/${TEST}.${TGT} > ${DATA}/${TEST}.spm.${TGT} & -``` - -## Preprocess data - -```bash -DICT=dict.txt -fairseq-preprocess \ - --source-lang ${SRC} \ - --target-lang ${TGT} \ - --trainpref ${DATA}/${TRAIN}.spm \ - --validpref ${DATA}/${VALID}.spm \ - --testpref ${DATA}/${TEST}.spm \ - --destdir ${DEST}/${NAME} \ - --thresholdtgt 0 \ - --thresholdsrc 0 \ - --srcdict ${DICT} \ - --tgtdict ${DICT} \ - --workers 70 -``` - -## Finetune on EN-RO -Finetune on the mBART CC25 model: - -```bash -PRETRAIN=mbart.cc25 # fix if you moved the downloaded checkpoint -langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN - -fairseq-train path_2_data \ - --encoder-normalize-before --decoder-normalize-before \ - --arch mbart_large --layernorm-embedding \ - --task translation_from_pretrained_bart \ - --source-lang en_XX --target-lang ro_RO \ - --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ - --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler polynomial_decay --lr 3e-05 --min-lr -1 --warmup-updates 2500 --total-num-update 40000 \ - --dropout
0.3 --attention-dropout 0.1 --weight-decay 0.0 \ - --max-tokens 1024 --update-freq 2 \ - --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ - --seed 222 --log-format simple --log-interval 2 \ - --restore-file $PRETRAIN \ - --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \ - --langs $langs \ - --ddp-backend no_c10d -``` -## Generate on EN-RO -Get sacrebleu on finetuned en-ro model - -get tokenizer -```bash -tar -xzvf mbart.cc25.ft.enro.tar.gz -``` - -```bash -model_dir=MBART_finetuned_enro # fix if you moved the checkpoint - -fairseq-generate path_2_data \ - --path $model_dir/model.pt \ - --task translation_from_pretrained_bart \ - --gen-subset test \ - -t ro_RO -s en_XX \ - --bpe 'sentencepiece' --sentencepiece-model $model_dir/sentence.bpe.model \ - --sacrebleu --remove-bpe 'sentencepiece' \ - --batch-size 32 --langs $langs > en_ro - -cat en_ro | grep -P "^H" |sort -V |cut -f 3- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > en_ro.hyp -cat en_ro | grep -P "^T" |sort -V |cut -f 2- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > en_ro.ref -sacrebleu -tok 'none' -s 'none' en_ro.ref < en_ro.hyp -``` - -## Citation - -```bibtex -@article{liu2020multilingual, - title={Multilingual Denoising Pre-training for Neural Machine Translation}, - author={Yinhan Liu and Jiatao Gu and Naman Goyal and Xian Li and Sergey Edunov and Marjan Ghazvininejad and Mike Lewis and Luke Zettlemoyer}, - year={2020}, - eprint={2001.08210}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} -``` diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/__init__.py deleted file mode 100644 index 9c1f7569a9e2d7a300d6d415f8a5d512ad9efebf..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -"""isort:skip_file""" - -__all__ = ["pdb"] -__version__ = "0.10.2" - -import sys - -# backwards compatibility to support `from fairseq.meters import AverageMeter` -from fairseq.logging import meters, metrics, progress_bar # noqa - -sys.modules["fairseq.meters"] = meters -sys.modules["fairseq.metrics"] = metrics -sys.modules["fairseq.progress_bar"] = progress_bar - -import fairseq.criterions # noqa -import fairseq.models # noqa -import fairseq.modules # noqa -import fairseq.optim # noqa -import fairseq.optim.lr_scheduler # noqa -import fairseq.pdb # noqa -import fairseq.scoring # noqa -import fairseq.tasks # noqa -import fairseq.token_generation_constraints # noqa - -import fairseq.benchmark # noqa -import fairseq.model_parallel # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/__init__.py deleted file mode 100644 index f6584661bd9ad7984438a575dfd6b93a10d7c7f3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# import models/tasks to register them -from . import dummy_lm, dummy_masked_lm, dummy_model, dummy_mt # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_lm.py deleted file mode 100644 index 6429d04de38acaa9f642dd7432673d6b9d1a48f1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_lm.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging - -import numpy as np -import torch -from fairseq.data import Dictionary, FairseqDataset -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("dummy_lm") -class DummyLMTask(LegacyFairseqTask): - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("--dict-size", default=49996, type=int) - parser.add_argument("--dataset-size", default=100000, type=int) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments " - "per sample for BERT dataset", - ) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed - - dictionary.pad_to_multiple_(8) # often faster if divisible by 8 - - seq = torch.arange(args.tokens_per_sample + 1) + dictionary.pad() + 1 - - self.dummy_src = seq[:-1] - self.dummy_tgt = seq[1:] - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task. """ - dictionary = Dictionary() - for i in range(args.dict_size): - dictionary.add_symbol("word{}".format(i)) - logger.info("dictionary: {} types".format(len(dictionary))) - return cls(args, dictionary) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. 
- Args: - split (str): name of the split (e.g., train, valid, test) - """ - if self.args.batch_size is not None: - bsz = self.args.batch_size - else: - bsz = max(1, self.args.max_tokens // self.args.tokens_per_sample) - self.datasets[split] = DummyDataset( - { - "id": 1, - "net_input": { - "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]), - "src_lengths": torch.full( - (bsz,), self.args.tokens_per_sample, dtype=torch.long - ), - }, - "target": torch.stack([self.dummy_tgt for _ in range(bsz)]), - "nsentences": bsz, - "ntokens": bsz * self.args.tokens_per_sample, - }, - num_items=self.args.dataset_size, - item_size=self.args.tokens_per_sample, - ) - - @property - def source_dictionary(self): - return self.dictionary - - @property - def target_dictionary(self): - return self.dictionary - - -class DummyDataset(FairseqDataset): - def __init__(self, batch, num_items, item_size): - super().__init__() - self.batch = batch - self.num_items = num_items - self.item_size = item_size - - def __getitem__(self, index): - return index - - def __len__(self): - return self.num_items - - def collater(self, samples): - return self.batch - - @property - def sizes(self): - return np.array([self.item_size] * self.num_items) - - def num_tokens(self, index): - return self.item_size - - def size(self, index): - return self.item_size - - def ordered_indices(self): - return np.arange(self.num_items) - - @property - def supports_prefetch(self): - return False diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_masked_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_masked_lm.py deleted file mode 100644 index ab506fe1d50abd76e1518214ef102578cdfbd0c5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_masked_lm.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging - -import numpy as np -import torch -from fairseq.data import Dictionary, FairseqDataset -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("dummy_masked_lm") -class DummyMaskedLMTask(LegacyFairseqTask): - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("--dict-size", default=49995, type=int) - parser.add_argument("--dataset-size", default=100000, type=int) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments " - "per sample for BERT dataset", - ) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - - # add mask token - self.mask_idx = dictionary.add_symbol("") - dictionary.pad_to_multiple_(8) # often faster if divisible by 8 - - mask_idx = 0 - pad_idx = 1 - seq = torch.arange(args.tokens_per_sample) + pad_idx + 1 - mask = torch.arange(2, args.tokens_per_sample, 7) # ~15% - src = seq.clone() - src[mask] = mask_idx - tgt = torch.full_like(seq, pad_idx) - tgt[mask] = seq[mask] - - self.dummy_src = src - self.dummy_tgt = tgt - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task. 
""" - dictionary = Dictionary() - for i in range(args.dict_size): - dictionary.add_symbol("word{}".format(i)) - logger.info("dictionary: {} types".format(len(dictionary))) - return cls(args, dictionary) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. - Args: - split (str): name of the split (e.g., train, valid, test) - """ - if self.args.batch_size is not None: - bsz = self.args.batch_size - else: - bsz = max(1, self.args.max_tokens // self.args.tokens_per_sample) - self.datasets[split] = DummyDataset( - { - "id": 1, - "net_input": { - "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]), - "src_lengths": torch.full( - (bsz,), self.args.tokens_per_sample, dtype=torch.long - ), - }, - "target": torch.stack([self.dummy_tgt for _ in range(bsz)]), - "nsentences": bsz, - "ntokens": bsz * self.args.tokens_per_sample, - }, - num_items=self.args.dataset_size, - item_size=self.args.tokens_per_sample, - ) - - @property - def source_dictionary(self): - return self.dictionary - - @property - def target_dictionary(self): - return self.dictionary - - -class DummyDataset(FairseqDataset): - def __init__(self, batch, num_items, item_size): - super().__init__() - self.batch = batch - self.num_items = num_items - self.item_size = item_size - - def __getitem__(self, index): - return index - - def __len__(self): - return self.num_items - - def collater(self, samples): - return self.batch - - @property - def sizes(self): - return np.array([self.item_size] * self.num_items) - - def num_tokens(self, index): - return self.item_size - - def size(self, index): - return self.item_size - - def ordered_indices(self): - return np.arange(self.num_items) - - @property - def supports_prefetch(self): - return False diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_model.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_model.py deleted file mode 100644 index ff26e4fe655d8e8d7f9942c4bd3df7cd267405fb..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_model.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import torch.nn as nn -import torch.nn.functional as F -from fairseq.data import Dictionary -from fairseq.models import ( - FairseqDecoder, - FairseqLanguageModel, - register_model, - register_model_architecture, -) - - -@register_model("dummy_model") -class DummyModel(FairseqLanguageModel): - def __init__(self, args, encoder): - super().__init__(encoder) - self.args = args - - @staticmethod - def add_args(parser): - parser.add_argument("--num-layers", type=int, default=24) - parser.add_argument("--embed-dim", type=int, default=1024) - - @classmethod - def build_model(cls, args, task): - encoder = DummyEncoder( - num_embed=len(task.target_dictionary), - embed_dim=args.embed_dim, - num_layers=args.num_layers, - ) - return cls(args, encoder) - - def forward(self, src_tokens, masked_tokens=None, **kwargs): - return self.decoder(src_tokens, masked_tokens=masked_tokens) - - -class DummyEncoder(FairseqDecoder): - def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24): - super().__init__(Dictionary()) - self.embed = nn.Embedding( - num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0 - ) - self.layers_a = nn.ModuleList( - [ - nn.Sequential( - nn.LayerNorm(embed_dim), - nn.Linear(embed_dim, 3 * embed_dim), # q, k, v input projection - nn.Linear(3 * embed_dim, embed_dim), # skip self-attention - nn.Linear(embed_dim, embed_dim), # output projection - nn.Dropout(), - ) - for i in range(num_layers) - ] - ) - self.layers_b = nn.ModuleList( - [ - nn.Sequential( - nn.LayerNorm(embed_dim), - nn.Linear(embed_dim, 4 * embed_dim), # FFN - nn.ReLU(), - nn.Linear(4 * embed_dim, embed_dim), # FFN - nn.Dropout(0.1), - ) - for i in range(num_layers) - ] - ) - self.out_proj = nn.Linear(embed_dim, num_embed) - - def forward(self, tokens, masked_tokens=None): - x = self.embed(tokens) - for layer_a, layer_b in zip(self.layers_a, self.layers_b): - x = x + layer_a(x) - x = x + layer_b(x) - x = self.out_proj(x) - if masked_tokens is not None: - x = x[masked_tokens] - return (x,) - - def max_positions(self): - return 1024 - - def get_normalized_probs(self, net_output, log_probs, sample=None): - logits = net_output[0].float() - if log_probs: - return F.log_softmax(logits, dim=-1) - else: - return F.softmax(logits, dim=-1) - - -@register_model_architecture("dummy_model", "dummy_model") -def base_architecture(args): - pass diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_mt.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_mt.py deleted file mode 100644 index 4ca7be93a38d8d2b47685b74b4f8b8f9dcb03d2e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/benchmark/dummy_mt.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import logging - -import numpy as np -import torch -from fairseq.data import Dictionary, FairseqDataset -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("dummy_mt") -class DummyMTTask(LegacyFairseqTask): - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("--dict-size", default=49996, type=int) - parser.add_argument("--dataset-size", default=100000, type=int) - parser.add_argument("--src-len", default=30, type=int) - parser.add_argument("--tgt-len", default=30, type=int) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed - - dictionary.pad_to_multiple_(8) # often faster if divisible by 8 - - self.dummy_src = torch.arange(args.src_len + 1) + dictionary.pad() + 1 - self.dummy_tgt = torch.arange(args.tgt_len + 1) + dictionary.pad() + 1 - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task. """ - dictionary = Dictionary() - for i in range(args.dict_size): - dictionary.add_symbol("word{}".format(i)) - logger.info("dictionary: {} types".format(len(dictionary))) - - args.max_source_positions = args.src_len + dictionary.pad() + 2 - args.max_target_positions = args.tgt_len + dictionary.pad() + 2 - - return cls(args, dictionary) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. - Args: - split (str): name of the split (e.g., train, valid, test) - """ - item_size = max(self.args.src_len, self.args.tgt_len) - if self.args.batch_size is not None: - bsz = self.args.batch_size - else: - bsz = max(1, self.args.max_tokens // item_size) - tgt = torch.stack([self.dummy_tgt for _ in range(bsz)]) - self.datasets[split] = DummyDataset( - { - "id": 1, - "net_input": { - "src_tokens": torch.stack([self.dummy_src for _ in range(bsz)]), - "src_lengths": torch.full( - (bsz,), self.args.src_len, dtype=torch.long - ), - "prev_output_tokens": tgt.clone(), - }, - "target": tgt, - "nsentences": bsz, - "ntokens": bsz * self.args.tgt_len, - }, - num_items=self.args.dataset_size, - item_size=item_size, - ) - - @property - def source_dictionary(self): - return self.dictionary - - @property - def target_dictionary(self): - return self.dictionary - - -class DummyDataset(FairseqDataset): - def __init__(self, batch, num_items, item_size): - super().__init__() - self.batch = batch - self.num_items = num_items - self.item_size = item_size - - def __getitem__(self, index): - return index - - def __len__(self): - return self.num_items - - def collater(self, samples): - return self.batch - - @property - def sizes(self): - return np.array([self.item_size] * self.num_items) - - def num_tokens(self, index): - return self.item_size - - def size(self, index): - return self.item_size - - def ordered_indices(self): - return np.arange(self.num_items) - - @property - def supports_prefetch(self): - return False diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/binarizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/binarizer.py deleted file mode 100644 index 0255c084b572b1b393f81850609fe75f8dd05781..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/binarizer.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os -from collections import Counter - -import torch -from fairseq.file_io import PathManager -from fairseq.tokenizer import tokenize_line - - -def safe_readline(f): - pos = f.tell() - while True: - try: - return f.readline() - except UnicodeDecodeError: - pos -= 1 - f.seek(pos) # search where this character begins - - -class Binarizer: - @staticmethod - def binarize( - filename, - dict, - consumer, - tokenize=tokenize_line, - append_eos=True, - reverse_order=False, - offset=0, - end=-1, - already_numberized=False, - ): - nseq, ntok = 0, 0 - replaced = Counter() - - def replaced_consumer(word, idx): - if idx == dict.unk_index and word != dict.unk_word: - replaced.update([word]) - - with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f: - f.seek(offset) - # next(f) breaks f.tell(), hence readline() must be used - line = safe_readline(f) - while line: - if end > 0 and f.tell() > end: - break - if already_numberized: - id_strings = line.strip().split() - id_list = [int(id_string) for id_string in id_strings] - if reverse_order: - id_list.reverse() - if append_eos: - id_list.append(dict.eos()) - ids = torch.IntTensor(id_list) - else: - ids = dict.encode_line( - line=line, - line_tokenizer=tokenize, - add_if_not_exist=False, - consumer=replaced_consumer, - append_eos=append_eos, - reverse_order=reverse_order, - ) - nseq += 1 - ntok += len(ids) - consumer(ids) - line = f.readline() - return { - "nseq": nseq, - "nunk": sum(replaced.values()), - "ntok": ntok, - "replaced": replaced, - } - - @staticmethod - def binarize_alignments(filename, alignment_parser, consumer, offset=0, end=-1): - nseq = 0 - - with open(PathManager.get_local_path(filename), "r") as f: - f.seek(offset) - line = safe_readline(f) - while line: - if end > 0 and f.tell() > end: - break - ids = alignment_parser(line) - nseq += 1 - consumer(ids) - line = f.readline() - return {"nseq": nseq} - - @staticmethod - def find_offsets(filename, num_chunks): - with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f: - size = os.fstat(f.fileno()).st_size - chunk_size = size // num_chunks - offsets = [0 for _ in range(num_chunks + 1)] - for i in range(1, num_chunks): - f.seek(chunk_size * i) - safe_readline(f) - offsets[i] = f.tell() - return offsets diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/checkpoint_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/checkpoint_utils.py deleted file mode 100644 index ad28ccfc17e1038ee4c9cd9a50a074809ef450d8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/checkpoint_utils.py +++ /dev/null @@ -1,573 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import collections -import logging -import os -import re -import traceback -from collections import OrderedDict -from typing import Union - -import torch -from fairseq.file_io import PathManager -from fairseq.models import FairseqDecoder, FairseqEncoder -from torch.serialization import default_restore_location - - -logger = logging.getLogger(__name__) - - -def save_checkpoint(args, trainer, epoch_itr, val_loss): - from fairseq import distributed_utils, meters - - # only one worker should attempt to create the required dir - if args.distributed_rank == 0: - os.makedirs(args.save_dir, exist_ok=True) - - prev_best = getattr(save_checkpoint, "best", val_loss) - if val_loss is not None: - best_function = max if args.maximize_best_checkpoint_metric else min - save_checkpoint.best = best_function(val_loss, prev_best) - - if args.no_save: - return - - trainer.consolidate_optimizer() - - if not trainer.is_data_parallel_master: - return - - def is_better(a, b): - return a >= b if args.maximize_best_checkpoint_metric else a <= b - - write_timer = meters.StopwatchMeter() - write_timer.start() - - epoch = epoch_itr.epoch - end_of_epoch = epoch_itr.end_of_epoch() - updates = trainer.get_num_updates() - - suffix = getattr(args, "checkpoint_suffix", "") - checkpoint_conds = collections.OrderedDict() - checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = ( - end_of_epoch - and not args.no_epoch_checkpoints - and epoch % args.save_interval == 0 - ) - checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = ( - not end_of_epoch - and args.save_interval_updates > 0 - and updates % args.save_interval_updates == 0 - ) - checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and ( - not hasattr(save_checkpoint, "best") - or is_better(val_loss, save_checkpoint.best) - ) - if val_loss is not None and args.keep_best_checkpoints > 0: - checkpoint_conds[ - "checkpoint.best_{}_{:.2f}.pt".format(args.best_checkpoint_metric, val_loss) - ] = not hasattr(save_checkpoint, "best") or is_better( - val_loss, save_checkpoint.best - ) - checkpoint_conds[ - "checkpoint_last{}.pt".format(suffix) - ] = not args.no_last_checkpoints - - extra_state = {"train_iterator": epoch_itr.state_dict(), "val_loss": val_loss} - if hasattr(save_checkpoint, "best"): - extra_state.update({"best": save_checkpoint.best}) - - checkpoints = [ - os.path.join(args.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond - ] - if len(checkpoints) > 0: - trainer.save_checkpoint(checkpoints[0], extra_state) - for cp in checkpoints[1:]: - PathManager.copy(checkpoints[0], cp, overwrite=True) - - write_timer.stop() - logger.info( - "saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {} seconds)".format( - checkpoints[0], epoch, updates, val_loss, write_timer.sum - ) - ) - - if not end_of_epoch and args.keep_interval_updates > 0: - # remove old checkpoints; checkpoints are sorted in descending order - checkpoints = checkpoint_paths( - args.save_dir, pattern=r"checkpoint_\d+_(\d+)\.pt" - ) - for old_chk in checkpoints[args.keep_interval_updates :]: - if os.path.lexists(old_chk): - os.remove(old_chk) - - if args.keep_last_epochs > 0: - # remove old epoch checkpoints; checkpoints are sorted in descending order - checkpoints = checkpoint_paths(args.save_dir, pattern=r"checkpoint(\d+)\.pt") - for old_chk in checkpoints[args.keep_last_epochs :]: - if os.path.lexists(old_chk): - os.remove(old_chk) - - if args.keep_best_checkpoints > 0: - # only keep the best N checkpoints according to 
validation metric - checkpoints = checkpoint_paths( - args.save_dir, - pattern=r"checkpoint\.best_{}_(\d+\.?\d*)\.pt".format( - args.best_checkpoint_metric - ), - ) - if not args.maximize_best_checkpoint_metric: - checkpoints = checkpoints[::-1] - for old_chk in checkpoints[args.keep_best_checkpoints :]: - if os.path.lexists(old_chk): - os.remove(old_chk) - - -def load_checkpoint(args, trainer, **passthrough_args): - """ - Load a checkpoint and restore the training iterator. - - *passthrough_args* will be passed through to - ``trainer.get_train_iterator``. - """ - reset_optimizer = args.reset_optimizer - reset_lr_scheduler = args.reset_lr_scheduler - optimizer_overrides = eval(args.optimizer_overrides) - reset_meters = args.reset_meters - reset_dataloader = args.reset_dataloader - - if getattr(args, "finetune_from_model", None) is not None and ( - reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader - ): - raise ValueError( - "--finetune-from-model can not be set together with either --reset-optimizer" - " or reset_lr_scheduler or reset_meters or reset_dataloader" - ) - - suffix = getattr(args, "checkpoint_suffix", "") - if ( - args.restore_file == "checkpoint_last.pt" - ): # default value of restore_file is 'checkpoint_last.pt' - checkpoint_path = os.path.join( - args.save_dir, "checkpoint_last{}.pt".format(suffix) - ) - first_launch = not PathManager.exists(checkpoint_path) - if getattr(args, "finetune_from_model", None) is not None and first_launch: - # if there is no last checkpoint to restore, start the finetune from pretrained model - # else just use usual logic to load checkpoint, e.g. restart from last checkpoint and etc. - if PathManager.exists(args.finetune_from_model): - checkpoint_path = args.finetune_from_model - reset_optimizer = True - reset_lr_scheduler = True - reset_meters = True - reset_dataloader = True - logger.info( - f"loading pretrained model from {checkpoint_path}: " - "optimizer, lr scheduler, meters, dataloader will be reset" - ) - else: - raise ValueError( - f"--funetune-from-model {args.finetune_from_model} does not exist" - ) - elif getattr(args, "model_parallel_size", 1) > 1: - checkpoint_path = args.restore_file.replace(".pt", suffix + ".pt") - else: - checkpoint_path = args.restore_file - - if args.restore_file != "checkpoint_last.pt" and getattr( - args, "finetune_from_model", None - ): - raise ValueError( - "--finetune-from-model and --restore-file (non-default value) " - "can not be specified together: " + str(args) - ) - - extra_state = trainer.load_checkpoint( - checkpoint_path, - reset_optimizer, - reset_lr_scheduler, - optimizer_overrides, - reset_meters=reset_meters, - ) - - if ( - extra_state is not None - and "best" in extra_state - and not reset_optimizer - and not reset_meters - ): - save_checkpoint.best = extra_state["best"] - - if extra_state is not None and not reset_dataloader: - # restore iterator from checkpoint - itr_state = extra_state["train_iterator"] - epoch_itr = trainer.get_train_iterator( - epoch=itr_state["epoch"], load_dataset=True, **passthrough_args - ) - epoch_itr.load_state_dict(itr_state) - else: - epoch_itr = trainer.get_train_iterator( - epoch=1, load_dataset=True, **passthrough_args - ) - - trainer.lr_step(epoch_itr.epoch) - - return extra_state, epoch_itr - - -def load_checkpoint_to_cpu(path, arg_overrides=None): - """Loads a checkpoint to CPU (with upgrading for backward compatibility).""" - with open(PathManager.get_local_path(path), "rb") as f: - state = torch.load( - path, map_location=lambda s, 
l: default_restore_location(s, "cpu") - ) - - args = state["args"] - if arg_overrides is not None: - for arg_name, arg_val in arg_overrides.items(): - setattr(args, arg_name, arg_val) - state = _upgrade_state_dict(state) - return state - - -def load_model_ensemble( - filenames, arg_overrides=None, task=None, strict=True, suffix="", num_shards=1 -): - """Loads an ensemble of models. - - Args: - filenames (List[str]): checkpoint files to load - arg_overrides (Dict[str,Any], optional): override model args that - were used during model training - task (fairseq.tasks.FairseqTask, optional): task to use for loading - """ - assert not ( - strict and num_shards > 1 - ), "Cannot load state dict with strict=True and checkpoint shards > 1" - ensemble, args, _task = load_model_ensemble_and_task( - filenames, - arg_overrides, - task, - strict, - suffix, - num_shards, - ) - return ensemble, args - - -def load_model_ensemble_and_task( - filenames, arg_overrides=None, task=None, strict=True, suffix="", num_shards=1 -): - from fairseq import tasks - - assert not ( - strict and num_shards > 1 - ), "Cannot load state dict with strict=True and checkpoint shards > 1" - ensemble = [] - for filename in filenames: - orig_filename = filename - for shard_idx in range(num_shards): - if num_shards == 1: - filename = filename.replace(".pt", suffix + ".pt") - else: - filename = orig_filename[:-3] + f"_part{shard_idx}.pt" - if not PathManager.exists(filename): - raise IOError("Model file not found: {}".format(filename)) - state = load_checkpoint_to_cpu(filename, arg_overrides) - if shard_idx == 0: - args = state["args"] - if task is None: - task = tasks.setup_task(args) - - # build model for ensemble - model = task.build_model(args) - model.load_state_dict(state["model"], strict=strict, args=args) - ensemble.append(model) - return ensemble, args, task - - -def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt"): - """Retrieves all checkpoints found in `path` directory. - - Checkpoints are identified by matching filename to the specified pattern. If - the pattern contains groups, the result will be sorted by the first group in - descending order. 
- """ - pt_regexp = re.compile(pattern) - files = os.listdir(path) - - entries = [] - for i, f in enumerate(files): - m = pt_regexp.fullmatch(f) - if m is not None: - idx = float(m.group(1)) if len(m.groups()) > 0 else i - entries.append((idx, m.group(0))) - return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)] - - -def torch_persistent_save(obj, f): - if isinstance(f, str): - with PathManager.open(f, "wb") as h: - torch_persistent_save(obj, h) - return - for i in range(3): - try: - return torch.save(obj, f) - except Exception: - if i == 2: - logger.error(traceback.format_exc()) - - -def save_state( - filename, - args, - model_state_dict, - criterion, - optimizer, - lr_scheduler, - num_updates, - optim_history=None, - extra_state=None, -): - from fairseq import utils - - if optim_history is None: - optim_history = [] - if extra_state is None: - extra_state = {} - state_dict = { - "args": args, - "model": model_state_dict or {}, - "optimizer_history": optim_history - + [ - { - "criterion_name": criterion.__class__.__name__, - "optimizer_name": optimizer.__class__.__name__, - "lr_scheduler_state": lr_scheduler.state_dict(), - "num_updates": num_updates, - } - ], - "extra_state": extra_state, - } - if utils.has_parameters(criterion): - state_dict["criterion"] = criterion.state_dict() - if not args.no_save_optimizer_state: - state_dict["last_optimizer_state"] = optimizer.state_dict() - - # convert all state to CPU - state_dict = utils.move_to_cpu(state_dict) - - with PathManager.open(filename, "wb") as f: - torch_persistent_save(state_dict, f) - - -def _upgrade_state_dict(state): - """Helper for upgrading old model checkpoints.""" - from fairseq import models, registry, tasks - - # add optimizer_history - if "optimizer_history" not in state: - state["optimizer_history"] = [ - {"criterion_name": "CrossEntropyCriterion", "best_loss": state["best_loss"]} - ] - state["last_optimizer_state"] = state["optimizer"] - del state["optimizer"] - del state["best_loss"] - # move extra_state into sub-dictionary - if "epoch" in state and "extra_state" not in state: - state["extra_state"] = { - "epoch": state["epoch"], - "batch_offset": state["batch_offset"], - "val_loss": state["val_loss"], - } - del state["epoch"] - del state["batch_offset"] - del state["val_loss"] - # reduce optimizer history's memory usage (only keep the last state) - if "optimizer" in state["optimizer_history"][-1]: - state["last_optimizer_state"] = state["optimizer_history"][-1]["optimizer"] - for optim_hist in state["optimizer_history"]: - del optim_hist["optimizer"] - # record the optimizer class name - if "optimizer_name" not in state["optimizer_history"][-1]: - state["optimizer_history"][-1]["optimizer_name"] = "FairseqNAG" - # move best_loss into lr_scheduler_state - if "lr_scheduler_state" not in state["optimizer_history"][-1]: - state["optimizer_history"][-1]["lr_scheduler_state"] = { - "best": state["optimizer_history"][-1]["best_loss"] - } - del state["optimizer_history"][-1]["best_loss"] - # keep track of number of updates - if "num_updates" not in state["optimizer_history"][-1]: - state["optimizer_history"][-1]["num_updates"] = 0 - # old model checkpoints may not have separate source/target positions - if hasattr(state["args"], "max_positions") and not hasattr( - state["args"], "max_source_positions" - ): - state["args"].max_source_positions = state["args"].max_positions - state["args"].max_target_positions = state["args"].max_positions - # use stateful training data iterator - if "train_iterator" not in 
state["extra_state"]: - state["extra_state"]["train_iterator"] = { - "epoch": state["extra_state"]["epoch"], - "iterations_in_epoch": state["extra_state"].get("batch_offset", 0), - } - # default to translation task - if not hasattr(state["args"], "task"): - state["args"].task = "translation" - # --raw-text and --lazy-load are deprecated - if getattr(state["args"], "raw_text", False): - state["args"].dataset_impl = "raw" - elif getattr(state["args"], "lazy_load", False): - state["args"].dataset_impl = "lazy" - # epochs start at 1 - if state["extra_state"]["train_iterator"] is not None: - state["extra_state"]["train_iterator"]["epoch"] = max( - state["extra_state"]["train_iterator"].get("epoch", 1), - 1, - ) - - # set any missing default values in the task, model or other registries - registry.set_defaults(state["args"], tasks.TASK_REGISTRY[state["args"].task]) - registry.set_defaults(state["args"], models.ARCH_MODEL_REGISTRY[state["args"].arch]) - for registry_name, REGISTRY in registry.REGISTRIES.items(): - choice = getattr(state["args"], registry_name, None) - if choice is not None: - cls = REGISTRY["registry"][choice] - registry.set_defaults(state["args"], cls) - - return state - - -def prune_state_dict(state_dict, args): - """Prune the given state_dict if desired for LayerDrop - (https://arxiv.org/abs/1909.11556). - - Training with LayerDrop allows models to be robust to pruning at inference - time. This function prunes state_dict to allow smaller models to be loaded - from a larger model and re-maps the existing state_dict for this to occur. - - It's called by functions that load models from checkpoints and does not - need to be called directly. - """ - if not args or args.arch == "ptt_transformer": - # args should not be none, but don't crash if it is. - return state_dict - - encoder_layers_to_keep = ( - args.encoder_layers_to_keep if "encoder_layers_to_keep" in vars(args) else None - ) - decoder_layers_to_keep = ( - args.decoder_layers_to_keep if "decoder_layers_to_keep" in vars(args) else None - ) - - if not encoder_layers_to_keep and not decoder_layers_to_keep: - return state_dict - - # apply pruning - logger.info( - "Pruning model to specified layer configuration - this works best if the model was trained with LayerDrop" - ) - - def create_pruning_pass(layers_to_keep, layer_name): - keep_layers = sorted( - [int(layer_string) for layer_string in layers_to_keep.split(",")] - ) - mapping_dict = {} - for i in range(len(keep_layers)): - mapping_dict[str(keep_layers[i])] = str(i) - - regex = re.compile(r"^{layer}.*\.layers\.(\d+)".format(layer=layer_name)) - return {"substitution_regex": regex, "mapping_dict": mapping_dict} - - pruning_passes = [] - if encoder_layers_to_keep: - pruning_passes.append(create_pruning_pass(encoder_layers_to_keep, "encoder")) - if decoder_layers_to_keep: - pruning_passes.append(create_pruning_pass(decoder_layers_to_keep, "decoder")) - - new_state_dict = {} - for layer_name in state_dict.keys(): - match = re.search(r"\.layers\.(\d+)\.", layer_name) - # if layer has no number in it, it is a supporting layer, such as an - # embedding - if not match: - new_state_dict[layer_name] = state_dict[layer_name] - continue - - # otherwise, layer should be pruned. 
- original_layer_number = match.group(1) - # figure out which mapping dict to replace from - for pruning_pass in pruning_passes: - if original_layer_number in pruning_pass["mapping_dict"] and pruning_pass[ - "substitution_regex" - ].search(layer_name): - new_layer_number = pruning_pass["mapping_dict"][original_layer_number] - substitution_match = pruning_pass["substitution_regex"].search( - layer_name - ) - new_state_key = ( - layer_name[: substitution_match.start(1)] - + new_layer_number - + layer_name[substitution_match.end(1) :] - ) - new_state_dict[new_state_key] = state_dict[layer_name] - - # Since layers are now pruned, *_layers_to_keep are no longer needed. - # This is more of "It would make it work fix" rather than a proper fix. - if "encoder_layers_to_keep" in vars(args): - args.encoder_layers_to_keep = None - if "decoder_layers_to_keep" in vars(args): - args.decoder_layers_to_keep = None - - return new_state_dict - - -def load_pretrained_component_from_model( - component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str -): - """ - Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the - provided `component` object. If state_dict fails to load, there may be a - mismatch in the architecture of the corresponding `component` found in the - `checkpoint` file. - """ - if not PathManager.exists(checkpoint): - raise IOError("Model file not found: {}".format(checkpoint)) - state = load_checkpoint_to_cpu(checkpoint) - if isinstance(component, FairseqEncoder): - component_type = "encoder" - elif isinstance(component, FairseqDecoder): - component_type = "decoder" - else: - raise ValueError( - "component to load must be either a FairseqEncoder or " - "FairseqDecoder. Loading other component types are not supported." - ) - component_state_dict = OrderedDict() - for key in state["model"].keys(): - if key.startswith(component_type): - # encoder.input_layers.0.0.weight --> input_layers.0.0.weight - component_subkey = key[len(component_type) + 1 :] - component_state_dict[component_subkey] = state["model"][key] - component.load_state_dict(component_state_dict, strict=True) - return component - - -def verify_checkpoint_directory(save_dir: str) -> None: - if not os.path.exists(save_dir): - os.makedirs(save_dir, exist_ok=True) - temp_file_path = os.path.join(save_dir, "dummy") - try: - with open(temp_file_path, "w"): - pass - except OSError as e: - logger.warning( - "Unable to access checkpoint save directory: {}".format(save_dir) - ) - raise e - else: - os.remove(temp_file_path) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libbleu/libbleu.cpp b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libbleu/libbleu.cpp deleted file mode 100644 index 3cf2d65b6d16e19ea299ebe43c9c25e3481d4524..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libbleu/libbleu.cpp +++ /dev/null @@ -1,141 +0,0 @@ -/** - * Copyright 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include - -typedef struct -{ - size_t reflen; - size_t predlen; - size_t match1; - size_t count1; - size_t match2; - size_t count2; - size_t match3; - size_t count3; - size_t match4; - size_t count4; -} bleu_stat; - -// left trim (remove pad) -void bleu_ltrim(size_t* len, int** sent, int pad) { - size_t start = 0; - while(start < *len) { - if (*(*sent + start) != pad) { break; } - start++; - } - *sent += start; - *len -= start; -} - -// right trim remove (eos) -void bleu_rtrim(size_t* len, int** sent, int pad, int eos) { - size_t end = *len - 1; - while (end > 0) { - if (*(*sent + end) != eos && *(*sent + end) != pad) { break; } - end--; - } - *len = end + 1; -} - -// left and right trim -void bleu_trim(size_t* len, int** sent, int pad, int eos) { - bleu_ltrim(len, sent, pad); - bleu_rtrim(len, sent, pad, eos); -} - -size_t bleu_hash(int len, int* data) { - size_t h = 14695981039346656037ul; - size_t prime = 0x100000001b3; - char* b = (char*) data; - size_t blen = sizeof(int) * len; - - while (blen-- > 0) { - h ^= *b++; - h *= prime; - } - - return h; -} - -void bleu_addngram( - size_t *ntotal, size_t *nmatch, size_t n, - size_t reflen, int* ref, size_t predlen, int* pred) { - - if (predlen < n) { return; } - - predlen = predlen - n + 1; - (*ntotal) += predlen; - - if (reflen < n) { return; } - - reflen = reflen - n + 1; - - std::map count; - while (predlen > 0) { - size_t w = bleu_hash(n, pred++); - count[w]++; - predlen--; - } - - while (reflen > 0) { - size_t w = bleu_hash(n, ref++); - if (count[w] > 0) { - (*nmatch)++; - count[w] -=1; - } - reflen--; - } -} - -extern "C" { - -#ifdef _WIN64 -__declspec(dllexport) -#endif -void bleu_zero_init(bleu_stat* stat) { - std::memset(stat, 0, sizeof(bleu_stat)); -} - -#ifdef _WIN64 -__declspec(dllexport) -#endif -void bleu_one_init(bleu_stat* stat) { - bleu_zero_init(stat); - stat->count1 = 0; - stat->count2 = 1; - stat->count3 = 1; - stat->count4 = 1; - stat->match1 = 0; - stat->match2 = 1; - stat->match3 = 1; - stat->match4 = 1; -} - -#ifdef _WIN64 -__declspec(dllexport) -#endif -void bleu_add( - bleu_stat* stat, - size_t reflen, int* ref, size_t predlen, int* pred, int pad, int eos) { - - bleu_trim(&reflen, &ref, pad, eos); - bleu_trim(&predlen, &pred, pad, eos); - stat->reflen += reflen; - stat->predlen += predlen; - - bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred); - bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred); - bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred); - bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred); -} - -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libbleu/module.cpp b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libbleu/module.cpp deleted file mode 100644 index 8ed9a84b1c028bfe9ed1d45be6857b6e79b3459f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libbleu/module.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Copyright 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include - - -static PyMethodDef method_def[] = { - {NULL, NULL, 0, NULL} -}; - -static struct PyModuleDef module_def = { - PyModuleDef_HEAD_INIT, - "libbleu", /* name of module */ - NULL, /* module documentation, may be NULL */ - -1, /* size of per-interpreter state of the module, - or -1 if the module keeps state in global variables. */ - method_def -}; - - -#if PY_MAJOR_VERSION == 2 -PyMODINIT_FUNC init_libbleu() -#else -PyMODINIT_FUNC PyInit_libbleu() -#endif -{ - PyObject *m = PyModule_Create(&module_def); - if (!m) { - return NULL; - } - return m; -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat/edit_dist.cpp b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat/edit_dist.cpp deleted file mode 100644 index 6bc6a937d6abde0cd49769c4def69ac0560096bc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat/edit_dist.cpp +++ /dev/null @@ -1,231 +0,0 @@ -/** - * Copyright 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include // @manual=//caffe2:torch_extension -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace ::std; - -vector> edit_distance2_with_dp( - vector& x, - vector& y) { - uint32_t lx = x.size(); - uint32_t ly = y.size(); - vector> d(lx + 1, vector(ly + 1)); - for (uint32_t i = 0; i < lx + 1; i++) { - d[i][0] = i; - } - for (uint32_t j = 0; j < ly + 1; j++) { - d[0][j] = j; - } - for (uint32_t i = 1; i < lx + 1; i++) { - for (uint32_t j = 1; j < ly + 1; j++) { - d[i][j] = - min(min(d[i - 1][j], d[i][j - 1]) + 1, - d[i - 1][j - 1] + 2 * (x.at(i - 1) == y.at(j - 1) ? 
0 : 1)); - } - } - return d; -} - -vector> edit_distance2_backtracking( - vector>& d, - vector& x, - vector& y, - uint32_t terminal_symbol) { - vector seq; - vector> edit_seqs(x.size() + 2, vector()); - /* - edit_seqs: - 0~x.size() cell is the insertion sequences - last cell is the delete sequence - */ - - if (x.size() == 0) { - edit_seqs.at(0) = y; - return edit_seqs; - } - - uint32_t i = d.size() - 1; - uint32_t j = d.at(0).size() - 1; - - while ((i >= 0) && (j >= 0)) { - if ((i == 0) && (j == 0)) { - break; - } - - if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) { - seq.push_back(1); // insert - seq.push_back(y.at(j - 1)); - j--; - } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) { - seq.push_back(2); // delete - seq.push_back(x.at(i - 1)); - i--; - } else { - seq.push_back(3); // keep - seq.push_back(x.at(i - 1)); - i--; - j--; - } - } - - uint32_t prev_op, op, s, word; - prev_op = 0, s = 0; - for (uint32_t k = 0; k < seq.size() / 2; k++) { - op = seq.at(seq.size() - 2 * k - 2); - word = seq.at(seq.size() - 2 * k - 1); - if (prev_op != 1) { - s++; - } - if (op == 1) // insert - { - edit_seqs.at(s - 1).push_back(word); - } else if (op == 2) // delete - { - edit_seqs.at(x.size() + 1).push_back(1); - } else { - edit_seqs.at(x.size() + 1).push_back(0); - } - - prev_op = op; - } - - for (uint32_t k = 0; k < edit_seqs.size(); k++) { - if (edit_seqs[k].size() == 0) { - edit_seqs[k].push_back(terminal_symbol); - } - } - return edit_seqs; -} - -vector> edit_distance2_backtracking_with_delete( - vector>& d, - vector& x, - vector& y, - uint32_t terminal_symbol, - uint32_t deletion_symbol) { - vector seq; - vector> edit_seqs(x.size() + 1, vector()); - /* - edit_seqs: - 0~x.size() cell is the insertion sequences - last cell is the delete sequence - */ - - if (x.size() == 0) { - edit_seqs.at(0) = y; - return edit_seqs; - } - - uint32_t i = d.size() - 1; - uint32_t j = d.at(0).size() - 1; - - while ((i >= 0) && (j >= 0)) { - if ((i == 0) && (j == 0)) { - break; - } - - if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) { - seq.push_back(1); // insert - seq.push_back(y.at(j - 1)); - j--; - } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) { - seq.push_back(2); // delete - seq.push_back(x.at(i - 1)); - i--; - } else { - seq.push_back(3); // keep - seq.push_back(x.at(i - 1)); - i--; - j--; - } - } - - uint32_t prev_op, op, s, word; - prev_op = 0, s = 0; - for (uint32_t k = 0; k < seq.size() / 2; k++) { - op = seq.at(seq.size() - 2 * k - 2); - word = seq.at(seq.size() - 2 * k - 1); - if (prev_op != 1) { - s++; - } - if (op == 1) // insert - { - edit_seqs.at(s - 1).push_back(word); - } else if (op == 2) // delete - { - edit_seqs.at(s - 1).push_back(deletion_symbol); - } - - prev_op = op; - } - - for (uint32_t k = 0; k < edit_seqs.size(); k++) { - if (edit_seqs.at(k).size() == 0) { - edit_seqs.at(k).push_back(terminal_symbol); - } - } - return edit_seqs; -} - -vector compute_ed2( - vector>& xs, - vector>& ys) { - vector distances(xs.size()); - for (uint32_t i = 0; i < xs.size(); i++) { - vector> d = edit_distance2_with_dp(xs.at(i), ys.at(i)); - distances.at(i) = d.at(xs.at(i).size()).at(ys.at(i).size()); - } - return distances; -} - -vector>> suggested_ed2_path( - vector>& xs, - vector>& ys, - uint32_t terminal_symbol) { - vector>> seq(xs.size()); - for (uint32_t i = 0; i < xs.size(); i++) { - vector> d = edit_distance2_with_dp(xs.at(i), ys.at(i)); - seq.at(i) = - edit_distance2_backtracking(d, xs.at(i), ys.at(i), terminal_symbol); - } - return seq; -} - -vector>> 
suggested_ed2_path_with_delete( - vector>& xs, - vector>& ys, - uint32_t terminal_symbol, - uint32_t deletion_symbol) { - vector>> seq(xs.size()); - for (uint32_t i = 0; i < xs.size(); i++) { - vector> d = edit_distance2_with_dp(xs.at(i), ys.at(i)); - seq.at(i) = edit_distance2_backtracking_with_delete( - d, xs.at(i), ys.at(i), terminal_symbol, deletion_symbol); - } - return seq; -} - -PYBIND11_MODULE(libnat, m) { - m.def("compute_ed2", &compute_ed2, "compute_ed2"); - m.def("suggested_ed2_path", &suggested_ed2_path, "suggested_ed2_path"); - m.def( - "suggested_ed2_path_with_delete", - &suggested_ed2_path_with_delete, - "suggested_ed2_path_with_delete"); -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/binding.cpp b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/binding.cpp deleted file mode 100644 index aaa6244d5c6819acfae5f408280205661a3389ae..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/binding.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Copyright 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the license found in the - * LICENSE file in the root directory of this source tree. - */ - -/* - This code is partially adpoted from https://github.com/1ytic/pytorch-edit-distance - */ - -#include "edit_dist.h" -#include - -#ifndef TORCH_CHECK -#define TORCH_CHECK AT_CHECK -#endif - -#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - - -torch::Tensor LevenshteinDistance( - torch::Tensor source, - torch::Tensor target, - torch::Tensor source_length, - torch::Tensor target_length) { - - CHECK_INPUT(source); - CHECK_INPUT(target); - CHECK_INPUT(source_length); - CHECK_INPUT(target_length); - return LevenshteinDistanceCuda(source, target, source_length, target_length); -} - -torch::Tensor GenerateDeletionLabel( - torch::Tensor source, - torch::Tensor operations) { - - CHECK_INPUT(source); - CHECK_INPUT(operations); - return GenerateDeletionLabelCuda(source, operations); -} - -std::pair GenerateInsertionLabel( - torch::Tensor target, - torch::Tensor operations) { - - CHECK_INPUT(target); - CHECK_INPUT(operations); - return GenerateInsertionLabelCuda(target, operations); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("levenshtein_distance", &LevenshteinDistance, "Levenshtein distance"); - m.def("generate_deletion_labels", &GenerateDeletionLabel, "Generate Deletion Label"); - m.def("generate_insertion_labels", &GenerateInsertionLabel, "Generate Insertion Label"); -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/edit_dist.cu b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/edit_dist.cu deleted file mode 100644 index 22de16b270851227348c43d6adfb763c8a325df6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/edit_dist.cu +++ /dev/null @@ -1,332 +0,0 @@ -/** -* Copyright 2017-present, Facebook, Inc. -* All rights reserved. -* -* This source code is licensed under the license found in the -* LICENSE file in the root directory of this source tree. 
-*/ - -#include "edit_dist.h" -#include -#include -#include -#include -#include // std::pair - -template -__global__ void generate_deletion_label_kernel( - const scalar_t* __restrict__ source, - const size_t source_size, - const size_t operation_size, - int* __restrict__ operations, - int* __restrict__ labels) { - - const int index = blockIdx.x; - const int offset = index * operation_size; - const int offset_label = index * source_size; - - for (int i = 0; i < source_size; i++) { - labels[offset_label + i] = 0; - } - - int k = 0; - for (int i = 0; i < operation_size; i++){ - if (operations[offset + i] == 0){ - break; - } else if (operations[offset + i] == 1){ - continue; - } else { - labels[offset_label + k] = 3 - operations[offset + i]; - k++; - } - } -} - -template -__global__ void generate_insertion_label_kernel( - const scalar_t* __restrict__ target, - const size_t target_size, - const size_t operation_size, - int* __restrict__ operations, - int* __restrict__ labels, - int* __restrict__ masks) { - - const int index = blockIdx.x; - const int offset = index * operation_size; - const int offset_label = index * target_size; - - int k = 0; - int u = 0; - int m = 0; - - for (int i = 0; i < target_size; i++) { - labels[offset_label + i] = 0; - masks[offset_label + i] = 0; - } - - for (int i = 0; i < operation_size-1; i++){ - if (operations[offset + i] == 0){ - break; - } else if (operations[offset + i] == 2){ - continue; - } else if (operations[offset + i] == 1){ - masks[offset_label + m] = 1; - u++; m++; - } else { - labels[offset_label + k] = u; - masks[offset_label + m] = 0; - k++; m++; - u = 0; - } - } -} - -template -__global__ void levenshtein_distance_kernel( - const scalar_t* __restrict__ source, - const scalar_t* __restrict__ target, - const int* __restrict__ source_length, - const int* __restrict__ target_length, - const size_t source_size, - const size_t target_size, - int* __restrict__ operations, - int* __restrict__ errors_curr) { - - const int index = blockIdx.x; - const int offset = index * (source_size + target_size); - const int d = index * (source_size + 1) * (target_size + 1); - const int t = target_size + 1; - - auto err_idx = [d, t](int i, int j) { return d + i * t + j; }; - auto opt_idx = [offset](int k) { return offset + k; }; - - const int hyp_len = source_length[index]; - const int ref_len = target_length[index]; - const scalar_t* hyp_begin = source + index * source_size; - const scalar_t* ref_begin = target + index * target_size; - - // dynamic programming - for (int i = 0; i <= hyp_len; i++){ - errors_curr[err_idx(i, 0)] = i; - } - for (int j = 0; j <= ref_len; j++){ - errors_curr[err_idx(0, j)] = j; - } - for (int i = 1; i <= hyp_len; i++){ - for (int j = 1; j <= ref_len; j++){ - errors_curr[err_idx(i, j)] = min( - min( - errors_curr[err_idx(i-1, j)], - errors_curr[err_idx(i, j-1)] - ) + 1, - errors_curr[err_idx(i-1, j-1)] + 2 * ( - *(hyp_begin+i-1) == *(ref_begin+j-1) ? 
0 : 1 - ) - ); - } - } - - // back-tracing - int i = hyp_len; - int j = ref_len; - int o = hyp_len + ref_len; - - for (int k = 0; k < source_size + target_size; k++) { - operations[opt_idx(k)] = 0; - } - - while ((i >= 0) && (j >= 0)) { - if ((i == 0) && (j == 0)) { - break; - } - - if ((j > 0) && (errors_curr[err_idx(i, j-1)] < errors_curr[err_idx(i, j)])) { - o--; operations[opt_idx(o)] = 1; j--; // insertion - } else if ((i > 0) && (errors_curr[err_idx(i-1, j)] < errors_curr[err_idx(i, j)])) { - o--; operations[opt_idx(o)] = 2; i--; // deletion - } else { - o--; operations[opt_idx(o)] = 3; i--; j--; // do nothing - } - } - - // moving to the left - for (int k = 0; k < hyp_len + ref_len; k++) { - if (k + o < hyp_len + ref_len){ - operations[opt_idx(k)] = operations[opt_idx(k+o)]; - } else{ - operations[opt_idx(k)] = 0; // padding - } - } - -} - -template -__global__ void faster_levenshtein_distance_kernel( - const scalar_t* __restrict__ source, - const scalar_t* __restrict__ target, - const int* __restrict__ source_length, - const int* __restrict__ target_length, - const size_t source_size, - const size_t target_size, - int* __restrict__ operations) { - - extern __shared__ short errors[]; - auto errors_curr = errors; - - const int index = blockIdx.x; - const int offset = index * (source_size + target_size); - const int t = target_size + 1; - - auto err_idx = [t](int i, int j) { return i * t + j; }; - auto opt_idx = [offset](int k) { return offset + k; }; - - const int hyp_len = source_length[index]; - const int ref_len = target_length[index]; - const scalar_t* hyp_begin = source + index * source_size; - const scalar_t* ref_begin = target + index * target_size; - - // dynamic programming - for (int i = 0; i <= hyp_len; i++){ - errors_curr[err_idx(i, 0)] = i; - } - for (int j = 0; j <= ref_len; j++){ - errors_curr[err_idx(0, j)] = j; - } - for (int i = 1; i <= hyp_len; i++){ - for (int j = 1; j <= ref_len; j++){ - errors_curr[err_idx(i, j)] = min( - min( - errors_curr[err_idx(i-1, j)], - errors_curr[err_idx(i, j-1)] - ) + 1, - errors_curr[err_idx(i-1, j-1)] + 2 * ( - *(hyp_begin+i-1) == *(ref_begin+j-1) ? 
0 : 1 - ) - ); - } - } - - // back-tracing - int i = hyp_len; - int j = ref_len; - int o = hyp_len + ref_len; - - for (int k = 0; k < source_size + target_size; k++) { - operations[opt_idx(k)] = 0; - } - - while ((i >= 0) && (j >= 0)) { - if ((i == 0) && (j == 0)) { - break; - } - - if ((j > 0) && (errors_curr[err_idx(i, j-1)] < errors_curr[err_idx(i, j)])) { - o--; operations[opt_idx(o)] = 1; j--; // insertion - } else if ((i > 0) && (errors_curr[err_idx(i-1, j)] < errors_curr[err_idx(i, j)])) { - o--; operations[opt_idx(o)] = 2; i--; // deletion - } else { - o--; operations[opt_idx(o)] = 3; i--; j--; // do nothing - } - } - - // moving to the left - for (int k = 0; k < hyp_len + ref_len; k++) { - if (k + o < hyp_len + ref_len){ - operations[opt_idx(k)] = operations[opt_idx(k+o)]; - } else{ - operations[opt_idx(k)] = 0; // padding - } - } - -} - - -torch::Tensor GenerateDeletionLabelCuda( - torch::Tensor source, - torch::Tensor operations) { - - const auto batch_size = source.size(0); - at::TensorOptions options(source.device()); - options = options.dtype(at::ScalarType::Int); - auto labels = torch::empty({batch_size, source.size(1)}, options); - auto stream = at::cuda::getCurrentCUDAStream(source.device().index()); - - AT_DISPATCH_ALL_TYPES(source.scalar_type(), "generate_deletion_labels", ([&] { - generate_deletion_label_kernel<<>>( - source.data_ptr(), - source.size(1), - operations.size(1), - operations.data_ptr(), - labels.data_ptr()); - })); - - return labels; -} - -std::pair GenerateInsertionLabelCuda( - torch::Tensor target, - torch::Tensor operations) { - -const auto batch_size = target.size(0); -at::TensorOptions options(target.device()); -options = options.dtype(at::ScalarType::Int); -auto labels = torch::empty({batch_size, target.size(1)}, options); -auto masks = torch::empty({batch_size, target.size(1)}, options); -auto stream = at::cuda::getCurrentCUDAStream(target.device().index()); - -AT_DISPATCH_ALL_TYPES(target.scalar_type(), "generate_insertion_labels", ([&] { - generate_insertion_label_kernel<<>>( - target.data_ptr(), - target.size(1), - operations.size(1), - operations.data_ptr(), - labels.data_ptr(), - masks.data_ptr()); -})); - -return std::make_pair(labels, masks); -} - - -torch::Tensor LevenshteinDistanceCuda( - torch::Tensor source, - torch::Tensor target, - torch::Tensor source_length, - torch::Tensor target_length) { - - const auto batch_size = source.size(0); - const auto shared_size = (source.size(1) + 1) * (target.size(1) + 1) * sizeof(short); - - at::TensorOptions options(source.device()); - options = options.dtype(at::ScalarType::Int); - auto operations = torch::empty({batch_size, source.size(1) + target.size(1)}, options); - auto stream = at::cuda::getCurrentCUDAStream(source.device().index()); - - if (shared_size > 40000) { - auto distances = torch::empty({batch_size, (source.size(1) + 1) * (target.size(1) + 1)}, options); - AT_DISPATCH_ALL_TYPES(source.scalar_type(), "levenshtein_distance", ([&] { - levenshtein_distance_kernel<<>>( - source.data_ptr(), - target.data_ptr(), - source_length.data_ptr(), - target_length.data_ptr(), - source.size(1), - target.size(1), - operations.data_ptr(), - distances.data_ptr()); - })); - } else { - AT_DISPATCH_ALL_TYPES(source.scalar_type(), "faster_levenshtein_distance", ([&] { - faster_levenshtein_distance_kernel<<>>( - source.data_ptr(), - target.data_ptr(), - source_length.data_ptr(), - target_length.data_ptr(), - source.size(1), - target.size(1), - operations.data_ptr()); - })); - } - - return operations; -} diff 
--git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/edit_dist.h b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/edit_dist.h deleted file mode 100644 index e3506cd34ddaa35bb724fe64a459bad8046b9a34..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/clib/libnat_cuda/edit_dist.h +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -torch::Tensor LevenshteinDistanceCuda( - torch::Tensor source, - torch::Tensor target, - torch::Tensor source_length, - torch::Tensor target_length); - -torch::Tensor GenerateDeletionLabelCuda( - torch::Tensor source, - torch::Tensor operations); - -std::pair GenerateInsertionLabelCuda( - torch::Tensor source, - torch::Tensor operations); diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/__init__.py deleted file mode 100644 index a7eb5f6f3c272c86b15fdf697f72ee9e9382907f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -"""isort:skip_file""" - -import importlib -import os -from argparse import Namespace -from typing import Union - -from fairseq import registry -from fairseq.criterions.fairseq_criterion import ( # noqa - FairseqCriterion, - LegacyFairseqCriterion, -) -from omegaconf import DictConfig - - -( - build_criterion_, - register_criterion, - CRITERION_REGISTRY, - CRITERION_DATACLASS_REGISTRY, -) = registry.setup_registry( - "--criterion", base_class=FairseqCriterion, default="cross_entropy" -) - - -def build_criterion(criterion_cfg: Union[DictConfig, Namespace], task): - return build_criterion_(criterion_cfg, task) - - -# automatically import any Python files in the criterions/ directory -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - file_name = file[: file.find(".py")] - importlib.import_module("fairseq.criterions." + file_name) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/adaptive_loss.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/adaptive_loss.py deleted file mode 100644 index 74ba37c321e7ba95c1cd97b5d9f0396dd313b4ee..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/adaptive_loss.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
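The registry set up in criterions/__init__.py above is what makes --criterion=<name> resolve: register_criterion records the class, and the trailing loop imports every non-underscore module in the package so the decorators actually run. A minimal sketch of a plugin criterion under those assumptions (MyCriterion and --my-scale are hypothetical names, not part of this tree):

from fairseq.criterions import FairseqCriterion, register_criterion


@register_criterion("my_criterion")
class MyCriterion(FairseqCriterion):
    def __init__(self, task, my_scale):
        super().__init__(task)
        self.my_scale = my_scale

    @staticmethod
    def add_args(parser):
        # hypothetical flag; argparse stores it as args.my_scale
        parser.add_argument("--my-scale", default=1.0, type=float,
                            help="scale applied to the loss")

    def forward(self, model, sample, reduce=True):
        # compute and return (loss, sample_size, logging_output) here
        raise NotImplementedError

Dropping such a module into fairseq/criterions/ is enough for the name to appear, and the base FairseqCriterion.build_criterion (deleted further below) would fill my_scale from args.my_scale.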
- -import math -from dataclasses import dataclass - -import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion -from fairseq.dataclass import FairseqDataclass -from fairseq.dataclass.constants import DDP_BACKEND_CHOICES -from omegaconf import II - - -@dataclass -class AdaptiveLossConfig(FairseqDataclass): - sentence_avg: bool = II("params.optimization.sentence_avg") - ddp_backend: DDP_BACKEND_CHOICES = II("params.distributed_training.ddp_backend") - - -@register_criterion("adaptive_loss", dataclass=AdaptiveLossConfig) -class AdaptiveLoss(FairseqCriterion): - """This is an implementation of the loss function accompanying the adaptive softmax approximation for - graphical processing units (GPU), described in the paper "Efficient softmax approximation for GPUs" - (http://arxiv.org/abs/1609.04309).""" - - def __init__(self, task, sentence_avg): - super().__init__(task) - self.sentence_avg = sentence_avg - - @classmethod - def build_criterion(cls, args, task): - if getattr(args, "ddp_backend", None) == "c10d": - raise Exception( - "AdaptiveLoss is not compatible with the c10d " - "version of DistributedDataParallel. Please use " - "`--ddp-backend=no_c10d` instead." - ) - return cls(task, args.sentence_avg) - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. - - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - - assert ( - hasattr(model.decoder, "adaptive_softmax") - and model.decoder.adaptive_softmax is not None - ) - adaptive_softmax = model.decoder.adaptive_softmax - - net_output = model(**sample["net_input"]) - orig_target = model.get_targets(sample, net_output) - - nsentences = orig_target.size(0) - orig_target = orig_target.view(-1) - - bsz = orig_target.size(0) - - logits, target = adaptive_softmax(net_output[0], orig_target) - assert len(target) == len(logits) - - loss = net_output[0].new(1 if reduce else bsz).zero_() - - for i in range(len(target)): - if target[i] is not None: - assert target[i].min() >= 0 and target[i].max() <= logits[i].size(1) - loss += F.cross_entropy( - logits[i], - target[i], - ignore_index=self.padding_idx, - reduction="sum" if reduce else "none", - ) - - orig = utils.strip_pad(orig_target, self.padding_idx) - ntokens = orig.numel() - sample_size = sample["target"].size(0) if self.sentence_avg else ntokens - logging_output = { - "loss": loss.data, - "ntokens": ntokens, - "nsentences": nsentences, - "sample_size": sample_size, - } - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) - ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) - sample_size = utils.item( - sum(log.get("sample_size", 0) for log in logging_outputs) - ) - - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - if sample_size != ntokens: - metrics.log_scalar( - "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 - ) - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) - ) - else: - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - 
Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/composite_loss.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/composite_loss.py deleted file mode 100644 index 98e835fa6e4c0bcad062df9c519701bf795c98be..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/composite_loss.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq import utils -from fairseq.criterions import LegacyFairseqCriterion, register_criterion -from torch import nn - - -@register_criterion("composite_loss") -class CompositeLoss(LegacyFairseqCriterion): - """This is a composite loss that, given a list of model outputs and a list of targets, - computes an average of losses for each output-target pair""" - - def __init__(self, args, task): - super().__init__(args, task) - self.underlying_criterion = args.underlying_criterion - - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--underlying-criterion', type=str, metavar='VAL', required=True, - help='underlying criterion to use for the composite loss') - # fmt: on - - @staticmethod - def build_underlying_criterion(args, task): - saved_criterion = args.criterion - args.criterion = args.underlying_criterion - assert saved_criterion != args.underlying_criterion - underlying_criterion = task.build_criterion(args) - args.criterion = saved_criterion - return underlying_criterion - - @classmethod - def build_criterion(cls, args, task): - underlying_criterion = CompositeLoss.build_underlying_criterion(args, task) - - class FakeModel(nn.Module): - def __init__(self, model, net_out, target): - super().__init__() - self.model = model - self.net_out = net_out - self.target = target - - def forward(self, **unused): - return self.net_out - - def get_normalized_probs(self, net_output, log_probs, sample=None): - return self.model.get_normalized_probs( - net_output, log_probs, sample=sample - ) - - def get_targets(self, *unused): - return self.target - - @property - def decoder(self): - return self.model.decoder - - class _CompositeLoss(LegacyFairseqCriterion): - def __init__(self, args, task, underlying_criterion): - super().__init__(args, task) - self.underlying_criterion = underlying_criterion - - def forward(self, model, sample, reduce=True): - net_outputs = model(**sample["net_input"]) - targets = sample["target"] - - bsz = targets[0].size(0) - loss = net_outputs[0][0].new(1 if reduce else bsz).float().zero_() - - sample_size = 0 - logging_output = {} - for o, t in zip(net_outputs[0], targets): - m = FakeModel(model, (o, net_outputs[1]), t) - sample["target"] = t - l, ss, logging_output = self.underlying_criterion(m, sample, reduce) - loss += l - sample_size += ss - - loss.div_(len(targets)) - sample_size /= len(targets) - - logging_output["loss"] = utils.item(loss.data) if reduce else loss.data - return loss, sample_size, logging_output - - @staticmethod - def aggregate_logging_outputs(logging_outputs): - return 
underlying_criterion.__class__.aggregate_logging_outputs( - logging_outputs - ) - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - underlying_criterion.__class__.reduce_metrics(logging_outputs) - - return _CompositeLoss(args, task, underlying_criterion) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/cross_entropy.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/cross_entropy.py deleted file mode 100644 index 66500bf25e49585649a861f30798f14e19c3c3c8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/cross_entropy.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from dataclasses import dataclass - -import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion -from fairseq.dataclass import FairseqDataclass -from omegaconf import II - - -@dataclass -class CrossEntropyCriterionConfig(FairseqDataclass): - sentence_avg: bool = II("params.optimization.sentence_avg") - - -@register_criterion("cross_entropy", dataclass=CrossEntropyCriterionConfig) -class CrossEntropyCriterion(FairseqCriterion): - def __init__(self, task, sentence_avg): - super().__init__(task) - self.sentence_avg = sentence_avg - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. - - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - net_output = model(**sample["net_input"]) - loss, _ = self.compute_loss(model, net_output, sample, reduce=reduce) - sample_size = ( - sample["target"].size(0) if self.sentence_avg else sample["ntokens"] - ) - logging_output = { - "loss": loss.data, - "ntokens": sample["ntokens"], - "nsentences": sample["target"].size(0), - "sample_size": sample_size, - } - return loss, sample_size, logging_output - - def compute_loss(self, model, net_output, sample, reduce=True): - lprobs = model.get_normalized_probs(net_output, log_probs=True) - lprobs = lprobs.view(-1, lprobs.size(-1)) - target = model.get_targets(sample, net_output).view(-1) - loss = F.nll_loss( - lprobs, - target, - ignore_index=self.padding_idx, - reduction="sum" if reduce else "none", - ) - return loss, loss - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = sum(log.get("loss", 0) for log in logging_outputs) - ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) - sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) - - # we divide by log(2) to convert the loss from base e to base 2 - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - if sample_size != ntokens: - metrics.log_scalar( - "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 - ) - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) - ) - else: - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can 
be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/ctc.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/ctc.py deleted file mode 100644 index 4f93b3cbfd172f43449d2b80b6f3efd88416eba2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/ctc.py +++ /dev/null @@ -1,253 +0,0 @@ -# All rights reserved. -# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. An additional grant of patent rights -# can be found in the PATENTS file in the same directory. - -import math -from argparse import Namespace - -import torch -import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion -from fairseq.data.data_utils import post_process -from fairseq.logging.meters import safe_round - - -@register_criterion("ctc") -class CtcCriterion(FairseqCriterion): - def __init__(self, task, wer_args, zero_infinity, sentence_avg, remove_bpe): - super().__init__(task) - self.blank_idx = task.target_dictionary.bos() - self.pad_idx = task.target_dictionary.pad() - self.eos_idx = task.target_dictionary.eos() - self.post_process = remove_bpe if remove_bpe else "letter" - - if wer_args is not None: - from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder - - wer_compute_kenlm, wer_lexicon, lm_w, ws_w = eval(wer_args) - - dec_args = Namespace() - dec_args.nbest = 1 - dec_args.criterion = "ctc" - dec_args.kenlm_model = wer_compute_kenlm - dec_args.lexicon = wer_lexicon - dec_args.beam = 50 - dec_args.beam_size_token = min(50, len(task.target_dictionary)) - dec_args.beam_threshold = min(50, len(task.target_dictionary)) - dec_args.lm_weight = lm_w - dec_args.word_score = ws_w - dec_args.unk_weight = -math.inf - dec_args.sil_weight = 0 - - self.w2l_decoder = W2lKenLMDecoder(dec_args, task.target_dictionary) - else: - self.w2l_decoder = None - - self.zero_infinity = zero_infinity - self.sentence_avg = sentence_avg - - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - parser.add_argument( - "--zero-infinity", action="store_true", help="zero inf loss" - ) - try: - parser.add_argument( - "--remove-bpe", - "--post-process", - default="letter", - help="remove BPE tokens before scoring (can be set to sentencepiece, letter, and more)", - ) - except: - pass # this option might have been added from eval args - parser.add_argument( - "--wer-args", - type=str, - default=None, - help="options for wer computation on valid set using 4 gram lm. 
this should be a tuple of 4 elements: path to 4-gram lm, \ - path to lexicon, lm score, word score", - ) - - def forward(self, model, sample, reduce=True): - net_output = model(**sample["net_input"]) - lprobs = model.get_normalized_probs( - net_output, log_probs=True - ).contiguous() # (T, B, C) from the encoder - - if "src_lengths" in sample["net_input"]: - input_lengths = sample["net_input"]["src_lengths"] - else: - non_padding_mask = ~net_output["padding_mask"] - input_lengths = non_padding_mask.long().sum(-1) - - pad_mask = (sample["target"] != self.pad_idx) & ( - sample["target"] != self.eos_idx - ) - targets_flat = sample["target"].masked_select(pad_mask) - target_lengths = sample["target_lengths"] - - with torch.backends.cudnn.flags(enabled=False): - loss = F.ctc_loss( - lprobs, - targets_flat, - input_lengths, - target_lengths, - blank=self.blank_idx, - reduction="sum", - zero_infinity=self.zero_infinity, - ) - - ntokens = ( - sample["ntokens"] if "ntokens" in sample else target_lengths.sum().item() - ) - - sample_size = sample["target"].size(0) if self.sentence_avg else ntokens - logging_output = { - "loss": utils.item(loss.data), # * sample['ntokens'], - "ntokens": ntokens, - "nsentences": sample["id"].numel(), - "sample_size": sample_size, - } - - if not model.training: - import editdistance - - with torch.no_grad(): - lprobs_t = lprobs.transpose(0, 1).float().contiguous().cpu() - - c_err = 0 - c_len = 0 - w_errs = 0 - w_len = 0 - wv_errs = 0 - for lp, t, inp_l in zip( - lprobs_t, - sample["target_label"] - if "target_label" in sample - else sample["target"], - input_lengths, - ): - lp = lp[:inp_l].unsqueeze(0) - - decoded = None - if self.w2l_decoder is not None: - decoded = self.w2l_decoder.decode(lp) - if len(decoded) < 1: - decoded = None - else: - decoded = decoded[0] - if len(decoded) < 1: - decoded = None - else: - decoded = decoded[0] - - p = (t != self.task.target_dictionary.pad()) & ( - t != self.task.target_dictionary.eos() - ) - targ = t[p] - targ_units = self.task.target_dictionary.string(targ) - targ_units_arr = targ.tolist() - - toks = lp.argmax(dim=-1).unique_consecutive() - pred_units_arr = toks[toks != self.blank_idx].tolist() - - c_err += editdistance.eval(pred_units_arr, targ_units_arr) - c_len += len(targ_units_arr) - - targ_words = post_process(targ_units, self.post_process).split() - - pred_units = self.task.target_dictionary.string(pred_units_arr) - pred_words_raw = post_process(pred_units, self.post_process).split() - - if decoded is not None and "words" in decoded: - pred_words = decoded["words"] - w_errs += editdistance.eval(pred_words, targ_words) - wv_errs += editdistance.eval(pred_words_raw, targ_words) - else: - dist = editdistance.eval(pred_words_raw, targ_words) - w_errs += dist - wv_errs += dist - - w_len += len(targ_words) - - logging_output["wv_errors"] = wv_errs - logging_output["w_errors"] = w_errs - logging_output["w_total"] = w_len - logging_output["c_errors"] = c_err - logging_output["c_total"] = c_len - - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - - loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) - ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) - nsentences = utils.item( - sum(log.get("nsentences", 0) for log in logging_outputs) - ) - sample_size = utils.item( - sum(log.get("sample_size", 0) for log in logging_outputs) - ) - - metrics.log_scalar( - "loss", 
loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - metrics.log_scalar("ntokens", ntokens) - metrics.log_scalar("nsentences", nsentences) - if sample_size != ntokens: - metrics.log_scalar( - "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 - ) - - c_errors = sum(log.get("c_errors", 0) for log in logging_outputs) - metrics.log_scalar("_c_errors", c_errors) - c_total = sum(log.get("c_total", 0) for log in logging_outputs) - metrics.log_scalar("_c_total", c_total) - w_errors = sum(log.get("w_errors", 0) for log in logging_outputs) - metrics.log_scalar("_w_errors", w_errors) - wv_errors = sum(log.get("wv_errors", 0) for log in logging_outputs) - metrics.log_scalar("_wv_errors", wv_errors) - w_total = sum(log.get("w_total", 0) for log in logging_outputs) - metrics.log_scalar("_w_total", w_total) - - if c_total > 0: - metrics.log_derived( - "uer", - lambda meters: safe_round( - meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3 - ) - if meters["_c_total"].sum > 0 - else float("nan"), - ) - if w_total > 0: - metrics.log_derived( - "wer", - lambda meters: safe_round( - meters["_w_errors"].sum * 100.0 / meters["_w_total"].sum, 3 - ) - if meters["_w_total"].sum > 0 - else float("nan"), - ) - metrics.log_derived( - "raw_wer", - lambda meters: safe_round( - meters["_wv_errors"].sum * 100.0 / meters["_w_total"].sum, 3 - ) - if meters["_w_total"].sum > 0 - else float("nan"), - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/fairseq_criterion.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/fairseq_criterion.py deleted file mode 100644 index ef94a863276d6569cb47028069ec199ec5f63055..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/fairseq_criterion.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import inspect -from typing import Any, Dict, List - -from fairseq import metrics, utils -from fairseq.dataclass.utils import gen_parser_from_dataclass -from torch.nn.modules.loss import _Loss - - -class FairseqCriterion(_Loss): - def __init__(self, task): - super().__init__() - self.task = task - if hasattr(task, "target_dictionary"): - tgt_dict = task.target_dictionary - self.padding_idx = tgt_dict.pad() if tgt_dict is not None else -100 - - @classmethod - def add_args(cls, parser): - """Add criterion-specific arguments to the parser.""" - dc = getattr(cls, "__dataclass", None) - if dc is not None: - gen_parser_from_dataclass(parser, dc()) - - @classmethod - def build_criterion(cls, args, task): - """Construct a criterion from command-line args.""" - # Criterions can override this, but for convenience we also try - # to automatically map argparse.Namespace keys to corresponding - # arguments in the __init__. 
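The comment above is the whole contract: each parameter of __init__ is filled either with the task object or with a same-named attribute taken from args. A standalone sketch of that mapping with a hypothetical criterion class (not from this file):

import inspect
from argparse import Namespace


class ToyCriterion:
    def __init__(self, task, sentence_avg, label_smoothing=0.0):
        self.task = task
        self.sentence_avg = sentence_avg
        self.label_smoothing = label_smoothing


args = Namespace(sentence_avg=True)      # no label_smoothing attribute: default is kept
init_args = {}
for p in inspect.signature(ToyCriterion).parameters.values():
    if p.name == "task":
        init_args["task"] = "the-task"   # stands in for the real task object
    elif hasattr(args, p.name):
        init_args[p.name] = getattr(args, p.name)
    # otherwise fall back to the parameter's default, as the real loop below does

criterion = ToyCriterion(**init_args)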
- init_args = {} - for p in inspect.signature(cls).parameters.values(): - if ( - p.kind == p.POSITIONAL_ONLY - or p.kind == p.VAR_POSITIONAL - or p.kind == p.VAR_KEYWORD - ): - # we haven't implemented inference for these argument types, - # but PRs welcome :) - raise NotImplementedError("{} not supported".format(p.kind)) - - assert p.kind in {p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY} - - if p.name == "task": - init_args["task"] = task - elif hasattr(args, p.name): - init_args[p.name] = getattr(args, p.name) - elif p.default != p.empty: - pass # we'll use the default value - else: - raise NotImplementedError( - "Unable to infer Criterion arguments, please implement " - "{}.build_criterion".format(cls.__name__) - ) - return cls(**init_args) - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. - - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - raise NotImplementedError - - @staticmethod - def aggregate_logging_outputs( - logging_outputs: List[Dict[str, Any]], - ) -> Dict[str, Any]: - """Aggregate logging outputs from data parallel training.""" - utils.deprecation_warning( - "The aggregate_logging_outputs API is deprecated. " - "Please use the reduce_metrics API instead." - ) - raise NotImplementedError - - @classmethod - def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None: - """Aggregate logging outputs from data parallel training.""" - utils.deprecation_warning( - "Criterions should implement the reduce_metrics API. " - "Falling back to deprecated aggregate_logging_outputs API." - ) - agg_logging_outputs = cls.aggregate_logging_outputs(logging_outputs) - for k, v in agg_logging_outputs.items(): - if k in {"nsentences", "ntokens", "sample_size"}: - continue - metrics.log_scalar(k, v) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return False - - -class LegacyFairseqCriterion(FairseqCriterion): - def __init__(self, args, task): - super().__init__(task=task) - self.args = args - - utils.deprecation_warning( - "Criterions should take explicit arguments instead of an " - "argparse.Namespace object, please update your criterion by " - "extending FairseqCriterion instead of LegacyFairseqCriterion." - ) - - @classmethod - def build_criterion(cls, args, task): - """Construct a criterion from command-line args.""" - return cls(args, task) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/label_smoothed_cross_entropy.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/label_smoothed_cross_entropy.py deleted file mode 100644 index 2763cd598e84479b33a059c853aa81166fba4427..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/label_smoothed_cross_entropy.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
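The label_smoothed_nll_loss helper defined just below implements loss = (1 - eps) * nll + (eps / V) * smooth, where smooth is the summed negative log-probability over the whole vocabulary. A toy check of that formula (illustrative tensors only):

import torch
import torch.nn.functional as F

lprobs = F.log_softmax(torch.tensor([[2.0, 0.5, -1.0, 0.0]]), dim=-1)  # one token, V=4
target = torch.tensor([[0]])
eps = 0.1

nll_loss = -lprobs.gather(dim=-1, index=target)        # -log p(gold token)
smooth_loss = -lprobs.sum(dim=-1, keepdim=True)        # -sum_v log p(v)
loss = (1.0 - eps) * nll_loss + (eps / lprobs.size(-1)) * smooth_loss
print(loss.item())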
- -import math - -import torch -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion - - -def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True): - if target.dim() == lprobs.dim() - 1: - target = target.unsqueeze(-1) - nll_loss = -lprobs.gather(dim=-1, index=target.long()) - smooth_loss = -lprobs.sum(dim=-1, keepdim=True) - if ignore_index is not None: - pad_mask = target.eq(ignore_index) - nll_loss.masked_fill_(pad_mask, 0.0) - smooth_loss.masked_fill_(pad_mask, 0.0) - else: - nll_loss = nll_loss.squeeze(-1) - smooth_loss = smooth_loss.squeeze(-1) - if reduce: - nll_loss = nll_loss.sum() - smooth_loss = smooth_loss.sum() - eps_i = epsilon / lprobs.size(-1) - loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss - return loss, nll_loss - - -@register_criterion("label_smoothed_cross_entropy") -class LabelSmoothedCrossEntropyCriterion(FairseqCriterion): - def __init__( - self, - task, - sentence_avg, - label_smoothing, - ignore_prefix_size=0, - report_accuracy=False, - ): - super().__init__(task) - self.sentence_avg = sentence_avg - self.eps = label_smoothing - self.ignore_prefix_size = ignore_prefix_size - self.report_accuracy = report_accuracy - - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--label-smoothing', default=0., type=float, metavar='D', - help='epsilon for label smoothing, 0 means no label smoothing') - parser.add_argument('--report-accuracy', action='store_true', - help='report accuracy metric') - parser.add_argument('--ignore-prefix-size', default=0, type=int, - help='Ignore first N tokens') - # fmt: on - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. 
- - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - net_output = model(**sample["net_input"]) - loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) - sample_size = ( - sample["target"].size(0) if self.sentence_avg else sample["ntokens"] - ) - logging_output = { - "loss": loss.data, - "nll_loss": nll_loss.data, - "ntokens": sample["ntokens"], - "nsentences": sample["target"].size(0), - "sample_size": sample_size, - } - if self.report_accuracy: - n_correct, total = self.compute_accuracy(model, net_output, sample) - logging_output["n_correct"] = utils.item(n_correct.data) - logging_output["total"] = utils.item(total.data) - return loss, sample_size, logging_output - - def get_lprobs_and_target(self, model, net_output, sample): - lprobs = model.get_normalized_probs(net_output, log_probs=True) - target = model.get_targets(sample, net_output) - if self.ignore_prefix_size > 0: - if getattr(lprobs, "batch_first", False): - lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous() - target = target[:, self.ignore_prefix_size :].contiguous() - else: - lprobs = lprobs[self.ignore_prefix_size :, :, :].contiguous() - target = target[self.ignore_prefix_size :, :].contiguous() - return lprobs.view(-1, lprobs.size(-1)), target.view(-1) - - def compute_loss(self, model, net_output, sample, reduce=True): - lprobs, target = self.get_lprobs_and_target(model, net_output, sample) - loss, nll_loss = label_smoothed_nll_loss( - lprobs, - target, - self.eps, - ignore_index=self.padding_idx, - reduce=reduce, - ) - return loss, nll_loss - - def compute_accuracy(self, model, net_output, sample): - lprobs, target = self.get_lprobs_and_target(model, net_output, sample) - mask = target.ne(self.padding_idx) - n_correct = torch.sum( - lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask)) - ) - total = torch.sum(mask) - return n_correct, total - - @classmethod - def reduce_metrics(cls, logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = sum(log.get("loss", 0) for log in logging_outputs) - nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs) - ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) - sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) - - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - metrics.log_scalar( - "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3 - ) - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) - ) - - total = utils.item(sum(log.get("total", 0) for log in logging_outputs)) - if total > 0: - metrics.log_scalar("total", total) - n_correct = utils.item( - sum(log.get("n_correct", 0) for log in logging_outputs) - ) - metrics.log_scalar("n_correct", n_correct) - metrics.log_derived( - "accuracy", - lambda meters: round( - meters["n_correct"].sum * 100.0 / meters["total"].sum, 3 - ) - if meters["total"].sum > 0 - else float("nan"), - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. 
- """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py deleted file mode 100644 index 73cfa05310e51d9a5f349cc30b8406002d25861b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -from fairseq import metrics, utils -from fairseq.criterions import register_criterion - -from .label_smoothed_cross_entropy import LabelSmoothedCrossEntropyCriterion - - -@register_criterion("label_smoothed_cross_entropy_with_alignment") -class LabelSmoothedCrossEntropyCriterionWithAlignment( - LabelSmoothedCrossEntropyCriterion -): - def __init__(self, task, sentence_avg, label_smoothing, alignment_lambda): - super().__init__(task, sentence_avg, label_smoothing) - self.alignment_lambda = alignment_lambda - - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - LabelSmoothedCrossEntropyCriterion.add_args(parser) - parser.add_argument( - "--alignment-lambda", - default=0.05, - type=float, - metavar="D", - help="weight for the alignment loss", - ) - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. - - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - net_output = model(**sample["net_input"]) - loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) - sample_size = ( - sample["target"].size(0) if self.sentence_avg else sample["ntokens"] - ) - logging_output = { - "loss": utils.item(loss.data) if reduce else loss.data, - "nll_loss": utils.item(nll_loss.data) if reduce else nll_loss.data, - "ntokens": sample["ntokens"], - "nsentences": sample["target"].size(0), - "sample_size": sample_size, - } - - alignment_loss = None - - # Compute alignment loss only for training set and non dummy batches. - if "alignments" in sample and sample["alignments"] is not None: - alignment_loss = self.compute_alignment_loss(sample, net_output) - - if alignment_loss is not None: - logging_output["alignment_loss"] = utils.item(alignment_loss.data) - loss += self.alignment_lambda * alignment_loss - - return loss, sample_size, logging_output - - def compute_alignment_loss(self, sample, net_output): - attn_prob = net_output[1]["attn"][0] - bsz, tgt_sz, src_sz = attn_prob.shape - attn = attn_prob.view(bsz * tgt_sz, src_sz) - - align = sample["alignments"] - align_weights = sample["align_weights"].float() - - if len(align) > 0: - # Alignment loss computation. align (shape [:, 2]) contains the src-tgt index pairs corresponding to - # the alignments. align_weights (shape [:]) contains the 1 / frequency of a tgt index for normalizing. 
- loss = -( - (attn[align[:, 1][:, None], align[:, 0][:, None]]).log() - * align_weights[:, None] - ).sum() - else: - return None - - return loss - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) - nll_loss_sum = utils.item( - sum(log.get("nll_loss", 0) for log in logging_outputs) - ) - alignment_loss_sum = utils.item( - sum(log.get("alignment_loss", 0) for log in logging_outputs) - ) - ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) - sample_size = utils.item( - sum(log.get("sample_size", 0) for log in logging_outputs) - ) - - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - metrics.log_scalar( - "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3 - ) - metrics.log_scalar( - "alignment_loss", - alignment_loss_sum / sample_size / math.log(2), - sample_size, - round=3, - ) - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/legacy_masked_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/legacy_masked_lm.py deleted file mode 100644 index c70608c5a143b7b4fbd8c58dfcf9f873639d379c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/legacy_masked_lm.py +++ /dev/null @@ -1,177 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion - - -def compute_cross_entropy_loss(logits, targets, ignore_index=-100): - """ - Function to compute the cross entropy loss. The default value of - ignore_index is the same as the default value for F.cross_entropy in - pytorch. - """ - assert logits.size(0) == targets.size( - -1 - ), "Logits and Targets tensor shapes don't match up" - - loss = F.nll_loss( - F.log_softmax(logits, -1, dtype=torch.float32), - targets, - reduction="sum", - ignore_index=ignore_index, - ) - return loss - - -@register_criterion("legacy_masked_lm_loss") -class LegacyMaskedLmLoss(FairseqCriterion): - """ - Implementation for the loss used in masked language model (MLM) training. - This optionally also computes the next sentence prediction (NSP) loss and - adds it to the overall loss based on the specified args. There are three - cases to consider: - 1) Generic MLM training without NSP loss. In this case sentence_targets - and sentence_logits are both None. - 2) BERT training without NSP loss. In this case sentence_targets is - not None but sentence_logits is None and we should not be computing - a sentence level loss. - 3) BERT training with NSP loss. In this case both sentence_targets and - sentence_logits are not None and we should be computing a sentence - level loss. 
The weight of the sentence level loss is specified as - an argument. - """ - - def __init__(self, task, masked_lm_only, nsp_loss_weight): - super().__init__(task) - self.masked_lm_only = masked_lm_only - self.nsp_loss_weight = nsp_loss_weight - - @staticmethod - def add_args(parser): - """Args for MaskedLM Loss""" - # Default for masked_lm_only is False so as to not break BERT training - parser.add_argument( - "--masked-lm-only", - default=False, - action="store_true", - help="compute MLM loss only", - ) - parser.add_argument( - "--nsp-loss-weight", - default=1.0, - type=float, - help="weight for next sentence prediction" " loss (default 1)", - ) - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - lm_logits, output_metadata = model(**sample["net_input"]) - - # reshape lm_logits from (N,T,C) to (N*T,C) - lm_logits = lm_logits.view(-1, lm_logits.size(-1)) - lm_targets = sample["lm_target"].view(-1) - lm_loss = compute_cross_entropy_loss(lm_logits, lm_targets, self.padding_idx) - - # compute the number of tokens for which loss is computed. This is used - # to normalize the loss - ntokens = utils.strip_pad(lm_targets, self.padding_idx).numel() - loss = lm_loss / ntokens - nsentences = sample["nsentences"] - # nsentences = 0 - - # Compute sentence loss if masked_lm_only is False - sentence_loss = None - if not self.masked_lm_only: - sentence_logits = output_metadata["sentence_logits"] - sentence_targets = sample["sentence_target"].view(-1) - # This needs to be recomputed due to some differences between - # TokenBlock and BlockPair dataset. This can be resolved with a - # refactor of BERTModel which we will do in the future. - # TODO: Remove this after refactor of BERTModel - nsentences = sentence_targets.size(0) - - # Check for logits being none which can happen when remove_heads - # is set to true in the BERT model. Ideally we should set - # masked_lm_only to true in this case, but that requires some - # refactor in the BERT model. 
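The docstring's cases come down to the normalization performed a few lines below: the token-level MLM loss is divided by the number of unpadded tokens, the sentence-level NSP loss by the number of sentences, and only then are they combined with the --nsp-loss-weight factor. A toy arithmetic check (numbers are illustrative):

lm_loss, ntokens = 120.0, 60           # summed MLM loss over 60 scored tokens
sentence_loss, nsentences = 4.0, 8     # summed NSP loss over 8 sentence pairs
nsp_loss_weight = 1.0

loss = lm_loss / ntokens + nsp_loss_weight * (sentence_loss / nsentences)
print(loss)   # 2.0 + 1.0 * 0.5 = 2.5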
- if sentence_logits is not None: - sentence_loss = compute_cross_entropy_loss( - sentence_logits, sentence_targets - ) - - loss += self.nsp_loss_weight * (sentence_loss / nsentences) - - # NOTE: as we are summing up per token mlm loss and per sentence nsp loss - # we don't need to use sample_size as denominator for the gradient - # here sample_size is just used for logging - sample_size = 1 - logging_output = { - "loss": utils.item(loss.data) if reduce else loss.data, - "lm_loss": utils.item(lm_loss.data) if reduce else lm_loss.data, - # sentence loss is not always computed - "sentence_loss": ( - (utils.item(sentence_loss.data) if reduce else sentence_loss.data) - if sentence_loss is not None - else 0.0 - ), - "ntokens": ntokens, - "nsentences": nsentences, - "sample_size": sample_size, - } - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - lm_loss_sum = sum(log.get("lm_loss", 0) for log in logging_outputs) - sentence_loss_sum = sum(log.get("sentence_loss", 0) for log in logging_outputs) - ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) - nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) - sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) - agg_loss = sum(log.get("loss", 0) for log in logging_outputs) - - metrics.log_scalar( - "loss", - agg_loss / sample_size / math.log(2) if sample_size > 0 else 0.0, - sample_size, - round=3, - ) - metrics.log_scalar( - "lm_loss", - lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0, - ntokens, - round=3, - ) - metrics.log_scalar( - "sentence_loss", - sentence_loss_sum / nsentences / math.log(2) if nsentences > 0 else 0.0, - nsentences, - round=3, - ) - metrics.log_scalar( - "nll_loss", - lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0, - ntokens, - round=3, - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/masked_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/masked_lm.py deleted file mode 100644 index b04cfbff6dcbfacb91156bb10a7c8cdbb9e76d37..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/masked_lm.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -import torch.nn.functional as F -from fairseq import metrics, modules, utils -from fairseq.criterions import FairseqCriterion, register_criterion - - -@register_criterion("masked_lm") -class MaskedLmLoss(FairseqCriterion): - """ - Implementation for the loss used in masked language model (MLM) training. - """ - - def __init__(self, task, tpu=False): - super().__init__(task) - self.tpu = tpu - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. 
- - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - masked_tokens = sample["target"].ne(self.padding_idx) - sample_size = masked_tokens.int().sum() - - # Rare: when all tokens are masked, project all tokens. - # We use torch.where to avoid device-to-host transfers, - # except on CPU where torch.where is not well supported - # (see github.com/pytorch/pytorch/issues/26247). - if self.tpu: - masked_tokens = None # always project all tokens on TPU - elif masked_tokens.device == torch.device("cpu"): - if not masked_tokens.any(): - masked_tokens = None - else: - masked_tokens = torch.where( - masked_tokens.any(), - masked_tokens, - masked_tokens.new([True]), - ) - - logits = model(**sample["net_input"], masked_tokens=masked_tokens)[0] - targets = model.get_targets(sample, [logits]) - if masked_tokens is not None: - targets = targets[masked_tokens] - - loss = modules.cross_entropy( - logits.view(-1, logits.size(-1)), - targets.view(-1), - reduction="sum", - ignore_index=self.padding_idx, - ) - - logging_output = { - "loss": loss if self.tpu else loss.data, - "ntokens": sample["ntokens"], - "nsentences": sample["nsentences"], - "sample_size": sample_size, - } - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = sum(log.get("loss", 0) for log in logging_outputs) - sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) - - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/nat_loss.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/nat_loss.py deleted file mode 100644 index cdc7da861d7d5d5af183a78fdde51f49eb0cf5e7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/nat_loss.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
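The MaskedLmLoss.forward above only scores positions that were actually masked; sample_size is the count of those positions and everything else is excluded before the cross entropy. A condensed sketch of that selection (toy shapes, with F.cross_entropy standing in for modules.cross_entropy):

import torch
import torch.nn.functional as F

padding_idx = 1
targets = torch.tensor([[5, 1, 7],
                        [1, 1, 9]])      # padding_idx marks positions that were not masked
logits = torch.randn(2, 3, 16)           # (batch, seq, vocab)

masked_tokens = targets.ne(padding_idx)          # True only at masked positions
sample_size = masked_tokens.int().sum()          # denominator used for the gradient
loss = F.cross_entropy(
    logits[masked_tokens],                       # (num_masked, vocab)
    targets[masked_tokens],                      # (num_masked,)
    reduction="sum",
)
print((loss / sample_size).item())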
- -import math - -import torch -import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion -from torch import Tensor - - -@register_criterion("nat_loss") -class LabelSmoothedDualImitationCriterion(FairseqCriterion): - def __init__(self, task, label_smoothing): - super().__init__(task) - self.label_smoothing = label_smoothing - - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - parser.add_argument( - "--label-smoothing", - default=0.0, - type=float, - metavar="D", - help="epsilon for label smoothing, 0 means no label smoothing", - ) - - def _compute_loss( - self, outputs, targets, masks=None, label_smoothing=0.0, name="loss", factor=1.0 - ): - """ - outputs: batch x len x d_model - targets: batch x len - masks: batch x len - - policy_logprob: if there is some policy - depends on the likelihood score as rewards. - """ - - def mean_ds(x: Tensor, dim=None) -> Tensor: - return ( - x.float().mean().type_as(x) - if dim is None - else x.float().mean(dim).type_as(x) - ) - - if masks is not None: - outputs, targets = outputs[masks], targets[masks] - - if masks is not None and not masks.any(): - nll_loss = torch.tensor(0) - loss = nll_loss - else: - logits = F.log_softmax(outputs, dim=-1) - if targets.dim() == 1: - losses = F.nll_loss(logits, targets.to(logits.device), reduction="none") - - else: # soft-labels - losses = F.kl_div(logits, targets.to(logits.device), reduction="none") - losses = losses.sum(-1) - - nll_loss = mean_ds(losses) - if label_smoothing > 0: - loss = ( - nll_loss * (1 - label_smoothing) - mean_ds(logits) * label_smoothing - ) - else: - loss = nll_loss - - loss = loss * factor - return {"name": name, "loss": loss, "nll_loss": nll_loss, "factor": factor} - - def _custom_loss(self, loss, name="loss", factor=1.0): - return {"name": name, "loss": loss, "factor": factor} - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. 
- Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - nsentences, ntokens = sample["nsentences"], sample["ntokens"] - - # B x T - src_tokens, src_lengths = ( - sample["net_input"]["src_tokens"], - sample["net_input"]["src_lengths"], - ) - tgt_tokens, prev_output_tokens = sample["target"], sample["prev_target"] - - outputs = model(src_tokens, src_lengths, prev_output_tokens, tgt_tokens) - losses, nll_loss = [], [] - - for obj in outputs: - if outputs[obj].get("loss", None) is None: - _losses = self._compute_loss( - outputs[obj].get("out"), - outputs[obj].get("tgt"), - outputs[obj].get("mask", None), - outputs[obj].get("ls", 0.0), - name=obj + "-loss", - factor=outputs[obj].get("factor", 1.0), - ) - else: - _losses = self._custom_loss( - outputs[obj].get("loss"), - name=obj + "-loss", - factor=outputs[obj].get("factor", 1.0), - ) - - losses += [_losses] - if outputs[obj].get("nll_loss", False): - nll_loss += [_losses.get("nll_loss", 0.0)] - - loss = sum(l["loss"] for l in losses) - nll_loss = sum(l for l in nll_loss) if len(nll_loss) > 0 else loss.new_tensor(0) - - # NOTE: - # we don't need to use sample_size as denominator for the gradient - # here sample_size is just used for logging - sample_size = 1 - logging_output = { - "loss": loss.data, - "nll_loss": nll_loss.data, - "ntokens": ntokens, - "nsentences": nsentences, - "sample_size": sample_size, - } - - for l in losses: - logging_output[l["name"]] = ( - utils.item(l["loss"].data / l["factor"]) - if reduce - else l[["loss"]].data / l["factor"] - ) - - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - sample_size = utils.item( - sum(log.get("sample_size", 0) for log in logging_outputs) - ) - loss = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) - nll_loss = utils.item(sum(log.get("nll_loss", 0) for log in logging_outputs)) - - metrics.log_scalar( - "loss", loss / sample_size / math.log(2), sample_size, round=3 - ) - metrics.log_scalar( - "nll_loss", nll_loss / sample_size / math.log(2), sample_size, round=3 - ) - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) - ) - - for key in logging_outputs[0]: - if key[-5:] == "-loss": - val = sum(log.get(key, 0) for log in logging_outputs) - metrics.log_scalar( - key[:-5], - val / sample_size / math.log(2) if sample_size > 0 else 0.0, - sample_size, - round=3, - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/sentence_prediction.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/sentence_prediction.py deleted file mode 100644 index 9519fdc56d7de86b727f74ef5b18db520382e562..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/sentence_prediction.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion - - -@register_criterion("sentence_prediction") -class SentencePredictionCriterion(FairseqCriterion): - def __init__(self, task, classification_head_name, regression_target): - super().__init__(task) - self.classification_head_name = classification_head_name - self.regression_target = regression_target - - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--classification-head-name', - default='sentence_classification_head', - help='name of the classification head to use') - # fmt: on - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. - - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - assert ( - hasattr(model, "classification_heads") - and self.classification_head_name in model.classification_heads - ), "model must provide sentence classification head for --criterion=sentence_prediction" - - logits, _ = model( - **sample["net_input"], - features_only=True, - classification_head_name=self.classification_head_name, - ) - targets = model.get_targets(sample, [logits]).view(-1) - sample_size = targets.numel() - - if not self.regression_target: - lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) - loss = F.nll_loss(lprobs, targets, reduction="sum") - else: - logits = logits.view(-1).float() - targets = targets.float() - loss = F.mse_loss(logits, targets, reduction="sum") - - logging_output = { - "loss": loss.data, - "ntokens": sample["ntokens"], - "nsentences": sample_size, - "sample_size": sample_size, - } - if not self.regression_target: - preds = logits.argmax(dim=1) - logging_output["ncorrect"] = (preds == targets).sum() - - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = sum(log.get("loss", 0) for log in logging_outputs) - ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) - nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) - sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) - - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - if sample_size != ntokens: - metrics.log_scalar( - "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 - ) - - if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]: - ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs) - metrics.log_scalar( - "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1 - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. 
- """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/sentence_ranking.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/sentence_ranking.py deleted file mode 100644 index d4c76341d4d87e6d0da21ac89e833ce0bda13a0c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/sentence_ranking.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion - - -@register_criterion("sentence_ranking") -class SentenceRankingCriterion(FairseqCriterion): - def __init__(self, task, ranking_head_name, save_predictions, num_classes): - super().__init__(task) - self.ranking_head_name = ranking_head_name - if save_predictions is not None: - self.prediction_h = open(save_predictions, "w") - else: - self.prediction_h = None - self.num_classes = num_classes - - def __del__(self): - if self.prediction_h is not None: - self.prediction_h.close() - - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--save-predictions', metavar='FILE', - help='file to save predictions to') - parser.add_argument('--ranking-head-name', - default='sentence_classification_head', - help='name of the ranking head to use') - # fmt: on - - def forward(self, model, sample, reduce=True): - """Compute ranking loss for the given sample. - - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - assert ( - hasattr(model, "classification_heads") - and self.ranking_head_name in model.classification_heads - ), "model must provide sentence ranking head for --criterion=sentence_ranking" - - scores = [] - for idx in range(self.num_classes): - score, _ = model( - **sample["net_input{idx}".format(idx=idx + 1)], - classification_head_name=self.ranking_head_name, - ) - scores.append(score) - - logits = torch.cat(scores, dim=1) - sample_size = logits.size(0) - - if "target" in sample: - targets = model.get_targets(sample, [logits]).view(-1) - lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) - loss = F.nll_loss(lprobs, targets, reduction="sum") - else: - targets = None - loss = torch.tensor(0.0, requires_grad=True) - - if self.prediction_h is not None: - preds = logits.argmax(dim=1) - for i, (id, pred) in enumerate(zip(sample["id"].tolist(), preds.tolist())): - if targets is not None: - label = targets[i].item() - print("{}\t{}\t{}".format(id, pred, label), file=self.prediction_h) - else: - print("{}\t{}".format(id, pred), file=self.prediction_h) - - logging_output = { - "loss": loss.data, - "ntokens": sample["ntokens"], - "nsentences": sample_size, - "sample_size": sample_size, - } - if targets is not None: - logging_output["ncorrect"] = (logits.argmax(dim=1) == targets).sum() - - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = sum(log.get("loss", 0) for log in logging_outputs) - ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) - nsentences = 
sum(log.get("nsentences", 0) for log in logging_outputs) - sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) - - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - if sample_size != ntokens: - metrics.log_scalar( - "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 - ) - - if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]: - ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs) - metrics.log_scalar( - "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1 - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/wav2vec_criterion.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/wav2vec_criterion.py deleted file mode 100644 index 6ac7557dcc4a77bd8d6fa468da87717c7f6fb704..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/criterions/wav2vec_criterion.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -import torch.nn.functional as F -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion -from fairseq.logging.meters import safe_round - - -@register_criterion("wav2vec") -class Wav2vecCriterion(FairseqCriterion): - def __init__(self, task, infonce=False, loss_weights=None, log_keys=None): - super().__init__(task) - self.infonce = infonce - self.loss_weights = None if loss_weights is None else eval(loss_weights) - self.log_keys = [] if log_keys is None else eval(log_keys) - - @staticmethod - def add_args(parser): - """Add criterion-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--infonce', action='store_true', - help='if set, uses cross entropy instead of binary cross entropy (i.e. InfoNCE loss)') - parser.add_argument('--loss-weights', type=str, default=None, - help='weights for additional loss terms (not first one)') - parser.add_argument('--log-keys', type=str, default=None, - help='output keys to log') - # fmt: on - - def forward(self, model, sample, reduce=True, log_pred=False): - """Compute the loss for the given sample. 
- - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - net_output = model(**sample["net_input"]) - logits = model.get_logits(net_output).float() - target = model.get_targets(sample, net_output) - - weights = None - if hasattr(model, "get_target_weights") and not self.infonce: - weights = model.get_target_weights(target, net_output) - if torch.is_tensor(weights): - weights = weights.float() - - losses = [] - - if self.infonce: - loss = F.cross_entropy( - logits, - target, - reduction="sum" if reduce else "none", - ) - else: - loss = F.binary_cross_entropy_with_logits( - logits, - target.float(), - weights, - reduction="sum" if reduce else "none", - ) - - sample_size = target.numel() if self.infonce else target.long().sum().item() - losses.append(loss.detach().clone()) - - if self.loss_weights is not None: - assert hasattr(model, "get_extra_losses") - extra_losses = model.get_extra_losses(net_output) - if torch.is_tensor(extra_losses): - extra_losses = [extra_losses] - if len(self.loss_weights) == 1 and len(extra_losses) != 1: - self.loss_weights = [self.loss_weights[0]] * len(extra_losses) - assert len(extra_losses) == len( - self.loss_weights - ), f"{len(extra_losses)}, {len(self.loss_weights)}" - for p, coef in zip(extra_losses, self.loss_weights): - if coef != 0 and p is not None: - p = coef * p.float() * sample_size - loss += p - losses.append(p) - - logging_output = { - "loss": loss.item() if reduce else loss, - "ntokens": sample_size, - "nsentences": sample["id"].numel(), - "sample_size": sample_size, - } - - for lk in self.log_keys: - if lk in net_output: - logging_output[lk] = float((net_output[lk])) - - if len(losses) > 1: - for i, l in enumerate(losses): - logging_output[f"loss_{i}"] = l.item() - - if self.infonce: - with torch.no_grad(): - if logits.numel() == 0: - corr = 0 - count = 0 - else: - assert logits.dim() > 1, logits.shape - max = logits.argmax(-1) == 0 - min = logits.argmin(-1) == 0 - both = max & min - corr = max.long().sum().item() - both.long().sum().item() - count = max.numel() - - logging_output["correct"] = corr - logging_output["count"] = count - - if log_pred: - logging_output["logits"] = logits.cpu().numpy() - logging_output["target"] = target.cpu().numpy() - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) - ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) - nsentences = utils.item( - sum(log.get("nsentences", 0) for log in logging_outputs) - ) - sample_size = utils.item( - sum(log.get("sample_size", 0) for log in logging_outputs) - ) - - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - metrics.log_scalar("ntokens", ntokens) - metrics.log_scalar("nsentences", nsentences) - - correct = sum(log.get("correct", 0) for log in logging_outputs) - metrics.log_scalar("_correct", correct) - - total = sum(log.get("count", 0) for log in logging_outputs) - metrics.log_scalar("_total", total) - - if total > 0: - metrics.log_derived( - "accuracy", - lambda meters: safe_round( - meters["_correct"].sum / meters["_total"].sum, 5 - ) - if meters["_total"].sum > 0 - else float("nan"), - ) - - builtin_keys = { - "loss", - "ntokens", - "nsentences", - "sample_size", - "correct", - "count", - 
} - - for k in logging_outputs[0]: - if k not in builtin_keys: - val = sum(log.get(k, 0) for log in logging_outputs) / len( - logging_outputs - ) - if k.startswith("loss"): - metrics.log_scalar(k, val / sample_size / math.log(2), sample_size) - else: - metrics.log_scalar(k, val, round=3) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improves distributed training speed. - """ - return False diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/__init__.py deleted file mode 100644 index 9b3081395514a21d01c41358afa52a7ebca5e49e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/__init__.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -"""isort:skip_file""" - -from .dictionary import Dictionary, TruncatedDictionary - -from .fairseq_dataset import FairseqDataset, FairseqIterableDataset - -from .base_wrapper_dataset import BaseWrapperDataset - -from .add_target_dataset import AddTargetDataset -from .append_token_dataset import AppendTokenDataset -from .audio.raw_audio_dataset import FileAudioDataset -from .backtranslation_dataset import BacktranslationDataset -from .bucket_pad_length_dataset import BucketPadLengthDataset -from .colorize_dataset import ColorizeDataset -from .concat_dataset import ConcatDataset -from .concat_sentences_dataset import ConcatSentencesDataset -from .denoising_dataset import DenoisingDataset -from .id_dataset import IdDataset -from .indexed_dataset import ( - IndexedCachedDataset, - IndexedDataset, - IndexedRawTextDataset, - MMapIndexedDataset, -) -from .language_pair_dataset import LanguagePairDataset -from .list_dataset import ListDataset -from .lm_context_window_dataset import LMContextWindowDataset -from .lru_cache_dataset import LRUCacheDataset -from .mask_tokens_dataset import MaskTokensDataset -from .monolingual_dataset import MonolingualDataset -from .multi_corpus_sampled_dataset import MultiCorpusSampledDataset -from .nested_dictionary_dataset import NestedDictionaryDataset -from .noising import NoisingDataset -from .numel_dataset import NumelDataset -from .num_samples_dataset import NumSamplesDataset -from .offset_tokens_dataset import OffsetTokensDataset -from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset -from .prepend_dataset import PrependDataset -from .prepend_token_dataset import PrependTokenDataset -from .raw_label_dataset import RawLabelDataset -from .replace_dataset import ReplaceDataset -from .resampling_dataset import ResamplingDataset -from .roll_dataset import RollDataset -from .round_robin_zip_datasets import RoundRobinZipDatasets -from .sort_dataset import SortDataset -from .strip_token_dataset import StripTokenDataset -from .subsample_dataset import SubsampleDataset -from .token_block_dataset import TokenBlockDataset -from .transform_eos_dataset import TransformEosDataset -from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset -from .shorten_dataset import TruncateDataset, RandomCropDataset -from .multilingual.sampled_multi_dataset import SampledMultiDataset 
-from .multilingual.sampled_multi_epoch_dataset import SampledMultiEpochDataset -from .fasta_dataset import FastaDataset, EncodedFastaDataset - -from .iterators import ( - CountingIterator, - EpochBatchIterator, - GroupedIterator, - ShardedIterator, -) - -__all__ = [ - "AddTargetDataset", - "AppendTokenDataset", - "BacktranslationDataset", - "BaseWrapperDataset", - "BucketPadLengthDataset", - "ColorizeDataset", - "ConcatDataset", - "ConcatSentencesDataset", - "CountingIterator", - "DenoisingDataset", - "Dictionary", - "EncodedFastaDataset", - "EpochBatchIterator", - "FairseqDataset", - "FairseqIterableDataset", - "FastaDataset", - "GroupedIterator", - "IdDataset", - "IndexedCachedDataset", - "IndexedDataset", - "IndexedRawTextDataset", - "LanguagePairDataset", - "LeftPadDataset", - "ListDataset", - "LMContextWindowDataset", - "LRUCacheDataset", - "MaskTokensDataset", - "MMapIndexedDataset", - "MonolingualDataset", - "MultiCorpusSampledDataset", - "NestedDictionaryDataset", - "NoisingDataset", - "NumelDataset", - "NumSamplesDataset", - "OffsetTokensDataset", - "PadDataset", - "PrependDataset", - "PrependTokenDataset", - "ReplaceDataset", - "RollDataset", - "FileAudioDataset", - "RawLabelDataset", - "ResamplingDataset", - "RightPadDataset", - "RoundRobinZipDatasets", - "SampledMultiDataset", - "SampledMultiEpochDataset", - "ShardedIterator", - "SortDataset", - "StripTokenDataset", - "SubsampleDataset", - "TokenBlockDataset", - "TransformEosDataset", - "TransformEosLangPairDataset", - "TruncateDataset", - "TruncatedDictionary", -] diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/add_target_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/add_target_dataset.py deleted file mode 100644 index 9ef467058b89d9d74f703acbe5b45cb5ef9b2b69..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/add_target_dataset.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch - -from . 
import BaseWrapperDataset, data_utils - - -class AddTargetDataset(BaseWrapperDataset): - def __init__( - self, - dataset, - labels, - pad, - eos, - batch_targets, - process_label=None, - add_to_input=False, - ): - super().__init__(dataset) - self.labels = labels - self.batch_targets = batch_targets - self.pad = pad - self.eos = eos - self.process_label = process_label - self.add_to_input = add_to_input - - def get_label(self, index): - return ( - self.labels[index] - if self.process_label is None - else self.process_label(self.labels[index]) - ) - - def __getitem__(self, index): - item = self.dataset[index] - item["label"] = self.get_label(index) - return item - - def size(self, index): - sz = self.dataset.size(index) - own_sz = len(self.get_label(index)) - return (sz, own_sz) - - def collater(self, samples): - collated = self.dataset.collater(samples) - if len(collated) == 0: - return collated - indices = set(collated["id"].tolist()) - target = [s["label"] for s in samples if s["id"] in indices] - - if self.batch_targets: - collated["target_lengths"] = torch.LongTensor([len(t) for t in target]) - target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False) - collated["ntokens"] = collated["target_lengths"].sum().item() - else: - collated["ntokens"] = sum([len(t) for t in target]) - - collated["target"] = target - - if self.add_to_input: - eos = target.new_full((target.size(0), 1), self.eos) - collated["target"] = torch.cat([target, eos], dim=-1).long() - collated["net_input"]["prev_output_tokens"] = torch.cat( - [eos, target], dim=-1 - ).long() - collated["ntokens"] += target.size(0) - return collated diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/append_token_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/append_token_dataset.py deleted file mode 100644 index 87695bd0f5fcb6b10247e3b743340623e6438cc1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/append_token_dataset.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch - -from . 
import BaseWrapperDataset - - -class AppendTokenDataset(BaseWrapperDataset): - def __init__(self, dataset, token=None): - super().__init__(dataset) - self.token = token - if token is not None: - self._sizes = np.array(dataset.sizes) + 1 - else: - self._sizes = dataset.sizes - - def __getitem__(self, idx): - item = self.dataset[idx] - if self.token is not None: - item = torch.cat([item, item.new([self.token])]) - return item - - @property - def sizes(self): - return self._sizes - - def num_tokens(self, index): - n = self.dataset.num_tokens(index) - if self.token is not None: - n += 1 - return n - - def size(self, index): - n = self.dataset.size(index) - if self.token is not None: - n += 1 - return n diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/audio_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/audio_utils.py deleted file mode 100644 index dd76a1e2f763b55e0fe28ca3fcd1f0782a93440f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/audio_utils.py +++ /dev/null @@ -1,116 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ============================================================================ -import os.path as op -from typing import BinaryIO, Optional, Tuple, Union - -import numpy as np - - -def get_waveform( - path_or_fp: Union[str, BinaryIO], normalization=True -) -> Tuple[np.ndarray, int]: - """Get the waveform and sample rate of a 16-bit mono-channel WAV or FLAC. 
- - Args: - path_or_fp (str or BinaryIO): the path or file-like object - normalization (bool): Normalize values to [-1, 1] (Default: True) - """ - if isinstance(path_or_fp, str): - ext = op.splitext(op.basename(path_or_fp))[1] - if ext not in {".flac", ".wav"}: - raise ValueError(f"Unsupported audio format: {ext}") - - try: - import soundfile as sf - except ImportError: - raise ImportError("Please install soundfile to load WAV/FLAC file") - - waveform, sample_rate = sf.read(path_or_fp, dtype="float32") - if not normalization: - waveform *= 2 ** 15 # denormalized to 16-bit signed integers - return waveform, sample_rate - - -def _get_kaldi_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]: - """Get mel-filter bank features via PyKaldi.""" - try: - from kaldi.feat.mel import MelBanksOptions - from kaldi.feat.fbank import FbankOptions, Fbank - from kaldi.feat.window import FrameExtractionOptions - from kaldi.matrix import Vector - - mel_opts = MelBanksOptions() - mel_opts.num_bins = n_bins - frame_opts = FrameExtractionOptions() - frame_opts.samp_freq = sample_rate - opts = FbankOptions() - opts.mel_opts = mel_opts - opts.frame_opts = frame_opts - fbank = Fbank(opts=opts) - features = fbank.compute(Vector(waveform), 1.0).numpy() - return features - except ImportError: - return None - - -def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarray]: - """Get mel-filter bank features via TorchAudio.""" - try: - import torch - import torchaudio.compliance.kaldi as ta_kaldi - - waveform = torch.from_numpy(waveform).unsqueeze(0) - features = ta_kaldi.fbank( - waveform, num_mel_bins=n_bins, sample_frequency=sample_rate - ) - return features.numpy() - except ImportError: - return None - - -def get_fbank(path_or_fp: Union[str, BinaryIO], n_bins=80) -> np.ndarray: - """Get mel-filter bank features via PyKaldi or TorchAudio. Prefer PyKaldi - (faster CPP implementation) to TorchAudio (Python implementation). 
Note that - Kaldi/TorchAudio requires 16-bit signed integers as inputs and hence the - waveform should not be normalized.""" - sound, sample_rate = get_waveform(path_or_fp, normalization=False) - - features = _get_kaldi_fbank(sound, sample_rate, n_bins) - if features is None: - features = _get_torchaudio_fbank(sound, sample_rate, n_bins) - if features is None: - raise ImportError( - "Please install pyKaldi or torchaudio to enable " - "online filterbank feature extraction" - ) - - return features diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/__init__.py deleted file mode 100644 index 359fa069716cba0dd615ce0959368b20828c31f7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/__init__.py +++ /dev/null @@ -1,82 +0,0 @@ -import importlib -import os -from abc import ABC, abstractmethod -from typing import Dict, Optional - - -class AudioFeatureTransform(ABC): - @classmethod - @abstractmethod - def from_config_dict(cls, config: Optional[Dict] = None): - pass - - -AUDIO_FEATURE_TRANSFORM_REGISTRY = {} -AUDIO_FEATURE_TRANSFORM_CLASS_NAMES = set() - - -def register_audio_feature_transform(name): - def register_audio_feature_transform_cls(cls): - if name in AUDIO_FEATURE_TRANSFORM_REGISTRY: - raise ValueError(f"Cannot register duplicate transform ({name})") - if not issubclass(cls, AudioFeatureTransform): - raise ValueError( - f"Transform ({name}: {cls.__name__}) must extend " - "AudioFeatureTransform" - ) - if cls.__name__ in AUDIO_FEATURE_TRANSFORM_CLASS_NAMES: - raise ValueError( - f"Cannot register audio feature transform with duplicate " - f"class name ({cls.__name__})" - ) - AUDIO_FEATURE_TRANSFORM_REGISTRY[name] = cls - AUDIO_FEATURE_TRANSFORM_CLASS_NAMES.add(cls.__name__) - return cls - - return register_audio_feature_transform_cls - - -def get_audio_feature_transform(name): - return AUDIO_FEATURE_TRANSFORM_REGISTRY[name] - - -transforms_dir = os.path.dirname(__file__) -for file in os.listdir(transforms_dir): - path = os.path.join(transforms_dir, file) - if ( - not file.startswith("_") - and not file.startswith(".") - and (file.endswith(".py") or os.path.isdir(path)) - ): - name = file[: file.find(".py")] if file.endswith(".py") else file - importlib.import_module("fairseq.data.audio.feature_transforms." 
+ name) - - -class CompositeAudioFeatureTransform(AudioFeatureTransform): - @classmethod - def from_config_dict(cls, config=None): - _config = {} if config is None else config - _transforms = _config.get("transforms") - if _transforms is None: - return None - transforms = [ - get_audio_feature_transform(_t).from_config_dict(_config.get(_t)) - for _t in _transforms - ] - return CompositeAudioFeatureTransform(transforms) - - def __init__(self, transforms): - self.transforms = [t for t in transforms if t is not None] - - def __call__(self, x): - for t in self.transforms: - x = t(x) - return x - - def __repr__(self): - format_string = ( - [self.__class__.__name__ + "("] - + [f" {t.__repr__()}" for t in self.transforms] - + [")"] - ) - return "\n".join(format_string) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/global_cmvn.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/global_cmvn.py deleted file mode 100644 index cf9564f2a8f97cada1bf0bd2810004643c17bc25..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/global_cmvn.py +++ /dev/null @@ -1,56 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ============================================================================ -import numpy as np -from fairseq.data.audio.feature_transforms import ( - AudioFeatureTransform, - register_audio_feature_transform, -) - - -@register_audio_feature_transform("global_cmvn") -class GlobalCMVN(AudioFeatureTransform): - """Global CMVN (cepstral mean and variance normalization). 
The global mean - and variance need to be pre-computed and stored in NumPy format (.npz).""" - - @classmethod - def from_config_dict(cls, config=None): - _config = {} if config is None else config - return GlobalCMVN(_config.get("stats_npz_path")) - - def __init__(self, stats_npz_path): - stats = np.load(stats_npz_path) - self.mean, self.std = stats["mean"], stats["std"] - - def __call__(self, x): - x = np.subtract(x, self.mean) - x = np.divide(x, self.std) - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/specaugment.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/specaugment.py deleted file mode 100644 index 1fb4e95f1a4df13bf9fa47a4cc0f0533af51305f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/specaugment.py +++ /dev/null @@ -1,162 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# ============================================================================ -import math -import numbers -from typing import Optional - -import numpy as np -from fairseq.data.audio.feature_transforms import ( - AudioFeatureTransform, - register_audio_feature_transform, -) - - -@register_audio_feature_transform("specaugment") -class SpecAugmentTransform(AudioFeatureTransform): - """SpecAugment (https://arxiv.org/abs/1904.08779)""" - - @classmethod - def from_config_dict(cls, config=None): - _config = {} if config is None else config - return SpecAugmentTransform( - _config.get("time_warp_W", 0), - _config.get("freq_mask_N", 0), - _config.get("freq_mask_F", 0), - _config.get("time_mask_N", 0), - _config.get("time_mask_T", 0), - _config.get("time_mask_p", 0.0), - _config.get("mask_value", None), - ) - - def __init__( - self, - time_warp_w: int = 0, - freq_mask_n: int = 0, - freq_mask_f: int = 0, - time_mask_n: int = 0, - time_mask_t: int = 0, - time_mask_p: float = 0.0, - mask_value: Optional[float] = 0.0, - ): - # Sanity checks - assert mask_value is None or isinstance( - mask_value, numbers.Number - ), f"mask_value (type: {type(mask_value)}) must be None or a number" - if freq_mask_n > 0: - assert freq_mask_f > 0, ( - f"freq_mask_F ({freq_mask_f}) " - f"must be larger than 0 when doing freq masking." - ) - if time_mask_n > 0: - assert time_mask_t > 0, ( - f"time_mask_T ({time_mask_t}) must be larger than 0 when " - f"doing time masking." - ) - - self.time_warp_w = time_warp_w - self.freq_mask_n = freq_mask_n - self.freq_mask_f = freq_mask_f - self.time_mask_n = time_mask_n - self.time_mask_t = time_mask_t - self.time_mask_p = time_mask_p - self.mask_value = mask_value - - def __repr__(self): - return ( - self.__class__.__name__ - + "(" - + ", ".join( - [ - f"time_warp_w={self.time_warp_w}", - f"freq_mask_n={self.freq_mask_n}", - f"freq_mask_f={self.freq_mask_f}", - f"time_mask_n={self.time_mask_n}", - f"time_mask_t={self.time_mask_t}", - f"time_mask_p={self.time_mask_p}", - ] - ) - + ")" - ) - - def __call__(self, spectrogram): - assert len(spectrogram.shape) == 2, "spectrogram must be a 2-D tensor." - - distorted = spectrogram.copy() # make a copy of input spectrogram. - num_frames = spectrogram.shape[0] # or 'tau' in the paper. - num_freqs = spectrogram.shape[1] # or 'miu' in the paper. - mask_value = self.mask_value - - if mask_value is None: # if no value was specified, use local mean. 
- mask_value = spectrogram.mean() - - if num_frames == 0: - return spectrogram - - if num_freqs < self.freq_mask_f: - return spectrogram - - if self.time_warp_w > 0: - if 2 * self.time_warp_w < num_frames: - import cv2 - - w0 = np.random.randint(self.time_warp_w, num_frames - self.time_warp_w) - w = np.random.randint(0, self.time_warp_w) - upper, lower = distorted[:w0, :], distorted[w0:, :] - upper = cv2.resize( - upper, dsize=(num_freqs, w0 + w), interpolation=cv2.INTER_LINEAR - ) - lower = cv2.resize( - lower, - dsize=(num_freqs, num_frames - w0 - w), - interpolation=cv2.INTER_LINEAR, - ) - distorted = np.concatenate((upper, lower), axis=0) - - for _i in range(self.freq_mask_n): - f = np.random.randint(0, self.freq_mask_f) - f0 = np.random.randint(0, num_freqs - f) - if f != 0: - distorted[:, f0 : f0 + f] = mask_value - - max_time_mask_t = min( - self.time_mask_t, math.floor(num_frames * self.time_mask_p) - ) - if max_time_mask_t < 1: - return distorted - - for _i in range(self.time_mask_n): - t = np.random.randint(0, max_time_mask_t) - t0 = np.random.randint(0, num_frames - t) - if t != 0: - distorted[t0 : t0 + t, :] = mask_value - - return distorted diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py deleted file mode 100644 index 149ff5e002554bf3cee6dc6f076f58e745f18b6a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/feature_transforms/utterance_cmvn.py +++ /dev/null @@ -1,71 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-# ============================================================================ -import numpy as np -from fairseq.data.audio.feature_transforms import ( - AudioFeatureTransform, - register_audio_feature_transform, -) - - -@register_audio_feature_transform("utterance_cmvn") -class UtteranceCMVN(AudioFeatureTransform): - """Utterance-level CMVN (cepstral mean and variance normalization)""" - - @classmethod - def from_config_dict(cls, config=None): - _config = {} if config is None else config - return UtteranceCMVN( - _config.get("norm_means", True), - _config.get("norm_vars", True), - ) - - def __init__(self, norm_means=True, norm_vars=True): - self.norm_means, self.norm_vars = norm_means, norm_vars - - def __repr__(self): - return ( - self.__class__.__name__ - + f"(norm_means={self.norm_means}, norm_vars={self.norm_vars})" - ) - - def __call__(self, x): - mean = x.mean(axis=0) - square_sums = (x ** 2).sum(axis=0) - - if self.norm_means: - x = np.subtract(x, mean) - if self.norm_vars: - var = square_sums / x.shape[0] - mean ** 2 - std = np.sqrt(np.maximum(var, 1e-10)) - x = np.divide(x, std) - - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/raw_audio_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/raw_audio_dataset.py deleted file mode 100644 index 8d6ce85ecca2122a3c66c96a3a460721e0c86f0c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/raw_audio_dataset.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -import logging -import os -import sys - -import numpy as np -import torch -import torch.nn.functional as F - -from .. 
import FairseqDataset - - -logger = logging.getLogger(__name__) - - -class RawAudioDataset(FairseqDataset): - def __init__( - self, - sample_rate, - max_sample_size=None, - min_sample_size=None, - shuffle=True, - min_length=0, - pad=False, - normalize=False, - ): - super().__init__() - - self.sample_rate = sample_rate - self.sizes = [] - self.max_sample_size = ( - max_sample_size if max_sample_size is not None else sys.maxsize - ) - self.min_sample_size = min_sample_size - self.min_length = min_length - self.pad = pad - self.shuffle = shuffle - self.normalize = normalize - - def __getitem__(self, index): - raise NotImplementedError() - - def __len__(self): - return len(self.sizes) - - def postprocess(self, feats, curr_sample_rate): - if feats.dim() == 2: - feats = feats.mean(-1) - - if curr_sample_rate != self.sample_rate: - raise Exception(f"sample rate: {curr_sample_rate}, need {self.sample_rate}") - - assert feats.dim() == 1, feats.dim() - - if self.normalize: - with torch.no_grad(): - feats = F.layer_norm(feats, feats.shape) - return feats - - def crop_to_max_size(self, wav, target_size): - size = len(wav) - diff = size - target_size - if diff <= 0: - return wav - - start = np.random.randint(0, diff + 1) - end = size - diff + start - return wav[start:end] - - def collater(self, samples): - samples = [s for s in samples if s["source"] is not None] - if len(samples) == 0: - return {} - - sources = [s["source"] for s in samples] - sizes = [len(s) for s in sources] - - if self.pad: - target_size = min(max(sizes), self.max_sample_size) - else: - target_size = min(min(sizes), self.max_sample_size) - - collated_sources = sources[0].new_zeros(len(sources), target_size) - padding_mask = ( - torch.BoolTensor(collated_sources.shape).fill_(False) if self.pad else None - ) - for i, (source, size) in enumerate(zip(sources, sizes)): - diff = size - target_size - if diff == 0: - collated_sources[i] = source - elif diff < 0: - assert self.pad - collated_sources[i] = torch.cat( - [source, source.new_full((-diff,), 0.0)] - ) - padding_mask[i, diff:] = True - else: - collated_sources[i] = self.crop_to_max_size(source, target_size) - - input = {"source": collated_sources} - if self.pad: - input["padding_mask"] = padding_mask - return {"id": torch.LongTensor([s["id"] for s in samples]), "net_input": input} - - def num_tokens(self, index): - return self.size(index) - - def size(self, index): - """Return an example's size as a float or tuple. This value is used when - filtering a dataset with ``--max-positions``.""" - if self.pad: - return self.sizes[index] - return min(self.sizes[index], self.max_sample_size) - - def ordered_indices(self): - """Return an ordered list of indices. 
Batches will be constructed based - on this order.""" - - if self.shuffle: - order = [np.random.permutation(len(self))] - else: - order = [np.arange(len(self))] - - order.append(self.sizes) - return np.lexsort(order)[::-1] - - -class FileAudioDataset(RawAudioDataset): - def __init__( - self, - manifest_path, - sample_rate, - max_sample_size=None, - min_sample_size=None, - shuffle=True, - min_length=0, - pad=False, - normalize=False, - ): - super().__init__( - sample_rate=sample_rate, - max_sample_size=max_sample_size, - min_sample_size=min_sample_size, - shuffle=shuffle, - min_length=min_length, - pad=pad, - normalize=normalize, - ) - - self.fnames = [] - - skipped = 0 - with open(manifest_path, "r") as f: - self.root_dir = f.readline().strip() - for line in f: - items = line.strip().split("\t") - assert len(items) == 2, line - sz = int(items[1]) - if min_length is not None and sz < min_length: - skipped += 1 - continue - self.fnames.append(items[0]) - self.sizes.append(sz) - logger.info(f"loaded {len(self.fnames)}, skipped {skipped} samples") - - def __getitem__(self, index): - import soundfile as sf - - fname = os.path.join(self.root_dir, self.fnames[index]) - wav, curr_sample_rate = sf.read(fname) - feats = torch.from_numpy(wav).float() - feats = self.postprocess(feats, curr_sample_rate) - return {"id": index, "source": feats} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/speech_to_text_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/speech_to_text_dataset.py deleted file mode 100644 index 6e5fd70e3ceade0d03f8d9f7d65a95b92b39ed35..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/audio/speech_to_text_dataset.py +++ /dev/null @@ -1,528 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import csv -import io -import logging -import os.path as op -import re -from typing import Dict, List, Optional, Tuple - -import numpy as np -import torch -from fairseq.data import ( - ConcatDataset, - Dictionary, - FairseqDataset, - ResamplingDataset, - data_utils as fairseq_data_utils, -) -from fairseq.data.audio.audio_utils import get_fbank, get_waveform -from fairseq.data.audio.feature_transforms import CompositeAudioFeatureTransform - - -logger = logging.getLogger(__name__) - - -class S2TDataConfig(object): - """Wrapper class for data config YAML""" - - def __init__(self, yaml_path): - try: - import yaml - except ImportError: - print("Please install PyYAML to load YAML files for " "S2T data config") - self.config = {} - if op.isfile(yaml_path): - try: - with open(yaml_path) as f: - self.config = yaml.load(f, Loader=yaml.FullLoader) - except Exception as e: - logger.info(f"Failed to load config from {yaml_path}: {e}") - else: - logger.info(f"Cannot find {yaml_path}") - - @property - def vocab_filename(self): - """fairseq vocabulary file under data root""" - return self.config.get("vocab_filename", "dict.txt") - - @property - def shuffle(self) -> bool: - """Shuffle dataset samples before batching""" - return self.config.get("shuffle", False) - - @property - def pre_tokenizer(self) -> Dict: - """Pre-tokenizer to apply before subword tokenization. 
Returning - a dictionary with `tokenizer` providing the tokenizer name and - the other items providing the tokenizer-specific arguments. - Tokenizers are defined in `fairseq.data.encoders.*`""" - return self.config.get("pre_tokenizer", {"tokenizer": None}) - - @property - def bpe_tokenizer(self) -> Dict: - """Subword tokenizer to apply after pre-tokenization. Returning - a dictionary with `bpe` providing the tokenizer name and - the other items providing the tokenizer-specific arguments. - Tokenizers are defined in `fairseq.data.encoders.*`""" - return self.config.get("bpe_tokenizer", None) - - @property - def prepend_tgt_lang_tag(self) -> bool: - """Prepend target lang ID token as the target BOS (e.g. for to-many - multilingual setting). During inference, this requires `--prefix-size 1` - to force BOS to be lang ID token.""" - return self.config.get("prepend_tgt_lang_tag", False) - - @property - def input_feat_per_channel(self): - """The dimension of input features (per audio channel)""" - return self.config.get("input_feat_per_channel", 80) - - @property - def input_channels(self): - """The number of channels in the input audio""" - return self.config.get("input_channels", 1) - - @property - def sampling_alpha(self): - """Hyper-parameter alpha = 1/T for temperature-based resampling. - (alpha = 1 for no resampling)""" - return self.config.get("sampling_alpha", 1.0) - - @property - def use_audio_input(self): - """Needed by the dataset loader to see if the model requires - raw audio as inputs.""" - return self.config.get("use_audio_input", False) - - @property - def audio_root(self): - """Audio paths in the manifest TSV can be relative and this provides - the root path. Set this to empty string when using absolute paths.""" - return self.config.get("audio_root", "") - - def get_feature_transforms(self, split, is_train): - """Split-specific feature transforms. 
Allowing train set wildcard `_train`, - evaluation set wildcard `_eval` and general wildcard `*` for matching.""" - from copy import deepcopy - - cfg = deepcopy(self.config) - _cur = cfg.get("transforms", {}) - cur = _cur.get(split) - cur = _cur.get("_train") if cur is None and is_train else cur - cur = _cur.get("_eval") if cur is None and not is_train else cur - cur = _cur.get("*") if cur is None else cur - cfg["transforms"] = cur - return cfg - - -def is_npy_data(data: bytes) -> bool: - return data[0] == 147 and data[1] == 78 - - -def is_flac_or_wav_data(data: bytes) -> bool: - is_flac = data[0] == 102 and data[1] == 76 - is_wav = data[0] == 82 and data[1] == 73 - return is_flac or is_wav - - -def read_from_uncompressed_zip(file_path, offset, file_size) -> bytes: - with open(file_path, "rb") as f: - f.seek(offset) - data = f.read(file_size) - return data - - -def get_features_from_npy_or_audio(path): - ext = op.splitext(op.basename(path))[1] - if ext not in {".npy", ".flac", ".wav"}: - raise ValueError(f'Unsupported file format for "{path}"') - return np.load(path) if ext == ".npy" else get_fbank(path) - - -def get_features_or_waveform_from_uncompressed_zip( - path, byte_offset, byte_size, need_waveform=False -): - assert path.endswith(".zip") - data = read_from_uncompressed_zip(path, byte_offset, byte_size) - f = io.BytesIO(data) - if is_npy_data(data): - features_or_waveform = np.load(f) - elif is_flac_or_wav_data(data): - features_or_waveform = get_waveform(f)[0] if need_waveform else get_fbank(f) - else: - raise ValueError(f'Unknown file format for "{path}"') - return features_or_waveform - - -def get_features_or_waveform(path: str, need_waveform=False): - """Get speech features from .npy file or waveform from .wav/.flac file. - The file may be inside an uncompressed ZIP file and is accessed via byte - offset and length. - - Args: - path (str): File path in the format of "<.npy/.wav/.flac path>" or - "::". - need_waveform (bool): return waveform instead of features. - - Returns: - features_or_waveform (numpy.ndarray): speech features or waveform. - """ - _path, *extra = path.split(":") - if not op.exists(_path): - raise FileNotFoundError(f"File not found: {_path}") - - if len(extra) == 0: - if need_waveform: - return get_waveform(_path) - return get_features_from_npy_or_audio(_path) - elif len(extra) == 2: - extra = [int(i) for i in extra] - features_or_waveform = get_features_or_waveform_from_uncompressed_zip( - _path, extra[0], extra[1], need_waveform=need_waveform - ) - else: - raise ValueError(f"Invalid path: {path}") - - return features_or_waveform - - -def _collate_frames( - frames: List[torch.Tensor], is_audio_input: bool = False -) -> torch.Tensor: - """ - Convert a list of 2D frames into a padded 3D tensor - Args: - frames (list): list of 2D frames of size L[i]*f_dim. 
Where L[i] is - length of i-th frame and f_dim is static dimension of features - Returns: - 3D tensor of size len(frames)*len_max*f_dim where len_max is max of L[i] - """ - max_len = max(frame.size(0) for frame in frames) - if is_audio_input: - out = frames[0].new_zeros((len(frames), max_len)) - else: - out = frames[0].new_zeros((len(frames), max_len, frames[0].size(1))) - for i, v in enumerate(frames): - out[i, : v.size(0)] = v - return out - - -class SpeechToTextDataset(FairseqDataset): - LANG_TAG_TEMPLATE = "" - - def __init__( - self, - split: str, - is_train_split: bool, - data_cfg: S2TDataConfig, - audio_paths: List[str], - n_frames: List[int], - src_texts: Optional[List[str]] = None, - tgt_texts: Optional[List[str]] = None, - speakers: Optional[List[str]] = None, - src_langs: Optional[List[str]] = None, - tgt_langs: Optional[List[str]] = None, - ids: Optional[List[str]] = None, - tgt_dict: Optional[Dictionary] = None, - pre_tokenizer=None, - bpe_tokenizer=None, - ): - self.split, self.is_train_split = split, is_train_split - self.data_cfg = data_cfg - self.audio_paths, self.n_frames = audio_paths, n_frames - self.n_samples = len(audio_paths) - assert len(n_frames) == self.n_samples > 0 - assert src_texts is None or len(src_texts) == self.n_samples - assert tgt_texts is None or len(tgt_texts) == self.n_samples - assert speakers is None or len(speakers) == self.n_samples - assert src_langs is None or len(src_langs) == self.n_samples - assert tgt_langs is None or len(tgt_langs) == self.n_samples - assert ids is None or len(ids) == self.n_samples - assert (tgt_dict is None and tgt_texts is None) or ( - tgt_dict is not None and tgt_texts is not None - ) - self.tgt_dict = tgt_dict - self.check_tgt_lang_tag() - self.src_texts, self.tgt_texts = src_texts, tgt_texts - self.src_langs, self.tgt_langs = src_langs, tgt_langs - self.ids = ids - self.shuffle = data_cfg.shuffle if is_train_split else False - - self.feature_transforms = CompositeAudioFeatureTransform.from_config_dict( - self.data_cfg.get_feature_transforms(split, is_train_split) - ) - - self.pre_tokenizer = pre_tokenizer - self.bpe_tokenizer = bpe_tokenizer - - logger.info(self.__repr__()) - - def __repr__(self): - return ( - self.__class__.__name__ - + f'(split="{self.split}", n_samples={self.n_samples}, ' - f"prepend_tgt_lang_tag={self.data_cfg.prepend_tgt_lang_tag}, " - f"shuffle={self.shuffle}, transforms={self.feature_transforms})" - ) - - @classmethod - def is_lang_tag(cls, token): - pattern = cls.LANG_TAG_TEMPLATE.replace("{}", "(.*)") - return re.match(pattern, token) - - def check_tgt_lang_tag(self): - if self.data_cfg.prepend_tgt_lang_tag: - assert self.tgt_langs is not None and self.tgt_dict is not None - tgt_lang_tags = [ - self.LANG_TAG_TEMPLATE.format(t) for t in set(self.tgt_langs) - ] - assert all(t in self.tgt_dict for t in tgt_lang_tags) - - def tokenize_text(self, text: str): - if self.pre_tokenizer is not None: - text = self.pre_tokenizer.encode(text) - if self.bpe_tokenizer is not None: - text = self.bpe_tokenizer.encode(text) - return text - - def __getitem__( - self, index: int - ) -> Tuple[int, torch.Tensor, Optional[torch.Tensor]]: - source = get_features_or_waveform( - self.audio_paths[index], need_waveform=self.data_cfg.use_audio_input - ) - if self.feature_transforms is not None: - assert not self.data_cfg.use_audio_input - source = self.feature_transforms(source) - source = torch.from_numpy(source).float() - - target = None - if self.tgt_texts is not None: - tokenized = 
self.tokenize_text(self.tgt_texts[index]) - target = self.tgt_dict.encode_line( - tokenized, add_if_not_exist=False, append_eos=True - ).long() - if self.data_cfg.prepend_tgt_lang_tag: - lang_tag = self.LANG_TAG_TEMPLATE.format(self.tgt_langs[index]) - lang_tag_idx = self.tgt_dict.index(lang_tag) - target = torch.cat((torch.LongTensor([lang_tag_idx]), target), 0) - return index, source, target - - def __len__(self): - return self.n_samples - - def collater(self, samples: List[Tuple[int, torch.Tensor, torch.Tensor]]) -> Dict: - if len(samples) == 0: - return {} - indices = torch.tensor([i for i, _, _ in samples], dtype=torch.long) - frames = _collate_frames( - [s for _, s, _ in samples], self.data_cfg.use_audio_input - ) - # sort samples by descending number of frames - n_frames = torch.tensor([s.size(0) for _, s, _ in samples], dtype=torch.long) - n_frames, order = n_frames.sort(descending=True) - indices = indices.index_select(0, order) - frames = frames.index_select(0, order) - - target, target_lengths = None, None - prev_output_tokens = None - ntokens = None - if self.tgt_texts is not None: - target = fairseq_data_utils.collate_tokens( - [t for _, _, t in samples], - self.tgt_dict.pad(), - self.tgt_dict.eos(), - left_pad=False, - move_eos_to_beginning=False, - ) - target = target.index_select(0, order) - target_lengths = torch.tensor( - [t.size(0) for _, _, t in samples], dtype=torch.long - ).index_select(0, order) - prev_output_tokens = fairseq_data_utils.collate_tokens( - [t for _, _, t in samples], - self.tgt_dict.pad(), - self.tgt_dict.eos(), - left_pad=False, - move_eos_to_beginning=True, - ) - prev_output_tokens = prev_output_tokens.index_select(0, order) - ntokens = sum(t.size(0) for _, _, t in samples) - - out = { - "id": indices, - "net_input": { - "src_tokens": frames, - "src_lengths": n_frames, - "prev_output_tokens": prev_output_tokens, - }, - "target": target, - "target_lengths": target_lengths, - "ntokens": ntokens, - "nsentences": len(samples), - } - return out - - def num_tokens(self, index): - return self.n_frames[index] - - def size(self, index): - t_len = 0 - if self.tgt_texts is not None: - tokenized = self.tokenize_text(self.tgt_texts[index]) - t_len = len(tokenized.split(" ")) - return self.n_frames[index], t_len - - @property - def sizes(self): - return np.array(self.n_frames) - - @property - def can_reuse_epoch_itr_across_epochs(self): - return True - - def ordered_indices(self): - if self.shuffle: - order = [np.random.permutation(len(self))] - else: - order = [np.arange(len(self))] - # first by descending order of # of frames then by original/random order - order.append([-n for n in self.n_frames]) - return np.lexsort(order) - - def prefetch(self, indices): - raise False - - -class SpeechToTextDatasetCreator(object): - # mandatory columns - KEY_ID, KEY_AUDIO, KEY_N_FRAMES = "id", "audio", "n_frames" - KEY_TGT_TEXT = "tgt_text" - # optional columns - KEY_SPEAKER, KEY_SRC_TEXT = "speaker", "src_text" - KEY_SRC_LANG, KEY_TGT_LANG = "src_lang", "tgt_lang" - # default values - DEFAULT_SPEAKER = DEFAULT_SRC_TEXT = DEFAULT_LANG = "" - - @classmethod - def _from_list( - cls, - split_name: str, - is_train_split, - samples: List[List[Dict]], - data_cfg: S2TDataConfig, - tgt_dict, - pre_tokenizer, - bpe_tokenizer, - ) -> SpeechToTextDataset: - audio_paths, n_frames, src_texts, tgt_texts, ids = [], [], [], [], [] - speakers, src_langs, tgt_langs = [], [], [] - for s in samples: - ids.extend([ss[cls.KEY_ID] for ss in s]) - audio_paths.extend( - [op.join(data_cfg.audio_root, 
ss[cls.KEY_AUDIO]) for ss in s] - ) - n_frames.extend([int(ss[cls.KEY_N_FRAMES]) for ss in s]) - tgt_texts.extend([ss[cls.KEY_TGT_TEXT] for ss in s]) - src_texts.extend( - [ss.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for ss in s] - ) - speakers.extend([ss.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for ss in s]) - src_langs.extend([ss.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for ss in s]) - tgt_langs.extend([ss.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for ss in s]) - return SpeechToTextDataset( - split_name, - is_train_split, - data_cfg, - audio_paths, - n_frames, - src_texts, - tgt_texts, - speakers, - src_langs, - tgt_langs, - ids, - tgt_dict, - pre_tokenizer, - bpe_tokenizer, - ) - - @classmethod - def _get_size_ratios(cls, ids: List[str], sizes: List[int], alpha: float = 1.0): - """Size ratios for temperature-based sampling - (https://arxiv.org/abs/1907.05019)""" - _sizes = np.array(sizes) - prob = _sizes / _sizes.sum() - smoothed_prob = prob ** alpha - smoothed_prob = smoothed_prob / smoothed_prob.sum() - size_ratio = (smoothed_prob * _sizes.sum()) / _sizes - - o_str = str({_i: f"{prob[i]:.3f}" for i, _i in enumerate(ids)}) - logger.info(f"original sampling probability: {o_str}") - p_str = str({_i: f"{smoothed_prob[i]:.3f}" for i, _i in enumerate(ids)}) - logger.info(f"balanced sampling probability: {p_str}") - sr_str = str({_id: f"{size_ratio[i]:.3f}" for i, _id in enumerate(ids)}) - logger.info(f"balanced sampling size ratio: {sr_str}") - return size_ratio.tolist() - - @classmethod - def from_tsv( - cls, - root: str, - data_cfg: S2TDataConfig, - splits: str, - tgt_dict, - pre_tokenizer, - bpe_tokenizer, - is_train_split: bool, - epoch: int, - seed: int, - ) -> SpeechToTextDataset: - samples = [] - _splits = splits.split(",") - for split in _splits: - tsv_path = op.join(root, f"{split}.tsv") - if not op.isfile(tsv_path): - raise FileNotFoundError(f"Dataset not found: {tsv_path}") - with open(tsv_path) as f: - reader = csv.DictReader( - f, - delimiter="\t", - quotechar=None, - doublequote=False, - lineterminator="\n", - quoting=csv.QUOTE_NONE, - ) - samples.append([dict(e) for e in reader]) - assert len(samples) > 0 - - datasets = [ - cls._from_list( - name, - is_train_split, - [s], - data_cfg, - tgt_dict, - pre_tokenizer, - bpe_tokenizer, - ) - for name, s in zip(_splits, samples) - ] - - if is_train_split and len(_splits) > 1 and data_cfg.sampling_alpha != 1.0: - # temperature-based sampling - size_ratios = cls._get_size_ratios( - _splits, [len(s) for s in samples], alpha=data_cfg.sampling_alpha - ) - datasets = [ - ResamplingDataset( - d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0) - ) - for d, r in zip(datasets, size_ratios) - ] - return ConcatDataset(datasets) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/backtranslation_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/backtranslation_dataset.py deleted file mode 100644 index 8f70c90df3d237077537993e125d366c95292f1a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/backtranslation_dataset.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -from fairseq import utils - -from . 
import FairseqDataset - - -def backtranslate_samples(samples, collate_fn, generate_fn, cuda=True): - """Backtranslate a list of samples. - - Given an input (*samples*) of the form: - - [{'id': 1, 'source': 'hallo welt'}] - - this will return: - - [{'id': 1, 'source': 'hello world', 'target': 'hallo welt'}] - - Args: - samples (List[dict]): samples to backtranslate. Individual samples are - expected to have a 'source' key, which will become the 'target' - after backtranslation. - collate_fn (callable): function to collate samples into a mini-batch - generate_fn (callable): function to generate backtranslations - cuda (bool): use GPU for generation (default: ``True``) - - Returns: - List[dict]: an updated list of samples with a backtranslated source - """ - collated_samples = collate_fn(samples) - s = utils.move_to_cuda(collated_samples) if cuda else collated_samples - generated_sources = generate_fn(s) - - id_to_src = {sample["id"]: sample["source"] for sample in samples} - - # Go through each tgt sentence in batch and its corresponding best - # generated hypothesis and create a backtranslation data pair - # {id: id, source: generated backtranslation, target: original tgt} - return [ - { - "id": id.item(), - "target": id_to_src[id.item()], - "source": hypos[0]["tokens"].cpu(), - } - for id, hypos in zip(collated_samples["id"], generated_sources) - ] - - -class BacktranslationDataset(FairseqDataset): - """ - Sets up a backtranslation dataset which takes a tgt batch, generates - a src using a tgt-src backtranslation function (*backtranslation_fn*), - and returns the corresponding `{generated src, input tgt}` batch. - - Args: - tgt_dataset (~fairseq.data.FairseqDataset): the dataset to be - backtranslated. Only the source side of this dataset will be used. - After backtranslation, the source sentences in this dataset will be - returned as the targets. - src_dict (~fairseq.data.Dictionary): the dictionary of backtranslated - sentences. - tgt_dict (~fairseq.data.Dictionary, optional): the dictionary of - sentences to be backtranslated. - backtranslation_fn (callable, optional): function to call to generate - backtranslations. This is typically the `generate` method of a - :class:`~fairseq.sequence_generator.SequenceGenerator` object. - Pass in None when it is not available at initialization time, and - use set_backtranslation_fn function to set it when available. - output_collater (callable, optional): function to call on the - backtranslated samples to create the final batch - (default: ``tgt_dataset.collater``). - cuda: use GPU for generation - """ - - def __init__( - self, - tgt_dataset, - src_dict, - tgt_dict=None, - backtranslation_fn=None, - output_collater=None, - cuda=True, - **kwargs - ): - self.tgt_dataset = tgt_dataset - self.backtranslation_fn = backtranslation_fn - self.output_collater = ( - output_collater if output_collater is not None else tgt_dataset.collater - ) - self.cuda = cuda if torch.cuda.is_available() else False - self.src_dict = src_dict - self.tgt_dict = tgt_dict - - def __getitem__(self, index): - """ - Returns a single sample from *tgt_dataset*. Note that backtranslation is - not applied in this step; use :func:`collater` instead to backtranslate - a batch of samples. - """ - return self.tgt_dataset[index] - - def __len__(self): - return len(self.tgt_dataset) - - def set_backtranslation_fn(self, backtranslation_fn): - self.backtranslation_fn = backtranslation_fn - - def collater(self, samples): - """Merge and backtranslate a list of samples to form a mini-batch. 
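The pairing performed by `backtranslate_samples` above can be illustrated with stub collate/generate functions standing in for fairseq's real collater and `SequenceGenerator`; everything below is a toy stand-in, and only the id-matching logic mirrors the function:

```python
import torch

samples = [
    {"id": 1, "source": torch.LongTensor([11, 12, 2])},
    {"id": 4, "source": torch.LongTensor([21, 2])},
]
# Toy stand-ins for collate_fn and generate_fn.
collated = {"id": torch.LongTensor([s["id"] for s in samples])}
generated = [[{"tokens": torch.LongTensor([7, 8, 9])}] for _ in samples]

# As in backtranslate_samples: best hypothesis -> new source,
# original source -> target, matched up by sample id.
id_to_src = {s["id"]: s["source"] for s in samples}
pairs = [
    {"id": i.item(), "source": h[0]["tokens"].cpu(), "target": id_to_src[i.item()]}
    for i, h in zip(collated["id"], generated)
]
```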
- - Using the samples from *tgt_dataset*, load a collated target sample to - feed to the backtranslation model. Then take the backtranslation with - the best score as the source and the original input as the target. - - Note: we expect *tgt_dataset* to provide a function `collater()` that - will collate samples into the format expected by *backtranslation_fn*. - After backtranslation, we will feed the new list of samples (i.e., the - `(backtranslated source, original source)` pairs) to *output_collater* - and return the result. - - Args: - samples (List[dict]): samples to backtranslate and collate - - Returns: - dict: a mini-batch with keys coming from *output_collater* - """ - if samples[0].get("is_dummy", False): - return samples - samples = backtranslate_samples( - samples=samples, - collate_fn=self.tgt_dataset.collater, - generate_fn=(lambda net_input: self.backtranslation_fn(net_input)), - cuda=self.cuda, - ) - return self.output_collater(samples) - - def num_tokens(self, index): - """Just use the tgt dataset num_tokens""" - return self.tgt_dataset.num_tokens(index) - - def ordered_indices(self): - """Just use the tgt dataset ordered_indices""" - return self.tgt_dataset.ordered_indices() - - def size(self, index): - """Return an example's size as a float or tuple. This value is used - when filtering a dataset with ``--max-positions``. - - Note: we use *tgt_dataset* to approximate the length of the source - sentence, since we do not know the actual length until after - backtranslation. - """ - tgt_size = self.tgt_dataset.size(index)[0] - return (tgt_size, tgt_size) - - @property - def supports_prefetch(self): - return getattr(self.tgt_dataset, "supports_prefetch", False) - - def prefetch(self, indices): - return self.tgt_dataset.prefetch(indices) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/base_wrapper_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/base_wrapper_dataset.py deleted file mode 100644 index 134d398b47dc73c8807759188504aee205b3b34d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/base_wrapper_dataset.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from torch.utils.data.dataloader import default_collate - -from . 
import FairseqDataset - - -class BaseWrapperDataset(FairseqDataset): - def __init__(self, dataset): - super().__init__() - self.dataset = dataset - - def __getitem__(self, index): - return self.dataset[index] - - def __len__(self): - return len(self.dataset) - - def collater(self, samples): - if hasattr(self.dataset, "collater"): - return self.dataset.collater(samples) - else: - return default_collate(samples) - - @property - def sizes(self): - return self.dataset.sizes - - def num_tokens(self, index): - return self.dataset.num_tokens(index) - - def size(self, index): - return self.dataset.size(index) - - def ordered_indices(self): - return self.dataset.ordered_indices() - - @property - def supports_prefetch(self): - return getattr(self.dataset, "supports_prefetch", False) - - def attr(self, attr: str, index: int): - return self.dataset.attr(attr, index) - - def prefetch(self, indices): - self.dataset.prefetch(indices) - - def get_batch_shapes(self): - return self.dataset.get_batch_shapes() - - def batch_by_size( - self, - indices, - max_tokens=None, - max_sentences=None, - required_batch_size_multiple=1, - ): - return self.dataset.batch_by_size( - indices, - max_tokens=max_tokens, - max_sentences=max_sentences, - required_batch_size_multiple=required_batch_size_multiple, - ) - - def filter_indices_by_size(self, indices, max_sizes): - return self.dataset.filter_indices_by_size(indices, max_sizes) - - @property - def can_reuse_epoch_itr_across_epochs(self): - return self.dataset.can_reuse_epoch_itr_across_epochs - - def set_epoch(self, epoch): - super().set_epoch(epoch) - if hasattr(self.dataset, "set_epoch"): - self.dataset.set_epoch(epoch) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/bucket_pad_length_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/bucket_pad_length_dataset.py deleted file mode 100644 index cda8834ac86db512c29fe7a0c12ce96d84c0c2c8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/bucket_pad_length_dataset.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch.nn.functional as F -from fairseq.data import BaseWrapperDataset - - -class BucketPadLengthDataset(BaseWrapperDataset): - """ - Bucket and pad item lengths to the nearest bucket size. This can be used to - reduce the number of unique batch shapes, which is important on TPUs since - each new batch shape requires a recompilation. 
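To see why bucketing reduces shape variety, here is a small numeric sketch. The bucket edges are invented for the example (the class below derives them from length percentiles); every item is padded up to the nearest edge, so only a handful of distinct batch widths ever reach the model:

```python
import numpy as np

buckets = np.array([8, 16, 32])     # hypothetical bucket edges
sizes = np.array([5, 9, 14, 30])    # raw item lengths

# Round every length up to the nearest bucket edge, as get_bucketed_sizes does.
bucketed = buckets[np.searchsorted(buckets, sizes)]
# bucketed -> array([ 8, 16, 16, 32]); a length-9 item receives 16 - 9 = 7 pad tokens.
```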
- - Args: - dataset (FairseqDatset): dataset to bucket - sizes (List[int]): all item sizes - num_buckets (int): number of buckets to create - pad_idx (int): padding symbol - left_pad (bool): if True, pad on the left; otherwise right pad - """ - - def __init__( - self, - dataset, - sizes, - num_buckets, - pad_idx, - left_pad, - ): - super().__init__(dataset) - self.pad_idx = pad_idx - self.left_pad = left_pad - - assert num_buckets > 0 - self.buckets = np.unique( - np.percentile( - sizes, - np.linspace(0, 100, num_buckets + 1), - interpolation="lower", - )[1:] - ) - - def get_bucketed_sizes(orig_sizes, buckets): - sizes = np.copy(orig_sizes) - assert np.min(sizes) >= 0 - start_val = -1 - for end_val in buckets: - mask = (sizes > start_val) & (sizes <= end_val) - sizes[mask] = end_val - start_val = end_val - return sizes - - self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets) - - def __getitem__(self, index): - item = self.dataset[index] - bucket_size = self._bucketed_sizes[index] - num_pad = bucket_size - item.size(-1) - return F.pad( - item, - (num_pad if self.left_pad else 0, 0 if self.left_pad else num_pad), - value=self.pad_idx, - ) - - @property - def sizes(self): - return self._bucketed_sizes - - def num_tokens(self, index): - return self._bucketed_sizes[index] - - def size(self, index): - return self._bucketed_sizes[index] diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/colorize_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/colorize_dataset.py deleted file mode 100644 index 6ef097bff1a013f4944b1cb55e1e7e4e2480b3a6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/colorize_dataset.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch - -from . import BaseWrapperDataset - - -class ColorizeDataset(BaseWrapperDataset): - """ Adds 'colors' property to net input that is obtained from the provided color getter for use by models """ - - def __init__(self, dataset, color_getter): - super().__init__(dataset) - self.color_getter = color_getter - - def collater(self, samples): - base_collate = super().collater(samples) - if len(base_collate) > 0: - base_collate["net_input"]["colors"] = torch.tensor( - list(self.color_getter(self.dataset, s["id"]) for s in samples), - dtype=torch.long, - ) - return base_collate diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/concat_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/concat_dataset.py deleted file mode 100644 index 01a4078bb159fa44b2d1062b9a971fe7f1abd1c2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/concat_dataset.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import bisect - -import numpy as np -from torch.utils.data.dataloader import default_collate - -from . 
import FairseqDataset - - -class ConcatDataset(FairseqDataset): - @staticmethod - def cumsum(sequence, sample_ratios): - r, s = [], 0 - for e, ratio in zip(sequence, sample_ratios): - curr_len = int(ratio * len(e)) - r.append(curr_len + s) - s += curr_len - return r - - def __init__(self, datasets, sample_ratios=1): - super(ConcatDataset, self).__init__() - assert len(datasets) > 0, "datasets should not be an empty iterable" - self.datasets = list(datasets) - if isinstance(sample_ratios, int): - sample_ratios = [sample_ratios] * len(self.datasets) - self.sample_ratios = sample_ratios - self.cumulative_sizes = self.cumsum(self.datasets, sample_ratios) - self.real_sizes = [len(d) for d in self.datasets] - - def __len__(self): - return self.cumulative_sizes[-1] - - def __getitem__(self, idx): - dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx) - return self.datasets[dataset_idx][sample_idx] - - def _get_dataset_and_sample_index(self, idx: int): - dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) - if dataset_idx == 0: - sample_idx = idx - else: - sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] - sample_idx = sample_idx % self.real_sizes[dataset_idx] - return dataset_idx, sample_idx - - def collater(self, samples, **extra_args): - # For now only supports datasets with same underlying collater implementations - if hasattr(self.datasets[0], "collater"): - return self.datasets[0].collater(samples, **extra_args) - else: - return default_collate(samples, **extra_args) - - def size(self, idx: int): - """ - Return an example's size as a float or tuple. - """ - dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx) - return self.datasets[dataset_idx].size(sample_idx) - - def num_tokens(self, index: int): - return np.max(self.size(index)) - - def attr(self, attr: str, index: int): - dataset_idx = bisect.bisect_right(self.cumulative_sizes, index) - return getattr(self.datasets[dataset_idx], attr, None) - - @property - def sizes(self): - _dataset_sizes = [] - for ds, sr in zip(self.datasets, self.sample_ratios): - if isinstance(ds.sizes, np.ndarray): - _dataset_sizes.append(np.tile(ds.sizes, sr)) - else: - # Only support underlying dataset with single size array. - assert isinstance(ds.sizes, list) - _dataset_sizes.append(np.tile(ds.sizes[0], sr)) - return np.concatenate(_dataset_sizes) - - @property - def supports_prefetch(self): - return all(d.supports_prefetch for d in self.datasets) - - def ordered_indices(self): - """ - Returns indices sorted by length. So less padding is needed. 
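The index arithmetic in `ConcatDataset` above is easiest to see with concrete numbers; a sketch with two hypothetical datasets, the first upsampled twice:

```python
import bisect

real_sizes = [3, 5]            # lengths of the two wrapped datasets
sample_ratios = [2, 1]         # upsample the first one 2x
cumulative, s = [], 0
for n, r in zip(real_sizes, sample_ratios):
    s += int(r * n)
    cumulative.append(s)       # -> [6, 11], so len(concat) == 11

idx = 7                                              # a global index
dataset_idx = bisect.bisect_right(cumulative, idx)   # -> 1 (second dataset)
sample_idx = (idx - cumulative[dataset_idx - 1]) % real_sizes[dataset_idx]
# -> (7 - 6) % 5 = 1: global index 7 reads element 1 of the second dataset
```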
- """ - if isinstance(self.sizes, np.ndarray) and len(self.sizes.shape) > 1: - # special handling for concatenating lang_pair_datasets - indices = np.arange(len(self)) - sizes = self.sizes - tgt_sizes = ( - sizes[:, 1] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else None - ) - src_sizes = ( - sizes[:, 0] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else sizes - ) - # sort by target length, then source length - if tgt_sizes is not None: - indices = indices[np.argsort(tgt_sizes[indices], kind="mergesort")] - return indices[np.argsort(src_sizes[indices], kind="mergesort")] - else: - return np.argsort(self.sizes) - - def prefetch(self, indices): - frm = 0 - for to, ds in zip(self.cumulative_sizes, self.datasets): - real_size = len(ds) - if getattr(ds, "supports_prefetch", False): - ds.prefetch([(i - frm) % real_size for i in indices if frm <= i < to]) - frm = to - - @property - def can_reuse_epoch_itr_across_epochs(self): - return all(d.can_reuse_epoch_itr_across_epochs for d in self.datasets) - - def set_epoch(self, epoch): - super().set_epoch(epoch) - for ds in self.datasets: - if hasattr(ds, "set_epoch"): - ds.set_epoch(epoch) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/concat_sentences_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/concat_sentences_dataset.py deleted file mode 100644 index 625a29370e90f9d1d7274024afb902ed83a22325..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/concat_sentences_dataset.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch - -from . import FairseqDataset - - -class ConcatSentencesDataset(FairseqDataset): - def __init__(self, *datasets): - super().__init__() - self.datasets = datasets - assert all( - len(ds) == len(datasets[0]) for ds in datasets - ), "datasets must have the same length" - - def __getitem__(self, index): - return torch.cat([ds[index] for ds in self.datasets]) - - def __len__(self): - return len(self.datasets[0]) - - def collater(self, samples): - return self.datasets[0].collater(samples) - - @property - def sizes(self): - return sum(ds.sizes for ds in self.datasets) - - def num_tokens(self, index): - return sum(ds.num_tokens(index) for ds in self.datasets) - - def size(self, index): - return sum(ds.size(index) for ds in self.datasets) - - def ordered_indices(self): - return self.datasets[0].ordered_indices() - - @property - def supports_prefetch(self): - return any(getattr(ds, "supports_prefetch", False) for ds in self.datasets) - - def prefetch(self, indices): - for ds in self.datasets: - if getattr(ds, "supports_prefetch", False): - ds.prefetch(indices) - - def set_epoch(self, epoch): - super().set_epoch(epoch) - for ds in self.datasets: - if hasattr(ds, "set_epoch"): - ds.set_epoch(epoch) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/data_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/data_utils.py deleted file mode 100644 index e386fe86a35f6ed9fd395699516a4ea178a93159..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/data_utils.py +++ /dev/null @@ -1,526 +0,0 @@ -# Copyright (c) Facebook, Inc. 
and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -try: - from collections.abc import Iterable -except ImportError: - from collections import Iterable -import contextlib -import itertools -import logging -import os -import warnings -from typing import Optional, Tuple - -import numpy as np -import torch - - -logger = logging.getLogger(__name__) - - -def infer_language_pair(path): - """Infer language pair from filename: .-.(...).idx""" - src, dst = None, None - for filename in os.listdir(path): - parts = filename.split(".") - if len(parts) >= 3 and len(parts[1].split("-")) == 2: - return parts[1].split("-") - return src, dst - - -def collate_tokens( - values, - other_values, - pad_idx, - eos_idx=None, - left_pad=False, - move_eos_to_beginning=False, - pad_to_length=None, - pad_to_multiple=1, -): - """Convert a list of 1d tensors into a padded 2d tensor.""" - size = max(max(v.size(0) for v in values), max(ov.size(0) for ov in other_values)) - buckets = [16, 32, 64, 128, 256, 512, 1024] - for buck in buckets: - if size <= buck: - size = buck - break - size = size if pad_to_length is None else max(size, pad_to_length) - if pad_to_multiple != 1 and size % pad_to_multiple != 0: - size = int(((size - 0.1) // pad_to_multiple + 1) * pad_to_multiple) - res = values[0].new(len(values), size).fill_(pad_idx) - - def copy_tensor(src, dst): - assert dst.numel() == src.numel() - if move_eos_to_beginning: - if eos_idx is None: - # if no eos_idx is specified, then use the last token in src - dst[0] = src[-1] - else: - dst[0] = eos_idx - dst[1:] = src[:-1] - else: - dst.copy_(src) - - for i, v in enumerate(values): - copy_tensor(v, res[i][size - len(v) :] if left_pad else res[i][: len(v)]) - return res - - -def load_indexed_dataset( - path, dictionary=None, dataset_impl=None, combine=False, default="cached" -): - """A helper function for loading indexed datasets. - - Args: - path (str): path to indexed dataset (e.g., 'data-bin/train') - dictionary (~fairseq.data.Dictionary): data dictionary - dataset_impl (str, optional): which dataset implementation to use. If - not provided, it will be inferred automatically. For legacy indexed - data we use the 'cached' implementation by default. - combine (bool, optional): automatically load and combine multiple - datasets. For example, if *path* is 'data-bin/train', then we will - combine 'data-bin/train', 'data-bin/train1', ... and return a - single ConcatDataset instance. 
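This copy of `collate_tokens` appears to differ from upstream fairseq in that it first rounds the batch width up to a fixed bucket from {16, 32, ..., 1024}, presumably to limit the number of distinct padded shapes. A sketch of the resulting padding using plain torch ops rather than the function itself (token ids and pad/eos indices are arbitrary):

```python
import torch

pad_idx, eos_idx = 1, 2
values = [torch.LongTensor([5, 6, 7, eos_idx]), torch.LongTensor([9, eos_idx])]

longest = max(v.size(0) for v in values)                                    # 4
width = next(b for b in [16, 32, 64, 128, 256, 512, 1024] if longest <= b)  # 16

out = values[0].new_full((len(values), width), pad_idx)
for i, v in enumerate(values):
    out[i, : v.size(0)] = v            # right padding (left_pad=False)
# out[1] -> tensor([9, 2, 1, 1, ...]) with 14 trailing pad tokens
```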
- """ - from fairseq.data.concat_dataset import ConcatDataset - import fairseq.data.indexed_dataset as indexed_dataset - - datasets = [] - for k in itertools.count(): - path_k = path + (str(k) if k > 0 else "") - path_k = indexed_dataset.get_indexed_dataset_to_local(path_k) - - dataset_impl_k = dataset_impl - if dataset_impl_k is None: - dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k) - dataset = indexed_dataset.make_dataset( - path_k, - impl=dataset_impl_k or default, - fix_lua_indexing=True, - dictionary=dictionary, - ) - if dataset is None: - break - logger.info("loaded {} examples from: {}".format(len(dataset), path_k)) - datasets.append(dataset) - if not combine: - break - if len(datasets) == 0: - return None - elif len(datasets) == 1: - return datasets[0] - else: - return ConcatDataset(datasets) - - -@contextlib.contextmanager -def numpy_seed(seed, *addl_seeds): - """Context manager which seeds the NumPy PRNG with the specified seed and - restores the state afterward""" - if seed is None: - yield - return - if len(addl_seeds) > 0: - seed = int(hash((seed, *addl_seeds)) % 1e6) - state = np.random.get_state() - np.random.seed(seed) - try: - yield - finally: - np.random.set_state(state) - - -def collect_filtered(function, iterable, filtered): - """ - Similar to :func:`filter` but collects filtered elements in ``filtered``. - - Args: - function (callable): function that returns ``False`` for elements that - should be filtered - iterable (iterable): iterable to filter - filtered (list): list to store filtered elements - """ - for el in iterable: - if function(el): - yield el - else: - filtered.append(el) - - -def _filter_by_size_dynamic(indices, size_fn, max_positions, raise_exception=False): - def compare_leq(a, b): - return a <= b if not isinstance(a, tuple) else max(a) <= b - - def check_size(idx): - if isinstance(max_positions, float) or isinstance(max_positions, int): - return size_fn(idx) <= max_positions - elif isinstance(max_positions, dict): - idx_size = size_fn(idx) - assert isinstance(idx_size, dict) - intersect_keys = set(max_positions.keys()) & set(idx_size.keys()) - return all( - all( - a is None or b is None or a <= b - for a, b in zip(idx_size[key], max_positions[key]) - ) - for key in intersect_keys - ) - else: - # Hacky as heck, for the specific case of multilingual training with RoundRobin. - if isinstance(size_fn(idx), dict) and isinstance(max_positions, tuple): - return all( - a is None or b is None or compare_leq(a, b) - for a, b in zip(size_fn(idx).values(), max_positions) - ) - # For MultiCorpusSampledDataset, will generalize it later - if not isinstance(size_fn(idx), Iterable): - return all(size_fn(idx) <= b for b in max_positions) - return all( - a is None or b is None or a <= b - for a, b in zip(size_fn(idx), max_positions) - ) - - ignored = [] - itr = collect_filtered(check_size, indices, ignored) - indices = np.fromiter(itr, dtype=np.int64, count=-1) - return indices, ignored - - -def filter_by_size(indices, dataset, max_positions, raise_exception=False): - """ - [deprecated] Filter indices based on their size. - Use `FairseqDataset::filter_indices_by_size` instead. - - Args: - indices (List[int]): ordered list of dataset indices - dataset (FairseqDataset): fairseq dataset instance - max_positions (tuple): filter elements larger than this size. - Comparisons are done component-wise. - raise_exception (bool, optional): if ``True``, raise an exception if - any elements are filtered (default: False). 
- """ - warnings.warn( - "data_utils.filter_by_size is deprecated. " - "Use `FairseqDataset::filter_indices_by_size` instead.", - stacklevel=2, - ) - if isinstance(max_positions, float) or isinstance(max_positions, int): - if hasattr(dataset, "sizes") and isinstance(dataset.sizes, np.ndarray): - ignored = indices[dataset.sizes[indices] > max_positions].tolist() - indices = indices[dataset.sizes[indices] <= max_positions] - elif ( - hasattr(dataset, "sizes") - and isinstance(dataset.sizes, list) - and len(dataset.sizes) == 1 - ): - ignored = indices[dataset.sizes[0][indices] > max_positions].tolist() - indices = indices[dataset.sizes[0][indices] <= max_positions] - else: - indices, ignored = _filter_by_size_dynamic( - indices, dataset.size, max_positions - ) - else: - indices, ignored = _filter_by_size_dynamic(indices, dataset.size, max_positions) - - if len(ignored) > 0 and raise_exception: - raise Exception( - ( - "Size of sample #{} is invalid (={}) since max_positions={}, " - "skip this example with --skip-invalid-size-inputs-valid-test" - ).format(ignored[0], dataset.size(ignored[0]), max_positions) - ) - if len(ignored) > 0: - logger.warning( - ( - "{} samples have invalid sizes and will be skipped, " - "max_positions={}, first few sample ids={}" - ).format(len(ignored), max_positions, ignored[:10]) - ) - return indices - - -def filter_paired_dataset_indices_by_size(src_sizes, tgt_sizes, indices, max_sizes): - """Filter a list of sample indices. Remove those that are longer - than specified in max_sizes. - - Args: - indices (np.array): original array of sample indices - max_sizes (int or list[int] or tuple[int]): max sample size, - can be defined separately for src and tgt (then list or tuple) - - Returns: - np.array: filtered sample array - list: list of removed indices - """ - if max_sizes is None: - return indices, [] - if type(max_sizes) in (int, float): - max_src_size, max_tgt_size = max_sizes, max_sizes - else: - max_src_size, max_tgt_size = max_sizes - if tgt_sizes is None: - ignored = indices[src_sizes[indices] > max_src_size] - else: - ignored = indices[ - (src_sizes[indices] > max_src_size) | (tgt_sizes[indices] > max_tgt_size) - ] - if len(ignored) > 0: - if tgt_sizes is None: - indices = indices[src_sizes[indices] <= max_src_size] - else: - indices = indices[ - (src_sizes[indices] <= max_src_size) - & (tgt_sizes[indices] <= max_tgt_size) - ] - return indices, ignored.tolist() - - -def batch_by_size( - indices, - num_tokens_fn, - max_tokens=None, - max_sentences=None, - required_batch_size_multiple=1, - fixed_shapes=None, -): - """ - Yield mini-batches of indices bucketed by size. Batches may contain - sequences of different lengths. - - Args: - indices (List[int]): ordered list of dataset indices - num_tokens_fn (callable): function that returns the number of tokens at - a given index - max_tokens (int, optional): max number of tokens in each batch - (default: None). - max_sentences (int, optional): max number of sentences in each - batch (default: None). - required_batch_size_multiple (int, optional): require batch size to - be less than N or a multiple of N (default: 1). - fixed_shapes (List[Tuple[int, int]], optional): if given, batches will - only be created with the given shapes. *max_sentences* and - *required_batch_size_multiple* will be ignored (default: None). 
- """ - try: - from fairseq.data.data_utils_fast import ( - batch_by_size_fast, - batch_fixed_shapes_fast, - ) - except ImportError: - raise ImportError( - "Please build Cython components with: `pip install --editable .` " - "or `python setup.py build_ext --inplace`" - ) - - max_tokens = max_tokens if max_tokens is not None else -1 - max_sentences = max_sentences if max_sentences is not None else -1 - bsz_mult = required_batch_size_multiple - - if not isinstance(indices, np.ndarray): - indices = np.fromiter(indices, dtype=np.int64, count=-1) - - if fixed_shapes is None: - return batch_by_size_fast_fix( - indices, - num_tokens_fn, - max_tokens, - max_sentences, - bsz_mult, - ) - else: - fixed_shapes = np.array(fixed_shapes, dtype=np.int64) - sort_order = np.lexsort( - [ - fixed_shapes[:, 1].argsort(), # length - fixed_shapes[:, 0].argsort(), # bsz - ] - ) - fixed_shapes_sorted = fixed_shapes[sort_order] - return batch_fixed_shapes_fast(indices, num_tokens_fn, fixed_shapes_sorted) - -def batch_by_size_fast_fix(indices, num_tokens_fn,max_tokens,max_sentences,bsz_mult): - indices_view = indices - buckets = [16, 32, 64, 128, 256, 512, 1024] - fix_shape_dict = {} - for buck in buckets: - fix_shape_dict[buck] = [] - batch_by_size_list = [] - for i in range(len(indices_view)): - idx = indices_view[i] - fix_shape_dict[num_tokens_fn(idx)].append(idx) - for idx, key_len in enumerate(buckets): - max_batch = max_tokens // key_len - division_len = max_batch * (len(fix_shape_dict[key_len]) // max_batch) - tail_len = len(fix_shape_dict[key_len]) - division_len - if tail_len > 0 and key_len != 1024: - fix_shape_dict[buckets[idx + 1]] = fix_shape_dict[key_len][division_len:] + fix_shape_dict[buckets[idx + 1]] - if division_len == 0: - pass - else: - batch_by_size_list.extend(np.split(np.array(fix_shape_dict[key_len][:division_len]), division_len // max_batch)) - return batch_by_size_list - -def post_process(sentence: str, symbol: str): - if symbol == "sentencepiece" and "\u2581" in sentence: - sentence = sentence.replace(" ", "").replace("\u2581", " ").strip() - elif symbol == "wordpiece": - sentence = sentence.replace(" ", "").replace("_", " ").strip() - elif symbol == "letter": - sentence = sentence.replace(" ", "").replace("|", " ").strip() - elif symbol == "_EOW": - sentence = sentence.replace(" ", "").replace("_EOW", " ").strip() - elif symbol is not None and symbol != "none": - sentence = (sentence + " ").replace(symbol, "").rstrip() - return sentence - - -def compute_mask_indices( - shape: Tuple[int, int], - padding_mask: Optional[torch.Tensor], - mask_prob: float, - mask_length: int, - mask_type: str = "static", - mask_other: float = 0.0, - min_masks: int = 0, - no_overlap: bool = False, - min_space: int = 0, -) -> np.ndarray: - """ - Computes random mask spans for a given shape - - Args: - shape: the the shape for which to compute masks. - should be of size 2 where first element is batch size and 2nd is timesteps - padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements - mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by - number of timesteps divided by length of mask span to mask approximately this percentage of all elements. 
- however due to overlaps, the actual number will be smaller (unless no_overlap is True) - mask_type: how to compute mask lengths - static = fixed size - uniform = sample from uniform distribution [mask_other, mask_length*2] - normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element - poisson = sample from possion distribution with lambda = mask length - min_masks: minimum number of masked spans - no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping - min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans - """ - - bsz, all_sz = shape - mask = np.full((bsz, all_sz), False) - - all_num_mask = int( - # add a random number for probabilistic rounding - mask_prob * all_sz / float(mask_length) - + np.random.rand() - ) - - all_num_mask = max(min_masks, all_num_mask) - - mask_idcs = [] - for i in range(bsz): - if padding_mask is not None: - sz = all_sz - padding_mask[i].long().sum().item() - num_mask = int( - # add a random number for probabilistic rounding - mask_prob * sz / float(mask_length) - + np.random.rand() - ) - num_mask = max(min_masks, num_mask) - else: - sz = all_sz - num_mask = all_num_mask - - if mask_type == "static": - lengths = np.full(num_mask, mask_length) - elif mask_type == "uniform": - lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask) - elif mask_type == "normal": - lengths = np.random.normal(mask_length, mask_other, size=num_mask) - lengths = [max(1, int(round(x))) for x in lengths] - elif mask_type == "poisson": - lengths = np.random.poisson(mask_length, size=num_mask) - lengths = [int(round(x)) for x in lengths] - else: - raise Exception("unknown mask selection " + mask_type) - - if sum(lengths) == 0: - lengths[0] = min(mask_length, sz - 1) - - if no_overlap: - mask_idc = [] - - def arrange(s, e, length, keep_length): - span_start = np.random.randint(s, e - length) - mask_idc.extend(span_start + i for i in range(length)) - - new_parts = [] - if span_start - s - min_space >= keep_length: - new_parts.append((s, span_start - min_space + 1)) - if e - span_start - keep_length - min_space > keep_length: - new_parts.append((span_start + length + min_space, e)) - return new_parts - - parts = [(0, sz)] - min_length = min(lengths) - for length in sorted(lengths, reverse=True): - lens = np.fromiter( - (e - s if e - s >= length + min_space else 0 for s, e in parts), - np.int, - ) - l_sum = np.sum(lens) - if l_sum == 0: - break - probs = lens / np.sum(lens) - c = np.random.choice(len(parts), p=probs) - s, e = parts.pop(c) - parts.extend(arrange(s, e, length, min_length)) - mask_idc = np.asarray(mask_idc) - else: - min_len = min(lengths) - if sz - min_len <= num_mask: - min_len = sz - num_mask - 1 - - mask_idc = np.random.choice(sz - min_len, num_mask, replace=False) - - mask_idc = np.asarray( - [ - mask_idc[j] + offset - for j in range(len(mask_idc)) - for offset in range(lengths[j]) - ] - ) - - mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) - - min_len = min([len(m) for m in mask_idcs]) - for i, mask_idc in enumerate(mask_idcs): - if len(mask_idc) > min_len: - mask_idc = np.random.choice(mask_idc, min_len, replace=False) - mask[i, mask_idc] = True - - return mask - - -def get_mem_usage(): - try: - import psutil - - mb = 1024 * 1024 - return f"used={psutil.virtual_memory().used / mb}Mb; avail={psutil.virtual_memory().available / mb}Mb" - except ImportError: - return "N/A" - - -def 
lengths_to_padding_mask(lens: torch.LongTensor) -> torch.BoolTensor: - bsz, max_lens = lens.size(0), torch.max(lens).item() - mask = torch.arange(max_lens).to(lens.device).view(1, max_lens) - mask = mask.expand(bsz, -1) >= lens.view(bsz, 1).expand(-1, max_lens) - return mask - - -def lengths_to_mask(lens: torch.LongTensor) -> torch.BoolTensor: - return ~lengths_to_padding_mask(lens) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/data_utils_fast.pyx b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/data_utils_fast.pyx deleted file mode 100644 index 38b4aa67dd8f3c08fe9ec45358bdbb7eccbdc762..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/data_utils_fast.pyx +++ /dev/null @@ -1,123 +0,0 @@ -# cython: language_level=3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np - -cimport cython -cimport numpy as np - -from libc.stdint cimport int32_t, int64_t - -ctypedef int64_t DTYPE_t - - -cdef _is_batch_full(int64_t num_sentences, int64_t num_tokens, int64_t max_tokens, int64_t max_sentences): - if num_sentences == 0: - return 0 - if max_sentences > 0 and num_sentences == max_sentences: - return 1 - if max_tokens > 0 and num_tokens > max_tokens: - return 1 - return 0 - - -@cython.cdivision(True) -cpdef list batch_by_size_fast( - np.ndarray[DTYPE_t, ndim=1] indices, - num_tokens_fn, - int64_t max_tokens, - int64_t max_sentences, - int32_t bsz_mult, -): - cdef int64_t sample_len = 0 - cdef list sample_lens = [] - cdef list batch = [] - cdef list batches = [] - cdef int64_t mod_len - cdef int64_t i - cdef int64_t idx - cdef int64_t num_tokens - cdef DTYPE_t[:] indices_view = indices - - for i in range(len(indices_view)): - idx = indices_view[i] - num_tokens = num_tokens_fn(idx) - sample_lens.append(num_tokens) - sample_len = max(sample_len, num_tokens) - - assert max_tokens <= 0 or sample_len <= max_tokens, ( - "sentence at index {} of size {} exceeds max_tokens " - "limit of {}!".format(idx, sample_len, max_tokens) - ) - num_tokens = (len(batch) + 1) * sample_len - - if _is_batch_full(len(batch), num_tokens, max_tokens, max_sentences): - mod_len = max( - bsz_mult * (len(batch) // bsz_mult), - len(batch) % bsz_mult, - ) - batches.append(batch[:mod_len]) - batch = batch[mod_len:] - sample_lens = sample_lens[mod_len:] - sample_len = max(sample_lens) if len(sample_lens) > 0 else 0 - batch.append(idx) - if len(batch) > 0: - batches.append(batch) - return batches - - -cdef _find_valid_shape( - DTYPE_t[:, :] shapes_view, - int64_t num_sentences, - int64_t num_tokens, -): - """Return index of first valid shape of -1 if none is found.""" - for i in range(shapes_view.shape[0]): - if num_sentences <= shapes_view[i][0] and num_tokens <= shapes_view[i][1]: - return i - return -1 - - -@cython.cdivision(True) -cpdef list batch_fixed_shapes_fast( - np.ndarray[DTYPE_t, ndim=1] indices, - num_tokens_fn, - np.ndarray[DTYPE_t, ndim=2] fixed_shapes_sorted, -): - cdef int64_t sample_len = 0 - cdef list sample_lens = [] - cdef list batch = [] - cdef list batches = [] - cdef int64_t mod_len - cdef int64_t i - cdef int64_t idx - cdef int64_t num_tokens - cdef DTYPE_t[:] indices_view = indices - cdef DTYPE_t[:, :] shapes_view = fixed_shapes_sorted - - for i in range(len(indices_view)): - idx = indices_view[i] - 
num_tokens = num_tokens_fn(idx) - sample_lens.append(num_tokens) - sample_len = max(sample_len, num_tokens) - - shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len) - if shape_idx == -1: - batches.append(batch) - batch = [] - sample_lens = [] - sample_len = 0 - shapes_view = fixed_shapes_sorted - elif shape_idx > 0: - # small optimization for the next call to _find_valid_shape - shapes_view = shapes_view[shape_idx:] - - batch.append(idx) - - if len(batch) > 0: - batches.append(batch) - - return batches diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/denoising_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/denoising_dataset.py deleted file mode 100644 index bdb62c8d5db9c8755c72db4d0d8083c936f18dc8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/denoising_dataset.py +++ /dev/null @@ -1,436 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import numpy as np -import torch - -from . import FairseqDataset, data_utils - - -def collate( - samples, - pad_idx, - eos_idx, - vocab, - left_pad_source=False, - left_pad_target=False, - input_feeding=True, - pad_to_length=None, -): - assert input_feeding - if len(samples) == 0: - return {} - - def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None): - return data_utils.collate_tokens( - [s[key] for s in samples], - pad_idx, - eos_idx=None, # use eos_idx of each sample instead of vocab.eos() - left_pad=left_pad, - move_eos_to_beginning=move_eos_to_beginning, - pad_to_length=pad_to_length, - ) - - id = torch.LongTensor([s["id"] for s in samples]) - src_tokens = merge( - "source", - left_pad=left_pad_source, - pad_to_length=pad_to_length["source"] if pad_to_length is not None else None, - ) - # sort by descending source length - src_lengths = torch.LongTensor([s["source"].numel() for s in samples]) - src_lengths, sort_order = src_lengths.sort(descending=True) - id = id.index_select(0, sort_order) - src_tokens = src_tokens.index_select(0, sort_order) - - prev_output_tokens = None - target = None - if samples[0].get("target", None) is not None: - target = merge( - "target", - left_pad=left_pad_target, - pad_to_length=pad_to_length["target"] - if pad_to_length is not None - else None, - ) - target = target.index_select(0, sort_order) - ntokens = sum(len(s["target"]) for s in samples) - - if input_feeding: - # we create a shifted version of targets for feeding the - # previous output token(s) into the next decoder step - prev_output_tokens = merge( - "target", - left_pad=left_pad_target, - move_eos_to_beginning=True, - pad_to_length=pad_to_length["target"] - if pad_to_length is not None - else None, - ) - prev_output_tokens = prev_output_tokens.index_select(0, sort_order) - else: - ntokens = sum(len(s["source"]) for s in samples) - - batch = { - "id": id, - "ntokens": ntokens, - "net_input": { - "src_tokens": src_tokens, - "src_lengths": src_lengths, - }, - "target": target, - "nsentences": samples[0]["source"].size(0), - "sort_order": sort_order, - } - if prev_output_tokens is not None: - batch["net_input"]["prev_output_tokens"] = prev_output_tokens - - return batch - - -class DenoisingDataset(FairseqDataset): - """ - A wrapper around TokenBlockDataset for BART dataset. 
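The `input_feeding` branch of the collate function above builds `prev_output_tokens` by shifting the target one position to the right with EOS moved to the front. A one-sample illustration (token ids are arbitrary; EOS is assumed to be index 2):

```python
import torch

eos = 2
target = torch.LongTensor([11, 12, 13, eos])

# Equivalent to merge("target", ..., move_eos_to_beginning=True) for one sample.
prev_output_tokens = torch.cat([target[-1:], target[:-1]])
# prev_output_tokens -> tensor([ 2, 11, 12, 13])
```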
- - Args: - dataset (TokenBlockDataset): dataset to wrap - sizes (List[int]): sentence lengths - vocab (~fairseq.data.Dictionary): vocabulary - mask_idx (int): dictionary index used for masked token - mask_whole_words: only mask whole words. This should be a byte mask - over vocab indices, indicating whether it is the beginning of a - word. We will extend any mask to encompass the whole word. - shuffle (bool, optional): shuffle the elements before batching. - Default: ``True`` - seed: Seed for random number generator for reproducibility. - args: argparse arguments. - """ - - def __init__( - self, - dataset, - sizes, - vocab, - mask_idx, - mask_whole_words, - shuffle, - seed, - args, - eos=None, - item_transform_func=None, - ): - self.dataset = dataset - - self.sizes = sizes - - self.vocab = vocab - self.shuffle = shuffle - self.seed = seed - self.mask_idx = mask_idx - self.mask_whole_word = mask_whole_words - self.mask_ratio = args.mask - self.random_ratio = args.mask_random - self.insert_ratio = args.insert - self.rotate_ratio = args.rotate - self.permute_sentence_ratio = args.permute_sentences - self.eos = eos if eos is not None else vocab.eos() - self.item_transform_func = item_transform_func - - if args.bpe != "gpt2": - self.full_stop_index = self.vocab.eos() - else: - assert args.bpe == "gpt2" - self.full_stop_index = self.vocab.index("13") - - self.replace_length = args.replace_length - if self.replace_length not in [-1, 0, 1]: - raise ValueError(f"invalid arg: replace_length={self.replace_length}") - if args.mask_length not in ["subword", "word", "span-poisson"]: - raise ValueError(f"invalid arg: mask-length={args.mask_length}") - if args.mask_length == "subword" and args.replace_length not in [0, 1]: - raise ValueError(f"if using subwords, use replace-length=1 or 0") - - self.mask_span_distribution = None - if args.mask_length == "span-poisson": - _lambda = args.poisson_lambda - - lambda_to_the_k = 1 - e_to_the_minus_lambda = math.exp(-_lambda) - k_factorial = 1 - ps = [] - for k in range(0, 128): - ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial) - lambda_to_the_k *= _lambda - k_factorial *= k + 1 - if ps[-1] < 0.0000001: - break - ps = torch.FloatTensor(ps) - self.mask_span_distribution = torch.distributions.Categorical(ps) - - self.epoch = 0 - - @property - def can_reuse_epoch_itr_across_epochs(self): - return True # only the noise changes, not item sizes - - def set_epoch(self, epoch, **unused): - self.epoch = epoch - - def __getitem__(self, index): - with data_utils.numpy_seed(self.seed, self.epoch, index): - tokens = self.dataset[index] - assert tokens[-1] == self.eos - source, target = tokens, tokens.clone() - - if self.permute_sentence_ratio > 0.0: - source = self.permute_sentences(source, self.permute_sentence_ratio) - - if self.mask_ratio > 0: - source = self.add_whole_word_mask(source, self.mask_ratio) - - if self.insert_ratio > 0: - source = self.add_insertion_noise(source, self.insert_ratio) - - if self.rotate_ratio > 0.0 and np.random.random() < self.rotate_ratio: - source = self.add_rolling_noise(source) - # there can additional changes to make: - if self.item_transform_func is not None: - source, target = self.item_transform_func(source, target) - - assert (source >= 0).all() - assert (source[1:-1] >= 1).all() - assert (source <= len(self.vocab)).all() - assert source[0] == self.vocab.bos() - assert source[-1] == self.eos - return { - "id": index, - "source": source, - "target": target, - } - - def __len__(self): - return len(self.dataset) - - def 
permute_sentences(self, source, p=1.0): - full_stops = source == self.full_stop_index - # Pretend it ends with a full stop so last span is a sentence - full_stops[-2] = 1 - - # Tokens that are full stops, where the previous token is not - sentence_ends = (full_stops[1:] * ~full_stops[:-1]).nonzero(as_tuple=False) + 2 - result = source.clone() - - num_sentences = sentence_ends.size(0) - num_to_permute = math.ceil((num_sentences * 2 * p) / 2.0) - substitutions = torch.randperm(num_sentences)[:num_to_permute] - ordering = torch.arange(0, num_sentences) - ordering[substitutions] = substitutions[torch.randperm(num_to_permute)] - - # Ignore at start - index = 1 - for i in ordering: - sentence = source[(sentence_ends[i - 1] if i > 0 else 1) : sentence_ends[i]] - result[index : index + sentence.size(0)] = sentence - index += sentence.size(0) - return result - - def word_starts(self, source): - if self.mask_whole_word is not None: - is_word_start = self.mask_whole_word.gather(0, source) - else: - is_word_start = torch.ones(source.size()) - is_word_start[0] = 0 - is_word_start[-1] = 0 - return is_word_start - - def add_whole_word_mask(self, source, p): - is_word_start = self.word_starts(source) - num_to_mask = int(math.ceil(is_word_start.float().sum() * p)) - num_inserts = 0 - if num_to_mask == 0: - return source - - if self.mask_span_distribution is not None: - lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,)) - - # Make sure we have enough to mask - cum_length = torch.cumsum(lengths, 0) - while cum_length[-1] < num_to_mask: - lengths = torch.cat( - [ - lengths, - self.mask_span_distribution.sample(sample_shape=(num_to_mask,)), - ], - dim=0, - ) - cum_length = torch.cumsum(lengths, 0) - - # Trim to masking budget - i = 0 - while cum_length[i] < num_to_mask: - i += 1 - lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1]) - num_to_mask = i + 1 - lengths = lengths[:num_to_mask] - - # Handle 0-length mask (inserts) separately - lengths = lengths[lengths > 0] - num_inserts = num_to_mask - lengths.size(0) - num_to_mask -= num_inserts - if num_to_mask == 0: - return self.add_insertion_noise(source, num_inserts / source.size(0)) - - assert (lengths > 0).all() - else: - lengths = torch.ones((num_to_mask,)).long() - assert is_word_start[-1] == 0 - word_starts = is_word_start.nonzero(as_tuple=False) - indices = word_starts[ - torch.randperm(word_starts.size(0))[:num_to_mask] - ].squeeze(1) - mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio - - source_length = source.size(0) - assert source_length - 1 not in indices - to_keep = torch.ones(source_length, dtype=torch.bool) - is_word_start[ - -1 - ] = 255 # acts as a long length, so spans don't go over the end of doc - if self.replace_length == 0: - to_keep[indices] = 0 - else: - # keep index, but replace it with [MASK] - source[indices] = self.mask_idx - source[indices[mask_random]] = torch.randint( - 1, len(self.vocab), size=(mask_random.sum(),) - ) - - if self.mask_span_distribution is not None: - assert len(lengths.size()) == 1 - assert lengths.size() == indices.size() - lengths -= 1 - while indices.size(0) > 0: - assert lengths.size() == indices.size() - lengths -= is_word_start[indices + 1].long() - uncompleted = lengths >= 0 - indices = indices[uncompleted] + 1 - mask_random = mask_random[uncompleted] - lengths = lengths[uncompleted] - if self.replace_length != -1: - # delete token - to_keep[indices] = 0 - else: - # keep index, but replace it with [MASK] - source[indices] = self.mask_idx - 
source[indices[mask_random]] = torch.randint( - 1, len(self.vocab), size=(mask_random.sum(),) - ) - else: - # A bit faster when all lengths are 1 - while indices.size(0) > 0: - uncompleted = is_word_start[indices + 1] == 0 - indices = indices[uncompleted] + 1 - mask_random = mask_random[uncompleted] - if self.replace_length != -1: - # delete token - to_keep[indices] = 0 - else: - # keep index, but replace it with [MASK] - source[indices] = self.mask_idx - source[indices[mask_random]] = torch.randint( - 1, len(self.vocab), size=(mask_random.sum(),) - ) - - assert source_length - 1 not in indices - - source = source[to_keep] - - if num_inserts > 0: - source = self.add_insertion_noise(source, num_inserts / source.size(0)) - - return source - - def add_permuted_noise(self, tokens, p): - num_words = len(tokens) - num_to_permute = math.ceil(((num_words * 2) * p) / 2.0) - substitutions = torch.randperm(num_words - 2)[:num_to_permute] + 1 - tokens[substitutions] = tokens[substitutions[torch.randperm(num_to_permute)]] - return tokens - - def add_rolling_noise(self, tokens): - offset = np.random.randint(1, max(1, tokens.size(-1) - 1) + 1) - tokens = torch.cat( - (tokens[0:1], tokens[offset:-1], tokens[1:offset], tokens[-1:]), - dim=0, - ) - return tokens - - def add_insertion_noise(self, tokens, p): - if p == 0.0: - return tokens - - num_tokens = len(tokens) - n = int(math.ceil(num_tokens * p)) - - noise_indices = torch.randperm(num_tokens + n - 2)[:n] + 1 - noise_mask = torch.zeros(size=(num_tokens + n,), dtype=torch.bool) - noise_mask[noise_indices] = 1 - result = torch.LongTensor(n + len(tokens)).fill_(-1) - - num_random = int(math.ceil(n * self.random_ratio)) - result[noise_indices[num_random:]] = self.mask_idx - result[noise_indices[:num_random]] = torch.randint( - low=1, high=len(self.vocab), size=(num_random,) - ) - - result[~noise_mask] = tokens - - assert (result >= 0).all() - return result - - def collater(self, samples, pad_to_length=None): - """Merge a list of samples to form a mini-batch. - Args: - samples (List[dict]): samples to collate - Returns: - dict: a mini-batch of data - """ - return collate( - samples, self.vocab.pad(), self.eos, self.vocab, pad_to_length=pad_to_length - ) - - def num_tokens(self, index): - """Return the number of tokens in a sample. This value is used to - enforce ``--max-tokens`` during batching.""" - return self.sizes[index] - - def size(self, index): - """Return an example's size as a float or tuple. This value is used when - filtering a dataset with ``--max-positions``.""" - return self.sizes[index] - - def ordered_indices(self): - """Return an ordered list of indices. 
Batches will be constructed based - on this order.""" - if self.shuffle: - indices = np.random.permutation(len(self)) - else: - indices = np.arange(len(self)) - return indices[np.argsort(self.sizes[indices], kind="mergesort")] - - def prefetch(self, indices): - self.src.prefetch(indices) - self.tgt.prefetch(indices) - - @property - def supports_prefetch(self): - return ( - hasattr(self.src, "supports_prefetch") - and self.src.supports_prefetch - and hasattr(self.tgt, "supports_prefetch") - and self.tgt.supports_prefetch - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/dictionary.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/dictionary.py deleted file mode 100644 index e2df08e092350c5d5feb34723fae8744c7286a44..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/dictionary.py +++ /dev/null @@ -1,387 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os -from collections import Counter -from multiprocessing import Pool - -import torch -from fairseq import utils -from fairseq.binarizer import safe_readline -from fairseq.data import data_utils -from fairseq.file_io import PathManager -from fairseq.tokenizer import tokenize_line - - -class Dictionary(object): - """A mapping from symbols to consecutive integers""" - - def __init__( - self, - *, # begin keyword-only arguments - bos="", - pad="", - eos="", - unk="", - extra_special_symbols=None, - ): - self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos - self.symbols = [] - self.count = [] - self.indices = {} - self.bos_index = self.add_symbol(bos) - self.pad_index = self.add_symbol(pad) - self.eos_index = self.add_symbol(eos) - self.unk_index = self.add_symbol(unk) - if extra_special_symbols: - for s in extra_special_symbols: - self.add_symbol(s) - self.nspecial = len(self.symbols) - - def __eq__(self, other): - return self.indices == other.indices - - def __getitem__(self, idx): - if idx < len(self.symbols): - return self.symbols[idx] - return self.unk_word - - def __len__(self): - """Returns the number of symbols in the dictionary""" - return len(self.symbols) - - def __contains__(self, sym): - return sym in self.indices - - def index(self, sym): - """Returns the index of the specified symbol""" - assert isinstance(sym, str) - if sym in self.indices: - return self.indices[sym] - return self.unk_index - - def string( - self, - tensor, - bpe_symbol=None, - escape_unk=False, - extra_symbols_to_ignore=None, - unk_string=None, - ): - """Helper for converting a tensor of token indices to a string. - - Can optionally remove BPE symbols or escape words. 
- """ - if torch.is_tensor(tensor) and tensor.dim() == 2: - return "\n".join( - self.string(t, bpe_symbol, escape_unk, extra_symbols_to_ignore) - for t in tensor - ) - - extra_symbols_to_ignore = set(extra_symbols_to_ignore or []) - extra_symbols_to_ignore.add(self.eos()) - - def token_string(i): - if i == self.unk(): - if unk_string is not None: - return unk_string - else: - return self.unk_string(escape_unk) - else: - return self[i] - - if hasattr(self, "bos_index"): - extra_symbols_to_ignore.add(self.bos()) - - sent = " ".join( - token_string(i) - for i in tensor - if utils.item(i) not in extra_symbols_to_ignore - ) - - return data_utils.post_process(sent, bpe_symbol) - - def unk_string(self, escape=False): - """Return unknown string, optionally escaped as: <>""" - if escape: - return "<{}>".format(self.unk_word) - else: - return self.unk_word - - def add_symbol(self, word, n=1, overwrite=False): - """Adds a word to the dictionary""" - if word in self.indices and not overwrite: - idx = self.indices[word] - self.count[idx] = self.count[idx] + n - return idx - else: - idx = len(self.symbols) - self.indices[word] = idx - self.symbols.append(word) - self.count.append(n) - return idx - - def update(self, new_dict): - """Updates counts from new dictionary.""" - for word in new_dict.symbols: - idx2 = new_dict.indices[word] - if word in self.indices: - idx = self.indices[word] - self.count[idx] = self.count[idx] + new_dict.count[idx2] - else: - idx = len(self.symbols) - self.indices[word] = idx - self.symbols.append(word) - self.count.append(new_dict.count[idx2]) - - def finalize(self, threshold=-1, nwords=-1, padding_factor=8): - """Sort symbols by frequency in descending order, ignoring special ones. - - Args: - - threshold defines the minimum word count - - nwords defines the total number of words in the final dictionary, - including special symbols - - padding_factor can be used to pad the dictionary size to be a - multiple of 8, which is important on some hardware (e.g., Nvidia - Tensor Cores). - """ - if nwords <= 0: - nwords = len(self) - - new_indices = dict(zip(self.symbols[: self.nspecial], range(self.nspecial))) - new_symbols = self.symbols[: self.nspecial] - new_count = self.count[: self.nspecial] - - c = Counter( - dict( - sorted(zip(self.symbols[self.nspecial :], self.count[self.nspecial :])) - ) - ) - for symbol, count in c.most_common(nwords - self.nspecial): - if count >= threshold: - new_indices[symbol] = len(new_symbols) - new_symbols.append(symbol) - new_count.append(count) - else: - break - - assert len(new_symbols) == len(new_indices) - - self.count = list(new_count) - self.symbols = list(new_symbols) - self.indices = new_indices - - self.pad_to_multiple_(padding_factor) - - def pad_to_multiple_(self, padding_factor): - """Pad Dictionary size to be a multiple of *padding_factor*.""" - if padding_factor > 1: - i = 0 - while len(self) % padding_factor != 0: - symbol = "madeupword{:04d}".format(i) - self.add_symbol(symbol, n=0) - i += 1 - - def bos(self): - """Helper to get index of beginning-of-sentence symbol""" - return self.bos_index - - def pad(self): - """Helper to get index of pad symbol""" - return self.pad_index - - def eos(self): - """Helper to get index of end-of-sentence symbol""" - return self.eos_index - - def unk(self): - """Helper to get index of unk symbol""" - return self.unk_index - - @classmethod - def load(cls, f): - """Loads the dictionary from a text file with the format: - - ``` - - - ... 
- ``` - """ - d = cls() - d.add_from_file(f) - return d - - def add_from_file(self, f): - """ - Loads a pre-existing dictionary from a text file and adds its symbols - to this instance. - """ - if isinstance(f, str): - try: - with open(PathManager.get_local_path(f), "r", encoding="utf-8") as fd: - self.add_from_file(fd) - except FileNotFoundError as fnfe: - raise fnfe - except UnicodeError: - raise Exception( - "Incorrect encoding detected in {}, please " - "rebuild the dataset".format(f) - ) - return - - lines = f.readlines() - indices_start_line = self._load_meta(lines) - - for line in lines[indices_start_line:]: - try: - line, field = line.rstrip().rsplit(" ", 1) - if field == "#fairseq:overwrite": - overwrite = True - line, field = line.rsplit(" ", 1) - else: - overwrite = False - count = int(field) - word = line - if word in self and not overwrite: - raise RuntimeError( - "Duplicate word found when loading Dictionary: '{}'. " - "Duplicate words can overwrite earlier ones by adding the " - "#fairseq:overwrite flag at the end of the corresponding row " - "in the dictionary file. If using the Camembert model, please " - "download an updated copy of the model file.".format(word) - ) - self.add_symbol(word, n=count, overwrite=overwrite) - except ValueError: - raise ValueError( - "Incorrect dictionary format, expected ' [flags]'" - ) - - def _save(self, f, kv_iterator): - if isinstance(f, str): - PathManager.mkdirs(os.path.dirname(f)) - with PathManager.open(f, "w", encoding="utf-8") as fd: - return self.save(fd) - for k, v in kv_iterator: - print("{} {}".format(k, v), file=f) - - def _get_meta(self): - return [], [] - - def _load_meta(self, lines): - return 0 - - def save(self, f): - """Stores dictionary into a text file""" - ex_keys, ex_vals = self._get_meta() - self._save( - f, - zip( - ex_keys + self.symbols[self.nspecial :], - ex_vals + self.count[self.nspecial :], - ), - ) - - def dummy_sentence(self, length): - t = torch.Tensor(length).uniform_(self.nspecial + 1, len(self)).long() - t[-1] = self.eos() - return t - - def encode_line( - self, - line, - line_tokenizer=tokenize_line, - add_if_not_exist=True, - consumer=None, - append_eos=True, - reverse_order=False, - ): - words = line_tokenizer(line) - if reverse_order: - words = list(reversed(words)) - nwords = len(words) - ids = torch.IntTensor(nwords + 1 if append_eos else nwords) - - for i, word in enumerate(words): - if add_if_not_exist: - idx = self.add_symbol(word) - else: - idx = self.index(word) - if consumer is not None: - consumer(word, idx) - ids[i] = idx - if append_eos: - ids[nwords] = self.eos_index - return ids - - @staticmethod - def _add_file_to_dictionary_single_worker( - filename, tokenize, eos_word, worker_id=0, num_workers=1 - ): - counter = Counter() - with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f: - size = os.fstat(f.fileno()).st_size - chunk_size = size // num_workers - offset = worker_id * chunk_size - end = offset + chunk_size - f.seek(offset) - if offset > 0: - safe_readline(f) # drop first incomplete line - line = f.readline() - while line: - for word in tokenize(line): - counter.update([word]) - counter.update([eos_word]) - if f.tell() > end: - break - line = f.readline() - return counter - - @staticmethod - def add_file_to_dictionary(filename, dict, tokenize, num_workers): - def merge_result(counter): - for w, c in sorted(counter.items()): - dict.add_symbol(w, c) - - if num_workers > 1: - pool = Pool(processes=num_workers) - results = [] - for worker_id in range(num_workers): 
- results.append( - pool.apply_async( - Dictionary._add_file_to_dictionary_single_worker, - (filename, tokenize, dict.eos_word, worker_id, num_workers), - ) - ) - pool.close() - pool.join() - for r in results: - merge_result(r.get()) - else: - merge_result( - Dictionary._add_file_to_dictionary_single_worker( - filename, tokenize, dict.eos_word - ) - ) - - -class TruncatedDictionary(object): - def __init__(self, wrapped_dict, length): - self.__class__ = type( - wrapped_dict.__class__.__name__, - (self.__class__, wrapped_dict.__class__), - {}, - ) - self.__dict__ = wrapped_dict.__dict__ - self.wrapped_dict = wrapped_dict - self.length = min(len(self.wrapped_dict), length) - - def __len__(self): - return self.length - - def __getitem__(self, i): - if i < self.length: - return self.wrapped_dict[i] - return self.wrapped_dict.unk() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/__init__.py deleted file mode 100644 index 2e807d8ae7d8af26c51641209856b0ca3255c0a5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -import importlib -import os - -from fairseq import registry - - -build_tokenizer, register_tokenizer, TOKENIZER_REGISTRY, _ = registry.setup_registry( - "--tokenizer", - default=None, -) - - -build_bpe, register_bpe, BPE_REGISTRY, _ = registry.setup_registry( - "--bpe", - default=None, -) - - -# automatically import any Python files in the encoders/ directory -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - module = file[: file.find(".py")] - importlib.import_module("fairseq.data.encoders." + module) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/byte_bpe.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/byte_bpe.py deleted file mode 100644 index 0d2da3ea1a2f00674a59a27f82efba275419c435..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/byte_bpe.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- - -from fairseq import file_utils -from fairseq.data.encoders import register_bpe -from fairseq.data.encoders.byte_utils import ( - SPACE, - SPACE_ESCAPE, - byte_encode, - smart_byte_decode, -) - - -@register_bpe("byte_bpe") -class ByteBPE(object): - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--sentencepiece-model-path', type=str, - help='path to sentencepiece model') - # fmt: on - - def __init__(self, args): - vocab = file_utils.cached_path(args.sentencepiece_model_path) - try: - import sentencepiece as spm - - self.sp = spm.SentencePieceProcessor() - self.sp.Load(vocab) - except ImportError: - raise ImportError( - "Please install sentencepiece with: pip install sentencepiece" - ) - - def encode(self, x: str) -> str: - byte_encoded = byte_encode(x) - return SPACE.join(self.sp.EncodeAsPieces(byte_encoded)) - - @staticmethod - def decode(x: str) -> str: - unescaped = x.replace(SPACE, "").replace(SPACE_ESCAPE, SPACE) - return smart_byte_decode(unescaped) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/byte_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/byte_utils.py deleted file mode 100644 index a305c080926c2d094b7e8ae48f5331da82025a75..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/byte_utils.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import re - - -WHITESPACE_NORMALIZER = re.compile(r"\s+") -SPACE = chr(32) -SPACE_ESCAPE = chr(9601) -# excluding non-breaking space (160) here -PRINTABLE_LATIN = set( - list(range(32, 126 + 1)) + list(range(161, 172 + 1)) + list(range(174, 255 + 1)) -) -BYTE_TO_BCHAR = { - b: chr(b) if b in PRINTABLE_LATIN else chr(256 + b) for b in range(256) -} -BCHAR_TO_BYTE = {bc: b for b, bc in BYTE_TO_BCHAR.items()} - - -def byte_encode(x: str) -> str: - normalized = WHITESPACE_NORMALIZER.sub(SPACE, x) - return "".join([BYTE_TO_BCHAR[b] for b in normalized.encode("utf-8")]) - - -def byte_decode(x: str) -> str: - try: - return bytes([BCHAR_TO_BYTE[bc] for bc in x]).decode("utf-8") - except ValueError: - return "" - - -def smart_byte_decode(x: str) -> str: - output = byte_decode(x) - if output == "": - # DP the best recovery (max valid chars) if it's broken - n_bytes = len(x) - f = [0 for _ in range(n_bytes + 1)] - pt = [0 for _ in range(n_bytes + 1)] - for i in range(1, n_bytes + 1): - f[i], pt[i] = f[i - 1], i - 1 - for j in range(1, min(4, i) + 1): - if f[i - j] + 1 > f[i] and len(byte_decode(x[i - j : i])) > 0: - f[i], pt[i] = f[i - j] + 1, i - j - cur_pt = n_bytes - while cur_pt > 0: - if f[cur_pt] == f[pt[cur_pt]] + 1: - output = byte_decode(x[pt[cur_pt] : cur_pt]) + output - cur_pt = pt[cur_pt] - return output diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/bytes.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/bytes.py deleted file mode 100644 index bb9554ed53bff409cd74c5031e8c151513c28d9c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/bytes.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -from fairseq.data.encoders import register_bpe -from fairseq.data.encoders.byte_utils import ( - SPACE, - SPACE_ESCAPE, - byte_encode, - smart_byte_decode, -) - - -@register_bpe("bytes") -class Bytes(object): - def __init__(self, args): - pass - - @staticmethod - def add_args(parser): - pass - - @staticmethod - def encode(x: str) -> str: - encoded = byte_encode(x) - escaped = encoded.replace(SPACE, SPACE_ESCAPE) - return SPACE.join(list(escaped)) - - @staticmethod - def decode(x: str) -> str: - unescaped = x.replace(SPACE, "").replace(SPACE_ESCAPE, SPACE) - return smart_byte_decode(unescaped) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/characters.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/characters.py deleted file mode 100644 index cffc57511c72838d46c178a64c1197110bd462f4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/characters.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -from fairseq.data.encoders import register_bpe - - -SPACE = chr(32) -SPACE_ESCAPE = chr(9601) - - -@register_bpe("characters") -class Characters(object): - def __init__(self, args): - pass - - @staticmethod - def add_args(parser): - pass - - @staticmethod - def encode(x: str) -> str: - escaped = x.replace(SPACE, SPACE_ESCAPE) - return SPACE.join(list(escaped)) - - @staticmethod - def decode(x: str) -> str: - return x.replace(SPACE, "").replace(SPACE_ESCAPE, SPACE) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/fastbpe.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/fastbpe.py deleted file mode 100644 index 74d4ad850409d69a5b2476480a9a3aa229038686..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/fastbpe.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from fairseq import file_utils -from fairseq.data.encoders import register_bpe - - -@register_bpe("fastbpe") -class fastBPE(object): - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--bpe-codes', type=str, - help='path to fastBPE BPE') - # fmt: on - - def __init__(self, args): - if args.bpe_codes is None: - raise ValueError("--bpe-codes is required for --bpe=fastbpe") - codes = file_utils.cached_path(args.bpe_codes) - try: - import fastBPE - - self.bpe = fastBPE.fastBPE(codes) - self.bpe_symbol = "@@ " - except ImportError: - raise ImportError("Please install fastBPE with: pip install fastBPE") - - def encode(self, x: str) -> str: - return self.bpe.apply([x])[0] - - def decode(self, x: str) -> str: - return (x + " ").replace(self.bpe_symbol, "").rstrip() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/gpt2_bpe.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/gpt2_bpe.py deleted file mode 100644 index 8ac099a688107135bf2ad5b24d5cd7b3477a16e1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/gpt2_bpe.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq import file_utils -from fairseq.data.encoders import register_bpe - -from .gpt2_bpe_utils import get_encoder - - -DEFAULT_ENCODER_JSON = "https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json" -DEFAULT_VOCAB_BPE = "https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe" - - -@register_bpe("gpt2") -class GPT2BPE(object): - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--gpt2-encoder-json', type=str, - default=DEFAULT_ENCODER_JSON, - help='path to encoder.json') - parser.add_argument('--gpt2-vocab-bpe', type=str, - default=DEFAULT_VOCAB_BPE, - help='path to vocab.bpe') - # fmt: on - - def __init__(self, args): - encoder_json = file_utils.cached_path( - getattr(args, "gpt2_encoder_json", DEFAULT_ENCODER_JSON) - ) - vocab_bpe = file_utils.cached_path( - getattr(args, "gpt2_vocab_bpe", DEFAULT_VOCAB_BPE) - ) - self.bpe = get_encoder(encoder_json, vocab_bpe) - - def encode(self, x: str) -> str: - return " ".join(map(str, self.bpe.encode(x))) - - def decode(self, x: str) -> str: - return self.bpe.decode( - [int(tok) if tok not in {"", ""} else tok for tok in x.split()] - ) - - def is_beginning_of_word(self, x: str) -> bool: - return self.decode(x).startswith(" ") diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/gpt2_bpe_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/gpt2_bpe_utils.py deleted file mode 100644 index 688d4e36e358df2dcc432d37d3e57bd81e2f1ed1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/gpt2_bpe_utils.py +++ /dev/null @@ -1,140 +0,0 @@ -""" -Byte pair encoding utilities from GPT-2. - -Original source: https://github.com/openai/gpt-2/blob/master/src/encoder.py -Original license: MIT -""" - -import json -from functools import lru_cache - - -@lru_cache() -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. 
- The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a signficant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2 ** 8): - if b not in bs: - bs.append(b) - cs.append(2 ** 8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -def get_pairs(word): - """Return set of symbol pairs in a word. - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - - -class Encoder: - def __init__(self, encoder, bpe_merges, errors="replace"): - self.encoder = encoder - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {} - - try: - import regex as re - - self.re = re - except ImportError: - raise ImportError("Please install regex with: pip install regex") - - # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions - self.pat = self.re.compile( - r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" - ) - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - new_word.extend(word[i:j]) - i = j - except: - new_word.extend(word[i:]) - break - - if word[i] == first and i < len(word) - 1 and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - def encode(self, text): - bpe_tokens = [] - for token in self.re.findall(self.pat, text): - token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) - bpe_tokens.extend( - self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ") - ) - return bpe_tokens - - def decode(self, tokens): - text = "".join([self.decoder.get(token, token) for token in tokens]) - text = bytearray([self.byte_decoder[c] for c in text]).decode( - "utf-8", errors=self.errors - ) - return text - - -def get_encoder(encoder_json_path, vocab_bpe_path): - with open(encoder_json_path, "r") as f: - encoder = json.load(f) - with open(vocab_bpe_path, "r", encoding="utf-8") as f: - bpe_data = f.read() - bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]] - return Encoder( - encoder=encoder, - bpe_merges=bpe_merges, - ) diff --git 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/hf_bert_bpe.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/hf_bert_bpe.py deleted file mode 100644 index a968fe8857194c47c04f9bc0cdda38421c90526f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/hf_bert_bpe.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.data.encoders import register_bpe - - -@register_bpe("bert") -class BertBPE(object): - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--bpe-cased', action='store_true', - help='set for cased BPE', - default=False) - parser.add_argument('--bpe-vocab-file', type=str, - help='bpe vocab file.') - # fmt: on - - def __init__(self, args): - try: - from transformers import BertTokenizer - except ImportError: - raise ImportError( - "Please install transformers with: pip install transformers" - ) - - if "bpe_vocab_file" in args: - self.bert_tokenizer = BertTokenizer( - args.bpe_vocab_file, do_lower_case=not args.bpe_cased - ) - else: - vocab_file_name = ( - "bert-base-cased" if args.bpe_cased else "bert-base-uncased" - ) - self.bert_tokenizer = BertTokenizer.from_pretrained(vocab_file_name) - - def encode(self, x: str) -> str: - return " ".join(self.bert_tokenizer.tokenize(x)) - - def decode(self, x: str) -> str: - return self.bert_tokenizer.clean_up_tokenization( - self.bert_tokenizer.convert_tokens_to_string(x.split(" ")) - ) - - def is_beginning_of_word(self, x: str) -> bool: - return not x.startswith("##") diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/hf_byte_bpe.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/hf_byte_bpe.py deleted file mode 100644 index 544d408273f95797e202d7ac5edaa935b4c71368..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/hf_byte_bpe.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from fairseq.data.encoders import register_bpe - - -@register_bpe("hf_byte_bpe") -class HuggingFaceByteLevelBPE(object): - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--bpe-merges', help='path to merges.txt') - parser.add_argument('--bpe-vocab', help='path to vocab.json') - parser.add_argument('--bpe-add-prefix-space', action='store_true', - help='add prefix space before encoding') - # fmt: on - - def __init__(self, args): - try: - from tokenizers import ByteLevelBPETokenizer - except ImportError: - raise ImportError( - "Please install huggingface/tokenizers with: " "pip install tokenizers" - ) - - self.bpe = ByteLevelBPETokenizer( - args.bpe_vocab, - args.bpe_merges, - add_prefix_space=getattr(args, "bpe_add_prefix_space", False), - ) - - def encode(self, x: str) -> str: - return " ".join(map(str, self.bpe.encode(x).ids)) - - def decode(self, x: str) -> str: - return self.bpe.decode( - [int(tok) if tok not in {"", ""} else tok for tok in x.split()] - ) - - def is_beginning_of_word(self, x: str) -> bool: - return self.decode(x).startswith(" ") diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/moses_tokenizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/moses_tokenizer.py deleted file mode 100644 index 8c24844263a98c58a160b624b7741947ee290884..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/moses_tokenizer.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.data.encoders import register_tokenizer - - -@register_tokenizer("moses") -class MosesTokenizer(object): - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--moses-source-lang', metavar='SRC', - help='source language') - parser.add_argument('--moses-target-lang', metavar='TARGET', - help='target language') - parser.add_argument('--moses-no-dash-splits', action='store_true', default=False, - help='don\'t apply dash split rules') - parser.add_argument('--moses-no-escape', action='store_true', default=False, - help='don\'t perform HTML escaping on apostrophy, quotes, etc.') - # fmt: on - - def __init__(self, args): - self.args = args - - if getattr(args, "moses_source_lang", None) is None: - args.moses_source_lang = getattr(args, "source_lang", "en") - if getattr(args, "moses_target_lang", None) is None: - args.moses_target_lang = getattr(args, "target_lang", "en") - - try: - from sacremoses import MosesTokenizer, MosesDetokenizer - - self.tok = MosesTokenizer(args.moses_source_lang) - self.detok = MosesDetokenizer(args.moses_target_lang) - except ImportError: - raise ImportError( - "Please install Moses tokenizer with: pip install sacremoses" - ) - - def encode(self, x: str) -> str: - return self.tok.tokenize( - x, - aggressive_dash_splits=(not self.args.moses_no_dash_splits), - return_str=True, - escape=(not self.args.moses_no_escape), - ) - - def decode(self, x: str) -> str: - return self.detok.detokenize(x.split()) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/nltk_tokenizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/nltk_tokenizer.py deleted file mode 100644 index 
3b617e7314f0e3aae298b685eea54cbb16312203..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/nltk_tokenizer.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.data.encoders import register_tokenizer - - -@register_tokenizer("nltk") -class NLTKTokenizer(object): - def __init__(self, source_lang=None, target_lang=None): - try: - from nltk.tokenize import word_tokenize - - self.word_tokenize = word_tokenize - except ImportError: - raise ImportError("Please install nltk with: pip install nltk") - - def encode(self, x: str) -> str: - return " ".join(self.word_tokenize(x)) - - def decode(self, x: str) -> str: - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/sentencepiece_bpe.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/sentencepiece_bpe.py deleted file mode 100644 index 67c6523837c43f190a0502a7d548a03ae3485403..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/sentencepiece_bpe.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq import file_utils -from fairseq.data.encoders import register_bpe - - -@register_bpe("sentencepiece") -class SentencepieceBPE(object): - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--sentencepiece-model', type=str, - help='path to sentencepiece model') - # fmt: on - - def __init__(self, args): - sentencepiece_model = file_utils.cached_path(args.sentencepiece_model) - try: - import sentencepiece as spm - - self.sp = spm.SentencePieceProcessor() - self.sp.Load(sentencepiece_model) - except ImportError: - raise ImportError( - "Please install sentencepiece with: pip install sentencepiece" - ) - - def encode(self, x: str) -> str: - return " ".join(self.sp.EncodeAsPieces(x)) - - def decode(self, x: str) -> str: - if "\u2581" in x: - x = x.replace(" ", "").replace("\u2581", " ").strip() - return x - - def is_beginning_of_word(self, x: str) -> bool: - if x in ["", "", "", ""]: - # special elements are always considered beginnings - # HACK: this logic is already present in fairseq/tasks/masked_lm.py - # but these special tokens are also contained in the sentencepiece - # vocabulary which causes duplicate special tokens. This hack makes - # sure that they are all taken into account. - return True - return x.startswith("\u2581") diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/space_tokenizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/space_tokenizer.py deleted file mode 100644 index 3bc7ce495866f1a04e52cf2c25d0e5fedb2925d3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/space_tokenizer.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import re - -from fairseq.data.encoders import register_tokenizer - - -@register_tokenizer("space") -class SpaceTokenizer(object): - def __init__(self, source_lang=None, target_lang=None): - self.space_tok = re.compile(r"\s+") - - def encode(self, x: str) -> str: - return self.space_tok.sub(" ", x) - - def decode(self, x: str) -> str: - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/subword_nmt_bpe.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/subword_nmt_bpe.py deleted file mode 100644 index e85f99af396a77d273ba2a4ecaf3db399017b9af..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/subword_nmt_bpe.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq import file_utils -from fairseq.data.encoders import register_bpe - - -@register_bpe("subword_nmt") -class SubwordNMTBPE(object): - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--bpe-codes', type=str, - help='path to subword NMT BPE') - parser.add_argument('--bpe-separator', default='@@', - help='BPE separator') - # fmt: on - - def __init__(self, args): - if args.bpe_codes is None: - raise ValueError("--bpe-codes is required for --bpe=subword_nmt") - codes = file_utils.cached_path(args.bpe_codes) - try: - from subword_nmt import apply_bpe - - bpe_parser = apply_bpe.create_parser() - bpe_args = bpe_parser.parse_args( - [ - "--codes", - codes, - "--separator", - args.bpe_separator, - ] - ) - self.bpe = apply_bpe.BPE( - bpe_args.codes, - bpe_args.merges, - bpe_args.separator, - None, - bpe_args.glossaries, - ) - self.bpe_symbol = bpe_args.separator + " " - except ImportError: - raise ImportError( - "Please install subword_nmt with: pip install subword-nmt" - ) - - def encode(self, x: str) -> str: - return self.bpe.process_line(x) - - def decode(self, x: str) -> str: - return (x + " ").replace(self.bpe_symbol, "").rstrip() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/utils.py deleted file mode 100644 index d93eb532ef84f0e2bc708b777229ab2cb76ca14b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/encoders/utils.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import torch -from fairseq.data import encoders - - -def get_whole_word_mask(args, dictionary): - bpe = encoders.build_bpe(args) - if bpe is not None: - - def is_beginning_of_word(i): - if i < dictionary.nspecial: - # special elements are always considered beginnings - return True - tok = dictionary[i] - if tok.startswith("madeupword"): - return True - try: - return bpe.is_beginning_of_word(tok) - except ValueError: - return True - - mask_whole_words = torch.ByteTensor( - list(map(is_beginning_of_word, range(len(dictionary)))) - ) - return mask_whole_words - return None diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/fairseq_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/fairseq_dataset.py deleted file mode 100644 index ed08c1ba200f3d4b95053c02aaa227169fe80d26..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/fairseq_dataset.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch.utils.data -from fairseq.data import data_utils - - -class EpochListening: - """Mixin for receiving updates whenever the epoch increments.""" - - @property - def can_reuse_epoch_itr_across_epochs(self): - """ - Whether we can reuse the :class:`fairseq.data.EpochBatchIterator` for - this dataset across epochs. - - This needs to return ``False`` if the sample sizes can change across - epochs, in which case we may need to regenerate batches at each epoch. - If your dataset relies in ``set_epoch`` then you should consider setting - this to ``False``. - """ - return True - - def set_epoch(self, epoch): - """Will receive the updated epoch number at the beginning of the epoch.""" - pass - - -class FairseqDataset(torch.utils.data.Dataset, EpochListening): - """A dataset that provides helpers for batching.""" - - def __getitem__(self, index): - raise NotImplementedError - - def __len__(self): - raise NotImplementedError - - def collater(self, samples): - """Merge a list of samples to form a mini-batch. - - Args: - samples (List[dict]): samples to collate - - Returns: - dict: a mini-batch suitable for forwarding with a Model - """ - raise NotImplementedError - - def num_tokens(self, index): - """Return the number of tokens in a sample. This value is used to - enforce ``--max-tokens`` during batching.""" - raise NotImplementedError - - def size(self, index): - """Return an example's size as a float or tuple. This value is used when - filtering a dataset with ``--max-positions``.""" - raise NotImplementedError - - def ordered_indices(self): - """Return an ordered list of indices. Batches will be constructed based - on this order.""" - return np.arange(len(self), dtype=np.int64) - - @property - def supports_prefetch(self): - """Whether this dataset supports prefetching.""" - return False - - def attr(self, attr: str, index: int): - return getattr(self, attr, None) - - def prefetch(self, indices): - """Prefetch the data required for this epoch.""" - raise NotImplementedError - - def get_batch_shapes(self): - """ - Return a list of valid batch shapes, for example:: - - [(8, 512), (16, 256), (32, 128)] - - The first dimension of each tuple is the batch size and can be ``None`` - to automatically infer the max batch size based on ``--max-tokens``. 
- The second dimension of each tuple is the max supported length as given - by :func:`fairseq.data.FairseqDataset.num_tokens`. - - This will be used by :func:`fairseq.data.FairseqDataset.batch_by_size` - to restrict batch shapes. This is useful on TPUs to avoid too many - dynamic shapes (and recompilations). - """ - return None - - def batch_by_size( - self, - indices, - max_tokens=None, - max_sentences=None, - required_batch_size_multiple=1, - ): - """ - Given an ordered set of indices, return batches according to - *max_tokens*, *max_sentences* and *required_batch_size_multiple*. - """ - from fairseq.data import data_utils - - fixed_shapes = self.get_batch_shapes() - if fixed_shapes is not None: - - def adjust_bsz(bsz, num_tokens): - if bsz is None: - assert max_tokens is not None, "Must specify --max-tokens" - bsz = max_tokens // num_tokens - if max_sentences is not None: - bsz = min(bsz, max_sentences) - elif ( - bsz >= required_batch_size_multiple - and bsz % required_batch_size_multiple != 0 - ): - bsz -= bsz % required_batch_size_multiple - return bsz - - fixed_shapes = np.array( - [ - [adjust_bsz(bsz, num_tokens), num_tokens] - for (bsz, num_tokens) in fixed_shapes - ] - ) - - return data_utils.batch_by_size( - indices, - num_tokens_fn=self.num_tokens, - max_tokens=max_tokens, - max_sentences=max_sentences, - required_batch_size_multiple=required_batch_size_multiple, - fixed_shapes=fixed_shapes, - ) - - def filter_indices_by_size(self, indices, max_sizes): - """ - Filter a list of sample indices. Remove those that are longer than - specified in *max_sizes*. - - WARNING: don't update, override method in child classes - - Args: - indices (np.array): original array of sample indices - max_sizes (int or list[int] or tuple[int]): max sample size, - can be defined separately for src and tgt (then list or tuple) - - Returns: - np.array: filtered sample array - list: list of removed indices - """ - if isinstance(max_sizes, float) or isinstance(max_sizes, int): - if hasattr(self, "sizes") and isinstance(self.sizes, np.ndarray): - ignored = indices[self.sizes[indices] > max_sizes].tolist() - indices = indices[self.sizes[indices] <= max_sizes] - elif ( - hasattr(self, "sizes") - and isinstance(self.sizes, list) - and len(self.sizes) == 1 - ): - ignored = indices[self.sizes[0][indices] > max_sizes].tolist() - indices = indices[self.sizes[0][indices] <= max_sizes] - else: - indices, ignored = data_utils._filter_by_size_dynamic( - indices, self.size, max_sizes - ) - else: - indices, ignored = data_utils._filter_by_size_dynamic( - indices, self.size, max_sizes - ) - return indices, ignored - - @property - def supports_fetch_outside_dataloader(self): - """Whether this dataset supports fetching outside the workers of the dataloader.""" - return True - - -class FairseqIterableDataset(torch.utils.data.IterableDataset, EpochListening): - """ - For datasets that need to be read sequentially, usually because the data is - being streamed or otherwise can't be manipulated on a single machine. 
- """ - - def __iter__(self): - raise NotImplementedError diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/fasta_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/fasta_dataset.py deleted file mode 100644 index 007011974a997fd7446dd29d7eba097d7513bab0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/fasta_dataset.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os -import subprocess -import threading -from pathlib import Path - -import numpy as np -import torch - - -def fasta_file_path(prefix_path): - return prefix_path + ".fasta" - - -class FastaDataset(torch.utils.data.Dataset): - """ - For loading protein sequence datasets in the common FASTA data format - """ - - def __init__(self, path: str, cache_indices=False): - self.fn = fasta_file_path(path) - self.threadlocal = threading.local() - self.cache = Path(f"{path}.fasta.idx.npy") - if cache_indices: - if self.cache.exists(): - self.offsets, self.sizes = np.load(self.cache) - else: - self.offsets, self.sizes = self._build_index(path) - np.save(self.cache, np.stack([self.offsets, self.sizes])) - else: - self.offsets, self.sizes = self._build_index(path) - - def _get_file(self): - if not hasattr(self.threadlocal, "f"): - self.threadlocal.f = open(self.fn, "r") - return self.threadlocal.f - - def __getitem__(self, idx): - f = self._get_file() - f.seek(self.offsets[idx]) - desc = f.readline().strip() - line = f.readline() - seq = "" - while line != "" and line[0] != ">": - seq += line.strip() - line = f.readline() - return desc, seq - - def __len__(self): - return self.offsets.size - - def _build_index(self, path: str): - # Use grep and awk to get 100M/s on local SSD. - # Should process your enormous 100G fasta in ~10 min single core... - path = fasta_file_path(path) - bytes_offsets = subprocess.check_output( - f"cat {path} | tqdm --bytes --total $(wc -c < {path})" - "| grep --byte-offset '^>' -o | cut -d: -f1", - shell=True, - ) - fasta_lengths = subprocess.check_output( - f"cat {path} | tqdm --bytes --total $(wc -c < {path})" - "| awk '/^>/ {print \"\";next;} { printf(\"%s\",$0);}' | tail -n+2 | awk '{print length($1)}'", - shell=True, - ) - bytes_np = np.fromstring(bytes_offsets, dtype=np.int64, sep=" ") - sizes_np = np.fromstring(fasta_lengths, dtype=np.int64, sep=" ") - return bytes_np, sizes_np - - def __setstate__(self, state): - self.__dict__ = state - self.threadlocal = threading.local() - - def __getstate__(self): - d = {} - for i, v in self.__dict__.items(): - if i != "threadlocal": - d[i] = v - return d - - def __del__(self): - if hasattr(self.threadlocal, "f"): - self.threadlocal.f.close() - del self.threadlocal.f - - @staticmethod - def exists(path): - return os.path.exists(fasta_file_path(path)) - - -class EncodedFastaDataset(FastaDataset): - """ - The FastaDataset returns raw sequences - this allows us to return - indices with a dictionary instead. 
- """ - - def __init__(self, path, dictionary): - super().__init__(path, cache_indices=True) - self.dictionary = dictionary - - def __getitem__(self, idx): - desc, seq = super().__getitem__(idx) - return self.dictionary.encode_line(seq, line_tokenizer=list).long() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/id_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/id_dataset.py deleted file mode 100644 index 3e4d7969cf2a26e852b466f165a6fadabae3b35f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/id_dataset.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch - -from . import FairseqDataset - - -class IdDataset(FairseqDataset): - def __getitem__(self, index): - return index - - def __len__(self): - return 0 - - def collater(self, samples): - return torch.tensor(samples) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/indexed_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/indexed_dataset.py deleted file mode 100644 index a86ba402e4d62f93afc80e1b346ebcfbbe9b2e21..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/indexed_dataset.py +++ /dev/null @@ -1,561 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os -import shutil -import struct -from functools import lru_cache - -import numpy as np -import torch -from fairseq.data.fasta_dataset import FastaDataset -from fairseq.file_io import PathManager - -from . 
import FairseqDataset - - -def __best_fitting_dtype(vocab_size=None): - if vocab_size is not None and vocab_size < 65500: - return np.uint16 - else: - return np.int32 - - -def get_available_dataset_impl(): - return ["raw", "lazy", "cached", "mmap", "fasta"] - - -def infer_dataset_impl(path): - if IndexedRawTextDataset.exists(path): - return "raw" - elif IndexedDataset.exists(path): - with open(index_file_path(path), "rb") as f: - magic = f.read(8) - if magic == IndexedDataset._HDR_MAGIC: - return "cached" - elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]: - return "mmap" - else: - return None - elif FastaDataset.exists(path): - return "fasta" - else: - return None - - -def make_builder(out_file, impl, vocab_size=None): - if impl == "mmap": - return MMapIndexedDatasetBuilder( - out_file, dtype=__best_fitting_dtype(vocab_size) - ) - elif impl == "fasta": - raise NotImplementedError - else: - return IndexedDatasetBuilder(out_file) - - -def make_dataset(path, impl, fix_lua_indexing=False, dictionary=None): - if impl == "raw" and IndexedRawTextDataset.exists(path): - assert dictionary is not None - return IndexedRawTextDataset(path, dictionary) - elif impl == "lazy" and IndexedDataset.exists(path): - return IndexedDataset(path, fix_lua_indexing=fix_lua_indexing) - elif impl == "cached" and IndexedDataset.exists(path): - return IndexedCachedDataset(path, fix_lua_indexing=fix_lua_indexing) - elif impl == "mmap" and MMapIndexedDataset.exists(path): - return MMapIndexedDataset(path) - elif impl == "fasta" and FastaDataset.exists(path): - from fairseq.data.fasta_dataset import EncodedFastaDataset - - return EncodedFastaDataset(path, dictionary) - return None - - -def dataset_exists(path, impl): - if impl == "raw": - return IndexedRawTextDataset.exists(path) - elif impl == "mmap": - return MMapIndexedDataset.exists(path) - else: - return IndexedDataset.exists(path) - - -def read_longs(f, n): - a = np.empty(n, dtype=np.int64) - f.readinto(a) - return a - - -def write_longs(f, a): - f.write(np.array(a, dtype=np.int64)) - - -dtypes = { - 1: np.uint8, - 2: np.int8, - 3: np.int16, - 4: np.int32, - 5: np.int64, - 6: np.float, - 7: np.double, - 8: np.uint16, -} - - -def code(dtype): - for k in dtypes.keys(): - if dtypes[k] == dtype: - return k - raise ValueError(dtype) - - -def index_file_path(prefix_path): - return prefix_path + ".idx" - - -def data_file_path(prefix_path): - return prefix_path + ".bin" - - -class IndexedDataset(FairseqDataset): - """Loader for TorchNet IndexedDataset""" - - _HDR_MAGIC = b"TNTIDX\x00\x00" - - def __init__(self, path, fix_lua_indexing=False): - super().__init__() - self.path = path - self.fix_lua_indexing = fix_lua_indexing - self.data_file = None - self.read_index(path) - - def read_index(self, path): - with open(index_file_path(path), "rb") as f: - magic = f.read(8) - assert magic == self._HDR_MAGIC, ( - "Index file doesn't match expected format. " - "Make sure that --dataset-impl is configured properly." 
- ) - version = f.read(8) - assert struct.unpack("= self._len: - raise IndexError("index out of range") - - def __del__(self): - if self.data_file: - self.data_file.close() - - @lru_cache(maxsize=8) - def __getitem__(self, i): - if not self.data_file: - self.read_data(self.path) - self.check_index(i) - tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] - a = np.empty(tensor_size, dtype=self.dtype) - self.data_file.seek(self.data_offsets[i] * self.element_size) - self.data_file.readinto(a) - item = torch.from_numpy(a).long() - if self.fix_lua_indexing: - item -= 1 # subtract 1 for 0-based indexing - return item - - def __len__(self): - return self._len - - def num_tokens(self, index): - return self.sizes[index] - - def size(self, index): - return self.sizes[index] - - @staticmethod - def exists(path): - return PathManager.exists(index_file_path(path)) and PathManager.exists( - data_file_path(path) - ) - - @property - def supports_prefetch(self): - return False # avoid prefetching to save memory - - -class IndexedCachedDataset(IndexedDataset): - def __init__(self, path, fix_lua_indexing=False): - super().__init__(path, fix_lua_indexing=fix_lua_indexing) - self.cache = None - self.cache_index = {} - - @property - def supports_prefetch(self): - return True - - def prefetch(self, indices): - if all(i in self.cache_index for i in indices): - return - if not self.data_file: - self.read_data(self.path) - indices = sorted(set(indices)) - total_size = 0 - for i in indices: - total_size += self.data_offsets[i + 1] - self.data_offsets[i] - self.cache = np.empty(total_size, dtype=self.dtype) - ptx = 0 - self.cache_index.clear() - for i in indices: - self.cache_index[i] = ptx - size = self.data_offsets[i + 1] - self.data_offsets[i] - a = self.cache[ptx : ptx + size] - self.data_file.seek(self.data_offsets[i] * self.element_size) - self.data_file.readinto(a) - ptx += size - if self.data_file: - # close and delete data file after prefetch so we can pickle - self.data_file.close() - self.data_file = None - - @lru_cache(maxsize=8) - def __getitem__(self, i): - self.check_index(i) - tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] - a = np.empty(tensor_size, dtype=self.dtype) - ptx = self.cache_index[i] - np.copyto(a, self.cache[ptx : ptx + a.size]) - item = torch.from_numpy(a).long() - if self.fix_lua_indexing: - item -= 1 # subtract 1 for 0-based indexing - return item - - -class IndexedRawTextDataset(FairseqDataset): - """Takes a text file as input and binarizes it in memory at instantiation. 
- Original lines are also kept in memory""" - - def __init__(self, path, dictionary, append_eos=True, reverse_order=False): - self.tokens_list = [] - self.lines = [] - self.sizes = [] - self.append_eos = append_eos - self.reverse_order = reverse_order - self.read_data(path, dictionary) - self.size = len(self.tokens_list) - - def read_data(self, path, dictionary): - with open(path, "r", encoding="utf-8") as f: - for line in f: - self.lines.append(line.strip("\n")) - tokens = dictionary.encode_line( - line, - add_if_not_exist=False, - append_eos=self.append_eos, - reverse_order=self.reverse_order, - ).long() - self.tokens_list.append(tokens) - self.sizes.append(len(tokens)) - self.sizes = np.array(self.sizes) - - def check_index(self, i): - if i < 0 or i >= self.size: - raise IndexError("index out of range") - - @lru_cache(maxsize=8) - def __getitem__(self, i): - self.check_index(i) - return self.tokens_list[i] - - def get_original_text(self, i): - self.check_index(i) - return self.lines[i] - - def __del__(self): - pass - - def __len__(self): - return self.size - - def num_tokens(self, index): - return self.sizes[index] - - def size(self, index): - return self.sizes[index] - - @staticmethod - def exists(path): - return PathManager.exists(path) - - -class IndexedDatasetBuilder(object): - element_sizes = { - np.uint8: 1, - np.int8: 1, - np.int16: 2, - np.int32: 4, - np.int64: 8, - np.float: 4, - np.double: 8, - } - - def __init__(self, out_file, dtype=np.int32): - self.out_file = open(out_file, "wb") - self.dtype = dtype - self.data_offsets = [0] - self.dim_offsets = [0] - self.sizes = [] - self.element_size = self.element_sizes[self.dtype] - - def add_item(self, tensor): - # +1 for Lua compatibility - bytes = self.out_file.write(np.array(tensor.numpy() + 1, dtype=self.dtype)) - self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) - for s in tensor.size(): - self.sizes.append(s) - self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size())) - - def merge_file_(self, another_file): - index = IndexedDataset(another_file) - assert index.dtype == self.dtype - - begin = self.data_offsets[-1] - for offset in index.data_offsets[1:]: - self.data_offsets.append(begin + offset) - self.sizes.extend(index.sizes) - begin = self.dim_offsets[-1] - for dim_offset in index.dim_offsets[1:]: - self.dim_offsets.append(begin + dim_offset) - - with open(data_file_path(another_file), "rb") as f: - while True: - data = f.read(1024) - if data: - self.out_file.write(data) - else: - break - - def finalize(self, index_file): - self.out_file.close() - index = open(index_file, "wb") - index.write(b"TNTIDX\x00\x00") - index.write(struct.pack("= self.total: - raise RuntimeError( - "Mismatch between actual and expected iterable length. " - "This may be caused by resuming training from a checkpoint using " - "a different number of GPUs, in which case you can try the " - "--reset-dataloader option. Alternatively you may have a train or " - "validation set that is smaller than the number of GPUs. If none " - "of these apply, please report this to the fairseq developers." - ) - self.n += 1 - yield x - - def __next__(self): - return next(self.itr) - - def has_next(self): - """Whether the iterator has been exhausted.""" - return self.n < len(self) - - def skip(self, num_to_skip): - """Fast-forward the iterator by skipping *num_to_skip* elements.""" - next(itertools.islice(self.itr, num_to_skip, num_to_skip), None) - return self - - def take(self, n): - """ - Truncates the iterator to n elements at most. 
- """ - self.total = min(self.total, n) - - # Propagate this change to the underlying iterator - # Only take after what we have already consumed (i.e. after restarting - # from checkpoint mid epoch, we have to subtract self.n which is the - # starting point) - # - # This to maintain the invariant self.total = self.n + len(iterable), - # before calling __next__ or __iter__ - propagated_take = max(n - self.n, 0) - if hasattr(self.iterable, "take"): - self.iterable.take(propagated_take) - else: - self.iterable = itertools.islice(self.iterable, propagated_take) - - -class EpochBatchIterating(object): - def __len__(self) -> int: - raise NotImplementedError - - @property - def next_epoch_idx(self): - raise NotImplementedError - - def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False): - """Return a new iterator over the dataset. - - Args: - shuffle (bool, optional): shuffle batches before returning the - iterator (default: True). - fix_batches_to_gpus: ensure that batches are always - allocated to the same shards across epochs. Requires - that :attr:`dataset` supports prefetching (default: False). - """ - raise NotImplementedError - - def end_of_epoch(self) -> bool: - """Returns whether the most recent epoch iterator has been exhausted""" - raise NotImplementedError - - @property - def iterations_in_epoch(self) -> int: - """The number of consumed batches in the current epoch.""" - raise NotImplementedError - - def state_dict(self): - """Returns a dictionary containing a whole state of the iterator.""" - raise NotImplementedError - - def load_state_dict(self, state_dict): - """Copies the state of the iterator from the given *state_dict*.""" - raise NotImplementedError - - -class StreamingEpochBatchIterator(EpochBatchIterating): - def __init__( - self, - dataset, - epoch=1, - num_shards=1, - shard_id=0, - ): - assert isinstance(dataset, torch.utils.data.IterableDataset) - self.dataset = dataset - self.epoch = max(epoch, 1) # we use 1-based indexing for epochs - self._current_epoch_iterator = None - self.num_shards = num_shards - self.shard_id = shard_id - - @property - def next_epoch_idx(self): - """Return the epoch index after *next_epoch_itr* is called.""" - if self._current_epoch_iterator is not None and self.end_of_epoch(): - return self.epoch + 1 - else: - return self.epoch - - def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False): - self.epoch = self.next_epoch_idx - if hasattr(self.dataset, "set_epoch"): - self.dataset.set_epoch(self.epoch) - self._current_epoch_iterator = CountingIterator( - iterable=ShardedIterator( - iterable=self.dataset, - num_shards=self.num_shards, - shard_id=self.shard_id, - ), - ) - return self._current_epoch_iterator - - def end_of_epoch(self) -> bool: - return not self._current_epoch_iterator.has_next() - - @property - def iterations_in_epoch(self) -> int: - if self._current_epoch_iterator is not None: - return self._current_epoch_iterator.n - return 0 - - def state_dict(self): - return { - "epoch": self.epoch, - } - - def load_state_dict(self, state_dict): - self.epoch = state_dict["epoch"] - - -class EpochBatchIterator(EpochBatchIterating): - """A multi-epoch iterator over a :class:`torch.utils.data.Dataset`. 
- - Compared to :class:`torch.utils.data.DataLoader`, this iterator: - - - can be reused across multiple epochs with the :func:`next_epoch_itr` - method (optionally shuffled between epochs) - - can be serialized/deserialized with the :func:`state_dict` and - :func:`load_state_dict` methods - - supports sharding with the *num_shards* and *shard_id* arguments - - Args: - dataset (~torch.utils.data.Dataset): dataset from which to load the data - collate_fn (callable): merges a list of samples to form a mini-batch - batch_sampler (~torch.utils.data.Sampler or a callable): an iterator over batches of - indices, or a callable to create such an iterator (~torch.utils.data.Sampler). - A callable batch_sampler will be called for each epoch to enable per epoch dynamic - batch iterators defined by this callable batch_sampler. - seed (int, optional): seed for random number generator for - reproducibility (default: 1). - num_shards (int, optional): shard the data iterator into N - shards (default: 1). - shard_id (int, optional): which shard of the data iterator to - return (default: 0). - num_workers (int, optional): how many subprocesses to use for data - loading. 0 means the data will be loaded in the main process - (default: 0). - epoch (int, optional): the epoch to start the iterator from - (default: 1). - buffer_size (int, optional): the number of batches to keep ready in the - queue. Helps speeding up dataloading. When buffer_size is zero, the - default torch.utils.data.DataLoader preloading is used. - timeout (int, optional): if positive, the timeout value for collecting a batch - from workers. Should always be non-negative (default: ``0``). - disable_shuffling (bool, optional): force disable shuffling - (default: ``False``). - """ - - def __init__( - self, - dataset, - collate_fn, - batch_sampler, - seed=1, - num_shards=1, - shard_id=0, - num_workers=0, - epoch=1, - buffer_size=0, - timeout=0, - disable_shuffling=False, - ): - assert isinstance(dataset, torch.utils.data.Dataset) - self.dataset = dataset - self.collate_fn = collate_fn - self.batch_sampler = batch_sampler - self._frozen_batches = ( - tuple(batch_sampler) if not callable(batch_sampler) else None - ) - self.seed = seed - self.num_shards = num_shards - self.shard_id = shard_id - self.num_workers = num_workers - # This upper limit here is to prevent people from abusing this feature - # in a shared computing environment. - self.buffer_size = min(buffer_size, 20) - self.timeout = timeout - self.disable_shuffling = disable_shuffling - - self.epoch = max(epoch, 1) # we use 1-based indexing for epochs - self.shuffle = not disable_shuffling - self._cur_epoch_itr = None - self._next_epoch_itr = None - self._supports_prefetch = getattr(dataset, "supports_prefetch", False) - - @property - def frozen_batches(self): - if self._frozen_batches is None: - self._frozen_batches = tuple(self.batch_sampler(self.dataset, self.epoch)) - return self._frozen_batches - - @property - def first_batch(self): - if len(self.frozen_batches) == 0: - raise Exception( - "The dataset is empty. This could indicate " - "that all elements in the dataset have been skipped. " - "Try increasing the max number of allowed tokens or using " - "a larger dataset." 
- ) - - if getattr(self.dataset, "supports_fetch_outside_dataloader", True): - return self.collate_fn([self.dataset[i] for i in self.frozen_batches[0]]) - else: - return "DUMMY" - - def __len__(self): - return int(math.ceil(len(self.frozen_batches) / float(self.num_shards))) - - @property - def n(self): - return self.iterations_in_epoch - - @property - def next_epoch_idx(self): - """Return the epoch index after *next_epoch_itr* is called.""" - if self._next_epoch_itr is not None: - return self.epoch - elif self._cur_epoch_itr is not None and self.end_of_epoch(): - return self.epoch + 1 - else: - return self.epoch - - def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False): - """Return a new iterator over the dataset. - - Args: - shuffle (bool, optional): shuffle batches before returning the - iterator (default: True). - fix_batches_to_gpus: ensure that batches are always - allocated to the same shards across epochs. Requires - that :attr:`dataset` supports prefetching (default: False). - """ - if self.disable_shuffling: - shuffle = False - self.epoch = self.next_epoch_idx - if hasattr(self.dataset, "set_epoch"): - self.dataset.set_epoch(self.epoch) - if self._next_epoch_itr is not None: - self._cur_epoch_itr = self._next_epoch_itr - self._next_epoch_itr = None - else: - if callable(self.batch_sampler): - # reset _frozen_batches to refresh the next epoch - self._frozen_batches = None - self._cur_epoch_itr = self._get_iterator_for_epoch( - self.epoch, - shuffle, - fix_batches_to_gpus=fix_batches_to_gpus, - ) - self.shuffle = shuffle - return self._cur_epoch_itr - - def end_of_epoch(self) -> bool: - """Returns whether the most recent epoch iterator has been exhausted""" - return not self._cur_epoch_itr.has_next() - - @property - def iterations_in_epoch(self): - """The number of consumed batches in the current epoch.""" - if self._cur_epoch_itr is not None: - return self._cur_epoch_itr.n - elif self._next_epoch_itr is not None: - return self._next_epoch_itr.n - return 0 - - def state_dict(self): - """Returns a dictionary containing a whole state of the iterator.""" - if self.end_of_epoch(): - epoch = self.epoch + 1 - iter_in_epoch = 0 - else: - epoch = self.epoch - iter_in_epoch = self.iterations_in_epoch - return { - "version": 2, - "epoch": epoch, - "iterations_in_epoch": iter_in_epoch, - "shuffle": self.shuffle, - } - - def load_state_dict(self, state_dict): - """Copies the state of the iterator from the given *state_dict*.""" - self.epoch = state_dict["epoch"] - itr_pos = state_dict.get("iterations_in_epoch", 0) - version = state_dict.get("version", 1) - if itr_pos > 0: - # fast-forward epoch iterator - self._next_epoch_itr = self._get_iterator_for_epoch( - self.epoch, - shuffle=state_dict.get("shuffle", True), - offset=itr_pos, - ) - if self._next_epoch_itr is None: - if version == 1: - # legacy behavior: we finished the epoch, increment epoch counter - self.epoch += 1 - else: - raise RuntimeError( - "Cannot resume training due to dataloader mismatch, please " - "report this to the fairseq developers. You can relaunch " - "training with `--reset-dataloader` and it should work." 
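Taken together, `next_epoch_itr`, `state_dict`, and `load_state_dict` give a resumable multi-epoch loop. The sketch below is illustrative only: the toy dataset, batches, and shapes are invented, and a real setup would use a fairseq task's dataset and batch sampler.

```python
# Hedged end-to-end sketch of EpochBatchIterator with a toy map-style dataset.
import torch
from fairseq.data import iterators

class ToyDataset(torch.utils.data.Dataset):
    def __init__(self):
        self.items = [torch.tensor([i, i + 1]) for i in range(4)]
    def __getitem__(self, index):
        return self.items[index]
    def __len__(self):
        return len(self.items)
    def collater(self, samples):
        # stack fixed-length items into a (batch, 2) tensor
        return torch.stack(samples)

dataset = ToyDataset()
itr = iterators.EpochBatchIterator(
    dataset=dataset,
    collate_fn=dataset.collater,
    batch_sampler=[[0, 1], [2, 3]],   # two precomputed batches of indices
    seed=1,
)

epoch_itr = itr.next_epoch_itr(shuffle=True)
for batch in epoch_itr:               # two steps, each a (2, 2) tensor
    pass
state = itr.state_dict()               # resumable position (epoch, iteration)

resumed = iterators.EpochBatchIterator(
    dataset=dataset, collate_fn=dataset.collater, batch_sampler=[[0, 1], [2, 3]]
)
resumed.load_state_dict(state)         # picks up at the start of the next epoch
```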
- ) - else: - self._next_epoch_itr = None - - def _get_iterator_for_epoch( - self, epoch, shuffle, fix_batches_to_gpus=False, offset=0 - ): - def shuffle_batches(batches, seed): - with data_utils.numpy_seed(seed): - np.random.shuffle(batches) - return batches - - if self._supports_prefetch: - batches = self.frozen_batches - - if shuffle and not fix_batches_to_gpus: - batches = shuffle_batches(list(batches), self.seed + epoch) - - batches = list( - ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[]) - ) - self.dataset.prefetch([i for s in batches for i in s]) - - if shuffle and fix_batches_to_gpus: - batches = shuffle_batches(batches, self.seed + epoch + self.shard_id) - else: - if shuffle: - batches = shuffle_batches(list(self.frozen_batches), self.seed + epoch) - else: - batches = self.frozen_batches - batches = list( - ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[]) - ) - - if offset > 0 and offset >= len(batches): - return None - - if self.num_workers > 0: - os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning" - - # Create data loader - itr = torch.utils.data.DataLoader( - self.dataset, - collate_fn=self.collate_fn, - batch_sampler=batches[offset:], - num_workers=self.num_workers, - timeout=self.timeout, - ) - - # Wrap with a BufferedIterator if needed - if self.buffer_size > 0: - itr = BufferedIterator(self.buffer_size, itr) - - # Wrap with CoutingIterator - itr = CountingIterator(itr, start=offset) - return itr - - -class GroupedIterator(CountingIterator): - """Wrapper around an iterable that returns groups (chunks) of items. - - Args: - iterable (iterable): iterable to wrap - chunk_size (int): size of each chunk - - Attributes: - n (int): number of elements consumed from this iterator - """ - - def __init__(self, iterable, chunk_size): - itr = _chunk_iterator(iterable, chunk_size) - super().__init__( - itr, - start=int(math.ceil(getattr(iterable, "n", 0) / float(chunk_size))), - total=int(math.ceil(len(iterable) / float(chunk_size))), - ) - self.chunk_size = chunk_size - - -def _chunk_iterator(itr, chunk_size): - chunk = [] - for x in itr: - chunk.append(x) - if len(chunk) == chunk_size: - yield chunk - chunk = [] - if len(chunk) > 0: - yield chunk - - -class ShardedIterator(CountingIterator): - """A sharded wrapper around an iterable, padded to length. - - Args: - iterable (iterable): iterable to wrap - num_shards (int): number of shards to split the iterable into - shard_id (int): which shard to iterator over - fill_value (Any, optional): padding value when the iterable doesn't - evenly divide *num_shards* (default: None). 
- - Attributes: - n (int): number of elements consumed from this iterator - """ - - def __init__(self, iterable, num_shards, shard_id, fill_value=None): - if shard_id < 0 or shard_id >= num_shards: - raise ValueError("shard_id must be between 0 and num_shards") - sharded_len = int(math.ceil(len(iterable) / float(num_shards))) - itr = map( - operator.itemgetter(1), - itertools.zip_longest( - range(sharded_len), - itertools.islice(iterable, shard_id, len(iterable), num_shards), - fillvalue=fill_value, - ), - ) - super().__init__( - itr, - start=int(math.ceil(getattr(iterable, "n", 0) / float(num_shards))), - total=sharded_len, - ) - - -class BackgroundConsumer(Thread): - def __init__(self, queue, source, max_len): - Thread.__init__(self) - - self._queue = queue - self._source = source - self._max_len = max_len - self.count = 0 - - def run(self): - try: - for item in self._source: - self._queue.put(item) - - # Stop if we reached the maximum length - self.count += 1 - if self._max_len is not None and self.count >= self._max_len: - break - - # Signal the consumer we are done. - self._queue.put(_sentinel) - except Exception as e: - self._queue.put(e) - - -class BufferedIterator(object): - def __init__(self, size, iterable): - self._queue = queue.Queue(size) - self._iterable = iterable - self._consumer = None - - self.start_time = time.time() - self.warning_time = None - - self.total = len(iterable) - - def _create_consumer(self): - self._consumer = BackgroundConsumer( - self._queue, - self._iterable, - self.total, - ) - self._consumer.daemon = True - self._consumer.start() - - def __iter__(self): - return self - - def __len__(self): - return self.total - - def take(self, n): - self.total = min(self.total, n) - - # Propagate this change to the underlying iterator - if hasattr(self._iterable, "take"): - self._iterable.take(n) - - def __next__(self): - # Create consumer if not created yet - if self._consumer is None: - self._create_consumer() - - # Notify the user if there is a data loading bottleneck - if self._queue.qsize() < min(2, max(1, self._queue.maxsize // 2)): - if time.time() - self.start_time > 5 * 60: - if ( - self.warning_time is None - or time.time() - self.warning_time > 15 * 60 - ): - logger.debug( - "Data loading buffer is empty or nearly empty. This may " - "indicate a data loading bottleneck, and increasing the " - "number of workers (--num-workers) may help." - ) - self.warning_time = time.time() - - # Get next example - item = self._queue.get(True) - if isinstance(item, Exception): - raise item - if item is _sentinel: - raise StopIteration() - return item diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/language_pair_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/language_pair_dataset.py deleted file mode 100644 index 1286b7944736a68336fbdeac0d7a1a998304f0fe..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/language_pair_dataset.py +++ /dev/null @@ -1,474 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
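Before moving on to the next file, a hedged illustration of the two iterator wrappers above: `GroupedIterator` chunks an iterable, while `ShardedIterator` deals elements out round-robin across shards (e.g. one shard per GPU).

```python
# Chunking and sharding a plain range, purely for illustration.
from fairseq.data import iterators

grouped = iterators.GroupedIterator(range(7), chunk_size=3)
print(list(grouped))        # [[0, 1, 2], [3, 4, 5], [6]]

shard0 = iterators.ShardedIterator(range(10), num_shards=2, shard_id=0)
shard1 = iterators.ShardedIterator(range(10), num_shards=2, shard_id=1)
print(list(shard0))         # [0, 2, 4, 6, 8]
print(list(shard1))         # [1, 3, 5, 7, 9]
```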
- -import logging - -import numpy as np -import torch -from fairseq.data import FairseqDataset, data_utils - - -logger = logging.getLogger(__name__) - - -def collate( - samples, - pad_idx, - eos_idx, - left_pad_source=True, - left_pad_target=False, - input_feeding=True, - pad_to_length=None, - pad_to_multiple=1, -): - if len(samples) == 0: - return {} - - def merge(key, other_key, left_pad, move_eos_to_beginning=False, pad_to_length=None): - return data_utils.collate_tokens( - [s[key] for s in samples], - [s[other_key] for s in samples], - pad_idx, - eos_idx, - left_pad, - move_eos_to_beginning, - pad_to_length=pad_to_length, - pad_to_multiple=pad_to_multiple, - ) - - def check_alignment(alignment, src_len, tgt_len): - if alignment is None or len(alignment) == 0: - return False - if ( - alignment[:, 0].max().item() >= src_len - 1 - or alignment[:, 1].max().item() >= tgt_len - 1 - ): - logger.warning("alignment size mismatch found, skipping alignment!") - return False - return True - - def compute_alignment_weights(alignments): - """ - Given a tensor of shape [:, 2] containing the source-target indices - corresponding to the alignments, a weight vector containing the - inverse frequency of each target index is computed. - For e.g. if alignments = [[5, 7], [2, 3], [1, 3], [4, 2]], then - a tensor containing [1., 0.5, 0.5, 1] should be returned (since target - index 3 is repeated twice) - """ - align_tgt = alignments[:, 1] - _, align_tgt_i, align_tgt_c = torch.unique( - align_tgt, return_inverse=True, return_counts=True - ) - align_weights = align_tgt_c[align_tgt_i[np.arange(len(align_tgt))]] - return 1.0 / align_weights.float() - - id = torch.IntTensor([s["id"] for s in samples]) - src_tokens = merge( - "source", "target", - left_pad=left_pad_source, - pad_to_length=pad_to_length["source"] if pad_to_length is not None else None, - ) - # sort by descending source length - src_lengths = torch.IntTensor( - [s["source"].ne(pad_idx).int().sum() for s in samples] - ) - src_lengths, sort_order = src_lengths.sort(descending=True) - id = id.index_select(0, sort_order) - src_tokens = src_tokens.index_select(0, sort_order) - - prev_output_tokens = None - target = None - if samples[0].get("target", None) is not None: - target = merge( - "target", "source", - left_pad=left_pad_target, - pad_to_length=pad_to_length["target"] - if pad_to_length is not None - else None, - ) - target = target.index_select(0, sort_order) - tgt_lengths = torch.IntTensor( - [s["target"].ne(pad_idx).int().sum() for s in samples] - ).index_select(0, sort_order) - ntokens = tgt_lengths.sum().item() - - if samples[0].get("prev_output_tokens", None) is not None: - prev_output_tokens = merge("prev_output_tokens", left_pad=left_pad_target) - elif input_feeding: - # we create a shifted version of targets for feeding the - # previous output token(s) into the next decoder step - prev_output_tokens = merge( - "target", "source", - left_pad=left_pad_target, - move_eos_to_beginning=True, - pad_to_length=pad_to_length["target"] - if pad_to_length is not None - else None, - ) - else: - ntokens = src_lengths.sum().item() - - batch = { - "id": id, - "nsentences": len(samples), - "ntokens": ntokens, - "net_input": { - "src_tokens": src_tokens, - "src_lengths": src_lengths, - }, - "target": target, - } - if prev_output_tokens is not None: - batch["net_input"]["prev_output_tokens"] = prev_output_tokens.index_select( - 0, sort_order - ) - - if samples[0].get("alignment", None) is not None: - bsz, tgt_sz = batch["target"].shape - src_sz = 
batch["net_input"]["src_tokens"].shape[1] - - offsets = torch.zeros((len(sort_order), 2), dtype=torch.int) - offsets[:, 1] += torch.arange(len(sort_order), dtype=torch.int) * tgt_sz - if left_pad_source: - offsets[:, 0] += src_sz - src_lengths - if left_pad_target: - offsets[:, 1] += tgt_sz - tgt_lengths - - alignments = [ - alignment + offset - for align_idx, offset, src_len, tgt_len in zip( - sort_order, offsets, src_lengths, tgt_lengths - ) - for alignment in [samples[align_idx]["alignment"].view(-1, 2)] - if check_alignment(alignment, src_len, tgt_len) - ] - - if len(alignments) > 0: - alignments = torch.cat(alignments, dim=0) - align_weights = compute_alignment_weights(alignments) - - batch["alignments"] = alignments - batch["align_weights"] = align_weights - - if samples[0].get("constraints", None) is not None: - # Collate the packed constraints across the samples, padding to - # the length of the longest sample. - lens = [sample.get("constraints").size(0) for sample in samples] - max_len = max(lens) - constraints = torch.zeros((len(samples), max(lens))).int() - for i, sample in enumerate(samples): - constraints[i, 0 : lens[i]] = samples[i].get("constraints") - batch["constraints"] = constraints - - return batch - - -class LanguagePairDataset(FairseqDataset): - """ - A pair of torch.utils.data.Datasets. - - Args: - src (torch.utils.data.Dataset): source dataset to wrap - src_sizes (List[int]): source sentence lengths - src_dict (~fairseq.data.Dictionary): source vocabulary - tgt (torch.utils.data.Dataset, optional): target dataset to wrap - tgt_sizes (List[int], optional): target sentence lengths - tgt_dict (~fairseq.data.Dictionary, optional): target vocabulary - left_pad_source (bool, optional): pad source tensors on the left side - (default: True). - left_pad_target (bool, optional): pad target tensors on the left side - (default: False). - shuffle (bool, optional): shuffle dataset elements before batching - (default: True). - input_feeding (bool, optional): create a shifted version of the targets - to be passed into the model for teacher forcing (default: True). - remove_eos_from_source (bool, optional): if set, removes eos from end - of source if it's present (default: False). - append_eos_to_target (bool, optional): if set, appends eos to end of - target if it's absent (default: False). - align_dataset (torch.utils.data.Dataset, optional): dataset - containing alignments. - constraints (Tensor, optional): 2d tensor with a concatenated, zero- - delimited list of constraints for each sentence. - append_bos (bool, optional): if set, appends bos to the beginning of - source/target sentence. - num_buckets (int, optional): if set to a value greater than 0, then - batches will be bucketed into the given number of batch shapes. - src_lang_id (int, optional): source language ID, if set, the collated batch - will contain a field 'src_lang_id' in 'net_input' which indicates the - source language of the samples. - tgt_lang_id (int, optional): target language ID, if set, the collated batch - will contain a field 'tgt_lang_id' which indicates the target language - of the samples. 
- """ - - def __init__( - self, - src, - src_sizes, - src_dict, - tgt=None, - tgt_sizes=None, - tgt_dict=None, - left_pad_source=True, - left_pad_target=False, - shuffle=True, - input_feeding=True, - remove_eos_from_source=False, - append_eos_to_target=False, - align_dataset=None, - constraints=None, - append_bos=False, - eos=None, - num_buckets=0, - src_lang_id=None, - tgt_lang_id=None, - pad_to_multiple=1, - ): - if tgt_dict is not None: - assert src_dict.pad() == tgt_dict.pad() - assert src_dict.eos() == tgt_dict.eos() - assert src_dict.unk() == tgt_dict.unk() - if tgt is not None: - assert len(src) == len( - tgt - ), "Source and target must contain the same number of examples" - self.src = src - self.tgt = tgt - self.src_sizes = np.array(src_sizes) - self.tgt_sizes = np.array(tgt_sizes) if tgt_sizes is not None else None - self.sizes = ( - np.vstack((self.src_sizes, self.tgt_sizes)).T - if self.tgt_sizes is not None - else self.src_sizes - ) - self.src_dict = src_dict - self.tgt_dict = tgt_dict - self.left_pad_source = left_pad_source - self.left_pad_target = left_pad_target - self.shuffle = shuffle - self.input_feeding = input_feeding - self.remove_eos_from_source = remove_eos_from_source - self.append_eos_to_target = append_eos_to_target - self.align_dataset = align_dataset - if self.align_dataset is not None: - assert ( - self.tgt_sizes is not None - ), "Both source and target needed when alignments are provided" - self.constraints = constraints - self.append_bos = append_bos - self.eos = eos if eos is not None else src_dict.eos() - self.src_lang_id = src_lang_id - self.tgt_lang_id = tgt_lang_id - if num_buckets > 0: - from fairseq.data import BucketPadLengthDataset - - self.src = BucketPadLengthDataset( - self.src, - sizes=self.src_sizes, - num_buckets=num_buckets, - pad_idx=self.src_dict.pad(), - left_pad=self.left_pad_source, - ) - self.src_sizes = self.src.sizes - logger.info("bucketing source lengths: {}".format(list(self.src.buckets))) - if self.tgt is not None: - self.tgt = BucketPadLengthDataset( - self.tgt, - sizes=self.tgt_sizes, - num_buckets=num_buckets, - pad_idx=self.tgt_dict.pad(), - left_pad=self.left_pad_target, - ) - self.tgt_sizes = self.tgt.sizes - logger.info( - "bucketing target lengths: {}".format(list(self.tgt.buckets)) - ) - - # determine bucket sizes using self.num_tokens, which will return - # the padded lengths (thanks to BucketPadLengthDataset) - num_tokens = np.vectorize(self.num_tokens, otypes=[np.int]) - self.bucketed_num_tokens = num_tokens(np.arange(len(self.src))) - self.buckets = [ - (None, num_tokens) for num_tokens in np.unique(self.bucketed_num_tokens) - ] - else: - self.buckets = None - self.pad_to_multiple = pad_to_multiple - - def get_batch_shapes(self): - return self.buckets - - def __getitem__(self, index): - tgt_item = self.tgt[index] if self.tgt is not None else None - src_item = self.src[index] - # Append EOS to end of tgt sentence if it does not have an EOS and remove - # EOS from end of src sentence if it exists. 
This is useful when we use - # use existing datasets for opposite directions i.e., when we want to - # use tgt_dataset as src_dataset and vice versa - if self.append_eos_to_target: - eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos() - if self.tgt and self.tgt[index][-1] != eos: - tgt_item = torch.cat([self.tgt[index], torch.IntTensor([eos])]) - - if self.append_bos: - bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos() - if self.tgt and self.tgt[index][0] != bos: - tgt_item = torch.cat([torch.IntTensor([bos]), self.tgt[index]]) - - bos = self.src_dict.bos() - if self.src[index][0] != bos: - src_item = torch.cat([torch.IntTensor([bos]), self.src[index]]) - - if self.remove_eos_from_source: - eos = self.src_dict.eos() - if self.src[index][-1] == eos: - src_item = self.src[index][:-1] - - example = { - "id": index, - "source": src_item, - "target": tgt_item, - } - if self.align_dataset is not None: - example["alignment"] = self.align_dataset[index] - if self.constraints is not None: - example["constraints"] = self.constraints[index] - return example - - def __len__(self): - return len(self.src) - - def collater(self, samples, pad_to_length=None): - """Merge a list of samples to form a mini-batch. - - Args: - samples (List[dict]): samples to collate - pad_to_length (dict, optional): a dictionary of - {'source': source_pad_to_length, 'target': target_pad_to_length} - to indicate the max length to pad to in source and target respectively. - - Returns: - dict: a mini-batch with the following keys: - - - `id` (LongTensor): example IDs in the original input order - - `ntokens` (int): total number of tokens in the batch - - `net_input` (dict): the input to the Model, containing keys: - - - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in - the source sentence of shape `(bsz, src_len)`. Padding will - appear on the left if *left_pad_source* is ``True``. - - `src_lengths` (LongTensor): 1D Tensor of the unpadded - lengths of each source sentence of shape `(bsz)` - - `prev_output_tokens` (LongTensor): a padded 2D Tensor of - tokens in the target sentence, shifted right by one - position for teacher forcing, of shape `(bsz, tgt_len)`. - This key will not be present if *input_feeding* is - ``False``. Padding will appear on the left if - *left_pad_target* is ``True``. - - `src_lang_id` (LongTensor): a long Tensor which contains source - language IDs of each sample in the batch - - - `target` (LongTensor): a padded 2D Tensor of tokens in the - target sentence of shape `(bsz, tgt_len)`. Padding will appear - on the left if *left_pad_target* is ``True``. - - `tgt_lang_id` (LongTensor): a long Tensor which contains target language - IDs of each sample in the batch - """ - res = collate( - samples, - pad_idx=self.src_dict.pad(), - eos_idx=self.eos, - left_pad_source=self.left_pad_source, - left_pad_target=self.left_pad_target, - input_feeding=self.input_feeding, - pad_to_length=pad_to_length, - pad_to_multiple=self.pad_to_multiple, - ) - if self.src_lang_id is not None or self.tgt_lang_id is not None: - src_tokens = res["net_input"]["src_tokens"] - bsz = src_tokens.size(0) - if self.src_lang_id is not None: - res["net_input"]["src_lang_id"] = ( - torch.IntTensor([[self.src_lang_id]]).expand(bsz, 1).to(src_tokens) - ) - if self.tgt_lang_id is not None: - res["tgt_lang_id"] = ( - torch.IntTensor([[self.tgt_lang_id]]).expand(bsz, 1).to(src_tokens) - ) - return res - - def num_tokens(self, index): - """Return the number of tokens in a sample. 
This value is used to - enforce ``--max-tokens`` during batching.""" - buckets = [16, 32, 64, 128, 256, 512, 1024] - src = max( - self.src_sizes[index], - self.tgt_sizes[index] if self.tgt_sizes is not None else 0, - ) - for buck in buckets: - if src <= buck: - return buck - - def size(self, index): - """Return an example's size as a float or tuple. This value is used when - filtering a dataset with ``--max-positions``.""" - return ( - self.src_sizes[index], - self.tgt_sizes[index] if self.tgt_sizes is not None else 0, - ) - - def ordered_indices(self): - """Return an ordered list of indices. Batches will be constructed based - on this order.""" - if self.shuffle: - indices = np.random.permutation(len(self)).astype(np.int64) - else: - indices = np.arange(len(self), dtype=np.int64) - if self.buckets is None: - # sort by target length, then source length - if self.tgt_sizes is not None: - indices = indices[np.argsort(self.tgt_sizes[indices], kind="mergesort")] - return indices[np.argsort(self.src_sizes[indices], kind="mergesort")] - else: - # sort by bucketed_num_tokens, which is: - # max(padded_src_len, padded_tgt_len) - return indices[ - np.argsort(self.bucketed_num_tokens[indices], kind="mergesort") - ] - - @property - def supports_prefetch(self): - return getattr(self.src, "supports_prefetch", False) and ( - getattr(self.tgt, "supports_prefetch", False) or self.tgt is None - ) - - def prefetch(self, indices): - self.src.prefetch(indices) - if self.tgt is not None: - self.tgt.prefetch(indices) - if self.align_dataset is not None: - self.align_dataset.prefetch(indices) - - def filter_indices_by_size(self, indices, max_sizes): - """Filter a list of sample indices. Remove those that are longer - than specified in max_sizes. - - Args: - indices (np.array): original array of sample indices - max_sizes (int or list[int] or tuple[int]): max sample size, - can be defined separately for src and tgt (then list or tuple) - - Returns: - np.array: filtered sample array - list: list of removed indices - """ - return data_utils.filter_paired_dataset_indices_by_size( - self.src_sizes, - self.tgt_sizes, - indices, - max_sizes, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/__init__.py deleted file mode 100644 index 9bd5c72b5e9d7f67fb7e4ef10808d7ec08967ff4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
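Before the legacy datasets, a hedged sketch of wiring up the `LanguagePairDataset` defined above with a toy in-memory vocabulary; real pipelines load binarized indexed datasets and dictionaries from disk, and all names and sentences here are invented.

```python
# Build a tiny LanguagePairDataset in memory and inspect one example.
import torch
from fairseq.data import Dictionary, LanguagePairDataset

src_dict = Dictionary()
for word in ("hello", "world"):
    src_dict.add_symbol(word)

def encode(words):
    # token ids followed by eos, as fairseq binarization would produce
    return torch.LongTensor([src_dict.index(w) for w in words] + [src_dict.eos()])

src = [encode(["hello", "world"]), encode(["world"])]
tgt = [encode(["world", "hello"]), encode(["hello"])]

pairs = LanguagePairDataset(
    src, [len(t) for t in src], src_dict,
    tgt=tgt, tgt_sizes=[len(t) for t in tgt], tgt_dict=src_dict,
)
example = pairs[0]
print(example["source"], example["target"])   # id tensors for one sentence pair
print(pairs.ordered_indices())                # batching order (shuffled by default)
```

The `collater()` shown above would then pad such examples into a model-ready batch; it is not called here because this NPU-adapted copy passes both source and target lists to `data_utils.collate_tokens`, whose modified signature is not visible in this diff.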
- -from .block_pair_dataset import BlockPairDataset -from .masked_lm_dataset import MaskedLMDataset -from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary - - -__all__ = [ - "BertDictionary", - "BlockPairDataset", - "MaskedLMDataset", - "MaskedLMDictionary", -] diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/block_pair_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/block_pair_dataset.py deleted file mode 100644 index ba069b46052286c531b4f9706d96788732cd2ad2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/block_pair_dataset.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import numpy as np -import torch -from fairseq.data import FairseqDataset - - -class BlockPairDataset(FairseqDataset): - """Break a Dataset of tokens into sentence pair blocks for next sentence - prediction as well as masked language model. - - High-level logics are: - 1. break input tensor to tensor blocks - 2. pair the blocks with 50% next sentence and 50% random sentence - 3. return paired blocks as well as related segment labels - - Args: - dataset (~torch.utils.data.Dataset): dataset to break into blocks - sizes: array of sentence lengths - dictionary: dictionary for the task - block_size: maximum block size - break_mode: mode for breaking copurs into block pairs. currently we support - 2 modes - doc: respect document boundaries and each part of the pair should belong to on document - none: don't respect any boundary and cut tokens evenly - short_seq_prob: probability for generating shorter block pairs - doc_break_size: Size for empty line separating documents. Typically 1 if - the sentences have eos, 0 otherwise. - """ - - def __init__( - self, - dataset, - dictionary, - sizes, - block_size, - break_mode="doc", - short_seq_prob=0.1, - doc_break_size=1, - ): - super().__init__() - self.dataset = dataset - self.pad = dictionary.pad() - self.eos = dictionary.eos() - self.cls = dictionary.cls() - self.mask = dictionary.mask() - self.sep = dictionary.sep() - self.break_mode = break_mode - self.dictionary = dictionary - self.short_seq_prob = short_seq_prob - self.block_indices = [] - - assert len(dataset) == len(sizes) - - if break_mode == "doc": - cur_doc = [] - for sent_id, sz in enumerate(sizes): - assert doc_break_size == 0 or sz != 0, ( - "when doc_break_size is non-zero, we expect documents to be" - "separated by a blank line with a single eos." 
- ) - # empty line as document separator - if sz == doc_break_size: - if len(cur_doc) == 0: - continue - self.block_indices.append(cur_doc) - cur_doc = [] - else: - cur_doc.append(sent_id) - max_num_tokens = block_size - 3 # Account for [CLS], [SEP], [SEP] - self.sent_pairs = [] - self.sizes = [] - for doc_id, doc in enumerate(self.block_indices): - self._generate_sentence_pair(doc, doc_id, max_num_tokens, sizes) - elif break_mode is None or break_mode == "none": - # each block should have half of the block size since we are constructing block pair - sent_length = (block_size - 3) // 2 - total_len = sum(dataset.sizes) - length = math.ceil(total_len / sent_length) - - def block_at(i): - start = i * sent_length - end = min(start + sent_length, total_len) - return (start, end) - - sent_indices = np.array([block_at(i) for i in range(length)]) - sent_sizes = np.array([e - s for s, e in sent_indices]) - dataset_index = self._sent_to_dataset_index(sent_sizes) - - # pair sentences - self._pair_sentences(dataset_index) - else: - raise ValueError("Invalid break_mode: " + break_mode) - - def _pair_sentences(self, dataset_index): - """ - Give a list of evenly cut blocks/sentences, pair these sentences with 50% - consecutive sentences and 50% random sentences. - This is used for none break mode - """ - # pair sentences - for sent_id, sent in enumerate(dataset_index): - next_sent_label = ( - 1 if np.random.rand() > 0.5 and sent_id != len(dataset_index) - 1 else 0 - ) - if next_sent_label: - next_sent = dataset_index[sent_id + 1] - else: - next_sent = dataset_index[ - self._skip_sampling(len(dataset_index), [sent_id, sent_id + 1]) - ] - self.sent_pairs.append((sent, next_sent, next_sent_label)) - - # The current blocks don't include the special tokens but the - # sizes already account for this - self.sizes.append(3 + sent[3] + next_sent[3]) - - def _sent_to_dataset_index(self, sent_sizes): - """ - Build index mapping block indices to the underlying dataset indices - """ - dataset_index = [] - ds_idx, ds_remaining = -1, 0 - for to_consume in sent_sizes: - sent_size = to_consume - if ds_remaining == 0: - ds_idx += 1 - ds_remaining = sent_sizes[ds_idx] - start_ds_idx = ds_idx - start_offset = sent_sizes[ds_idx] - ds_remaining - while to_consume > ds_remaining: - to_consume -= ds_remaining - ds_idx += 1 - ds_remaining = sent_sizes[ds_idx] - ds_remaining -= to_consume - dataset_index.append( - ( - start_ds_idx, # starting index in dataset - start_offset, # starting offset within starting index - ds_idx, # ending index in dataset - sent_size, # sentence length - ) - ) - assert ds_remaining == 0 - assert ds_idx == len(self.dataset) - 1 - return dataset_index - - def _generate_sentence_pair(self, doc, doc_id, max_num_tokens, sizes): - """ - Go through a single document and genrate sentence paris from it - """ - current_chunk = [] - current_length = 0 - curr = 0 - # To provide more randomness, we decrease target seq length for parts of - # samples (10% by default). Note that max_num_tokens is the hard threshold - # for batching and will never be changed. 
- target_seq_length = max_num_tokens - if np.random.random() < self.short_seq_prob: - target_seq_length = np.random.randint(2, max_num_tokens) - # loop through all sentences in document - while curr < len(doc): - sent_id = doc[curr] - current_chunk.append(sent_id) - current_length = sum(sizes[current_chunk]) - # split chunk and generate pair when exceed target_seq_length or - # finish the loop - if curr == len(doc) - 1 or current_length >= target_seq_length: - # split the chunk into 2 parts - a_end = 1 - if len(current_chunk) > 2: - a_end = np.random.randint(1, len(current_chunk) - 1) - sent_a = current_chunk[:a_end] - len_a = sum(sizes[sent_a]) - # generate next sentence label, note that if there is only 1 sentence - # in current chunk, label is always 0 - next_sent_label = ( - 1 if np.random.rand() > 0.5 and len(current_chunk) != 1 else 0 - ) - if not next_sent_label: - # if next sentence label is 0, sample sent_b from a random doc - target_b_length = target_seq_length - len_a - rand_doc_id = self._skip_sampling(len(self.block_indices), [doc_id]) - random_doc = self.block_indices[rand_doc_id] - random_start = np.random.randint(0, len(random_doc)) - sent_b = [] - len_b = 0 - for j in range(random_start, len(random_doc)): - sent_b.append(random_doc[j]) - len_b = sum(sizes[sent_b]) - if len_b >= target_b_length: - break - # return the second part of the chunk since it's not used - num_unused_segments = len(current_chunk) - a_end - curr -= num_unused_segments - else: - # if next sentence label is 1, use the second part of chunk as sent_B - sent_b = current_chunk[a_end:] - len_b = sum(sizes[sent_b]) - # currently sent_a and sent_B may be longer than max_num_tokens, - # truncate them and return block idx and offsets for them - sent_a, sent_b = self._truncate_sentences( - sent_a, sent_b, max_num_tokens - ) - self.sent_pairs.append((sent_a, sent_b, next_sent_label)) - self.sizes.append(3 + sent_a[3] + sent_b[3]) - current_chunk = [] - curr += 1 - - def _skip_sampling(self, total, skip_ids): - """ - Generate a random integer which is not in skip_ids. Sample range is [0, total) - TODO: ids in skip_ids should be consecutive, we can extend it to more generic version later - """ - rand_id = np.random.randint(total - len(skip_ids)) - return rand_id if rand_id < min(skip_ids) else rand_id + len(skip_ids) - - def _truncate_sentences(self, sent_a, sent_b, max_num_tokens): - """ - Trancate a pair of sentence to limit total length under max_num_tokens - Logics: - 1. Truncate longer sentence - 2. 
Tokens to be truncated could be at the beginning or the end of the sentnce - Returns: - Truncated sentences represented by dataset idx - """ - len_a, len_b = sum(self.dataset.sizes[sent_a]), sum(self.dataset.sizes[sent_b]) - front_cut_a = front_cut_b = end_cut_a = end_cut_b = 0 - - while True: - total_length = ( - len_a + len_b - front_cut_a - front_cut_b - end_cut_a - end_cut_b - ) - if total_length <= max_num_tokens: - break - - if len_a - front_cut_a - end_cut_a > len_b - front_cut_b - end_cut_b: - if np.random.rand() < 0.5: - front_cut_a += 1 - else: - end_cut_a += 1 - else: - if np.random.rand() < 0.5: - front_cut_b += 1 - else: - end_cut_b += 1 - - # calculate ds indices as well as offsets and return - truncated_sent_a = self._cut_sentence(sent_a, front_cut_a, end_cut_a) - truncated_sent_b = self._cut_sentence(sent_b, front_cut_b, end_cut_b) - return truncated_sent_a, truncated_sent_b - - def _cut_sentence(self, sent, front_cut, end_cut): - """ - Cut a sentence based on the numbers of tokens to be cut from beginning and end - Represent the sentence as dataset idx and return - """ - start_ds_idx, end_ds_idx, offset = sent[0], sent[-1], 0 - target_len = sum(self.dataset.sizes[sent]) - front_cut - end_cut - while front_cut > 0: - if self.dataset.sizes[start_ds_idx] > front_cut: - offset += front_cut - break - else: - front_cut -= self.dataset.sizes[start_ds_idx] - start_ds_idx += 1 - while end_cut > 0: - if self.dataset.sizes[end_ds_idx] > end_cut: - break - else: - end_cut -= self.dataset.sizes[end_ds_idx] - end_ds_idx -= 1 - return start_ds_idx, offset, end_ds_idx, target_len - - def _fetch_block(self, start_ds_idx, offset, end_ds_idx, length): - """ - Fetch a block of tokens based on its dataset idx - """ - buffer = torch.cat( - [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)] - ) - s, e = offset, offset + length - return buffer[s:e] - - def __getitem__(self, index): - block1, block2, next_sent_label = self.sent_pairs[index] - block1 = self._fetch_block(*block1) - block2 = self._fetch_block(*block2) - return block1, block2, next_sent_label - - def __len__(self): - return len(self.sizes) - - @property - def supports_prefetch(self): - return getattr(self.dataset, "supports_prefetch", False) - - def prefetch(self, indices): - prefetch_idx = set() - for index in indices: - for block1, block2, _ in [self.sent_pairs[index]]: - for ds_idx in range(block1[0], block1[2] + 1): - prefetch_idx.add(ds_idx) - for ds_idx in range(block2[0], block2[2] + 1): - prefetch_idx.add(ds_idx) - self.dataset.prefetch(prefetch_idx) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/masked_lm_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/masked_lm_dataset.py deleted file mode 100644 index dd8ea2c60aff306ab3a756223a298a28d41a4991..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/masked_lm_dataset.py +++ /dev/null @@ -1,303 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
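The `_skip_sampling` helper in `BlockPairDataset` above uses a small trick worth restating on its own: sample uniformly from `[0, total)` while excluding a consecutive run of ids, without any rejection loop. The sketch below mirrors the deleted method and, like it, assumes `skip_ids` are consecutive.

```python
# Standalone sketch of the _skip_sampling trick.
import numpy as np

def skip_sampling(total, skip_ids):
    rand_id = np.random.randint(total - len(skip_ids))
    return rand_id if rand_id < min(skip_ids) else rand_id + len(skip_ids)

samples = {skip_sampling(10, [4, 5]) for _ in range(1000)}
assert 4 not in samples and 5 not in samples
```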
- -import math -from typing import Dict, List, Tuple - -import numpy as np -import torch -from fairseq.data import Dictionary, FairseqDataset, data_utils -from fairseq.data.concat_dataset import ConcatDataset -from fairseq.data.legacy.block_pair_dataset import BlockPairDataset -from fairseq.data.token_block_dataset import TokenBlockDataset - - -class MaskedLMDataset(FairseqDataset): - """ - A wrapper Dataset for masked language modelling. The dataset - wraps around TokenBlockDataset or BlockedPairDataset and creates a batch - where the input blocks are masked according to the specified masking - probability. Additionally the batch can also contain sentence level targets - if this is specified. - - Args: - dataset: Dataset which generates blocks of data. Only BlockPairDataset - and TokenBlockDataset are supported. - sizes: Sentence lengths - vocab: Dictionary with the vocabulary and special tokens. - pad_idx: Id of padding token in dictionary - mask_idx: Id of mask token in dictionary - classif_token_idx: Id of classification token in dictionary. This is the - token associated with the sentence embedding (Eg: CLS for BERT) - sep_token_idx: Id of separator token in dictionary - (Eg: SEP in BERT) - seed: Seed for random number generator for reproducibility. - shuffle: Shuffle the elements before batching. - has_pairs: Specifies whether the underlying dataset - generates a pair of blocks along with a sentence_target or not. - Setting it to True assumes that the underlying dataset generates a - label for the pair of sentences which is surfaced as - sentence_target. The default value assumes a single block with no - sentence target. - segment_id: An optional segment id for filling in the segment labels - when we are in the single block setting (Eg: XLM). Default is 0. - masking_ratio: specifies what percentage of the blocks should be masked. - masking_prob: specifies the probability of a given token being - replaced with the "MASK" token. - random_token_prob: specifies the probability of a given token being - replaced by a random token from the vocabulary. 
- """ - - def __init__( - self, - dataset: FairseqDataset, - sizes: np.ndarray, - vocab: Dictionary, - pad_idx: int, - mask_idx: int, - classif_token_idx: int, - sep_token_idx: int, - seed: int = 1, - shuffle: bool = True, - has_pairs: bool = True, - segment_id: int = 0, - masking_ratio: float = 0.15, - masking_prob: float = 0.8, - random_token_prob: float = 0.1, - ): - # Make sure the input datasets are the ones supported - assert ( - isinstance(dataset, TokenBlockDataset) - or isinstance(dataset, BlockPairDataset) - or isinstance(dataset, ConcatDataset) - ), ( - "MaskedLMDataset only wraps TokenBlockDataset or BlockPairDataset or " - "ConcatDataset" - ) - - self.dataset = dataset - self.sizes = np.array(sizes) - self.vocab = vocab - self.pad_idx = pad_idx - self.mask_idx = mask_idx - self.classif_token_idx = classif_token_idx - self.sep_token_idx = sep_token_idx - self.shuffle = shuffle - self.seed = seed - self.has_pairs = has_pairs - self.segment_id = segment_id - self.masking_ratio = masking_ratio - self.masking_prob = masking_prob - self.random_token_prob = random_token_prob - - # If we have only one block then sizes needs to be updated to include - # the classification token - if not has_pairs: - self.sizes = self.sizes + 1 - - def __getitem__(self, index: int): - # if has_pairs, then expect 2 blocks and a sentence target - if self.has_pairs: - (block_one, block_two, sentence_target) = self.dataset[index] - else: - block_one = self.dataset[index] - - return { - "id": index, - "block_one": block_one, - "block_two": block_two if self.has_pairs else None, - "sentence_target": sentence_target if self.has_pairs else None, - } - - def __len__(self): - return len(self.dataset) - - def _mask_block( - self, - sentence: np.ndarray, - mask_idx: int, - pad_idx: int, - dictionary_token_range: Tuple, - ): - """ - Mask tokens for Masked Language Model training - Samples mask_ratio tokens that will be predicted by LM. - - Note:This function may not be efficient enough since we had multiple - conversions between np and torch, we can replace them with torch - operators later. - - Args: - sentence: 1d tensor to be masked - mask_idx: index to use for masking the sentence - pad_idx: index to use for masking the target for tokens we aren't - predicting - dictionary_token_range: range of indices in dictionary which can - be used for random word replacement - (e.g. without special characters) - Return: - masked_sent: masked sentence - target: target with words which we are not predicting replaced - by pad_idx - """ - masked_sent = np.copy(sentence) - sent_length = len(sentence) - mask_num = math.ceil(sent_length * self.masking_ratio) - mask = np.random.choice(sent_length, mask_num, replace=False) - target = np.copy(sentence) - - for i in range(sent_length): - if i in mask: - rand = np.random.random() - - # replace with mask if probability is less than masking_prob - # (Eg: 0.8) - if rand < self.masking_prob: - masked_sent[i] = mask_idx - - # replace with random token if probability is less than - # masking_prob + random_token_prob (Eg: 0.9) - elif rand < (self.masking_prob + self.random_token_prob): - # sample random token from dictionary - masked_sent[i] = np.random.randint( - dictionary_token_range[0], dictionary_token_range[1] - ) - else: - target[i] = pad_idx - - return masked_sent, target - - def _collate(self, samples: List[Dict], pad_idx: int, eos_idx: int): - """ - Does the heavy lifting for creating a batch from the input list of - examples. The logic is as follows: - 1. Mask the input blocks. 
In case has_pair is True then we have 2 - blocks to mask. - 2. Prepend the first masked block tensor with the special token - used as sentence embedding. Eg: CLS in BERT. This happens - irrespective of the value of has_pair. - 3. If has_pair is True, then append the first masked block with the - special separator token (eg: SEP for BERT) and compute segment - label accordingly. In this case, also append the second masked - block with this special separator token and compute its segment - label. - 4. For the targets tensor, prepend and append with padding index - accordingly. - 5. Concatenate all tensors. - """ - if len(samples) == 0: - return {} - # To ensure determinism, we reset the state of the PRNG after every - # batch based on the seed and the first id of the batch. This ensures - # that across epochs we get the same mask for the same example. This - # is needed for reproducibility and is how BERT does masking - # TODO: Can we add deteminism without this constraint? - with data_utils.numpy_seed(self.seed + samples[0]["id"]): - for s in samples: - - # token range is needed for replacing with random token during - # masking - token_range = (self.vocab.nspecial, len(self.vocab)) - - # mask according to specified probabilities. - masked_blk_one, masked_tgt_one = self._mask_block( - s["block_one"], - self.mask_idx, - self.pad_idx, - token_range, - ) - - tokens = np.concatenate([[self.classif_token_idx], masked_blk_one]) - targets = np.concatenate([[self.pad_idx], masked_tgt_one]) - segments = np.ones(len(tokens)) * self.segment_id - - # if has_pairs is True then we need to add the SEP token to both - # the blocks after masking and re-compute segments based on the new - # lengths. - if self.has_pairs: - tokens_one = np.concatenate([tokens, [self.sep_token_idx]]) - targets_one = np.concatenate([targets, [self.pad_idx]]) - - masked_blk_two, masked_tgt_two = self._mask_block( - s["block_two"], self.mask_idx, self.pad_idx, token_range - ) - tokens_two = np.concatenate([masked_blk_two, [self.sep_token_idx]]) - targets_two = np.concatenate([masked_tgt_two, [self.pad_idx]]) - - # block + 1 sep + 1 special (CLS) - segments_one = np.zeros(len(tokens_one)) - # block + 1 sep - segments_two = np.ones(len(tokens_two)) - - tokens = np.concatenate([tokens_one, tokens_two]) - targets = np.concatenate([targets_one, targets_two]) - segments = np.concatenate([segments_one, segments_two]) - - s["source"] = torch.LongTensor(tokens) - s["segment_labels"] = torch.LongTensor(segments) - s["lm_target"] = torch.LongTensor(targets) - - def merge(key): - return data_utils.collate_tokens( - [s[key] for s in samples], pad_idx, eos_idx, left_pad=False - ) - - return { - "id": torch.LongTensor([s["id"] for s in samples]), - "ntokens": sum(len(s["source"]) for s in samples), - "net_input": { - "src_tokens": merge("source"), - "segment_labels": merge("segment_labels"), - }, - "lm_target": merge("lm_target"), - "sentence_target": torch.LongTensor([s["sentence_target"] for s in samples]) - if self.has_pairs - else None, - "nsentences": len(samples), - } - - def collater(self, samples: List[Dict]): - """Merge a list of samples to form a mini-batch. - - Args: - samples (List[dict]): samples to collate - - Returns: - dict: a mini-batch of data - """ - return self._collate(samples, self.vocab.pad(), self.vocab.eos()) - - def num_tokens(self, index: int): - """ - Return the number of tokens in a sample. This value is used to - enforce max-tokens during batching. 
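The masking rule implemented by `_mask_block` above (sample roughly 15% of positions, then 80% become `[MASK]`, 10% become a random token, 10% stay unchanged, with unmasked positions padded out of the target) can be restated as a standalone numpy sketch; the toy id conventions (0 = pad, 1 = mask, real tokens from 2) are invented for the example.

```python
# Standalone sketch mirroring _mask_block()'s 80/10/10 masking rule.
import math
import numpy as np

def mask_block(sentence, mask_idx=1, pad_idx=0, token_range=(2, 100),
               masking_ratio=0.15, masking_prob=0.8, random_token_prob=0.1):
    masked, target = np.copy(sentence), np.copy(sentence)
    picked = set(np.random.choice(len(sentence),
                                  math.ceil(len(sentence) * masking_ratio),
                                  replace=False))
    for i in range(len(sentence)):
        if i in picked:
            r = np.random.random()
            if r < masking_prob:                        # replace with [MASK]
                masked[i] = mask_idx
            elif r < masking_prob + random_token_prob:  # replace with a random token
                masked[i] = np.random.randint(*token_range)
            # otherwise keep the original token
        else:
            target[i] = pad_idx                         # only predict masked slots
    return masked, target

masked, target = mask_block(np.arange(2, 22))
```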
- """ - return self.sizes[index] - - def size(self, index: int): - """ - Return an example's size as a float or tuple. This value is used when - filtering a dataset with max-positions. - """ - return self.sizes[index] - - def ordered_indices(self): - """ - Return an ordered list of indices. Batches will be constructed based - on this order. - """ - if self.shuffle: - return np.random.permutation(len(self)) - else: - order = [np.arange(len(self))] - order.append(self.sizes) - return np.lexsort(order) - - @property - def supports_prefetch(self): - return getattr(self.dataset, "supports_prefetch", False) - - def prefetch(self, indices): - self.dataset.prefetch(indices) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/masked_lm_dictionary.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/masked_lm_dictionary.py deleted file mode 100644 index dee88f7a3ed72ea465ea4e8ffe7b1c01ff6f57f1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/legacy/masked_lm_dictionary.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.data import Dictionary - - -class MaskedLMDictionary(Dictionary): - """ - Dictionary for Masked Language Modelling tasks. This extends Dictionary by - adding the mask symbol. - """ - - def __init__( - self, - pad="", - eos="", - unk="", - mask="", - ): - super().__init__(pad=pad, eos=eos, unk=unk) - self.mask_word = mask - self.mask_index = self.add_symbol(mask) - self.nspecial = len(self.symbols) - - def mask(self): - """Helper to get index of mask symbol""" - return self.mask_index - - -class BertDictionary(MaskedLMDictionary): - """ - Dictionary for BERT task. This extends MaskedLMDictionary by adding support - for cls and sep symbols. - """ - - def __init__( - self, - pad="", - eos="", - unk="", - mask="", - cls="", - sep="", - ): - super().__init__(pad=pad, eos=eos, unk=unk, mask=mask) - self.cls_word = cls - self.sep_word = sep - self.cls_index = self.add_symbol(cls) - self.sep_index = self.add_symbol(sep) - self.nspecial = len(self.symbols) - - def cls(self): - """Helper to get index of cls symbol""" - return self.cls_index - - def sep(self): - """Helper to get index of sep symbol""" - return self.sep_index diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/list_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/list_dataset.py deleted file mode 100644 index 12f00aa43661d6bad701c9e72653ba8779136906..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/list_dataset.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . 
import BaseWrapperDataset - - -class ListDataset(BaseWrapperDataset): - def __init__(self, dataset, sizes=None): - super().__init__(dataset) - self._sizes = sizes - - def __iter__(self): - for x in self.dataset: - yield x - - def collater(self, samples): - return samples - - @property - def sizes(self): - return self._sizes - - def num_tokens(self, index): - return self.sizes[index] - - def size(self, index): - return self.sizes[index] - - def set_epoch(self, epoch): - pass diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/lm_context_window_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/lm_context_window_dataset.py deleted file mode 100644 index 29ad887b7daf0658cc137ea2199c76fc2502b2ec..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/lm_context_window_dataset.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch -from fairseq.data.monolingual_dataset import MonolingualDataset - -from . import FairseqDataset - - -class LMContextWindowDataset(FairseqDataset): - """Wraps a MonolingualDataset and provides more context for evaluation.""" - - def __init__(self, dataset, tokens_per_sample, context_window, pad_idx): - assert isinstance(dataset, MonolingualDataset) - assert context_window > 0 - self.dataset = dataset - self.tokens_per_sample = tokens_per_sample - self.context_window = context_window - self.pad_idx = pad_idx - self.prev_tokens = np.empty([0]) - - def __getitem__(self, index): - return self.dataset[index] - - def __len__(self): - return len(self.dataset) - - def collater(self, samples): - sample = self.dataset.collater(samples) - - pad = self.pad_idx - max_sample_len = self.tokens_per_sample + self.context_window - - bsz, tsz = sample["net_input"]["src_tokens"].shape - start_idxs = [0] * bsz - toks = sample["net_input"]["src_tokens"] - lengths = sample["net_input"]["src_lengths"] - tgt = sample["target"] - new_toks = np.empty([bsz, tsz + self.context_window], dtype=np.int64) - new_tgt = np.full([bsz, tsz + self.context_window], pad, dtype=np.int64) - sample_lens = toks.ne(pad).long().sum(dim=1).cpu() - for i in range(bsz): - sample_len = sample_lens[i] - extra = len(self.prev_tokens) + sample_len - max_sample_len - if extra > 0: - self.prev_tokens = self.prev_tokens[extra:] - pads = np.full(self.context_window - len(self.prev_tokens), pad) - new_toks[i] = np.concatenate([self.prev_tokens, toks[i].numpy(), pads]) - new_tgt[ - i, len(self.prev_tokens) : len(self.prev_tokens) + len(tgt[i]) - ] = tgt[i] - start_idxs[i] = len(self.prev_tokens) - lengths[i] += len(self.prev_tokens) - self.prev_tokens = new_toks[i][new_toks[i] != pad][-self.context_window :] - sample["net_input"]["src_tokens"] = torch.from_numpy(new_toks) - sample["target"] = torch.from_numpy(new_tgt) - sample["start_indices"] = start_idxs - - return sample - - def num_tokens(self, index): - return self.dataset.num_tokens(index) - - def size(self, index): - return self.dataset.size(index) - - def ordered_indices(self): - # NOTE we don't shuffle the data to retain access to the previous dataset elements - return np.arange(len(self.dataset)) - - @property - def supports_prefetch(self): - return getattr(self.dataset, "supports_prefetch", False) - - def 
prefetch(self, indices): - return self.dataset.prefetch(indices) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/lru_cache_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/lru_cache_dataset.py deleted file mode 100644 index a7854ac1701392754ce5795cafe9c634671aebdf..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/lru_cache_dataset.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from functools import lru_cache - -from . import BaseWrapperDataset - - -class LRUCacheDataset(BaseWrapperDataset): - def __init__(self, dataset, token=None): - super().__init__(dataset) - - @lru_cache(maxsize=8) - def __getitem__(self, index): - return self.dataset[index] - - @lru_cache(maxsize=8) - def collater(self, samples): - return self.dataset.collater(samples) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/mask_tokens_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/mask_tokens_dataset.py deleted file mode 100644 index 8ea86245f76c4cf233bf6f023138ef341538a267..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/mask_tokens_dataset.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from functools import lru_cache - -import numpy as np -import torch -from fairseq.data import Dictionary, data_utils - -from . import BaseWrapperDataset, LRUCacheDataset - - -class MaskTokensDataset(BaseWrapperDataset): - """ - A wrapper Dataset for masked language modeling. - - Input items are masked according to the specified masking probability. - - Args: - dataset: Dataset to wrap. - sizes: Sentence lengths - vocab: Dictionary with the vocabulary and special tokens. - pad_idx: Id of pad token in vocab - mask_idx: Id of mask token in vocab - return_masked_tokens: controls whether to return the non-masked tokens - (the default) or to return a tensor with the original masked token - IDs (and *pad_idx* elsewhere). The latter is useful as targets for - masked LM training. - seed: Seed for random number generator for reproducibility. - mask_prob: probability of replacing a token with *mask_idx*. - leave_unmasked_prob: probability that a masked token is unmasked. - random_token_prob: probability of replacing a masked token with a - random token from the vocabulary. - freq_weighted_replacement: sample random replacement words based on - word frequencies in the vocab. - mask_whole_words: only mask whole words. This should be a byte mask - over vocab indices, indicating whether it is the beginning of a - word. We will extend any mask to encompass the whole word. - bpe: BPE to use for whole-word masking. 
- """ - - @classmethod - def apply_mask(cls, dataset: torch.utils.data.Dataset, *args, **kwargs): - """Return the source and target datasets for masked LM training.""" - dataset = LRUCacheDataset(dataset) - return ( - LRUCacheDataset(cls(dataset, *args, **kwargs, return_masked_tokens=False)), - LRUCacheDataset(cls(dataset, *args, **kwargs, return_masked_tokens=True)), - ) - - def __init__( - self, - dataset: torch.utils.data.Dataset, - vocab: Dictionary, - pad_idx: int, - mask_idx: int, - return_masked_tokens: bool = False, - seed: int = 1, - mask_prob: float = 0.15, - leave_unmasked_prob: float = 0.1, - random_token_prob: float = 0.1, - freq_weighted_replacement: bool = False, - mask_whole_words: torch.Tensor = None, - ): - assert 0.0 < mask_prob < 1.0 - assert 0.0 <= random_token_prob <= 1.0 - assert 0.0 <= leave_unmasked_prob <= 1.0 - assert random_token_prob + leave_unmasked_prob <= 1.0 - - self.dataset = dataset - self.vocab = vocab - self.pad_idx = pad_idx - self.mask_idx = mask_idx - self.return_masked_tokens = return_masked_tokens - self.seed = seed - self.mask_prob = mask_prob - self.leave_unmasked_prob = leave_unmasked_prob - self.random_token_prob = random_token_prob - self.mask_whole_words = mask_whole_words - - if random_token_prob > 0.0: - if freq_weighted_replacement: - weights = np.array(self.vocab.count) - else: - weights = np.ones(len(self.vocab)) - weights[: self.vocab.nspecial] = 0 - self.weights = weights / weights.sum() - - self.epoch = 0 - - @property - def can_reuse_epoch_itr_across_epochs(self): - return True # only the noise changes, not item sizes - - def set_epoch(self, epoch, **unused): - super().set_epoch(epoch) - self.epoch = epoch - - @lru_cache(maxsize=8) - def __getitem__(self, index: int): - with data_utils.numpy_seed(self.seed, self.epoch, index): - item = self.dataset[index] - sz = len(item) - - assert ( - self.mask_idx not in item - ), "Dataset contains mask_idx (={}), this is not expected!".format( - self.mask_idx, - ) - - if self.mask_whole_words is not None: - word_begins_mask = self.mask_whole_words.gather(0, item) - word_begins_idx = word_begins_mask.nonzero().view(-1) - sz = len(word_begins_idx) - words = np.split(word_begins_mask, word_begins_idx)[1:] - assert len(words) == sz - word_lens = list(map(len, words)) - - # decide elements to mask - mask = np.full(sz, False) - num_mask = int( - # add a random number for probabilistic rounding - self.mask_prob * sz - + np.random.rand() - ) - mask[np.random.choice(sz, num_mask, replace=False)] = True - - if self.return_masked_tokens: - # exit early if we're just returning the masked tokens - # (i.e., the targets for masked LM training) - if self.mask_whole_words is not None: - mask = np.repeat(mask, word_lens) - new_item = np.full(len(mask), self.pad_idx) - new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8)) == 1] - return torch.from_numpy(new_item) - - # decide unmasking and random replacement - rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob - if rand_or_unmask_prob > 0.0: - rand_or_unmask = mask & (np.random.rand(sz) < rand_or_unmask_prob) - if self.random_token_prob == 0.0: - unmask = rand_or_unmask - rand_mask = None - elif self.leave_unmasked_prob == 0.0: - unmask = None - rand_mask = rand_or_unmask - else: - unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob - decision = np.random.rand(sz) < unmask_prob - unmask = rand_or_unmask & decision - rand_mask = rand_or_unmask & (~decision) - else: - unmask = rand_mask = None - - if unmask is not None: - mask = 
mask ^ unmask - - if self.mask_whole_words is not None: - mask = np.repeat(mask, word_lens) - - new_item = np.copy(item) - new_item[mask] = self.mask_idx - if rand_mask is not None: - num_rand = rand_mask.sum() - if num_rand > 0: - if self.mask_whole_words is not None: - rand_mask = np.repeat(rand_mask, word_lens) - num_rand = rand_mask.sum() - - new_item[rand_mask] = np.random.choice( - len(self.vocab), - num_rand, - p=self.weights, - ) - - return torch.from_numpy(new_item) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/monolingual_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/monolingual_dataset.py deleted file mode 100644 index ec73f1fda8e1b91a49c53f506b70870e9e76402c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/monolingual_dataset.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch - -from . import FairseqDataset, data_utils - - -def collate(samples, pad_idx, eos_idx): - if len(samples) == 0: - return {} - - def merge(key, is_list=False): - if is_list: - res = [] - for i in range(len(samples[0][key])): - res.append( - data_utils.collate_tokens( - [s[key][i] for s in samples], - pad_idx, - eos_idx, - left_pad=False, - ) - ) - return res - else: - return data_utils.collate_tokens( - [s[key] for s in samples], - pad_idx, - eos_idx, - left_pad=False, - ) - - src_tokens = merge("source") - if samples[0]["target"] is not None: - is_target_list = isinstance(samples[0]["target"], list) - target = merge("target", is_target_list) - else: - target = src_tokens - - return { - "id": torch.LongTensor([s["id"] for s in samples]), - "nsentences": len(samples), - "ntokens": sum(len(s["source"]) for s in samples), - "net_input": { - "src_tokens": src_tokens, - "src_lengths": torch.LongTensor([s["source"].numel() for s in samples]), - }, - "target": target, - } - - -class MonolingualDataset(FairseqDataset): - """ - A wrapper around torch.utils.data.Dataset for monolingual data. - - Args: - dataset (torch.utils.data.Dataset): dataset to wrap - sizes (List[int]): sentence lengths - vocab (~fairseq.data.Dictionary): vocabulary - shuffle (bool, optional): shuffle the elements before batching - (default: True). - """ - - def __init__( - self, - dataset, - sizes, - src_vocab, - tgt_vocab, - add_eos_for_other_targets, - shuffle, - targets=None, - add_bos_token=False, - ): - self.dataset = dataset - self.sizes = np.array(sizes) - self.vocab = src_vocab - self.tgt_vocab = tgt_vocab - self.add_eos_for_other_targets = add_eos_for_other_targets - self.shuffle = shuffle - self.add_bos_token = add_bos_token - - assert targets is None or all( - t in {"self", "future", "past"} for t in targets - ), "targets must be none or one of 'self', 'future', 'past'" - if targets is not None and len(targets) == 0: - targets = None - self.targets = targets - - def __getitem__(self, index): - if self.targets is not None: - # *future_target* is the original sentence - # *source* is shifted right by 1 (maybe left-padded with eos) - # *past_target* is shifted right by 2 (left-padded as needed) - # - # Left-to-right language models should condition on *source* and - # predict *future_target*. 
- # Right-to-left language models should condition on *source* and - # predict *past_target*. - source, future_target, past_target = self.dataset[index] - source, target = self._make_source_target( - source, future_target, past_target - ) - else: - source = self.dataset[index] - target = None - source, target = self._maybe_add_bos(source, target) - return {"id": index, "source": source, "target": target} - - def __len__(self): - return len(self.dataset) - - def _make_source_target(self, source, future_target, past_target): - if self.targets is not None: - target = [] - - if ( - self.add_eos_for_other_targets - and (("self" in self.targets) or ("past" in self.targets)) - and source[-1] != self.vocab.eos() - ): - # append eos at the end of source - source = torch.cat([source, source.new([self.vocab.eos()])]) - - if "future" in self.targets: - future_target = torch.cat( - [future_target, future_target.new([self.vocab.pad()])] - ) - if "past" in self.targets: - # first token is before the start of sentence which is only used in "none" break mode when - # add_eos_for_other_targets is False - past_target = torch.cat( - [ - past_target.new([self.vocab.pad()]), - past_target[1:], - source[-2, None], - ] - ) - - for t in self.targets: - if t == "self": - target.append(source) - elif t == "future": - target.append(future_target) - elif t == "past": - target.append(past_target) - else: - raise Exception("invalid target " + t) - - if len(target) == 1: - target = target[0] - else: - target = future_target - - return source, self._filter_vocab(target) - - def _maybe_add_bos(self, source, target): - if self.add_bos_token: - source = torch.cat([source.new([self.vocab.bos()]), source]) - if target is not None: - target = torch.cat([target.new([self.tgt_vocab.bos()]), target]) - return source, target - - def _filter_vocab(self, target): - if len(self.tgt_vocab) != len(self.vocab): - - def _filter(target): - mask = target.ge(len(self.tgt_vocab)) - if mask.any(): - target[mask] = self.tgt_vocab.unk() - return target - - if isinstance(target, list): - return [_filter(t) for t in target] - return _filter(target) - return target - - def collater(self, samples): - """Merge a list of samples to form a mini-batch. - - Args: - samples (List[dict]): samples to collate - - Returns: - dict: a mini-batch with the following keys: - - - `id` (LongTensor): example IDs in the original input order - - `ntokens` (int): total number of tokens in the batch - - `net_input` (dict): the input to the Model, containing keys: - - - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in - the source sentence of shape `(bsz, src_len)`. Padding will - appear on the right. - - - `target` (LongTensor): a padded 2D Tensor of tokens in the - target sentence of shape `(bsz, tgt_len)`. Padding will appear - on the right. - """ - return collate(samples, self.vocab.pad(), self.vocab.eos()) - - def num_tokens(self, index): - """Return the number of tokens in a sample. This value is used to - enforce ``--max-tokens`` during batching.""" - return self.sizes[index] - - def size(self, index): - """Return an example's size as a float or tuple. This value is used when - filtering a dataset with ``--max-positions``.""" - return self.sizes[index] - - def ordered_indices(self): - """Return an ordered list of indices. 
Batches will be constructed based - on this order.""" - if self.shuffle: - order = [np.random.permutation(len(self))] - else: - order = [np.arange(len(self))] - order.append(self.sizes) - return np.lexsort(order) - - @property - def supports_prefetch(self): - return getattr(self.dataset, "supports_prefetch", False) - - def prefetch(self, indices): - self.dataset.prefetch(indices) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multi_corpus_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multi_corpus_dataset.py deleted file mode 100644 index d2457666d688f773a98af8eb610a4f5756b02dc0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multi_corpus_dataset.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -from collections import OrderedDict -from typing import Dict, List - -import numpy as np -from fairseq.data import data_utils - -from . import FairseqDataset - - -logger = logging.getLogger(__name__) - - -class MultiCorpusDataset(FairseqDataset): - """ - Stores multiple instances of FairseqDataset together. Requires each instance - to be the same dataset, as the collate method needs to work on batches with - samples from each dataset. - - Allows specifying a distribution over the datasets to use. Note that unlike - MultiCorpusSampledDataset, this distribution allows sampling for each item, - rather than on a batch level. - - Each time ordered_indices() is called, a new sample is generated with - the specified distribution. - - Args: - datasets: a OrderedDict of FairseqDataset instances. 
- distribution: a List containing the probability of getting an utterance from - corresponding dataset - seed: random seed for sampling the datsets - sort_indices: if true, will sort the ordered indices by size - """ - - def __init__( - self, - datasets: Dict[str, FairseqDataset], - distribution: List[float], - seed: int, - sort_indices: bool = False, - ): - super().__init__() - assert isinstance(datasets, OrderedDict) - assert len(datasets) == len(distribution) - self.datasets = datasets - self.distribution = distribution - self.seed = seed - self.sort_indices = sort_indices - - # Avoid repeated conversions to list later - self.dataset_list = list(datasets.values()) - self.total_num_instances = 0 - - first_dataset = list(self.datasets.values())[0] - - self.dataset_offsets = [] - for dataset in datasets.values(): - assert isinstance(dataset, FairseqDataset) - assert type(dataset) is type(first_dataset) - self.dataset_offsets.append(self.total_num_instances) - self.total_num_instances += len(dataset) - - def ordered_indices(self): - with data_utils.numpy_seed(self.seed, self.epoch): - # Used to store the order of indices of each dataset to use - indices = [ - np.random.permutation(len(dataset)) - for dataset in self.datasets.values() - ] - # Keep track of which samples we've used for each dataset - counters = [0 for _ in self.datasets] - - sampled_indices = [ - self._sample(indices, counters) for _ in range(self.total_num_instances) - ] - if self.sort_indices: - sampled_indices.sort(key=lambda i: self.num_tokens(i)) - return np.array(sampled_indices, dtype=np.int64) - - def _sample(self, indices, counters): - # First pick dataset - dataset_idx = np.random.choice(len(self.distribution), p=self.distribution) - - # Then get dataset internal index - idx = indices[dataset_idx][counters[dataset_idx]] - - # Convert to multi-datasets index - idx += self.dataset_offsets[dataset_idx] - - counters[dataset_idx] += 1 - - # Reset if we reach end - if counters[dataset_idx] == len(self.dataset_list[dataset_idx]): - counters[dataset_idx] = 0 - indices[dataset_idx] = np.random.permutation( - len(self.dataset_list[dataset_idx]) - ) - - return idx - - def _map_index(self, index: int): - """ - If dataset A has length N and dataset B has length M - then index 1 maps to index 1 of dataset A, and index N + 1 - maps to index 1 of B. - """ - counter = 0 - for key, dataset in self.datasets.items(): - if index < counter + len(dataset): - return index - counter, key - counter += len(dataset) - raise ValueError( - "Invalid index: {}, max: {}".format(index, self.total_num_instances) - ) - - def __len__(self): - """ - Length of this dataset is the sum of individual datasets - """ - return self.total_num_instances - - def __getitem__(self, index): - index, key = self._map_index(index) - return self.datasets[key][index] - - def collater(self, samples): - """ - Since we enforce all datsets to be the same, collating is just - picking the first one and doing collate. 
- """ - if len(samples) == 0: - return None - - return list(self.datasets.values())[0].collater(samples) - - def num_tokens(self, index: int): - index, key = self._map_index(index) - return self.datasets[key].num_tokens(index) - - def size(self, index: int): - index, key = self._map_index(index) - return self.datasets[key].size(index) - - @property - def can_reuse_epoch_itr_across_epochs(self): - return False - - def set_epoch(self, epoch, **unused): - super().set_epoch(epoch) - self.epoch = epoch - - @property - def supports_prefetch(self): - return False diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multi_corpus_sampled_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multi_corpus_sampled_dataset.py deleted file mode 100644 index ad8e951cc905a73fea28b4fac449e307cadfa52f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multi_corpus_sampled_dataset.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from collections import OrderedDict -from typing import Callable, Dict, List - -import numpy as np - -from . import FairseqDataset - - -def uniform_sampler(x): - # Sample from uniform distribution - return np.random.choice(x, 1).item() - - -class MultiCorpusSampledDataset(FairseqDataset): - """ - Stores multiple instances of FairseqDataset together and in every iteration - creates a batch by first sampling a dataset according to a specified - probability distribution and then getting instances from that dataset. - - Args: - datasets: an OrderedDict of FairseqDataset instances. - sampling_func: A function for sampling over list of dataset keys. - The default strategy is to sample uniformly. - """ - - def __init__( - self, - datasets: Dict[str, FairseqDataset], - sampling_func: Callable[[List], int] = None, - ): - super().__init__() - assert isinstance(datasets, OrderedDict) - self.datasets = datasets - if sampling_func is None: - sampling_func = uniform_sampler - self.sampling_func = sampling_func - - self.total_num_instances = 0 - for _, dataset in datasets.items(): - assert isinstance(dataset, FairseqDataset) - self.total_num_instances += len(dataset) - - self._ordered_indices = None - - def __len__(self): - """ - Length of this dataset is the sum of individual datasets - """ - return self.total_num_instances - - def ordered_indices(self): - """ - Ordered indices for batching. Here we call the underlying - dataset's ordered_indices() so that we get the same random ordering - as we would have from using the underlying dataset directly. - """ - if self._ordered_indices is None: - self._ordered_indices = OrderedDict( - [ - (key, dataset.ordered_indices()) - for key, dataset in self.datasets.items() - ] - ) - return np.arange(len(self)) - - def _map_index_to_dataset(self, key: int, index: int): - """ - Different underlying datasets have different lengths. In order to ensure - we are not accessing an index outside the range of the current dataset - size, we wrap around. This function should be called after we have - created an ordering for this and all underlying datasets. 
- """ - assert ( - self._ordered_indices is not None - ), "Must call MultiCorpusSampledDataset.ordered_indices() first" - mapped_index = index % len(self.datasets[key]) - return self._ordered_indices[key][mapped_index] - - def __getitem__(self, index: int): - """ - Get the item associated with index from each underlying dataset. - Since index is in the range of [0, TotalNumInstances], we need to - map the index to the dataset before retrieving the item. - """ - return OrderedDict( - [ - (key, dataset[self._map_index_to_dataset(key, index)]) - for key, dataset in self.datasets.items() - ] - ) - - def collater(self, samples: List[Dict]): - """ - Generate a mini-batch for this dataset. - To convert this into a regular mini-batch we use the following - logic: - 1. Select a dataset using the specified probability distribution. - 2. Call the collater function of the selected dataset. - """ - if len(samples) == 0: - return None - - selected_key = self.sampling_func(list(self.datasets.keys())) - selected_samples = [sample[selected_key] for sample in samples] - return self.datasets[selected_key].collater(selected_samples) - - def num_tokens(self, index: int): - """ - Return an example's length (number of tokens), used for batching. Here - we return the max across all examples at index across all underlying - datasets. - """ - return max( - dataset.num_tokens(self._map_index_to_dataset(key, index)) - for key, dataset in self.datasets.items() - ) - - def size(self, index: int): - """ - Return an example's size as a float or tuple. Here we return the max - across all underlying datasets. This value is used when filtering a - dataset with max-positions. - """ - return max( - dataset.size(self._map_index_to_dataset(key, index)) - for key, dataset in self.datasets.items() - ) - - @property - def supports_prefetch(self): - return all( - getattr(dataset, "supports_prefetch", False) - for dataset in self.datasets.values() - ) - - def prefetch(self, indices): - for key, dataset in self.datasets.items(): - dataset.prefetch( - [self._map_index_to_dataset(key, index) for index in indices] - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/__init__.py deleted file mode 100644 index 6264236915a7269a4d920ee8213004374dd86a9a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/multilingual_data_manager.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/multilingual_data_manager.py deleted file mode 100644 index 8c14f4e3ad2417032bbe63dc145d270c78e967c7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/multilingual_data_manager.py +++ /dev/null @@ -1,1042 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import itertools -import json -import logging -import math -import os -from collections import OrderedDict, defaultdict - -from fairseq import utils -from fairseq.data import ( - AppendTokenDataset, - ConcatDataset, - Dictionary, - LanguagePairDataset, - PrependTokenDataset, - SampledMultiDataset, - SampledMultiEpochDataset, - StripTokenDataset, - TransformEosLangPairDataset, - TruncateDataset, - data_utils, - indexed_dataset, -) -from fairseq.data.multilingual.multilingual_utils import ( - EncoderLangtok, - LangTokSpec, - LangTokStyle, - augment_dictionary, - get_lang_tok, -) -from fairseq.data.multilingual.sampled_multi_dataset import CollateFormat -from fairseq.file_io import PathManager -from fairseq.utils import FileContentsAction, csv_str_list, eval_str_dict - - -logger = logging.getLogger(__name__) - - -def _lang_id(dic: Dictionary, lang: str): - """Return language ID index.""" - idx = dic.index(lang) - assert idx != dic.unk_index, "cannot find language ID for lang {}".format(lang) - return idx - - -def load_sampling_weights(from_file): - with open(from_file) as f: - weights = json.load(f) - return weights - - -class MultilingualDatasetManager(object): - def __init__(self, args, lang_pairs, langs, dicts, sampling_method): - super().__init__() - self.args = args - self.seed = args.seed - self.lang_pairs = lang_pairs - self.langs = langs - self.dicts = dicts - self.lang_dict = self.create_lang_dictionary(self.langs) - self.sampling_method = sampling_method - self.sampling_scheduler = None - self._has_sharded_data = False - self._num_shards_dict = {} - self._training_data_sizes = defaultdict(lambda: {}) - - @classmethod - def setup_data_manager(cls, args, lang_pairs, langs, dicts, sampling_method): - return MultilingualDatasetManager( - args, lang_pairs, langs, dicts, sampling_method - ) - - @staticmethod - def add_args(parser): - parser.add_argument( - "data", - help="colon separated path to data directories list, \ - will be iterated upon during epochs in round-robin manner", - action=FileContentsAction, - ) - parser.add_argument( - "--langs", - default=None, - type=csv_str_list, - help="a list of languages comma sperated languages which can appear in lang-pairs; " - "note that the ordering determines language token IDs", - ) - parser.add_argument( - "--lang-dict", - default=None, - type=str, - help="an external file which contains a list of " - "languages which can appear in lang-pairs; " - "note that the ordering determines language token IDs; " - "--langs and --lang-dict are two exclusive options", - ) - parser.add_argument( - "--lang-tok-style", - default=LangTokStyle.multilingual.value, - type=str, - choices=[LangTokStyle.multilingual.value, LangTokStyle.mbart.value], - help="language token styles", - ) - - parser.add_argument( - "--load-alignments", - action="store_true", - help="load the binarized alignments", - ) - parser.add_argument( - "--left-pad-source", - default="True", - type=str, - metavar="BOOL", - help="pad the source on the left", - ) - parser.add_argument( - "--left-pad-target", - default="False", - type=str, - metavar="BOOL", - help="pad the target on the left", - ) - parser.add_argument( - "--max-source-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the source sequence", - ) - parser.add_argument( - "--max-target-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the target sequence", - ) - parser.add_argument( - "--upsample-primary", - default=1, - type=int, - help="amount to 
upsample primary dataset", - ) - parser.add_argument( - "--truncate-source", - action="store_true", - default=False, - help="truncate source to max-source-positions", - ) - parser.add_argument( - "--encoder-langtok", - default=None, - type=str, - choices=[EncoderLangtok.src.value, EncoderLangtok.tgt.value], - metavar="SRCTGT", - help="prepend to the beginning of source sentence the source or target " - "language token. (src/tgt)", - ) - parser.add_argument( - "--decoder-langtok", - action="store_true", - help="prepend to the beginning of target sentence the target language token", - ) - parser.add_argument( - "--lang-tok-replacing-bos-eos", action="store_true", default=False - ) - parser.add_argument( - "--enable-lang-ids", - default=False, - action="store_true", - help="whether to include language IDs in samples", - ) - parser.add_argument( - "--enable-reservsed-directions-shared-datasets", - default=False, - action="store_true", - help="whether to allow datasets be used in reversed directions", - ) - - parser.add_argument( - "--extra-data", - help='a dictionary of data name to this path, \ - e.g. {"mined", path_to_mined_data, "denoised": path_to_denoised_data}', - type=lambda uf: eval_str_dict(uf, type=str), - default=None, - ) - parser.add_argument( - "--extra-lang-pairs", - help='a dictionary of data name to the language pairs they serve, \ - e.g. {"mined": comma-separated-lang-pairs, "denoised": comma-separated-lang-pairs}', - type=lambda uf: eval_str_dict(uf, type=str), - default=None, - ) - parser.add_argument( - "--fixed-dictionary", - help="Fixed dictionary to use with model path", - default=None, - type=str, - ) - parser.add_argument( - "--langtoks-specs", - help='a list of comma separated data types that a set of language tokens to be specialized for, \ - e.g. "main,dae,mined". There will be a set of language tokens added to the vocab to \ - distinguish languages in different training data types. If not specified, default language \ - tokens per languages will be added', - default=LangTokSpec.main.value, - type=csv_str_list, - ) - parser.add_argument( - "--langtoks", - help='a dictionary of how to add language tokens, \ - e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), "main": \ - ("src", "tgt")}, or {"mined": ("src.mined", "tgt")}', - default=None, - type=lambda uf: eval_str_dict(uf, type=str), - ) - parser.add_argument( - "--sampling-weights-from-file", - help='a file contain a python dictionary of how to sample data sets, \ - e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \ - "mono_dae:es_XX-es_XX: 0.3, "main:en_xx-fr_XX": 0.8 }', - default=None, - type=str, - ) - parser.add_argument( - "--sampling-weights", - help='a dictionary of how to sample data sets, \ - e.g. 
{ "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, \ - "mono_dae:es_XX-es_XX: 0.3, "main:en_xx-fr_XX": 0.8 }', - default=None, - type=lambda uf: eval_str_dict(uf, type=str), - ) - parser.add_argument( - "--virtual-epoch-size", - default=1000000, - type=int, - help="virtual epoch size to speed up data loading", - ) - parser.add_argument( - "--virtual-data-size", - default=None, - type=int, - help="virtual data size of the whole joint dataset to speed" - "up data loading and have specific dynamic sampling strategy interval", - ) - - @classmethod - def load_langs(cls, args, **kwargs): - if args.lang_dict and args.langs: - raise ValueError("--langs and --lang-dict can not both be specified") - if args.lang_dict is None and args.langs is None: - logger.warning( - "External language dictionary is not provided; " - "use lang-pairs to infer the set of supported languages. " - "The language ordering is not stable which might cause " - "misalignment in pretraining and finetuning." - ) - # infer from lang_pairs as it is - langs = list( - {x for lang_pair in args.lang_pairs for x in lang_pair.split("-")} - ) - langs = sorted(langs) - logger.info(f"inferred language list: {langs}") - elif args.lang_dict: - with open( - PathManager.get_local_path(args.lang_dict), "r", encoding="utf-8" - ) as f: - langs = [lang.strip() for lang in f.readlines() if lang.strip()] - logger.info( - f"loaded language list from {args.lang_dict} as they are ordered in file" - ) - elif args.langs: - langs = args.langs - logger.info( - f"parsed the language list as they are ordered in the option: {langs}" - ) - return langs - - def has_sharded_data(self, split): - return self._has_sharded_data and split == getattr( - self.args, "train_subset", None - ) - - def _shared_collater(self): - return not (self.args.extra_data and "mono_dae" in self.args.extra_data) and ( - not self.args.lang_tok_replacing_bos_eos - ) - - def estimate_global_pass_epoch(self, epoch): - if self.args.virtual_epoch_size is None or self.args.virtual_data_size is None: - return None - # one epoch more for remaining data in each shard - virtual_epochs_per_shard = math.ceil( - self.args.virtual_data_size / self.args.virtual_epoch_size - ) - # note that fairseq epoch / shard_epoch starts from 1 - shard_epoch = (epoch - 1) // virtual_epochs_per_shard + 1 - return shard_epoch - - @classmethod - def prepare(cls, load_dictionary, args, **kargs): - args.left_pad_source = utils.eval_bool(args.left_pad_source) - args.left_pad_target = utils.eval_bool(args.left_pad_target) - - if not hasattr(args, "shuffle_instance"): - args.shuffle_instance = False - if args.langtoks is None: - args.langtoks = {} - if "main" not in args.langtoks: - src_langtok_spec = args.encoder_langtok if args.encoder_langtok else None - tgt_langtok_spec = "tgt" if args.decoder_langtok else None - args.langtoks["main"] = (src_langtok_spec, tgt_langtok_spec) - - def check_langs(langs, pairs): - messages = [] - for src, tgt in pairs: - if src not in langs or tgt not in langs: - messages.append( - f"language pair {src}-{tgt} contains languages " - "that are not in the language dictionary" - ) - if len(messages) > 0: - raise ValueError(" ".join(messages) + f"; langs: {langs}") - - if args.lang_pairs is None: - raise ValueError( - "--lang-pairs is required. List all the language pairs in the training objective." 
- ) - if isinstance(args.lang_pairs, str): - args.lang_pairs = args.lang_pairs.split(",") - if args.source_lang is not None or args.target_lang is not None: - training = False - else: - training = True - language_list = cls.load_langs(args, **kargs) - check_langs( - language_list, - ( - [p.split("-") for p in args.lang_pairs] - if training - else [(args.source_lang, args.target_lang)] - ), - ) - - # load dictionaries - if training: - extra_lang_pairs = ( - list( - {p for _, v in args.extra_lang_pairs.items() for p in v.split(",")} - ) - if args.extra_lang_pairs - else [] - ) - langs_to_load_dicts = sorted( - {x for p in args.lang_pairs + extra_lang_pairs for x in p.split("-")} - ) - else: - langs_to_load_dicts = sorted([args.source_lang, args.target_lang]) - - dicts = OrderedDict() - paths = utils.split_paths(args.data) - assert len(paths) > 0 - for lang in langs_to_load_dicts: - if args.fixed_dictionary is not None: - dicts[lang] = load_dictionary(args.fixed_dictionary) - else: - dicts[lang] = load_dictionary( - os.path.join(paths[0], "dict.{}.txt".format(lang)) - ) - augment_dictionary( - dictionary=dicts[lang], - language_list=language_list, - lang_tok_style=args.lang_tok_style, - langtoks_specs=args.langtoks_specs, - extra_data=args.extra_data, - ) - if len(dicts) > 0: - assert dicts[lang].pad() == dicts[langs_to_load_dicts[0]].pad() - assert dicts[lang].eos() == dicts[langs_to_load_dicts[0]].eos() - assert dicts[lang].unk() == dicts[langs_to_load_dicts[0]].unk() - logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang]))) - return language_list, dicts, training - - @classmethod - def create_lang_dictionary(cls, langs): - unk = "<unk>" - # hack to remove symbols other than unk as they are not needed by lang dict - lang_dict = Dictionary(pad=unk, eos=unk, unk=unk, bos=unk) - for lang in langs: - lang_dict.add_symbol(lang) - return lang_dict - - @classmethod - def get_langtok_index(cls, lang_tok, dic): - idx = dic.index(lang_tok) - assert ( - idx != dic.unk_index - ), "cannot find language token {} in the dictionary".format(lang_tok) - return idx - - def get_encoder_langtok(self, src_lang, tgt_lang, spec=None): - if spec is None: - return None - if spec and spec.startswith("src"): - if src_lang is None: - return None - langtok = get_lang_tok( - lang=src_lang, lang_tok_style=self.args.lang_tok_style, spec=spec - ) - else: - if tgt_lang is None: - return None - langtok = get_lang_tok( - lang=tgt_lang, lang_tok_style=self.args.lang_tok_style, spec=spec - ) - return self.get_langtok_index( - langtok, self.dicts[src_lang if src_lang else tgt_lang] - ) - - def get_decoder_langtok(self, tgt_lang, spec=None): - if spec is None: - return None - langtok = get_lang_tok( - lang=tgt_lang, lang_tok_style=self.args.lang_tok_style, spec=spec - ) - return self.get_langtok_index(langtok, self.dicts[tgt_lang]) - - @classmethod - def load_data(cls, path, vdict, impl): - dataset = data_utils.load_indexed_dataset(path, vdict, impl) - return dataset - - @classmethod - def split_exists(cls, split, src, tgt, lang, data_path, dataset_impl): - filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) - return indexed_dataset.dataset_exists(filename, impl=dataset_impl) - - def load_lang_dataset( - self, - data_path, - split, - src, - src_dict, - tgt, - tgt_dict, - combine, - dataset_impl, - upsample_primary, - max_source_positions, - prepend_bos=False, - load_alignments=False, - truncate_source=False, - ): - - src_datasets = [] - tgt_datasets = [] - - for k in itertools.count(): -
split_k = split + (str(k) if k > 0 else "") - - # infer langcode - if self.split_exists(split_k, src, tgt, src, data_path, dataset_impl): - prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) - elif self.split_exists(split_k, tgt, src, src, data_path, dataset_impl): - prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) - else: - if k > 0: - break - else: - logger.error( - f"Dataset not found: {data_path}, {split_k}, {src}, {tgt}" - ) - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, data_path) - ) - - src_dataset = self.load_data(prefix + src, src_dict, dataset_impl) - if truncate_source: - src_dataset = AppendTokenDataset( - TruncateDataset( - StripTokenDataset(src_dataset, src_dict.eos()), - max_source_positions - 1, - ), - src_dict.eos(), - ) - src_datasets.append(src_dataset) - tgt_datasets.append(self.load_data(prefix + tgt, tgt_dict, dataset_impl)) - - logger.info( - "{} {} {}-{} {} examples".format( - data_path, split_k, src, tgt, len(src_datasets[-1]) - ) - ) - - if not combine: - break - - assert len(src_datasets) == len(tgt_datasets) - - if len(src_datasets) == 1: - src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0] - else: - sample_ratios = [1] * len(src_datasets) - sample_ratios[0] = upsample_primary - src_dataset = ConcatDataset(src_datasets, sample_ratios) - tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) - - if prepend_bos: - assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") - src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) - tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) - - align_dataset = None - if load_alignments: - align_path = os.path.join( - data_path, "{}.align.{}-{}".format(split, src, tgt) - ) - if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): - align_dataset = data_utils.load_indexed_dataset( - align_path, None, dataset_impl - ) - - return src_dataset, tgt_dataset, align_dataset - - def load_langpair_dataset( - self, - data_path, - split, - src, - src_dict, - tgt, - tgt_dict, - combine, - dataset_impl, - upsample_primary, - left_pad_source, - left_pad_target, - max_source_positions, - max_target_positions, - prepend_bos=False, - load_alignments=False, - truncate_source=False, - src_dataset_transform_func=lambda dataset: dataset, - tgt_dataset_transform_func=lambda dataset: dataset, - src_lang_id=None, - tgt_lang_id=None, - langpairs_sharing_datasets=None, - ): - norm_direction = "-".join(sorted([src, tgt])) - if langpairs_sharing_datasets is not None: - src_dataset = langpairs_sharing_datasets.get( - (data_path, split, norm_direction, src), "NotInCache" - ) - tgt_dataset = langpairs_sharing_datasets.get( - (data_path, split, norm_direction, tgt), "NotInCache" - ) - align_dataset = langpairs_sharing_datasets.get( - (data_path, split, norm_direction, src, tgt), "NotInCache" - ) - - # a hack: any one is not in cache, we need to reload them - if ( - langpairs_sharing_datasets is None - or src_dataset == "NotInCache" - or tgt_dataset == "NotInCache" - or align_dataset == "NotInCache" - or split != getattr(self.args, "train_subset", None) - ): - # source and target datasets can be reused in reversed directions to save memory - # reversed directions of valid and test data will not share source and target datasets - src_dataset, tgt_dataset, align_dataset = self.load_lang_dataset( - data_path, - split, - src, - src_dict, - tgt, - tgt_dict, - combine, - dataset_impl, - upsample_primary, - 
max_source_positions=max_source_positions, - prepend_bos=prepend_bos, - load_alignments=load_alignments, - truncate_source=truncate_source, - ) - src_dataset = src_dataset_transform_func(src_dataset) - tgt_dataset = tgt_dataset_transform_func(tgt_dataset) - if langpairs_sharing_datasets is not None: - langpairs_sharing_datasets[ - (data_path, split, norm_direction, src) - ] = src_dataset - langpairs_sharing_datasets[ - (data_path, split, norm_direction, tgt) - ] = tgt_dataset - langpairs_sharing_datasets[ - (data_path, split, norm_direction, src, tgt) - ] = align_dataset - if align_dataset is None: - # no align data so flag the reverse direction as well in sharing - langpairs_sharing_datasets[ - (data_path, split, norm_direction, tgt, src) - ] = align_dataset - else: - logger.info( - f"Reusing source and target datasets of [{split}] {tgt}-{src} for reversed direction: " - f"[{split}] {src}-{tgt}: src length={len(src_dataset)}; tgt length={len(tgt_dataset)}" - ) - - return LanguagePairDataset( - src_dataset, - src_dataset.sizes, - src_dict, - tgt_dataset, - tgt_dataset.sizes if tgt_dataset is not None else None, - tgt_dict, - left_pad_source=left_pad_source, - left_pad_target=left_pad_target, - align_dataset=align_dataset, - src_lang_id=src_lang_id, - tgt_lang_id=tgt_lang_id, - ) - - def src_dataset_tranform_func(self, src_lang, tgt_lang, dataset, spec=None): - if self.args.lang_tok_replacing_bos_eos: - # it is handled by self.alter_dataset_langtok - # TODO: Unifiy with alter_dataset_langtok - return dataset - if spec is None: - return dataset - tok = self.get_encoder_langtok(src_lang, tgt_lang, spec) - if tok: - return PrependTokenDataset(dataset, tok) - return dataset - - def tgt_dataset_tranform_func(self, source_lang, target_lang, dataset, spec=None): - if dataset is None: - # note that target dataset can be None during inference time - return None - if self.args.lang_tok_replacing_bos_eos: - # TODO: Unifiy with alter_dataset_langtok - # It is handled by self.alter_dataset_langtok. - # The complication in self.alter_dataset_langtok - # makes a unified framework difficult. 
- return dataset - # if not self.args.decoder_langtok: - if not spec: - return dataset - tok = self.get_decoder_langtok(target_lang, spec) - if tok: - return PrependTokenDataset(dataset, tok) - return dataset - - def alter_dataset_langtok( - self, - lang_pair_dataset, - src_eos=None, - src_lang=None, - tgt_eos=None, - tgt_lang=None, - src_langtok_spec=None, - tgt_langtok_spec=None, - ): - if src_langtok_spec is None and tgt_langtok_spec is None: - return lang_pair_dataset - - new_src_eos = None - if ( - src_langtok_spec is not None - and src_eos is not None - and (src_lang is not None or tgt_lang is not None) - ): - new_src_eos = self.get_encoder_langtok(src_lang, tgt_lang, src_langtok_spec) - else: - src_eos = None - - new_tgt_bos = None - if tgt_langtok_spec and tgt_eos is not None and tgt_lang is not None: - new_tgt_bos = self.get_decoder_langtok(tgt_lang, tgt_langtok_spec) - else: - tgt_eos = None - - return TransformEosLangPairDataset( - lang_pair_dataset, - src_eos=src_eos, - new_src_eos=new_src_eos, - tgt_bos=tgt_eos, - new_tgt_bos=new_tgt_bos, - ) - - def load_a_dataset( - self, - split, - data_path, - src, - src_dict, - tgt, - tgt_dict, - combine, - prepend_bos=False, - langpairs_sharing_datasets=None, - data_category=None, - **extra_kwargs, - ): - dataset_impl = self.args.dataset_impl - upsample_primary = self.args.upsample_primary - left_pad_source = self.args.left_pad_source - left_pad_target = self.args.left_pad_target - max_source_positions = self.args.max_source_positions - max_target_positions = self.args.max_target_positions - load_alignments = self.args.load_alignments - truncate_source = self.args.truncate_source - src_dataset_transform_func = self.src_dataset_tranform_func - tgt_dataset_transform_func = self.tgt_dataset_tranform_func - enable_lang_ids = self.args.enable_lang_ids - lang_dictionary = self.lang_dict - src_langtok_spec, tgt_langtok_spec = extra_kwargs["langtok_spec"] - - src_langtok = self.get_encoder_langtok(src, tgt, src_langtok_spec) - tgt_langtok = self.get_decoder_langtok(tgt, tgt_langtok_spec) - logger.info( - f"{data_category}:{src}-{tgt} src_langtok: {src_langtok}; tgt_langtok: {tgt_langtok}" - ) - - langpair_ds = self.load_langpair_dataset( - data_path, - split, - src, - src_dict, - tgt, - tgt_dict, - combine, - dataset_impl, - upsample_primary, - left_pad_source, - left_pad_target, - max_source_positions, - max_target_positions, - prepend_bos, - load_alignments, - truncate_source, - src_dataset_transform_func=lambda dataset: src_dataset_transform_func( - src, tgt, dataset, src_langtok_spec - ), - tgt_dataset_transform_func=lambda dataset: tgt_dataset_transform_func( - src, tgt, dataset, tgt_langtok_spec - ), - src_lang_id=_lang_id(lang_dictionary, src) - if enable_lang_ids and lang_dictionary is not None - else None, - tgt_lang_id=_lang_id(lang_dictionary, tgt) - if enable_lang_ids and lang_dictionary is not None - else None, - langpairs_sharing_datasets=langpairs_sharing_datasets, - ) - # TODO: handle modified lang toks for mined data and dae data - if self.args.lang_tok_replacing_bos_eos: - ds = self.alter_dataset_langtok( - langpair_ds, - src_eos=self.dicts[src if src else tgt].eos(), - src_lang=src, - tgt_eos=self.dicts[tgt].eos(), - tgt_lang=tgt, - src_langtok_spec=src_langtok_spec, - tgt_langtok_spec=tgt_langtok_spec, - ) - else: - ds = langpair_ds - return ds - - def load_split_langpair_datasets(self, split, data_param_list): - datasets = [] - langpairs_sharing_datasets = ( - {} if self.args.enable_reservsed_directions_shared_datasets else 
None - ) - for param in data_param_list: - ds = self.load_a_dataset( - split=split, - langpairs_sharing_datasets=langpairs_sharing_datasets, - **param, - ) - datasets.append(ds) - return datasets - - def get_data_paths_and_lang_pairs(self, split): - datapaths = {"main": self.args.data} - lang_pairs = {"main": self.lang_pairs} - if split == getattr(self.args, "train_subset", None): - # only training data can have extra data and extra language pairs - if self.args.extra_data: - extra_datapaths = self.args.extra_data - datapaths.update(extra_datapaths) - if self.args.extra_lang_pairs: - extra_lang_pairs = { - k: v.split(",") for k, v in self.args.extra_lang_pairs.items() - } - lang_pairs.update(extra_lang_pairs) - return datapaths, lang_pairs - - @classmethod - def get_dataset_key(cls, data_category, src, tgt): - return f"{data_category}:{src}-{tgt}" - - @classmethod - def _get_shard_num_dict(cls, split, paths): - shards = defaultdict(int) - for path in paths: - files = PathManager.ls(path) - directions = set() - for f in files: - if f.startswith(split) and f.endswith(".idx"): - # idx files of the form "{split}.{src}-{tgt}.{lang}.idx" - direction = f.split(".")[-3] - directions.add(direction) - for direction in directions: - shards[direction] += 1 - return shards - - def get_split_num_data_shards(self, split): - if split in self._num_shards_dict: - return self._num_shards_dict[split] - num_shards_dict = {} - data_paths, lang_pairs = self.get_data_paths_and_lang_pairs(split) - - for data_category, paths in data_paths.items(): - if data_category not in lang_pairs: - continue - paths = utils.split_paths(paths) - shards_dict = self._get_shard_num_dict(split, paths) - lang_dirs = [ - lang_pair.split("-") for lang_pair in lang_pairs[data_category] - ] - lang_dirs = [x if len(x) > 1 else (x[0], x[0]) for x in lang_dirs] - for src, tgt in lang_dirs: - key = self.get_dataset_key(data_category, src, tgt) - if "mono_" in data_category: - # monolingual data requires tgt only - assert src is None or src == tgt, ( - f"error: src={src}, " - "tgt={tgt} for data_category={data_category}" - ) - num_shards_dict[key] = shards_dict[tgt] - else: - if f"{src}-{tgt}" in shards_dict: - num_shards_dict[key] = shards_dict[f"{src}-{tgt}"] - elif f"{tgt}-{src}" in shards_dict: - # follow the fairseq tradition to use reversed direction data if it is not available - num_shards_dict[key] = shards_dict[f"{tgt}-{src}"] - self._num_shards_dict[split] = num_shards_dict - logger.info(f"[{split}] num of shards: {num_shards_dict}") - return num_shards_dict - - @classmethod - def get_shard_id(cls, num_shards, epoch, shard_epoch=None): - shard = epoch if shard_epoch is None else shard_epoch - shard = (shard - 1) % num_shards - return shard - - def get_split_data_path(self, paths, epoch, shard_epoch, num_shards): - path = paths[self.get_shard_id(num_shards, epoch, shard_epoch)] - return path - - def get_split_data_param_list(self, split, epoch, shard_epoch=None): - # TODO: to extend with extra datasets and keys and loop over different shard data paths - param_list = [] - data_paths, lang_pairs = self.get_data_paths_and_lang_pairs(split) - logger.info(f"langtoks settings: {self.args.langtoks}") - split_num_shards_dict = self.get_split_num_data_shards(split) - for data_category, paths in data_paths.items(): - if data_category not in lang_pairs: - continue - paths = utils.split_paths(paths) - assert len(paths) > 0 - if len(paths) > 1: - self._has_sharded_data = True - if split != getattr(self.args, "train_subset", None): - # if not 
training data set, use the first shard for valid and test - paths = paths[:1] - - if data_category in self.args.langtoks: - lang_tok_spec = self.args.langtoks[data_category] - else: - # default to None - lang_tok_spec = (None, None) - - # infer langcode - lang_dirs = [ - lang_pair.split("-") for lang_pair in lang_pairs[data_category] - ] - lang_dirs = [x if len(x) > 1 else (x[0], x[0]) for x in lang_dirs] - for src, tgt in lang_dirs: - assert src is not None or data_category == "mono_dae", ( - f"error: src={src}, " "tgt={tgt} for data_category={data_category}" - ) - # logger.info(f"preparing param for {data_category}: {src} - {tgt}") - key = self.get_dataset_key(data_category, src, tgt) - data_path = self.get_split_data_path( - paths, epoch, shard_epoch, split_num_shards_dict[key] - ) - param_list.append( - { - "key": key, - "data_path": data_path, - "split": split, - "src": src, - "src_dict": self.dicts[src] - if src and data_category != "mono_dae" - else None, - "tgt": tgt, - "tgt_dict": self.dicts[tgt], - "data_category": data_category, - "langtok_spec": lang_tok_spec, - } - ) - return param_list - - def get_train_dataset_sizes( - self, data_param_list, datasets, epoch, shard_epoch=None - ): - num_shards = [ - self.get_split_num_data_shards(param["split"])[param["key"]] - for param in data_param_list - ] - data_sizes = [] - for (key, d), num_shard in zip(datasets, num_shards): - my_data_sizes = self._training_data_sizes[key] - shard_ind = self.get_shard_id(num_shard, epoch, shard_epoch) - if shard_ind not in my_data_sizes: - my_data_sizes[shard_ind] = len(d) - known_size = max(my_data_sizes.values()) - data_sizes.append( - # If we don't know the data size of the shard yet, - # use the the max known data size to approximate. - # Note that we preprocess shards by a designated shard size - # and put any remaining data at the end into the last shard so - # the max shard size approximation is almost correct before loading - # the last shard; after loading the last shard, it will have the - # exact data sizes of the whole data size. - (key, sum(my_data_sizes.get(i, known_size) for i in range(num_shard))) - ) - logger.info( - f"estimated total data sizes of all shards used in sampling ratios: {data_sizes}. 
" - "Note that if the data a shard has not been loaded yet, use the max known data size to approximate" - ) - return [s for _, s in data_sizes] - - def get_train_sampling_ratios( - self, data_param_list, datasets, epoch=1, shard_epoch=None - ): - data_sizes = self.get_train_dataset_sizes( - data_param_list, datasets, epoch, shard_epoch - ) - sampling_func = self.sampling_method.sampling_method_selector() - sample_ratios = sampling_func(data_sizes) if sampling_func is not None else None - return sample_ratios - - def get_sampling_ratios(self, data_param_list, datasets, epoch, shard_epoch=None): - if self.args.sampling_weights_from_file: - weights = load_sampling_weights(self.args.sampling_weights_from_file) - sample_ratios = [weights[k] for k, _ in datasets] - logger.info( - "| ignoring --sampling-weights when loadding sampling weights " - f"from file {self.args.sampling_weights_from_file}" - ) - elif self.args.sampling_weights: - sample_ratios = [self.args.sampling_weights[k] for k, _ in datasets] - else: - sample_ratios = self.get_train_sampling_ratios( - data_param_list, datasets, epoch, shard_epoch - ) - - if sample_ratios is not None: - logger.info( - "| Upsample ratios: {}".format( - list(zip(map(lambda x: x["key"], data_param_list), sample_ratios)) - ) - ) - assert len(sample_ratios) == len(datasets) - return sample_ratios - - def load_split_datasets( - self, split, training, epoch=1, combine=False, shard_epoch=None, **kwargs - ): - data_param_list = self.get_split_data_param_list( - split, epoch, shard_epoch=shard_epoch - ) - langpairs_sharing_datasets = ( - {} if self.args.enable_reservsed_directions_shared_datasets else None - ) - datasets = [ - ( - param["key"], - self.load_a_dataset( - combine=combine, - langpairs_sharing_datasets=langpairs_sharing_datasets, - **param, - ), - ) - for param in data_param_list - ] - return datasets, data_param_list - - def load_into_concat_dataset(self, split, datasets, data_param_list): - if self.args.lang_tok_replacing_bos_eos: - # TODO: to investigate why TransformEosLangPairDataset doesn't work with ConcatDataset - return SampledMultiDataset( - OrderedDict(datasets), - sampling_ratios=None, - eval_key=None, - collate_format=CollateFormat.single, - virtual_size=None, - split=split, - ) - return ConcatDataset([d for _, d in datasets]) - - def load_sampled_multi_epoch_dataset( - self, split, training, epoch=0, combine=False, shard_epoch=None, **kwargs - ): - datasets, data_param_list = self.load_split_datasets( - split, training, epoch, combine, shard_epoch=shard_epoch, **kwargs - ) - if training and split == getattr(self.args, "train_subset", None): - sample_ratios = self.get_sampling_ratios(data_param_list, datasets, epoch) - return SampledMultiEpochDataset( - OrderedDict(datasets), - epoch=epoch, - shard_epoch=shard_epoch, - # valid and test datasets will be degenerate to concating datasets: - sampling_ratios=sample_ratios, - eval_key=None, - collate_format=CollateFormat.single, - virtual_size=self.args.virtual_data_size, - split=split, - virtual_epoch_size=self.args.virtual_epoch_size, - # if not using lang_tok altering, simplified to use the same collater - shared_collater=self._shared_collater(), - ) - else: - return self.load_into_concat_dataset(split, datasets, data_param_list) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/multilingual_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/multilingual_utils.py deleted file 
mode 100644 index 62d0d83486984f8dfe981f6326f9d8596655c940..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/multilingual_utils.py +++ /dev/null @@ -1,94 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ============================================================================ -from enum import Enum -from typing import Dict, List, Optional, Sequence - -import torch -from fairseq.data import Dictionary - - -class EncoderLangtok(Enum): - """ - Prepend to the beginning of source sentence either the - source or target language token. (src/tgt). - """ - - src = "src" - tgt = "tgt" - - -class LangTokSpec(Enum): - main = "main" - mono_dae = "mono_dae" - - -class LangTokStyle(Enum): - multilingual = "multilingual" - mbart = "mbart" - - -@torch.jit.export -def get_lang_tok( - lang: str, lang_tok_style: str, spec: str = LangTokSpec.main.value -) -> str: - # TOKEN_STYLES can't be defined outside this fn since it needs to be - # TorchScriptable. 
- TOKEN_STYLES: Dict[str, str] = { - LangTokStyle.mbart.value: "[{}]", - LangTokStyle.multilingual.value: "__{}__", - } - - if spec.endswith("dae"): - lang = f"{lang}_dae" - elif spec.endswith("mined"): - lang = f"{lang}_mined" - style = TOKEN_STYLES[lang_tok_style] - return style.format(lang) - - -def augment_dictionary( - dictionary: Dictionary, - language_list: List[str], - lang_tok_style: str, - langtoks_specs: Sequence[str] = (LangTokSpec.main.value,), - extra_data: Optional[Dict[str, str]] = None, -) -> None: - for spec in langtoks_specs: - for language in language_list: - dictionary.add_symbol( - get_lang_tok(lang=language, lang_tok_style=lang_tok_style, spec=spec) - ) - - if lang_tok_style == LangTokStyle.mbart.value or ( - extra_data is not None and LangTokSpec.mono_dae.value in extra_data - ): - dictionary.add_symbol("") diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampled_multi_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampled_multi_dataset.py deleted file mode 100644 index 3f544b099ffdd5187343df8bf6cc0d73b0e721d7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampled_multi_dataset.py +++ /dev/null @@ -1,458 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import datetime -import hashlib -import logging -import time -from bisect import bisect_right -from collections import OrderedDict, defaultdict -from enum import Enum -from typing import List - -import numpy as np -import torch -from fairseq import distributed_utils -from fairseq.data import FairseqDataset, data_utils - - -def get_time_gap(s, e): - return ( - datetime.datetime.fromtimestamp(e) - datetime.datetime.fromtimestamp(s) - ).__str__() - - -logger = logging.getLogger(__name__) - - -def default_virtual_size_func(datasets, ratios, max_scale_up=1.5): - sizes = [len(d) for d in datasets] - if ratios is None: - return sum(sizes) - largest_idx = np.argmax(sizes) - largest_r = ratios[largest_idx] - largest_s = sizes[largest_idx] - # set virtual sizes relative to the largest dataset - virtual_sizes = [(r / largest_r) * largest_s for r in ratios] - vsize = sum(virtual_sizes) - max_size = sum(sizes) * max_scale_up - return int(vsize if vsize < max_size else max_size) - - -class CollateFormat(Enum): - single = 1 - ordered_dict = 2 - - -class SampledMultiDataset(FairseqDataset): - """Samples from multiple sub-datasets according to given sampling ratios. - Args: - datasets ( - List[~torch.utils.data.Dataset] - or OrderedDict[str, ~torch.utils.data.Dataset] - ): datasets - sampling_ratios (List[float]): list of probability of each dataset to be sampled - (default: None, which corresponds to concatenating all dataset together). - seed (int): RNG seed to use (default: 2). - epoch (int): starting epoch number (default: 1). - eval_key (str, optional): a key used at evaluation time that causes - this instance to pass-through batches from *datasets[eval_key]*. 
- collate_format (CollateFormat): collater output format, either CollateFormat.ordered_dict or - CollateFormat.single (default: CollateFormat.single) where CollateFormat.single configures - the collater to output batches of data mixed from all sub-datasets, - and CollateFormat.ordered_dict configures the collater to output a dictionary of batches indexed by keys - of sub-datasets. - Note that not all sub-datasets will present in a single batch in both formats. - virtual_size (int, or callable): the expected virtual size of the dataset (default: default_virtual_size_func). - split (str): the split of the data, e.g. 'train', 'valid' or 'test'. - shared_collater (bool): whether or not to all sub-datasets have the same collater. - shuffle (bool): whether or not to shuffle data (default: True). - """ - - def __init__( - self, - datasets, - sampling_ratios=None, - seed=2, - epoch=1, - eval_key=None, - collate_format=CollateFormat.single, - virtual_size=default_virtual_size_func, - split="", - shared_collater=False, - shuffle=True, - ): - super().__init__() - self.shared_collater = shared_collater - self.shuffle = shuffle - - if isinstance(datasets, OrderedDict): - self.keys = list(datasets.keys()) - datasets = list(datasets.values()) - elif isinstance(datasets, List): - self.keys = list(range(len(datasets))) - else: - raise AssertionError() - self.datasets = datasets - self.split = split - - self.eval_key = eval_key - if self.eval_key is not None: - self.collate_format = CollateFormat.single - else: - self.collate_format = collate_format - - self.seed = seed - self._cur_epoch = None - - self.cumulated_sizes = None - # self.datasets[k][self._cur_indices[i]] is the data item i in this sampled dataset - # namely, data item i is sampled from the kth sub-dataset self.datasets[k] - # where self.cumulated_sizes[k-1] <= i < self.cumulated_sizes[k] - self._cur_indices = None - - self._sizes = None - self.virtual_size_per_dataset = None - # caching properties - self._reset_cached_properties() - self.setup_sampling(sampling_ratios, virtual_size) - self.set_epoch(epoch) - - def _clean_if_not_none(self, var_list): - for v in var_list: - if v is not None: - del v - - def _reset_cached_properties(self): - self._clean_if_not_none([self._sizes, self._cur_indices]) - self._sizes = None - self._cur_indices = None - - def setup_sampling(self, sample_ratios, virtual_size): - sizes = [len(d) for d in self.datasets] - if sample_ratios is None: - # default back to concating datasets - self.sample_ratios = None - self.virtual_size = sum(sizes) - else: - if not isinstance(sample_ratios, np.ndarray): - sample_ratios = np.array(sample_ratios) - self.sample_ratios = sample_ratios - virtual_size = ( - default_virtual_size_func if virtual_size is None else virtual_size - ) - self.virtual_size = ( - virtual_size(self.datasets, self.sample_ratios) - if callable(virtual_size) - else virtual_size - ) - - def adjust_sampling(self, epoch, sampling_ratios, virtual_size): - if sampling_ratios is not None: - sampling_ratios = self._sync_sample_ratios(sampling_ratios) - self.setup_sampling(sampling_ratios, virtual_size) - - def _sync_sample_ratios(self, ratios): - # in case the ratios are not precisely the same across processes - # also to ensure every procresses update the ratios in the same pace - ratios = torch.DoubleTensor(ratios) - if torch.distributed.is_initialized(): - if torch.cuda.is_available(): - distributed_utils.all_reduce(ratios.cuda()) - else: - distributed_utils.all_reduce(ratios) - ret = ratios.cpu() - ret = 
ret.numpy() - return ret - - def random_choice_in_dataset(self, rng, dataset, choice_size): - if hasattr(dataset, "random_choice_in_dataset"): - return dataset.random_choice_in_dataset(rng, choice_size) - dataset_size = len(dataset) - return rng.choice( - dataset_size, choice_size, replace=(choice_size > dataset_size) - ) - - def get_virtual_indices(self, rng, datasets, sample_ratios, virtual_size): - def get_counts(sample_ratios): - counts = np.array([virtual_size * r for r in sample_ratios], dtype=np.int64) - diff = virtual_size - counts.sum() - assert diff >= 0 - # due to round-offs, the size might not match the desired sizes - if diff > 0: - dataset_indices = rng.choice( - len(sample_ratios), size=diff, p=sample_ratios - ) - for i in dataset_indices: - counts[i] += 1 - return counts - - def get_in_dataset_indices(datasets, sizes, sample_ratios): - counts = get_counts(sample_ratios) - # uniformally sample desired counts for each dataset - # if the desired counts are large, sample with replacement: - indices = [ - self.random_choice_in_dataset(rng, d, c) - for c, d in zip(counts, datasets) - ] - return indices - - sizes = [len(d) for d in datasets] - if sample_ratios is None: - # default back to concating datasets - in_dataset_indices = [list(range(s)) for s in sizes] - virtual_sizes_per_dataset = sizes - else: - ratios = sample_ratios / sample_ratios.sum() - in_dataset_indices = get_in_dataset_indices(datasets, sizes, ratios) - virtual_sizes_per_dataset = [len(d) for d in in_dataset_indices] - virtual_sizes_per_dataset = np.array(virtual_sizes_per_dataset, np.int64) - cumulative_sizes = np.cumsum(virtual_sizes_per_dataset) - assert sum(virtual_sizes_per_dataset) == virtual_size - assert cumulative_sizes[-1] == virtual_size - if virtual_size < sum(sizes): - logger.warning( - f"virtual data size ({virtual_size}) is less than real data size ({sum(sizes)})." - " If virtual size << real data size, there could be data coverage issue." 
- ) - in_dataset_indices = np.hstack(in_dataset_indices) - return in_dataset_indices, cumulative_sizes, virtual_sizes_per_dataset - - def _get_dataset_and_index(self, index): - i = bisect_right(self.cumulated_sizes, index) - return i, self._cur_indices[index] - - def __getitem__(self, index): - # self.__getitem__(index) returns self.datasets[k][self._cur_indices[index]] - # where k satisfies self.cumulated_sizes[k - 1] <= k < self.cumulated_sizes[k] - ds_idx, ds_sample_idx = self._get_dataset_and_index(index) - ret = (ds_idx, self.datasets[ds_idx][ds_sample_idx]) - return ret - - def num_tokens(self, index): - return self.sizes[index].max() - - def size(self, index): - return self.sizes[index] - - def __len__(self): - return self.virtual_size - - def collater(self, samples, **extra_args): - """Merge a list of samples to form a mini-batch.""" - if len(samples) == 0: - return None - if self.collate_format == "ordered_dict": - collect_samples = [[] for _ in range(len(self.datasets))] - for (i, sample) in samples: - collect_samples[i].append(sample) - batch = OrderedDict( - [ - (self.keys[i], dataset.collater(collect_samples[i])) - for i, (key, dataset) in enumerate(zip(self.keys, self.datasets)) - if len(collect_samples[i]) > 0 - ] - ) - elif self.shared_collater: - batch = self.datasets[0].collater([s for _, s in samples]) - else: - samples_dict = defaultdict(list) - pad_to_length = ( - defaultdict(int) - if "pad_to_length" not in extra_args - else extra_args["pad_to_length"] - ) - for ds_idx, s in samples: - pad_to_length["source"] = max( - pad_to_length["source"], s["source"].size(0) - ) - if s["target"] is not None: - pad_to_length["target"] = max( - pad_to_length["target"], s["target"].size(0) - ) - samples_dict[ds_idx].append(s) - batches = [ - self.datasets[i].collater(samples_dict[i], pad_to_length=pad_to_length) - for i in range(len(self.datasets)) - if len(samples_dict[i]) > 0 - ] - - def straight_data(tensors): - batch = torch.cat(tensors, dim=0) - return batch - - src_lengths = straight_data( - [b["net_input"]["src_lengths"] for b in batches] - ) - src_lengths, sort_order = src_lengths.sort(descending=True) - - def straight_order(tensors): - batch = straight_data(tensors) - return batch.index_select(0, sort_order) - - batch = { - "id": straight_order([b["id"] for b in batches]), - "nsentences": sum(b["nsentences"] for b in batches), - "ntokens": sum(b["ntokens"] for b in batches), - "net_input": { - "src_tokens": straight_order( - [b["net_input"]["src_tokens"] for b in batches] - ), - "src_lengths": src_lengths, - }, - "target": straight_order([b["target"] for b in batches]) - if batches[0]["target"] is not None - else None, - } - if "prev_output_tokens" in batches[0]["net_input"]: - batch["net_input"]["prev_output_tokens"] = straight_order( - [b["net_input"]["prev_output_tokens"] for b in batches] - ) - if "src_lang_id" in batches[0]["net_input"]: - batch["net_input"]["src_lang_id"] = straight_order( - [b["net_input"]["src_lang_id"] for b in batches] - ) - if "tgt_lang_id" in batches[0]: - batch["tgt_lang_id"] = straight_order( - [b["tgt_lang_id"] for b in batches] - ) - return batch - - @property - def sizes(self): - if self._sizes is not None: - return self._sizes - start_time = time.time() - in_sub_dataset_indices = [ - self._cur_indices[ - 0 if i == 0 else self.cumulated_sizes[i - 1] : self.cumulated_sizes[i] - ] - for i in range(len(self.datasets)) - ] - sub_dataset_sizes = [ - d.sizes[indices] - for d, indices in zip(self.datasets, in_sub_dataset_indices) - ] - self._sizes 
= np.vstack(sub_dataset_sizes) - logger.info(f"sizes() calling time: {get_time_gap(start_time, time.time())}") - return self._sizes - - def ordered_indices(self): - if self.shuffle: - indices = np.random.permutation(len(self)) - else: - indices = np.arange(len(self)) - - sizes = self.sizes - tgt_sizes = sizes[:, 1] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else None - src_sizes = ( - sizes[:, 0] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else sizes - ) - - # sort by target length, then source length - if tgt_sizes is not None: - indices = indices[np.argsort(tgt_sizes[indices], kind="mergesort")] - sort_indices = indices[np.argsort(src_sizes[indices], kind="mergesort")] - return sort_indices - - def prefetch(self, indices): - prefetch_indices = [[] for _ in range(len(self.datasets))] - for i in indices: - ds_idx, ds_sample_idx = self._get_dataset_and_index(i) - prefetch_indices[ds_idx].append(ds_sample_idx) - for i in range(len(prefetch_indices)): - self.datasets[i].prefetch(prefetch_indices[i]) - - @property - def can_reuse_epoch_itr_across_epochs(self): - return False - - def set_epoch(self, epoch): - super().set_epoch(epoch) - if epoch == self._cur_epoch: - # re-enter so return - return - for d in self.datasets: - if hasattr(d, "set_epoch"): - d.set_epoch(epoch) - self._cur_epoch = epoch - self._establish_virtual_datasets() - - def _establish_virtual_datasets(self): - if self.sample_ratios is None and self._cur_indices is not None: - # not a samping dataset, no need to resample if indices are already established - return - self._reset_cached_properties() - - start_time = time.time() - # Generate a weighted sample of indices as a function of the - # random seed and the current epoch. - rng = np.random.RandomState( - [ - int( - hashlib.sha1( - str(self.__class__.__name__).encode("utf-8") - ).hexdigest(), - 16, - ) - % (2 ** 32), - self.seed % (2 ** 32), # global seed - self._cur_epoch, # epoch index, - ] - ) - self._clean_if_not_none( - [self.cumulated_sizes, self.virtual_size_per_dataset, self._sizes] - ) - self._sizes = None - - indices, cumulated_sizes, virtual_size_per_dataset = self.get_virtual_indices( - rng, self.datasets, self.sample_ratios, self.virtual_size - ) - self._cur_indices = indices - self.cumulated_sizes = cumulated_sizes - self.virtual_size_per_dataset = virtual_size_per_dataset - - raw_sizes = [len(d) for d in self.datasets] - sampled_sizes = self.virtual_size_per_dataset - logger.info( - f"[{self.split}] Raw sizes: {str(dict(zip(self.keys, raw_sizes)))}; " - f"raw total size: {sum(raw_sizes)}" - ) - logger.info( - f"[{self.split}] Resampled sizes: {str(dict(zip(self.keys, sampled_sizes)))}; " - f"resampled total size: {sum(sampled_sizes)}" - ) - if self.sample_ratios is not None: - logger.info( - f"[{self.split}] Upsampling ratios: {str(dict(zip(self.keys, self.sample_ratios)))}" - ) - else: - logger.info(f"[{self.split}] A concat dataset") - logger.info( - f"[{self.split}] virtual dataset established time: {get_time_gap(start_time, time.time())}" - ) - - def filter_indices_by_size(self, indices, max_sizes): - """Filter a list of sample indices. Remove those that are longer - than specified in max_sizes. 
- - Args: - indices (np.array): original array of sample indices - max_sizes (int or list[int] or tuple[int]): max sample size, - can be defined separately for src and tgt (then list or tuple) - - Returns: - np.array: filtered sample array - list: list of removed indices - """ - sizes = self.sizes - tgt_sizes = sizes[:, 1] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else None - src_sizes = ( - sizes[:, 0] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else sizes - ) - - return data_utils.filter_paired_dataset_indices_by_size( - src_sizes, tgt_sizes, indices, max_sizes - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py deleted file mode 100644 index 17387b2f85c0ee76db1a003091331b46de8d8def..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampled_multi_epoch_dataset.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import hashlib -import logging -import math - -import numpy as np -from fairseq.data import SampledMultiDataset - -from .sampled_multi_dataset import CollateFormat, default_virtual_size_func - - -logger = logging.getLogger(__name__) - - -class SampledMultiEpochDataset(SampledMultiDataset): - """Samples from multiple sub-datasets according to sampling ratios - using virtual epoch sizes to speed up dataloading. - Args: - datasets ( - List[~torch.utils.data.Dataset] - or OrderedDict[str, ~torch.utils.data.Dataset] - ): datasets - sampling_ratios (List[float]): list of probability of each dataset to be sampled - (default: None, which corresponds to concating all dataset together). - seed (int): RNG seed to use (default: 2). - epoch (int): starting epoch number (default: 1). - eval_key (str, optional): a key used at evaluation time that causes - this instance to pass-through batches from *datasets[eval_key]*. - collate_format (CollateFormat): collater output format, either CollateFormat.ordered_dict or - CollateFormat.single (default: CollateFormat.single) where CollateFormat.single configures - the collater to output batches of data mixed from all sub-datasets, - and CollateFormat.ordered_dict configures the collater to output a dictionary of batches indexed by keys - of sub-datasets. - Note that not all sub-datasets will present in a single batch in both formats. - virtual_size (int, or callable): the expected virtual size of the dataset (default: default_virtual_size_func). - split (str): the split of the data, e.g. 'train', 'valid' or 'test'. - virtual_epoch_size (int): virtual epoch size, the dataset will go through the data by - this virtual epoch size one by one to speed up data loading, e.g. indicing and filtering - can be performed whenever a virtual epoch is loaded without waiting for the whole dataset to be loaded. - shared_collater (bool): whether or not to all sub-datasets have the same collater. - shard_epoch (int): the real epoch number for shard selection. - shuffle (bool): whether or not to shuffle data (default: True). 
- """ - - def __init__( - self, - datasets, - sampling_ratios=None, - seed=2, - epoch=1, - eval_key=None, - collate_format=CollateFormat.single, - virtual_size=default_virtual_size_func, - split="", - virtual_epoch_size=None, - shared_collater=False, - shard_epoch=1, - shuffle=True, - ): - self.virtual_epoch_size = virtual_epoch_size - self._current_epoch_start_index = None - self._random_global_indices = None - self.shard_epoch = shard_epoch if shard_epoch is not None else 1 - self.load_next_shard = None - self._epoch_sizes = None - super().__init__( - datasets=datasets, - sampling_ratios=sampling_ratios, - seed=seed, - epoch=epoch, - eval_key=eval_key, - collate_format=collate_format, - virtual_size=virtual_size, - split=split, - shared_collater=shared_collater, - shuffle=shuffle, - ) - - def _setup(self, epoch): - self.virtual_epoch_size = ( - self.virtual_epoch_size - if self.virtual_epoch_size is not None - else self.virtual_size - ) - if self.virtual_epoch_size > self.virtual_size: - logger.warning( - f"virtual epoch size {self.virtual_epoch_size} " - f"is greater than virtual dataset size {self.virtual_size}" - ) - self.virtual_epoch_size = self.virtual_size - self.num_virtual_epochs = math.ceil(self.virtual_size / self.virtual_epoch_size) - self._current_epoch_start_index = self._get_epoch_start_index(epoch) - logger.info( - f"virtual epoch size {self.virtual_epoch_size}; virtual dataset size {self.virtual_size}" - ) - - def _map_epoch_index_to_global(self, index): - index = self._current_epoch_start_index + index - # add randomness - return self._random_global_indices[index] - - @property - def sizes(self): - if self._epoch_sizes is not None: - return self._epoch_sizes - _sizes = super().sizes - indices = self._random_global_indices[ - self._current_epoch_start_index : self._current_epoch_start_index - + len(self) - ] - self._epoch_sizes = _sizes[indices] - # del super()._sizes to save memory - del self._sizes - self._sizes = None - return self._epoch_sizes - - def _get_dataset_and_index(self, index): - i = self._map_epoch_index_to_global(index) - return super()._get_dataset_and_index(i) - - def __len__(self): - return ( - self.virtual_epoch_size - if self._current_epoch_start_index + self.virtual_epoch_size - < self.virtual_size - else self.virtual_size - self._current_epoch_start_index - ) - - def set_epoch(self, epoch): - if self._current_epoch_start_index is None: - # initializing epoch idnices of a virtual dataset - self._setup(epoch) - self._next_virtual_epoch(epoch) - else: - # working on already intialized epoch indices - if epoch == self._cur_epoch: - # re-enter so return - return - self._next_virtual_epoch(epoch) - - def _get_epoch_start_index(self, epoch): - assert epoch >= 1 # fairseq is using 1-based epoch everywhere - return ((epoch - 1) % self.num_virtual_epochs) * self.virtual_epoch_size - - def _next_global_indices(self, epoch): - rng = np.random.RandomState( - [ - int( - hashlib.sha1( - str(self.__class__.__name__).encode("utf-8") - ).hexdigest(), - 16, - ) - % (2 ** 32), - self.seed % (2 ** 32), # global seed - epoch, # epoch index, - ] - ) - del self._random_global_indices - self._random_global_indices = rng.choice( - self.virtual_size, self.virtual_size, replace=False - ) - if self.load_next_shard is None: - self.load_next_shard = False - else: - # increase shard epoch for next loading - self.shard_epoch += 1 - self.load_next_shard = True - logger.info( - "to load next epoch/shard in next load_dataset: " - f"epoch={epoch}/shard_epoch={self.shard_epoch}" - ) - 
- def _next_virtual_epoch(self, epoch): - index = self._get_epoch_start_index(epoch) - if index == 0 or self._random_global_indices is None: - # need to start from the beginning, - # so call super().set_epoch(epoch) to establish the global virtual indices - logger.info( - "establishing a new set of global virtual indices for " - f"epoch={epoch}/shard_epoch={self.shard_epoch}" - ) - super().set_epoch(epoch) - self._next_global_indices(epoch) - else: - self._cur_epoch = epoch - - # reset cache sizes and ordered_indices for the epoch after moving to a new epoch - self._clean_if_not_none( - [ - self._epoch_sizes, - ] - ) - self._epoch_sizes = None - self._current_epoch_start_index = index diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampling_method.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampling_method.py deleted file mode 100644 index 140c68f01d60e902ef88f11f30f8813dc15fc681..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/multilingual/sampling_method.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -from typing import List - - -logger = logging.getLogger(__name__) - - -def uniform(dataset_sizes: List[int]): - return [1.0] * len(dataset_sizes) - - -def temperature_sampling(dataset_sizes, temp): - total_size = sum(dataset_sizes) - return [(size / total_size) ** (1.0 / temp) for size in dataset_sizes] - - -def make_temperature_sampling(temp=1.0): - def sampling_func(dataset_sizes): - return temperature_sampling(dataset_sizes, temp) - - return sampling_func - - -def make_ratio_sampling(ratios): - def sampling_func(dataset_sizes): - return ratios - - return sampling_func - - -class SamplingMethod: - @staticmethod - def add_arguments(parser): - parser.add_argument( - "--sampling-method", - choices=[ - "uniform", - "temperature", - "concat", - "RoundRobin", - ], - type=str, - default="concat", - help="The method to sample data per language pairs", - ) - parser.add_argument( - "--sampling-temperature", - default=1.5, - type=float, - help="only work with --sampling-method temperature", - ) - - @staticmethod - def build_sampler(args, task): - return SamplingMethod(args, task) - - def __init__(self, args, task): - self.args = args - self.task = task - - def is_adaptive(self): - return False - - def sampling_method_selector(self): - args = self.args - logger.info(f"selected sampler: {args.sampling_method}") - if args.sampling_method == "uniform": - return uniform - elif args.sampling_method == "temperature" or self.is_adaptive(): - return make_temperature_sampling(float(args.sampling_temperature)) - else: - # default to concating all data set together - return None diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/nested_dictionary_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/nested_dictionary_dataset.py deleted file mode 100644 index 52e74abddacc923c5e29b0a0c41d7efc85482d3b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/nested_dictionary_dataset.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from collections import OrderedDict - -import torch -from torch.utils.data.dataloader import default_collate - -from . import FairseqDataset - - -def _flatten(dico, prefix=None): - """Flatten a nested dictionary.""" - new_dico = OrderedDict() - if isinstance(dico, dict): - prefix = prefix + "." if prefix is not None else "" - for k, v in dico.items(): - if v is None: - continue - new_dico.update(_flatten(v, prefix + k)) - elif isinstance(dico, list): - for i, v in enumerate(dico): - new_dico.update(_flatten(v, prefix + ".[" + str(i) + "]")) - else: - new_dico = OrderedDict({prefix: dico}) - return new_dico - - -def _unflatten(dico): - """Unflatten a flattened dictionary into a nested dictionary.""" - new_dico = OrderedDict() - for full_k, v in dico.items(): - full_k = full_k.split(".") - node = new_dico - for k in full_k[:-1]: - if k.startswith("[") and k.endswith("]"): - k = int(k[1:-1]) - if k not in node: - node[k] = OrderedDict() - node = node[k] - node[full_k[-1]] = v - return new_dico - - -class NestedDictionaryDataset(FairseqDataset): - def __init__(self, defn, sizes=None): - super().__init__() - self.defn = _flatten(defn) - self.sizes = [sizes] if not isinstance(sizes, (list, tuple)) else sizes - - first = None - for v in self.defn.values(): - if not isinstance( - v, - ( - FairseqDataset, - torch.utils.data.Dataset, - ), - ): - raise ValueError("Expected Dataset but found: {}".format(v.__class__)) - first = first or v - if len(v) > 0: - assert len(v) == len(first), "dataset lengths must match" - - self._len = len(first) - - def __getitem__(self, index): - return OrderedDict((k, ds[index]) for k, ds in self.defn.items()) - - def __len__(self): - return self._len - - def collater(self, samples): - """Merge a list of samples to form a mini-batch. - - Args: - samples (List[dict]): samples to collate - - Returns: - dict: a mini-batch suitable for forwarding with a Model - """ - if len(samples) == 0: - return {} - sample = OrderedDict() - for k, ds in self.defn.items(): - try: - sample[k] = ds.collater([s[k] for s in samples]) - except NotImplementedError: - sample[k] = default_collate([s[k] for s in samples]) - return _unflatten(sample) - - def num_tokens(self, index): - """Return the number of tokens in a sample. This value is used to - enforce ``--max-tokens`` during batching.""" - return max(s[index] for s in self.sizes) - - def size(self, index): - """Return an example's size as a float or tuple. 
This value is used when - filtering a dataset with ``--max-positions``.""" - if len(self.sizes) == 1: - return self.sizes[0][index] - else: - return (s[index] for s in self.sizes) - - @property - def supports_prefetch(self): - """Whether this dataset supports prefetching.""" - return any(ds.supports_prefetch for ds in self.defn.values()) - - def prefetch(self, indices): - """Prefetch the data required for this epoch.""" - for ds in self.defn.values(): - if getattr(ds, "supports_prefetch", False): - ds.prefetch(indices) - - @property - def can_reuse_epoch_itr_across_epochs(self): - return all(ds.can_reuse_epoch_itr_across_epochs for ds in self.defn.values()) - - def set_epoch(self, epoch): - super().set_epoch(epoch) - for ds in self.defn.values(): - ds.set_epoch(epoch) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/noising.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/noising.py deleted file mode 100644 index 9643d1aa6a7db41315da83cf5db5cb4d7f8af185..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/noising.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch -from fairseq.data import data_utils - - -class WordNoising(object): - """Generate a noisy version of a sentence, without changing words themselves.""" - - def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None): - self.dictionary = dictionary - self.bpe_end = None - if bpe_cont_marker: - self.bpe_end = np.array( - [ - not self.dictionary[i].endswith(bpe_cont_marker) - for i in range(len(self.dictionary)) - ] - ) - elif bpe_end_marker: - self.bpe_end = np.array( - [ - self.dictionary[i].endswith(bpe_end_marker) - for i in range(len(self.dictionary)) - ] - ) - - self.get_word_idx = ( - self._get_bpe_word_idx if self.bpe_end is not None else self._get_token_idx - ) - - def noising(self, x, lengths, noising_prob=0.0): - raise NotImplementedError() - - def _get_bpe_word_idx(self, x): - """ - Given a list of BPE tokens, for every index in the tokens list, - return the index of the word grouping that it belongs to. - For example, for input x corresponding to ["how", "are", "y@@", "ou"], - return [[0], [1], [2], [2]]. - """ - # x: (T x B) - bpe_end = self.bpe_end[x] - - if x.size(0) == 1 and x.size(1) == 1: - # Special case when we only have one word in x. If x = [[N]], - # bpe_end is a scalar (bool) instead of a 2-dim array of bools, - # which makes the sum operation below fail. - return np.array([[0]]) - - # do a reduce front sum to generate word ids - word_idx = bpe_end[::-1].cumsum(0)[::-1] - word_idx = word_idx.max(0)[None, :] - word_idx - return word_idx - - def _get_token_idx(self, x): - """ - This is to extend noising functions to be able to apply to non-bpe - tokens, e.g. word or characters. - """ - x = torch.t(x) - word_idx = np.array([range(len(x_i)) for x_i in x]) - return np.transpose(word_idx) - - -class WordDropout(WordNoising): - """Randomly drop input words. If not passing blank_idx (default is None), - then dropped words will be removed. 
Otherwise, it will be replaced by the - blank_idx.""" - - def __init__( - self, - dictionary, - default_dropout_prob=0.1, - bpe_cont_marker="@@", - bpe_end_marker=None, - ): - super().__init__(dictionary, bpe_cont_marker, bpe_end_marker) - self.default_dropout_prob = default_dropout_prob - - def noising(self, x, lengths, dropout_prob=None, blank_idx=None): - if dropout_prob is None: - dropout_prob = self.default_dropout_prob - # x: (T x B), lengths: B - if dropout_prob == 0: - return x, lengths - - assert 0 < dropout_prob < 1 - - # be sure to drop entire words - word_idx = self.get_word_idx(x) - sentences = [] - modified_lengths = [] - for i in range(lengths.size(0)): - # Since dropout probabilities need to apply over non-pad tokens, - # it is not trivial to generate the keep mask without consider - # input lengths; otherwise, this could be done outside the loop - - # We want to drop whole words based on word_idx grouping - num_words = max(word_idx[:, i]) + 1 - - # ith example: [x0, x1, ..., eos, pad, ..., pad] - # We should only generate keep probs for non-EOS tokens. Thus if the - # input sentence ends in EOS, the last word idx is not included in - # the dropout mask generation and we append True to always keep EOS. - # Otherwise, just generate the dropout mask for all word idx - # positions. - has_eos = x[lengths[i] - 1, i] == self.dictionary.eos() - if has_eos: # has eos? - keep = np.random.rand(num_words - 1) >= dropout_prob - keep = np.append(keep, [True]) # keep EOS symbol - else: - keep = np.random.rand(num_words) >= dropout_prob - - words = x[: lengths[i], i].tolist() - - # TODO: speed up the following loop - # drop words from the input according to keep - new_s = [ - w if keep[word_idx[j, i]] else blank_idx for j, w in enumerate(words) - ] - new_s = [w for w in new_s if w is not None] - # we need to have at least one word in the sentence (more than the - # start / end sentence symbols) - if len(new_s) <= 1: - # insert at beginning in case the only token left is EOS - # EOS should be at end of list. - new_s.insert(0, words[np.random.randint(0, len(words))]) - assert len(new_s) >= 1 and ( - not has_eos # Either don't have EOS at end or last token is EOS - or (len(new_s) >= 2 and new_s[-1] == self.dictionary.eos()) - ), "New sentence is invalid." 
- sentences.append(new_s) - modified_lengths.append(len(new_s)) - # re-construct input - modified_lengths = torch.LongTensor(modified_lengths) - modified_x = torch.LongTensor( - modified_lengths.max(), modified_lengths.size(0) - ).fill_(self.dictionary.pad()) - for i in range(modified_lengths.size(0)): - modified_x[: modified_lengths[i], i].copy_(torch.LongTensor(sentences[i])) - - return modified_x, modified_lengths - - -class WordShuffle(WordNoising): - """Shuffle words by no more than k positions.""" - - def __init__( - self, - dictionary, - default_max_shuffle_distance=3, - bpe_cont_marker="@@", - bpe_end_marker=None, - ): - super().__init__(dictionary, bpe_cont_marker, bpe_end_marker) - self.default_max_shuffle_distance = 3 - - def noising(self, x, lengths, max_shuffle_distance=None): - if max_shuffle_distance is None: - max_shuffle_distance = self.default_max_shuffle_distance - # x: (T x B), lengths: B - if max_shuffle_distance == 0: - return x, lengths - - # max_shuffle_distance < 1 will return the same sequence - assert max_shuffle_distance > 1 - - # define noise word scores - noise = np.random.uniform( - 0, - max_shuffle_distance, - size=(x.size(0), x.size(1)), - ) - noise[0] = -1 # do not move start sentence symbol - # be sure to shuffle entire words - word_idx = self.get_word_idx(x) - x2 = x.clone() - for i in range(lengths.size(0)): - length_no_eos = lengths[i] - if x[lengths[i] - 1, i] == self.dictionary.eos(): - length_no_eos = lengths[i] - 1 - # generate a random permutation - scores = word_idx[:length_no_eos, i] + noise[word_idx[:length_no_eos, i], i] - # ensure no reordering inside a word - scores += 1e-6 * np.arange(length_no_eos.item()) - permutation = scores.argsort() - # shuffle words - x2[:length_no_eos, i].copy_( - x2[:length_no_eos, i][torch.from_numpy(permutation)] - ) - return x2, lengths - - -class UnsupervisedMTNoising(WordNoising): - """ - Implements the default configuration for noising in UnsupervisedMT - (github.com/facebookresearch/UnsupervisedMT) - """ - - def __init__( - self, - dictionary, - max_word_shuffle_distance, - word_dropout_prob, - word_blanking_prob, - bpe_cont_marker="@@", - bpe_end_marker=None, - ): - super().__init__(dictionary) - self.max_word_shuffle_distance = max_word_shuffle_distance - self.word_dropout_prob = word_dropout_prob - self.word_blanking_prob = word_blanking_prob - - self.word_dropout = WordDropout( - dictionary=dictionary, - bpe_cont_marker=bpe_cont_marker, - bpe_end_marker=bpe_end_marker, - ) - self.word_shuffle = WordShuffle( - dictionary=dictionary, - bpe_cont_marker=bpe_cont_marker, - bpe_end_marker=bpe_end_marker, - ) - - def noising(self, x, lengths): - # 1. Word Shuffle - noisy_src_tokens, noisy_src_lengths = self.word_shuffle.noising( - x=x, - lengths=lengths, - max_shuffle_distance=self.max_word_shuffle_distance, - ) - # 2. Word Dropout - noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising( - x=noisy_src_tokens, - lengths=noisy_src_lengths, - dropout_prob=self.word_dropout_prob, - ) - # 3. 
Word Blanking - noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising( - x=noisy_src_tokens, - lengths=noisy_src_lengths, - dropout_prob=self.word_blanking_prob, - blank_idx=self.dictionary.unk(), - ) - - return noisy_src_tokens - - -class NoisingDataset(torch.utils.data.Dataset): - def __init__( - self, - src_dataset, - src_dict, - seed, - noiser=None, - noising_class=UnsupervisedMTNoising, - **kwargs - ): - """ - Wrap a :class:`~torch.utils.data.Dataset` and apply noise to the - samples based on the supplied noising configuration. - - Args: - src_dataset (~torch.utils.data.Dataset): dataset to wrap. - to build self.src_dataset -- - a LanguagePairDataset with src dataset as the source dataset and - None as the target dataset. Should NOT have padding so that - src_lengths are accurately calculated by language_pair_dataset - collate function. - We use language_pair_dataset here to encapsulate the tgt_dataset - so we can re-use the LanguagePairDataset collater to format the - batches in the structure that SequenceGenerator expects. - src_dict (~fairseq.data.Dictionary): source dictionary - seed (int): seed to use when generating random noise - noiser (WordNoising): a pre-initialized :class:`WordNoising` - instance. If this is None, a new instance will be created using - *noising_class* and *kwargs*. - noising_class (class, optional): class to use to initialize a - default :class:`WordNoising` instance. - kwargs (dict, optional): arguments to initialize the default - :class:`WordNoising` instance given by *noiser*. - """ - self.src_dataset = src_dataset - self.src_dict = src_dict - self.seed = seed - self.noiser = ( - noiser - if noiser is not None - else noising_class( - dictionary=src_dict, - **kwargs, - ) - ) - - def __getitem__(self, index): - """ - Returns a single noisy sample. Multiple samples are fed to the collater - create a noising dataset batch. - """ - src_tokens = self.src_dataset[index] - src_lengths = torch.LongTensor([len(src_tokens)]) - src_tokens = src_tokens.unsqueeze(0) - - # Transpose src tokens to fit expected shape of x in noising function - # (batch size, sequence length) -> (sequence length, batch size) - src_tokens_t = torch.t(src_tokens) - - with data_utils.numpy_seed(self.seed + index): - noisy_src_tokens = self.noiser.noising(src_tokens_t, src_lengths) - - # Transpose back to expected src_tokens format - # (sequence length, 1) -> (1, sequence length) - noisy_src_tokens = torch.t(noisy_src_tokens) - return noisy_src_tokens[0] - - def __len__(self): - """ - The length of the noising dataset is the length of src. - """ - return len(self.src_dataset) - - @property - def supports_prefetch(self): - return self.src_dataset.supports_prefetch - - def prefetch(self, indices): - if self.src_dataset.supports_prefetch: - self.src_dataset.prefetch(indices) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/num_samples_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/num_samples_dataset.py deleted file mode 100644 index 99a17495c701d8a05e0268f98bf453905e11d078..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/num_samples_dataset.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . 
import FairseqDataset - - -class NumSamplesDataset(FairseqDataset): - def __getitem__(self, index): - return 1 - - def __len__(self): - return 0 - - def collater(self, samples): - return sum(samples) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/numel_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/numel_dataset.py deleted file mode 100644 index ac86dfd2f1d89055de909656d61d6aca85523f00..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/numel_dataset.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch - -from . import BaseWrapperDataset - - -class NumelDataset(BaseWrapperDataset): - def __init__(self, dataset, reduce=False): - super().__init__(dataset) - self.reduce = reduce - - def __getitem__(self, index): - item = self.dataset[index] - if torch.is_tensor(item): - return torch.numel(item) - else: - return np.size(item) - - def __len__(self): - return len(self.dataset) - - def collater(self, samples): - if self.reduce: - return sum(samples) - else: - return torch.tensor(samples) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/offset_tokens_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/offset_tokens_dataset.py deleted file mode 100644 index 6fabbdcdaa1a8f70d8d8c07db4cd53754503c194..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/offset_tokens_dataset.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . import BaseWrapperDataset - - -class OffsetTokensDataset(BaseWrapperDataset): - def __init__(self, dataset, offset): - super().__init__(dataset) - self.offset = offset - - def __getitem__(self, idx): - return self.dataset[idx] + self.offset diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/pad_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/pad_dataset.py deleted file mode 100644 index 8075bba6a9efc5f8421368ee0b2ae66afe3f5009..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/pad_dataset.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.data import data_utils - -from . 
import BaseWrapperDataset - - -class PadDataset(BaseWrapperDataset): - def __init__(self, dataset, pad_idx, left_pad): - super().__init__(dataset) - self.pad_idx = pad_idx - self.left_pad = left_pad - - def collater(self, samples): - return data_utils.collate_tokens(samples, self.pad_idx, left_pad=self.left_pad) - - -class LeftPadDataset(PadDataset): - def __init__(self, dataset, pad_idx): - super().__init__(dataset, pad_idx, left_pad=True) - - -class RightPadDataset(PadDataset): - def __init__(self, dataset, pad_idx): - super().__init__(dataset, pad_idx, left_pad=False) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/plasma_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/plasma_utils.py deleted file mode 100644 index 2b12646783608521e6db6157066d81a556633b92..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/plasma_utils.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import subprocess -import tempfile - - -class PlasmaArray(object): - """ - Wrapper around numpy arrays that automatically moves the data to shared - memory upon serialization. This is particularly helpful when passing numpy - arrays through multiprocessing, so that data is not unnecessarily - duplicated or pickled. - """ - - def __init__(self, array): - super().__init__() - self.array = array - self.disable = array.nbytes < 134217728 # disable for arrays <128MB - self.object_id = None - self.path = None - - # variables with underscores shouldn't be pickled - self._client = None - self._server = None - self._server_tmp = None - self._plasma = None - - @property - def plasma(self): - if self._plasma is None and not self.disable: - try: - import pyarrow.plasma as plasma - - self._plasma = plasma - except ImportError: - self._plasma = None - return self._plasma - - def start_server(self): - if self.plasma is None or self._server is not None: - return - assert self.object_id is None - assert self.path is None - self._server_tmp = tempfile.NamedTemporaryFile() - self.path = self._server_tmp.name - self._server = subprocess.Popen( - [ - "plasma_store", - "-m", - str(int(1.05 * self.array.nbytes)), - "-s", - self.path, - ] - ) - - @property - def client(self): - if self._client is None: - assert self.path is not None - self._client = self.plasma.connect(self.path) - return self._client - - def __getstate__(self): - if self.plasma is None: - return self.__dict__ - if self.object_id is None: - self.start_server() - self.object_id = self.client.put(self.array) - state = self.__dict__.copy() - del state["array"] - state["_client"] = None - state["_server"] = None - state["_server_tmp"] = None - state["_plasma"] = None - return state - - def __setstate__(self, state): - self.__dict__.update(state) - if self.plasma is None: - return - self.array = self.client.get(self.object_id) - - def __del__(self): - if self._server is not None: - self._server.kill() - self._server = None - self._server_tmp.close() - self._server_tmp = None diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/prepend_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/prepend_dataset.py deleted file mode 100644 index 
ad74784d2d7920e4a6225282d95543ce16ea50d9..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/prepend_dataset.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch - -from . import BaseWrapperDataset - - -class PrependDataset(BaseWrapperDataset): - def __init__(self, dataset, prepend_getter, ensure_first_token_is=None): - super().__init__(dataset) - self.prepend_getter = prepend_getter - self.ensure_first_token = ensure_first_token_is - - def __getitem__(self, idx): - item = self.dataset[idx] - is_tuple = isinstance(item, tuple) - src = item[0] if is_tuple else item - - assert self.ensure_first_token is None or src[0] == self.ensure_first_token - prepend_idx = self.prepend_getter(self.dataset, idx) - assert isinstance(prepend_idx, int) - src[0] = prepend_idx - item = tuple((src,) + item[1:]) if is_tuple else src - return item diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/prepend_token_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/prepend_token_dataset.py deleted file mode 100644 index fd1331f4c44c1595eb9bb78baa0cf5cf3bcce9ad..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/prepend_token_dataset.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch - -from . import BaseWrapperDataset - - -class PrependTokenDataset(BaseWrapperDataset): - def __init__(self, dataset, token=None): - super().__init__(dataset) - self.token = token - if token is not None: - self._sizes = np.array(dataset.sizes) + 1 - else: - self._sizes = dataset.sizes - - def __getitem__(self, idx): - item = self.dataset[idx] - if self.token is not None: - item = torch.cat([item.new([self.token]), item]) - return item - - @property - def sizes(self): - return self._sizes - - def num_tokens(self, index): - n = self.dataset.num_tokens(index) - if self.token is not None: - n += 1 - return n - - def size(self, index): - n = self.dataset.size(index) - if self.token is not None: - n += 1 - return n diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/raw_label_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/raw_label_dataset.py deleted file mode 100644 index d054904f419bd64855d33a2a770b43f671c7c8d8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/raw_label_dataset.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch - -from . 
import FairseqDataset - - -class RawLabelDataset(FairseqDataset): - def __init__(self, labels): - super().__init__() - self.labels = labels - - def __getitem__(self, index): - return self.labels[index] - - def __len__(self): - return len(self.labels) - - def collater(self, samples): - return torch.tensor(samples) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/replace_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/replace_dataset.py deleted file mode 100644 index 5aac2ba96bee0a8bb65f4c9e56fa0b17248ee1d9..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/replace_dataset.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . import BaseWrapperDataset - - -class ReplaceDataset(BaseWrapperDataset): - """Replaces tokens found in the dataset by a specified replacement token - - Args: - dataset (~torch.utils.data.Dataset): dataset to replace tokens in - replace_map(Dictionary[int,int]): map of token to replace -> replacement token - offsets (List[int]): do not replace tokens before (from left if pos, right if neg) this offset. should be - as many as the number of objects returned by the underlying dataset __getitem__ method. - """ - - def __init__(self, dataset, replace_map, offsets): - super().__init__(dataset) - assert len(replace_map) > 0 - self.replace_map = replace_map - self.offsets = offsets - - def __getitem__(self, index): - item = self.dataset[index] - is_tuple = isinstance(item, tuple) - srcs = item if is_tuple else [item] - - for offset, src in zip(self.offsets, srcs): - for k, v in self.replace_map.items(): - src_off = src[offset:] if offset >= 0 else src[:offset] - src_off.masked_fill_(src_off == k, v) - - item = srcs if is_tuple else srcs[0] - return item diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/resampling_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/resampling_dataset.py deleted file mode 100644 index 3d3b993164dc3962df48bacff26714328e843e80..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/resampling_dataset.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging - -import numpy as np -from fairseq.data import BaseWrapperDataset, plasma_utils - - -logger = logging.getLogger(__name__) - - -class ResamplingDataset(BaseWrapperDataset): - """Randomly samples from a given dataset at each epoch. - - Sampling is done with or without replacement, depending on the "replace" - parameter. - - Optionally, the epoch size can be rescaled. This is potentially desirable - to increase per-epoch coverage of the base dataset (since sampling with - replacement means that many items in the dataset will be left out). In the - case of sampling without replacement, size_ratio should be strictly less - than 1. - - Args: - dataset (~torch.utils.data.Dataset): dataset on which to sample. - weights (List[float]): list of probability weights - (default: None, which corresponds to uniform sampling). 
- replace (bool): sampling mode; True for "with replacement", or False - for "without replacement" (default: True) - size_ratio (float): the ratio to subsample to; must be positive - (default: 1.0). - batch_by_size (bool): whether or not to batch by sequence length - (default: True). - seed (int): RNG seed to use (default: 0). - epoch (int): starting epoch number (default: 1). - """ - - def __init__( - self, - dataset, - weights=None, - replace=True, - size_ratio=1.0, - batch_by_size=True, - seed=0, - epoch=1, - ): - super().__init__(dataset) - - if weights is None: - self.weights = None - - else: - assert len(weights) == len(dataset) - weights_arr = np.array(weights, dtype=np.float64) - weights_arr /= weights_arr.sum() - self.weights = plasma_utils.PlasmaArray(weights_arr) - - self.replace = replace - - assert size_ratio > 0.0 - if not self.replace: - assert size_ratio < 1.0 - self.size_ratio = float(size_ratio) - self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int) - - self.batch_by_size = batch_by_size - self.seed = seed - - self._cur_epoch = None - self._cur_indices = None - - self.set_epoch(epoch) - - def __getitem__(self, index): - return self.dataset[self._cur_indices.array[index]] - - def __len__(self): - return self.actual_size - - @property - def sizes(self): - if isinstance(self.dataset.sizes, list): - return [s[self._cur_indices.array] for s in self.dataset.sizes] - return self.dataset.sizes[self._cur_indices.array] - - def num_tokens(self, index): - return self.dataset.num_tokens(self._cur_indices.array[index]) - - def size(self, index): - return self.dataset.size(self._cur_indices.array[index]) - - def ordered_indices(self): - if self.batch_by_size: - order = [ - np.arange(len(self)), - self.sizes, - ] # No need to handle `self.shuffle == True` - return np.lexsort(order) - else: - return np.arange(len(self)) - - def prefetch(self, indices): - self.dataset.prefetch(self._cur_indices.array[indices]) - - @property - def can_reuse_epoch_itr_across_epochs(self): - return False - - def set_epoch(self, epoch): - logger.debug("ResamplingDataset.set_epoch: {}".format(epoch)) - super().set_epoch(epoch) - - if epoch == self._cur_epoch: - return - - self._cur_epoch = epoch - - # Generate a weighted sample of indices as a function of the - # random seed and the current epoch. - - rng = np.random.RandomState( - [ - 42, # magic number - self.seed % (2 ** 32), # global seed - self._cur_epoch, # epoch index - ] - ) - self._cur_indices = plasma_utils.PlasmaArray( - rng.choice( - len(self.dataset), - self.actual_size, - replace=self.replace, - p=(None if self.weights is None else self.weights.array), - ) - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/roll_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/roll_dataset.py deleted file mode 100644 index a2915eeb3e8fb4dfb4b2bb33e0464ad0783d854c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/roll_dataset.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch - -from . 
import BaseWrapperDataset - - -class RollDataset(BaseWrapperDataset): - def __init__(self, dataset, shifts): - super().__init__(dataset) - self.shifts = shifts - - def __getitem__(self, index): - item = self.dataset[index] - return torch.roll(item, self.shifts) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/round_robin_zip_datasets.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/round_robin_zip_datasets.py deleted file mode 100644 index 690823fc86d23603b6ef79c7e60b27387ac3c58a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/round_robin_zip_datasets.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from collections import OrderedDict - -import numpy as np - -from . import FairseqDataset - - -class RoundRobinZipDatasets(FairseqDataset): - """Zip multiple :class:`~fairseq.data.FairseqDataset` instances together. - - Shorter datasets are repeated in a round-robin fashion to match the length - of the longest one. - - Args: - datasets (Dict[~fairseq.data.FairseqDataset]): a dictionary of - :class:`~fairseq.data.FairseqDataset` instances. - eval_key (str, optional): a key used at evaluation time that causes - this instance to pass-through batches from *datasets[eval_key]*. - """ - - def __init__(self, datasets, eval_key=None): - super().__init__() - assert isinstance(datasets, OrderedDict) - self.datasets = datasets - self.eval_key = eval_key - - self.longest_dataset = None - self.longest_dataset_key = None - for key, dataset in datasets.items(): - assert isinstance(dataset, FairseqDataset) - if self.longest_dataset is None or len(dataset) > len(self.longest_dataset): - self.longest_dataset = dataset - self.longest_dataset_key = key - - self._ordered_indices = None - - def _map_index(self, key, index): - assert ( - self._ordered_indices is not None - ), "Must call RoundRobinZipDatasets.ordered_indices() first" - return self._ordered_indices[key][index % len(self.datasets[key])] - - def __getitem__(self, index): - if self.eval_key is None: - return OrderedDict( - [ - (key, dataset[self._map_index(key, index)]) - for key, dataset in self.datasets.items() - ] - ) - else: - # at evaluation time it's useful to pass-through batches from a single key - return self.datasets[self.eval_key][self._map_index(self.eval_key, index)] - - def __len__(self): - return len(self.longest_dataset) - - def collater(self, samples): - """Merge a list of samples to form a mini-batch.""" - if len(samples) == 0: - return None - if self.eval_key is None: - return OrderedDict( - [ - (key, dataset.collater([sample[key] for sample in samples])) - for key, dataset in self.datasets.items() - ] - ) - else: - # at evaluation time it's useful to pass-through batches from a single key - return self.datasets[self.eval_key].collater(samples) - - def num_tokens(self, index): - """Return an example's length (number of tokens), used for batching.""" - # TODO make it configurable whether to use max() or sum() here - return max( - dataset.num_tokens(self._map_index(key, index)) - for key, dataset in self.datasets.items() - ) - - def size(self, index): - """Return an example's size as a float or tuple. 
This value is used when - filtering a dataset with ``--max-positions``.""" - return { - key: dataset.size(self._map_index(key, index)) - for key, dataset in self.datasets.items() - } - - def ordered_indices(self): - """Ordered indices for batching.""" - if self._ordered_indices is None: - # Call the underlying dataset's ordered_indices() here, so that we - # get the same random ordering as we would have from using the - # underlying dataset directly. - self._ordered_indices = OrderedDict( - [ - (key, dataset.ordered_indices()) - for key, dataset in self.datasets.items() - ] - ) - return np.arange(len(self)) - - @property - def supports_prefetch(self): - return all( - getattr(dataset, "supports_prefetch", False) - for dataset in self.datasets.values() - ) - - def prefetch(self, indices): - for key, dataset in self.datasets.items(): - dataset.prefetch([self._map_index(key, index) for index in indices]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/shorten_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/shorten_dataset.py deleted file mode 100644 index 6ebb5d88feb3f29d1512a0873df304915d051209..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/shorten_dataset.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -from fairseq.data import data_utils - -from . import BaseWrapperDataset - - -class TruncateDataset(BaseWrapperDataset): - """Truncate a sequence by returning the first truncation_length tokens""" - - def __init__(self, dataset, truncation_length): - super().__init__(dataset) - assert truncation_length is not None - self.truncation_length = truncation_length - self.dataset = dataset - - def __getitem__(self, index): - item = self.dataset[index] - item_len = item.size(0) - if item_len > self.truncation_length: - item = item[: self.truncation_length] - return item - - @property - def sizes(self): - return np.minimum(self.dataset.sizes, self.truncation_length) - - def __len__(self): - return len(self.dataset) - - -class RandomCropDataset(TruncateDataset): - """Truncate a sequence by returning a random crop of truncation_length tokens""" - - def __init__(self, dataset, truncation_length, seed=1): - super().__init__(dataset, truncation_length) - self.seed = seed - self.epoch = 0 - - @property - def can_reuse_epoch_itr_across_epochs(self): - return True # only the crop changes, not item sizes - - def set_epoch(self, epoch, **unused): - super().set_epoch(epoch) - self.epoch = epoch - - def __getitem__(self, index): - with data_utils.numpy_seed(self.seed, self.epoch, index): - item = self.dataset[index] - item_len = item.size(0) - excess = item_len - self.truncation_length - if excess > 0: - start_idx = np.random.randint(0, excess) - item = item[start_idx : start_idx + self.truncation_length] - return item - - -def maybe_shorten_dataset( - dataset, - split, - shorten_data_split_list, - shorten_method, - tokens_per_sample, - seed, -): - truncate_split = ( - split in shorten_data_split_list.split(",") or len(shorten_data_split_list) == 0 - ) - if shorten_method == "truncate" and truncate_split: - dataset = TruncateDataset(dataset, tokens_per_sample) - elif shorten_method == "random_crop" and truncate_split: - dataset = 
RandomCropDataset(dataset, tokens_per_sample, seed) - return dataset diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/sort_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/sort_dataset.py deleted file mode 100644 index b3890e7279e1f26db2e48ec0a91c639e9299d60f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/sort_dataset.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np - -from . import BaseWrapperDataset - - -class SortDataset(BaseWrapperDataset): - def __init__(self, dataset, sort_order): - super().__init__(dataset) - if not isinstance(sort_order, (list, tuple)): - sort_order = [sort_order] - self.sort_order = sort_order - - assert all(len(so) == len(dataset) for so in sort_order) - - def ordered_indices(self): - return np.lexsort(self.sort_order) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/strip_token_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/strip_token_dataset.py deleted file mode 100644 index cae39ba4d2f8106398eccd7eb0cf5c2194ec0db5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/strip_token_dataset.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . import BaseWrapperDataset - - -class StripTokenDataset(BaseWrapperDataset): - def __init__(self, dataset, id_to_strip): - super().__init__(dataset) - self.id_to_strip = id_to_strip - - def __getitem__(self, index): - item = self.dataset[index] - while len(item) > 0 and item[-1] == self.id_to_strip: - item = item[:-1] - while len(item) > 0 and item[0] == self.id_to_strip: - item = item[1:] - return item diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/subsample_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/subsample_dataset.py deleted file mode 100644 index 48feaf883f87dc95f8637c24d3c96f3f9fd8bd1d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/subsample_dataset.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging - -import numpy as np - -from . import BaseWrapperDataset - - -logger = logging.getLogger(__name__) - - -class SubsampleDataset(BaseWrapperDataset): - """Subsamples a given dataset by a specified ratio. Subsampling is done on the number of examples - - Args: - dataset (~torch.utils.data.Dataset): dataset to subsample - size_ratio(float): the ratio to subsample to. 
must be between 0 and 1 (exclusive) - """ - - def __init__(self, dataset, size_ratio, shuffle=False): - super().__init__(dataset) - assert size_ratio < 1 - self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int) - self.indices = np.random.choice( - list(range(len(self.dataset))), self.actual_size, replace=False - ) - self.shuffle = shuffle - logger.info( - "subsampled dataset from {} to {} (ratio={})".format( - len(self.dataset), self.actual_size, size_ratio - ) - ) - - def __getitem__(self, index): - return self.dataset[self.indices[index]] - - def __len__(self): - return self.actual_size - - def collater(self, samples): - return self.dataset.collater(samples) - - @property - def sizes(self): - return self.dataset.sizes[self.indices] - - @property - def name(self): - return self.dataset.name - - def num_tokens(self, index): - return self.dataset.num_tokens(self.indices[index]) - - def size(self, index): - return self.dataset.size(self.indices[index]) - - def ordered_indices(self): - """Return an ordered list of indices. Batches will be constructed based - on this order.""" - if self.shuffle: - order = [np.random.permutation(len(self))] - else: - order = [np.arange(len(self))] - order.append(self.sizes) - return np.lexsort(order) - - def prefetch(self, indices): - self.dataset.prefetch(self.indices[indices]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/token_block_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/token_block_dataset.py deleted file mode 100644 index aa33f9d06f37fa6a1e239d9733a2725ec158f6a8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/token_block_dataset.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch -from fairseq.data import FairseqDataset, plasma_utils - - -class TokenBlockDataset(FairseqDataset): - """Break a Dataset of tokens into blocks. - - Args: - dataset (~torch.utils.data.Dataset): dataset to break into blocks - sizes (List[int]): sentence lengths (required for 'complete' and 'eos') - block_size (int): maximum block size (ignored in 'eos' break mode) - break_mode (str, optional): Mode used for breaking tokens. Values can - be one of: - - 'none': break tokens into equally sized blocks (up to block_size) - - 'complete': break tokens into blocks (up to block_size) such that - blocks contains complete sentences, although block_size may be - exceeded if some sentences exceed block_size - - 'complete_doc': similar to 'complete' mode, but do not - cross document boundaries - - 'eos': each block contains one sentence (block_size is ignored) - include_targets (bool, optional): return next tokens as targets - (default: False). - document_sep_len (int, optional): document separator size (required for - 'complete_doc' break mode). Typically 1 if the sentences have eos - and 0 otherwise. 
- """ - - def __init__( - self, - dataset, - sizes, - block_size, - pad, - eos, - break_mode=None, - include_targets=False, - document_sep_len=1, - ): - try: - from fairseq.data.token_block_utils_fast import ( - _get_slice_indices_fast, - _get_block_to_dataset_index_fast, - ) - except ImportError: - raise ImportError( - "Please build Cython components with: `pip install --editable .` " - "or `python setup.py build_ext --inplace`" - ) - - super().__init__() - self.dataset = dataset - self.pad = pad - self.eos = eos - self.include_targets = include_targets - - assert len(dataset) == len(sizes) - assert len(dataset) > 0 - - if isinstance(sizes, list): - sizes = np.array(sizes, dtype=np.int64) - else: - if torch.is_tensor(sizes): - sizes = sizes.numpy() - sizes = sizes.astype(np.int64) - - break_mode = break_mode if break_mode is not None else "none" - - # For "eos" break-mode, block_size is not required parameters. - if break_mode == "eos" and block_size is None: - block_size = 0 - - slice_indices = _get_slice_indices_fast( - sizes, str(break_mode), block_size, document_sep_len - ) - self._sizes = slice_indices[:, 1] - slice_indices[:, 0] - - # build index mapping block indices to the underlying dataset indices - if break_mode == "eos": - # much faster version for eos break mode - block_to_dataset_index = np.stack( - [ - np.arange(len(sizes)), # starting index in dataset - np.zeros( - len(sizes), dtype=np.long - ), # starting offset within starting index - np.arange(len(sizes)), # ending index in dataset - ], - 1, - ) - else: - block_to_dataset_index = _get_block_to_dataset_index_fast( - sizes, - slice_indices, - ) - self._slice_indices = plasma_utils.PlasmaArray(slice_indices) - self._sizes = plasma_utils.PlasmaArray(self._sizes) - self._block_to_dataset_index = plasma_utils.PlasmaArray(block_to_dataset_index) - - @property - def slice_indices(self): - return self._slice_indices.array - - @property - def sizes(self): - return self._sizes.array - - @property - def block_to_dataset_index(self): - return self._block_to_dataset_index.array - - def attr(self, attr: str, index: int): - start_ds_idx, _, _ = self.block_to_dataset_index[index] - return self.dataset.attr(attr, start_ds_idx) - - def __getitem__(self, index): - start_ds_idx, start_offset, end_ds_idx = self.block_to_dataset_index[index] - - buffer = torch.cat( - [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)] - ) - - slice_s, slice_e = self.slice_indices[index] - length = slice_e - slice_s - s, e = start_offset, start_offset + length - item = buffer[s:e] - - if self.include_targets: - # *target* is the original sentence (=item) - # *source* is shifted right by 1 (maybe left-padded with eos) - # *past_target* is shifted right by 2 (left-padded as needed) - if s == 0: - source = torch.cat([item.new([self.eos]), buffer[0 : e - 1]]) - past_target = torch.cat( - [item.new([self.pad, self.eos]), buffer[0 : e - 2]] - ) - else: - source = buffer[s - 1 : e - 1] - if s == 1: - past_target = torch.cat([item.new([self.eos]), buffer[0 : e - 2]]) - else: - past_target = buffer[s - 2 : e - 2] - - return source, item, past_target - - return item - - def __len__(self): - return len(self.slice_indices) - - @property - def supports_prefetch(self): - return getattr(self.dataset, "supports_prefetch", False) - - def prefetch(self, indices): - self.dataset.prefetch( - { - ds_idx - for index in indices - for start_ds_idx, _, end_ds_idx in [self.block_to_dataset_index[index]] - for ds_idx in range(start_ds_idx, end_ds_idx + 1) - } - ) diff 
--git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/token_block_utils_fast.pyx b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/token_block_utils_fast.pyx deleted file mode 100644 index 08af4f30613a7b6ffa965a7c7084acabec8f8749..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/token_block_utils_fast.pyx +++ /dev/null @@ -1,187 +0,0 @@ -# cython: language_level=3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch -from itertools import chain -from libc.math cimport ceil - -cimport cython -cimport numpy as np - -from libc.stdint cimport int32_t, int64_t - -DTYPE = np.int64 -ctypedef int64_t DTYPE_t - - -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.nonecheck(False) -cdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_none_mode(np.ndarray[DTYPE_t, ndim=1] sizes, int block_size): - cdef DTYPE_t total_size = sizes.sum() - cdef DTYPE_t length = ceil(total_size / block_size) - cdef np.ndarray[DTYPE_t, ndim=2] slice_indices = np.zeros([length, 2], dtype=DTYPE) - cdef DTYPE_t[:, :] slice_indices_view = slice_indices - cdef DTYPE_t i - cdef DTYPE_t start - cdef DTYPE_t end - for i in range(length): - start = i * block_size - end = min(start + block_size, total_size) - slice_indices_view[i][0] = start - slice_indices_view[i][1] = end - return slice_indices - - -cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list): - """ - Faster function to convert DTYPE_t list of list. - Only fast when there are huge number of rows and low number of columns. - """ - cdef np.ndarray[DTYPE_t, ndim=1] flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1) - return flat.reshape((len(list_of_list), -1)) - - -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.nonecheck(False) -cpdef np.ndarray[DTYPE_t, ndim=2] _get_slice_indices_fast(np.ndarray[DTYPE_t, ndim=1] sizes, str break_mode, int block_size, int document_sep_len): - cdef DTYPE_t tok_idx = 0 - cdef DTYPE_t sz_idx = 0 - cdef DTYPE_t curr_size = 0 - cdef DTYPE_t i = 0 - cdef DTYPE_t length - cdef DTYPE_t total_size - cdef DTYPE_t[:] sizes_view = sizes - cdef np.ndarray[DTYPE_t, ndim=2] slice_indices - cdef list slice_indices_list = [] - - if break_mode is None or break_mode == 'none': - slice_indices = _get_slice_indices_none_mode(sizes, block_size) - elif break_mode == 'complete': - while sz_idx < len(sizes_view): - if curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0: - curr_size += sizes_view[sz_idx] - sz_idx += 1 - else: - slice_indices_list.append((tok_idx, tok_idx + curr_size)) - tok_idx += curr_size - curr_size = 0 - if curr_size > 0: - slice_indices_list.append((tok_idx, tok_idx + curr_size)) - slice_indices = _fast_convert_to_np_array(slice_indices_list) - elif break_mode == 'complete_doc': - while sz_idx < len(sizes_view): - if ( - (curr_size + sizes_view[sz_idx] <= block_size or curr_size == 0) - # an empty sentence indicates end-of-document: - and sizes_view[sz_idx] != document_sep_len - ): - curr_size += sizes_view[sz_idx] - sz_idx += 1 - else: - # Only keep non-empty documents. 
- if curr_size > 1: - slice_indices_list.append((tok_idx, tok_idx + curr_size)) - tok_idx += curr_size - curr_size = 0 - if sizes_view[sz_idx] == document_sep_len: - tok_idx += sizes_view[sz_idx] - sz_idx += 1 - if curr_size > 1: - slice_indices_list.append((tok_idx, tok_idx + curr_size)) - slice_indices = _fast_convert_to_np_array(slice_indices_list) - elif break_mode == 'eos': - slice_indices = np.zeros((len(sizes), 2), dtype=DTYPE) - cumsum = sizes.cumsum(axis=0) - slice_indices[1:, 0] = cumsum[:cumsum.shape[0] - 1] - slice_indices[:, 1] = cumsum - else: - raise ValueError('Invalid break_mode: ' + break_mode) - return slice_indices - - -@cython.boundscheck(False) -@cython.wraparound(False) -@cython.nonecheck(False) -cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices): - cdef DTYPE_t start_ds_idx - cdef DTYPE_t start_offset - cdef DTYPE_t end_ds_idx - cdef DTYPE_t i - cdef DTYPE_t s - cdef DTYPE_t e - cdef DatasetSearcher ds = DatasetSearcher(sizes) - cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE) - cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index - cdef DTYPE_t[:, :] slice_indices_view = slice_indices - cdef Py_ssize_t x_max = slice_indices.shape[0] - - for i in range(x_max): - s = slice_indices_view[i][0] - e = slice_indices_view[i][1] - ds.seek(s) - start_ds_idx = ds.current_index - start_offset = ds.current_offset - if e <= s: - end_ds_idx = start_ds_idx - else: - ds.seek(e - 1) - end_ds_idx = ds.current_index - block_to_dataset_index_view[i][0] = start_ds_idx # starting index in dataset - block_to_dataset_index_view[i][1] = start_offset # starting offset within starting index - block_to_dataset_index_view[i][2] = end_ds_idx # ending index in dataset - return block_to_dataset_index - - -cdef class DatasetSearcher(object): - """Helper for mapping "flat" indices to indices and offsets in an - underlying dataset.""" - cdef DTYPE_t current_i - cdef DTYPE_t current_offset - cdef DTYPE_t current_index - cdef DTYPE_t[:] sizes - - def __init__(self, DTYPE_t[:] sizes): - self.sizes = sizes - self.reset() - - cdef reset(self): - self.current_offset = 0 # offset within current index in underlying dataset - self.current_i = 0 # "flat" index - self.current_index = 0 # index in underlying dataset - - @cython.boundscheck(False) - @cython.wraparound(False) - @cython.nonecheck(False) - cdef int step(self, DTYPE_t i): - cdef DTYPE_t to_consume - cdef DTYPE_t remaining - if i < self.current_i: - self.reset() - if i > self.current_i: - to_consume = i - self.current_i - remaining = self.sizes[self.current_index] - self.current_offset - if remaining > to_consume: - self.current_offset += to_consume - self.current_i += to_consume - else: - assert remaining >= 0 - self.current_i += remaining - self.current_index += 1 - self.current_offset = 0 - return 1 - return 0 - - @cython.boundscheck(False) - @cython.wraparound(False) - @cython.nonecheck(False) - cdef seek(self, DTYPE_t i): - cdef int not_done = 1 - while not_done == 1: - not_done = self.step(i) - assert self.current_i == i diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/transform_eos_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/transform_eos_dataset.py deleted file mode 100644 index fb14ff018edf13b20f5d0e486692dfb0a37ec6d1..0000000000000000000000000000000000000000 --- 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/transform_eos_dataset.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch - -from . import FairseqDataset - - -class TransformEosDataset(FairseqDataset): - """A :class:`~fairseq.data.FairseqDataset` wrapper that appends/prepends/strips EOS. - - Note that the transformation is applied in :func:`collater`. - - Args: - dataset (~fairseq.data.FairseqDataset): dataset to wrap - eos (int): index of the end-of-sentence symbol - append_eos_to_src (bool, optional): append EOS to the end of src - remove_eos_from_src (bool, optional): remove EOS from the end of src - append_eos_to_tgt (bool, optional): append EOS to the end of tgt - remove_eos_from_tgt (bool, optional): remove EOS from the end of tgt - """ - - def __init__( - self, - dataset, - eos, - append_eos_to_src=False, - remove_eos_from_src=False, - append_eos_to_tgt=False, - remove_eos_from_tgt=False, - has_target=True, - ): - if not isinstance(dataset, FairseqDataset): - raise ValueError("dataset must be an instance of FairseqDataset") - if append_eos_to_src and remove_eos_from_src: - raise ValueError("cannot combine append_eos_to_src and remove_eos_from_src") - if append_eos_to_tgt and remove_eos_from_tgt: - raise ValueError("cannot combine append_eos_to_tgt and remove_eos_from_tgt") - - self.dataset = dataset - self.eos = torch.LongTensor([eos]) - self.append_eos_to_src = append_eos_to_src - self.remove_eos_from_src = remove_eos_from_src - self.append_eos_to_tgt = append_eos_to_tgt - self.remove_eos_from_tgt = remove_eos_from_tgt - self.has_target = has_target - - # precompute how we should adjust the reported sizes - self._src_delta = 0 - self._src_delta += 1 if append_eos_to_src else 0 - self._src_delta -= 1 if remove_eos_from_src else 0 - self._tgt_delta = 0 - self._tgt_delta += 1 if append_eos_to_tgt else 0 - self._tgt_delta -= 1 if remove_eos_from_tgt else 0 - - self._checked_src = False - self._checked_tgt = False - - def _check_src(self, src, expect_eos): - if not self._checked_src: - assert (src[-1] == self.eos[0]) == expect_eos - self._checked_src = True - - def _check_tgt(self, tgt, expect_eos): - if self.has_target and not self._checked_tgt: - assert (tgt[-1] == self.eos[0]) == expect_eos - self._checked_tgt = True - - def __getitem__(self, index): - return self.dataset[index] - - def __len__(self): - return len(self.dataset) - - def collater(self, samples): - def transform(item): - if self.append_eos_to_src: - self.eos = self.eos.to(device=item["source"].device) - self._check_src(item["source"], expect_eos=False) - item["source"] = torch.cat([item["source"], self.eos]) - if self.remove_eos_from_src: - self.eos = self.eos.to(device=item["source"].device) - self._check_src(item["source"], expect_eos=True) - item["source"] = item["source"][:-1] - if self.append_eos_to_tgt: - self.eos = self.eos.to(device=item["target"].device) - self._check_tgt(item["target"], expect_eos=False) - item["target"] = torch.cat([item["target"], self.eos]) - if self.remove_eos_from_tgt: - self.eos = self.eos.to(device=item["target"].device) - self._check_tgt(item["target"], expect_eos=True) - item["target"] = item["target"][:-1] - return item - - samples = list(map(transform, samples)) - return self.dataset.collater(samples) - - def num_tokens(self, index): - return 
self.dataset.num_tokens(index) - - def size(self, index): - if self.has_target: - src_len, tgt_len = self.dataset.size(index) - return (src_len + self._src_delta, tgt_len + self._tgt_delta) - else: - return self.dataset.size(index) - - def ordered_indices(self): - # NOTE: we assume that the ordering does not change based on the - # addition or removal of eos - return self.dataset.ordered_indices() - - @property - def supports_prefetch(self): - return getattr(self.dataset, "supports_prefetch", False) - - def prefetch(self, indices): - return self.dataset.prefetch(indices) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/transform_eos_lang_pair_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/transform_eos_lang_pair_dataset.py deleted file mode 100644 index 1dd3d93d2b41898ba6b25ba0255abbcebcf495b7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/data/transform_eos_lang_pair_dataset.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -from typing import Optional - -import torch - -from . import FairseqDataset - - -class TransformEosLangPairDataset(FairseqDataset): - """A :class:`~fairseq.data.FairseqDataset` wrapper that transform bos on - collated samples of language pair dataset. - - Note that the transformation is applied in :func:`collater`. - - Args: - dataset (~fairseq.data.FairseqDataset): dataset that collates sample into - LanguagePairDataset schema - src_eos (int): original source end-of-sentence symbol index to be replaced - new_src_eos (int, optional): new end-of-sentence symbol index to replace source eos symbol - tgt_bos (int, optional): original target beginning-of-sentence symbol index to be replaced - new_tgt_bos (int, optional): new beginning-of-sentence symbol index to replace at the - beginning of 'prev_output_tokens' - """ - - def __init__( - self, - dataset: FairseqDataset, - src_eos: int, - new_src_eos: Optional[int] = None, - tgt_bos: Optional[int] = None, - new_tgt_bos: Optional[int] = None, - ): - self.dataset = dataset - self.src_eos = src_eos - self.new_src_eos = new_src_eos - self.tgt_bos = tgt_bos - self.new_tgt_bos = new_tgt_bos - - def __getitem__(self, index): - return self.dataset[index] - - def __len__(self): - return len(self.dataset) - - def collater(self, samples, **extra_args): - samples = self.dataset.collater(samples, **extra_args) - - if self.new_src_eos is not None: - if self.dataset.left_pad_source: - assert ( - samples["net_input"]["src_tokens"][:, -1] != self.src_eos - ).sum() == 0 - samples["net_input"]["src_tokens"][:, -1] = self.new_src_eos - else: - eos_idx = samples["net_input"]["src_lengths"] - 1 - assert ( - samples["net_input"]["src_tokens"][ - torch.arange(eos_idx.size(0)), eos_idx - ] - != self.src_eos - ).sum() == 0 - eos_idx = eos_idx.resize_(len(samples["net_input"]["src_lengths"]), 1) - samples["net_input"]["src_tokens"].scatter_( - 1, eos_idx, self.new_src_eos - ) - - if ( - self.new_tgt_bos is not None - and "prev_output_tokens" in samples["net_input"] - ): - if self.dataset.left_pad_target: - # TODO: support different padding direction on target side - raise NotImplementedError( - "TransformEosLangPairDataset does not implement --left-pad-target True option" - ) - else: - assert ( - 
samples["net_input"]["prev_output_tokens"][:, 0] != self.tgt_bos - ).sum() == 0 - samples["net_input"]["prev_output_tokens"][:, 0] = self.new_tgt_bos - - return samples - - def num_tokens(self, index): - return self.dataset.num_tokens(index) - - def size(self, index): - return self.dataset.size(index) - - @property - def sizes(self): - # dataset.sizes can be a dynamically computed sizes: - return self.dataset.sizes - - def ordered_indices(self): - return self.dataset.ordered_indices() - - @property - def supports_prefetch(self): - return getattr(self.dataset, "supports_prefetch", False) - - def prefetch(self, indices): - return self.dataset.prefetch(indices) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/__init__.py deleted file mode 100644 index 32870814d5418ab7a9846332e9b43929a0632a9f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .utils import ChoiceEnum, FairseqDataclass - - -__all__ = ["FairseqDataclass", "ChoiceEnum"] diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/constants.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/constants.py deleted file mode 100644 index 21b36450f9107ac3c1f991505c8c4be05435a294..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/constants.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.dataclass.utils import ChoiceEnum - - -LOG_FORMAT_CHOICES = ChoiceEnum(["json", "none", "simple", "tqdm"]) -DDP_BACKEND_CHOICES = ChoiceEnum(["c10d", "no_c10d"]) -DISTRIBUTED_WRAPPER_CHOICES = ChoiceEnum(["DDP", "SlowMo"]) -ZERO_SHARDING_CHOICES = ChoiceEnum(["none", "os"]) -PIPELINE_CHECKPOINT_CHOICES = ChoiceEnum(["always", "never", "except_last"]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/data_class.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/data_class.py deleted file mode 100644 index 749da6ec4dcfe4950db5723522502ce0518f3094..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/data_class.py +++ /dev/null @@ -1,777 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import sys -from argparse import Namespace -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch -from fairseq.criterions import CRITERION_DATACLASS_REGISTRY -from fairseq.data.indexed_dataset import get_available_dataset_impl -from fairseq.dataclass.constants import ( - DDP_BACKEND_CHOICES, - DISTRIBUTED_WRAPPER_CHOICES, - LOG_FORMAT_CHOICES, - PIPELINE_CHECKPOINT_CHOICES, - ZERO_SHARDING_CHOICES, -) -from fairseq.dataclass.utils import ChoiceEnum, FairseqDataclass -from fairseq.models import ARCH_MODEL_REGISTRY, MODEL_DATACLASS_REGISTRY -from fairseq.optim import OPTIMIZER_DATACLASS_REGISTRY -from fairseq.optim.bmuf import FairseqBMUFConfig -from fairseq.optim.lr_scheduler import LR_SCHEDULER_DATACLASS_REGISTRY -from fairseq.tasks import TASK_DATACLASS_REGISTRY -from hydra.core.config_store import ConfigStore - - -@dataclass -class CommonParams(FairseqDataclass): - # This is the core dataclass including common parameters shared by all different jobs. Please append your params to other dataclasses if they were - # used for a particular purpose or task, such as those dedicated for `distributed training`, `optimization`, etc. - no_progress_bar: bool = field( - default=False, metadata={"help": "disable progress bar"} - ) - log_interval: int = field( - default=100, - metadata={ - "help": "log progress every N batches (when progress bar is disabled)" - }, - ) - log_format: Optional[LOG_FORMAT_CHOICES] = field( - default=None, metadata={"help": "log format to use"} - ) - tensorboard_logdir: Optional[str] = field( - default=None, - metadata={ - "help": "path to save logs for tensorboard, should match --logdir " - "of running tensorboard (default: no tensorboard logging)" - }, - ) - seed: int = field( - default=1, metadata={"help": "pseudo random number generator seed"} - ) - cpu: bool = field(default=False, metadata={"help": "use CPU instead of CUDA"}) - tpu: bool = field(default=False, metadata={"help": "use TPU instead of CUDA"}) - bf16: bool = field(default=False, metadata={"help": "use bfloat16; implies --tpu"}) - npu: bool = field(default=False, metadata={"help": "use NPU instead of CUDA"}) - npu_id: int = field( - default=0, metadata={"help": "which npu id to train"} - ) - memory_efficient_bf16: bool = field( - default=False, - metadata={ - "help": "use a memory-efficient version of BF16 training; implies --bf16" - }, - ) - fp16: bool = field(default=False, metadata={"help": "use FP16"}) - memory_efficient_fp16: bool = field( - default=False, - metadata={ - "help": "use a memory-efficient version of FP16 training; implies --fp16" - }, - ) - fp16_no_flatten_grads: bool = field( - default=False, metadata={"help": "don't flatten FP16 grads tensor"} - ) - fp16_init_scale: int = field( - default=2 ** 7, metadata={"help": "default FP16 loss scale"} - ) - fp16_scale_window: Optional[int] = field( - default=None, - metadata={"help": "number of updates before increasing loss scale"}, - ) - fp16_scale_tolerance: float = field( - default=0.0, - metadata={ - "help": "pct of updates that can overflow before decreasing the loss scale" - }, - ) - min_loss_scale: float = field( - default=1e-4, - metadata={"help": "minimum FP16 loss scale, after which training is stopped"}, - ) - threshold_loss_scale: Optional[float] = field( - default=None, metadata={"help": "threshold FP16 loss scale from below"} - ) - user_dir: Optional[str] = field( - default=None, - metadata={ - "help": "path to a python module containing custom extensions (tasks and/or 
architectures)" - }, - ) - empty_cache_freq: int = field( - default=0, - metadata={"help": "how often to clear the PyTorch CUDA cache (0 to disable)"}, - ) - all_gather_list_size: int = field( - default=16384, - metadata={"help": "number of bytes reserved for gathering stats from workers"}, - ) - model_parallel_size: int = field( - default=1, metadata={"help": "total number of GPUs to parallelize model over"} - ) - checkpoint_suffix: str = field( - default="", metadata={"help": "suffix to add to the checkpoint file name"} - ) - checkpoint_shard_count: int = field( - default=1, - metadata={ - "help": "Number of shards containing the checkpoint - " - "if the checkpoint is over 300GB, it is preferable " - "to split it into shards to prevent OOM on CPU while loading " - "the checkpoint" - }, - ) - quantization_config_path: Optional[str] = field( - default=None, metadata={"help": "path to quantization config file"} - ) - profile: bool = field( - default=False, metadata={"help": "enable autograd profiler emit_nvtx"} - ) - - -@dataclass -class DistributedTrainingParams(FairseqDataclass): - distributed_world_size: int = field( - default=max(1, torch.cuda.device_count()), - metadata={ - "help": "total number of GPUs across all nodes (default: all visible GPUs)" - }, - ) - distributed_rank: Optional[int] = field( - default=0, metadata={"help": "rank of the current worker"} - ) - distributed_backend: str = field( - default="hccl", metadata={"help": "distributed backend"} - ) - distributed_init_method: Optional[str] = field( - default=None, - metadata={ - "help": "typically tcp://hostname:port that will be used to " - "establish initial connetion" - }, - ) - distributed_port: int = field( - default=-1, - metadata={ - "help": "port number (not required if using --distributed-init-method)" - }, - ) - device_id: int = field( - default=0, - metadata={ - "help": "which GPU to use (usually configured automatically)", - "argparse_alias": "--local_rank", - }, - ) - distributed_no_spawn: bool = field( - default=False, - metadata={ - "help": "do not spawn multiple processes even if multiple GPUs are visible" - }, - ) - ddp_backend: DDP_BACKEND_CHOICES = field( - default="c10d", metadata={"help": "DistributedDataParallel backend"} - ) - bucket_cap_mb: int = field( - default=25, metadata={"help": "bucket size for reduction"} - ) - fix_batches_to_gpus: bool = field( - default=False, - metadata={ - "help": "don't shuffle batches between GPUs; this reduces overall " - "randomness and may affect precision but avoids the cost of re-reading the data" - }, - ) - find_unused_parameters: bool = field( - default=False, - metadata={ - "help": "disable unused parameter detection (not applicable to " - "no_c10d ddp-backend" - }, - ) - fast_stat_sync: bool = field( - default=False, - metadata={"help": "[deprecated] this is now defined per Criterion"}, - ) - broadcast_buffers: bool = field( - default=False, - metadata={ - "help": "Copy non-trainable parameters between GPUs, such as " - "batchnorm population statistics" - }, - ) - distributed_wrapper: DISTRIBUTED_WRAPPER_CHOICES = field( - default="DDP", metadata={"help": "DistributedDataParallel backend"} - ) - slowmo_momentum: Optional[float] = field( - default=None, - metadata={ - "help": "SlowMo momentum term; by default use 0.0 for 16 GPUs, " - "0.2 for 32 GPUs; 0.5 for 64 GPUs, 0.6 for > 64 GPUs" - }, - ) - slowmo_algorithm: str = field( - default="LocalSGD", metadata={"help": "whether to use LocalSGD or SGP"} - ) - localsgd_frequency: int = field( - default=3, 
metadata={"help": "Local SGD allreduce frequency"} - ) - nprocs_per_node: int = field( - default=max(1, torch.cuda.device_count()), - metadata={ - "help": "number of GPUs in each node. An allreduce operation across GPUs in " - "a node is very fast. Hence, we do allreduce across GPUs in a node, " - "and gossip across different nodes" - }, - ) - pipeline_model_parallel: bool = field( - default=False, - metadata={"help": "if set, use pipeline model parallelism across GPUs"}, - ) - pipeline_balance: str = field( - default=None, - metadata={ - "help": "partition the model into N_K pieces, where each piece " - "contains N_i layers. The sum(args.pipeline_balance) " - "should equal the total number of layers in the model" - }, - ) - pipeline_devices: str = field( - default=None, - metadata={ - "help": "a list of device indices indicating which device to place " - "each of the N_K partitions. The length of this list should " - "equal the length of the --pipeline-balance argument" - }, - ) - pipeline_chunks: int = field( - default=0, metadata={"help": "microbatch count for pipeline model parallelism"} - ) - pipeline_encoder_balance: str = field( - default=None, - metadata={ - "help": "partition the pipeline parallel encoder into N_K pieces, where each piece " - "contains N_i layers. The sum(args.pipeline_encoder_balance) " - "should equal the total number of encoder layers in the model" - }, - ) - pipeline_encoder_devices: str = field( - default=None, - metadata={ - "help": "a list of device indices indicating which device to place " - "each of the N_K partitions. The length of this list should " - "equal the length of the --pipeline-encoder-balance argument" - }, - ) - pipeline_decoder_balance: str = field( - default=None, - metadata={ - "help": "partition the pipeline parallel decoder into N_K pieces, where each piece " - "contains N_i layers. The sum(args.pipeline_decoder_balance) " - "should equal the total number of decoder layers in the model" - }, - ) - pipeline_decoder_devices: str = field( - default=None, - metadata={ - "help": "a list of device indices indicating which device to place " - "each of the N_K partitions. 
The length of this list should " - "equal the length of the --pipeline-decoder-balance argument" - }, - ) - pipeline_checkpoint: PIPELINE_CHECKPOINT_CHOICES = field( - default="never", - metadata={"help": "checkpointing mode for pipeline model parallelism"}, - ) - zero_sharding: ZERO_SHARDING_CHOICES = field( - default="none", metadata={"help": "ZeRO sharding"} - ) - - -@dataclass -class DatasetParams(FairseqDataclass): - num_workers: int = field( - default=1, metadata={"help": "how many subprocesses to use for data loading"} - ) - skip_invalid_size_inputs_valid_test: bool = field( - default=False, - metadata={"help": "ignore too long or too short lines in valid and test set"}, - ) - max_tokens: Optional[int] = field( - default=None, metadata={"help": "maximum number of tokens in a batch"} - ) - batch_size: Optional[int] = field( - default=None, metadata={"help": "number of examples in a batch"} - ) - required_batch_size_multiple: int = field( - default=8, metadata={"help": "batch size will be a multiplier of this value"} - ) - required_seq_len_multiple: int = field( - default=1, - metadata={ - "help": "maximum sequence length in batch will be a multiplier of this value" - }, - ) - dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = field( - default=None, metadata={"help": "output dataset implementation"} - ) - data_buffer_size: int = field( - default=10, metadata={"help": "Number of batches to preload"} - ) - train_subset: str = field( - default="train", - metadata={"help": "data subset to use for training (e.g. train, valid, test)"}, - ) - valid_subset: str = field( - default="valid", - metadata={ - "help": "comma separated list of data subsets to use for validation" - " (e.g. train, valid, test)" - }, - ) - validate_interval: int = field( - default=1, metadata={"help": "validate every N epochs"} - ) - validate_interval_updates: int = field( - default=0, metadata={"help": "validate every N updates"} - ) - validate_after_updates: int = field( - default=0, metadata={"help": "dont validate until reaching this many updates"} - ) - fixed_validation_seed: Optional[int] = field( - default=None, metadata={"help": "specified random seed for validation"} - ) - disable_validation: bool = field( - default=False, metadata={"help": "disable validation"} - ) - max_tokens_valid: Optional[int] = field( - default=None, - metadata={ - "help": "maximum number of tokens in a validation batch" - " (defaults to --max-tokens)" - }, - ) - batch_size_valid: Optional[int] = field( - default=None, - metadata={ - "help": "batch size of the validation batch" " (defaults to --batch-size)" - }, - ) - curriculum: int = field( - default=0, metadata={"help": "don't shuffle batches for first N epochs"} - ) - gen_subset: str = field( - default="test", - metadata={"help": "data subset to generate (train, valid, test)"}, - ) - num_shards: int = field( - default=1, metadata={"help": "shard generation over N shards"} - ) - shard_id: int = field( - default=0, metadata={"help": "id of the shard to generate (id < num_shards)"} - ) - - -@dataclass -class OptimizationParams(FairseqDataclass): - max_epoch: int = field( - default=0, metadata={"help": "force stop training at specified epoch"} - ) - max_update: int = field( - default=0, metadata={"help": "force stop training at specified update"} - ) - stop_time_hours: float = field( - default=0, - metadata={ - "help": "force stop training after specified cumulative time (if >0)" - }, - ) - clip_norm: float = field( - default=0.0, metadata={"help": "clip threshold of 
gradients"} - ) - sentence_avg: bool = field( - default=False, - metadata={ - "help": "normalize gradients by the number of sentences in a batch" - " (default is to normalize by number of tokens)" - }, - ) - update_freq: List[int] = field( - default_factory=lambda: [1], - metadata={"help": "update parameters every N_i batches, when in epoch i"}, - ) - lr: List[float] = field( - default_factory=lambda: [0.25], - metadata={ - "help": "learning rate for the first N epochs; all epochs >N using LR_N" - " (note: this may be interpreted differently depending on --lr-scheduler)" - }, - ) - min_lr: float = field( - default=-1.0, - metadata={"help": "stop training when the learning rate reaches this minimum"}, - ) - use_bmuf: bool = field( - default=False, - metadata={ - "help": "specify global optimizer for syncing models on different GPUs/shards" - }, - ) - - -@dataclass -class CheckpointParams(FairseqDataclass): - save_dir: str = field( - default="checkpoints", metadata={"help": "path to save checkpoints"} - ) - restore_file: str = field( - default="checkpoint_last.pt", - metadata={ - "help": "filename from which to load checkpoint " - "(default: /checkpoint_last.pt" - }, - ) - finetune_from_model: Optional[str] = field( - default=None, - metadata={ - "help": "finetune from a pretrained model; note that meters and lr scheduler will be reset" - }, - ) - reset_dataloader: bool = field( - default=False, - metadata={ - "help": "if set, does not reload dataloader state from the checkpoint" - }, - ) - reset_lr_scheduler: bool = field( - default=False, - metadata={ - "help": "if set, does not load lr scheduler state from the checkpoint" - }, - ) - reset_meters: bool = field( - default=False, - metadata={"help": "if set, does not load meters from the checkpoint"}, - ) - reset_optimizer: bool = field( - default=False, - metadata={"help": "if set, does not load optimizer state from the checkpoint"}, - ) - optimizer_overrides: str = field( - default="{}", - metadata={ - "help": "a dictionary used to override optimizer args when loading a checkpoint" - }, - ) - save_interval: int = field( - default=1, metadata={"help": "save a checkpoint every N epochs"} - ) - save_interval_updates: int = field( - default=0, metadata={"help": "save a checkpoint (and validate) every N updates"} - ) - keep_interval_updates: int = field( - default=-1, - metadata={ - "help": "keep the last N checkpoints saved with --save-interval-updates" - }, - ) - keep_last_epochs: int = field( - default=-1, metadata={"help": "keep last N epoch checkpoints"} - ) - keep_best_checkpoints: int = field( - default=-1, metadata={"help": "keep best N checkpoints based on scores"} - ) - no_save: bool = field( - default=False, metadata={"help": "don't save models or checkpoints"} - ) - no_epoch_checkpoints: bool = field( - default=False, metadata={"help": "only store last and best checkpoints"} - ) - no_last_checkpoints: bool = field( - default=False, metadata={"help": "don't store last checkpoints"} - ) - no_save_optimizer_state: bool = field( - default=False, - metadata={"help": "don't save optimizer-state as part of checkpoint"}, - ) - best_checkpoint_metric: str = field( - default="loss", metadata={"help": 'metric to use for saving "best" checkpoints'} - ) - maximize_best_checkpoint_metric: bool = field( - default=False, - metadata={ - "help": 'select the largest metric value for saving "best" checkpoints' - }, - ) - patience: int = field( - default=-1, - metadata={ - "help": ( - "early stop training if valid performance doesn't " - "improve for N 
consecutive validation runs; note " - "that this is influenced by --validate-interval" - ) - }, - ) - - -@dataclass -class CommonEvalParams(FairseqDataclass): - path: Optional[str] = field( - default=None, metadata={"help": "path(s) to model file(s), colon separated"} - ) - remove_bpe: Optional[str] = field( - default=None, - metadata={ - "help": "remove BPE tokens before scoring (can be set to sentencepiece)", - "argparse_const": "@@ ", - }, - ) - quiet: bool = field(default=False, metadata={"help": "only print final scores"}) - model_overrides: str = field( - default="{}", - metadata={ - "help": "a dictionary used to override model args at generation that were used during model training" - }, - ) - results_path: Optional[str] = field( - default=None, metadata={"help": "path to save eval results (optional)"} - ) - - -@dataclass -class EvalLMParams(FairseqDataclass): - output_word_probs: bool = field( - default=False, - metadata={ - "help": "if set, outputs words and their predicted log probabilities to standard output" - }, - ) - output_word_stats: bool = field( - default=False, - metadata={ - "help": "if set, outputs word statistics such as word count, average probability, etc" - }, - ) - context_window: int = field( - default=0, - metadata={ - "help": "ensures that every evaluated token has access to a context of at least this size, if possible" - }, - ) - softmax_batch: int = field( - default=sys.maxsize, - metadata={ - "help": "if BxT is more than this, will batch the softmax over vocab to this amount of tokens, in order to fit into GPU memory" - }, - ) - - -@dataclass -class TrainingConfig(FairseqDataclass): - """Config for training, a composition of training params""" - - common: CommonParams = CommonParams() - distributed_training: DistributedTrainingParams = DistributedTrainingParams() - dataset: DatasetParams = DatasetParams() - optimization: OptimizationParams = OptimizationParams() - checkpoint: CheckpointParams = CheckpointParams() - bmuf: FairseqBMUFConfig = FairseqBMUFConfig() - - -@dataclass -class EvalLMConfig(FairseqDataclass): - """Config for eval lm, a composition of eval_lm params""" - - common: CommonParams = CommonParams() - distributed_training: DistributedTrainingParams = DistributedTrainingParams() - dataset: DatasetParams = DatasetParams() - optimization: OptimizationParams = OptimizationParams() - checkpoint: CheckpointParams = CheckpointParams() - bmuf: FairseqBMUFConfig = FairseqBMUFConfig() - common_eval: CommonEvalParams = CommonEvalParams() - eval_lm: EvalLMParams = EvalLMParams() - - -def register_params_dataclass( - cs: ConfigStore, name: str, group: str, data_class: Type[FairseqDataclass] -) -> None: - """register params dataclass in config store""" - node_ = data_class(_name=data_class.name()) - cs.store(name=name, group=group, node=node_) - - -def register_module_dataclass( - cs: ConfigStore, registry: Dict[str, Any], group: str -) -> None: - """register dataclasses defined in modules in config store, for example, in migrated tasks, models, etc.""" - # note that if `group == model`, we register all model archs, not the model name. 
- for k, v in registry.items(): - if v is not None: - node_ = v(_name=k) - cs.store(name=k, group=group, node=node_) - - -def register_training_hydra_cfg(cs: ConfigStore, name: str = "default") -> None: - """cs: config store instance, register common training configs""" - - register_params_dataclass( - cs, name="training_params", group="params", data_class=TrainingConfig - ) - - register_module_dataclass(cs, TASK_DATACLASS_REGISTRY, "task") - register_module_dataclass(cs, MODEL_DATACLASS_REGISTRY, "model") - register_module_dataclass(cs, CRITERION_DATACLASS_REGISTRY, "criterion") - register_module_dataclass(cs, OPTIMIZER_DATACLASS_REGISTRY, "optimizer") - register_module_dataclass(cs, LR_SCHEDULER_DATACLASS_REGISTRY, "lr_scheduler") - - -def register_eval_lm_hydra_cfg(cs: ConfigStore, name: str = "default") -> None: - """cs: config store instance, register common training configs""" - - register_params_dataclass( - cs, name="eval_lm_params", group="params", data_class=EvalLMConfig - ) - - register_module_dataclass(cs, TASK_DATACLASS_REGISTRY, "task") - register_module_dataclass(cs, CRITERION_DATACLASS_REGISTRY, "criterion") - register_module_dataclass(cs, OPTIMIZER_DATACLASS_REGISTRY, "optimizer") - register_module_dataclass(cs, LR_SCHEDULER_DATACLASS_REGISTRY, "lr_scheduler") - - -def _override_attr( - sub_node: str, data_class: Type[FairseqDataclass], args: Namespace -) -> List[str]: - overrides = [] - for k in data_class.__dataclass_fields__.keys(): - if k == "_name": - # private member, skip - continue - if not hasattr(args, k): - # print(f"cannot override {sub_node}.{k} since args does not have attribute {k}") - continue - if getattr(args, k) is None: - overrides.append("{}.{}=null".format(sub_node, k)) - elif getattr(args, k) == "": - overrides.append("{}.{}=''".format(sub_node, k)) - elif isinstance(getattr(args, k), str): - if ( - getattr(args, k).startswith("[") - or getattr(args, k).startswith("(") - or getattr(args, k).startswith("{") - or ("," in getattr(args, k)) - ): - overrides.append("{}.{}='{}'".format(sub_node, k, getattr(args, k))) - else: - overrides.append("{}.{}={}".format(sub_node, k, getattr(args, k))) - else: - overrides.append("{}.{}={}".format(sub_node, k, getattr(args, k))) - return overrides - - -def override_training_args(args: Namespace) -> Tuple[List[str], List[str]]: - overrides = [] - - overrides.extend(_override_attr("params.common", CommonParams, args)) - overrides.extend(_override_attr("params.dataset", DatasetParams, args)) - overrides.extend( - _override_attr("params.distributed_training", DistributedTrainingParams, args) - ) - overrides.extend(_override_attr("params.optimization", OptimizationParams, args)) - overrides.extend(_override_attr("params.checkpoint", CheckpointParams, args)) - overrides.extend(_override_attr("params.bmuf", FairseqBMUFConfig, args)) - module_overrides, module_deletes = override_module_args(args) - overrides.extend(module_overrides) - - return overrides, module_deletes - - -def override_eval_lm_args(args: Namespace) -> Tuple[List[str], List[str]]: - overrides = [] - - overrides.extend(_override_attr("params.common", CommonParams, args)) - overrides.extend(_override_attr("params.dataset", DatasetParams, args)) - overrides.extend( - _override_attr("params.distributed_training", DistributedTrainingParams, args) - ) - overrides.extend(_override_attr("params.common_eval", CommonEvalParams, args)) - overrides.extend(_override_attr("params.eval_lm", EvalLMParams, args)) - overrides.extend(_override_attr("params.bmuf", 
FairseqBMUFConfig, args)) - module_overrides, module_deletes = override_module_args(args) - overrides.extend(module_overrides) - - return overrides, module_deletes - - -def override_module_args(args: Namespace) -> Tuple[List[str], List[str]]: - """use the field in args to overrides those in cfg""" - overrides = [] - deletes = [] - - if args is not None: - assert ( - hasattr(args, "task") - and hasattr(args, "criterion") - and hasattr(args, "optimizer") - and hasattr(args, "lr_scheduler") - ) - if args.task in TASK_DATACLASS_REGISTRY: - overrides.append("task={}".format(args.task)) - overrides.append("task._name={}".format(args.task)) - overrides.extend( - _override_attr("task", TASK_DATACLASS_REGISTRY[args.task], args) - ) - else: - deletes.append("task") - if args.criterion in CRITERION_DATACLASS_REGISTRY: - overrides.append("criterion={}".format(args.criterion)) - overrides.append("criterion._name={}".format(args.criterion)) - overrides.extend( - _override_attr( - "criterion", CRITERION_DATACLASS_REGISTRY[args.criterion], args - ) - ) - else: - deletes.append("criterion") - if args.optimizer in OPTIMIZER_DATACLASS_REGISTRY: - overrides.append("optimizer={}".format(args.optimizer)) - overrides.append("optimizer._name={}".format(args.optimizer)) - overrides.extend( - _override_attr( - "optimizer", OPTIMIZER_DATACLASS_REGISTRY[args.optimizer], args - ) - ) - else: - deletes.append("optimizer") - if args.lr_scheduler in LR_SCHEDULER_DATACLASS_REGISTRY: - overrides.append("lr_scheduler={}".format(args.lr_scheduler)) - overrides.append("lr_scheduler._name={}".format(args.lr_scheduler)) - overrides.extend( - _override_attr( - "lr_scheduler", - LR_SCHEDULER_DATACLASS_REGISTRY[args.lr_scheduler], - args, - ) - ) - else: - deletes.append("lr_scheduler") - - no_dc = True - if hasattr(args, "arch"): - if args.arch in ARCH_MODEL_REGISTRY: - m_cls = ARCH_MODEL_REGISTRY[args.arch] - dc = getattr(m_cls, "__dataclass", None) - if dc is not None: - overrides.append("model={}".format(args.arch)) - overrides.append("model._name={}".format(args.arch)) - # override model params with those exist in args - overrides.extend(_override_attr("model", dc, args)) - no_dc = False - if no_dc: - deletes.append("model") - - return overrides, deletes diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/utils.py deleted file mode 100644 index 836a077707d9c42c879cfa21bed63ec967f1534d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/dataclass/utils.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from argparse import ArgumentParser -from dataclasses import MISSING, dataclass -from enum import Enum -from typing import Any, Dict, List, Optional - - -def eval_str_list(x, x_type=float): - if x is None: - return None - if isinstance(x, str): - x = eval(x) - try: - return list(map(x_type, x)) - except TypeError: - return [x_type(x)] - - -class StrEnum(Enum): - def __str__(self): - return self.value - - def __eq__(self, other: str): - return self.value == other - - def __repr__(self): - return self.value - - -def ChoiceEnum(choices: List[str]): - """return the Enum class used to enforce list of choices""" - return StrEnum("Choices", {k: k for k in choices}) - - -@dataclass -class FairseqDataclass: - """fairseq base dataclass that supported fetching attributes and metas""" - - _name: Optional[str] = None - - @staticmethod - def name(): - return None - - def _get_all_attributes(self) -> List[str]: - return [k for k in self.__dataclass_fields__.keys()] - - def _get_meta( - self, attribute_name: str, meta: str, default: Optional[Any] = None - ) -> Any: - return self.__dataclass_fields__[attribute_name].metadata.get(meta, default) - - def _get_name(self, attribute_name: str) -> str: - return self.__dataclass_fields__[attribute_name].name - - def _get_default(self, attribute_name: str) -> Any: - if hasattr(self, attribute_name): - if str(getattr(self, attribute_name)).startswith("${"): - return str(getattr(self, attribute_name)) - elif str(self.__dataclass_fields__[attribute_name].default).startswith( - "${" - ): - return str(self.__dataclass_fields__[attribute_name].default) - elif ( - getattr(self, attribute_name) - != self.__dataclass_fields__[attribute_name].default - ): - return getattr(self, attribute_name) - return self.__dataclass_fields__[attribute_name].default - - def _get_default_factory(self, attribute_name: str) -> Any: - if hasattr(self, attribute_name): - if str(getattr(self, attribute_name)).startswith("${"): - return str(getattr(self, attribute_name)) - elif str(self.__dataclass_fields__[attribute_name].default).startswith( - "${" - ): - return str(self.__dataclass_fields__[attribute_name].default) - elif ( - getattr(self, attribute_name) - != self.__dataclass_fields__[attribute_name].default_factory() - ): - return getattr(self, attribute_name) - return self.__dataclass_fields__[attribute_name].default_factory() - - def _get_type(self, attribute_name: str) -> Any: - return self.__dataclass_fields__[attribute_name].type - - def _get_help(self, attribute_name: str) -> Any: - return self._get_meta(attribute_name, "help") - - def _get_argparse_const(self, attribute_name: str) -> Any: - return self._get_meta(attribute_name, "argparse_const") - - def _get_argparse_alias(self, attribute_name: str) -> Any: - return self._get_meta(attribute_name, "argparse_alias") - - def _get_choices(self, attribute_name: str) -> Any: - return self._get_meta(attribute_name, "choices") - - -def gen_parser_from_dataclass( - parser: ArgumentParser, - dataclass_instance: FairseqDataclass, - delete_default: bool = False, -) -> None: - """convert a dataclass instance to tailing parser arguments""" - import re - - def argparse_name(name: str): - if name == "data": - # normally data is positional args - return name - if name == "_name": - # private member, skip - return None - return "--" + name.replace("_", "-") - - def interpret_dc_type(field_type): - if isinstance(field_type, str): - raise RuntimeError() - typestring = str(field_type) - if re.match(r"(typing.|^)Union\[(.*), NoneType\]$", typestring): - 
return field_type.__args__[0] - return field_type - - def get_kwargs_from_dc( - dataclass_instance: FairseqDataclass, k: str - ) -> Dict[str, Any]: - """k: dataclass attributes""" - field_type = dataclass_instance._get_type(k) - inter_type = interpret_dc_type(field_type) - if isinstance(inter_type, type) and issubclass(inter_type, List): - field_default = dataclass_instance._get_default_factory(k) - else: - field_default = dataclass_instance._get_default(k) - - if isinstance(inter_type, type) and issubclass(inter_type, Enum): - field_choices = [t.value for t in list(inter_type)] - else: - field_choices = None - - field_help = dataclass_instance._get_help(k) - field_const = dataclass_instance._get_argparse_const(k) - kwargs = {} - if isinstance(field_default, str) and field_default.startswith("${"): - kwargs["default"] = field_default - else: - if field_default is MISSING: - kwargs["required"] = True - if field_choices is not None: - kwargs["choices"] = field_choices - if (isinstance(inter_type, type) and issubclass(inter_type, List)) or ( - "List" in str(inter_type) - ): - if "int" in str(inter_type): - kwargs["type"] = lambda x: eval_str_list(x, int) - elif "float" in str(inter_type): - kwargs["type"] = lambda x: eval_str_list(x, float) - elif "str" in str(inter_type): - kwargs["type"] = lambda x: eval_str_list(x, str) - else: - raise NotImplementedError() - if field_default is not MISSING: - kwargs["default"] = ",".join(map(str, field_default)) - elif ( - isinstance(inter_type, type) and issubclass(inter_type, Enum) - ) or "Enum" in str(inter_type): - kwargs["type"] = str - if field_default is not MISSING: - if isinstance(field_default, Enum): - kwargs["default"] = field_default.value - else: - kwargs["default"] = field_default - elif inter_type is bool: - kwargs["action"] = ( - "store_false" if field_default is True else "store_true" - ) - kwargs["default"] = field_default - else: - kwargs["type"] = inter_type - if field_default is not MISSING: - kwargs["default"] = field_default - - kwargs["help"] = field_help - if field_const is not None: - kwargs["const"] = field_const - kwargs["nargs"] = "?" - return kwargs - - for k in dataclass_instance._get_all_attributes(): - field_name = argparse_name(dataclass_instance._get_name(k)) - if field_name is None: - continue - - kwargs = get_kwargs_from_dc(dataclass_instance, k) - - field_args = [field_name] - alias = dataclass_instance._get_argparse_alias(k) - if alias is not None: - field_args.append(alias) - - if "default" in kwargs: - if isinstance(kwargs["default"], str) and kwargs["default"].startswith( - "${" - ): - if kwargs["help"] is None: - # this is a field with a name that will be added elsewhere - continue - else: - del kwargs["default"] - if delete_default: - del kwargs["default"] - try: - parser.add_argument(*field_args, **kwargs) - except ArgumentError: - pass diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/distributed_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/distributed_utils.py deleted file mode 100644 index ed1072d0a592170f538e3e5663d3e35a56e13cfc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/distributed_utils.py +++ /dev/null @@ -1,450 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import logging -import os -import pickle -import random -import socket -import struct -import subprocess -import warnings -from collections import OrderedDict -from typing import Any, Dict, Mapping - -import torch -import torch.distributed as dist -from fairseq import utils - - -logger = logging.getLogger(__name__) - - -def is_master(args): - return args.distributed_rank == 0 - - -def infer_init_method(args, force_distributed=False): - if args.distributed_init_method is not None or getattr(args, "tpu", False): - return - - if args.pipeline_model_parallel: - balance_exists = ( - args.pipeline_balance is not None - or args.pipeline_encoder_balance is not None - or args.pipeline_decoder_balance is not None - ) - devices_exist = ( - args.pipeline_devices is not None - or args.pipeline_encoder_devices is not None - or args.pipeline_decoder_devices is not None - ) - if not balance_exists: - raise ValueError( - "--pipeline-balance is currently required for pipeline model parallelism" - ) - if not devices_exist: - raise ValueError( - "--pipeline-devices is currently required for pipeline model parallelism" - ) - - args.pipeline_balance = utils.eval_str_list(args.pipeline_balance, type=int) - if args.pipeline_devices is not None: - args.pipeline_devices = utils.eval_str_list(args.pipeline_devices, type=int) - num_pipeline_devices = len(set(args.pipeline_devices)) - else: - args.pipeline_encoder_devices = utils.eval_str_list( - args.pipeline_encoder_devices, type=int - ) - args.pipeline_decoder_devices = utils.eval_str_list( - args.pipeline_decoder_devices, type=int - ) - num_pipeline_devices = len( - set(args.pipeline_encoder_devices + args.pipeline_decoder_devices) - ) - gpus_per_node = torch.cuda.device_count() - assert ( - gpus_per_node >= num_pipeline_devices - and gpus_per_node % num_pipeline_devices == 0 - ), ( - "the number of unique device IDs in --pipeline-devices must evenly divide " - "the number of GPUs per node (multi-node pipelining is not yet supported)" - ) - num_pipelines_per_node = gpus_per_node // num_pipeline_devices - - # support torch.distributed.launch - if all( - key in os.environ - for key in ["MASTER_ADDR", "MASTER_PORT", "WORLD_SIZE", "RANK"] - ): - args.distributed_init_method = "env://" - args.distributed_world_size = int(os.environ["WORLD_SIZE"]) - args.distributed_rank = int(os.environ["RANK"]) - # processes are created by torch.distributed.launch - args.distributed_no_spawn = True - - # we can determine the init method automatically for Slurm - elif args.distributed_port > 0: - node_list = os.environ.get("SLURM_STEP_NODELIST") - if node_list is None: - node_list = os.environ.get("SLURM_JOB_NODELIST") - if node_list is not None: - try: - hostnames = subprocess.check_output( - ["scontrol", "show", "hostnames", node_list] - ) - args.distributed_init_method = "tcp://{host}:{port}".format( - host=hostnames.split()[0].decode("utf-8"), - port=args.distributed_port, - ) - nnodes = int(os.environ.get("SLURM_NNODES")) - ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE") - if ntasks_per_node is not None: - ntasks_per_node = int(ntasks_per_node) - else: - ntasks = int(os.environ.get("SLURM_NTASKS")) - nnodes = int(os.environ.get("SLURM_NNODES")) - assert ntasks % nnodes == 0 - ntasks_per_node = int(ntasks / nnodes) - if ntasks_per_node == 1: - gpus_per_node = torch.cuda.device_count() - node_id = int(os.environ.get("SLURM_NODEID")) - args.distributed_rank = node_id * gpus_per_node - args.distributed_world_size = nnodes * gpus_per_node - elif 
args.pipeline_model_parallel: - assert ntasks_per_node == num_pipelines_per_node, ( - "SLURM --ntasks-per-node must match number of pipelines per " - "node (={})".format(num_pipelines_per_node) - ) - args.distributed_no_spawn = True - # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on - # the first node, [1, 2] on the second node, etc. This - # matches torch.distributed.launch. - node_id = int(os.environ.get("SLURM_NODEID")) - local_id = int(os.environ.get("SLURM_LOCALID")) - args.distributed_rank = node_id * num_pipelines_per_node + local_id - # In the above example, device_id will always be in [0, 1], - # which also matches torch.distributed.launch. - args.device_id = local_id - # We also want to set distributed_world_size to be the total - # number of pipelines across all nodes. - args.distributed_world_size = nnodes * num_pipelines_per_node - else: - assert ntasks_per_node == args.distributed_world_size // nnodes - args.distributed_no_spawn = True - args.distributed_rank = int(os.environ.get("SLURM_PROCID")) - args.device_id = int(os.environ.get("SLURM_LOCALID")) - except subprocess.CalledProcessError as e: # scontrol failed - raise e - except FileNotFoundError: # Slurm is not installed - pass - - elif args.distributed_world_size > 1 or force_distributed: - # fallback for single node with multiple GPUs - assert args.distributed_world_size <= torch.npu.device_count() - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = '29688' - args.distributed_init_method = "env://" - - if args.pipeline_model_parallel: - if not args.distributed_no_spawn: - # When distributed_no_spawn is False, we expect distributed_rank and - # distributed_world_size to be based on the total number of GPUs, so - # we need to correct them to be based on the number of pipelines. - assert args.distributed_world_size % num_pipeline_devices == 0 - args.distributed_world_size = ( - args.distributed_world_size // num_pipeline_devices - ) - # In the case of 4-way MP on nodes with 8 GPUs, we want - # distributed_rank to be the starting GPU index for each pipeline - # i.e., 0, 2, ... - assert args.distributed_rank % gpus_per_node == 0 - assert args.distributed_rank % num_pipeline_devices == 0 - args.distributed_rank = args.distributed_rank // num_pipeline_devices - # launch one process per pipeline - args.distributed_num_procs = num_pipelines_per_node - - # if we have 4-way MP on a node with 8 GPUs, we want device_ids to be 0 - # and 4, indicating the starting device IDs for each pipeline - args.device_id *= num_pipeline_devices - - if args.device_id > 0: - # if there's multiple pipelines on a node (e.g., 4-way MP on an 8 - # GPU node), we need to adjust pipeline_devices accordingly - logger.debug( - "setting CUDA device={} on rank {}".format( - args.device_id, args.distributed_rank - ) - ) - torch.cuda.set_device(args.device_id) - args.pipeline_devices = [args.device_id + d for d in args.pipeline_devices] - logger.info( - "setting pipeline_devices={} on rank {}".format( - args.pipeline_devices, args.distributed_rank - ), - ) - elif not args.distributed_no_spawn: - if args.npu: - args.distributed_num_procs = min(torch.npu.device_count(), - args.distributed_world_size) - else: - args.distributed_num_procs = min( - torch.cuda.device_count(), - args.distributed_world_size, - ) - - -def distributed_init(args): - if not getattr(args, "tpu", False): - if torch.distributed.is_initialized(): - warnings.warn( - "Distributed is already initialized, cannot initialize twice!" 
- ) - else: - logger.info( - "distributed init (rank {}): {}".format( - args.distributed_rank, - args.distributed_init_method, - ) - ) - dist.init_process_group( - backend=args.distributed_backend, - init_method=args.distributed_init_method, - world_size=args.distributed_world_size, - rank=args.distributed_rank, - ) - logger.info( - "initialized host {} as rank {}".format( - socket.gethostname(), - args.distributed_rank, - ) - ) - - # perform a dummy all-reduce to initialize the NCCL communicator - if torch.npu.is_available(): - dist.all_reduce(torch.zeros(1).npu()) - - args.distributed_rank = torch.distributed.get_rank() - else: - import torch_xla.core.xla_model as xm - - assert xm.xrt_world_size() == args.distributed_world_size - args.device_id = xm.get_local_ordinal() - args.distributed_rank = xm.get_ordinal() - xm.rendezvous("distributed_init") # wait for all workers - xm.mark_step() - - if not is_master(args): - logging.getLogger().setLevel(logging.WARNING) - - if args.model_parallel_size > 1: - try: - from fairseq.model_parallel.megatron.mpu import ( - get_model_parallel_rank, - initialize_model_parallel, - model_parallel_cuda_manual_seed, - ) - except ImportError: - raise ImportError( - "\n\nPlease install the megatron submodule:" - "\n\n git submodule update --init " - "fairseq/model_parallel/megatron" - ) - initialize_model_parallel(args.model_parallel_size) - model_parallel_cuda_manual_seed(args.seed) - model_part_number = get_model_parallel_rank() - args.checkpoint_suffix += "-model_part-{0}".format(model_part_number) - return args.distributed_rank - - -def distributed_main(i, main, args, kwargs): - args.device_id = i - if torch.npu.is_available() and not args.cpu and not getattr(args, "tpu", False): - torch.npu.set_device(args.device_id) - if args.distributed_rank is None: # torch.multiprocessing.spawn - args.distributed_rank = kwargs.pop("start_rank", 0) + i - - args.distributed_rank = distributed_init(args) - - after_distributed_init_fn = kwargs.pop("after_distributed_init_fn", None) - if after_distributed_init_fn: - args = after_distributed_init_fn(args) - - main(args, **kwargs) - - -def call_main(args, main, **kwargs): - if args.distributed_init_method is None: - infer_init_method(args) - - if args.distributed_init_method is not None: - # distributed training - if not args.distributed_no_spawn: - start_rank = args.distributed_rank - args.distributed_rank = None # assign automatically - kwargs["start_rank"] = start_rank - torch.multiprocessing.spawn( - fn=distributed_main, - args=(main, args, kwargs), - nprocs=args.distributed_num_procs, - ) - else: - distributed_main(args.device_id, main, args, kwargs) - elif getattr(args, "tpu", False) and args.distributed_world_size > 1: - import torch_xla.distributed.xla_multiprocessing as xmp - - torch.multiprocessing.set_sharing_strategy("file_system") - xmp.spawn( - fn=distributed_main, - args=(main, args, kwargs), - nprocs=8, # use all 8 TPU cores - ) - else: - # single GPU main - main(args, **kwargs) - - -def get_rank(): - return dist.get_rank() - - -def get_world_size(): - return dist.get_world_size() - - -def get_default_group(): - return dist.group.WORLD - - -def all_reduce(tensor, group=None): - if isinstance(group, tuple) and group[0] == "tpu": - import torch_xla.core.xla_model as xm - - return xm.all_reduce("sum", [tensor], groups=group[1]) - else: - if group is None: - group = get_default_group() - return dist.all_reduce(tensor, group=group) - - -def all_gather_list(data, group=None, max_size=16384): - """Gathers arbitrary 
data from all nodes into a list. - - Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python - data. Note that *data* must be picklable. - - Args: - data (Any): data from the local worker to be gathered on other workers - group (optional): group of the collective - max_size (int, optional): maximum size of the data to be gathered - across workers - """ - rank = get_rank() - world_size = get_world_size() - - buffer_size = max_size * world_size - if ( - not hasattr(all_gather_list, "_buffer") - or all_gather_list._buffer.numel() < buffer_size - ): - all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size) - all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory() - buffer = all_gather_list._buffer - buffer.zero_() - cpu_buffer = all_gather_list._cpu_buffer - - data = utils.move_to_cpu(data) - enc = pickle.dumps(data) - enc_size = len(enc) - header_size = 4 # size of header that contains the length of the encoded data - size = header_size + enc_size - if size > max_size: - raise ValueError( - "encoded data size ({}) exceeds max_size ({})".format(size, max_size) - ) - - header = struct.pack(">I", enc_size) - cpu_buffer[:size] = torch.ByteTensor(list(header + enc)) - start = rank * max_size - buffer[start : start + size].copy_(cpu_buffer[:size]) - - all_reduce(buffer, group=group) - - buffer = buffer.cpu() - try: - result = [] - for i in range(world_size): - out_buffer = buffer[i * max_size : (i + 1) * max_size] - (enc_size,) = struct.unpack(">I", bytes(out_buffer[:header_size].tolist())) - if enc_size > 0: - result.append( - pickle.loads( - bytes(out_buffer[header_size : header_size + enc_size].tolist()) - ) - ) - return result - except pickle.UnpicklingError: - raise Exception( - "Unable to unpickle data from other workers. all_gather_list requires all " - "workers to enter the function together, so this error usually indicates " - "that the workers have fallen out of sync somehow. Workers can fall out of " - "sync if one of them runs out of memory, or if there are other conditions " - "in your training script that can cause one worker to finish an epoch " - "while other workers are still iterating over their portions of the data. " - "Try rerunning with --ddp-backend=no_c10d and see if that helps." - ) - - -def all_reduce_dict( - data: Mapping[str, Any], - device, - group=None, -) -> Dict[str, Any]: - """ - AllReduce a dictionary of values across workers. We separately - reduce items that are already on the device and items on CPU for - better performance. - - Args: - data (Mapping[str, Any]): dictionary of data to all-reduce, but - cannot be a nested dictionary - device (torch.device): device for the reduction - group (optional): group of the collective - """ - data_keys = list(data.keys()) - - # We want to separately reduce items that are already on the - # device and items on CPU for performance reasons. 
- cpu_data = OrderedDict() - device_data = OrderedDict() - for k in data_keys: - t = data[k] - if not torch.is_tensor(t): - cpu_data[k] = torch.tensor(t, dtype=torch.float32) - elif t.device.type != 'npu': - cpu_data[k] = t.to(dtype=torch.float32) - else: - device_data[k] = t.to(dtype=torch.float32) - - def _all_reduce_dict(data: OrderedDict): - if len(data) == 0: - return data - buf = torch.cat([t.view(-1) for t in data.values()]).to(device=device) - all_reduce(buf, group=group) - split_buf = torch.split(buf, [t.numel() for t in data.values()]) - reduced_data = [t.view_as(orig) for t, orig in zip(split_buf, data.values())] - return OrderedDict(zip(data.keys(), reduced_data)) - - cpu_data = _all_reduce_dict(cpu_data) - device_data = _all_reduce_dict(device_data) - - def get_from_stack(key): - if key in cpu_data: - return cpu_data[key] - elif key in device_data: - return device_data[key] - raise KeyError - - return OrderedDict([(key, get_from_stack(key)) for key in data_keys]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/file_io.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/file_io.py deleted file mode 100644 index d6672569229a7f88c19863f26b99894734542593..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/file_io.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os -import shutil -from typing import List, Optional - - -try: - from fvcore.common.file_io import PathManager as FVCorePathManager - -except ImportError: - FVCorePathManager = None - - -class PathManager: - """ - Wrapper for insulating OSS I/O (using Python builtin operations) from - fvcore's PathManager abstraction (for transparently handling various - internal backends). 
- """ - - @staticmethod - def open( - path: str, - mode: str = "r", - buffering: int = -1, - encoding: Optional[str] = None, - errors: Optional[str] = None, - newline: Optional[str] = None, - ): - if FVCorePathManager: - return FVCorePathManager.open( - path=path, - mode=mode, - buffering=buffering, - encoding=encoding, - errors=errors, - newline=newline, - ) - return open( - path, - mode=mode, - buffering=buffering, - encoding=encoding, - errors=errors, - newline=newline, - ) - - @staticmethod - def copy(src_path: str, dst_path: str, overwrite: bool = False) -> bool: - if FVCorePathManager: - return FVCorePathManager.copy( - src_path=src_path, dst_path=dst_path, overwrite=overwrite - ) - return shutil.copyfile(src_path, dst_path) - - @staticmethod - def get_local_path(path: str, **kwargs) -> str: - if FVCorePathManager: - return FVCorePathManager.get_local_path(path, **kwargs) - return path - - @staticmethod - def exists(path: str) -> bool: - if FVCorePathManager: - return FVCorePathManager.exists(path) - return os.path.exists(path) - - @staticmethod - def isfile(path: str) -> bool: - if FVCorePathManager: - return FVCorePathManager.isfile(path) - return os.path.isfile(path) - - @staticmethod - def ls(path: str) -> List[str]: - if FVCorePathManager: - return FVCorePathManager.ls(path) - return os.listdir(path) - - @staticmethod - def mkdirs(path: str) -> None: - if FVCorePathManager: - return FVCorePathManager.mkdirs(path) - os.makedirs(path, exist_ok=True) - - @staticmethod - def rm(path: str) -> None: - if FVCorePathManager: - return FVCorePathManager.rm(path) - os.remove(path) - - @staticmethod - def chmod(path: str, mode: int) -> None: - if "manifold" not in path: - os.chmod(path, mode) - - @staticmethod - def register_handler(handler) -> None: - if FVCorePathManager: - return FVCorePathManager.register_handler(handler=handler) - - @staticmethod - def copy_from_local( - local_path: str, dst_path: str, overwrite: bool = False, **kwargs - ) -> None: - if FVCorePathManager: - return FVCorePathManager.copy_from_local( - local_path=local_path, dst_path=dst_path, overwrite=overwrite, **kwargs - ) - return shutil.copyfile(local_path, dst_path) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/file_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/file_utils.py deleted file mode 100644 index 0a94ac711243bd46a8b7875a526e3e1598abdf41..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/file_utils.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Utilities for working with the local dataset cache. -This file is adapted from `AllenNLP `_. -and `huggingface `_. 
-""" - -import fnmatch -import json -import logging -import os -import shutil -import tarfile -import tempfile -from functools import partial, wraps -from hashlib import sha256 -from io import open - - -try: - from torch.hub import _get_torch_home - - torch_cache_home = _get_torch_home() -except ImportError: - torch_cache_home = os.path.expanduser( - os.getenv( - "TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch") - ) - ) -default_cache_path = os.path.join(torch_cache_home, "pytorch_fairseq") - -try: - from urllib.parse import urlparse -except ImportError: - from urlparse import urlparse - -try: - from pathlib import Path - - PYTORCH_FAIRSEQ_CACHE = Path(os.getenv("PYTORCH_FAIRSEQ_CACHE", default_cache_path)) -except (AttributeError, ImportError): - PYTORCH_FAIRSEQ_CACHE = os.getenv("PYTORCH_FAIRSEQ_CACHE", default_cache_path) - -CONFIG_NAME = "config.json" -WEIGHTS_NAME = "pytorch_model.bin" - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name - - -def load_archive_file(archive_file): - # redirect to the cache, if necessary - try: - resolved_archive_file = cached_path(archive_file, cache_dir=None) - except EnvironmentError: - logger.info( - "Archive name '{}' was not found in archive name list. " - "We assumed '{}' was a path or URL but couldn't find any file " - "associated to this path or URL.".format( - archive_file, - archive_file, - ) - ) - return None - - if resolved_archive_file == archive_file: - logger.info("loading archive file {}".format(archive_file)) - else: - logger.info( - "loading archive file {} from cache at {}".format( - archive_file, resolved_archive_file - ) - ) - - # Extract archive to temp dir and replace .tar.bz2 if necessary - tempdir = None - if not os.path.isdir(resolved_archive_file): - tempdir = tempfile.mkdtemp() - logger.info( - "extracting archive file {} to temp dir {}".format( - resolved_archive_file, tempdir - ) - ) - ext = os.path.splitext(archive_file)[1][1:] - with tarfile.open(resolved_archive_file, "r:" + ext) as archive: - top_dir = os.path.commonprefix(archive.getnames()) - archive.extractall(tempdir) - os.remove(resolved_archive_file) - shutil.move(os.path.join(tempdir, top_dir), resolved_archive_file) - shutil.rmtree(tempdir) - - return resolved_archive_file - - -def url_to_filename(url, etag=None): - """ - Convert `url` into a hashed filename in a repeatable way. - If `etag` is specified, append its hash to the URL's, delimited - by a period. - """ - url_bytes = url.encode("utf-8") - url_hash = sha256(url_bytes) - filename = url_hash.hexdigest() - - if etag: - etag_bytes = etag.encode("utf-8") - etag_hash = sha256(etag_bytes) - filename += "." + etag_hash.hexdigest() - - return filename - - -def filename_to_url(filename, cache_dir=None): - """ - Return the url and etag (which may be ``None``) stored for `filename`. - Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
- """ - if cache_dir is None: - cache_dir = PYTORCH_FAIRSEQ_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - cache_path = os.path.join(cache_dir, filename) - if not os.path.exists(cache_path): - raise EnvironmentError("file {} not found".format(cache_path)) - - meta_path = cache_path + ".json" - if not os.path.exists(meta_path): - raise EnvironmentError("file {} not found".format(meta_path)) - - with open(meta_path, encoding="utf-8") as meta_file: - metadata = json.load(meta_file) - url = metadata["url"] - etag = metadata["etag"] - - return url, etag - - -def cached_path(url_or_filename, cache_dir=None): - """ - Given something that might be a URL (or might be a local path), - determine which. If it's a URL, download the file and cache it, and - return the path to the cached file. If it's already a local path, - make sure the file exists and then return the path. - """ - if cache_dir is None: - cache_dir = PYTORCH_FAIRSEQ_CACHE - if isinstance(url_or_filename, Path): - url_or_filename = str(url_or_filename) - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - parsed = urlparse(url_or_filename) - - if parsed.scheme in ("http", "https", "s3"): - # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, cache_dir) - elif os.path.exists(url_or_filename): - # File, and it exists. - return url_or_filename - elif parsed.scheme == "": - # File, but it doesn't exist. - raise EnvironmentError("file {} not found".format(url_or_filename)) - else: - # Something unknown - raise ValueError( - "unable to parse {} as a URL or as a local path".format(url_or_filename) - ) - - -def split_s3_path(url): - """Split a full s3 path into the bucket name and path.""" - parsed = urlparse(url) - if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) - bucket_name = parsed.netloc - s3_path = parsed.path - # Remove '/' at beginning of path. - if s3_path.startswith("/"): - s3_path = s3_path[1:] - return bucket_name, s3_path - - -def s3_request(func): - """ - Wrapper function for s3 requests in order to create more helpful error - messages. - """ - - @wraps(func) - def wrapper(url, *args, **kwargs): - from botocore.exceptions import ClientError - - try: - return func(url, *args, **kwargs) - except ClientError as exc: - if int(exc.response["Error"]["Code"]) == 404: - raise EnvironmentError("file {} not found".format(url)) - else: - raise - - return wrapper - - -@s3_request -def s3_etag(url): - """Check ETag on S3 object.""" - import boto3 - - s3_resource = boto3.resource("s3") - bucket_name, s3_path = split_s3_path(url) - s3_object = s3_resource.Object(bucket_name, s3_path) - return s3_object.e_tag - - -@s3_request -def s3_get(url, temp_file): - """Pull a file directly from S3.""" - import boto3 - - s3_resource = boto3.resource("s3") - bucket_name, s3_path = split_s3_path(url) - s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) - - -def request_wrap_timeout(func, url): - import requests - - for attempt, timeout in enumerate([10, 20, 40, 60, 60]): - try: - return func(timeout=timeout) - except requests.exceptions.Timeout as e: - logger.warning( - "Request for %s timed-out (attempt %d). 
Retrying with a timeout of %d secs", - url, - attempt, - timeout, - exc_info=e, - ) - continue - raise RuntimeError(f"Unable to fetch file {url}") - - -def http_get(url, temp_file): - import requests - from tqdm import tqdm - - req = request_wrap_timeout(partial(requests.get, url, stream=True), url) - content_length = req.headers.get("Content-Length") - total = int(content_length) if content_length is not None else None - progress = tqdm(unit="B", total=total) - for chunk in req.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - - -def get_from_cache(url, cache_dir=None): - """ - Given a URL, look for the corresponding dataset in the local cache. - If it's not there, download it. Then return the path to the cached file. - """ - if cache_dir is None: - cache_dir = PYTORCH_FAIRSEQ_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - if not os.path.exists(cache_dir): - os.makedirs(cache_dir) - - # Get eTag to add to filename, if it exists. - if url.startswith("s3://"): - etag = s3_etag(url) - else: - try: - import requests - - response = request_wrap_timeout( - partial(requests.head, url, allow_redirects=True), url - ) - if response.status_code != 200: - etag = None - else: - etag = response.headers.get("ETag") - except EnvironmentError: - etag = None - - filename = url_to_filename(url, etag) - - # get cache path to put the file - cache_path = os.path.join(cache_dir, filename) - - # If we don't have a connection (etag is None) and can't identify the file - # try to get the last downloaded one - if not os.path.exists(cache_path) and etag is None: - matching_files = fnmatch.filter(os.listdir(cache_dir), filename + ".*") - matching_files = list(filter(lambda s: not s.endswith(".json"), matching_files)) - if matching_files: - cache_path = os.path.join(cache_dir, matching_files[-1]) - - if not os.path.exists(cache_path): - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with tempfile.NamedTemporaryFile() as temp_file: - logger.info("%s not found in cache, downloading to %s", url, temp_file.name) - - # GET file object - if url.startswith("s3://"): - s3_get(url, temp_file) - else: - http_get(url, temp_file) - - # we are copying the file before closing it, so flush to avoid truncation - temp_file.flush() - # shutil.copyfileobj() starts at the current position, so go to the start - temp_file.seek(0) - - logger.info("copying %s to cache at %s", temp_file.name, cache_path) - with open(cache_path, "wb") as cache_file: - shutil.copyfileobj(temp_file, cache_file) - - logger.info("creating metadata file for %s", cache_path) - meta = {"url": url, "etag": etag} - meta_path = cache_path + ".json" - with open(meta_path, "w") as meta_file: - output_string = json.dumps(meta) - meta_file.write(output_string) - - logger.info("removing temp file %s", temp_file.name) - - return cache_path - - -def read_set_from_file(filename): - """ - Extract a de-duped collection (set) of text from a file. - Expected file format is one item per line. 
- """ - collection = set() - with open(filename, "r", encoding="utf-8") as file_: - for line in file_: - collection.add(line.rstrip()) - return collection - - -def get_file_extension(path, dot=True, lower=True): - ext = os.path.splitext(path)[1] - ext = ext if dot else ext[1:] - return ext.lower() if lower else ext diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/hub_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/hub_utils.py deleted file mode 100644 index b293e54e2a5b77df69b8044b526837b6fbceb3aa..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/hub_utils.py +++ /dev/null @@ -1,294 +0,0 @@ -#!/usr/bin/env python3 -u -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import copy -import logging -import os -from typing import Any, Dict, Iterator, List, Tuple - -import torch -from fairseq import utils -from fairseq.data import encoders -from torch import nn - - -logger = logging.getLogger(__name__) - - -def from_pretrained( - model_name_or_path, - checkpoint_file="model.pt", - data_name_or_path=".", - archive_map=None, - **kwargs -): - from fairseq import checkpoint_utils, file_utils - - if archive_map is not None: - if model_name_or_path in archive_map: - model_name_or_path = archive_map[model_name_or_path] - if data_name_or_path is not None and data_name_or_path in archive_map: - data_name_or_path = archive_map[data_name_or_path] - - # allow archive_map to set default arg_overrides (e.g., tokenizer, bpe) - # for each model - if isinstance(model_name_or_path, dict): - for k, v in model_name_or_path.items(): - if k == "checkpoint_file": - checkpoint_file = v - elif ( - k != "path" - # only set kwargs that don't already have overrides - and k not in kwargs - ): - kwargs[k] = v - model_name_or_path = model_name_or_path["path"] - - model_path = file_utils.load_archive_file(model_name_or_path) - - # convenience hack for loading data and BPE codes from model archive - if data_name_or_path.startswith("."): - kwargs["data"] = os.path.abspath(os.path.join(model_path, data_name_or_path)) - else: - kwargs["data"] = file_utils.load_archive_file(data_name_or_path) - for file, arg in { - "code": "bpe_codes", - "bpecodes": "bpe_codes", - "sentencepiece.bpe.model": "sentencepiece_model", - }.items(): - path = os.path.join(model_path, file) - if os.path.exists(path): - kwargs[arg] = path - - if "user_dir" in kwargs: - utils.import_user_module(argparse.Namespace(user_dir=kwargs["user_dir"])) - - models, args, task = checkpoint_utils.load_model_ensemble_and_task( - [os.path.join(model_path, cpt) for cpt in checkpoint_file.split(os.pathsep)], - arg_overrides=kwargs, - ) - - return { - "args": args, - "task": task, - "models": models, - } - - -class GeneratorHubInterface(nn.Module): - """ - PyTorch Hub interface for generating sequences from a pre-trained - translation or language model. 
- """ - - def __init__(self, args, task, models): - super().__init__() - self.args = args - self.task = task - self.models = nn.ModuleList(models) - self.src_dict = task.source_dictionary - self.tgt_dict = task.target_dictionary - - # optimize model for generation - for model in self.models: - model.prepare_for_inference_(args) - - # Load alignment dictionary for unknown word replacement - # (None if no unknown word replacement, empty if no path to align dictionary) - self.align_dict = utils.load_align_dict(getattr(args, "replace_unk", None)) - - self.tokenizer = encoders.build_tokenizer(args) - self.bpe = encoders.build_bpe(args) - - self.max_positions = utils.resolve_max_positions( - self.task.max_positions(), *[model.max_positions() for model in models] - ) - - # this is useful for determining the device - self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) - - @property - def device(self): - return self._float_tensor.device - - def translate( - self, sentences: List[str], beam: int = 5, verbose: bool = False, **kwargs - ) -> List[str]: - return self.sample(sentences, beam, verbose, **kwargs) - - def sample( - self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs - ) -> List[str]: - if isinstance(sentences, str): - return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0] - tokenized_sentences = [self.encode(sentence) for sentence in sentences] - batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs) - return [self.decode(hypos[0]["tokens"]) for hypos in batched_hypos] - - def score(self, sentences: List[str], **kwargs): - if isinstance(sentences, str): - return self.score([sentences], **kwargs)[0] - # NOTE: this doesn't support translation tasks currently - tokenized_sentences = [self.encode(sentence) for sentence in sentences] - return [ - hypos[0] - for hypos in self.generate( - tokenized_sentences, score_reference=True, **kwargs - ) - ] - - def generate( - self, - tokenized_sentences: List[torch.LongTensor], - beam: int = 5, - verbose: bool = False, - skip_invalid_size_inputs=False, - inference_step_args=None, - **kwargs - ) -> List[List[Dict[str, torch.Tensor]]]: - if torch.is_tensor(tokenized_sentences) and tokenized_sentences.dim() == 1: - return self.generate( - tokenized_sentences.unsqueeze(0), beam=beam, verbose=verbose, **kwargs - )[0] - - # build generator using current args as well as any kwargs - gen_args = copy.copy(self.args) - gen_args.beam = beam - for k, v in kwargs.items(): - setattr(gen_args, k, v) - generator = self.task.build_generator(self.models, gen_args) - - inference_step_args = inference_step_args or {} - results = [] - for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs): - batch = utils.apply_to_sample(lambda t: t.to(self.device), batch) - translations = self.task.inference_step( - generator, self.models, batch, **inference_step_args - ) - for id, hypos in zip(batch["id"].tolist(), translations): - results.append((id, hypos)) - - # sort output to match input order - outputs = [hypos for _, hypos in sorted(results, key=lambda x: x[0])] - - if verbose: - - def getarg(name, default): - return getattr(gen_args, name, getattr(self.args, name, default)) - - for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs): - src_str_with_unk = self.string(source_tokens) - logger.info("S\t{}".format(src_str_with_unk)) - for hypo in target_hypotheses: - hypo_str = self.decode(hypo["tokens"]) - logger.info("H\t{}\t{}".format(hypo["score"], hypo_str)) 
- logger.info( - "P\t{}".format( - " ".join( - map( - lambda x: "{:.4f}".format(x), - hypo["positional_scores"].tolist(), - ) - ) - ) - ) - if hypo["alignment"] is not None and getarg( - "print_alignment", False - ): - logger.info( - "A\t{}".format( - " ".join( - [ - "{}-{}".format(src_idx, tgt_idx) - for src_idx, tgt_idx in hypo["alignment"] - ] - ) - ) - ) - return outputs - - def encode(self, sentence: str) -> torch.LongTensor: - sentence = self.tokenize(sentence) - sentence = self.apply_bpe(sentence) - return self.binarize(sentence) - - def decode(self, tokens: torch.LongTensor) -> str: - sentence = self.string(tokens) - sentence = self.remove_bpe(sentence) - return self.detokenize(sentence) - - def tokenize(self, sentence: str) -> str: - if self.tokenizer is not None: - sentence = self.tokenizer.encode(sentence) - return sentence - - def detokenize(self, sentence: str) -> str: - if self.tokenizer is not None: - sentence = self.tokenizer.decode(sentence) - return sentence - - def apply_bpe(self, sentence: str) -> str: - if self.bpe is not None: - sentence = self.bpe.encode(sentence) - return sentence - - def remove_bpe(self, sentence: str) -> str: - if self.bpe is not None: - sentence = self.bpe.decode(sentence) - return sentence - - def binarize(self, sentence: str) -> torch.LongTensor: - return self.src_dict.encode_line(sentence, add_if_not_exist=False).long() - - def string(self, tokens: torch.LongTensor) -> str: - return self.tgt_dict.string(tokens) - - def _build_batches( - self, tokens: List[List[int]], skip_invalid_size_inputs: bool - ) -> Iterator[Dict[str, Any]]: - lengths = torch.LongTensor([t.numel() for t in tokens]) - batch_iterator = self.task.get_batch_iterator( - dataset=self.task.build_dataset_for_inference(tokens, lengths), - max_tokens=self.args.max_tokens, - max_sentences=self.args.batch_size, - max_positions=self.max_positions, - ignore_invalid_inputs=skip_invalid_size_inputs, - disable_iterator_cache=True, - ).next_epoch_itr(shuffle=False) - return batch_iterator - - -class BPEHubInterface(object): - """PyTorch Hub interface for Byte-Pair Encoding (BPE).""" - - def __init__(self, bpe, **kwargs): - super().__init__() - args = argparse.Namespace(bpe=bpe, **kwargs) - self.bpe = encoders.build_bpe(args) - assert self.bpe is not None - - def encode(self, sentence: str) -> str: - return self.bpe.encode(sentence) - - def decode(self, sentence: str) -> str: - return self.bpe.decode(sentence) - - -class TokenizerHubInterface(object): - """PyTorch Hub interface for tokenization.""" - - def __init__(self, tokenizer, **kwargs): - super().__init__() - args = argparse.Namespace(tokenizer=tokenizer, **kwargs) - self.tokenizer = encoders.build_tokenizer(args) - assert self.tokenizer is not None - - def encode(self, sentence: str) -> str: - return self.tokenizer.encode(sentence) - - def decode(self, sentence: str) -> str: - return self.tokenizer.decode(sentence) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/incremental_decoding_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/incremental_decoding_utils.py deleted file mode 100644 index b26e6cd01cd4cbdffa23d88b354eb4a55a94189b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/incremental_decoding_utils.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import uuid -from typing import Dict, Optional - -from torch import Tensor - - -class FairseqIncrementalState(object): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.init_incremental_state() - - def init_incremental_state(self): - self._incremental_state_id = str(uuid.uuid4()) - - def _get_full_incremental_state_key(self, key: str) -> str: - return "{}.{}".format(self._incremental_state_id, key) - - def get_incremental_state( - self, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], - key: str, - ) -> Optional[Dict[str, Optional[Tensor]]]: - """Helper for getting incremental state for an nn.Module.""" - full_key = self._get_full_incremental_state_key(key) - if incremental_state is None or full_key not in incremental_state: - return None - return incremental_state[full_key] - - def set_incremental_state( - self, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], - key: str, - value: Dict[str, Optional[Tensor]], - ) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]: - """Helper for setting incremental state for an nn.Module.""" - if incremental_state is not None: - full_key = self._get_full_incremental_state_key(key) - incremental_state[full_key] = value - return incremental_state - - -def with_incremental_state(cls): - cls.__bases__ = (FairseqIncrementalState,) + tuple( - b for b in cls.__bases__ if b != FairseqIncrementalState - ) - return cls diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/iterative_refinement_generator.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/iterative_refinement_generator.py deleted file mode 100644 index 4fb0946f499329ceb130761b59675d761df1c158..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/iterative_refinement_generator.py +++ /dev/null @@ -1,359 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from collections import namedtuple - -import numpy as np -import torch -from fairseq import utils - - -DecoderOut = namedtuple( - "IterativeRefinementDecoderOut", - ["output_tokens", "output_scores", "attn", "step", "max_step", "history"], -) - - -class IterativeRefinementGenerator(object): - def __init__( - self, - tgt_dict, - models=None, - eos_penalty=0.0, - max_iter=10, - max_ratio=2, - beam_size=1, - decoding_format=None, - retain_dropout=False, - adaptive=True, - retain_history=False, - reranking=False, - ): - """ - Generates translations based on iterative refinement. 
- - Args: - tgt_dict: target dictionary - eos_penalty: if > 0.0, it penalized early-stopping in decoding - max_iter: maximum number of refinement iterations - max_ratio: generate sequences of maximum length ax, where x is the source length - decoding_format: decoding mode in {'unigram', 'ensemble', 'vote', 'dp', 'bs'} - retain_dropout: retaining dropout in the inference - adaptive: decoding with early stop - """ - self.bos = tgt_dict.bos() - self.pad = tgt_dict.pad() - self.unk = tgt_dict.unk() - self.eos = tgt_dict.eos() - self.vocab_size = len(tgt_dict) - self.eos_penalty = eos_penalty - self.max_iter = max_iter - self.max_ratio = max_ratio - self.beam_size = beam_size - self.reranking = reranking - self.decoding_format = decoding_format - self.retain_dropout = retain_dropout - self.retain_history = retain_history - self.adaptive = adaptive - self.models = models - - def generate_batched_itr( - self, - data_itr, - maxlen_a=None, - maxlen_b=None, - cuda=False, - timer=None, - prefix_size=0, - ): - """Iterate over a batched dataset and yield individual translations. - - Args: - maxlen_a/b: generate sequences of maximum length ax + b, - where x is the source sentence length. - cuda: use GPU for generation - timer: StopwatchMeter for timing generations. - """ - - for sample in data_itr: - if "net_input" not in sample: - continue - if timer is not None: - timer.start() - with torch.no_grad(): - hypos = self.generate( - self.models, - sample, - prefix_tokens=sample["target"][:, :prefix_size] - if prefix_size > 0 - else None, - ) - if timer is not None: - timer.stop(sample["ntokens"]) - for i, id in enumerate(sample["id"]): - # remove padding - src = utils.strip_pad(sample["net_input"]["src_tokens"][i, :], self.pad) - ref = utils.strip_pad(sample["target"][i, :], self.pad) - yield id, src, ref, hypos[i] - - @torch.no_grad() - def generate(self, models, sample, prefix_tokens=None, constraints=None): - if constraints is not None: - raise NotImplementedError( - "Constrained decoding with the IterativeRefinementGenerator is not supported" - ) - - # TODO: iterative refinement generator does not support ensemble for now. - if not self.retain_dropout: - for model in models: - model.eval() - - model, reranker = models[0], None - if self.reranking: - assert len(models) > 1, "Assuming the last checkpoint is the reranker" - assert ( - self.beam_size > 1 - ), "Reranking requires multiple translation for each example" - - reranker = models[-1] - models = models[:-1] - - if len(models) > 1 and hasattr(model, "enable_ensemble"): - assert model.allow_ensemble, "{} does not support ensembling".format( - model.__class__.__name__ - ) - model.enable_ensemble(models) - - # TODO: better encoder inputs? 
- src_tokens = sample["net_input"]["src_tokens"] - src_lengths = sample["net_input"]["src_lengths"] - bsz, src_len = src_tokens.size() - - # initialize - encoder_out = model.forward_encoder([src_tokens, src_lengths]) - prev_decoder_out = model.initialize_output_tokens(encoder_out, src_tokens) - - if self.beam_size > 1: - assert ( - model.allow_length_beam - ), "{} does not support decoding with length beam.".format( - model.__class__.__name__ - ) - - # regenerate data based on length-beam - length_beam_order = ( - utils.new_arange(src_tokens, self.beam_size, bsz).t().reshape(-1) - ) - encoder_out = model.encoder.reorder_encoder_out( - encoder_out, length_beam_order - ) - prev_decoder_out = model.regenerate_length_beam( - prev_decoder_out, self.beam_size - ) - bsz = bsz * self.beam_size - - sent_idxs = torch.arange(bsz) - prev_output_tokens = prev_decoder_out.output_tokens.clone() - - if self.retain_history: - prev_decoder_out = prev_decoder_out._replace(history=[prev_output_tokens]) - - finalized = [[] for _ in range(bsz)] - - def is_a_loop(x, y, s, a): - b, l_x, l_y = x.size(0), x.size(1), y.size(1) - if l_x > l_y: - y = torch.cat([y, x.new_zeros(b, l_x - l_y).fill_(self.pad)], 1) - s = torch.cat([s, s.new_zeros(b, l_x - l_y)], 1) - if a is not None: - a = torch.cat([a, a.new_zeros(b, l_x - l_y, a.size(2))], 1) - elif l_x < l_y: - x = torch.cat([x, y.new_zeros(b, l_y - l_x).fill_(self.pad)], 1) - return (x == y).all(1), y, s, a - - def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): - cutoff = prev_out_token.ne(self.pad) - tokens = prev_out_token[cutoff] - if prev_out_score is None: - scores, score = None, None - else: - scores = prev_out_score[cutoff] - score = scores.mean() - - if prev_out_attn is None: - hypo_attn, alignment = None, None - else: - hypo_attn = prev_out_attn[cutoff] - alignment = hypo_attn.max(dim=1)[1] - return { - "steps": step, - "tokens": tokens, - "positional_scores": scores, - "score": score, - "hypo_attn": hypo_attn, - "alignment": alignment, - } - - for step in range(self.max_iter + 1): - - decoder_options = { - "eos_penalty": self.eos_penalty, - "max_ratio": self.max_ratio, - "decoding_format": self.decoding_format, - } - prev_decoder_out = prev_decoder_out._replace( - step=step, - max_step=self.max_iter + 1, - ) - - decoder_out = model.forward_decoder( - prev_decoder_out, encoder_out, **decoder_options - ) - - if self.adaptive: - # terminate if there is a loop - terminated, out_tokens, out_scores, out_attn = is_a_loop( - prev_output_tokens, - decoder_out.output_tokens, - decoder_out.output_scores, - decoder_out.attn, - ) - decoder_out = decoder_out._replace( - output_tokens=out_tokens, - output_scores=out_scores, - attn=out_attn, - ) - - else: - terminated = decoder_out.output_tokens.new_zeros( - decoder_out.output_tokens.size(0) - ).bool() - - if step == self.max_iter: # reach last iteration, terminate - terminated.fill_(1) - - # collect finalized sentences - finalized_idxs = sent_idxs[terminated] - finalized_tokens = decoder_out.output_tokens[terminated] - finalized_scores = decoder_out.output_scores[terminated] - finalized_attn = ( - None - if (decoder_out.attn is None or decoder_out.attn.size(0) == 0) - else decoder_out.attn[terminated] - ) - - if self.retain_history: - finalized_history_tokens = [h[terminated] for h in decoder_out.history] - - for i in range(finalized_idxs.size(0)): - finalized[finalized_idxs[i]] = [ - finalized_hypos( - step, - finalized_tokens[i], - finalized_scores[i], - None if finalized_attn is None else 
finalized_attn[i], - ) - ] - - if self.retain_history: - finalized[finalized_idxs[i]][0]["history"] = [] - for j in range(len(finalized_history_tokens)): - finalized[finalized_idxs[i]][0]["history"].append( - finalized_hypos( - step, finalized_history_tokens[j][i], None, None - ) - ) - - # check if all terminated - if terminated.sum() == terminated.size(0): - break - - # for next step - not_terminated = ~terminated - prev_decoder_out = decoder_out._replace( - output_tokens=decoder_out.output_tokens[not_terminated], - output_scores=decoder_out.output_scores[not_terminated], - attn=decoder_out.attn[not_terminated] - if (decoder_out.attn is not None and decoder_out.attn.size(0) > 0) - else None, - history=[h[not_terminated] for h in decoder_out.history] - if decoder_out.history is not None - else None, - ) - encoder_out = model.encoder.reorder_encoder_out( - encoder_out, not_terminated.nonzero(as_tuple=False).squeeze() - ) - sent_idxs = sent_idxs[not_terminated] - prev_output_tokens = prev_decoder_out.output_tokens.clone() - - if self.beam_size > 1: - if reranker is not None: - finalized = self.rerank( - reranker, finalized, [src_tokens, src_lengths], self.beam_size - ) - - # aggregate information from length beam - finalized = [ - finalized[ - np.argmax( - [ - finalized[self.beam_size * i + j][0]["score"] - for j in range(self.beam_size) - ] - ) - + self.beam_size * i - ] - for i in range(len(finalized) // self.beam_size) - ] - - return finalized - - def rerank(self, reranker, finalized, encoder_input, beam_size): - def rebuild_batch(finalized): - finalized_tokens = [f[0]["tokens"] for f in finalized] - finalized_maxlen = max(f.size(0) for f in finalized_tokens) - final_output_tokens = ( - finalized_tokens[0] - .new_zeros(len(finalized_tokens), finalized_maxlen) - .fill_(self.pad) - ) - for i, f in enumerate(finalized_tokens): - final_output_tokens[i, : f.size(0)] = f - return final_output_tokens - - final_output_tokens = rebuild_batch(finalized) - final_output_tokens[ - :, 0 - ] = self.eos # autoregressive model assumes starting with EOS - - reranker_encoder_out = reranker.encoder(*encoder_input) - length_beam_order = ( - utils.new_arange( - final_output_tokens, beam_size, reranker_encoder_out.encoder_out.size(1) - ) - .t() - .reshape(-1) - ) - reranker_encoder_out = reranker.encoder.reorder_encoder_out( - reranker_encoder_out, length_beam_order - ) - reranking_scores = reranker.get_normalized_probs( - reranker.decoder(final_output_tokens[:, :-1], reranker_encoder_out), - True, - None, - ) - reranking_scores = reranking_scores.gather(2, final_output_tokens[:, 1:, None]) - reranking_masks = final_output_tokens[:, 1:].ne(self.pad) - reranking_scores = ( - reranking_scores[:, :, 0].masked_fill_(~reranking_masks, 0).sum(1) - ) - reranking_scores = reranking_scores / reranking_masks.sum(1).type_as( - reranking_scores - ) - - for i in range(len(finalized)): - finalized[i][0]["score"] = reranking_scores[i] - - return finalized diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/legacy_distributed_data_parallel.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/legacy_distributed_data_parallel.py deleted file mode 100644 index 44f87c7c4253b18179e2174e8657e1e2ecf18176..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/legacy_distributed_data_parallel.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
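# A self-contained sketch (illustrative, not from this repository) of the scoring
# pattern used by rerank() above: gather the log-probability of each target token,
# zero out padding positions, and normalize by the non-pad length. The shapes and
# the pad index below are made up for the example.
import torch

def length_normalized_scores(lprobs, tokens, pad_idx):
    """lprobs: (bsz, tgt_len, vocab) log-probs; tokens: (bsz, tgt_len) token ids."""
    token_scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1)  # (bsz, tgt_len)
    non_pad = tokens.ne(pad_idx)
    token_scores = token_scores.masked_fill(~non_pad, 0.0)
    return token_scores.sum(1) / non_pad.sum(1).type_as(token_scores)

lprobs = torch.log_softmax(torch.randn(2, 4, 8), dim=-1)
tokens = torch.tensor([[3, 5, 2, 1], [4, 2, 1, 1]])  # 1 plays the role of <pad> here
print(length_normalized_scores(lprobs, tokens, pad_idx=1))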
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -A modified version of the legacy DistributedDataParallel module that uses c10d -communication primitives. This version is simpler than the latest PyTorch -version and is useful for debugging. Notably it does not overlap gradient -communication with the backward pass, which makes it slower but more robust -than the PyTorch version. - -This version also supports the *no_sync* context manager, which allows faster -training with `--update-freq`. -""" - -import copy -from collections import OrderedDict -from contextlib import contextmanager - -import torch -from torch import nn -from torch.autograd import Variable - -from . import distributed_utils - - -class LegacyDistributedDataParallel(nn.Module): - """Implements distributed data parallelism at the module level. - - A simplified version of :class:`torch.nn.parallel.DistributedDataParallel`. - This version uses a c10d process group for communication and does not - broadcast buffers. - - Args: - module (~torch.nn.Module): module to be parallelized - world_size (int): number of parallel workers - process_group (optional): the c10d process group to be used for - distributed data all-reduction. If None, the default process group - will be used. - buffer_size (int, optional): number of elements to buffer before - performing all-reduce (default: 256M). - """ - - def __init__(self, module, world_size, process_group=None, buffer_size=2 ** 28): - super().__init__() - - self.module = module - self.world_size = world_size - self.process_group = process_group - - # Never use a bigger buffer than the number of model params - self.buffer_size = min(buffer_size, sum(p.numel() for p in module.parameters())) - self.buffer = None - - # We can also forcibly accumulate grads locally and only do the - # all-reduce at some later time - self.accumulate_grads = False - - # make per-device lists of parameters - paramlists = OrderedDict() - for param in self.module.parameters(): - device = param.device - if paramlists.get(device) is None: - paramlists[device] = [] - paramlists[device] += [param] - self.per_device_params = list(paramlists.values()) - - def __getstate__(self): - attrs = copy.copy(self.__dict__) - return attrs - - def __setstate__(self, state): - super().__setstate__(state) - - @contextmanager - def no_sync(self): - """A context manager to disable gradient synchronization.""" - old_accumulate_grads = self.accumulate_grads - self.accumulate_grads = True - yield - self.accumulate_grads = old_accumulate_grads - - def forward(self, *inputs, **kwargs): - return self.module(*inputs, **kwargs) - - def all_reduce(self): - """ - This function must be called explicitly after backward to reduce - gradients. There is no automatic hook like c10d. 
- """ - - def all_reduce_params(params): - buffer = self.buffer - nonzero_buffer = False - if len(params) > 1: - offset = 0 - for p in params: - sz = p.numel() - if p.grad is not None: - buffer[offset : offset + sz].copy_(p.grad.data.view(-1)) - nonzero_buffer = True - else: - buffer[offset : offset + sz].zero_() - offset += sz - else: - # we only have a single grad to all-reduce - p = params[0] - if p.grad is not None: - buffer = p.grad.data - nonzero_buffer = True - elif p.numel() <= self.buffer.numel(): - buffer = buffer[: p.numel()] - buffer.zero_() - else: - buffer = torch.zeros_like(p) - - if nonzero_buffer: - buffer.div_(self.world_size) - - distributed_utils.all_reduce(buffer, self.process_group) - - # copy all-reduced grads back into their original place - offset = 0 - for p in params: - sz = p.numel() - if p.grad is not None: - p.grad.data.copy_(buffer[offset : offset + sz].view_as(p)) - else: - p.grad = buffer[offset : offset + sz].view_as(p).clone() - offset += sz - - def reduction_fn(): - # This function only needs to be called once - if self.accumulate_grads: - return - - if self.buffer is None: - self.buffer = next(self.module.parameters()).new(self.buffer_size) - - for params in self.per_device_params: - # All-reduce the gradients in buckets - offset = 0 - buffered_params = [] - for param in params: - if not param.requires_grad: - continue - if param.grad is None: - param.grad = torch.zeros_like(param) - if param.grad.requires_grad: - raise RuntimeError( - "DistributedDataParallel only works " - "with gradients that don't require " - "grad" - ) - sz = param.numel() - if sz > self.buffer.numel(): - # all-reduce big params directly - all_reduce_params([param]) - else: - if offset + sz > self.buffer.numel(): - all_reduce_params(buffered_params) - offset = 0 - buffered_params.clear() - buffered_params.append(param) - offset += sz - - if len(buffered_params) > 0: - all_reduce_params(buffered_params) - - reduction_fn() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/meters.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/meters.py deleted file mode 100644 index 6793ef54e6383d8cb3bba804997497708a3b334c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/meters.py +++ /dev/null @@ -1,291 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
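# A single-process sketch of the bucketing logic in all_reduce() above: parameters
# are grouped into buckets of at most `buffer_size` elements; in the real module each
# bucket is flattened into `self.buffer` and all-reduced, while oversized parameters
# are all-reduced on their own. The collective call itself is omitted here, and the
# helper name `bucket_parameters` is only illustrative.
from torch import nn

def bucket_parameters(module: nn.Module, buffer_size: int):
    buckets, current, offset = [], [], 0
    for param in module.parameters():
        if not param.requires_grad:
            continue
        sz = param.numel()
        if sz > buffer_size:
            buckets.append([param])  # too big for the buffer: reduced directly
            continue
        if offset + sz > buffer_size:
            buckets.append(current)  # flush the current bucket first
            current, offset = [], 0
        current.append(param)
        offset += sz
    if current:
        buckets.append(current)
    return buckets

model = nn.Sequential(nn.Linear(8, 32), nn.Linear(32, 4))
for i, bucket in enumerate(bucket_parameters(model, buffer_size=300)):
    print("bucket", i, "holds", sum(p.numel() for p in bucket), "elements")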
- -import bisect -import time -from collections import OrderedDict -from typing import Dict, Optional - - -try: - import torch - - def type_as(a, b): - if torch.is_tensor(a) and torch.is_tensor(b): - return a.to(b) - else: - return a - - -except ImportError: - torch = None - - def type_as(a, b): - return a - - -try: - import numpy as np -except ImportError: - np = None - - -class Meter(object): - """Base class for Meters.""" - - def __init__(self): - pass - - def state_dict(self): - return {} - - def load_state_dict(self, state_dict): - pass - - def reset(self): - raise NotImplementedError - - @property - def smoothed_value(self) -> float: - """Smoothed value used for logging.""" - raise NotImplementedError - - -def safe_round(number, ndigits): - if hasattr(number, "__round__"): - return round(number, ndigits) - elif torch is not None and torch.is_tensor(number) and number.numel() == 1: - return safe_round(number.item(), ndigits) - elif np is not None and np.ndim(number) == 0 and hasattr(number, "item"): - return safe_round(number.item(), ndigits) - else: - return number - - -class AverageMeter(Meter): - """Computes and stores the average and current value""" - - def __init__(self, round: Optional[int] = None): - self.round = round - self.reset() - - def reset(self): - self.val = None # most recent update - self.sum = 0 # sum from all updates - self.count = 0 # total n from all updates - - def update(self, val, n=1): - if val is not None: - self.val = val - if n > 0: - self.sum = type_as(self.sum, val) + (val * n) - self.count = type_as(self.count, n) + n - - def state_dict(self): - return { - "val": self.val, - "sum": self.sum, - "count": self.count, - "round": self.round, - } - - def load_state_dict(self, state_dict): - self.val = state_dict["val"] - self.sum = state_dict["sum"] - self.count = state_dict["count"] - self.round = state_dict.get("round", None) - - @property - def avg(self): - return self.sum / self.count if self.count > 0 else self.val - - @property - def smoothed_value(self) -> float: - val = self.avg - if self.round is not None and val is not None: - val = safe_round(val, self.round) - return val - - -class TimeMeter(Meter): - """Computes the average occurrence of some event per second""" - - def __init__( - self, - init: int = 0, - n: int = 0, - round: Optional[int] = None, - ): - self.round = round - self.reset(init, n) - - def reset(self, init=0, n=0): - self.init = init - self.start = time.perf_counter() - self.n = n - self.i = 0 - - def update(self, val=1): - self.n = type_as(self.n, val) + val - self.i += 1 - - def state_dict(self): - return { - "init": self.elapsed_time, - "n": self.n, - "round": self.round, - } - - def load_state_dict(self, state_dict): - if "start" in state_dict: - # backwards compatibility for old state_dicts - self.reset(init=state_dict["init"]) - else: - self.reset(init=state_dict["init"], n=state_dict["n"]) - self.round = state_dict.get("round", None) - - @property - def avg(self): - return self.n / self.elapsed_time - - @property - def elapsed_time(self): - return self.init + (time.perf_counter() - self.start) - - @property - def smoothed_value(self) -> float: - val = self.avg - if self.round is not None and val is not None: - val = safe_round(val, self.round) - return val - - -class StopwatchMeter(Meter): - """Computes the sum/avg duration of some event in seconds""" - - def __init__(self, round: Optional[int] = None): - self.round = round - self.sum = 0 - self.n = 0 - self.start_time = None - - def start(self): - self.start_time = 
time.perf_counter() - - def stop(self, n=1, prehook=None): - if self.start_time is not None: - if prehook is not None: - prehook() - delta = time.perf_counter() - self.start_time - self.sum = self.sum + delta - self.n = type_as(self.n, n) + n - - def reset(self): - self.sum = 0 # cumulative time during which stopwatch was active - self.n = 0 # total n across all start/stop - self.start() - - def state_dict(self): - return { - "sum": self.sum, - "n": self.n, - "round": self.round, - } - - def load_state_dict(self, state_dict): - self.sum = state_dict["sum"] - self.n = state_dict["n"] - self.start_time = None - self.round = state_dict.get("round", None) - - @property - def avg(self): - return self.sum / self.n if self.n > 0 else self.sum - - @property - def elapsed_time(self): - if self.start_time is None: - return 0.0 - return time.perf_counter() - self.start_time - - @property - def smoothed_value(self) -> float: - val = self.avg if self.sum > 0 else self.elapsed_time - if self.round is not None and val is not None: - val = safe_round(val, self.round) - return val - - -class MetersDict(OrderedDict): - """A sorted dictionary of :class:`Meters`. - - Meters are sorted according to a priority that is given when the - meter is first added to the dictionary. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.priorities = [] - - def __setitem__(self, key, value): - assert key not in self, "MetersDict doesn't support reassignment" - priority, value = value - bisect.insort(self.priorities, (priority, len(self.priorities), key)) - super().__setitem__(key, value) - for _, _, key in self.priorities: # reorder dict to match priorities - self.move_to_end(key) - - def add_meter(self, key, meter, priority): - self.__setitem__(key, (priority, meter)) - - def state_dict(self): - return [ - (pri, key, self[key].__class__.__name__, self[key].state_dict()) - for pri, _, key in self.priorities - # can't serialize DerivedMeter instances - if not isinstance(self[key], MetersDict._DerivedMeter) - ] - - def load_state_dict(self, state_dict): - self.clear() - self.priorities.clear() - for pri, key, meter_cls, meter_state in state_dict: - meter = globals()[meter_cls]() - meter.load_state_dict(meter_state) - self.add_meter(key, meter, pri) - - def get_smoothed_value(self, key: str) -> float: - """Get a single smoothed value.""" - meter = self[key] - if isinstance(meter, MetersDict._DerivedMeter): - return meter.fn(self) - else: - return meter.smoothed_value - - def get_smoothed_values(self) -> Dict[str, float]: - """Get all smoothed values.""" - return OrderedDict( - [ - (key, self.get_smoothed_value(key)) - for key in self.keys() - if not key.startswith("_") - ] - ) - - def reset(self): - """Reset Meter instances.""" - for meter in self.values(): - if isinstance(meter, MetersDict._DerivedMeter): - continue - meter.reset() - - class _DerivedMeter(Meter): - """A Meter whose values are derived from other Meters.""" - - def __init__(self, fn): - self.fn = fn - - def reset(self): - pass diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/metrics.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/metrics.py deleted file mode 100644 index 7b56e31592da6f7362b1fee780071081df43fa28..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/metrics.py +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (c) Facebook, Inc. 
and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -A standalone module for aggregating metrics. - -Metrics can be logged from anywhere using the `log_*` functions defined -in this module. The logged values will be aggregated dynamically based -on the aggregation context in which the logging occurs. See the -:func:`aggregate` context manager for more details. -""" - -import contextlib -import time -import uuid -from collections import OrderedDict, defaultdict -from typing import Callable, Dict, List, Optional - -from .meters import * - - -# Aggregation contexts are considered "active" when inside the scope -# created by the :func:`aggregate` context manager. -_aggregators = OrderedDict() -_active_aggregators = OrderedDict() -_active_aggregators_cnt = defaultdict(lambda: 0) - - -def reset() -> None: - """Reset all metrics aggregators.""" - _aggregators.clear() - _active_aggregators.clear() - _active_aggregators_cnt.clear() - - # The "default" aggregator observes all logged values. - _aggregators["default"] = MetersDict() - _active_aggregators["default"] = _aggregators["default"] - _active_aggregators_cnt["default"] = 1 - - -reset() - - -@contextlib.contextmanager -def aggregate(name: Optional[str] = None, new_root: bool = False): - """Context manager to aggregate metrics under a given name. - - Aggregations can be nested. If *new_root* is ``False``, then logged - metrics will be recorded along the entire stack of nested - aggregators, including a global "default" aggregator. If *new_root* - is ``True``, then this aggregator will be the root of a new - aggregation stack, thus bypassing any parent aggregators. - - Note that aggregation contexts are uniquely identified by their - *name* (e.g., train, valid). Creating a context with an existing - name will reuse the corresponding :class:`MetersDict` instance. - If no name is given, then a temporary aggregator will be created. - - Usage:: - - with metrics.aggregate("train"): - for step, batch in enumerate(epoch): - with metrics.aggregate("train_inner") as agg: - metrics.log_scalar("loss", get_loss(batch)) - if step % log_interval == 0: - print(agg.get_smoothed_value("loss")) - agg.reset() - print(metrics.get_smoothed_values("train")["loss"]) - - Args: - name (str): name of the aggregation. Defaults to a - random/temporary name if not given explicitly. - new_root (bool): make this aggregation the root of a new - aggregation stack. 
- """ - if name is None: - # generate a temporary name - name = str(uuid.uuid4()) - assert name not in _aggregators - agg = MetersDict() - else: - assert name != "default" - agg = _aggregators.setdefault(name, MetersDict()) - - if new_root: - backup_aggregators = _active_aggregators.copy() - _active_aggregators.clear() - backup_aggregators_cnt = _active_aggregators_cnt.copy() - _active_aggregators_cnt.clear() - - _active_aggregators[name] = agg - _active_aggregators_cnt[name] += 1 - - yield agg - - _active_aggregators_cnt[name] -= 1 - if _active_aggregators_cnt[name] == 0 and name in _active_aggregators: - del _active_aggregators[name] - - if new_root: - _active_aggregators.clear() - _active_aggregators.update(backup_aggregators) - _active_aggregators_cnt.clear() - _active_aggregators_cnt.update(backup_aggregators_cnt) - - -def get_active_aggregators() -> List[MetersDict]: - return list(_active_aggregators.values()) - - -def log_scalar( - key: str, - value: float, - weight: float = 1, - priority: int = 10, - round: Optional[int] = None, -): - """Log a scalar value. - - Args: - key (str): name of the field to log - value (float): value to log - weight (float): weight that this value contributes to the average. - A weight of 0 will always log the latest value. - priority (int): smaller values are logged earlier in the output - round (Optional[int]): number of digits to round to when displaying - """ - for agg in get_active_aggregators(): - if key not in agg: - agg.add_meter(key, AverageMeter(round=round), priority) - agg[key].update(value, weight) - - -def log_derived(key: str, fn: Callable[[MetersDict], float], priority: int = 20): - """Log a scalar value derived from other meters. - - Args: - key (str): name of the field to log - fn (Callable[[MetersDict], float]): function that takes a single - argument *meters* and returns the derived value - priority (int): smaller values are logged earlier in the output - """ - for agg in get_active_aggregators(): - if key not in agg: - agg.add_meter(key, MetersDict._DerivedMeter(fn), priority) - - -def log_speed( - key: str, - value: float, - priority: int = 30, - round: Optional[int] = None, -): - """Log the rate of some quantity per second. - - Args: - key (str): name of the field to log - value (float): value to log - priority (int): smaller values are logged earlier in the output - round (Optional[int]): number of digits to round to when displaying - """ - for agg in get_active_aggregators(): - if key not in agg: - agg.add_meter(key, TimeMeter(round=round), priority) - agg[key].reset() # reset meter on the first call - else: - agg[key].update(value) - - -def log_start_time(key: str, priority: int = 40, round: Optional[int] = None): - """Log the duration of some event in seconds. - - The duration will be computed once :func:`log_stop_time` is called. - - Args: - key (str): name of the field to log - priority (int): smaller values are logged earlier in the output - round (Optional[int]): number of digits to round to when displaying - """ - for agg in get_active_aggregators(): - if key not in agg: - agg.add_meter(key, StopwatchMeter(round=round), priority) - agg[key].start() - - -def log_stop_time(key: str, weight: float = 0.0, prehook=None): - """Log the duration of some event in seconds. - - The duration will be computed since :func:`log_start_time` was called. - Set weight > 0 to report the average time instead of the sum. 
- - Args: - key (str): name of the field to log - weight (float): weight that this time contributes to the average - prehook (function, no arguments): will be called before the timer - is stopped. For example, use prehook=torch.cuda.synchronize to - make sure all gpu operations are done before timer is stopped. - """ - for agg in get_active_aggregators(): - if key in agg: - agg[key].stop(weight, prehook) - - -def log_custom( - new_meter_fn: Callable[[], Meter], - key: str, - *args, - priority: int = 50, - **kwargs, -): - """Log using a custom Meter. - - Any extra *args* or *kwargs* will be passed through to the Meter's - *update* method. - - Args: - new_meter_fn (Callable[[], Meter]): function that returns a new - Meter instance - key (str): name of the field to log - priority (int): smaller values are logged earlier in the output - """ - for agg in get_active_aggregators(): - if key not in agg: - agg.add_meter(key, new_meter_fn(), priority) - agg[key].update(*args, **kwargs) - - -def reset_meter(name: str, key: str) -> None: - """Reset Meter instance aggregated under a given *name* and *key*.""" - meter = get_meter(name, key) - if meter is not None: - meter.reset() - - -def reset_meters(name: str) -> None: - """Reset Meter instances aggregated under a given *name*.""" - meters = get_meters(name) - if meters is not None: - meters.reset() - - -def get_meter(name: str, key: str) -> Meter: - """Get a single Meter instance aggregated under *name* and *key*. - - Returns: - Meter or None if no metrics have been logged under *name* and *key*. - """ - if name not in _aggregators: - return None - return _aggregators[name].get(key, None) - - -def get_meters(name: str) -> MetersDict: - """Get Meter instances aggregated under a given *name*. - - Returns: - MetersDict or None if no metrics have been logged under *name*. - """ - return _aggregators.get(name, None) - - -def get_smoothed_value(name: str, key: str) -> float: - """Get a single smoothed value. - - Raises: - KeyError: if no metrics have been logged under *name* and *key*. - """ - return _aggregators[name].get_smoothed_value(key) - - -def get_smoothed_values(name: str) -> Dict[str, float]: - """Get smoothed values aggregated under a given *name*. - - Raises: - KeyError: if no metrics have been logged under *name*. - """ - return _aggregators[name].get_smoothed_values() - - -def state_dict(): - return OrderedDict([(name, agg.state_dict()) for name, agg in _aggregators.items()]) - - -def load_state_dict(state_dict): - for name, agg_state in state_dict.items(): - _aggregators[name] = MetersDict() - _aggregators[name].load_state_dict(agg_state) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/progress_bar.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/progress_bar.py deleted file mode 100644 index 63e53948154f828c1bc424ba654774b13af5b76f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/logging/progress_bar.py +++ /dev/null @@ -1,355 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Wrapper around various loggers and progress bars (e.g., tqdm). 
-""" - -import atexit -import json -import logging -import os -import sys -from collections import OrderedDict -from contextlib import contextmanager -from numbers import Number -from typing import Optional - -import torch - -from .meters import AverageMeter, StopwatchMeter, TimeMeter - - -logger = logging.getLogger(__name__) - - -def progress_bar( - iterator, - log_format: Optional[str] = None, - log_interval: int = 100, - epoch: Optional[int] = None, - prefix: Optional[str] = None, - tensorboard_logdir: Optional[str] = None, - default_log_format: str = "tqdm", -): - if log_format is None: - log_format = default_log_format - if log_format == "tqdm" and not sys.stderr.isatty(): - log_format = "simple" - - if log_format == "json": - bar = JsonProgressBar(iterator, epoch, prefix, log_interval) - elif log_format == "none": - bar = NoopProgressBar(iterator, epoch, prefix) - elif log_format == "simple": - bar = SimpleProgressBar(iterator, epoch, prefix, log_interval) - elif log_format == "tqdm": - bar = TqdmProgressBar(iterator, epoch, prefix) - else: - raise ValueError("Unknown log format: {}".format(log_format)) - - if tensorboard_logdir: - try: - # [FB only] custom wrapper for TensorBoard - import palaas # noqa - from .fb_tbmf_wrapper import FbTbmfWrapper - - bar = FbTbmfWrapper(bar, log_interval) - except ImportError: - bar = TensorboardProgressBarWrapper(bar, tensorboard_logdir) - - return bar - - -def build_progress_bar( - args, - iterator, - epoch: Optional[int] = None, - prefix: Optional[str] = None, - default: str = "tqdm", - no_progress_bar: str = "none", -): - """Legacy wrapper that takes an argparse.Namespace.""" - if getattr(args, "no_progress_bar", False): - default = no_progress_bar - if getattr(args, "distributed_rank", 0) == 0: - tensorboard_logdir = getattr(args, "tensorboard_logdir", None) - else: - tensorboard_logdir = None - return progress_bar( - iterator, - log_format=args.log_format, - log_interval=args.log_interval, - epoch=epoch, - prefix=prefix, - tensorboard_logdir=tensorboard_logdir, - default_log_format=default, - ) - - -def format_stat(stat): - if isinstance(stat, Number): - stat = "{:g}".format(stat) - elif isinstance(stat, AverageMeter): - stat = "{:.3f}".format(stat.avg) - elif isinstance(stat, TimeMeter): - stat = "{:g}".format(round(stat.avg)) - elif isinstance(stat, StopwatchMeter): - stat = "{:g}".format(round(stat.sum)) - elif torch.is_tensor(stat): - stat = stat.tolist() - return stat - - -class BaseProgressBar(object): - """Abstract class for progress bars.""" - - def __init__(self, iterable, epoch=None, prefix=None): - self.iterable = iterable - self.n = getattr(iterable, "n", 0) - self.epoch = epoch - self.prefix = "" - if epoch is not None: - self.prefix += "epoch {:03d}".format(epoch) - if prefix is not None: - self.prefix += " | {}".format(prefix) - - def __len__(self): - return len(self.iterable) - - def __enter__(self): - return self - - def __exit__(self, *exc): - return False - - def __iter__(self): - raise NotImplementedError - - def log(self, stats, tag=None, step=None): - """Log intermediate stats according to log_interval.""" - raise NotImplementedError - - def print(self, stats, tag=None, step=None): - """Print end-of-epoch stats.""" - raise NotImplementedError - - def _str_commas(self, stats): - return ", ".join(key + "=" + stats[key].strip() for key in stats.keys()) - - def _str_pipes(self, stats): - return " | ".join(key + " " + stats[key].strip() for key in stats.keys()) - - def _format_stats(self, stats): - postfix = OrderedDict(stats) 
- # Preprocess stats according to datatype - for key in postfix.keys(): - postfix[key] = str(format_stat(postfix[key])) - return postfix - - -@contextmanager -def rename_logger(logger, new_name): - old_name = logger.name - if new_name is not None: - logger.name = new_name - yield logger - logger.name = old_name - - -class JsonProgressBar(BaseProgressBar): - """Log output in JSON format.""" - - def __init__(self, iterable, epoch=None, prefix=None, log_interval=1000): - super().__init__(iterable, epoch, prefix) - self.log_interval = log_interval - self.i = None - self.size = None - - def __iter__(self): - self.size = len(self.iterable) - for i, obj in enumerate(self.iterable, start=self.n): - self.i = i - yield obj - - def log(self, stats, tag=None, step=None): - """Log intermediate stats according to log_interval.""" - step = step or self.i or 0 - if step > 0 and self.log_interval is not None and step % self.log_interval == 0: - update = ( - self.epoch - 1 + (self.i + 1) / float(self.size) - if self.epoch is not None - else None - ) - stats = self._format_stats(stats, epoch=self.epoch, update=update) - with rename_logger(logger, tag): - logger.info(json.dumps(stats)) - - def print(self, stats, tag=None, step=None): - """Print end-of-epoch stats.""" - self.stats = stats - if tag is not None: - self.stats = OrderedDict( - [(tag + "_" + k, v) for k, v in self.stats.items()] - ) - stats = self._format_stats(self.stats, epoch=self.epoch) - with rename_logger(logger, tag): - logger.info(json.dumps(stats)) - - def _format_stats(self, stats, epoch=None, update=None): - postfix = OrderedDict() - if epoch is not None: - postfix["epoch"] = epoch - if update is not None: - postfix["update"] = round(update, 3) - # Preprocess stats according to datatype - for key in stats.keys(): - postfix[key] = format_stat(stats[key]) - return postfix - - -class NoopProgressBar(BaseProgressBar): - """No logging.""" - - def __init__(self, iterable, epoch=None, prefix=None): - super().__init__(iterable, epoch, prefix) - - def __iter__(self): - for obj in self.iterable: - yield obj - - def log(self, stats, tag=None, step=None): - """Log intermediate stats according to log_interval.""" - pass - - def print(self, stats, tag=None, step=None): - """Print end-of-epoch stats.""" - pass - - -class SimpleProgressBar(BaseProgressBar): - """A minimal logger for non-TTY environments.""" - - def __init__(self, iterable, epoch=None, prefix=None, log_interval=1000): - super().__init__(iterable, epoch, prefix) - self.log_interval = log_interval - self.i = None - self.size = None - - def __iter__(self): - self.size = len(self.iterable) - for i, obj in enumerate(self.iterable, start=self.n): - self.i = i - yield obj - - def log(self, stats, tag=None, step=None): - """Log intermediate stats according to log_interval.""" - step = step or self.i or 0 - if step > 0 and self.log_interval is not None and step % self.log_interval == 0: - stats = self._format_stats(stats) - postfix = self._str_commas(stats) - with rename_logger(logger, tag): - logger.info( - "{}: {:5d} / {:d} {}".format( - self.prefix, self.i + 1, self.size, postfix - ) - ) - - def print(self, stats, tag=None, step=None): - """Print end-of-epoch stats.""" - postfix = self._str_pipes(self._format_stats(stats)) - with rename_logger(logger, tag): - logger.info("{} | {}".format(self.prefix, postfix)) - - -class TqdmProgressBar(BaseProgressBar): - """Log to tqdm.""" - - def __init__(self, iterable, epoch=None, prefix=None): - super().__init__(iterable, epoch, prefix) - from tqdm 
import tqdm - - self.tqdm = tqdm( - iterable, - self.prefix, - leave=False, - disable=(logger.getEffectiveLevel() > logging.INFO), - ) - - def __iter__(self): - return iter(self.tqdm) - - def log(self, stats, tag=None, step=None): - """Log intermediate stats according to log_interval.""" - self.tqdm.set_postfix(self._format_stats(stats), refresh=False) - - def print(self, stats, tag=None, step=None): - """Print end-of-epoch stats.""" - postfix = self._str_pipes(self._format_stats(stats)) - with rename_logger(logger, tag): - logger.info("{} | {}".format(self.prefix, postfix)) - - -try: - _tensorboard_writers = {} - from tensorboardX import SummaryWriter -except ImportError: - SummaryWriter = None - - -def _close_writers(): - for w in _tensorboard_writers.values(): - w.close() - - -atexit.register(_close_writers) - - -class TensorboardProgressBarWrapper(BaseProgressBar): - """Log to tensorboard.""" - - def __init__(self, wrapped_bar, tensorboard_logdir): - self.wrapped_bar = wrapped_bar - self.tensorboard_logdir = tensorboard_logdir - - if SummaryWriter is None: - logger.warning( - "tensorboard not found, please install with: pip install tensorboardX" - ) - - def _writer(self, key): - if SummaryWriter is None: - return None - _writers = _tensorboard_writers - if key not in _writers: - _writers[key] = SummaryWriter(os.path.join(self.tensorboard_logdir, key)) - _writers[key].add_text("sys.argv", " ".join(sys.argv)) - return _writers[key] - - def __iter__(self): - return iter(self.wrapped_bar) - - def log(self, stats, tag=None, step=None): - """Log intermediate stats to tensorboard.""" - self._log_to_tensorboard(stats, tag, step) - self.wrapped_bar.log(stats, tag=tag, step=step) - - def print(self, stats, tag=None, step=None): - """Print end-of-epoch stats.""" - self._log_to_tensorboard(stats, tag, step) - self.wrapped_bar.print(stats, tag=tag, step=step) - - def _log_to_tensorboard(self, stats, tag=None, step=None): - writer = self._writer(tag or "") - if writer is None: - return - if step is None: - step = stats["num_updates"] - for key in stats.keys() - {"num_updates"}: - if isinstance(stats[key], AverageMeter): - writer.add_scalar(key, stats[key].val, step) - elif isinstance(stats[key], Number): - writer.add_scalar(key, stats[key], step) - writer.flush() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/__init__.py deleted file mode 100644 index 69f21684872f72ae8ee26d9ff7d2d2b6e6d526c3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . 
import criterions, models, modules # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/criterions/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/criterions/__init__.py deleted file mode 100644 index 6239b50362799e020708d4ee0d8a7e90ed48a902..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/criterions/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -import os - - -# automatically import any Python files in the criterions/ directory -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - module = file[: file.find(".py")] - importlib.import_module("fairseq.model_parallel.criterions." + module) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py deleted file mode 100644 index 35c50ee1521963c5cb6dfb7036ccf43401c6c6ac..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/criterions/vocab_parallel_cross_entropy.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -from fairseq import metrics, utils -from fairseq.criterions import FairseqCriterion, register_criterion - - -try: - from fairseq.model_parallel.megatron.mpu.cross_entropy import ( - vocab_parallel_cross_entropy, - ) - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -@register_criterion("vocab_parallel_cross_entropy") -class VocabParallelCrossEntropyCriterion(FairseqCriterion): - def __init__(self, task, sentence_avg): - super().__init__(task) - self.sentence_avg = sentence_avg - if not has_megatron_submodule: - raise ImportError( - "\n\nPlease install the megatron submodule:" - "\n\n git submodule update --init " - "fairseq/model_parallel/megatron" - ) - - def forward(self, model, sample, reduce=True): - """Compute the loss for the given sample. 
- - Returns a tuple with three elements: - 1) the loss - 2) the sample size, which is used as the denominator for the gradient - 3) logging outputs to display while training - """ - net_output = model(**sample["net_input"]) - target = sample["target"] - - loss = vocab_parallel_cross_entropy(net_output[0].float(), target) - loss = (loss * (target != self.padding_idx)).sum() - sample_size = ( - sample["target"].size(0) if self.sentence_avg else sample["ntokens"] - ) - logging_output = { - "loss": utils.item(loss.data) if reduce else loss.data, - "ntokens": sample["ntokens"], - "nsentences": sample["target"].size(0), - "sample_size": sample_size, - } - return loss, sample_size, logging_output - - @staticmethod - def reduce_metrics(logging_outputs) -> None: - """Aggregate logging outputs from data parallel training.""" - loss_sum = sum(log.get("loss", 0) for log in logging_outputs) - ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) - sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) - - metrics.log_scalar( - "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 - ) - if sample_size != ntokens: - metrics.log_scalar( - "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 - ) - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) - ) - else: - metrics.log_derived( - "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) - ) - - @staticmethod - def logging_outputs_can_be_summed() -> bool: - """ - Whether the logging outputs returned by `forward` can be summed - across workers prior to calling `reduce_metrics`. Setting this - to True will improve distributed training speed. - """ - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/megatron_trainer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/megatron_trainer.py deleted file mode 100644 index 761ffc8e61b1181689f117fb670218dce15994f5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/megatron_trainer.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Train a network across multiple GPUs.
-""" - -from fairseq import distributed_utils -from fairseq.trainer import Trainer - - -try: - from fairseq.model_parallel.megatron.mpu import ( - get_data_parallel_group, - get_data_parallel_rank, - get_data_parallel_world_size, - get_model_parallel_group, - get_model_parallel_src_rank, - ) - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -class MegatronTrainer(Trainer): - """Main class for model parallel with data parallel training.""" - - def __init__(self, args, task, model, criterion): - if not has_megatron_submodule: - raise ImportError( - "\n\nPlease install the megatron submodule:" - "\n\n git submodule update --init " - "fairseq/model_parallel/megatron" - ) - super().__init__(args, task, model, criterion) - - @property - def data_parallel_world_size(self): - return get_data_parallel_world_size() - - @property - def data_parallel_process_group(self): - return get_data_parallel_group() - - @property - def data_parallel_rank(self): - return get_data_parallel_rank() - - @property - def is_data_parallel_master(self): - return get_model_parallel_src_rank() == 0 - - def clip_grad_norm(self, clip_norm): - def _aggregate_model_parallel_grad_norm(total_norm): - total_norm = total_norm ** 2 - distributed_utils.all_reduce(total_norm, group=get_model_parallel_group()) - total_norm = total_norm ** 0.5 - return total_norm - - return self.optimizer.clip_grad_norm( - clip_norm, - aggregate_norm_fn=_aggregate_model_parallel_grad_norm, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/__init__.py deleted file mode 100644 index 3532479e52a0e1f1ba204c6f5d51c71c98ee5df0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -import os - - -# automatically import any Python files in the models/ directory -models_dir = os.path.dirname(__file__) -for file in os.listdir(models_dir): - path = os.path.join(models_dir, file) - if ( - not file.startswith("_") - and not file.startswith(".") - and (file.endswith(".py") or os.path.isdir(path)) - ): - model_name = file[: file.find(".py")] if file.endswith(".py") else file - module = importlib.import_module("fairseq.model_parallel.models." + model_name) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py deleted file mode 100644 index 117827c3e9c176477f33e3a6fd7fe19a922411a2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
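# A small sketch of the aggregation inside clip_grad_norm() above: each model-parallel
# partition computes the norm of its own gradient shard, and the global norm is the
# square root of the summed squares. The all_reduce over the model-parallel group is
# simulated here by summing a Python list of per-partition norms.
import torch

def aggregate_model_parallel_grad_norm(partition_norms):
    total_sq = torch.stack([n ** 2 for n in partition_norms]).sum()  # stands in for all_reduce(SUM)
    return total_sq ** 0.5

partition_norms = [torch.tensor(3.0), torch.tensor(4.0)]
print(aggregate_model_parallel_grad_norm(partition_norms))  # tensor(5.)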
- -from .model import * # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py deleted file mode 100644 index eb81ded341257ba0a43c4d0867e8f3c83f276bc7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py +++ /dev/null @@ -1,600 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from collections import namedtuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import options, utils -from fairseq.modules import ( - AdaptiveSoftmax, - LayerNorm, - MultiheadAttention, - PositionalEmbedding, -) - - -EncoderOut = namedtuple( - "TransformerEncoderOut", - [ - "encoder_out", # T x B x C - "encoder_padding_mask", # B x T - "encoder_embedding", # B x T x C - "encoder_states", # List[T x B x C] - ], -) - - -class TransformerEncoderEmbedding(nn.Module): - """ Encoder Embedding + Positional Embedding """ - - def __init__(self, args, embed_tokens): - super().__init__() - self.dropout = args.dropout - self.max_source_positions = args.max_source_positions - self.embed_tokens = embed_tokens - if isinstance(embed_tokens, nn.ModuleList): - self.padding_idx = embed_tokens[0].padding_idx - embed_dim = sum(e.embedding_dim for e in embed_tokens) - else: - self.padding_idx = embed_tokens.padding_idx - embed_dim = embed_tokens.embedding_dim - self.embed_scale = math.sqrt(embed_dim) - self.embed_positions = ( - PositionalEmbedding( - args.max_source_positions, - embed_dim, - self.padding_idx, - learned=args.encoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - if getattr(args, "layernorm_embedding", False): - self.layernorm_embedding = LayerNorm(embed_dim) - else: - self.layernorm_embedding = None - - def forward(self, input): - # embed tokens and positions - src_tokens = input[0] - prev_output_tokens = input[2] - if isinstance(self.embed_tokens, nn.ModuleList): - x_embed_list = [] - for embed_tokens_part in self.embed_tokens: - x_embed_list.append(embed_tokens_part(src_tokens)) - - embedded = torch.cat(x_embed_list, dim=-1) - else: - embedded = self.embed_tokens(src_tokens) - x = embed = self.embed_scale * embedded - if self.embed_positions is not None: - x = embed + self.embed_positions(src_tokens) - if self.layernorm_embedding: - x = self.layernorm_embedding(x) - x = F.dropout(x, p=self.dropout, training=self.training) - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - # compute padding mask - encoder_padding_mask = src_tokens.eq(self.padding_idx) - return (x, encoder_padding_mask, prev_output_tokens) - - -class TransformerEncoderLayerNorm(nn.Module): - """ - Layer norm at the end of all encoder layers if - args.encoder_normalize_before = True - """ - - def __init__(self, args, embed_dim): - super().__init__() - if args.encoder_normalize_before: - self.layer_norm = LayerNorm(embed_dim) - else: - self.layer_norm = None - - def forward(self, input): - x = input[0] - encoder_padding_mask = input[1] - prev_output_tokens = input[2] - if self.layer_norm: - x = self.layer_norm(x) - # keeping track of the
incremental_state is not supported yet - return (x, encoder_padding_mask, prev_output_tokens) - - -class TransformerDecoderEmbedding(nn.Module): - """ Decoder Embedding + Positional Embedding """ - - def __init__(self, args, embed_tokens): - super().__init__() - self.dropout = args.dropout - self.share_input_output_embed = args.share_decoder_input_output_embed - input_embed_dim = ( - sum(e.embedding_dim for e in embed_tokens) - if isinstance(embed_tokens, nn.ModuleList) - else embed_tokens.embedding_dim - ) - embed_dim = args.decoder_embed_dim - self.output_embed_dim = args.decoder_output_dim - - padding_idx = ( - embed_tokens[0].padding_idx - if isinstance(embed_tokens, nn.ModuleList) - else embed_tokens.padding_idx - ) - self.max_target_positions = args.max_target_positions - - self.embed_tokens = embed_tokens - self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim - - self.project_in_dim = ( - Linear(input_embed_dim, embed_dim, bias=False) - if embed_dim != input_embed_dim - else None - ) - - self.embed_positions = ( - PositionalEmbedding( - args.max_target_positions, - embed_dim, - padding_idx, - learned=args.decoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - - def forward(self, input): - mt_task = False - if isinstance(input, tuple): - if len(input) == 3: - encoder_out = input[0] - encoder_padding_mask = input[1] - prev_output_tokens = input[2] - incremental_state = None # Hardcoding to avoid passing of None objects - mt_task = True - else: - # HACK for now, need to fix (TODO sidgoyal) - prev_output_tokens = input[0] - # discard "src_lengths" - encoder_out = None - encoder_padding_mask = None - incremental_state = None - - else: - prev_output_tokens = input - encoder_out = None - encoder_padding_mask = None - incremental_state = None - - positions = ( - self.embed_positions( - prev_output_tokens, - incremental_state=incremental_state, - ) - if self.embed_positions is not None - else None - ) - - if incremental_state is not None: - prev_output_tokens = prev_output_tokens[:, -1:] - if positions is not None: - positions = positions[:, -1:] - - # embed tokens and positions - - if isinstance(self.embed_tokens, nn.ModuleList): - x_embed_list = [] - for embed_tokens_part in self.embed_tokens: - x_embed_list.append(embed_tokens_part(prev_output_tokens)) - - x = self.embed_scale * torch.cat(x_embed_list, dim=-1) - else: - x = self.embed_scale * self.embed_tokens(prev_output_tokens) - - if self.project_in_dim is not None: - x = self.project_in_dim(x) - - if positions is not None: - x += positions - x = F.dropout(x, p=self.dropout, training=self.training) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - if mt_task: - return (x, encoder_out, encoder_padding_mask) - return x - - -class TransformerDecoderOutputLayer(nn.Module): - def __init__(self, args, embed_tokens, dictionary): - super().__init__() - self.share_input_output_embed = args.share_decoder_input_output_embed - self.embed_tokens = embed_tokens - self.output_embed_dim = args.decoder_output_dim - embed_dim = args.decoder_embed_dim - - self.project_out_dim = ( - Linear(embed_dim, self.output_embed_dim, bias=False) - if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights - else None - ) - self.adaptive_softmax = None - if args.adaptive_softmax_cutoff is not None: - assert not isinstance(embed_tokens, nn.ModuleList) - self.adaptive_softmax = AdaptiveSoftmax( - len(dictionary), - self.output_embed_dim, - options.eval_str_list(args.adaptive_softmax_cutoff, 
type=int), - dropout=args.adaptive_softmax_dropout, - adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, - factor=args.adaptive_softmax_factor, - tie_proj=args.tie_adaptive_proj, - ) - elif not self.share_input_output_embed: - self.embed_tokens = nn.Parameter( - torch.Tensor(len(dictionary), self.output_embed_dim) - ) - nn.init.normal_( - self.embed_tokens, mean=0, std=self.output_embed_dim ** -0.5 - ) - - if args.decoder_normalize_before and not getattr( - args, "no_decoder_final_norm", False - ): - self.layer_norm = LayerNorm(embed_dim) - else: - self.layer_norm = None - - def forward(self, input, apply_final_proj=True): - if isinstance(input, tuple): - x = input[0] - else: - x = input - - if self.layer_norm: - x = self.layer_norm(x) - - # T x B x C -> B x T x C - x = x.transpose(0, 1) - - if self.project_out_dim is not None: - x = self.project_out_dim(x) - if apply_final_proj: - x = self.output_layer(x) - return x - - def output_layer(self, features, **kwargs): - """Project features to the vocabulary size.""" - if self.adaptive_softmax is None: - # project back to size of vocabulary - if self.share_input_output_embed: - if isinstance(self.embed_tokens, nn.ModuleList): - output = None - for i, emb in enumerate(self.embed_tokens): - sidx = i * emb.embedding_dim - eidx = (i + 1) * emb.embedding_dim - if output is None: - output = F.linear(features[:, :, sidx:eidx], emb.weight) - else: - output += F.linear(features[:, :, sidx:eidx], emb.weight) - - return output - else: - return F.linear(features, self.embed_tokens.weight) - else: - return F.linear(features, self.embed_tokens) - else: - return features - - -class TransformerEncoderLayer(nn.Module): - """Encoder layer block. - In the original paper each operation (multi-head attention or FFN) is - postprocessed with: `dropout -> add residual -> layernorm`. In the - tensor2tensor code they suggest that learning is more robust when - preprocessing each layer with layernorm and postprocessing with: - `dropout -> add residual`. We default to the approach in the paper, but the - tensor2tensor approach can be enabled by setting - *args.encoder_normalize_before* to ``True``. 
- - Args: - args (argparse.Namespace): parsed command-line arguments - """ - - def __init__(self, args): - super().__init__() - self.embed_dim = args.encoder_embed_dim - self.self_attn = MultiheadAttention( - self.embed_dim, - args.encoder_attention_heads, - dropout=args.attention_dropout, - self_attention=True, - ) - self.self_attn_layer_norm = LayerNorm(self.embed_dim) - self.dropout = args.dropout - self.activation_fn = utils.get_activation_fn( - activation=getattr(args, "activation_fn", "relu") - ) - self.activation_dropout = getattr(args, "activation_dropout", 0) - if self.activation_dropout == 0: - # for backwards compatibility with models that use args.relu_dropout - self.activation_dropout = getattr(args, "relu_dropout", 0) - self.normalize_before = args.encoder_normalize_before - self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) - self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) - self.final_layer_norm = LayerNorm(self.embed_dim) - - def upgrade_state_dict_named(self, state_dict, name): - """ - Rename layer norm states from `...layer_norms.0.weight` to - `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to - `...final_layer_norm.weight` - """ - layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"} - for old, new in layer_norm_map.items(): - for m in ("weight", "bias"): - k = "{}.layer_norms.{}.{}".format(name, old, m) - if k in state_dict: - state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k] - del state_dict[k] - - def forward(self, input): - """ - Args: - input (Tuple): - input[0] (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - input[1] (ByteTensor/FloatTensor): encoder padding mask - - binary ByteTensor of shape `(batch, src_len)` where padding elements - are indicated by ``1``. - input[2] (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing) - Returns: - output (Tuple): - output[0] (Tensor): encoded output of shape `(batch, src_len, embed_dim)` - output[1] (ByteTensor/FloatTensor): encoder padding mask - output[2] (LongTensor): previous decoder outputs - """ - x = input[0] - encoder_padding_mask = input[1] - prev_output_tokens = input[2] - residual = x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) - x, _ = self.self_attn( - query=x, key=x, value=x, key_padding_mask=encoder_padding_mask - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) - - residual = x - x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) - x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) - x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) - return (x, encoder_padding_mask, prev_output_tokens) - - def maybe_layer_norm(self, layer_norm, x, before=False, after=False): - assert before ^ after - if after ^ self.normalize_before: - return layer_norm(x) - else: - return x - - -class TransformerDecoderLayer(nn.Module): - """Decoder layer block. - - In the original paper each operation (multi-head attention, encoder - attention or FFN) is postprocessed with: `dropout -> add residual -> - layernorm`. In the tensor2tensor code they suggest that learning is more - robust when preprocessing each layer with layernorm and postprocessing with: - `dropout -> add residual`. 
We default to the approach in the paper, but the - tensor2tensor approach can be enabled by setting - *args.decoder_normalize_before* to ``True``. - - Args: - args (argparse.Namespace): parsed command-line arguments - no_encoder_attn (bool, optional): whether to attend to encoder outputs - (default: False). - """ - - def __init__( - self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False - ): - super().__init__() - self.embed_dim = args.decoder_embed_dim - self.self_attn = MultiheadAttention( - embed_dim=self.embed_dim, - num_heads=args.decoder_attention_heads, - dropout=args.attention_dropout, - add_bias_kv=add_bias_kv, - add_zero_attn=add_zero_attn, - self_attention=True, - ) - self.dropout = args.dropout - self.activation_fn = utils.get_activation_fn( - activation=getattr(args, "activation_fn", "relu") - ) - self.activation_dropout = getattr(args, "activation_dropout", 0) - if self.activation_dropout == 0: - # for backwards compatibility with models that use args.relu_dropout - self.activation_dropout = getattr(args, "relu_dropout", 0) - self.normalize_before = args.decoder_normalize_before - - # use LayerNorm rather than FusedLayerNorm for exporting. - # char_inputs can be used to determine this. - # TODO remove this once we update apex with the fix - export = getattr(args, "char_inputs", False) - self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) - - if no_encoder_attn: - self.encoder_attn = None - self.encoder_attn_layer_norm = None - else: - self.encoder_attn = MultiheadAttention( - self.embed_dim, - args.decoder_attention_heads, - kdim=getattr(args, "encoder_embed_dim", None), - vdim=getattr(args, "encoder_embed_dim", None), - dropout=args.attention_dropout, - encoder_decoder_attention=True, - ) - self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) - - self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim) - self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim) - - self.final_layer_norm = LayerNorm(self.embed_dim, export=export) - self.need_attn = True - - self.onnx_trace = False - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - def forward(self, input): - """ - Args: - input (Tuple): - input[0] (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - input[1] (Tensor): encoder output of shape `(batch, src_len, embed_dim)` - input[2] (ByteTensor/FloatTensor): encoder padding mask - - binary ByteTensor of shape `(batch, src_len)` where padding elements - are indicated by ``1``.
- Returns: - output (Tuple): - output[0] (Tensor): encoded output of shape `(batch, src_len, embed_dim)` - output[1] (ByteTensor/FloatTensor): encoder padding mask - output[2] (LongTensor): previous decoder outputs - """ - # Note: incremental state is not yet supported - mt_task = False - if isinstance(input, tuple): - x = input[0] - encoder_out = input[1] - encoder_padding_mask = input[2] - incremental_state = None - mt_task = True - else: - x = input - encoder_out = None - encoder_padding_mask = None - incremental_state = None - - if incremental_state is None: - self_attn_mask = self.buffered_future_mask(x) - else: - self_attn_mask = None - - # TODO: add back prev_self_attn_state, prev_attn_state, - # self_attn_padding_mask - prev_self_attn_state = None - prev_attn_state = None - self_attn_padding_mask = None - - residual = x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) - if prev_self_attn_state is not None: - if incremental_state is None: - incremental_state = {} - prev_key, prev_value = prev_self_attn_state - saved_state = {"prev_key": prev_key, "prev_value": prev_value} - self.self_attn._set_input_buffer(incremental_state, saved_state) - x, attn = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=self_attn_padding_mask, - incremental_state=incremental_state, - need_weights=False, - attn_mask=self_attn_mask, - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) - - if self.encoder_attn is not None: - residual = x - x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) - if prev_attn_state is not None: - if incremental_state is None: - incremental_state = {} - prev_key, prev_value = prev_attn_state - saved_state = {"prev_key": prev_key, "prev_value": prev_value} - self.encoder_attn._set_input_buffer(incremental_state, saved_state) - x, attn = self.encoder_attn( - query=x, - key=encoder_out, - value=encoder_out, - key_padding_mask=encoder_padding_mask, - incremental_state=incremental_state, - static_kv=True, - need_weights=(not self.training and self.need_attn), - ) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) - - residual = x - x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) - x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) - x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) - x = residual + x - x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) - - if mt_task: - return (x, encoder_out, encoder_padding_mask) - return x - - def buffered_future_mask(self, tensor): - dim = tensor.size(0) - if ( - not hasattr(self, "_future_mask") - or self._future_mask is None - or self._future_mask.device != tensor.device - ): - self._future_mask = torch.triu( - utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 - ) - if self._future_mask.size(0) < dim: - self._future_mask = torch.triu( - utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1 - ) - return self._future_mask[:dim, :dim] - - def maybe_layer_norm(self, layer_norm, x, before=False, after=False): - assert before ^ after - if after ^ self.normalize_before: - return layer_norm(x) - else: - return x - - def make_generation_fast_(self, need_attn=False, **kwargs): - self.need_attn = need_attn - - -def Embedding(num_embeddings, embedding_dim, padding_idx): - m = 
nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) - nn.init.constant_(m.weight[padding_idx], 0) - return m - - -def Linear(in_features, out_features, bias=True): - m = nn.Linear(in_features, out_features, bias) - nn.init.xavier_uniform_(m.weight) - if bias: - nn.init.constant_(m.bias, 0.0) - return m diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py deleted file mode 100644 index cbfc6ae4a0bfb8e8c66403a621d5ad6e52996b1a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py +++ /dev/null @@ -1,721 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.model_parallel.models.pipeline_parallel_transformer.layers import ( - Embedding, - TransformerDecoderEmbedding, - TransformerDecoderLayer, - TransformerDecoderOutputLayer, - TransformerEncoderEmbedding, - TransformerEncoderLayer, - TransformerEncoderLayerNorm, -) -from fairseq.models import ( - BaseFairseqModel, - FairseqDecoder, - FairseqEncoder, - register_model, - register_model_architecture, -) -from fairseq.models.fairseq_encoder import EncoderOut -from fairseq.models.transformer import ( - base_architecture, - transformer_iwslt_de_en, - transformer_wmt_en_de_big, -) -from fairseq.modules import SinusoidalPositionalEmbedding - - -logger = logging.getLogger(__name__) - - -DEFAULT_MAX_SOURCE_POSITIONS = 1024 -DEFAULT_MAX_TARGET_POSITIONS = 1024 - - -@register_model("pipeline_parallel_transformer") -class PipelineParallelTransformerModel(BaseFairseqModel): - def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint): - try: - from fairscale.nn import Pipe - except ImportError: - raise ImportError("Please install fairscale with: pip install fairscale") - super().__init__() - assert isinstance(encoder, FairseqEncoder) - assert isinstance(decoder, FairseqDecoder) - encoder_module_list = ( - [encoder.embedding_layer] - + list(encoder.encoder_layers) - + [encoder.final_layer_norm] - ) - self.num_encoder_modules = len(encoder_module_list) - decoder_module_list = ( - [decoder.embedding_layer] - + list(decoder.decoder_layers) - + [decoder.decoder_output_layer] - ) - self.num_decoder_modules = len(decoder_module_list) - module_list = encoder_module_list + decoder_module_list - self.devices = devices - self.model = Pipe( - nn.Sequential(*module_list), - balance=balance, - devices=devices, - chunks=chunks, - checkpoint=checkpoint, - ) - self.encoder_max_positions = self.max_positions_helper( - encoder.embedding_layer, "max_source_positions" - ) - self.decoder_max_positions = self.max_positions_helper( - decoder.embedding_layer, "max_target_positions" - ) - self.adaptive_softmax = getattr(decoder, "adaptive_softmax", None) - # Note: To be populated during inference - self.encoder = None - self.decoder = None - - def forward(self, src_tokens, src_lengths, prev_output_tokens): - if self.training: - input_lst = [src_tokens, 
src_lengths, prev_output_tokens] - input = tuple(i.to(self.devices[0], non_blocking=True) for i in input_lst) - return self.model(input) - else: - assert self.encoder is not None and self.decoder is not None, ( - "encoder and decoder need to be initialized by " - + "calling the `prepare_for_inference_()` method" - ) - encoder_output_tuple = self.encoder(input) - return self.decoder(encoder_output_tuple) - - def prepare_for_inference_(self, args): - if self.encoder is not None and self.decoder is not None: - logger.info("Encoder and Decoder already initialized") - return - encoder_module_list = [] - decoder_module_list = [] - module_count = 0 - for partition in self.model.partitions: - for module in partition: - if module_count < self.num_encoder_modules: - encoder_module_list.append(module) - else: - decoder_module_list.append(module) - module_count += 1 - self.model = None - self.encoder = TransformerEncoder(args, None, None, encoder_module_list) - self.decoder = TransformerDecoder( - args, None, None, decoder_module_list=decoder_module_list - ) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--activation-fn', - choices=utils.get_available_activation_fns(), - help='activation function to use') - parser.add_argument('--dropout', type=float, metavar='D', - help='dropout probability') - parser.add_argument('--attention-dropout', type=float, metavar='D', - help='dropout probability for attention weights') - parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', - help='dropout probability after activation in FFN.') - parser.add_argument('--encoder-embed-path', type=str, metavar='STR', - help='path to pre-trained encoder embedding') - parser.add_argument('--encoder-embed-dim', type=int, metavar='N', - help='encoder embedding dimension') - parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', - help='encoder embedding dimension for FFN') - parser.add_argument('--encoder-layers', type=int, metavar='N', - help='num encoder layers') - parser.add_argument('--encoder-attention-heads', type=int, metavar='N', - help='num encoder attention heads') - parser.add_argument('--encoder-normalize-before', action='store_true', - help='apply layernorm before each encoder block') - parser.add_argument('--encoder-learned-pos', action='store_true', - help='use learned positional embeddings in the encoder') - parser.add_argument('--decoder-embed-path', type=str, metavar='STR', - help='path to pre-trained decoder embedding') - parser.add_argument('--decoder-embed-dim', type=int, metavar='N', - help='decoder embedding dimension') - parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', - help='decoder embedding dimension for FFN') - parser.add_argument('--decoder-layers', type=int, metavar='N', - help='num decoder layers') - parser.add_argument('--decoder-attention-heads', type=int, metavar='N', - help='num decoder attention heads') - parser.add_argument('--decoder-learned-pos', action='store_true', - help='use learned positional embeddings in the decoder') - parser.add_argument('--decoder-normalize-before', action='store_true', - help='apply layernorm before each decoder block') - parser.add_argument('--share-decoder-input-output-embed', action='store_true', - help='share decoder input and output embeddings') - parser.add_argument('--share-all-embeddings', action='store_true', - help='share encoder, decoder and output embeddings' - ' (requires shared dictionary and embed 
dim)') - parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', - help='if set, disables positional embeddings (outside self attention)') - parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', - help='comma separated list of adaptive softmax cutoff points. ' - 'Must be used with adaptive_loss criterion'), - parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', - help='sets adaptive softmax dropout for the tail projections') - parser.add_argument('--num-embedding-chunks', type=int, metavar='N', default=1, - help='Number of embedding layer chunks (enables more even distribution' - 'of optimizer states across data parallel nodes' - 'when using optimizer state sharding and' - 'a big embedding vocabulary)') - # fmt: on - - @classmethod - def build_model_base(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_architecture(args) - - if not hasattr(args, "max_source_positions"): - args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS - if not hasattr(args, "max_target_positions"): - args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS - - src_dict, tgt_dict = task.source_dictionary, task.target_dictionary - - def build_embedding(dictionary, embed_dim, path=None, num_embed_chunks=1): - assert embed_dim % num_embed_chunks == 0, ( - f"Number of embedding chunks = {num_embed_chunks} should be " - + f"divisible by the embedding dimension = {embed_dim}" - ) - assert path is None or num_embed_chunks == 1, ( - "Loading embedding from a path with number of embedding chunks > 1" - + " is not yet supported" - ) - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - # if provided, load from preloaded dictionaries - if path: - emb = Embedding(num_embeddings, embed_dim, padding_idx) - embed_dict = utils.parse_embedding(path) - utils.load_embedding(embed_dict, dictionary, emb) - else: - embed_chunk_dim = embed_dim // num_embed_chunks - emb = nn.ModuleList() - for i in range(num_embed_chunks): - emb.append(Embedding(num_embeddings, embed_chunk_dim, padding_idx)) - return emb - - num_embed_chunks = args.num_embedding_chunks - if args.share_all_embeddings: - if src_dict != tgt_dict: - raise ValueError("--share-all-embeddings requires a joined dictionary") - if args.encoder_embed_dim != args.decoder_embed_dim: - raise ValueError( - "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" - ) - if args.decoder_embed_path and ( - args.decoder_embed_path != args.encoder_embed_path - ): - raise ValueError( - "--share-all-embeddings not compatible with --decoder-embed-path" - ) - encoder_embed_tokens = build_embedding( - src_dict, - args.encoder_embed_dim, - args.encoder_embed_path, - num_embed_chunks, - ) - decoder_embed_tokens = encoder_embed_tokens - args.share_decoder_input_output_embed = True - else: - assert args.share_decoder_input_output_embed or num_embed_chunks == 1, ( - "Not sharing decoder I/O embeddings is not yet supported with number of " - + "embedding chunks > 1" - ) - encoder_embed_tokens = build_embedding( - src_dict, - args.encoder_embed_dim, - args.encoder_embed_path, - num_embed_chunks, - ) - decoder_embed_tokens = build_embedding( - tgt_dict, - args.decoder_embed_dim, - args.decoder_embed_path, - num_embed_chunks, - ) - - encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens) - decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) - return (encoder, decoder) - - @classmethod - def 
build_encoder(cls, args, src_dict, embed_tokens): - return TransformerEncoder(args, src_dict, embed_tokens) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - return TransformerDecoder(args, tgt_dict, embed_tokens) - - @classmethod - def build_model(cls, args, task): - encoder, decoder = cls.build_model_base(args, task) - return PipelineParallelTransformerModel( - encoder=encoder, - decoder=decoder, - balance=utils.eval_str_list(args.pipeline_balance, type=int), - devices=utils.eval_str_list(args.pipeline_devices, type=int), - chunks=args.pipeline_chunks, - checkpoint=args.pipeline_checkpoint, - ) - - def output_layer(self, features, **kwargs): - """Project features to the default output size (typically vocabulary size).""" - return self.decoder.output_layer(features, **kwargs) - - def max_positions(self): - """Maximum length supported by the model.""" - return (self.encoder_max_positions, self.decoder_max_positions) - - def max_positions_helper( - self, embedding_layer, max_positions_field="max_source_positions" - ): - """Maximum input length supported by the encoder or decoder.""" - if embedding_layer.embed_positions is None: - return getattr(embedding_layer, max_positions_field) - return min( - getattr(embedding_layer, max_positions_field), - embedding_layer.embed_positions.max_positions, - ) - - def get_normalized_probs(self, net_output, log_probs, sample=None): - """Get normalized probabilities (or log probs) from a net's output.""" - - if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None: - if sample is not None: - assert "target" in sample - target = sample["target"] - else: - target = None - out = self.adaptive_softmax.get_log_prob(net_output, target=target) - return out.exp_() if not log_probs else out - - # A Pipe() module returns a tuple of tensors as the output. - # In this case, the tuple has one element - the output tensor of logits - logits = net_output if isinstance(net_output, torch.Tensor) else net_output[0] - if log_probs: - return utils.log_softmax(logits, dim=-1, onnx_trace=False) - else: - return utils.softmax(logits, dim=-1, onnx_trace=False) - - def max_decoder_positions(self): - """Maximum length supported by the decoder.""" - return self.decoder_max_positions - - def load_state_dict(self, state_dict, strict=True, args=None): - """Copies parameters and buffers from *state_dict* into this module and - its descendants. - - Overrides the method in :class:`nn.Module`. Compared with that method - this additionally "upgrades" *state_dicts* from old checkpoints. 
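The `build_model` classmethod above wires `--pipeline-balance` and `--pipeline-devices` into the fairscale `Pipe` wrapper. As a rough, self-contained sketch of how such a balance list splits an ordered module list across devices, the helper below is invented for illustration and is not fairseq or fairscale code:

```python
import torch.nn as nn

def split_by_balance(modules, balance, devices):
    """Assign consecutive chunks of `modules` to `devices` according to `balance`.

    E.g. 8 modules with balance=[5, 3] and devices=[0, 1] puts the first five
    modules (embedding plus early layers) on device 0 and the rest on device 1.
    """
    assert sum(balance) == len(modules), "balance must cover every module"
    assignment, start = [], 0
    for num_modules, device in zip(balance, devices):
        assignment.append((device, modules[start:start + num_modules]))
        start += num_modules
    return assignment

layers = [nn.Linear(4, 4) for _ in range(8)]
for device, part in split_by_balance(layers, balance=[5, 3], devices=[0, 1]):
    print(f"device {device}: {len(part)} modules")
```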
- """ - self.upgrade_state_dict(state_dict) - is_regular_transformer = not any("model.partitions" in k for k in state_dict) - if is_regular_transformer: - state_dict = self.convert_to_pipeline_parallel_state_dict(state_dict) - return super().load_state_dict(state_dict, strict) - - def convert_to_pipeline_parallel_state_dict(self, state_dict): - new_state_dict = self.state_dict() - encoder_layer_idx = 0 - decoder_layer_idx = 0 - encoder_key_suffixes = [ - "self_attn.k_proj.weight", - "self_attn.k_proj.bias", - "self_attn.v_proj.weight", - "self_attn.v_proj.bias", - "self_attn.q_proj.weight", - "self_attn.q_proj.bias", - "self_attn.out_proj.weight", - "self_attn.out_proj.bias", - "self_attn_layer_norm.weight", - "self_attn_layer_norm.bias", - "fc1.weight", - "fc1.bias", - "fc2.weight", - "fc2.bias", - "final_layer_norm.weight", - "final_layer_norm.bias", - ] - decoder_key_suffixes = [ - "self_attn.k_proj.weight", - "self_attn.k_proj.bias", - "self_attn.v_proj.weight", - "self_attn.v_proj.bias", - "self_attn.q_proj.weight", - "self_attn.q_proj.bias", - "self_attn.out_proj.weight", - "self_attn.out_proj.bias", - "self_attn_layer_norm.weight", - "self_attn_layer_norm.bias", - "encoder_attn.k_proj.weight", - "encoder_attn.k_proj.bias", - "encoder_attn.v_proj.weight", - "encoder_attn.v_proj.bias", - "encoder_attn.q_proj.weight", - "encoder_attn.q_proj.bias", - "encoder_attn.out_proj.weight", - "encoder_attn.out_proj.bias", - "encoder_attn_layer_norm.weight", - "encoder_attn_layer_norm.bias", - "fc1.weight", - "fc1.bias", - "fc2.weight", - "fc2.bias", - "final_layer_norm.weight", - "final_layer_norm.bias", - ] - for pid, partition in enumerate(self.model.partitions): - logger.info(f"Begin Partition {pid}") - for mid, module in enumerate(partition): - # fmt: off - if isinstance(module, TransformerEncoderEmbedding): - new_state_dict[f'model.partitions.{pid}.{mid}.embed_tokens.weight'] = state_dict['encoder.embed_tokens.weight'] - new_state_dict[f'model.partitions.{pid}.{mid}.embed_positions._float_tensor'] = state_dict['encoder.embed_positions._float_tensor'] - if isinstance(module, TransformerEncoderLayer): - for suffix in encoder_key_suffixes: - new_state_dict[f'model.partitions.{pid}.{mid}.{suffix}'] = state_dict[f'encoder.layers.{encoder_layer_idx}.{suffix}'] - encoder_layer_idx += 1 - if isinstance(module, TransformerDecoderLayer): - for suffix in decoder_key_suffixes: - new_state_dict[f'model.partitions.{pid}.{mid}.{suffix}'] = state_dict[f'decoder.layers.{decoder_layer_idx}.{suffix}'] - decoder_layer_idx += 1 - if isinstance(module, TransformerEncoderLayerNorm): - if 'encoder.layer_norm.weight' in state_dict: - new_state_dict[f'model.partitions.{pid}.{mid}.layer_norm.weight'] = state_dict['encoder.layer_norm.weight'] - new_state_dict[f'model.partitions.{pid}.{mid}.layer_norm.bias'] = state_dict['encoder.layer_norm.bias'] - if isinstance(module, TransformerDecoderEmbedding): - new_state_dict[f'model.partitions.{pid}.{mid}.embed_tokens.weight'] = state_dict['decoder.embed_tokens.weight'] - new_state_dict[f'model.partitions.{pid}.{mid}.embed_positions._float_tensor'] = state_dict['decoder.embed_positions._float_tensor'] - if isinstance(module, TransformerDecoderOutputLayer): - new_state_dict[f'model.partitions.{pid}.{mid}.output_projection.weight'] = state_dict['decoder.output_projection.weight'] - # fmt: on - return new_state_dict - - -class TransformerEncoder(FairseqEncoder): - """ - Transformer encoder consisting of *args.encoder_layers* layers. 
Each layer - is a :class:`TransformerEncoderLayer`. - - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): encoding dictionary - embed_tokens (torch.nn.Embedding): input embedding - """ - - def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None): - super().__init__(dictionary) - self.register_buffer("version", torch.Tensor([3])) - try: - from fairscale.nn import Pipe - except ImportError: - raise ImportError("Please install fairscale with: pip install fairscale") - if encoder_module_list is None: - embedding_layer = TransformerEncoderEmbedding(args, embed_tokens) - layers = [TransformerEncoderLayer(args) for i in range(args.encoder_layers)] - if isinstance(embed_tokens, nn.ModuleList): - emb_dim = sum(e.embedding_dim for e in embed_tokens) - else: - emb_dim = embed_tokens.embedding_dim - final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim) - encoder_module_list = [embedding_layer] + layers + [final_layer_norm] - self.use_pipeline = getattr(args, "pipeline_encoder_balance", None) is not None - if self.use_pipeline: - encoder_balance = utils.eval_str_list( - args.pipeline_encoder_balance, type=int - ) - encoder_devices = utils.eval_str_list( - args.pipeline_encoder_devices, type=int - ) - assert sum(encoder_balance) == len(encoder_module_list), ( - f"Sum of encoder_balance={encoder_balance} is not equal " - + f"to num_encoder_modules={len(encoder_module_list)}" - ) - self.model = Pipe( - module=nn.Sequential(*encoder_module_list), - balance=encoder_balance, - devices=encoder_devices, - chunks=args.pipeline_chunks, - checkpoint=args.pipeline_checkpoint, - ) - else: - self.embedding_layer = encoder_module_list[0] - self.encoder_layers = nn.Sequential(*encoder_module_list[1:-1]) - self.final_layer_norm = encoder_module_list[-1] - - def forward(self, src_tokens, src_lengths): - """ - Args: - input_tuple( - src_tokens (LongTensor): tokens in the source language of shape - `(batch, src_len)` - src_lengths (torch.LongTensor): lengths of each source sentence of - shape `(batch)` - ) - - Returns: - output_tuple( - - **encoder_out** (Tensor): the last encoder layer's output of - shape `(src_len, batch, embed_dim)` - - **encoder_padding_mask** (ByteTensor): the positions of - padding elements of shape `(batch, src_len)` - - prev_output_tokens - - **encoder_states** (List[Tensor]): all intermediate - hidden states of shape `(src_len, batch, embed_dim)`. - Only populated if *return_all_hiddens* is True. - ) - """ - dummy_prev_output_tokens = torch.zeros( - 1, dtype=src_tokens.dtype, device=src_tokens.device - ) - input_tuple = (src_tokens, src_lengths, dummy_prev_output_tokens) - if self.use_pipeline: - input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple) - encoder_out = self.model(input_tuple) - else: - encoder_embed_output_tuple = self.embedding_layer(input_tuple) - encoder_layers_output = self.encoder_layers(encoder_embed_output_tuple) - encoder_out = self.final_layer_norm(encoder_layers_output) - # first element is the encoder output - # second element is the encoder padding mask - # the remaining elements of EncoderOut are not computed by - # the PipelineParallelTransformer - return EncoderOut(encoder_out[0], encoder_out[1], None, None, None, None) - - def reorder_encoder_out(self, encoder_out, new_order): - """ - Reorder encoder output according to *new_order*. 
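A toy illustration of the batch reordering that `reorder_encoder_out` performs during beam search (an `index_select` along the batch dimension); the tensor names and sizes below are made up for the example:

```python
import torch

# encoder output laid out as (src_len, batch, embed_dim)
encoder_out = torch.arange(2 * 3 * 4, dtype=torch.float).view(2, 3, 4)
# beam search decides sentence 2 should now occupy slots 0 and 1, sentence 0 slot 2
new_order = torch.tensor([2, 2, 0])
reordered = encoder_out.index_select(1, new_order)  # batch dim is dim 1
assert torch.equal(reordered[:, 0], encoder_out[:, 2])

# the padding mask is (batch, src_len), so it is reordered along dim 0 instead
padding_mask = torch.zeros(3, 2, dtype=torch.bool)
reordered_mask = padding_mask.index_select(0, new_order)
```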
- - Args: - encoder_out: output from the ``forward()`` method - new_order (LongTensor): desired order - - Returns: - *encoder_out* rearranged according to *new_order* - """ - if encoder_out.encoder_out is not None: - encoder_out = encoder_out._replace( - encoder_out=encoder_out.encoder_out.index_select(1, new_order) - ) - if encoder_out.encoder_padding_mask is not None: - encoder_out = encoder_out._replace( - encoder_padding_mask=encoder_out.encoder_padding_mask.index_select( - 0, new_order - ) - ) - if encoder_out.encoder_embedding is not None: - encoder_out = encoder_out._replace( - encoder_embedding=encoder_out.encoder_embedding.index_select( - 0, new_order - ) - ) - if encoder_out.encoder_states is not None: - for idx, state in enumerate(encoder_out.encoder_states): - encoder_out.encoder_states[idx] = state.index_select(1, new_order) - return encoder_out - - def max_positions(self): - """Maximum input length supported by the encoder.""" - if self.embedding_layer.embed_positions is None: - return self.embedding_layer.max_source_positions - return min( - self.embedding_layer.max_source_positions, - self.embedding_layer.embed_positions.max_positions, - ) - - -class TransformerDecoder(FairseqDecoder): - """ - Transformer decoder consisting of *args.decoder_layers* layers. Each layer - is a :class:`TransformerDecoderLayer`. - - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): decoding dictionary - embed_tokens (torch.nn.Embedding): output embedding - no_encoder_attn (bool, optional): whether to attend to encoder outputs - (default: False). - """ - - def __init__( - self, - args, - dictionary, - embed_tokens, - no_encoder_attn=False, - decoder_module_list=None, - ): - super().__init__(dictionary) - self.register_buffer("version", torch.Tensor([3])) - try: - from fairscale.nn import Pipe - except ImportError: - raise ImportError("Please install fairscale with: pip install fairscale") - if decoder_module_list is None: - embedding_layer = TransformerDecoderEmbedding(args, embed_tokens) - layers = [ - TransformerDecoderLayer(args, no_encoder_attn) - for _ in range(args.decoder_layers) - ] - decoder_output_layer = TransformerDecoderOutputLayer( - args, embed_tokens, dictionary - ) - decoder_module_list = [embedding_layer] + layers + [decoder_output_layer] - self.use_pipeline = getattr(args, "pipeline_decoder_balance", None) is not None - if self.use_pipeline: - decoder_balance = utils.eval_str_list( - args.pipeline_decoder_balance, type=int - ) - decoder_devices = utils.eval_str_list( - args.pipeline_decoder_devices, type=int - ) - assert sum(decoder_balance) == len(decoder_module_list), ( - f"Sum of decoder_balance={decoder_balance} is not equal " - + f"to num_decoder_modules={len(decoder_module_list)}" - ) - self.model = Pipe( - module=nn.Sequential(*decoder_module_list), - balance=decoder_balance, - devices=decoder_devices, - chunks=args.pipeline_chunks, - checkpoint=args.pipeline_checkpoint, - ) - else: - self.embedding_layer = decoder_module_list[0] - self.decoder_layers = nn.Sequential(*decoder_module_list[1:-1]) - self.decoder_output_layer = decoder_module_list[-1] - - def forward( - self, - prev_output_tokens, - encoder_out=None, - ): - """ - Args: - prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing - encoder_out (optional): output from the encoder, used for - encoder-side attention - incremental_state (dict): dictionary used for storing state during - :ref:`Incremental 
decoding` - features_only (bool, optional): only return features without - applying output layer (default: False). - - Returns: - tuple: - - the decoder's output of shape `(batch, tgt_len, vocab)` - - a dictionary with any model-specific outputs - """ - input_tuple = ( - encoder_out.encoder_out, - encoder_out.encoder_padding_mask, - prev_output_tokens, - ) - if self.use_pipeline: - input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple) - return (self.model(input_tuple),) - else: - embed_layer_output = self.embedding_layer(input_tuple) - state = self.decoder_layers(embed_layer_output) - return (self.decoder_output_layer(state),) - - def output_layer(self, features, **kwargs): - """Project features to the vocabulary size.""" - if self.adaptive_softmax is None: - # project back to size of vocabulary - if self.share_input_output_embed: - return F.linear(features, self.embed_tokens.weight) - else: - return F.linear(features, self.embed_out) - else: - return features - - def max_positions(self): - """Maximum output length supported by the decoder.""" - if self.embedding_layer.embed_positions is None: - return self.embedding_layer.max_target_positions - return min( - self.embedding_layer.max_target_positions, - self.embedding_layer.embed_positions.max_positions, - ) - - def buffered_future_mask(self, tensor): - dim = tensor.size(0) - if ( - not hasattr(self, "_future_mask") - or self._future_mask is None - or self._future_mask.device != tensor.device - or self._future_mask.size(0) < dim - ): - self._future_mask = torch.triu( - utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 - ) - return self._future_mask[:dim, :dim] - - def upgrade_state_dict_named(self, state_dict, name): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): - weights_key = "{}.embed_positions.weights".format(name) - if weights_key in state_dict: - del state_dict[weights_key] - state_dict[ - "{}.embed_positions._float_tensor".format(name) - ] = torch.FloatTensor(1) - - for i in range(len(self.layers)): - # update layer norms - layer_norm_map = { - "0": "self_attn_layer_norm", - "1": "encoder_attn_layer_norm", - "2": "final_layer_norm", - } - for old, new in layer_norm_map.items(): - for m in ("weight", "bias"): - k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m) - if k in state_dict: - state_dict[ - "{}.layers.{}.{}.{}".format(name, i, new, m) - ] = state_dict[k] - del state_dict[k] - - version_key = "{}.version".format(name) - if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: - # earlier checkpoints did not normalize after the stack of layers - self.layer_norm = None - self.normalize = False - state_dict[version_key] = torch.Tensor([1]) - - return state_dict - - -@register_model_architecture( - "pipeline_parallel_transformer", "transformer_iwslt_de_en_pipeline_parallel" -) -def transformer_iwslt_de_en_dist(args): - transformer_iwslt_de_en(args) - - -@register_model_architecture( - "pipeline_parallel_transformer", "transformer_wmt_en_de_big_pipeline_parallel" -) -def transformer_wmt_en_de_big_dist(args): - transformer_wmt_en_de_big(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/roberta/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/roberta/__init__.py deleted file mode 100644 index 
117827c3e9c176477f33e3a6fd7fe19a922411a2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/roberta/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .model import * # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/roberta/model.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/roberta/model.py deleted file mode 100644 index 68ad88d2a56516624e4a3a10cf4d5bec5f17d2e6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/roberta/model.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -RoBERTa: A Robustly Optimized BERT Pretraining Approach. -""" - -import logging - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.model_parallel.modules import ModelParallelTransformerSentenceEncoder -from fairseq.models import FairseqEncoder, register_model, register_model_architecture -from fairseq.models.roberta import ( - RobertaClassificationHead, - RobertaEncoder, - RobertaLMHead, - RobertaModel, -) -from fairseq.modules import LayerNorm, TransformerSentenceEncoder -from fairseq.modules.transformer_sentence_encoder import init_bert_params - - -try: - from fairseq.model_parallel.megatron.mpu import ( - copy_to_model_parallel_region, - gather_from_model_parallel_region, - ColumnParallelLinear, - RowParallelLinear, - ) - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - -logger = logging.getLogger(__name__) - - -@register_model("model_parallel_roberta") -class ModelParallelRobertaModel(RobertaModel): - def __init__(self, args, encoder): - super().__init__(args, encoder) - - self.classification_heads = nn.ModuleDict() - - @staticmethod - def add_args(parser): - super(ModelParallelRobertaModel, ModelParallelRobertaModel).add_args(parser) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present - base_architecture(args) - - task.source_dictionary.pad_to_multiple_(args.model_parallel_size * 8) - task.target_dictionary.pad_to_multiple_(args.model_parallel_size * 8) - - if not hasattr(args, "max_positions"): - args.max_positions = args.tokens_per_sample - - if getattr(args, "untie_weights_roberta", False): - raise NotImplementedError( - "--untie-weights-roberta is not supported in model parallel mode" - ) - - encoder = ModelParallelRobertaEncoder(args, task.source_dictionary) - return cls(args, encoder) - - def forward( - self, - src_tokens, - features_only=False, - return_all_hiddens=False, - classification_head_name=None, - **kwargs - ): - if classification_head_name is not None: - features_only = True - - x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs) - - if classification_head_name is not None: - x = self.classification_heads[classification_head_name](x) - return x, extra - - def register_classification_head( - self, name, 
num_classes=None, inner_dim=None, **kwargs - ): - """Register a classification head.""" - if name in self.classification_heads: - prev_num_classes = self.classification_heads[name].out_proj.out_features - prev_inner_dim = self.classification_heads[name].dense.out_features - if num_classes != prev_num_classes or inner_dim != prev_inner_dim: - logger.warning( - 're-registering head "{}" with num_classes {} (prev: {}) ' - "and inner_dim {} (prev: {})".format( - name, num_classes, prev_num_classes, inner_dim, prev_inner_dim - ) - ) - self.classification_heads[name] = ModelParallelRobertaClassificationHead( - self.args.encoder_embed_dim, - inner_dim or self.args.encoder_embed_dim, - num_classes, - self.args.pooler_activation_fn, - self.args.pooler_dropout, - ) - - -class ModelParallelRobertaLMHead(nn.Module): - """Head for masked language modeling.""" - - def __init__(self, embed_dim, output_dim, activation_fn, weight=None): - super().__init__() - self.dense = ColumnParallelLinear(embed_dim, embed_dim, gather_output=True) - self.activation_fn = utils.get_activation_fn(activation_fn) - self.layer_norm = LayerNorm(embed_dim) - - if weight is None: - weight = nn.Linear(embed_dim, output_dim, bias=False).weight - self.weight = weight - self.bias = nn.Parameter(torch.zeros(output_dim)) - - def forward(self, features, masked_tokens=None, **kwargs): - # Only project the unmasked tokens while training, - # saves both memory and computation - if masked_tokens is not None: - features = features[masked_tokens, :] - - x = self.dense(features) - x = self.activation_fn(x) - x = self.layer_norm(x) - - x = copy_to_model_parallel_region(x) - # project back to size of vocabulary with bias - x = F.linear(x, self.weight) - x = gather_from_model_parallel_region(x).contiguous() - x = x + self.bias - return x - - -class ModelParallelRobertaClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout - ): - super().__init__() - self.dense = ColumnParallelLinear(input_dim, inner_dim, gather_output=True) - self.activation_fn = utils.get_activation_fn(activation_fn) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = nn.Linear(inner_dim, num_classes) - - def forward(self, features, **kwargs): - x = features[:, 0, :] # take <s> token (equiv. to [CLS]) - x = self.dropout(x) - x = self.dense(x) - x = self.activation_fn(x) - x = self.dropout(x) - x = self.out_proj(x) - return x - - -class ModelParallelRobertaEncoder(FairseqEncoder): - """RoBERTa encoder. - - Implements the :class:`~fairseq.models.FairseqDecoder` interface required - by :class:`~fairseq.models.FairseqLanguageModel`. - """ - - def __init__(self, args, dictionary): - super().__init__(dictionary) - self.args = args - - # RoBERTa is a sentence encoder model, so users will intuitively trim - # encoder layers. However, the implementation uses the fairseq decoder, - # so we fix here.
- if args.encoder_layers_to_keep: - args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) - args.decoder_layers_to_keep = args.encoder_layers_to_keep - args.encoder_layers_to_keep = None - - self.sentence_encoder = ModelParallelTransformerSentenceEncoder( - padding_idx=dictionary.pad(), - vocab_size=len(dictionary), - num_encoder_layers=args.encoder_layers, - embedding_dim=args.encoder_embed_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=args.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - layerdrop=args.encoder_layerdrop, - max_seq_len=args.max_positions, - num_segments=0, - encoder_normalize_before=False, - apply_bert_init=False, - activation_fn=args.activation_fn, - ) - self.lm_head = ModelParallelRobertaLMHead( - embed_dim=args.encoder_embed_dim, - output_dim=len(dictionary), - activation_fn=args.activation_fn, - weight=self.sentence_encoder.embed_tokens.weight, - ) - - def forward( - self, - src_tokens, - features_only=False, - return_all_hiddens=False, - masked_tokens=None, - **unused - ): - """ - Args: - src_tokens (LongTensor): input tokens of shape `(batch, src_len)` - features_only (bool, optional): skip LM head and just return - features. If True, the output will be of shape - `(batch, src_len, embed_dim)`. - return_all_hiddens (bool, optional): also return all of the - intermediate hidden states (default: False). - - Returns: - tuple: - - the LM output of shape `(batch, src_len, vocab)` - - a dictionary of additional data, where 'inner_states' - is a list of hidden states. Note that the hidden - states have shape `(src_len, batch, vocab)`. - """ - x, extra = self.extract_features( - src_tokens, return_all_hiddens=return_all_hiddens - ) - if not features_only: - x = self.output_layer(x, masked_tokens=masked_tokens) - return x, extra - - def extract_features(self, src_tokens, return_all_hiddens=False, **unused): - inner_states, _ = self.sentence_encoder( - src_tokens, - last_state_only=not return_all_hiddens, - ) - features = inner_states[-1].transpose(0, 1) # T x B x C -> B x T x C - return features, {"inner_states": inner_states if return_all_hiddens else None} - - def output_layer(self, features, masked_tokens=None, **unused): - return self.lm_head(features, masked_tokens) - - def max_positions(self): - """Maximum output length supported by the encoder.""" - return self.args.max_positions - - -@register_model_architecture("model_parallel_roberta", "model_parallel_roberta") -def base_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) - args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) - - -@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_base") -def roberta_base_architecture(args): - 
base_architecture(args) - - -@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_large") -def roberta_large_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 24) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/transformer.py deleted file mode 100644 index 4f346452263a615ec7a3c695106c22b90b041286..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/transformer.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging - -import torch.nn as nn -import torch.nn.functional as F -from fairseq.model_parallel.modules import ( - ModelParallelTransformerDecoderLayer, - ModelParallelTransformerEncoderLayer, -) -from fairseq.models import register_model -from fairseq.models.transformer import ( - TransformerDecoder, - TransformerEncoder, - TransformerModel, -) - - -try: - from fairseq.model_parallel.megatron.mpu import ( - copy_to_model_parallel_region, - gather_from_model_parallel_region, - VocabParallelEmbedding, - ) - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -logger = logging.getLogger(__name__) - - -@register_model("model_parallel_transformer") -class ModelParallelTransformerModel(TransformerModel): - """ - Model parallel Transformer model. - """ - - @classmethod - def build_embedding(cls, args, dictionary, embed_dim, path=None): - if not has_megatron_submodule: - raise ImportError( - "\n\nPlease install the megatron submodule:" - "\n\n git submodule update --init " - "fairseq/model_parallel/megatron" - ) - dictionary.pad_to_multiple_(args.model_parallel_size * 8) - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - - def _vocab_init(tensor, **kwargs): - nn.init.normal_(tensor, mean=0, std=num_embeddings ** -0.5) - nn.init.constant_(tensor[1], 0) - - emb = VocabParallelEmbedding( - num_embeddings, embed_dim, padding_idx, init_method=_vocab_init - ) - # if provided, load from preloaded dictionaries - if path: - raise NotImplementedError( - "Loading of embedding from path is not supported for model parallel" - ) - return emb - - @classmethod - def build_encoder(cls, args, src_dict, embed_tokens): - return ModelParallelTransformerEncoder(args, src_dict, embed_tokens) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - return ModelParallelTransformerDecoder( - args, - tgt_dict, - embed_tokens, - no_encoder_attn=getattr(args, "no_cross_attention", False), - ) - - -class ModelParallelTransformerEncoder(TransformerEncoder): - """ - Model parallel Transformer encoder consisting of *args.encoder_layers* layers. Each layer - is a :class:`ModelParallelTransformerEncoderLayer`. 
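The `build_embedding` classmethod above pads the dictionary to a multiple of `model_parallel_size * 8` before constructing a `VocabParallelEmbedding`, so that the embedding rows split evenly across ranks. A small worked example of that rounding; the helper and the numbers are illustrative only:

```python
def padded_vocab_size(vocab_size, model_parallel_size, multiple=8):
    """Round the vocabulary up so each model-parallel rank gets an equal shard.

    Mirrors the effect of dictionary.pad_to_multiple_(model_parallel_size * 8):
    the embedding rows are split evenly across ranks, so the row count must
    divide cleanly.
    """
    chunk = model_parallel_size * multiple
    return ((vocab_size + chunk - 1) // chunk) * chunk

# e.g. a 50,261-entry dictionary with 4-way model parallelism is padded to 50,272,
# giving each of the 4 ranks exactly 12,568 embedding rows
assert padded_vocab_size(50261, 4) == 50272
assert padded_vocab_size(50261, 4) % 4 == 0
```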
- """ - - def build_encoder_layer(self, args): - return ModelParallelTransformerEncoderLayer(args) - - -class ModelParallelTransformerDecoder(TransformerDecoder): - """ - Model Parallel Transformer decoder consisting of *args.decoder_layers* layers. Each layer - is a :class:`ModelParallelTransformerDecoderLayer`. - """ - - def build_decoder_layer(self, args, no_encoder_attn=False): - return ModelParallelTransformerDecoderLayer(args, no_encoder_attn) - - def output_layer(self, features, **kwargs): - """Project features to the vocabulary size.""" - if not self.share_input_output_embed: - raise NotImplementedError( - "Model parallel training currently requires --share-decoder-input-output-embed" - ) - - features = copy_to_model_parallel_region(features) - - # project back to size of vocabulary - x = self.output_projection(features) - - if getattr(self.args, "criterion") != "vocab_parallel_cross_entropy": - x = gather_from_model_parallel_region(x).contiguous() - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/transformer_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/transformer_lm.py deleted file mode 100644 index 5db6efb7b16e9d8455734a3688b74dba2dafebbe..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/models/transformer_lm.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch.nn as nn -from fairseq.model_parallel.models.transformer import ModelParallelTransformerDecoder -from fairseq.models import register_model, register_model_architecture -from fairseq.models.transformer_lm import TransformerLanguageModel - - -try: - from fairseq.model_parallel.megatron.mpu import VocabParallelEmbedding - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -DEFAULT_MAX_TARGET_POSITIONS = 1024 - - -@register_model("model_parallel_transformer_lm") -class ModelParallelTransformerLanguageModel(TransformerLanguageModel): - - @staticmethod - def add_args(parser): - TransformerLanguageModel.add_args(parser) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - if not has_megatron_submodule: - raise ImportError( - "\n\nPlease install the megatron submodule:" - "\n\n git submodule update --init " - "fairseq/model_parallel/megatron" - ) - - # make sure all arguments are present in older models - base_lm_architecture(args) - - task.source_dictionary.pad_to_multiple_(args.model_parallel_size * 8) - task.target_dictionary.pad_to_multiple_(args.model_parallel_size * 8) - - if args.decoder_layers_to_keep: - args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) - - if getattr(args, "max_target_positions", None) is None: - args.max_target_positions = getattr( - args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS - ) - - if args.character_embeddings: - raise NotImplementedError( - "Character embeddings is not supported for model parallel" - ) - elif args.adaptive_input: - raise NotImplementedError( - "Adaptive input is not supported for model parallel" - ) - else: - embed_tokens = cls.build_embedding( - args, task.source_dictionary, args.decoder_input_dim - ) - - decoder = ModelParallelTransformerDecoder( - 
args, - task.target_dictionary, - embed_tokens, - no_encoder_attn=True, - ) - return cls(decoder) - - @classmethod - def build_embedding(cls, args, dictionary, embed_dim, path=None): - def _vocab_init(tensor, **kwargs): - nn.init.normal_(tensor, mean=0, std=embed_dim ** -0.5) - nn.init.constant_(tensor[1], 0) - - embed_tokens = VocabParallelEmbedding( - len(dictionary), embed_dim, dictionary.pad(), init_method=_vocab_init - ) - return embed_tokens - - -def base_lm_architecture(args): - # backward compatibility for older model checkpoints - if hasattr(args, "no_tie_adaptive_proj"): - # previous models defined --no-tie-adaptive-proj, so use the existence of - # that option to determine if this is an "old" model checkpoint - args.no_decoder_final_norm = True # old models always set this to True - if args.no_tie_adaptive_proj is False: - args.tie_adaptive_proj = True - if hasattr(args, "decoder_final_norm"): - args.no_decoder_final_norm = not args.decoder_final_norm - - args.activation_fn = getattr(args, "activation_fn", "relu") - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.relu_dropout = getattr(args, "relu_dropout", 0.0) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - # Model training is not stable without this - args.decoder_normalize_before = True - args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", False) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.character_embeddings = getattr(args, "character_embeddings", False) - args.character_filters = getattr( - args, - "character_filters", - "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]", - ) - args.character_embedding_dim = getattr(args, "character_embedding_dim", 4) - args.char_embedder_highway_layers = getattr(args, "char_embedder_highway_layers", 2) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4) - args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None) - args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) - args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) - args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None) - args.layernorm_embedding = getattr(args, "layernorm_embedding", False) - args.no_scale_embedding = getattr(args, "no_scale_embedding", False) - args.quant_noise_pq = getattr(args, "quant_noise_pq", 0.0) - args.quant_noise_pq_block_size = 
getattr(args, "quant_noise_pq_block_size", 8) - args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0.0) - args.add_bos_token = getattr(args, "add_bos_token", False) - - -@register_model_architecture("model_parallel_transformer_lm", "transformer_lm_megatron") -def transformer_lm_megatron(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 3072) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 3072 * 4) - args.decoder_layers = getattr(args, "decoder_layers", 72) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 32) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_fn = getattr(args, "activation_fn", "gelu") - base_lm_architecture(args) - - -@register_model_architecture( - "model_parallel_transformer_lm", "transformer_lm_megatron_11b" -) -def transformer_lm_megatron_11b(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 3072) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 3072 * 6) - args.decoder_layers = getattr(args, "decoder_layers", 72) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 32) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_fn = getattr(args, "activation_fn", "gelu") - base_lm_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/__init__.py deleted file mode 100644 index fb45b3c9e0e23956d71f845e46057f4286fccbe6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -"""isort:skip_file""" - -from .multihead_attention import ModelParallelMultiheadAttention -from .transformer_layer import ( - ModelParallelTransformerEncoderLayer, - ModelParallelTransformerDecoderLayer, -) -from .transformer_sentence_encoder_layer import ( - ModelParallelTransformerSentenceEncoderLayer, -) -from .transformer_sentence_encoder import ModelParallelTransformerSentenceEncoder - -__all__ = [ - "ModelParallelMultiheadAttention", - "ModelParallelTransformerEncoderLayer", - "ModelParallelTransformerDecoderLayer", - "ModelParallelTransformerSentenceEncoder", - "ModelParallelTransformerSentenceEncoderLayer", -] diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/multihead_attention.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/multihead_attention.py deleted file mode 100644 index 4164bf913136c0002ea1ffa9b3ed7b27ac140732..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/multihead_attention.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
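The `ModelParallelMultiheadAttention` module defined in the file below splits the attention heads across model-parallel ranks, with `ColumnParallelLinear` q/k/v projections and a `RowParallelLinear` output projection. A minimal sketch of that partition arithmetic, with sizes invented purely for illustration:

```python
embed_dim, num_heads, model_parallel_size = 1024, 16, 4

# each rank keeps only its slice of the heads ...
assert num_heads % model_parallel_size == 0, "heads must divide evenly across ranks"
num_heads_partition = num_heads // model_parallel_size  # 4 heads per rank
head_dim = embed_dim // num_heads                        # 64, unchanged per head

# ... so the q/k/v projections on one rank produce only its shard of the hidden
# dimension, and the output projection later sums the partial results back up
# to embed_dim across ranks.
per_rank_proj_dim = num_heads_partition * head_dim       # 256 on each of 4 ranks
print(num_heads_partition, head_dim, per_rank_proj_dim)
```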
- -from typing import Dict, Optional, Tuple - -import torch -import torch.nn.functional as F -from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state -from fairseq.modules.fairseq_dropout import FairseqDropout -from torch import Tensor, nn - - -try: - from fairseq.model_parallel.megatron.mpu import ( - get_cuda_rng_tracker, - get_model_parallel_world_size, - ColumnParallelLinear, - RowParallelLinear, - ) - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -@with_incremental_state -class ModelParallelMultiheadAttention(nn.Module): - """Model parallel Multi-headed attention. - This performs the Multi-headed attention over multiple gpus. - - See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details. - """ - - def __init__( - self, - embed_dim, - num_heads, - kdim=None, - vdim=None, - dropout=0.0, - bias=True, - self_attention=False, - encoder_decoder_attention=False, - ): - super().__init__() - if not has_megatron_submodule: - raise ImportError( - "\n\nPlease install the megatron submodule:" - "\n\n git submodule update --init " - "fairseq/model_parallel/megatron" - ) - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim - - self.model_parallel_size = get_model_parallel_world_size() - - self.num_heads_partition = num_heads // self.model_parallel_size - assert ( - self.num_heads_partition * self.model_parallel_size == num_heads - ), "Number of heads must be divisible by model parallel size" - - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 - - self.self_attention = self_attention - self.encoder_decoder_attention = encoder_decoder_attention - - assert ( - not self.self_attention or self.qkv_same_dim - ), "Self-attention requires query, key and value to be of the same size" - - self.k_proj = ColumnParallelLinear( - self.kdim, embed_dim, bias=bias, gather_output=False - ) - self.v_proj = ColumnParallelLinear( - self.vdim, embed_dim, bias=bias, gather_output=False - ) - self.q_proj = ColumnParallelLinear( - embed_dim, embed_dim, bias=bias, gather_output=False - ) - self.out_proj = RowParallelLinear( - embed_dim, embed_dim, bias=bias, input_is_parallel=True - ) - - self.tpu = False - - def prepare_for_tpu_(self, **kwargs): - self.tpu = True - - def forward( - self, - query, - key: Optional[Tensor], - value: Optional[Tensor], - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - static_kv: bool = False, - attn_mask: Optional[Tensor] = None, - **unused_kwargs, - ) -> Tuple[Tensor, Optional[Tensor]]: - """Input shape: Time x Batch x Channel - - Args: - key_padding_mask (ByteTensor, optional): mask to exclude - keys that are pads, of shape `(batch, src_len)`, where - padding elements are indicated by 1s. - attn_mask (ByteTensor, optional): typically used to - implement causal attention, where the mask prevents the - attention from looking forward in time (default: None). 
- """ - tgt_len, bsz, embed_dim = query.size() - assert embed_dim == self.embed_dim - assert list(query.size()) == [tgt_len, bsz, embed_dim] - - if incremental_state is not None: - saved_state = self._get_input_buffer(incremental_state) - if saved_state is not None and "prev_key" in saved_state: - # previous time steps are cached - no need to recompute - # key and value if they are static - if static_kv: - assert self.encoder_decoder_attention and not self.self_attention - key = value = None - else: - saved_state = None - - if self.self_attention: - q = self.q_proj(query) - k = self.k_proj(query) - v = self.v_proj(query) - elif self.encoder_decoder_attention: - # encoder-decoder attention - q = self.q_proj(query) - if key is None: - assert value is None - k = v = None - else: - k = self.k_proj(key) - v = self.v_proj(key) - - else: - assert key is not None and value is not None - q = self.q_proj(query) - k = self.k_proj(key) - v = self.v_proj(value) - q *= self.scaling - - q = ( - q.contiguous() - .view(tgt_len, bsz * self.num_heads_partition, self.head_dim) - .transpose(0, 1) - ) - if k is not None: - k = ( - k.contiguous() - .view(-1, bsz * self.num_heads_partition, self.head_dim) - .transpose(0, 1) - ) - if v is not None: - v = ( - v.contiguous() - .view(-1, bsz * self.num_heads_partition, self.head_dim) - .transpose(0, 1) - ) - - if saved_state is not None: - # saved states are stored with shape (bsz, num_heads_partition, seq_len, head_dim) - if "prev_key" in saved_state: - _prev_key = saved_state["prev_key"] - assert _prev_key is not None - prev_key = _prev_key.view( - bsz * self.num_heads_partition, -1, self.head_dim - ) - if static_kv: - k = prev_key - else: - assert k is not None - k = torch.cat([prev_key, k], dim=1) - if "prev_value" in saved_state: - _prev_value = saved_state["prev_value"] - assert _prev_value is not None - prev_value = _prev_value.view( - bsz * self.num_heads_partition, -1, self.head_dim - ) - if static_kv: - v = prev_value - else: - assert v is not None - v = torch.cat([prev_value, v], dim=1) - prev_key_padding_mask: Optional[Tensor] = None - if "prev_key_padding_mask" in saved_state: - prev_key_padding_mask = saved_state["prev_key_padding_mask"] - assert k is not None and v is not None - key_padding_mask = ( - ModelParallelMultiheadAttention._append_prev_key_padding_mask( - key_padding_mask=key_padding_mask, - prev_key_padding_mask=prev_key_padding_mask, - batch_size=bsz, - src_len=k.size(1), - static_kv=static_kv, - ) - ) - - saved_state["prev_key"] = k.view( - bsz, self.num_heads_partition, -1, self.head_dim - ) - saved_state["prev_value"] = v.view( - bsz, self.num_heads_partition, -1, self.head_dim - ) - saved_state["prev_key_padding_mask"] = key_padding_mask - # In this branch incremental_state is never None - assert incremental_state is not None - incremental_state = self._set_input_buffer(incremental_state, saved_state) - assert k is not None - src_len = k.size(1) - - # This is part of a workaround to get around fork/join parallelism - # not supporting Optional types. 
- if key_padding_mask is not None and key_padding_mask.dim() == 0: - key_padding_mask = None - - if key_padding_mask is not None: - assert key_padding_mask.size(0) == bsz - assert key_padding_mask.size(1) == src_len - - attn_weights = torch.bmm(q, k.transpose(1, 2)) - - assert list(attn_weights.size()) == [ - bsz * self.num_heads_partition, - tgt_len, - src_len, - ] - - if attn_mask is not None: - attn_mask = attn_mask.unsqueeze(0) - attn_weights += attn_mask - - if key_padding_mask is not None: - # don't attend to padding symbols - attn_weights = attn_weights.view( - bsz, self.num_heads_partition, tgt_len, src_len - ) - if not self.tpu: - attn_weights = attn_weights.masked_fill( - key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), - float("-inf"), - ) - else: - attn_weights = attn_weights.transpose(0, 2) - attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) - attn_weights = attn_weights.transpose(0, 2) - attn_weights = attn_weights.view( - bsz * self.num_heads_partition, tgt_len, src_len - ) - - attn_weights_float = utils.softmax(attn_weights, dim=-1) - attn_weights = attn_weights_float.type_as(attn_weights) - - with get_cuda_rng_tracker().fork(): - attn_probs = self.dropout_module(attn_weights) - - assert v is not None - attn = torch.bmm(attn_probs, v) - assert list(attn.size()) == [ - bsz * self.num_heads_partition, - tgt_len, - self.head_dim, - ] - embed_dim_partition = embed_dim // self.model_parallel_size - attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim_partition) - attn = self.out_proj(attn) - # return attn_weights None to keep the return type same as single gpu multihead attention - # This will be deprecated. - attn_weights: Optional[Tensor] = None - - return attn, attn_weights - - @staticmethod - def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], - batch_size: int, - src_len: int, - static_kv: bool, - ) -> Optional[Tensor]: - # saved key padding masks have shape (bsz, seq_len) - if prev_key_padding_mask is not None and static_kv: - new_key_padding_mask = prev_key_padding_mask - elif prev_key_padding_mask is not None and key_padding_mask is not None: - new_key_padding_mask = torch.cat( - [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1 - ) - # During incremental decoding, as the padding token enters and - # leaves the frame, there will be a time when prev or current - # is None - elif prev_key_padding_mask is not None: - - filler = torch.zeros(batch_size, src_len - prev_key_padding_mask.size(1)) - if prev_key_padding_mask.is_cuda: - filler = filler.cuda() - new_key_padding_mask = torch.cat( - [prev_key_padding_mask.float(), filler.float()], dim=1 - ) - elif key_padding_mask is not None: - filler = torch.zeros(batch_size, src_len - key_padding_mask.size(1)) - if key_padding_mask.is_cuda: - filler = filler.cuda() - new_key_padding_mask = torch.cat( - [filler.float(), key_padding_mask.float()], dim=1 - ) - else: - new_key_padding_mask = prev_key_padding_mask - return new_key_padding_mask - - def reorder_incremental_state( - self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order - ): - """Reorder buffered internal state (for incremental generation).""" - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is not None: - for k in input_buffer.keys(): - if input_buffer[k] is not None: - input_buffer[k] = input_buffer[k].index_select(0, new_order) - incremental_state = self._set_input_buffer(incremental_state, 
input_buffer) - return incremental_state - - def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] - ) -> Dict[str, Optional[Tensor]]: - result = self.get_incremental_state(incremental_state, "attn_state") - if result is not None: - return result - else: - empty_result: Dict[str, Optional[Tensor]] = {} - return empty_result - - def _set_input_buffer( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], - ): - return self.set_incremental_state(incremental_state, "attn_state", buffer) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_layer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_layer.py deleted file mode 100644 index 7ab53c6e5f12f15562717effb86ab8cb8d6b4fa3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_layer.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.model_parallel.modules import ModelParallelMultiheadAttention -from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer - - -try: - from fairseq.model_parallel.megatron.mpu import ( - ColumnParallelLinear, - RowParallelLinear, - ) - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -class ModelParallelTransformerEncoderLayer(TransformerEncoderLayer): - """Encoder layer block over multiple gpus. - - See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details. - """ - - def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): - if q_noise > 0: - raise NotImplementedError - return ColumnParallelLinear(input_dim, output_dim, gather_output=False) - - def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): - if q_noise > 0: - raise NotImplementedError - return RowParallelLinear(input_dim, output_dim, input_is_parallel=True) - - def build_self_attention(self, embed_dim, args, **unused_kwargs): - return ModelParallelMultiheadAttention( - embed_dim, - args.encoder_attention_heads, - dropout=args.attention_dropout, - self_attention=True, - ) - - -class ModelParallelTransformerDecoderLayer(TransformerDecoderLayer): - """Decoder layer block. - - See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details. 
- """ - - def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): - if q_noise > 0: - raise NotImplementedError - return ColumnParallelLinear(input_dim, output_dim, gather_output=False) - - def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): - if q_noise > 0: - raise NotImplementedError - return RowParallelLinear(input_dim, output_dim, input_is_parallel=True) - - def build_self_attention(self, embed_dim, args, **unused_kwargs): - return ModelParallelMultiheadAttention( - embed_dim=embed_dim, - num_heads=args.decoder_attention_heads, - dropout=args.attention_dropout, - self_attention=not getattr(args, "cross_self_attention", False), - ) - - def build_encoder_attention(self, embed_dim, args, **unused_kwargs): - return ModelParallelMultiheadAttention( - embed_dim=embed_dim, - num_heads=args.decoder_attention_heads, - kdim=getattr(args, "encoder_embed_dim", None), - vdim=getattr(args, "encoder_embed_dim", None), - dropout=args.attention_dropout, - encoder_decoder_attention=True, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_sentence_encoder.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_sentence_encoder.py deleted file mode 100644 index a5d50a33c6cd45151bb2757214eda59f101291f4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_sentence_encoder.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import random -from typing import Optional, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq.model_parallel.modules import ModelParallelTransformerSentenceEncoderLayer -from fairseq.modules import ( - LayerNorm, - MultiheadAttention, - PositionalEmbedding, - TransformerSentenceEncoder, -) - - -try: - from fairseq.model_parallel.megatron.mpu import VocabParallelEmbedding - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -class ModelParallelTransformerSentenceEncoder(TransformerSentenceEncoder): - """ - Implementation for a Model Parallel Bi-directional Transformer based - Sentence Encoder used in BERT/XLM style pre-trained models. 
- """ - - def build_embedding(self, vocab_size, embedding_dim, padding_idx): - return VocabParallelEmbedding(vocab_size, embedding_dim, padding_idx) - - def build_transformer_sentence_encoder_layer( - self, - embedding_dim, - ffn_embedding_dim, - num_attention_heads, - dropout, - attention_dropout, - activation_dropout, - activation_fn, - export, - **unused, - ): - return ModelParallelTransformerSentenceEncoderLayer( - embedding_dim=embedding_dim, - ffn_embedding_dim=ffn_embedding_dim, - num_attention_heads=num_attention_heads, - dropout=dropout, - attention_dropout=attention_dropout, - activation_dropout=activation_dropout, - activation_fn=activation_fn, - export=export, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py deleted file mode 100644 index e10bf5233210b6ec2ac4fe4b0c59dc4bb6d8e4c8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn.functional as F -from fairseq import utils -from fairseq.model_parallel.modules import ModelParallelMultiheadAttention -from fairseq.modules import TransformerSentenceEncoderLayer - - -try: - from fairseq.model_parallel.megatron.mpu import ( - ColumnParallelLinear, - RowParallelLinear, - ) - - has_megatron_submodule = True -except (ImportError, ModuleNotFoundError): - has_megatron_submodule = False - - -class ModelParallelTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer): - """ - Implements a Model Parallel Transformer Encoder Layer used in - BERT/XLM style pre-trained models. - """ - - def build_fc1(self, input_dim, output_dim, **unused): - return ColumnParallelLinear(input_dim, output_dim, gather_output=False) - - def build_fc2(self, input_dim, output_dim, **unused): - return RowParallelLinear(input_dim, output_dim, input_is_parallel=True) - - def build_self_attention( - self, - embed_dim, - num_attention_heads, - dropout, - **kwargs, - ): - return ModelParallelMultiheadAttention( - embed_dim, num_attention_heads, dropout=dropout, self_attention=True - ) - - def forward( - self, - x: torch.Tensor, - self_attn_mask: torch.Tensor = None, - self_attn_padding_mask: torch.Tensor = None, - ): - """ - LayerNorm is applied either before or after the self-attention/ffn - modules similar to the original Transformer imlementation. 
- """ - residual = x - x = self.self_attn_layer_norm(x) - x, attn = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=self_attn_padding_mask, - need_weights=False, - attn_mask=self_attn_mask, - ) - x = self.dropout_module(x) - x = residual + x - - residual = x - x = self.final_layer_norm(x) - x = self.activation_fn(self.fc1(x)) - x = self.activation_dropout_module(x) - x = self.fc2(x) - x = self.dropout_module(x) - x = residual + x - return x, None diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/__init__.py deleted file mode 100644 index 7ff94427115ca9d750a403725afac32a17b80e0d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/__init__.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -"""isort:skip_file""" - -import argparse -import importlib -import os -from argparse import Namespace -from typing import Union - -import fairseq -from fairseq.dataclass import FairseqDataclass -from omegaconf import DictConfig, OmegaConf - -from .composite_encoder import CompositeEncoder -from .distributed_fairseq_model import DistributedFairseqModel -from .fairseq_decoder import FairseqDecoder -from .fairseq_encoder import FairseqEncoder -from .fairseq_incremental_decoder import FairseqIncrementalDecoder -from .fairseq_model import ( - BaseFairseqModel, - FairseqEncoderDecoderModel, - FairseqEncoderModel, - FairseqLanguageModel, - FairseqModel, - FairseqMultiModel, -) - - -MODEL_REGISTRY = {} -MODEL_DATACLASS_REGISTRY = {} -ARCH_MODEL_REGISTRY = {} -ARCH_MODEL_NAME_REGISTRY = {} -ARCH_MODEL_INV_REGISTRY = {} -ARCH_CONFIG_REGISTRY = {} - - -__all__ = [ - "BaseFairseqModel", - "CompositeEncoder", - "DistributedFairseqModel", - "FairseqDecoder", - "FairseqEncoder", - "FairseqEncoderDecoderModel", - "FairseqEncoderModel", - "FairseqIncrementalDecoder", - "FairseqLanguageModel", - "FairseqModel", - "FairseqMultiModel", -] - - -def build_model(model_cfg: Union[DictConfig, Namespace], task): - if isinstance(model_cfg, DictConfig): - return ARCH_MODEL_REGISTRY[model_cfg._name].build_model(model_cfg, task) - return ARCH_MODEL_REGISTRY[model_cfg.arch].build_model(model_cfg, task) - - -def register_model(name, dataclass=None): - """ - New model types can be added to fairseq with the :func:`register_model` - function decorator. - - For example:: - - @register_model('lstm') - class LSTM(FairseqEncoderDecoderModel): - (...) - - .. note:: All models must implement the :class:`BaseFairseqModel` interface. - Typically you will extend :class:`FairseqEncoderDecoderModel` for - sequence-to-sequence tasks or :class:`FairseqLanguageModel` for - language modeling tasks. 
- - Args: - name (str): the name of the model - """ - - def register_model_cls(cls): - if name in MODEL_REGISTRY: - raise ValueError("Cannot register duplicate model ({})".format(name)) - if not issubclass(cls, BaseFairseqModel): - raise ValueError( - "Model ({}: {}) must extend BaseFairseqModel".format(name, cls.__name__) - ) - MODEL_REGISTRY[name] = cls - if dataclass is not None and not issubclass(dataclass, FairseqDataclass): - raise ValueError( - "Dataclass {} must extend FairseqDataclass".format(dataclass) - ) - - cls.__dataclass = dataclass - MODEL_DATACLASS_REGISTRY[name] = dataclass - return cls - - return register_model_cls - - -def register_model_architecture(model_name, arch_name): - """ - New model architectures can be added to fairseq with the - :func:`register_model_architecture` function decorator. After registration, - model architectures can be selected with the ``--arch`` command-line - argument. - - For example:: - - @register_model_architecture('lstm', 'lstm_luong_wmt_en_de') - def lstm_luong_wmt_en_de(args): - args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 1000) - (...) - - The decorated function should take a single argument *args*, which is a - :class:`argparse.Namespace` of arguments parsed from the command-line. The - decorated function should modify these arguments in-place to match the - desired architecture. - - Args: - model_name (str): the name of the Model (Model must already be - registered) - arch_name (str): the name of the model architecture (``--arch``) - """ - - def arch_override_from_yaml(args, arch): - root_dir = os.path.dirname(os.path.dirname(fairseq.__file__)) - yaml_path = os.path.join(root_dir, "config/model/{}.yaml".format(arch)) - if not os.path.exists(yaml_path): - raise RuntimeError(f"yaml file {yaml_path} does not exist!") - arch_cfg = OmegaConf.load(yaml_path) - for k, v in arch_cfg.items(): - setattr(args, k, getattr(args, k, v)) - - def register_model_arch_fn(fn): - if model_name not in MODEL_REGISTRY: - raise ValueError( - "Cannot register model architecture for unknown model type ({})".format( - model_name - ) - ) - if arch_name in ARCH_MODEL_REGISTRY: - raise ValueError( - "Cannot register duplicate model architecture ({})".format(arch_name) - ) - if not callable(fn): - raise ValueError( - "Model architecture must be callable ({})".format(arch_name) - ) - ARCH_MODEL_REGISTRY[arch_name] = MODEL_REGISTRY[model_name] - ARCH_MODEL_NAME_REGISTRY[arch_name] = model_name - ARCH_MODEL_INV_REGISTRY.setdefault(model_name, []).append(arch_name) - if type(fn) is type and issubclass(fn, BaseFairseqModel): - # for model classes migrated with hydra - # in this case, we are using this decorator directly on model class since - # we do not need arch overriding functions. - ARCH_CONFIG_REGISTRY[arch_name] = lambda args: arch_override_from_yaml( - args, arch=arch_name - ) - else: - ARCH_CONFIG_REGISTRY[arch_name] = fn - return fn - - return register_model_arch_fn - - -# automatically import any Python files in the models/ directory -models_dir = os.path.dirname(__file__) -for file in os.listdir(models_dir): - path = os.path.join(models_dir, file) - if ( - not file.startswith("_") - and not file.startswith(".") - and (file.endswith(".py") or os.path.isdir(path)) - ): - model_name = file[: file.find(".py")] if file.endswith(".py") else file - module = importlib.import_module("fairseq.models." 
+ model_name) - - # extra `model_parser` for sphinx - if model_name in MODEL_REGISTRY: - parser = argparse.ArgumentParser(add_help=False) - group_archs = parser.add_argument_group("Named architectures") - group_archs.add_argument( - "--arch", choices=ARCH_MODEL_INV_REGISTRY[model_name] - ) - group_args = parser.add_argument_group("Additional command-line arguments") - MODEL_REGISTRY[model_name].add_args(group_args) - globals()[model_name + "_parser"] = parser diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/__init__.py deleted file mode 100644 index a701923f7e5a2a8aa9b75e5580ddea22907f53ee..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .hub_interface import * # noqa -from .model import * # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/hub_interface.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/hub_interface.py deleted file mode 100644 index cdabe36010bdfde5680f7fd6439b9b2c56c660bd..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/hub_interface.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import copy -import logging -from typing import List - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.data import encoders - - -logger = logging.getLogger(__name__) - - -class BARTHubInterface(nn.Module): - """A simple PyTorch Hub interface to BART. - - Usage: https://github.com/pytorch/fairseq/tree/master/examples/bart - """ - - def __init__(self, args, task, model): - super().__init__() - self.args = args - self.task = task - self.model = model - - self.bpe = encoders.build_bpe(args) - - self.max_positions = min( - utils.resolve_max_positions( - self.task.max_positions(), - self.model.max_positions(), - ) - ) - - # this is useful for determining the device - self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) - - @property - def device(self): - return self._float_tensor.device - - def encode( - self, sentence: str, *addl_sentences, no_separator=True - ) -> torch.LongTensor: - """ - BPE-encode a sentence (or multiple sentences). - - Every sequence begins with a beginning-of-sentence (``) symbol. - Every sentence ends with an end-of-sentence (``). - - Example (single sentence): ` a b c ` - Example (sentence pair): ` d e f 1 2 3 ` - - The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE - requires leading spaces. 
For example:: - - >>> bart.encode('Hello world').tolist() - [0, 31414, 232, 2] - >>> bart.encode(' world').tolist() - [0, 232, 2] - >>> bart.encode('world').tolist() - [0, 8331, 2] - """ - tokens = self.bpe.encode(sentence) - if len(tokens.split(" ")) > self.max_positions - 2: - tokens = " ".join(tokens.split(" ")[: self.max_positions - 2]) - bpe_sentence = " " + tokens + " " - for s in addl_sentences: - bpe_sentence += " " if not no_separator else "" - bpe_sentence += " " + self.bpe.encode(s) + " " - tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False) - return tokens.long() - - def decode(self, tokens: torch.LongTensor): - assert tokens.dim() == 1 - tokens = tokens.cpu().numpy() - if tokens[0] == self.task.source_dictionary.bos(): - tokens = tokens[1:] # remove - eos_mask = tokens == self.task.source_dictionary.eos() - doc_mask = eos_mask[1:] & eos_mask[:-1] - sentences = np.split(tokens, doc_mask.nonzero()[0] + 1) - sentences = [ - self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences - ] - if len(sentences) == 1: - return sentences[0] - return sentences - - def _build_sample(self, src_tokens: List[torch.LongTensor]): - # assert torch.is_tensor(src_tokens) - dataset = self.task.build_dataset_for_inference( - src_tokens, - [x.numel() for x in src_tokens], - ) - sample = dataset.collater(dataset) - sample = utils.apply_to_sample(lambda tensor: tensor.to(self.device), sample) - return sample - - def sample( - self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs - ) -> str: - input = [self.encode(sentence) for sentence in sentences] - hypos = self.generate(input, beam, verbose, **kwargs) - return [self.decode(x["tokens"]) for x in hypos] - - def generate( - self, - tokens: List[torch.LongTensor], - beam: int = 5, - verbose: bool = False, - **kwargs - ) -> torch.LongTensor: - sample = self._build_sample(tokens) - - # build generator using current args as well as any kwargs - gen_args = copy.copy(self.args) - gen_args.beam = beam - for k, v in kwargs.items(): - setattr(gen_args, k, v) - generator = self.task.build_generator([self.model], gen_args) - translations = self.task.inference_step( - generator, - [self.model], - sample, - prefix_tokens=sample["net_input"]["src_tokens"] - .new_zeros((len(tokens), 1)) - .fill_(self.task.source_dictionary.bos()), - ) - - if verbose: - src_str_with_unk = self.string(tokens) - logger.info("S\t{}".format(src_str_with_unk)) - - def getarg(name, default): - return getattr(gen_args, name, getattr(self.args, name, default)) - - # Process top predictions - hypos = [x[0] for x in translations] - hypos = [v for _, v in sorted(zip(sample["id"].tolist(), hypos))] - return hypos - - def extract_features( - self, tokens: torch.LongTensor, return_all_hiddens: bool = False - ) -> torch.Tensor: - if tokens.dim() == 1: - tokens = tokens.unsqueeze(0) - if tokens.size(-1) > min(self.model.max_positions()): - raise ValueError( - "tokens exceeds maximum length: {} > {}".format( - tokens.size(-1), self.model.max_positions() - ) - ) - tokens.to(device=self.device), - prev_output_tokens = tokens.clone() - - prev_output_tokens[:, 0] = tokens.gather( - 1, - (tokens.ne(self.task.source_dictionary.pad()).sum(dim=1) - 1).unsqueeze(-1), - ).squeeze() - - prev_output_tokens[:, 1:] = tokens[:, :-1] - features, extra = self.model( - src_tokens=tokens, - src_lengths=None, - prev_output_tokens=prev_output_tokens, - features_only=True, - return_all_hiddens=return_all_hiddens, - ) - if return_all_hiddens: - # convert from 
T x B x C -> B x T x C - inner_states = extra["inner_states"] - return [inner_state.transpose(0, 1) for inner_state in inner_states] - else: - return features # just the last layer's features - - def register_classification_head( - self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs - ): - self.model.register_classification_head( - name, num_classes=num_classes, embedding_size=embedding_size, **kwargs - ) - - def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False): - if tokens.dim() == 1: - tokens = tokens.unsqueeze(0) - features = self.extract_features(tokens.to(device=self.device)) - sentence_representation = features[ - tokens.eq(self.task.source_dictionary.eos()), : - ].view(features.size(0), -1, features.size(-1))[:, -1, :] - - logits = self.model.classification_heads[head](sentence_representation) - if return_logits: - return logits - return F.log_softmax(logits, dim=-1) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/model.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/model.py deleted file mode 100644 index 0f22352b68187a8edc79db97beba5a8d9ff9ded6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/bart/model.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -BART: Denoising Sequence-to-Sequence Pre-training for -Natural Language Generation, Translation, and Comprehension -""" - -import logging - -import torch -import torch.nn as nn -from fairseq import utils -from fairseq.models import register_model, register_model_architecture -from fairseq.models.transformer import TransformerModel -from fairseq.modules.transformer_sentence_encoder import init_bert_params - -from .hub_interface import BARTHubInterface - - -logger = logging.getLogger(__name__) - - -@register_model("bart") -class BARTModel(TransformerModel): - @classmethod - def hub_models(cls): - return { - "bart.base": "http://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz", - "bart.large": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz", - "bart.large.mnli": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz", - "bart.large.cnn": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz", - "bart.large.xsum": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz", - } - - def __init__(self, args, encoder, decoder): - super().__init__(args, encoder, decoder) - - # We follow BERT's random weight initialization - self.apply(init_bert_params) - - self.classification_heads = nn.ModuleDict() - - @staticmethod - def add_args(parser): - super(BARTModel, BARTModel).add_args(parser) - parser.add_argument( - "--pooler-dropout", - type=float, - metavar="D", - help="dropout probability in the masked_lm pooler layers", - ) - parser.add_argument( - "--pooler-activation-fn", - choices=utils.get_available_activation_fns(), - help="activation function to use for pooler layer", - ) - parser.add_argument( - "--spectral-norm-classification-head", - action="store_true", - help="Apply spectral normalization on the classification head", - ) - - @property - def supported_targets(self): - return {"self"} - - def forward( - self, - src_tokens, - src_lengths, - prev_output_tokens, - 
features_only=False, - classification_head_name=None, - token_embeddings=None, - **kwargs, - ): - if classification_head_name is not None: - features_only = True - - encoder_out = self.encoder( - src_tokens, - src_lengths=src_lengths, - token_embeddings=token_embeddings, - **kwargs, - ) - x, extra = self.decoder( - prev_output_tokens, - encoder_out=encoder_out, - features_only=features_only, - **kwargs, - ) - - if classification_head_name is not None: - sentence_representation = x[ - src_tokens.eq(self.encoder.dictionary.eos()), : - ].view(x.size(0), -1, x.size(-1))[:, -1, :] - x = self.classification_heads[classification_head_name]( - sentence_representation - ) - return x, extra - - @classmethod - def from_pretrained( - cls, - model_name_or_path, - checkpoint_file="model.pt", - data_name_or_path=".", - bpe="gpt2", - **kwargs, - ): - from fairseq import hub_utils - - x = hub_utils.from_pretrained( - model_name_or_path, - checkpoint_file, - data_name_or_path, - archive_map=cls.hub_models(), - bpe=bpe, - load_checkpoint_heads=True, - **kwargs, - ) - return BARTHubInterface(x["args"], x["task"], x["models"][0]) - - def register_classification_head( - self, name, num_classes=None, inner_dim=None, **kwargs - ): - """Register a classification head.""" - logger.info("Registering classification head: {0}".format(name)) - if name in self.classification_heads: - prev_num_classes = self.classification_heads[name].out_proj.out_features - prev_inner_dim = self.classification_heads[name].dense.out_features - if num_classes != prev_num_classes or inner_dim != prev_inner_dim: - logger.warning( - 're-registering head "{}" with num_classes {} (prev: {}) ' - "and inner_dim {} (prev: {})".format( - name, num_classes, prev_num_classes, inner_dim, prev_inner_dim - ) - ) - self.classification_heads[name] = BARTClassificationHead( - input_dim=self.args.encoder_embed_dim, - inner_dim=inner_dim or self.args.encoder_embed_dim, - num_classes=num_classes, - activation_fn=self.args.pooler_activation_fn, - pooler_dropout=self.args.pooler_dropout, - do_spectral_norm=self.args.spectral_norm_classification_head, - ) - - def upgrade_state_dict_named(self, state_dict, name): - super().upgrade_state_dict_named(state_dict, name) - - prefix = name + "." if name != "" else "" - current_head_names = ( - [] - if not hasattr(self, "classification_heads") - else self.classification_heads.keys() - ) - - # Handle new classification heads present in the state dict. - keys_to_delete = [] - for k in state_dict.keys(): - if not k.startswith(prefix + "classification_heads."): - continue - - head_name = k[len(prefix + "classification_heads.") :].split(".")[0] - num_classes = state_dict[ - prefix + "classification_heads." + head_name + ".out_proj.weight" - ].size(0) - inner_dim = state_dict[ - prefix + "classification_heads." 
+ head_name + ".dense.weight" - ].size(0) - - if getattr(self.args, "load_checkpoint_heads", False): - if head_name not in current_head_names: - self.register_classification_head(head_name, num_classes, inner_dim) - else: - if head_name not in current_head_names: - logger.warning( - "deleting classification head ({}) from checkpoint " - "not present in current model: {}".format(head_name, k) - ) - keys_to_delete.append(k) - elif ( - num_classes - != self.classification_heads[head_name].out_proj.out_features - or inner_dim - != self.classification_heads[head_name].dense.out_features - ): - logger.warning( - "deleting classification head ({}) from checkpoint " - "with different dimensions than current model: {}".format( - head_name, k - ) - ) - keys_to_delete.append(k) - for k in keys_to_delete: - del state_dict[k] - - def truncate_emb(key): - if key in state_dict: - state_dict[key] = state_dict[key][:-1, :] - - # When finetuning on translation task, remove last row of - # embedding matrix that corresponds to mask_idx token. - loaded_dict_size = state_dict["encoder.embed_tokens.weight"].size(0) - if ( - loaded_dict_size == len(self.encoder.dictionary) + 1 - and "" not in self.encoder.dictionary - ): - truncate_emb("encoder.embed_tokens.weight") - truncate_emb("decoder.embed_tokens.weight") - truncate_emb("encoder.output_projection.weight") - truncate_emb("decoder.output_projection.weight") - - # When continued pretraining on new set of languages for mbart, - # add extra lang embeddings at the end of embed_tokens. - # Note: newly added languages are assumed to have been added at the end. - if self.args.task == "multilingual_denoising" and loaded_dict_size < len( - self.encoder.dictionary - ): - logger.info( - "Adding extra language embeddings not found in pretrained model for " - "continued pretraining of MBART on new set of languages." - ) - loaded_mask_token_embedding = state_dict["encoder.embed_tokens.weight"][ - -1, : - ] - - num_langids_to_add = len(self.encoder.dictionary) - loaded_dict_size - embed_dim = state_dict["encoder.embed_tokens.weight"].size(1) - - new_lang_embed_to_add = torch.zeros(num_langids_to_add, embed_dim) - nn.init.normal_(new_lang_embed_to_add, mean=0, std=embed_dim ** -0.5) - new_lang_embed_to_add = new_lang_embed_to_add.to( - dtype=state_dict["encoder.embed_tokens.weight"].dtype, - ) - - state_dict["encoder.embed_tokens.weight"] = torch.cat( - [ - state_dict["encoder.embed_tokens.weight"][ - : loaded_dict_size - 1, : - ], - new_lang_embed_to_add, - loaded_mask_token_embedding.unsqueeze(0), - ] - ) - state_dict["decoder.embed_tokens.weight"] = torch.cat( - [ - state_dict["decoder.embed_tokens.weight"][ - : loaded_dict_size - 1, : - ], - new_lang_embed_to_add, - loaded_mask_token_embedding.unsqueeze(0), - ] - ) - - # Copy any newly-added classification heads into the state dict - # with their current weights. - if hasattr(self, "classification_heads"): - cur_state = self.classification_heads.state_dict() - for k, v in cur_state.items(): - if prefix + "classification_heads." + k not in state_dict: - logger.info("Overwriting", prefix + "classification_heads." + k) - state_dict[prefix + "classification_heads." 
+ k] = v - - -class BARTClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, - input_dim, - inner_dim, - num_classes, - activation_fn, - pooler_dropout, - do_spectral_norm=False, - ): - super().__init__() - self.dense = nn.Linear(input_dim, inner_dim) - self.activation_fn = utils.get_activation_fn(activation_fn) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = nn.Linear(inner_dim, num_classes) - - if do_spectral_norm: - self.out_proj = torch.nn.utils.spectral_norm(self.out_proj) - - def forward(self, features, **kwargs): - x = features - x = self.dropout(x) - x = self.dense(x) - x = self.activation_fn(x) - x = self.dropout(x) - x = self.out_proj(x) - return x - - -@register_model_architecture("bart", "bart_large") -def bart_large_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 1024) - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 12) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", True) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.relu_dropout = getattr(args, "relu_dropout", 0.0) - args.dropout = getattr(args, "dropout", 0.1) - args.max_target_positions = getattr(args, "max_target_positions", 1024) - args.max_source_positions = getattr(args, "max_source_positions", 1024) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", True - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", True) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - args.no_scale_embedding = getattr(args, "no_scale_embedding", True) - args.layernorm_embedding = getattr(args, "layernorm_embedding", True) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) - - -@register_model_architecture("bart", "bart_base") -def bart_base_architecture(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 768) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = 
getattr(args, "decoder_attention_heads", 12) - bart_large_architecture(args) - - -@register_model_architecture("bart", "mbart_large") -def mbart_large_architecture(args): - args.no_scale_embedding = getattr(args, "no_scale_embedding", False) - bart_large_architecture(args) - - -@register_model_architecture("bart", "mbart_base") -def mbart_base_architecture(args): - args.no_scale_embedding = getattr(args, "no_scale_embedding", False) - bart_base_architecture(args) - - -@register_model_architecture("bart", "mbart_base_wmt20") -def mbart_base_wmt20_architecture(args): - args.layernorm_embedding = getattr(args, "layernorm_embedding", False) - mbart_base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/composite_encoder.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/composite_encoder.py deleted file mode 100644 index 4e20fe3a833a2d87876cbec294ad2bebfba7f591..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/composite_encoder.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .fairseq_encoder import FairseqEncoder - - -class CompositeEncoder(FairseqEncoder): - """ - A wrapper around a dictionary of :class:`FairseqEncoder` objects. - - We run forward on each encoder and return a dictionary of outputs. The first - encoder's dictionary is used for initialization. - - Args: - encoders (dict): a dictionary of :class:`FairseqEncoder` objects. - """ - - def __init__(self, encoders): - super().__init__(next(iter(encoders.values())).dictionary) - self.encoders = encoders - for key in self.encoders: - self.add_module(key, self.encoders[key]) - - def forward(self, src_tokens, src_lengths): - """ - Args: - src_tokens (LongTensor): tokens in the source language of shape - `(batch, src_len)` - src_lengths (LongTensor): lengths of each source sentence of shape - `(batch)` - - Returns: - dict: - the outputs from each Encoder - """ - encoder_out = {} - for key in self.encoders: - encoder_out[key] = self.encoders[key](src_tokens, src_lengths) - return encoder_out - - def reorder_encoder_out(self, encoder_out, new_order): - """Reorder encoder output according to new_order.""" - for key in self.encoders: - encoder_out[key] = self.encoders[key].reorder_encoder_out( - encoder_out[key], new_order - ) - return encoder_out - - def max_positions(self): - return min(self.encoders[key].max_positions() for key in self.encoders) - - def upgrade_state_dict(self, state_dict): - for key in self.encoders: - self.encoders[key].upgrade_state_dict(state_dict) - return state_dict diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/distributed_fairseq_model.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/distributed_fairseq_model.py deleted file mode 100644 index ece10c6333f486176a8851c4b39b2e6617e37e51..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/distributed_fairseq_model.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import inspect - -import torch.nn as nn -from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel - - -_GOSSIP_DISABLED = False -try: - import gossip -except ImportError: - _GOSSIP_DISABLED = True - - -def DistributedFairseqModel(args, model, process_group=None): - """ - Wrap a *model* to support distributed data parallel training. - - This is similar to the built-in DistributedDataParallel, but allows - additional configuration of the DistributedDataParallel class to - use, and also provides easier access to the wrapped model by - forwarding requests for missing attributes to the wrapped model. - - Args: - args (argparse.Namespace): fairseq args - model (BaseFairseqModel): model to wrap - """ - # determine which DDP class to extend - assert isinstance(model, nn.Module) - if args.distributed_wrapper == "DDP" and args.ddp_backend == "c10d": - ddp_class = nn.parallel.DistributedDataParallel - init_kwargs = dict( - module=model, - device_ids=[args.device_id], - output_device=args.device_id, - broadcast_buffers=args.broadcast_buffers, - bucket_cap_mb=args.bucket_cap_mb, - process_group=process_group, - ) - # Maintain backward compatibility - if "check_reduction" in inspect.getargspec(ddp_class)[0]: - init_kwargs["check_reduction"] = True - if "find_unused_parameters" in inspect.getargspec(ddp_class)[0]: - init_kwargs["find_unused_parameters"] = args.find_unused_parameters - elif args.distributed_wrapper == "DDP" and args.ddp_backend == "no_c10d": - ddp_class = LegacyDistributedDataParallel - init_kwargs = dict( - module=model, - world_size=args.distributed_world_size, - buffer_size=2 ** 28, - process_group=process_group, - ) - elif args.distributed_wrapper == "SlowMo": - if _GOSSIP_DISABLED: - raise ImportError( - "Cannot find gossip library. 
Please install from: " - "github.com/facebookresearch/stochastic_gradient_push" - ) - ddp_class = gossip.GossipDataParallel - - # The values of slowmo_momentum below were obtained by tuning on the - # En-De 16 dataset by training the transformer_wmt_en_de_large model - if args.slowmo_momentum is None: - if args.distributed_world_size <= 16: - args.slowmo_momentum = 0.0 - elif args.distributed_world_size <= 32: - args.slowmo_momentum = 0.2 - elif args.distributed_world_size <= 64: - args.slowmo_momentum = 0.5 - else: - args.slowmo_momentum = 0.6 - - init_kwargs = dict( - module=model, - device_ids=[args.device_id], - output_device=args.device_id, - broadcast_buffers=args.broadcast_buffers, - nprocs_per_node=args.nprocs_per_node, - slowmo_momentum=args.slowmo_momentum, - localsgd=(args.slowmo_algorithm == "LocalSGD"), - localsgd_frequency=args.localsgd_frequency, - ) - else: - raise ValueError("Unknown --ddp-backend: " + args.ddp_backend) - - class _DistributedFairseqModel(ddp_class): - """Extend DistributedDataParallel to check for missing - attributes in the wrapped module.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def __getattr__(self, name): - wrapped_module = super().__getattr__("module") - if hasattr(wrapped_module, name): - return getattr(wrapped_module, name) - return super().__getattr__(name) - - return _DistributedFairseqModel(**init_kwargs) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_decoder.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_decoder.py deleted file mode 100644 index fb6c52dc7ffd95c63e0b43512db398cbb8b91582..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_decoder.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Dict, List, Optional, Tuple - -import torch.nn as nn -from fairseq import utils -from torch import Tensor - - -class FairseqDecoder(nn.Module): - """Base class for decoders.""" - - def __init__(self, dictionary): - super().__init__() - self.dictionary = dictionary - self.onnx_trace = False - - def forward(self, prev_output_tokens, encoder_out=None, **kwargs): - """ - Args: - prev_output_tokens (LongTensor): shifted output tokens of shape - `(batch, tgt_len)`, for teacher forcing - encoder_out (dict, optional): output from the encoder, used for - encoder-side attention - - Returns: - tuple: - - the decoder's output of shape `(batch, tgt_len, vocab)` - - a dictionary with any model-specific outputs - """ - x, extra = self.extract_features( - prev_output_tokens, encoder_out=encoder_out, **kwargs - ) - x = self.output_layer(x) - return x, extra - - def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs): - """ - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - a dictionary with any model-specific outputs - """ - raise NotImplementedError - - def output_layer(self, features, **kwargs): - """ - Project features to the default output size, e.g., vocabulary size. - - Args: - features (Tensor): features returned by *extract_features*. 
- """ - raise NotImplementedError - - def get_normalized_probs( - self, - net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], - log_probs: bool, - sample: Optional[Dict[str, Tensor]] = None, - ): - """Get normalized probabilities (or log probs) from a net's output.""" - - if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None: - if sample is not None: - assert "target" in sample - target = sample["target"] - else: - target = None - out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) - return out.exp_() if not log_probs else out - - logits = net_output[0] - if log_probs: - return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) - else: - return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) - - def max_positions(self): - """Maximum input length supported by the decoder.""" - return 1e6 # an arbitrary large number - - def upgrade_state_dict(self, state_dict): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - return state_dict - - def prepare_for_onnx_export_(self): - self.onnx_trace = True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_encoder.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_encoder.py deleted file mode 100644 index c8873daa283163881a7dc0190e8b25353abed410..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_encoder.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Dict, List, NamedTuple, Optional - -import torch -import torch.nn as nn -from torch import Tensor - - -EncoderOut = NamedTuple( - "EncoderOut", - [ - ("encoder_out", Tensor), # T x B x C - ("encoder_padding_mask", Optional[Tensor]), # B x T - ("encoder_embedding", Optional[Tensor]), # B x T x C - ("encoder_states", Optional[List[Tensor]]), # List[T x B x C] - ("src_tokens", Optional[Tensor]), # B x T - ("src_lengths", Optional[Tensor]), # B x 1 - ], -) - - -class FairseqEncoder(nn.Module): - """Base class for encoders.""" - - def __init__(self, dictionary): - super().__init__() - self.dictionary = dictionary - - def forward(self, src_tokens, src_lengths=None, **kwargs): - """ - Args: - src_tokens (LongTensor): tokens in the source language of shape - `(batch, src_len)` - src_lengths (LongTensor): lengths of each source sentence of shape - `(batch)` - """ - raise NotImplementedError - - def forward_torchscript(self, net_input: Dict[str, Tensor]): - """A TorchScript-compatible version of forward. - - Encoders which use additional arguments may want to override - this method for TorchScript compatibility. - """ - if torch.jit.is_scripting(): - return self.forward( - src_tokens=net_input["src_tokens"], - src_lengths=net_input["src_lengths"], - ) - else: - return self.forward_non_torchscript(net_input) - - @torch.jit.unused - def forward_non_torchscript(self, net_input: Dict[str, Tensor]): - encoder_input = { - k: v for k, v in net_input.items() if k != "prev_output_tokens" - } - return self.forward(**encoder_input) - - def reorder_encoder_out(self, encoder_out, new_order): - """ - Reorder encoder output according to `new_order`. 
- - Args: - encoder_out: output from the ``forward()`` method - new_order (LongTensor): desired order - - Returns: - `encoder_out` rearranged according to `new_order` - """ - raise NotImplementedError - - def max_positions(self): - """Maximum input length supported by the encoder.""" - return 1e6 # an arbitrary large number - - def upgrade_state_dict(self, state_dict): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - return state_dict - - def set_num_updates(self, num_updates): - """State from trainer to pass along to model at every update.""" - - def _apply(m): - if hasattr(m, "set_num_updates") and m != self: - m.set_num_updates(num_updates) - - self.apply(_apply) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_incremental_decoder.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_incremental_decoder.py deleted file mode 100644 index cc72a0f8f3da238a8ce846240e5008d91ce1bc1a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_incremental_decoder.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -from typing import Dict, Optional - -from fairseq.incremental_decoding_utils import with_incremental_state -from fairseq.models import FairseqDecoder -from torch import Tensor - - -logger = logging.getLogger(__name__) - - -@with_incremental_state -class FairseqIncrementalDecoder(FairseqDecoder): - """Base class for incremental decoders. - - Incremental decoding is a special mode at inference time where the Model - only receives a single timestep of input corresponding to the previous - output token (for teacher forcing) and must produce the next output - *incrementally*. Thus the model must cache any long-term state that is - needed about the sequence, e.g., hidden states, convolutional states, etc. - - Compared to the standard :class:`FairseqDecoder` interface, the incremental - decoder interface allows :func:`forward` functions to take an extra keyword - argument (*incremental_state*) that can be used to cache state across - time-steps. - - The :class:`FairseqIncrementalDecoder` interface also defines the - :func:`reorder_incremental_state` method, which is used during beam search - to select and reorder the incremental state based on the selection of beams. - - To learn more about how incremental decoding works, refer to `this blog - `_. 
- """ - - def __init__(self, dictionary): - super().__init__(dictionary) - - def forward( - self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs - ): - """ - Args: - prev_output_tokens (LongTensor): shifted output tokens of shape - `(batch, tgt_len)`, for teacher forcing - encoder_out (dict, optional): output from the encoder, used for - encoder-side attention - incremental_state (dict, optional): dictionary used for storing - state during :ref:`Incremental decoding` - - Returns: - tuple: - - the decoder's output of shape `(batch, tgt_len, vocab)` - - a dictionary with any model-specific outputs - """ - raise NotImplementedError - - def extract_features( - self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs - ): - """ - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - a dictionary with any model-specific outputs - """ - raise NotImplementedError - - def reorder_incremental_state( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - new_order: Tensor, - ): - """Reorder incremental state. - - This will be called when the order of the input has changed from the - previous time step. A typical use case is beam search, where the input - order changes between time steps based on the selection of beams. - """ - pass - - def reorder_incremental_state_scripting( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - new_order: Tensor, - ): - """Main entry point for reordering the incremental state. - - Due to limitations in TorchScript, we call this function in - :class:`fairseq.sequence_generator.SequenceGenerator` instead of - calling :func:`reorder_incremental_state` directly. - """ - for module in self.modules(): - if hasattr(module, "reorder_incremental_state"): - result = module.reorder_incremental_state(incremental_state, new_order) - if result is not None: - incremental_state = result - - def set_beam_size(self, beam_size): - """Sets the beam size in the decoder and all children.""" - if getattr(self, "_beam_size", -1) != beam_size: - seen = set() - - def apply_set_beam_size(module): - if ( - module != self - and hasattr(module, "set_beam_size") - and module not in seen - ): - seen.add(module) - module.set_beam_size(beam_size) - - self.apply(apply_set_beam_size) - self._beam_size = beam_size diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_model.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_model.py deleted file mode 100644 index 0f66a77c1bfccebd66f791c6401cf375a83364b0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fairseq_model.py +++ /dev/null @@ -1,565 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Base classes for various fairseq models. 
-""" - -import logging -from typing import Dict, List, Optional, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.checkpoint_utils import prune_state_dict -from fairseq.data import Dictionary -from fairseq.dataclass.utils import gen_parser_from_dataclass -from fairseq.models import FairseqDecoder, FairseqEncoder -from torch import Tensor - - -logger = logging.getLogger(__name__) - - -class BaseFairseqModel(nn.Module): - """Base class for fairseq models.""" - - def __init__(self): - super().__init__() - self._is_generation_fast = False - - @classmethod - def add_args(cls, parser): - """Add model-specific arguments to the parser.""" - dc = getattr(cls, "__dataclass", None) - if dc is not None: - # do not set defaults so that settings defaults from various architectures still works - gen_parser_from_dataclass(parser, dc(), delete_default=True) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - raise NotImplementedError("Model must implement the build_model method") - - def get_targets(self, sample, net_output): - """Get targets from either the sample or the net's output.""" - return sample["target"] - - def get_normalized_probs( - self, - net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], - log_probs: bool, - sample: Optional[Dict[str, Tensor]] = None, - ): - """Get normalized probabilities (or log probs) from a net's output.""" - return self.get_normalized_probs_scriptable(net_output, log_probs, sample) - - # TorchScript doesn't support super() method so that the scriptable Subclass - # can't access the base class model in Torchscript. - # Current workaround is to add a helper function with different name and - # call the helper function from scriptable Subclass. - def get_normalized_probs_scriptable( - self, - net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], - log_probs: bool, - sample: Optional[Dict[str, Tensor]] = None, - ): - """Scriptable helper function for get_normalized_probs in ~BaseFairseqModel""" - if hasattr(self, "decoder"): - return self.decoder.get_normalized_probs(net_output, log_probs, sample) - elif torch.is_tensor(net_output): - # syntactic sugar for simple models which don't have a decoder - # (e.g., the classification tutorial) - logits = net_output.float() - if log_probs: - return F.log_softmax(logits, dim=-1) - else: - return F.softmax(logits, dim=-1) - raise NotImplementedError - - def extract_features(self, *args, **kwargs): - """Similar to *forward* but only return features.""" - return self(*args, **kwargs) - - def max_positions(self): - """Maximum length supported by the model.""" - return None - - def remove_prefix(self, state_dict:dict): - new_state_dict = {} - for k,v in state_dict.items(): - if k.startswith('encoder') or k.startswith('decoder'): - new_state_dict[k] = v - else: - new_k = '.'.join(k.split('.')[1:]) - new_state_dict[new_k] = v - return new_state_dict - - def load_state_dict(self, state_dict, strict=True, args=None): - """Copies parameters and buffers from *state_dict* into this module and - its descendants. - - Overrides the method in :class:`nn.Module`. Compared with that method - this additionally "upgrades" *state_dicts* from old checkpoints. 
- """ - state_dict = self.remove_prefix(state_dict) - self.upgrade_state_dict(state_dict) - new_state_dict = prune_state_dict(state_dict, args) - return super().load_state_dict(new_state_dict, strict) - - def upgrade_state_dict(self, state_dict): - """Upgrade old state dicts to work with newer code.""" - self.upgrade_state_dict_named(state_dict, "") - - def upgrade_state_dict_named(self, state_dict, name): - """Upgrade old state dicts to work with newer code. - - Args: - state_dict (dict): state dictionary to upgrade, in place - name (str): the state dict key corresponding to the current module - """ - assert state_dict is not None - - def do_upgrade(m, prefix): - if len(prefix) > 0: - prefix += "." - - for n, c in m.named_children(): - name = prefix + n - if hasattr(c, "upgrade_state_dict_named"): - c.upgrade_state_dict_named(state_dict, name) - elif hasattr(c, "upgrade_state_dict"): - c.upgrade_state_dict(state_dict) - do_upgrade(c, name) - - do_upgrade(self, name) - - def set_num_updates(self, num_updates): - """State from trainer to pass along to model at every update.""" - - def _apply(m): - if hasattr(m, "set_num_updates") and m != self: - m.set_num_updates(num_updates) - - self.apply(_apply) - - def prepare_for_inference_(self, args): - """Prepare model for inference.""" - kwargs = {} - kwargs["beamable_mm_beam_size"] = ( - None if getattr(args, "no_beamable_mm", False) else getattr(args, "beam", 5) - ) - kwargs["need_attn"] = getattr(args, "print_alignment", False) - if hasattr(args, "retain_dropout"): - kwargs["retain_dropout"] = args.retain_dropout - kwargs["retain_dropout_modules"] = getattr( - args, "retain_dropout_modules", None - ) - self.make_generation_fast_(**kwargs) - - def make_generation_fast_(self, **kwargs): - """ - Legacy entry point to optimize model for faster generation. - Prefer prepare_for_inference_. - """ - if self._is_generation_fast: - return # only apply once - self._is_generation_fast = True - - # remove weight norm from all modules in the network - def apply_remove_weight_norm(module): - try: - nn.utils.remove_weight_norm(module) - except (AttributeError, ValueError): # this module didn't have weight norm - return - - self.apply(apply_remove_weight_norm) - - def apply_make_generation_fast_(module, prefix): - if len(prefix) > 0: - prefix += "." 
- - base_func = BaseFairseqModel.make_generation_fast_ - for n, m in module.named_modules(): - if ( - m != self - and hasattr(m, "make_generation_fast_") - # don't call this implementation again, e.g., if - # children modules also inherit from BaseFairseqModel - and m.make_generation_fast_.__func__ is not base_func - ): - name = prefix + n - m.make_generation_fast_(name=name, **kwargs) - - apply_make_generation_fast_(self, "") - - def train(mode=True): - if mode: - raise RuntimeError("cannot train after make_generation_fast") - - # this model should no longer be used for training - self.eval() - self.train = train - - def prepare_for_onnx_export_(self, **kwargs): - """Make model exportable via ONNX trace.""" - seen = set() - - def apply_prepare_for_onnx_export_(module): - if ( - module != self - and hasattr(module, "prepare_for_onnx_export_") - and module not in seen - ): - seen.add(module) - module.prepare_for_onnx_export_(**kwargs) - - self.apply(apply_prepare_for_onnx_export_) - - def prepare_for_tpu_(self, **kwargs): - """Optionally modify model for use on TPUs.""" - seen = set() - - def apply_prepare_for_tpu_(module): - if ( - module != self - and hasattr(module, "prepare_for_tpu_") - and module not in seen - ): - seen.add(module) - module.prepare_for_tpu_(**kwargs) - - self.apply(apply_prepare_for_tpu_) - - @classmethod - def upgrade_args(cls, args): - if hasattr(args, "max_sentences") and not hasattr(args, "batch_size"): - args.batch_size = args.max_sentences - - @classmethod - def from_pretrained( - cls, - model_name_or_path, - checkpoint_file="model.pt", - data_name_or_path=".", - **kwargs, - ): - """ - Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model - file. Downloads and caches the pre-trained model file if needed. - - The base implementation returns a - :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to - generate translations or sample from language models. The underlying - :class:`~fairseq.models.FairseqModel` can be accessed via the - *generator.models* attribute. - - Other models may override this to implement custom hub interfaces. - - Args: - model_name_or_path (str): either the name of a pre-trained model to - load or a path/URL to a pre-trained model state dict - checkpoint_file (str, optional): colon-separated list of checkpoint - files in the model archive to ensemble (default: 'model.pt') - data_name_or_path (str, optional): point args.data to the archive - at the given path/URL. Can start with '.' or './' to reuse the - model archive path. - """ - from fairseq import hub_utils - - x = hub_utils.from_pretrained( - model_name_or_path, - checkpoint_file, - data_name_or_path, - archive_map=cls.hub_models(), - **kwargs, - ) - - cls.upgrade_args(x["args"]) - - logger.info(x["args"]) - return hub_utils.GeneratorHubInterface(x["args"], x["task"], x["models"]) - - @classmethod - def hub_models(cls): - return {} - - -class FairseqEncoderDecoderModel(BaseFairseqModel): - """Base class for encoder-decoder models. - - Args: - encoder (FairseqEncoder): the encoder - decoder (FairseqDecoder): the decoder - """ - - def __init__(self, encoder, decoder): - super().__init__() - - self.encoder = encoder - self.decoder = decoder - assert isinstance(self.encoder, FairseqEncoder) - assert isinstance(self.decoder, FairseqDecoder) - - def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): - """ - Run the forward pass for an encoder-decoder model. - - First feed a batch of source tokens through the encoder. 
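# A sketch of the hub entry point described above, using one of the FConv archives
# listed later in this diff; downloading, caching and the translate() helper on the
# returned GeneratorHubInterface are assumed to behave as in upstream fairseq.
from fairseq.models.fconv import FConvModel

en2fr = FConvModel.from_pretrained("conv.wmt14.en-fr", checkpoint_file="model.pt")
print(en2fr.translate("Hello world!"))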
Then, feed the - encoder output and previous decoder outputs (i.e., teacher forcing) to - the decoder to produce the next outputs:: - - encoder_out = self.encoder(src_tokens, src_lengths) - return self.decoder(prev_output_tokens, encoder_out) - - Args: - src_tokens (LongTensor): tokens in the source language of shape - `(batch, src_len)` - src_lengths (LongTensor): source sentence lengths of shape `(batch)` - prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing - - Returns: - tuple: - - the decoder's output of shape `(batch, tgt_len, vocab)` - - a dictionary with any model-specific outputs - """ - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - decoder_out = self.decoder( - prev_output_tokens, encoder_out=encoder_out, **kwargs - ) - return decoder_out - - def forward_decoder(self, prev_output_tokens, **kwargs): - return self.decoder(prev_output_tokens, **kwargs) - - def extract_features(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): - """ - Similar to *forward* but only return features. - - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - a dictionary with any model-specific outputs - """ - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - features = self.decoder.extract_features( - prev_output_tokens, encoder_out=encoder_out, **kwargs - ) - return features - - def output_layer(self, features, **kwargs): - """Project features to the default output size (typically vocabulary size).""" - return self.decoder.output_layer(features, **kwargs) - - def max_positions(self): - """Maximum length supported by the model.""" - return (self.encoder.max_positions(), self.decoder.max_positions()) - - def max_decoder_positions(self): - """Maximum length supported by the decoder.""" - return self.decoder.max_positions() - - -class FairseqModel(FairseqEncoderDecoderModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - utils.deprecation_warning( - "FairseqModel is deprecated, please use FairseqEncoderDecoderModel " - "or BaseFairseqModel instead", - stacklevel=4, - ) - - -class FairseqMultiModel(BaseFairseqModel): - """Base class for combining multiple encoder-decoder models.""" - - def __init__(self, encoders, decoders): - super().__init__() - assert encoders.keys() == decoders.keys() - self.keys = list(encoders.keys()) - for key in self.keys: - assert isinstance(encoders[key], FairseqEncoder) - assert isinstance(decoders[key], FairseqDecoder) - - self.models = nn.ModuleDict( - { - key: FairseqEncoderDecoderModel(encoders[key], decoders[key]) - for key in self.keys - } - ) - - @staticmethod - def build_shared_embeddings( - dicts: Dict[str, Dictionary], - langs: List[str], - embed_dim: int, - build_embedding: callable, - pretrained_embed_path: Optional[str] = None, - ): - """ - Helper function to build shared embeddings for a set of languages after - checking that all dicts corresponding to those languages are equivalent. 
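# A toy illustration of the pairing FairseqMultiModel.__init__ performs above: the
# encoder and decoder dicts must share their keys (here made-up language pairs), and
# each pair ends up as one sub-model in an nn.ModuleDict. nn.Identity stands in for
# real encoders/decoders; the real class wraps each pair in FairseqEncoderDecoderModel.
import torch.nn as nn

encoders = {"en-de": nn.Identity(), "en-fr": nn.Identity()}
decoders = {"en-de": nn.Identity(), "en-fr": nn.Identity()}
assert encoders.keys() == decoders.keys()
models = nn.ModuleDict({
    key: nn.ModuleDict({"encoder": encoders[key], "decoder": decoders[key]})
    for key in encoders
})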
- - Args: - dicts: Dict of lang_id to its corresponding Dictionary - langs: languages that we want to share embeddings for - embed_dim: embedding dimension - build_embedding: callable function to actually build the embedding - pretrained_embed_path: Optional path to load pretrained embeddings - """ - shared_dict = dicts[langs[0]] - if any(dicts[lang] != shared_dict for lang in langs): - raise ValueError( - "--share-*-embeddings requires a joined dictionary: " - "--share-encoder-embeddings requires a joined source " - "dictionary, --share-decoder-embeddings requires a joined " - "target dictionary, and --share-all-embeddings requires a " - "joint source + target dictionary." - ) - return build_embedding(shared_dict, embed_dim, pretrained_embed_path) - - def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): - raise NotImplementedError - - def max_positions(self): - """Maximum length supported by the model.""" - return { - key: ( - self.models[key].encoder.max_positions(), - self.models[key].decoder.max_positions(), - ) - for key in self.keys - } - - def max_decoder_positions(self): - """Maximum length supported by the decoder.""" - return min(model.decoder.max_positions() for model in self.models.values()) - - @property - def encoder(self): - return self.models[self.keys[0]].encoder - - @property - def decoder(self): - return self.models[self.keys[0]].decoder - - def forward_decoder(self, prev_output_tokens, **kwargs): - return self.decoder(prev_output_tokens, **kwargs) - - def load_state_dict(self, state_dict, strict=True, args=None): - """Copies parameters and buffers from *state_dict* into this module and - its descendants. - - Overrides the method in :class:`nn.Module`. Compared with that method - this additionally "upgrades" *state_dicts* from old checkpoints. - """ - self.upgrade_state_dict(state_dict) - new_state_dict = prune_state_dict(state_dict, args) - return super().load_state_dict(new_state_dict, strict) - - -class FairseqLanguageModel(BaseFairseqModel): - """Base class for decoder-only models. - - Args: - decoder (FairseqDecoder): the decoder - """ - - def __init__(self, decoder): - super().__init__() - self.decoder = decoder - assert isinstance(self.decoder, FairseqDecoder) - - def forward(self, src_tokens, **kwargs): - """ - Run the forward pass for a decoder-only model. - - Feeds a batch of tokens through the decoder to predict the next tokens. - - Args: - src_tokens (LongTensor): tokens on which to condition the decoder, - of shape `(batch, tgt_len)` - src_lengths (LongTensor): source sentence lengths of shape `(batch)` - - Returns: - tuple: - - the decoder's output of shape `(batch, seq_len, vocab)` - - a dictionary with any model-specific outputs - """ - return self.decoder(src_tokens, **kwargs) - - def forward_decoder(self, prev_output_tokens, **kwargs): - return self.decoder(prev_output_tokens, **kwargs) - - def extract_features(self, src_tokens, **kwargs): - """ - Similar to *forward* but only return features. 
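# A compact sketch of the requirement enforced above: embeddings can only be shared
# across languages that use one joined dictionary, so a single table is built once
# and reused. A plain nn.Embedding stands in for the build_embedding callable.
import torch.nn as nn

def build_shared_embedding_sketch(dicts, langs, embed_dim):
    shared_dict = dicts[langs[0]]
    if any(dicts[lang] != shared_dict for lang in langs):
        raise ValueError("sharing embeddings requires a joined dictionary")
    return nn.Embedding(len(shared_dict), embed_dim)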
- - Returns: - tuple: - - the decoder's features of shape `(batch, seq_len, embed_dim)` - - a dictionary with any model-specific outputs - """ - return self.decoder.extract_features(src_tokens, **kwargs) - - def output_layer(self, features, **kwargs): - """Project features to the default output size (typically vocabulary size).""" - return self.decoder.output_layer(features, **kwargs) - - def max_positions(self): - """Maximum length supported by the model.""" - return self.decoder.max_positions() - - def max_decoder_positions(self): - """Maximum length supported by the decoder.""" - return self.decoder.max_positions() - - @property - def supported_targets(self): - return {"future"} - - -class FairseqEncoderModel(BaseFairseqModel): - """Base class for encoder-only models. - - Args: - encoder (FairseqEncoder): the encoder - """ - - def __init__(self, encoder): - super().__init__() - self.encoder = encoder - assert isinstance(self.encoder, FairseqEncoder) - - def forward(self, src_tokens, src_lengths, **kwargs): - """ - Run the forward pass for a encoder-only model. - - Feeds a batch of tokens through the encoder to generate features. - - Args: - src_tokens (LongTensor): input tokens of shape `(batch, src_len)` - src_lengths (LongTensor): source sentence lengths of shape `(batch)` - - Returns: - the encoder's output, typically of shape `(batch, src_len, features)` - """ - return self.encoder(src_tokens, src_lengths, **kwargs) - - def get_normalized_probs(self, net_output, log_probs, sample=None): - """Get normalized probabilities (or log probs) from a net's output.""" - encoder_out = net_output["encoder_out"] - if torch.is_tensor(encoder_out): - logits = encoder_out.float() - if log_probs: - return F.log_softmax(logits, dim=-1) - else: - return F.softmax(logits, dim=-1) - raise NotImplementedError - - def max_positions(self): - """Maximum length supported by the model.""" - return self.encoder.max_positions() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv.py deleted file mode 100644 index c99a2151014d816ec9aff6f4b27d71224dd7b4cf..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv.py +++ /dev/null @@ -1,756 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqIncrementalDecoder, - register_model, - register_model_architecture, -) -from fairseq.modules import ( - AdaptiveSoftmax, - BeamableMM, - FairseqDropout, - GradMultiply, - LearnedPositionalEmbedding, - LinearizedConvolution, -) - - -@register_model("fconv") -class FConvModel(FairseqEncoderDecoderModel): - """ - A fully convolutional model, i.e. a convolutional encoder and a - convolutional decoder, as described in `"Convolutional Sequence to Sequence - Learning" (Gehring et al., 2017) `_. - - Args: - encoder (FConvEncoder): the encoder - decoder (FConvDecoder): the decoder - - The Convolutional model provides the following named architectures and - command-line arguments: - - .. 
argparse:: - :ref: fairseq.models.fconv_parser - :prog: - """ - - @classmethod - def hub_models(cls): - def moses_subword(path): - return { - "path": path, - "tokenizer": "moses", - "bpe": "subword_nmt", - } - - return { - "conv.wmt14.en-fr": moses_subword( - "https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2" - ), - "conv.wmt14.en-de": moses_subword( - "https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-de.fconv-py.tar.bz2" - ), - "conv.wmt17.en-de": moses_subword( - "https://dl.fbaipublicfiles.com/fairseq/models/wmt17.v2.en-de.fconv-py.tar.bz2" - ), - } - - def __init__(self, encoder, decoder): - super().__init__(encoder, decoder) - self.encoder.num_attention_layers = sum( - layer is not None for layer in decoder.attention - ) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--dropout', type=float, metavar='D', - help='dropout probability') - parser.add_argument('--encoder-embed-dim', type=int, metavar='N', - help='encoder embedding dimension') - parser.add_argument('--encoder-embed-path', type=str, metavar='STR', - help='path to pre-trained encoder embedding') - parser.add_argument('--encoder-layers', type=str, metavar='EXPR', - help='encoder layers [(dim, kernel_size), ...]') - parser.add_argument('--decoder-embed-dim', type=int, metavar='N', - help='decoder embedding dimension') - parser.add_argument('--decoder-embed-path', type=str, metavar='STR', - help='path to pre-trained decoder embedding') - parser.add_argument('--decoder-layers', type=str, metavar='EXPR', - help='decoder layers [(dim, kernel_size), ...]') - parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', - help='decoder output embedding dimension') - parser.add_argument('--decoder-attention', type=str, metavar='EXPR', - help='decoder attention [True, ...]') - parser.add_argument('--share-input-output-embed', action='store_true', - help='share input and output embeddings (requires' - ' --decoder-out-embed-dim and --decoder-embed-dim' - ' to be equal)') - # fmt: on - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - # make sure that all args are properly defaulted (in case there are any new ones) - base_architecture(args) - - encoder_embed_dict = None - if args.encoder_embed_path: - encoder_embed_dict = utils.parse_embedding(args.encoder_embed_path) - utils.print_embed_overlap(encoder_embed_dict, task.source_dictionary) - - decoder_embed_dict = None - if args.decoder_embed_path: - decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path) - utils.print_embed_overlap(decoder_embed_dict, task.target_dictionary) - - encoder = FConvEncoder( - dictionary=task.source_dictionary, - embed_dim=args.encoder_embed_dim, - embed_dict=encoder_embed_dict, - convolutions=eval(args.encoder_layers), - dropout=args.dropout, - max_positions=args.max_source_positions, - ) - decoder = FConvDecoder( - dictionary=task.target_dictionary, - embed_dim=args.decoder_embed_dim, - embed_dict=decoder_embed_dict, - convolutions=eval(args.decoder_layers), - out_embed_dim=args.decoder_out_embed_dim, - attention=eval(args.decoder_attention), - dropout=args.dropout, - max_positions=args.max_target_positions, - share_embed=args.share_input_output_embed, - ) - return FConvModel(encoder, decoder) - - -class FConvEncoder(FairseqEncoder): - """ - Convolutional encoder consisting of `len(convolutions)` layers. 
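# The layer specs handled in build_model above are plain Python expressions stored
# as strings and eval'd; e.g. the default encoder spec expands into twenty identical
# (out_channels, kernel_size) tuples:
spec = eval("[(512, 3)] * 20")
assert len(spec) == 20 and spec[0] == (512, 3)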
- - Args: - dictionary (~fairseq.data.Dictionary): encoding dictionary - embed_dim (int, optional): embedding dimension - embed_dict (str, optional): filename from which to load pre-trained - embeddings - max_positions (int, optional): maximum supported input sequence length - convolutions (list, optional): the convolutional layer structure. Each - list item `i` corresponds to convolutional layer `i`. Layers are - given as ``(out_channels, kernel_width, [residual])``. Residual - connections are added between layers when ``residual=1`` (which is - the default behavior). - dropout (float, optional): dropout to be applied before each conv layer - """ - - def __init__( - self, - dictionary, - embed_dim=512, - embed_dict=None, - max_positions=1024, - convolutions=((512, 3),) * 20, - dropout=0.1, - ): - super().__init__(dictionary) - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.num_attention_layers = None - - num_embeddings = len(dictionary) - self.padding_idx = dictionary.pad() - self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx) - if embed_dict: - self.embed_tokens = utils.load_embedding( - embed_dict, self.dictionary, self.embed_tokens - ) - - self.embed_positions = PositionalEmbedding( - max_positions, - embed_dim, - self.padding_idx, - ) - - convolutions = extend_conv_spec(convolutions) - in_channels = convolutions[0][0] - self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) - self.projections = nn.ModuleList() - self.convolutions = nn.ModuleList() - self.residuals = [] - - layer_in_channels = [in_channels] - for _, (out_channels, kernel_size, residual) in enumerate(convolutions): - if residual == 0: - residual_dim = out_channels - else: - residual_dim = layer_in_channels[-residual] - self.projections.append( - Linear(residual_dim, out_channels) - if residual_dim != out_channels - else None - ) - if kernel_size % 2 == 1: - padding = kernel_size // 2 - else: - padding = 0 - self.convolutions.append( - ConvTBC( - in_channels, - out_channels * 2, - kernel_size, - dropout=dropout, - padding=padding, - ) - ) - self.residuals.append(residual) - in_channels = out_channels - layer_in_channels.append(out_channels) - self.fc2 = Linear(in_channels, embed_dim) - - def forward(self, src_tokens, src_lengths): - """ - Args: - src_tokens (LongTensor): tokens in the source language of shape - `(batch, src_len)` - src_lengths (LongTensor): lengths of each source sentence of shape - `(batch)` - - Returns: - dict: - - **encoder_out** (tuple): a tuple with two elements, where the - first element is the last encoder layer's output and the - second element is the same quantity summed with the input - embedding (used for attention). The shape of both tensors is - `(batch, src_len, embed_dim)`. 
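# A small illustration of the residual bookkeeping in __init__ above: `residual`
# says how many layers back the skip connection reaches, and a Linear projection is
# inserted only when that earlier layer's width differs from the current out_channels.
# The spec below is made up for the example.
layer_in_channels = [512]                                  # width after fc1
convolutions = [(512, 3, 1), (768, 3, 1), (768, 3, 2)]
needs_projection = []
for out_channels, kernel_size, residual in convolutions:
    residual_dim = out_channels if residual == 0 else layer_in_channels[-residual]
    needs_projection.append(residual_dim != out_channels)
    layer_in_channels.append(out_channels)
# needs_projection == [False, True, True]: only skips coming from 512-wide activations
# need a projection up to 768 channels.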
- - **encoder_padding_mask** (ByteTensor): the positions of - padding elements of shape `(batch, src_len)` - """ - # embed tokens and positions - x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens) - x = self.dropout_module(x) - input_embedding = x - - # project to size of convolution - x = self.fc1(x) - - # used to mask padding in input - encoder_padding_mask = src_tokens.eq(self.padding_idx).t() # -> T x B - if not encoder_padding_mask.any(): - encoder_padding_mask = None - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - residuals = [x] - # temporal convolutions - for proj, conv, res_layer in zip( - self.projections, self.convolutions, self.residuals - ): - if res_layer > 0: - residual = residuals[-res_layer] - residual = residual if proj is None else proj(residual) - else: - residual = None - - if encoder_padding_mask is not None: - x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) - - x = self.dropout_module(x) - if conv.kernel_size[0] % 2 == 1: - # padding is implicit in the conv - x = conv(x) - else: - padding_l = (conv.kernel_size[0] - 1) // 2 - padding_r = conv.kernel_size[0] // 2 - x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r)) - x = conv(x) - x = F.glu(x, dim=2) - - if residual is not None: - x = (x + residual) * math.sqrt(0.5) - residuals.append(x) - - # T x B x C -> B x T x C - x = x.transpose(1, 0) - - # project back to size of embedding - x = self.fc2(x) - - if encoder_padding_mask is not None: - encoder_padding_mask = encoder_padding_mask.t() # -> B x T - x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) - - # scale gradients (this only affects backward, not forward) - x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers)) - - # add output to input embedding for attention - y = (x + input_embedding) * math.sqrt(0.5) - - return { - "encoder_out": (x, y), - "encoder_padding_mask": encoder_padding_mask, # B x T - } - - def reorder_encoder_out(self, encoder_out, new_order): - if encoder_out["encoder_out"] is not None: - encoder_out["encoder_out"] = ( - encoder_out["encoder_out"][0].index_select(0, new_order), - encoder_out["encoder_out"][1].index_select(0, new_order), - ) - if encoder_out["encoder_padding_mask"] is not None: - encoder_out["encoder_padding_mask"] = encoder_out[ - "encoder_padding_mask" - ].index_select(0, new_order) - return encoder_out - - def max_positions(self): - """Maximum input length supported by the encoder.""" - return self.embed_positions.max_positions - - -class AttentionLayer(nn.Module): - def __init__(self, conv_channels, embed_dim, bmm=None): - super().__init__() - # projects from output of convolution to embedding dimension - self.in_projection = Linear(conv_channels, embed_dim) - # projects from embedding dimension to convolution size - self.out_projection = Linear(embed_dim, conv_channels) - - self.bmm = bmm if bmm is not None else torch.bmm - - def forward(self, x, target_embedding, encoder_out, encoder_padding_mask): - residual = x - - # attention - x = (self.in_projection(x) + target_embedding) * math.sqrt(0.5) - x = self.bmm(x, encoder_out[0]) - - # don't attend over padding - if encoder_padding_mask is not None: - x = ( - x.float() - .masked_fill(encoder_padding_mask.unsqueeze(1), float("-inf")) - .type_as(x) - ) # FP16 support: cast to float and back - - # softmax over last dim - sz = x.size() - x = F.softmax(x.view(sz[0] * sz[1], sz[2]), dim=1) - x = x.view(sz) - attn_scores = x - - x = self.bmm(x, encoder_out[1]) - - # scale attention output (respecting potentially different lengths) - s 
= encoder_out[1].size(1) - if encoder_padding_mask is None: - x = x * (s * math.sqrt(1.0 / s)) - else: - s = s - encoder_padding_mask.type_as(x).sum( - dim=1, keepdim=True - ) # exclude padding - s = s.unsqueeze(-1) - x = x * (s * s.rsqrt()) - - # project back - x = (self.out_projection(x) + residual) * math.sqrt(0.5) - return x, attn_scores - - def make_generation_fast_(self, beamable_mm_beam_size=None, **kwargs): - """Replace torch.bmm with BeamableMM.""" - if beamable_mm_beam_size is not None: - del self.bmm - self.add_module("bmm", BeamableMM(beamable_mm_beam_size)) - - -class FConvDecoder(FairseqIncrementalDecoder): - """Convolutional decoder""" - - def __init__( - self, - dictionary, - embed_dim=512, - embed_dict=None, - out_embed_dim=256, - max_positions=1024, - convolutions=((512, 3),) * 20, - attention=True, - dropout=0.1, - share_embed=False, - positional_embeddings=True, - adaptive_softmax_cutoff=None, - adaptive_softmax_dropout=0.0, - ): - super().__init__(dictionary) - self.register_buffer("version", torch.Tensor([2])) - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.need_attn = True - - convolutions = extend_conv_spec(convolutions) - in_channels = convolutions[0][0] - if isinstance(attention, bool): - # expand True into [True, True, ...] and do the same with False - attention = [attention] * len(convolutions) - if not isinstance(attention, list) or len(attention) != len(convolutions): - raise ValueError( - "Attention is expected to be a list of booleans of " - "length equal to the number of layers." - ) - - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) - if embed_dict: - self.embed_tokens = utils.load_embedding( - embed_dict, self.dictionary, self.embed_tokens - ) - - self.embed_positions = ( - PositionalEmbedding( - max_positions, - embed_dim, - padding_idx, - ) - if positional_embeddings - else None - ) - - self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) - self.projections = nn.ModuleList() - self.convolutions = nn.ModuleList() - self.attention = nn.ModuleList() - self.residuals = [] - - layer_in_channels = [in_channels] - for i, (out_channels, kernel_size, residual) in enumerate(convolutions): - if residual == 0: - residual_dim = out_channels - else: - residual_dim = layer_in_channels[-residual] - self.projections.append( - Linear(residual_dim, out_channels) - if residual_dim != out_channels - else None - ) - self.convolutions.append( - LinearizedConv1d( - in_channels, - out_channels * 2, - kernel_size, - padding=(kernel_size - 1), - dropout=dropout, - ) - ) - self.attention.append( - AttentionLayer(out_channels, embed_dim) if attention[i] else None - ) - self.residuals.append(residual) - in_channels = out_channels - layer_in_channels.append(out_channels) - - self.adaptive_softmax = None - self.fc2 = self.fc3 = None - - if adaptive_softmax_cutoff is not None: - assert not share_embed - self.adaptive_softmax = AdaptiveSoftmax( - num_embeddings, - in_channels, - adaptive_softmax_cutoff, - dropout=adaptive_softmax_dropout, - ) - else: - self.fc2 = Linear(in_channels, out_embed_dim) - if share_embed: - assert out_embed_dim == embed_dim, ( - "Shared embed weights implies same dimensions " - " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim) - ) - self.fc3 = nn.Linear(out_embed_dim, num_embeddings) - self.fc3.weight = self.embed_tokens.weight - else: - self.fc3 = Linear(out_embed_dim, num_embeddings, 
dropout=dropout) - - def forward( - self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused - ): - if encoder_out is not None: - encoder_padding_mask = encoder_out["encoder_padding_mask"] - encoder_out = encoder_out["encoder_out"] - - # split and transpose encoder outputs - encoder_a, encoder_b = self._split_encoder_out( - encoder_out, incremental_state - ) - - if self.embed_positions is not None: - pos_embed = self.embed_positions(prev_output_tokens, incremental_state) - else: - pos_embed = 0 - - if incremental_state is not None: - prev_output_tokens = prev_output_tokens[:, -1:] - x = self._embed_tokens(prev_output_tokens, incremental_state) - - # embed tokens and combine with positional embeddings - x += pos_embed - x = self.dropout_module(x) - target_embedding = x - - # project to size of convolution - x = self.fc1(x) - - # B x T x C -> T x B x C - x = self._transpose_if_training(x, incremental_state) - - # temporal convolutions - avg_attn_scores = None - num_attn_layers = len(self.attention) - residuals = [x] - for proj, conv, attention, res_layer in zip( - self.projections, self.convolutions, self.attention, self.residuals - ): - if res_layer > 0: - residual = residuals[-res_layer] - residual = residual if proj is None else proj(residual) - else: - residual = None - - x = self.dropout_module(x) - x = conv(x, incremental_state) - x = F.glu(x, dim=2) - - # attention - if attention is not None: - x = self._transpose_if_training(x, incremental_state) - - x, attn_scores = attention( - x, target_embedding, (encoder_a, encoder_b), encoder_padding_mask - ) - - if not self.training and self.need_attn: - attn_scores = attn_scores / num_attn_layers - if avg_attn_scores is None: - avg_attn_scores = attn_scores - else: - avg_attn_scores.add_(attn_scores) - - x = self._transpose_if_training(x, incremental_state) - - # residual - if residual is not None: - x = (x + residual) * math.sqrt(0.5) - residuals.append(x) - - # T x B x C -> B x T x C - x = self._transpose_if_training(x, incremental_state) - - # project back to size of vocabulary if not using adaptive softmax - if self.fc2 is not None and self.fc3 is not None: - x = self.fc2(x) - x = self.dropout_module(x) - x = self.fc3(x) - - return x, avg_attn_scores - - def reorder_incremental_state(self, incremental_state, new_order): - super().reorder_incremental_state(incremental_state, new_order) - encoder_out = utils.get_incremental_state( - self, incremental_state, "encoder_out" - ) - if encoder_out is not None: - encoder_out = tuple(eo.index_select(0, new_order) for eo in encoder_out) - utils.set_incremental_state( - self, incremental_state, "encoder_out", encoder_out - ) - - def max_positions(self): - """Maximum output length supported by the decoder.""" - return ( - self.embed_positions.max_positions - if self.embed_positions is not None - else float("inf") - ) - - def upgrade_state_dict(self, state_dict): - if utils.item(state_dict.get("decoder.version", torch.Tensor([1]))[0]) < 2: - # old models use incorrect weight norm dimension - for i, conv in enumerate(self.convolutions): - # reconfigure weight norm - nn.utils.remove_weight_norm(conv) - self.convolutions[i] = nn.utils.weight_norm(conv, dim=0) - state_dict["decoder.version"] = torch.Tensor([1]) - return state_dict - - def make_generation_fast_(self, need_attn=False, **kwargs): - self.need_attn = need_attn - - def _embed_tokens(self, tokens, incremental_state): - if incremental_state is not None: - # keep only the last token for incremental forward pass - tokens = 
tokens[:, -1:] - return self.embed_tokens(tokens) - - def _split_encoder_out(self, encoder_out, incremental_state): - """Split and transpose encoder outputs. - - This is cached when doing incremental inference. - """ - cached_result = utils.get_incremental_state( - self, incremental_state, "encoder_out" - ) - if cached_result is not None: - return cached_result - - # transpose only once to speed up attention layers - encoder_a, encoder_b = encoder_out - encoder_a = encoder_a.transpose(1, 2).contiguous() - result = (encoder_a, encoder_b) - - if incremental_state is not None: - utils.set_incremental_state(self, incremental_state, "encoder_out", result) - return result - - def _transpose_if_training(self, x, incremental_state): - if incremental_state is None: - x = x.transpose(0, 1) - return x - - -def extend_conv_spec(convolutions): - """ - Extends convolutional spec that is a list of tuples of 2 or 3 parameters - (kernel size, dim size and optionally how many layers behind to look for residual) - to default the residual propagation param if it is not specified - """ - extended = [] - for spec in convolutions: - if len(spec) == 3: - extended.append(spec) - elif len(spec) == 2: - extended.append(spec + (1,)) - else: - raise Exception( - "invalid number of parameters in convolution spec " - + str(spec) - + ". expected 2 or 3" - ) - return tuple(extended) - - -def Embedding(num_embeddings, embedding_dim, padding_idx): - m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, 0, 0.1) - nn.init.constant_(m.weight[padding_idx], 0) - return m - - -def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx): - m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) - nn.init.normal_(m.weight, 0, 0.1) - nn.init.constant_(m.weight[padding_idx], 0) - return m - - -def Linear(in_features, out_features, dropout=0.0): - """Weight-normalized Linear layer (input: N x T x C)""" - m = nn.Linear(in_features, out_features) - nn.init.normal_(m.weight, mean=0, std=math.sqrt((1 - dropout) / in_features)) - nn.init.constant_(m.bias, 0) - return nn.utils.weight_norm(m) - - -def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs): - """Weight-normalized Conv1d layer optimized for decoding""" - m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs) - std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) - nn.init.normal_(m.weight, mean=0, std=std) - nn.init.constant_(m.bias, 0) - return nn.utils.weight_norm(m, dim=2) - - -def ConvTBC(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs): - """Weight-normalized Conv1d layer""" - from fairseq.modules import ConvTBC - - m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs) - std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) - nn.init.normal_(m.weight, mean=0, std=std) - nn.init.constant_(m.bias, 0) - return nn.utils.weight_norm(m, dim=2) - - -@register_model_architecture("fconv", "fconv") -def base_architecture(args): - args.dropout = getattr(args, "dropout", 0.1) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_layers = getattr(args, "encoder_layers", "[(512, 3)] * 20") - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_layers = getattr(args, "decoder_layers", "[(512, 3)] * 20") - 
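# extend_conv_spec above just defaults the third (residual) field to 1, so mixed
# 2-/3-tuple specs such as the fconv_lm ones later in this diff become uniform:
spec = ((512, 3), (128, 1, 0), (512, 1, 3))
extended = tuple(s if len(s) == 3 else s + (1,) for s in spec)
# extended == ((512, 3, 1), (128, 1, 0), (512, 1, 3))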
args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256) - args.decoder_attention = getattr(args, "decoder_attention", "True") - args.share_input_output_embed = getattr(args, "share_input_output_embed", False) - - -@register_model_architecture("fconv", "fconv_iwslt_de_en") -def fconv_iwslt_de_en(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) - args.encoder_layers = getattr(args, "encoder_layers", "[(256, 3)] * 4") - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) - args.decoder_layers = getattr(args, "decoder_layers", "[(256, 3)] * 3") - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256) - base_architecture(args) - - -@register_model_architecture("fconv", "fconv_wmt_en_ro") -def fconv_wmt_en_ro(args): - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 512) - base_architecture(args) - - -@register_model_architecture("fconv", "fconv_wmt_en_de") -def fconv_wmt_en_de(args): - convs = "[(512, 3)] * 9" # first 9 layers have 512 units - convs += " + [(1024, 3)] * 4" # next 4 layers have 1024 units - convs += " + [(2048, 1)] * 2" # final 2 layers use 1x1 convolutions - - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_layers = getattr(args, "encoder_layers", convs) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 768) - args.decoder_layers = getattr(args, "decoder_layers", convs) - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 512) - base_architecture(args) - - -@register_model_architecture("fconv", "fconv_wmt_en_fr") -def fconv_wmt_en_fr(args): - convs = "[(512, 3)] * 6" # first 6 layers have 512 units - convs += " + [(768, 3)] * 4" # next 4 layers have 768 units - convs += " + [(1024, 3)] * 3" # next 3 layers have 1024 units - convs += " + [(2048, 1)] * 1" # next 1 layer uses 1x1 convolutions - convs += " + [(4096, 1)] * 1" # final 1 layer uses 1x1 convolutions - - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_layers = getattr(args, "encoder_layers", convs) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 768) - args.decoder_layers = getattr(args, "decoder_layers", convs) - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 512) - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv_lm.py deleted file mode 100644 index 07391eaa2908eacd2709176942d920c483c4f066..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv_lm.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
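# The architecture hooks above only fill in defaults with getattr, so a new named
# variant just overrides a few fields and delegates to base_architecture. The
# "fconv_tiny_example" name below is hypothetical, not an architecture shipped
# with fairseq.
from fairseq.models import register_model_architecture
from fairseq.models.fconv import base_architecture

@register_model_architecture("fconv", "fconv_tiny_example")
def fconv_tiny_example(args):
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 128)
    args.encoder_layers = getattr(args, "encoder_layers", "[(128, 3)] * 2")
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 128)
    args.decoder_layers = getattr(args, "decoder_layers", "[(128, 3)] * 2")
    base_architecture(args)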
- -from fairseq import utils -from fairseq.models import ( - FairseqLanguageModel, - register_model, - register_model_architecture, -) -from fairseq.models.fconv import FConvDecoder - - -@register_model("fconv_lm") -class FConvLanguageModel(FairseqLanguageModel): - def __init__(self, decoder): - super().__init__(decoder) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - parser.add_argument( - "--dropout", type=float, metavar="D", help="dropout probability" - ) - parser.add_argument( - "--decoder-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension", - ) - parser.add_argument( - "--decoder-layers", - type=str, - metavar="EXPR", - help="decoder layers [(dim, kernel_size), ...]", - ) - parser.add_argument( - "--decoder-out-embed-dim", - type=int, - metavar="N", - help="decoder output embedding dimension", - ) - parser.add_argument( - "--adaptive-softmax-cutoff", - metavar="EXPR", - help="comma separated list of adaptive softmax cutoff points. " - "Must be used with adaptive_loss criterion", - ) - parser.add_argument( - "--adaptive-softmax-dropout", - type=float, - metavar="D", - help="sets adaptive softmax dropout for the tail projections", - ) - parser.add_argument( - "--decoder-attention", - type=str, - metavar="EXPR", - help="decoder attention [True, ...]", - ) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - # make sure all arguments are present in older models - base_lm_architecture(args) - - if hasattr(args, "max_target_positions") and not hasattr( - args, "tokens_per_sample" - ): - args.tokens_per_sample = args.max_target_positions - - decoder = FConvDecoder( - dictionary=task.target_dictionary, - embed_dim=args.decoder_embed_dim, - convolutions=eval(args.decoder_layers), - out_embed_dim=args.decoder_embed_dim, - attention=eval(args.decoder_attention), - dropout=args.dropout, - max_positions=args.tokens_per_sample, - share_embed=False, - positional_embeddings=False, - adaptive_softmax_cutoff=( - utils.eval_str_list(args.adaptive_softmax_cutoff, type=int) - if args.criterion == "adaptive_loss" - else None - ), - adaptive_softmax_dropout=args.adaptive_softmax_dropout, - ) - return FConvLanguageModel(decoder) - - -@register_model_architecture("fconv_lm", "fconv_lm") -def base_lm_architecture(args): - args.dropout = getattr(args, "dropout", 0.1) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 128) - args.decoder_layers = getattr(args, "decoder_layers", "[(1268, 4)] * 13") - args.decoder_attention = getattr(args, "decoder_attention", "False") - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - - -@register_model_architecture("fconv_lm", "fconv_lm_dauphin_wikitext103") -def fconv_lm_dauphin_wikitext103(args): - layers = "[(850, 6)] * 3" - layers += " + [(850, 1)] * 1" - layers += " + [(850, 5)] * 4" - layers += " + [(850, 1)] * 1" - layers += " + [(850, 4)] * 3" - layers += " + [(1024, 4)] * 1" - layers += " + [(2048, 4)] * 1" - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 280) - args.decoder_layers = getattr(args, "decoder_layers", layers) - args.decoder_attention = getattr(args, "decoder_attention", "False") - args.adaptive_softmax_cutoff = getattr( - args, "adaptive_softmax_cutoff", "10000,20000,200000" - ) - base_lm_architecture(args) - - -@register_model_architecture("fconv_lm", "fconv_lm_dauphin_gbw") -def fconv_lm_dauphin_gbw(args): - layers 
= "[(512, 5)]" - layers += " + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3" - layers += " + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3" - layers += " + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6" - layers += " + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]" - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 128) - args.decoder_layers = getattr(args, "decoder_layers", layers) - args.decoder_attention = getattr(args, "decoder_attention", "False") - args.adaptive_softmax_cutoff = getattr( - args, "adaptive_softmax_cutoff", "10000,50000,200000" - ) - base_lm_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv_self_att.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv_self_att.py deleted file mode 100644 index 8357ef7847ed25a62345e219c41906156828c233..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/fconv_self_att.py +++ /dev/null @@ -1,674 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import math -import os - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import checkpoint_utils -from fairseq.incremental_decoding_utils import with_incremental_state -from fairseq.models import ( - CompositeEncoder, - FairseqDecoder, - FairseqEncoder, - FairseqEncoderDecoderModel, - register_model, - register_model_architecture, -) -from fairseq.modules import ( - DownsampledMultiHeadAttention, - FairseqDropout, - GradMultiply, - LayerNorm, - LearnedPositionalEmbedding, - LinearizedConvolution, -) - - -logger = logging.getLogger(__name__) - - -@register_model("fconv_self_att") -class FConvModelSelfAtt(FairseqEncoderDecoderModel): - @classmethod - def hub_models(cls): - return { - "conv.stories.pretrained": { - "path": "https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz", - "checkpoint_file": "pretrained_checkpoint.pt", - "tokenizer": "nltk", - }, - "conv.stories": { - "path": "https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz", - "checkpoint_file": "fusion_checkpoint.pt", - "tokenizer": "nltk", - "pretrained": "True", - "pretrained_checkpoint": "./pretrained_checkpoint.pt", - }, - # Test set containing dictionaries - "data.stories": "https://dl.fbaipublicfiles.com/fairseq/data/stories_test.tar.bz2", - } - - def __init__(self, encoder, decoder, pretrained_encoder=None): - super().__init__(encoder, decoder) - self.encoder.num_attention_layers = sum( - layer is not None for layer in decoder.attention - ) - self.pretrained_encoder = pretrained_encoder - if self.pretrained_encoder is None: - encoders = {"encoder": encoder} - else: - encoders = {"encoder": encoder, "pretrained": self.pretrained_encoder} - # for fusion model, CompositeEncoder contains both pretrained and training encoders - # these are forwarded and then combined in the decoder - self.encoder = CompositeEncoder(encoders) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--dropout', type=float, metavar='D', - help='dropout probability') - parser.add_argument('--encoder-embed-dim', type=int, metavar='N', - help='encoder embedding dimension') - parser.add_argument('--encoder-layers', type=str, metavar='EXPR', - help='encoder 
layers [(dim, kernel_size), ...]') - parser.add_argument('--decoder-embed-dim', type=int, metavar='N', - help='decoder embedding dimension') - parser.add_argument('--decoder-layers', type=str, metavar='EXPR', - help='decoder layers [(dim, kernel_size), ...]') - parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', - help='decoder output embedding dimension') - parser.add_argument('--decoder-attention', type=str, metavar='EXPR', - help='decoder attention [True, ...]') - parser.add_argument('--self-attention', type=str, metavar='EXPR', - help='decoder self-attention layers, ex: [True] + [False]*5') - parser.add_argument('--multihead-attention-nheads', type=int, - help='Number of heads to use in attention') - parser.add_argument('--multihead-self-attention-nheads', type=int, - help='Number of heads to use in self-attention') - parser.add_argument('--encoder-attention', type=str, metavar='EXPR', - help='encoder attention [True, ...]') - parser.add_argument('--encoder-attention-nheads', type=int, - help='Number of heads to use in encoder attention') - parser.add_argument('--project-input', type=str, metavar='EXPR', - help='Use projections in self-attention [True, ...]') - parser.add_argument('--gated-attention', type=str, metavar='EXPR', - help='Use GLU layers in self-attention projections [True, ...]') - parser.add_argument('--downsample', type=str, metavar='EXPR', - help='Use downsampling in self-attention [True, ...]') - parser.add_argument('--pretrained-checkpoint', metavar='DIR', - help='path to load checkpoint from pretrained model') - parser.add_argument('--pretrained', type=str, metavar='EXPR', - help='use pretrained model when training [True, ...]') - # fmt: on - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - trained_encoder, trained_decoder = None, None - pretrained = eval(args.pretrained) - if pretrained: - logger.info("loading pretrained model") - if not os.path.exists(args.pretrained_checkpoint): - new_pretrained_checkpoint = os.path.join( - args.data, args.pretrained_checkpoint - ) - if os.path.exists(new_pretrained_checkpoint): - args.pretrained_checkpoint = new_pretrained_checkpoint - trained_model = checkpoint_utils.load_model_ensemble( - filenames=[args.pretrained_checkpoint], - task=task, - )[0][0] - trained_decoder = list(trained_model.children())[1] - trained_encoder = list(trained_model.children())[0] - - # freeze pretrained model - for param in trained_decoder.parameters(): - param.requires_grad = False - for param in trained_encoder.parameters(): - param.requires_grad = False - - encoder = FConvEncoder( - task.source_dictionary, - embed_dim=args.encoder_embed_dim, - convolutions=eval(args.encoder_layers), - dropout=args.dropout, - max_positions=args.max_source_positions, - attention=eval(args.encoder_attention), - attention_nheads=args.encoder_attention_nheads, - ) - - decoder = FConvDecoder( - task.target_dictionary, - embed_dim=args.decoder_embed_dim, - convolutions=eval(args.decoder_layers), - out_embed_dim=args.decoder_out_embed_dim, - attention=eval(args.decoder_attention), - dropout=args.dropout, - max_positions=args.max_target_positions, - selfattention=eval(args.self_attention), - attention_nheads=args.multihead_attention_nheads, - selfattention_nheads=args.multihead_self_attention_nheads, - project_input=eval(args.project_input), - gated_attention=eval(args.gated_attention), - downsample=eval(args.downsample), - pretrained=pretrained, - trained_decoder=trained_decoder, - ) - model = 
FConvModelSelfAtt(encoder, decoder, trained_encoder) - - return model - - @property - def pretrained(self): - return self.pretrained_encoder is not None - - -class FConvEncoder(FairseqEncoder): - """Convolutional encoder""" - - def __init__( - self, - dictionary, - embed_dim=512, - max_positions=1024, - convolutions=((512, 3),) * 20, - dropout=0.1, - attention=False, - attention_nheads=1, - ): - super().__init__(dictionary) - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.num_attention_layers = None - - num_embeddings = len(dictionary) - self.padding_idx = dictionary.pad() - self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx) - self.embed_positions = PositionalEmbedding( - max_positions, - embed_dim, - self.padding_idx, - ) - - def expand_bool_array(val): - if isinstance(val, bool): - # expand True into [True, True, ...] and do the same with False - return [val] * len(convolutions) - return val - - attention = expand_bool_array(attention) - - in_channels = convolutions[0][0] - self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) - self.projections = nn.ModuleList() - self.convolutions = nn.ModuleList() - self.attention = nn.ModuleList() - self.attproj = nn.ModuleList() - for i, (out_channels, kernel_size) in enumerate(convolutions): - self.projections.append( - Linear(in_channels, out_channels) - if in_channels != out_channels - else None - ) - self.convolutions.append( - ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout) - ) - - self.attention.append( - SelfAttention(out_channels, embed_dim, attention_nheads) - if attention[i] - else None - ) - in_channels = out_channels - - self.fc2 = Linear(in_channels, embed_dim) - - def forward(self, src_tokens, src_lengths): - # embed tokens and positions - x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens) - x = self.dropout_module(x) - input_embedding = x.transpose(0, 1) - - # project to size of convolution - x = self.fc1(x) - - encoder_padding_mask = src_tokens.eq(self.padding_idx).t() # -> T x B - if not encoder_padding_mask.any(): - encoder_padding_mask = None - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - # temporal convolutions - for proj, conv, attention in zip( - self.projections, self.convolutions, self.attention - ): - residual = x if proj is None else proj(x) - - if encoder_padding_mask is not None: - x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) - - x = self.dropout_module(x) - padding_l = (conv.kernel_size[0] - 1) // 2 - padding_r = conv.kernel_size[0] // 2 - x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r)) - x = conv(x) - x = F.glu(x, dim=2) - if attention is not None: - x = attention(x) - x = (x + residual) * math.sqrt(0.5) - - # T x B x C -> B x T x C - x = x.transpose(1, 0) - - # project back to size of embedding - x = self.fc2(x) - - if encoder_padding_mask is not None: - encoder_padding_mask = encoder_padding_mask.t() # -> B x T - x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) - - # scale gradients (this only affects backward, not forward) - x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers)) - - # add output to input embedding for attention - y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5) - - return { - "encoder_out": (x, y), - "encoder_padding_mask": encoder_padding_mask, # B x T - } - - def reorder_encoder_out(self, encoder_out, new_order): - encoder_out["encoder_out"] = tuple( - eo.index_select(0, new_order) for eo in encoder_out["encoder_out"] - ) - - if 
encoder_out["encoder_padding_mask"] is not None: - encoder_out["encoder_padding_mask"] = encoder_out[ - "encoder_padding_mask" - ].index_select(0, new_order) - - if "pretrained" in encoder_out: - encoder_out["pretrained"]["encoder_out"] = tuple( - eo.index_select(0, new_order) - for eo in encoder_out["pretrained"]["encoder_out"] - ) - - return encoder_out - - def max_positions(self): - """Maximum input length supported by the encoder.""" - return self.embed_positions.max_positions - - -@with_incremental_state -class FConvDecoder(FairseqDecoder): - """Convolutional decoder""" - - def __init__( - self, - dictionary, - embed_dim=512, - out_embed_dim=256, - max_positions=1024, - convolutions=((512, 3),) * 8, - attention=True, - dropout=0.1, - selfattention=False, - attention_nheads=1, - selfattention_nheads=1, - project_input=False, - gated_attention=False, - downsample=False, - pretrained=False, - trained_decoder=None, - ): - super().__init__(dictionary) - self.register_buffer("version", torch.Tensor([2])) - self.pretrained = pretrained - self.pretrained_decoder = trained_decoder - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.need_attn = True - in_channels = convolutions[0][0] - - def expand_bool_array(val): - if isinstance(val, bool): - # expand True into [True, True, ...] and do the same with False - return [val] * len(convolutions) - return val - - attention = expand_bool_array(attention) - selfattention = expand_bool_array(selfattention) - - if not isinstance(attention, list) or len(attention) != len(convolutions): - raise ValueError( - "Attention is expected to be a list of booleans of " - "length equal to the number of layers." - ) - - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) - - self.embed_positions = PositionalEmbedding( - max_positions, - embed_dim, - padding_idx, - ) - - self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) - self.projections = nn.ModuleList() - self.convolutions = nn.ModuleList() - self.attention = nn.ModuleList() - self.selfattention = nn.ModuleList() - self.attproj = nn.ModuleList() - for i, (out_channels, kernel_size) in enumerate(convolutions): - self.projections.append( - Linear(in_channels, out_channels) - if in_channels != out_channels - else None - ) - self.convolutions.append( - LinearizedConv1d( - in_channels, - out_channels * 2, - kernel_size, - padding=(kernel_size - 1), - dropout=dropout, - ) - ) - - self.attention.append( - DownsampledMultiHeadAttention( - out_channels, - embed_dim, - attention_nheads, - project_input=project_input, - gated=False, - downsample=False, - ) - if attention[i] - else None - ) - - self.attproj.append( - Linear(out_channels, embed_dim, dropout=dropout) - if attention[i] - else None - ) - self.selfattention.append( - SelfAttention( - out_channels, - embed_dim, - selfattention_nheads, - project_input=project_input, - gated=gated_attention, - downsample=downsample, - ) - if selfattention[i] - else None - ) - in_channels = out_channels - - self.fc2 = Linear(in_channels, out_embed_dim) - self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout) - - # model fusion - if self.pretrained: - # independent gates are learned from the concatenated input - self.gate1 = nn.Sequential( - Linear(out_embed_dim * 2, out_embed_dim), nn.Sigmoid() - ) - self.gate2 = nn.Sequential( - Linear(out_embed_dim * 2, out_embed_dim), nn.Sigmoid() - ) - # pretrained and trained models are joined - 
self.joining = nn.Sequential( - Linear(out_embed_dim * 2, out_embed_dim * 2), - LayerNorm(out_embed_dim * 2), - nn.GLU(), - Linear(out_embed_dim, out_embed_dim * 2), - LayerNorm(out_embed_dim * 2), - nn.GLU(), - Linear(out_embed_dim, out_embed_dim), - LayerNorm(out_embed_dim), - ) - # pretrained model contains an output layer that is nhid -> vocab size - # but the models are combined in their hidden state - # the hook stores the output of the pretrained model forward - self.pretrained_outputs = {} - - def save_output(): - def hook(a, b, output): - self.pretrained_outputs["out"] = output - - return hook - - self.pretrained_decoder.fc2.register_forward_hook(save_output()) - - def forward(self, prev_output_tokens, encoder_out): - trained_encoder_out = encoder_out["pretrained"] if self.pretrained else None - encoder_out = encoder_out["encoder"]["encoder_out"] - - encoder_a, encoder_b = self._split_encoder_out(encoder_out) - - # embed positions - positions = self.embed_positions(prev_output_tokens) - - # embed tokens and positions - x = self.embed_tokens(prev_output_tokens) + positions - x = self.dropout_module(x) - target_embedding = x.transpose(0, 1) - - # project to size of convolution - x = self.fc1(x) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - # temporal convolutions - avg_attn_scores = None - for proj, conv, attention, selfattention, attproj in zip( - self.projections, - self.convolutions, - self.attention, - self.selfattention, - self.attproj, - ): - residual = x if proj is None else proj(x) - - x = self.dropout_module(x) - x = conv(x) - x = F.glu(x, dim=2) - - # attention - if attention is not None: - r = x - x, attn_scores = attention( - attproj(x) + target_embedding, encoder_a, encoder_b - ) - x = x + r - if not self.training and self.need_attn: - if avg_attn_scores is None: - avg_attn_scores = attn_scores - else: - avg_attn_scores.add_(attn_scores) - - if selfattention is not None: - x = selfattention(x) - - x = (x + residual) * math.sqrt(0.5) - - # T x B x C -> B x T x C - x = x.transpose(0, 1) - - # project back to size of vocabulary - x = self.fc2(x) - x = self.dropout_module(x) - if not self.pretrained: - x = self.fc3(x) - - # fusion gating - if self.pretrained: - trained_x, _ = self.pretrained_decoder.forward( - prev_output_tokens, trained_encoder_out - ) - y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1) - gate1 = self.gate1(y) - gate2 = self.gate2(y) - gated_x1 = gate1 * x - gated_x2 = gate2 * self.pretrained_outputs["out"] - fusion = torch.cat([gated_x1, gated_x2], dim=-1) - fusion = self.joining(fusion) - fusion_output = self.fc3(fusion) - return fusion_output, avg_attn_scores - else: - return x, avg_attn_scores - - def max_positions(self): - """Maximum output length supported by the decoder.""" - return self.embed_positions.max_positions - - def make_generation_fast_(self, need_attn=False, **kwargs): - self.need_attn = need_attn - - def _split_encoder_out(self, encoder_out): - """Split and transpose encoder outputs.""" - # transpose only once to speed up attention layers - encoder_a, encoder_b = encoder_out - encoder_a = encoder_a.transpose(0, 1).contiguous() - encoder_b = encoder_b.transpose(0, 1).contiguous() - result = (encoder_a, encoder_b) - return result - - -class SelfAttention(nn.Module): - def __init__( - self, - out_channels, - embed_dim, - num_heads, - project_input=False, - gated=False, - downsample=False, - ): - super().__init__() - self.attention = DownsampledMultiHeadAttention( - out_channels, - embed_dim, - num_heads, - dropout=0, - 
bias=True, - project_input=project_input, - gated=gated, - downsample=downsample, - ) - self.in_proj_q = Linear(out_channels, embed_dim) - self.in_proj_k = Linear(out_channels, embed_dim) - self.in_proj_v = Linear(out_channels, embed_dim) - self.ln = LayerNorm(out_channels) - - def forward(self, x): - residual = x - query = self.in_proj_q(x) - key = self.in_proj_k(x) - value = self.in_proj_v(x) - x, _ = self.attention( - query, key, value, mask_future_timesteps=True, use_scalar_bias=True - ) - return self.ln(x + residual) - - -def Embedding(num_embeddings, embedding_dim, padding_idx): - m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - m.weight.data.normal_(0, 0.1) - return m - - -def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx): - m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) - m.weight.data.normal_(0, 0.1) - return m - - -def Linear(in_features, out_features, dropout=0.0): - """Weight-normalized Linear layer (input: N x T x C)""" - m = nn.Linear(in_features, out_features) - m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features)) - m.bias.data.zero_() - return m - - -def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs): - """Weight-normalized Conv1d layer optimized for decoding""" - m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs) - std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) - m.weight.data.normal_(mean=0, std=std) - m.bias.data.zero_() - return m - - -def ConvTBC(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs): - """Weight-normalized Conv1d layer""" - from fairseq.modules import ConvTBC - - m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs) - std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) - m.weight.data.normal_(mean=0, std=std) - m.bias.data.zero_() - return m - - -@register_model_architecture("fconv_self_att", "fconv_self_att") -def base_architecture(args): - args.dropout = getattr(args, "dropout", 0.1) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_layers = getattr(args, "encoder_layers", "[(512, 3)] * 3") - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_layers = getattr(args, "decoder_layers", "[(512, 3)] * 8") - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256) - args.decoder_attention = getattr(args, "decoder_attention", "True") - args.self_attention = getattr(args, "self_attention", "False") - args.encoder_attention = getattr(args, "encoder_attention", "False") - args.multihead_attention_nheads = getattr(args, "multihead_attention_nheads", 1) - args.multihead_self_attention_nheads = getattr( - args, "multihead_self_attention_nheads", 1 - ) - args.encoder_attention_nheads = getattr(args, "encoder_attention_nheads", 1) - args.project_input = getattr(args, "project_input", "False") - args.gated_attention = getattr(args, "gated_attention", "False") - args.downsample = getattr(args, "downsample", "False") - args.pretrained_checkpoint = getattr(args, "pretrained_checkpoint", "") - args.pretrained = getattr(args, "pretrained", "False") - - -@register_model_architecture("fconv_self_att", "fconv_self_att_wp") -def fconv_self_att_wp(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) - args.encoder_layers = getattr( - args, "encoder_layers", "[(128, 3)] * 2 + [(512,3)] * 1" - ) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) - 
args.decoder_layers = getattr( - args, "decoder_layers", "[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1" - ) - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256) - args.self_attention = getattr(args, "self_attention", "True") - args.multihead_self_attention_nheads = getattr( - args, "multihead_self_attention_nheads", 4 - ) - args.project_input = getattr(args, "project_input", "True") - args.gated_attention = getattr(args, "gated_attention", "True") - args.downsample = getattr(args, "downsample", "True") - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/huggingface/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/huggingface/__init__.py deleted file mode 100644 index f7911c2c8edf516855023a285b18935e5389ec02..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/huggingface/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import importlib -import os - - -# automatically import any Python files in the models/huggingface/ directory -models_dir = os.path.dirname(__file__) -for file in os.listdir(models_dir): - path = os.path.join(models_dir, file) - if ( - not file.startswith("_") - and not file.startswith(".") - and (file.endswith(".py") or os.path.isdir(path)) - ): - model_name = file[: file.find(".py")] if file.endswith(".py") else file - module = importlib.import_module("fairseq.models.huggingface." + model_name) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/huggingface/hf_gpt2.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/huggingface/hf_gpt2.py deleted file mode 100644 index 3a8eb78198f5808557092f814e92f1c9d72933ec..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/huggingface/hf_gpt2.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import logging -import os -import sys -from typing import Dict, List, Optional - -import torch -from fairseq.models import ( - FairseqIncrementalDecoder, - FairseqLanguageModel, - register_model, - register_model_architecture, -) - - -logger = logging.getLogger(__name__) - - -DEFAULT_MAX_TARGET_POSITIONS = 1024 - - -@register_model("hf_gpt2") -class HuggingFaceGPT2LanguageModel(FairseqLanguageModel): - def __init__(self, decoder): - super().__init__(decoder) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--embed-dim', type=int, metavar='N', - help='embedding dimension') - parser.add_argument('--num-attention-heads', type=int, metavar='N', - help='num attention heads') - parser.add_argument('--num-layers', type=int, metavar='N', - help='num layers') - parser.add_argument('--dropout', type=float, metavar='D', - help='dropout probability for all fully connected layers ' - 'in the embeddings, encoder, and pooler') - parser.add_argument('--attention-dropout', type=float, metavar='D', - help='dropout probability for attention weights') - # fmt: on - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - default_architecture(args) - return cls(HuggingFaceGPT2Decoder(args, task)) - - -class HuggingFaceGPT2Decoder(FairseqIncrementalDecoder): - def __init__(self, args, task): - try: - from transformers import GPT2Config, GPT2LMHeadModel - except ImportError: - raise ImportError( - "\n\nPlease install huggingface/transformers with:" - "\n\n pip install transformers" - ) - - super().__init__(task.target_dictionary) - - config = GPT2Config( - vocab_size=len(task.target_dictionary), - n_positions=args.max_target_positions + 1, - n_ctx=args.max_target_positions, - n_embd=args.embed_dim, - n_layer=args.num_layers, - n_head=args.num_attention_heads, - resid_pdrop=args.dropout, - embd_pdrop=args.dropout, - attn_pdrop=args.attention_dropout, - layer_norm_epsilon=1e-6, - ) - self.model = GPT2LMHeadModel(config) - - # set zero embedding for padding symbol - self.pad_idx = task.target_dictionary.pad() - self.model.transformer.wte.weight.data[self.pad_idx].zero_() - self.model.transformer.wpe.weight.data[0].zero_() - - def forward( - self, - prev_output_tokens, - src_lengths=None, - incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None, - encoder_out=None, - ): - features = self.extract_features(prev_output_tokens, incremental_state) - lm_logits = self.model.lm_head(features) - return (lm_logits,) - - def extract_features( - self, - prev_output_tokens, - incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None, - ): - if incremental_state: - past = self.get_incremental_state("past") - else: - past = None - - # don't attend to padding symbols - attention_mask = prev_output_tokens.ne(self.pad_idx).int() - - # set position ids to exclude padding symbols - position_ids = attention_mask * ( - torch.arange(1, 1 + prev_output_tokens.size(1)) - .to(prev_output_tokens) - .repeat(prev_output_tokens.size(0), 1) - ) - - outputs = self.model.transformer( - input_ids=prev_output_tokens, - past=past, - attention_mask=attention_mask, - position_ids=position_ids, - ) - last_hidden_states = outputs[0] - - if incremental_state: - self.set_incremental_state(incremental_state, "past", outputs[1]) - - return last_hidden_states - - def max_positions(self): - return self.model.config.n_positions - 1 - - -@register_model_architecture("hf_gpt2", "hf_gpt2") -def default_architecture(args): - if 
getattr(args, "max_target_positions", None) is None: - args.max_target_positions = getattr( - args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS - ) - args.embed_dim = getattr(args, "embed_dim", 768) - args.num_attention_heads = getattr(args, "num_attention_heads", 12) - args.num_layers = getattr(args, "num_layers", 12) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - - -@register_model_architecture("hf_gpt2", "hf_gpt2_medium") -def hf_gpt2_medium(args): - args.embed_dim = getattr(args, "embed_dim", 1024) - args.num_attention_heads = getattr(args, "num_attention_heads", 16) - args.num_layers = getattr(args, "num_layers", 24) - default_architecture(args) - - -@register_model_architecture("hf_gpt2", "hf_gpt2_large") -def hf_gpt2_large(args): - args.embed_dim = getattr(args, "embed_dim", 1280) - args.num_attention_heads = getattr(args, "num_attention_heads", 20) - args.num_layers = getattr(args, "num_layers", 36) - default_architecture(args) - - -@register_model_architecture("hf_gpt2", "hf_gpt2_xl") -def hf_gpt2_xl(args): - args.embed_dim = getattr(args, "embed_dim", 1600) - args.num_attention_heads = getattr(args, "num_attention_heads", 25) - args.num_layers = getattr(args, "num_layers", 48) - default_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lightconv.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lightconv.py deleted file mode 100644 index b614da366513091132c8b6bd8b8e170cce33a1c4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lightconv.py +++ /dev/null @@ -1,1018 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqIncrementalDecoder, - register_model, - register_model_architecture, -) -from fairseq.modules import ( - AdaptiveSoftmax, - DynamicConv, - FairseqDropout, - LayerNorm, - LightweightConv, - MultiheadAttention, - PositionalEmbedding, -) - - -@register_model("lightconv") -class LightConvModel(FairseqEncoderDecoderModel): - """ - LightConv and DynamicConv model from `"Pay Less Attention with Lightweight and Dynamic Convolutions" (Wu, et al, 2019) - `_. - To use LightConv please set ``--encoder-conv-type lightweight --decoder-conv-type lightweight`` - To use DynamicConv please set ``--encoder-conv-type dynamic --decoder-conv-type dynamic`` - - Args: - encoder (LightConvEncoder): the encoder - decoder (LightConvDecoder): the decoder - - The LightConv model provides the following named architectures and - command-line arguments: - - .. 
argparse:: - :ref: fairseq.models.lightconv_parser - :prog: - """ - - @classmethod - def hub_models(cls): - # fmt: off - - def moses_subword(path): - return { - 'path': path, - 'tokenizer': 'moses', - 'bpe': 'subword_nmt', - } - - return { - 'lightconv.no_glu.iwslt14.de-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.lightconv.tar.gz'), - 'dynamicconv.no_glu.iwslt14.de-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.dynamicconv.tar.gz'), - 'lightconv.no_glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz'), - 'dynamicconv.no_glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz'), - 'lightconv.glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'), - 'dynamicconv.glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'), - 'lightconv.glu.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'), - 'dynamicconv.glu.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'), - 'lightconv.glu.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz'), - 'dynamicconv.glu.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz'), - 'lightconv.glu.wmt17.zh-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz'), - 'dynamicconv.glu.wmt17.zh-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz'), - } - # fmt: on - - def __init__(self, encoder, decoder): - super().__init__(encoder, decoder) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - parser.add_argument( - "--dropout", type=float, metavar="D", help="dropout probability" - ) - parser.add_argument( - "--attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights", - ) - parser.add_argument( - "--relu-dropout", - type=float, - metavar="D", - help="dropout probability after ReLU in FFN", - ) - parser.add_argument( - "--input-dropout", - type=float, - metavar="D", - help="dropout probability of the inputs", - ) - parser.add_argument( - "--encoder-embed-path", - type=str, - metavar="STR", - help="path to pre-trained encoder embedding", - ) - parser.add_argument( - "--encoder-embed-dim", - type=int, - metavar="N", - help="encoder embedding dimension", - ) - parser.add_argument( - "--encoder-conv-dim", - type=int, - metavar="N", - help="encoder embedding dimension", - ) - parser.add_argument( - "--encoder-ffn-embed-dim", - type=int, - metavar="N", - help="encoder embedding dimension for FFN", - ) - parser.add_argument( - "--encoder-layers", type=int, metavar="N", help="num encoder layers" - ) - parser.add_argument( - "--encoder-attention-heads", - type=int, - metavar="N", - help="num encoder attention heads or LightConv/DynamicConv heads", - ) - parser.add_argument( - "--encoder-normalize-before", - action="store_true", - help="apply layernorm 
before each encoder block", - ) - parser.add_argument( - "--encoder-learned-pos", - action="store_true", - help="use learned positional embeddings in the encoder", - ) - parser.add_argument( - "--decoder-embed-path", - type=str, - metavar="STR", - help="path to pre-trained decoder embedding", - ) - parser.add_argument( - "--decoder-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension", - ) - parser.add_argument( - "--decoder-conv-dim", - type=int, - metavar="N", - help="decoder embedding dimension", - ) - parser.add_argument( - "--decoder-ffn-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension for FFN", - ) - parser.add_argument( - "--decoder-layers", type=int, metavar="N", help="num decoder layers" - ) - parser.add_argument( - "--decoder-attention-heads", - type=int, - metavar="N", - help="num decoder attention heads or LightConv/DynamicConv heads", - ) - parser.add_argument( - "--decoder-learned-pos", - action="store_true", - help="use learned positional embeddings in the decoder", - ) - parser.add_argument( - "--decoder-normalize-before", - action="store_true", - help="apply layernorm before each decoder block", - ) - parser.add_argument( - "--share-decoder-input-output-embed", - action="store_true", - help="share decoder input and output embeddings", - ) - parser.add_argument( - "--share-all-embeddings", - action="store_true", - help="share encoder, decoder and output embeddings" - " (requires shared dictionary and embed dim)", - ) - parser.add_argument( - "--adaptive-softmax-cutoff", - metavar="EXPR", - help="comma separated list of adaptive softmax cutoff points. " - "Must be used with adaptive_loss criterion", - ), - parser.add_argument( - "--adaptive-softmax-dropout", - type=float, - metavar="D", - help="sets adaptive softmax dropout for the tail projections", - ) - - """LightConv and DynamicConv arguments""" - parser.add_argument( - "--encoder-kernel-size-list", - type=lambda x: utils.eval_str_list(x, int), - help='list of kernel size (default: "[3,7,15,31,31,31,31]")', - ) - parser.add_argument( - "--decoder-kernel-size-list", - type=lambda x: utils.eval_str_list(x, int), - help='list of kernel size (default: "[3,7,15,31,31,31]")', - ) - parser.add_argument( - "--encoder-glu", type=utils.eval_bool, help="glu after in proj" - ) - parser.add_argument( - "--decoder-glu", type=utils.eval_bool, help="glu after in proj" - ) - parser.add_argument( - "--encoder-conv-type", - default="dynamic", - type=str, - choices=["dynamic", "lightweight"], - help="type of convolution", - ) - parser.add_argument( - "--decoder-conv-type", - default="dynamic", - type=str, - choices=["dynamic", "lightweight"], - help="type of convolution", - ) - parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool) - parser.add_argument( - "--weight-dropout", - type=float, - metavar="D", - help="dropout probability for conv weights", - ) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_architecture(args) - - if not hasattr(args, "max_source_positions"): - args.max_source_positions = 1024 - if not hasattr(args, "max_target_positions"): - args.max_target_positions = 1024 - - src_dict, tgt_dict = task.source_dictionary, task.target_dictionary - - def build_embedding(dictionary, embed_dim, path=None): - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - emb = Embedding(num_embeddings, embed_dim, padding_idx) - # if provided, load from 
preloaded dictionaries - if path: - embed_dict = utils.parse_embedding(path) - utils.load_embedding(embed_dict, dictionary, emb) - return emb - - if args.share_all_embeddings: - if src_dict != tgt_dict: - raise RuntimeError( - "--share-all-embeddings requires a joined dictionary" - ) - if args.encoder_embed_dim != args.decoder_embed_dim: - raise RuntimeError( - "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" - ) - if args.decoder_embed_path and ( - args.decoder_embed_path != args.encoder_embed_path - ): - raise RuntimeError( - "--share-all-embeddings not compatible with --decoder-embed-path" - ) - encoder_embed_tokens = build_embedding( - src_dict, args.encoder_embed_dim, args.encoder_embed_path - ) - decoder_embed_tokens = encoder_embed_tokens - args.share_decoder_input_output_embed = True - else: - encoder_embed_tokens = build_embedding( - src_dict, args.encoder_embed_dim, args.encoder_embed_path - ) - decoder_embed_tokens = build_embedding( - tgt_dict, args.decoder_embed_dim, args.decoder_embed_path - ) - - encoder = LightConvEncoder(args, src_dict, encoder_embed_tokens) - decoder = LightConvDecoder(args, tgt_dict, decoder_embed_tokens) - return LightConvModel(encoder, decoder) - - -class LightConvEncoder(FairseqEncoder): - """ - LightConv encoder consisting of *args.encoder_layers* layers. Each layer - is a :class:`LightConvEncoderLayer`. - - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): encoding dictionary - embed_tokens (torch.nn.Embedding): input embedding - """ - - def __init__(self, args, dictionary, embed_tokens): - super().__init__(dictionary) - self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - - embed_dim = embed_tokens.embedding_dim - self.padding_idx = embed_tokens.padding_idx - self.max_source_positions = args.max_source_positions - - self.embed_tokens = embed_tokens - self.embed_scale = math.sqrt(embed_dim) - self.embed_positions = ( - PositionalEmbedding( - args.max_source_positions, - embed_dim, - self.padding_idx, - learned=args.encoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - - self.layers = nn.ModuleList([]) - self.layers.extend( - [ - LightConvEncoderLayer( - args, kernel_size=args.encoder_kernel_size_list[i] - ) - for i in range(args.encoder_layers) - ] - ) - self.register_buffer("version", torch.Tensor([2])) - self.normalize = args.encoder_normalize_before - if self.normalize: - self.layer_norm = LayerNorm(embed_dim) - - def forward(self, src_tokens, **unused): - """ - Args: - src_tokens (LongTensor): tokens in the source language of shape - `(batch, src_len)` - - Returns: - dict: - - **encoder_out** (Tensor): the last encoder layer's output of - shape `(src_len, batch, embed_dim)` - - **encoder_padding_mask** (ByteTensor): the positions of - padding elements of shape `(batch, src_len)` - """ - # embed tokens and positions - x = self.embed_scale * self.embed_tokens(src_tokens) - if self.embed_positions is not None: - x += self.embed_positions(src_tokens) - x = self.dropout_module(x) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - # compute padding mask - encoder_padding_mask = src_tokens.eq(self.padding_idx) - if not encoder_padding_mask.any(): - encoder_padding_mask = None - - # encoder layers - for layer in self.layers: - x = layer(x, encoder_padding_mask) - - if self.normalize: - x = self.layer_norm(x) - - return { - "encoder_out": x, # T x B x C - "encoder_padding_mask": 
encoder_padding_mask, # B x T - } - - def reorder_encoder_out(self, encoder_out, new_order): - """ - Reorder encoder output according to *new_order*. - - Args: - encoder_out: output from the ``forward()`` method - new_order (LongTensor): desired order - - Returns: - *encoder_out* rearranged according to *new_order* - """ - if encoder_out["encoder_out"] is not None: - encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( - 1, new_order - ) - if encoder_out["encoder_padding_mask"] is not None: - encoder_out["encoder_padding_mask"] = encoder_out[ - "encoder_padding_mask" - ].index_select(0, new_order) - return encoder_out - - def max_positions(self): - """Maximum input length supported by the encoder.""" - if self.embed_positions is None: - return self.max_source_positions - return min(self.max_source_positions, self.embed_positions.max_positions) - - -class LightConvDecoder(FairseqIncrementalDecoder): - """ - LightConv decoder consisting of *args.decoder_layers* layers. Each layer - is a :class:`LightConvDecoderLayer`. - - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): decoding dictionary - embed_tokens (torch.nn.Embedding): output embedding - no_encoder_attn (bool, optional): whether to attend to encoder outputs. - Default: ``False`` - """ - - def __init__( - self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True - ): - super().__init__(dictionary) - self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - self.share_input_output_embed = args.share_decoder_input_output_embed - - input_embed_dim = embed_tokens.embedding_dim - embed_dim = args.decoder_embed_dim - output_embed_dim = args.decoder_output_dim - - padding_idx = embed_tokens.padding_idx - self.max_target_positions = args.max_target_positions - - self.embed_tokens = embed_tokens - self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim - - self.project_in_dim = ( - Linear(input_embed_dim, embed_dim, bias=False) - if embed_dim != input_embed_dim - else None - ) - - self.embed_positions = ( - PositionalEmbedding( - args.max_target_positions, - embed_dim, - padding_idx, - learned=args.decoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - - self.layers = nn.ModuleList([]) - self.layers.extend( - [ - LightConvDecoderLayer( - args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i] - ) - for i in range(args.decoder_layers) - ] - ) - - self.adaptive_softmax = None - - self.project_out_dim = ( - Linear(embed_dim, output_embed_dim, bias=False) - if embed_dim != output_embed_dim and not args.tie_adaptive_weights - else None - ) - - if args.adaptive_softmax_cutoff is not None: - self.adaptive_softmax = AdaptiveSoftmax( - len(dictionary), - output_embed_dim, - utils.eval_str_list(args.adaptive_softmax_cutoff, type=int), - dropout=args.adaptive_softmax_dropout, - adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, - factor=args.adaptive_softmax_factor, - tie_proj=args.tie_adaptive_proj, - ) - elif not self.share_input_output_embed: - self.embed_out = nn.Parameter( - torch.Tensor(len(dictionary), output_embed_dim) - ) - nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim ** -0.5) - self.register_buffer("version", torch.Tensor([2])) - self.normalize = args.decoder_normalize_before and final_norm - if self.normalize: - self.layer_norm = LayerNorm(embed_dim) - - def forward( - self, prev_output_tokens, encoder_out=None, 
incremental_state=None, **kwargs - ): - """ - Args: - prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing - encoder_out (Tensor, optional): output from the encoder, used for - encoder-side attention - incremental_state (dict): dictionary used for storing state during - :ref:`Incremental decoding` - - Returns: - tuple: - - the last decoder layer's output of shape `(batch, tgt_len, - vocab)` - - the last decoder layer's attention weights of shape `(batch, - tgt_len, src_len)` - """ - # embed positions - positions = ( - self.embed_positions( - prev_output_tokens, - incremental_state=incremental_state, - ) - if self.embed_positions is not None - else None - ) - - if incremental_state is not None: - prev_output_tokens = prev_output_tokens[:, -1:] - if positions is not None: - positions = positions[:, -1:] - - # embed tokens and positions - x = self.embed_scale * self.embed_tokens(prev_output_tokens) - - if self.project_in_dim is not None: - x = self.project_in_dim(x) - - if positions is not None: - x += positions - x = self.dropout_module(x) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - attn = None - - inner_states = [x] - - # decoder layers - for layer in self.layers: - x, attn = layer( - x, - encoder_out["encoder_out"] if encoder_out is not None else None, - encoder_out["encoder_padding_mask"] - if encoder_out is not None - else None, - incremental_state, - ) - inner_states.append(x) - - if self.normalize: - x = self.layer_norm(x) - - # T x B x C -> B x T x C - x = x.transpose(0, 1) - - if self.project_out_dim is not None: - x = self.project_out_dim(x) - - if self.adaptive_softmax is None: - # project back to size of vocabulary - if self.share_input_output_embed: - x = F.linear(x, self.embed_tokens.weight) - else: - x = F.linear(x, self.embed_out) - - return x, {"attn": attn, "inner_states": inner_states} - - def max_positions(self): - """Maximum output length supported by the decoder.""" - if self.embed_positions is None: - return self.max_target_positions - return min(self.max_target_positions, self.embed_positions.max_positions) - - def buffered_future_mask(self, tensor): - dim = tensor.size(0) - if ( - not hasattr(self, "_future_mask") - or self._future_mask is None - or self._future_mask.device != tensor.device - ): - self._future_mask = torch.triu( - utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 - ) - if self._future_mask.size(0) < dim: - self._future_mask = torch.triu( - utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1 - ) - return self._future_mask[:dim, :dim] - - -class LightConvEncoderLayer(nn.Module): - """Encoder layer block. 
- - Args: - args (argparse.Namespace): parsed command-line arguments - kernel_size: kernel size of the convolution - """ - - def __init__(self, args, kernel_size=0): - super().__init__() - self.embed_dim = args.encoder_embed_dim - self.conv_dim = args.encoder_conv_dim - padding_l = ( - kernel_size // 2 - if kernel_size % 2 == 1 - else ((kernel_size - 1) // 2, kernel_size // 2) - ) - - if args.encoder_glu: - self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim) - self.act = nn.GLU() - else: - self.linear1 = Linear(self.embed_dim, self.conv_dim) - self.act = None - if args.encoder_conv_type == "lightweight": - self.conv = LightweightConv( - self.conv_dim, - kernel_size, - padding_l=padding_l, - weight_softmax=args.weight_softmax, - num_heads=args.encoder_attention_heads, - weight_dropout=args.weight_dropout, - ) - elif args.encoder_conv_type == "dynamic": - self.conv = DynamicConv( - self.conv_dim, - kernel_size, - padding_l=padding_l, - weight_softmax=args.weight_softmax, - num_heads=args.encoder_attention_heads, - weight_dropout=args.weight_dropout, - ) - else: - raise NotImplementedError - self.linear2 = Linear(self.conv_dim, self.embed_dim) - - self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - self.relu_dropout_module = FairseqDropout( - args.relu_dropout, module_name=self.__class__.__name__ - ) - self.input_dropout_module = FairseqDropout( - args.input_dropout, module_name=self.__class__.__name__ - ) - self.normalize_before = args.encoder_normalize_before - self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) - self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) - self.layer_norms = nn.ModuleList([LayerNorm(self.embed_dim) for _ in range(2)]) - - def forward(self, x, encoder_padding_mask): - """ - Args: - x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_padding_mask (ByteTensor): binary ByteTensor of shape - `(batch, src_len)` where padding elements are indicated by ``1``. - - Returns: - encoded output of shape `(batch, src_len, embed_dim)` - """ - residual = x - x = self.maybe_layer_norm(0, x, before=True) - x = self.input_dropout_module(x) - x = self.linear1(x) - if self.act is not None: - x = self.act(x) - if encoder_padding_mask is not None: - x = x.masked_fill(encoder_padding_mask.transpose(0, 1).unsqueeze(2), 0) - x = self.conv(x) - x = self.linear2(x) - x = self.dropout_module(x) - x = residual + x - x = self.maybe_layer_norm(0, x, after=True) - - residual = x - x = self.maybe_layer_norm(1, x, before=True) - x = F.relu(self.fc1(x)) - x = self.relu_dropout_module(x) - x = self.fc2(x) - x = self.dropout_module(x) - x = residual + x - x = self.maybe_layer_norm(1, x, after=True) - return x - - def maybe_layer_norm(self, i, x, before=False, after=False): - assert before ^ after - if after ^ self.normalize_before: - return self.layer_norms[i](x) - else: - return x - - def extra_repr(self): - return ( - "dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}".format( - self.dropout_module.p, - self.relu_dropout_module.p, - self.input_dropout_module.p, - self.normalize_before, - ) - ) - - -class LightConvDecoderLayer(nn.Module): - """Decoder layer block. - - Args: - args (argparse.Namespace): parsed command-line arguments - no_encoder_attn (bool, optional): whether to attend to encoder outputs. 
- Default: ``False`` - kernel_size: kernel size of the convolution - """ - - def __init__(self, args, no_encoder_attn=False, kernel_size=0): - super().__init__() - self.embed_dim = args.decoder_embed_dim - self.conv_dim = args.decoder_conv_dim - if args.decoder_glu: - self.linear1 = Linear(self.embed_dim, 2 * self.conv_dim) - self.act = nn.GLU() - else: - self.linear1 = Linear(self.embed_dim, self.conv_dim) - self.act = None - if args.decoder_conv_type == "lightweight": - self.conv = LightweightConv( - self.conv_dim, - kernel_size, - padding_l=kernel_size - 1, - weight_softmax=args.weight_softmax, - num_heads=args.decoder_attention_heads, - weight_dropout=args.weight_dropout, - ) - elif args.decoder_conv_type == "dynamic": - self.conv = DynamicConv( - self.conv_dim, - kernel_size, - padding_l=kernel_size - 1, - weight_softmax=args.weight_softmax, - num_heads=args.decoder_attention_heads, - weight_dropout=args.weight_dropout, - ) - else: - raise NotImplementedError - self.linear2 = Linear(self.conv_dim, self.embed_dim) - - self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - self.relu_dropout_module = FairseqDropout( - args.relu_dropout, module_name=self.__class__.__name__ - ) - self.input_dropout_module = FairseqDropout( - args.input_dropout, module_name=self.__class__.__name__ - ) - self.normalize_before = args.decoder_normalize_before - - self.conv_layer_norm = LayerNorm(self.embed_dim) - - if no_encoder_attn: - self.encoder_attn = None - self.encoder_attn_layer_norm = None - else: - self.encoder_attn = MultiheadAttention( - self.embed_dim, - args.decoder_attention_heads, - dropout=args.attention_dropout, - encoder_decoder_attention=True, - ) - self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) - - self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim) - self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim) - - self.final_layer_norm = LayerNorm(self.embed_dim) - self.need_attn = True - - def forward( - self, - x, - encoder_out, - encoder_padding_mask, - incremental_state, - prev_conv_state=None, - prev_attn_state=None, - conv_mask=None, - conv_padding_mask=None, - ): - """ - Args: - x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_padding_mask (ByteTensor): binary ByteTensor of shape - `(batch, src_len)` where padding elements are indicated by ``1``. 
- - Returns: - encoded output of shape `(batch, src_len, embed_dim)` - """ - residual = x - x = self.maybe_layer_norm(self.conv_layer_norm, x, before=True) - if prev_conv_state is not None: - if incremental_state is None: - incremental_state = {} - self.conv._set_input_buffer(incremental_state, prev_conv_state) - x = self.input_dropout_module(x) - x = self.linear1(x) - if self.act is not None: - x = self.act(x) - x = self.conv(x, incremental_state=incremental_state) - x = self.linear2(x) - x = self.dropout_module(x) - x = residual + x - x = self.maybe_layer_norm(self.conv_layer_norm, x, after=True) - - attn = None - if self.encoder_attn is not None: - residual = x - x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) - if prev_attn_state is not None: - if incremental_state is None: - incremental_state = {} - prev_key, prev_value = prev_attn_state - saved_state = {"prev_key": prev_key, "prev_value": prev_value} - self.encoder_attn._set_input_buffer(incremental_state, saved_state) - x, attn = self.encoder_attn( - query=x, - key=encoder_out, - value=encoder_out, - key_padding_mask=encoder_padding_mask, - incremental_state=incremental_state, - static_kv=True, - need_weights=(not self.training and self.need_attn), - ) - x = self.dropout_module(x) - x = residual + x - x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) - - residual = x - x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) - x = F.relu(self.fc1(x)) - x = self.relu_dropout_module(x) - x = self.fc2(x) - x = self.dropout_module(x) - x = residual + x - x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) - return x, attn - - def maybe_layer_norm(self, layer_norm, x, before=False, after=False): - assert before ^ after - if after ^ self.normalize_before: - return layer_norm(x) - else: - return x - - def make_generation_fast_(self, need_attn=False, **kwargs): - self.need_attn = need_attn - - def extra_repr(self): - return ( - "dropout={}, relu_dropout={}, input_dropout={}, normalize_before={}".format( - self.dropout_module.p, - self.relu_dropout_module.p, - self.input_dropout_module.p, - self.normalize_before, - ) - ) - - -def Embedding(num_embeddings, embedding_dim, padding_idx): - m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) - nn.init.constant_(m.weight[padding_idx], 0) - return m - - -def Linear(in_features, out_features, bias=True): - m = nn.Linear(in_features, out_features, bias) - nn.init.xavier_uniform_(m.weight) - if bias: - nn.init.constant_(m.bias, 0.0) - return m - - -@register_model_architecture("lightconv", "lightconv") -def base_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 7) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 6) - 
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.relu_dropout = getattr(args, "relu_dropout", 0.0) - args.dropout = getattr(args, "dropout", 0.1) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - args.encoder_conv_dim = getattr(args, "encoder_conv_dim", args.encoder_embed_dim) - args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim) - - args.encoder_kernel_size_list = getattr( - args, "encoder_kernel_size_list", [3, 7, 15, 31, 31, 31, 31] - ) - args.decoder_kernel_size_list = getattr( - args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31] - ) - if len(args.encoder_kernel_size_list) == 1: - args.encoder_kernel_size_list = ( - args.encoder_kernel_size_list * args.encoder_layers - ) - if len(args.decoder_kernel_size_list) == 1: - args.decoder_kernel_size_list = ( - args.decoder_kernel_size_list * args.decoder_layers - ) - assert ( - len(args.encoder_kernel_size_list) == args.encoder_layers - ), "encoder_kernel_size_list doesn't match encoder_layers" - assert ( - len(args.decoder_kernel_size_list) == args.decoder_layers - ), "decoder_kernel_size_list doesn't match decoder_layers" - args.encoder_glu = getattr(args, "encoder_glu", True) - args.decoder_glu = getattr(args, "decoder_glu", True) - args.input_dropout = getattr(args, "input_dropout", 0.1) - args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout) - - -@register_model_architecture("lightconv", "lightconv_iwslt_de_en") -def lightconv_iwslt_de_en(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) - args.encoder_layers = getattr(args, "encoder_layers", 7) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.weight_dropout = getattr(args, "weight_dropout", 0.1) - args.encoder_glu = getattr(args, "encoder_glu", False) - args.decoder_glu = getattr(args, "decoder_glu", False) - args.input_dropout = getattr(args, "input_dropout", 0.0) - base_architecture(args) - - -@register_model_architecture("lightconv", "lightconv_wmt_en_de") -def lightconv_wmt_en_de(args): - base_architecture(args) - - -@register_model_architecture("lightconv", "lightconv_wmt_en_de_big") -def lightconv_wmt_en_de_big(args): - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.encoder_embed_dim = getattr(args, 
"encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.dropout = getattr(args, "dropout", 0.3) - base_architecture(args) - - -@register_model_architecture("lightconv", "lightconv_wmt_en_fr_big") -def lightconv_wmt_en_fr_big(args): - args.dropout = getattr(args, "dropout", 0.1) - lightconv_wmt_en_de_big(args) - - -@register_model_architecture("lightconv", "lightconv_wmt_zh_en_big") -def lightconv_wmt_zh_en_big(args): - args.dropout = getattr(args, "dropout", 0.2) - args.attention_dropout = getattr(args, "attention_dropout", 0.2) - args.weight_dropout = getattr(args, "weight_dropout", 0.2) - lightconv_wmt_en_de_big(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lightconv_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lightconv_lm.py deleted file mode 100644 index 1d9efc4e42a5ecc1b83338055f18ade5a83ea666..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lightconv_lm.py +++ /dev/null @@ -1,306 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq import utils -from fairseq.models import ( - FairseqLanguageModel, - register_model, - register_model_architecture, -) -from fairseq.models.lightconv import Embedding, LightConvDecoder -from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder - - -@register_model("lightconv_lm") -class LightConvLanguageModel(FairseqLanguageModel): - def __init__(self, decoder): - super().__init__(decoder) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - parser.add_argument( - "--dropout", - default=0.1, - type=float, - metavar="D", - help="dropout probability", - ) - parser.add_argument( - "--attention-dropout", - default=0.0, - type=float, - metavar="D", - help="dropout probability for attention weights", - ) - parser.add_argument( - "--relu-dropout", - default=0.0, - type=float, - metavar="D", - help="dropout probability after ReLU in FFN", - ) - parser.add_argument( - "--input-dropout", - type=float, - metavar="D", - help="dropout probability of the inputs", - ) - parser.add_argument( - "--decoder-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension", - ) - parser.add_argument( - "--decoder-output-dim", - type=int, - metavar="N", - help="decoder output dimension", - ) - parser.add_argument( - "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension" - ) - parser.add_argument( - "--decoder-ffn-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension for FFN", - ) - parser.add_argument( - "--decoder-layers", type=int, metavar="N", help="num decoder layers" - ) - parser.add_argument( - "--decoder-attention-heads", - type=int, - metavar="N", - help="num decoder attention heads or LightConv/DynamicConv heads", - ) - parser.add_argument( - "--decoder-normalize-before", - default=False, - 
action="store_true", - help="apply layernorm before each decoder block", - ) - parser.add_argument( - "--adaptive-softmax-cutoff", - metavar="EXPR", - help="comma separated list of adaptive softmax cutoff points. " - "Must be used with adaptive_loss criterion", - ) - parser.add_argument( - "--adaptive-softmax-dropout", - type=float, - metavar="D", - help="sets adaptive softmax dropout for the tail projections", - ) - parser.add_argument( - "--adaptive-softmax-factor", - type=float, - metavar="N", - help="adaptive input factor", - ) - parser.add_argument( - "--no-token-positional-embeddings", - default=False, - action="store_true", - help="if set, disables positional embeddings (outside self attention)", - ) - parser.add_argument( - "--share-decoder-input-output-embed", - default=False, - action="store_true", - help="share decoder input and output embeddings", - ) - parser.add_argument( - "--character-embeddings", - default=False, - action="store_true", - help="if set, uses character embedding convolutions to produce token embeddings", - ) - parser.add_argument( - "--character-filters", - type=str, - metavar="LIST", - default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]", - help="size of character embeddings", - ) - parser.add_argument( - "--character-embedding-dim", - type=int, - metavar="N", - default=4, - help="size of character embeddings", - ) - parser.add_argument( - "--char-embedder-highway-layers", - type=int, - metavar="N", - default=2, - help="number of highway layers for character token embeddder", - ) - parser.add_argument( - "--adaptive-input", - default=False, - action="store_true", - help="if set, uses adaptive input", - ) - parser.add_argument( - "--adaptive-input-factor", - type=float, - metavar="N", - help="adaptive input factor", - ) - parser.add_argument( - "--adaptive-input-cutoff", - metavar="EXPR", - help="comma separated list of adaptive input cutoff points.", - ) - parser.add_argument( - "--tie-adaptive-weights", - action="store_true", - help="if set, ties the weights of adaptive softmax and adaptive input", - ) - parser.add_argument( - "--tie-adaptive-proj", - action="store_true", - help="if set, ties the projection weights of adaptive softmax and adaptive input", - ) - parser.add_argument( - "--decoder-learned-pos", - action="store_true", - help="use learned positional embeddings in the decoder", - ) - - """LightConv and DynamicConv arguments""" - parser.add_argument( - "--decoder-kernel-size-list", - type=lambda x: utils.eval_str_list(x, int), - help='list of kernel size (default: "[3,7,15,31,31,31]")', - ) - parser.add_argument( - "--decoder-glu", type=utils.eval_bool, help="glu after in proj" - ) - parser.add_argument( - "--decoder-conv-type", - default="dynamic", - type=str, - choices=["dynamic", "lightweight"], - help="type of convolution", - ) - parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool) - parser.add_argument( - "--weight-dropout", - type=float, - metavar="D", - help="dropout probability for conv weights", - ) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_lm_architecture(args) - - if getattr(args, "max_source_positions", None) is None: - args.max_source_positions = args.tokens_per_sample - if getattr(args, "max_target_positions", None) is None: - args.max_target_positions = args.tokens_per_sample - - if args.character_embeddings: - embed_tokens = CharacterTokenEmbedder( - task.dictionary, - 
eval(args.character_filters), - args.character_embedding_dim, - args.decoder_embed_dim, - args.char_embedder_highway_layers, - ) - elif args.adaptive_input: - embed_tokens = AdaptiveInput( - len(task.dictionary), - task.dictionary.pad(), - args.decoder_input_dim, - args.adaptive_input_factor, - args.decoder_embed_dim, - utils.eval_str_list(args.adaptive_input_cutoff, type=int), - ) - else: - embed_tokens = Embedding( - len(task.dictionary), args.decoder_input_dim, task.dictionary.pad() - ) - - if args.tie_adaptive_weights: - assert args.adaptive_input - assert args.adaptive_input_factor == args.adaptive_softmax_factor - assert ( - args.adaptive_softmax_cutoff == args.adaptive_input_cutoff - ), "{} != {}".format( - args.adaptive_softmax_cutoff, args.adaptive_input_cutoff - ) - assert args.decoder_input_dim == args.decoder_output_dim - - decoder = LightConvDecoder( - args, - task.output_dictionary, - embed_tokens, - no_encoder_attn=True, - final_norm=False, - ) - return LightConvLanguageModel(decoder) - - -@register_model_architecture("lightconv_lm", "lightconv_lm") -def base_lm_architecture(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - - args.character_embeddings = getattr(args, "character_embeddings", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim) - - # The model training is not stable without this - args.decoder_normalize_before = True - - args.adaptive_input = getattr(args, "adaptive_input", False) - args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4) - args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None) - - args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) - args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False) - - args.decoder_kernel_size_list = getattr( - args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31] - ) - if len(args.decoder_kernel_size_list) == 1: - args.decoder_kernel_size_list = ( - args.decoder_kernel_size_list * args.decoder_layers - ) - assert ( - len(args.decoder_kernel_size_list) == args.decoder_layers - ), "decoder_kernel_size_list doesn't match decoder_layers" - args.decoder_glu = getattr(args, "decoder_glu", True) - args.input_dropout = getattr(args, "input_dropout", 0.1) - args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout) - - -@register_model_architecture("lightconv_lm", "lightconv_lm_gbw") -def lightconv_lm_gbw(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - base_lm_architecture(args) diff --git 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lstm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lstm.py deleted file mode 100644 index 1a9dca3c7513744d76f9b8cd28fc68bf3dbc4d18..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lstm.py +++ /dev/null @@ -1,753 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Dict, List, Optional, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqIncrementalDecoder, - register_model, - register_model_architecture, -) -from fairseq.modules import AdaptiveSoftmax, FairseqDropout -from torch import Tensor - - -DEFAULT_MAX_SOURCE_POSITIONS = 1e5 -DEFAULT_MAX_TARGET_POSITIONS = 1e5 - - -@register_model("lstm") -class LSTMModel(FairseqEncoderDecoderModel): - def __init__(self, encoder, decoder): - super().__init__(encoder, decoder) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--dropout', type=float, metavar='D', - help='dropout probability') - parser.add_argument('--encoder-embed-dim', type=int, metavar='N', - help='encoder embedding dimension') - parser.add_argument('--encoder-embed-path', type=str, metavar='STR', - help='path to pre-trained encoder embedding') - parser.add_argument('--encoder-freeze-embed', action='store_true', - help='freeze encoder embeddings') - parser.add_argument('--encoder-hidden-size', type=int, metavar='N', - help='encoder hidden size') - parser.add_argument('--encoder-layers', type=int, metavar='N', - help='number of encoder layers') - parser.add_argument('--encoder-bidirectional', action='store_true', - help='make all layers of encoder bidirectional') - parser.add_argument('--decoder-embed-dim', type=int, metavar='N', - help='decoder embedding dimension') - parser.add_argument('--decoder-embed-path', type=str, metavar='STR', - help='path to pre-trained decoder embedding') - parser.add_argument('--decoder-freeze-embed', action='store_true', - help='freeze decoder embeddings') - parser.add_argument('--decoder-hidden-size', type=int, metavar='N', - help='decoder hidden size') - parser.add_argument('--decoder-layers', type=int, metavar='N', - help='number of decoder layers') - parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', - help='decoder output embedding dimension') - parser.add_argument('--decoder-attention', type=str, metavar='BOOL', - help='decoder attention') - parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', - help='comma separated list of adaptive softmax cutoff points. 
' - 'Must be used with adaptive_loss criterion') - parser.add_argument('--share-decoder-input-output-embed', default=False, - action='store_true', - help='share decoder input and output embeddings') - parser.add_argument('--share-all-embeddings', default=False, action='store_true', - help='share encoder, decoder and output embeddings' - ' (requires shared dictionary and embed dim)') - - # Granular dropout settings (if not specified these default to --dropout) - parser.add_argument('--encoder-dropout-in', type=float, metavar='D', - help='dropout probability for encoder input embedding') - parser.add_argument('--encoder-dropout-out', type=float, metavar='D', - help='dropout probability for encoder output') - parser.add_argument('--decoder-dropout-in', type=float, metavar='D', - help='dropout probability for decoder input embedding') - parser.add_argument('--decoder-dropout-out', type=float, metavar='D', - help='dropout probability for decoder output') - # fmt: on - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - # make sure that all args are properly defaulted (in case there are any new ones) - base_architecture(args) - - if args.encoder_layers != args.decoder_layers: - raise ValueError("--encoder-layers must match --decoder-layers") - - max_source_positions = getattr( - args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS - ) - max_target_positions = getattr( - args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS - ) - - def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim): - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) - embed_dict = utils.parse_embedding(embed_path) - utils.print_embed_overlap(embed_dict, dictionary) - return utils.load_embedding(embed_dict, dictionary, embed_tokens) - - if args.encoder_embed_path: - pretrained_encoder_embed = load_pretrained_embedding_from_file( - args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim - ) - else: - num_embeddings = len(task.source_dictionary) - pretrained_encoder_embed = Embedding( - num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad() - ) - - if args.share_all_embeddings: - # double check all parameters combinations are valid - if task.source_dictionary != task.target_dictionary: - raise ValueError("--share-all-embeddings requires a joint dictionary") - if args.decoder_embed_path and ( - args.decoder_embed_path != args.encoder_embed_path - ): - raise ValueError( - "--share-all-embed not compatible with --decoder-embed-path" - ) - if args.encoder_embed_dim != args.decoder_embed_dim: - raise ValueError( - "--share-all-embeddings requires --encoder-embed-dim to " - "match --decoder-embed-dim" - ) - pretrained_decoder_embed = pretrained_encoder_embed - args.share_decoder_input_output_embed = True - else: - # separate decoder input embeddings - pretrained_decoder_embed = None - if args.decoder_embed_path: - pretrained_decoder_embed = load_pretrained_embedding_from_file( - args.decoder_embed_path, - task.target_dictionary, - args.decoder_embed_dim, - ) - # one last double check of parameter combinations - if args.share_decoder_input_output_embed and ( - args.decoder_embed_dim != args.decoder_out_embed_dim - ): - raise ValueError( - "--share-decoder-input-output-embeddings requires " - "--decoder-embed-dim to match --decoder-out-embed-dim" - ) - - if args.encoder_freeze_embed: - pretrained_encoder_embed.weight.requires_grad = False - if 
args.decoder_freeze_embed: - pretrained_decoder_embed.weight.requires_grad = False - - encoder = LSTMEncoder( - dictionary=task.source_dictionary, - embed_dim=args.encoder_embed_dim, - hidden_size=args.encoder_hidden_size, - num_layers=args.encoder_layers, - dropout_in=args.encoder_dropout_in, - dropout_out=args.encoder_dropout_out, - bidirectional=args.encoder_bidirectional, - pretrained_embed=pretrained_encoder_embed, - max_source_positions=max_source_positions, - ) - decoder = LSTMDecoder( - dictionary=task.target_dictionary, - embed_dim=args.decoder_embed_dim, - hidden_size=args.decoder_hidden_size, - out_embed_dim=args.decoder_out_embed_dim, - num_layers=args.decoder_layers, - dropout_in=args.decoder_dropout_in, - dropout_out=args.decoder_dropout_out, - attention=utils.eval_bool(args.decoder_attention), - encoder_output_units=encoder.output_units, - pretrained_embed=pretrained_decoder_embed, - share_input_output_embed=args.share_decoder_input_output_embed, - adaptive_softmax_cutoff=( - utils.eval_str_list(args.adaptive_softmax_cutoff, type=int) - if args.criterion == "adaptive_loss" - else None - ), - max_target_positions=max_target_positions, - residuals=False, - ) - return cls(encoder, decoder) - - def forward( - self, - src_tokens, - src_lengths, - prev_output_tokens, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - ): - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths) - decoder_out = self.decoder( - prev_output_tokens, - encoder_out=encoder_out, - incremental_state=incremental_state, - ) - return decoder_out - - -class LSTMEncoder(FairseqEncoder): - """LSTM encoder.""" - - def __init__( - self, - dictionary, - embed_dim=512, - hidden_size=512, - num_layers=1, - dropout_in=0.1, - dropout_out=0.1, - bidirectional=False, - left_pad=True, - pretrained_embed=None, - padding_idx=None, - max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS, - ): - super().__init__(dictionary) - self.num_layers = num_layers - self.dropout_in_module = FairseqDropout( - dropout_in, module_name=self.__class__.__name__ - ) - self.dropout_out_module = FairseqDropout( - dropout_out, module_name=self.__class__.__name__ - ) - self.bidirectional = bidirectional - self.hidden_size = hidden_size - self.max_source_positions = max_source_positions - - num_embeddings = len(dictionary) - self.padding_idx = padding_idx if padding_idx is not None else dictionary.pad() - if pretrained_embed is None: - self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx) - else: - self.embed_tokens = pretrained_embed - - self.lstm = LSTM( - input_size=embed_dim, - hidden_size=hidden_size, - num_layers=num_layers, - dropout=self.dropout_out_module.p if num_layers > 1 else 0.0, - bidirectional=bidirectional, - ) - self.left_pad = left_pad - - self.output_units = hidden_size - if bidirectional: - self.output_units *= 2 - - def forward( - self, - src_tokens: Tensor, - src_lengths: Tensor, - enforce_sorted: bool = True, - ): - """ - Args: - src_tokens (LongTensor): tokens in the source language of - shape `(batch, src_len)` - src_lengths (LongTensor): lengths of each source sentence of - shape `(batch)` - enforce_sorted (bool, optional): if True, `src_tokens` is - expected to contain sequences sorted by length in a - decreasing order. If False, this condition is not - required. Default: True. 
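        Returns:
            a tuple with four elements:
                - encoder outputs of shape `(src_len, batch, output_units)`, where
                  `output_units = hidden_size * num_directions`
                - final hidden states of shape `(num_layers, batch, num_directions * hidden_size)`
                - final cell states of shape `(num_layers, batch, num_directions * hidden_size)`
                - encoder padding mask of shape `(src_len, batch)`

        Example (a standalone PyTorch sketch of the pack/pad round trip this
        encoder relies on; tensors and sizes are arbitrary)::

            import torch
            import torch.nn as nn

            x = torch.randn(5, 2, 8)         # src_len x batch x embed_dim
            lengths = torch.tensor([5, 3])   # decreasing order, as enforce_sorted=True requires
            packed = nn.utils.rnn.pack_padded_sequence(x, lengths, enforce_sorted=True)
            packed_out, (h, c) = nn.LSTM(8, 16)(packed)
            out, out_lens = nn.utils.rnn.pad_packed_sequence(packed_out)
            # out: 5 x 2 x 16; positions beyond each sentence's length hold the padding value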
- """ - if self.left_pad: - # nn.utils.rnn.pack_padded_sequence requires right-padding; - # convert left-padding to right-padding - src_tokens = utils.convert_padding_direction( - src_tokens, - torch.zeros_like(src_tokens).fill_(self.padding_idx), - left_to_right=True, - ) - - bsz, seqlen = src_tokens.size() - - # embed tokens - x = self.embed_tokens(src_tokens) - x = self.dropout_in_module(x) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - # pack embedded source tokens into a PackedSequence - packed_x = nn.utils.rnn.pack_padded_sequence( - x, src_lengths.cpu(), enforce_sorted=enforce_sorted - ) - - # apply LSTM - if self.bidirectional: - state_size = 2 * self.num_layers, bsz, self.hidden_size - else: - state_size = self.num_layers, bsz, self.hidden_size - h0 = x.new_zeros(*state_size) - c0 = x.new_zeros(*state_size) - packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0)) - - # unpack outputs and apply dropout - x, _ = nn.utils.rnn.pad_packed_sequence( - packed_outs, padding_value=self.padding_idx * 1.0 - ) - x = self.dropout_out_module(x) - assert list(x.size()) == [seqlen, bsz, self.output_units] - - if self.bidirectional: - final_hiddens = self.combine_bidir(final_hiddens, bsz) - final_cells = self.combine_bidir(final_cells, bsz) - - encoder_padding_mask = src_tokens.eq(self.padding_idx).t() - - return tuple( - ( - x, # seq_len x batch x hidden - final_hiddens, # num_layers x batch x num_directions*hidden - final_cells, # num_layers x batch x num_directions*hidden - encoder_padding_mask, # seq_len x batch - ) - ) - - def combine_bidir(self, outs, bsz: int): - out = outs.view(self.num_layers, 2, bsz, -1).transpose(1, 2).contiguous() - return out.view(self.num_layers, bsz, -1) - - def reorder_encoder_out(self, encoder_out, new_order): - return tuple( - ( - encoder_out[0].index_select(1, new_order), - encoder_out[1].index_select(1, new_order), - encoder_out[2].index_select(1, new_order), - encoder_out[3].index_select(1, new_order), - ) - ) - - def max_positions(self): - """Maximum input length supported by the encoder.""" - return self.max_source_positions - - -class AttentionLayer(nn.Module): - def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False): - super().__init__() - - self.input_proj = Linear(input_embed_dim, source_embed_dim, bias=bias) - self.output_proj = Linear( - input_embed_dim + source_embed_dim, output_embed_dim, bias=bias - ) - - def forward(self, input, source_hids, encoder_padding_mask): - # input: bsz x input_embed_dim - # source_hids: srclen x bsz x source_embed_dim - - # x: bsz x source_embed_dim - x = self.input_proj(input) - - # compute attention - attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2) - - # don't attend over padding - if encoder_padding_mask is not None: - attn_scores = ( - attn_scores.float() - .masked_fill_(encoder_padding_mask, float("-inf")) - .type_as(attn_scores) - ) # FP16 support: cast to float and back - - attn_scores = F.softmax(attn_scores, dim=0) # srclen x bsz - - # sum weighted sources - x = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0) - - x = torch.tanh(self.output_proj(torch.cat((x, input), dim=1))) - return x, attn_scores - - -class LSTMDecoder(FairseqIncrementalDecoder): - """LSTM decoder.""" - - def __init__( - self, - dictionary, - embed_dim=512, - hidden_size=512, - out_embed_dim=512, - num_layers=1, - dropout_in=0.1, - dropout_out=0.1, - attention=True, - encoder_output_units=512, - pretrained_embed=None, - share_input_output_embed=False, - 
adaptive_softmax_cutoff=None, - max_target_positions=DEFAULT_MAX_TARGET_POSITIONS, - residuals=False, - ): - super().__init__(dictionary) - self.dropout_in_module = FairseqDropout( - dropout_in, module_name=self.__class__.__name__ - ) - self.dropout_out_module = FairseqDropout( - dropout_out, module_name=self.__class__.__name__ - ) - self.hidden_size = hidden_size - self.share_input_output_embed = share_input_output_embed - self.need_attn = True - self.max_target_positions = max_target_positions - self.residuals = residuals - self.num_layers = num_layers - - self.adaptive_softmax = None - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - if pretrained_embed is None: - self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) - else: - self.embed_tokens = pretrained_embed - - self.encoder_output_units = encoder_output_units - if encoder_output_units != hidden_size and encoder_output_units != 0: - self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size) - self.encoder_cell_proj = Linear(encoder_output_units, hidden_size) - else: - self.encoder_hidden_proj = self.encoder_cell_proj = None - - # disable input feeding if there is no encoder - # input feeding is described in arxiv.org/abs/1508.04025 - input_feed_size = 0 if encoder_output_units == 0 else hidden_size - self.layers = nn.ModuleList( - [ - LSTMCell( - input_size=input_feed_size + embed_dim - if layer == 0 - else hidden_size, - hidden_size=hidden_size, - ) - for layer in range(num_layers) - ] - ) - - if attention: - # TODO make bias configurable - self.attention = AttentionLayer( - hidden_size, encoder_output_units, hidden_size, bias=False - ) - else: - self.attention = None - - if hidden_size != out_embed_dim: - self.additional_fc = Linear(hidden_size, out_embed_dim) - - if adaptive_softmax_cutoff is not None: - # setting adaptive_softmax dropout to dropout_out for now but can be redefined - self.adaptive_softmax = AdaptiveSoftmax( - num_embeddings, - hidden_size, - adaptive_softmax_cutoff, - dropout=dropout_out, - ) - elif not self.share_input_output_embed: - self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out) - - def forward( - self, - prev_output_tokens, - encoder_out: Optional[Tuple[Tensor, Tensor, Tensor, Tensor]] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - src_lengths: Optional[Tensor] = None, - ): - x, attn_scores = self.extract_features( - prev_output_tokens, encoder_out, incremental_state - ) - return self.output_layer(x), attn_scores - - def extract_features( - self, - prev_output_tokens, - encoder_out: Optional[Tuple[Tensor, Tensor, Tensor, Tensor]] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - ): - """ - Similar to *forward* but only return features. 
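        When `incremental_state` is non-empty, only the most recent target token
        is embedded and the cached hidden states, cell states and input feed are
        reused. Input feeding (https://arxiv.org/abs/1508.04025) concatenates the
        attention context from the previous time step with the current token
        embedding before the first LSTM layer; it is disabled when there is no
        encoder.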
- """ - # get outputs from encoder - if encoder_out is not None: - encoder_outs = encoder_out[0] - encoder_hiddens = encoder_out[1] - encoder_cells = encoder_out[2] - encoder_padding_mask = encoder_out[3] - else: - encoder_outs = torch.empty(0) - encoder_hiddens = torch.empty(0) - encoder_cells = torch.empty(0) - encoder_padding_mask = torch.empty(0) - srclen = encoder_outs.size(0) - - if incremental_state is not None and len(incremental_state) > 0: - prev_output_tokens = prev_output_tokens[:, -1:] - - bsz, seqlen = prev_output_tokens.size() - - # embed tokens - x = self.embed_tokens(prev_output_tokens) - x = self.dropout_in_module(x) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - # initialize previous states (or get from cache during incremental generation) - if incremental_state is not None and len(incremental_state) > 0: - prev_hiddens, prev_cells, input_feed = self.get_cached_state( - incremental_state - ) - elif encoder_out is not None: - # setup recurrent cells - prev_hiddens = [encoder_hiddens[i] for i in range(self.num_layers)] - prev_cells = [encoder_cells[i] for i in range(self.num_layers)] - if self.encoder_hidden_proj is not None: - prev_hiddens = [self.encoder_hidden_proj(y) for y in prev_hiddens] - prev_cells = [self.encoder_cell_proj(y) for y in prev_cells] - input_feed = x.new_zeros(bsz, self.hidden_size) - else: - # setup zero cells, since there is no encoder - zero_state = x.new_zeros(bsz, self.hidden_size) - prev_hiddens = [zero_state for i in range(self.num_layers)] - prev_cells = [zero_state for i in range(self.num_layers)] - input_feed = None - - assert ( - srclen > 0 or self.attention is None - ), "attention is not supported if there are no encoder outputs" - attn_scores = ( - x.new_zeros(srclen, seqlen, bsz) if self.attention is not None else None - ) - outs = [] - for j in range(seqlen): - # input feeding: concatenate context vector from previous time step - if input_feed is not None: - input = torch.cat((x[j, :, :], input_feed), dim=1) - else: - input = x[j] - - for i, rnn in enumerate(self.layers): - # recurrent cell - hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i])) - - # hidden state becomes the input to the next layer - input = self.dropout_out_module(hidden) - if self.residuals: - input = input + prev_hiddens[i] - - # save state for next time step - prev_hiddens[i] = hidden - prev_cells[i] = cell - - # apply attention using the last layer's hidden state - if self.attention is not None: - assert attn_scores is not None - out, attn_scores[:, j, :] = self.attention( - hidden, encoder_outs, encoder_padding_mask - ) - else: - out = hidden - out = self.dropout_out_module(out) - - # input feeding - if input_feed is not None: - input_feed = out - - # save final output - outs.append(out) - - # Stack all the necessary tensors together and store - prev_hiddens_tensor = torch.stack(prev_hiddens) - prev_cells_tensor = torch.stack(prev_cells) - cache_state = torch.jit.annotate( - Dict[str, Optional[Tensor]], - { - "prev_hiddens": prev_hiddens_tensor, - "prev_cells": prev_cells_tensor, - "input_feed": input_feed, - }, - ) - self.set_incremental_state(incremental_state, "cached_state", cache_state) - - # collect outputs across time steps - x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size) - - # T x B x C -> B x T x C - x = x.transpose(1, 0) - - if hasattr(self, "additional_fc") and self.adaptive_softmax is None: - x = self.additional_fc(x) - x = self.dropout_out_module(x) - # srclen x tgtlen x bsz -> bsz x tgtlen x srclen - if not 
self.training and self.need_attn and self.attention is not None: - assert attn_scores is not None - attn_scores = attn_scores.transpose(0, 2) - else: - attn_scores = None - return x, attn_scores - - def output_layer(self, x): - """Project features to the vocabulary size.""" - if self.adaptive_softmax is None: - if self.share_input_output_embed: - x = F.linear(x, self.embed_tokens.weight) - else: - x = self.fc_out(x) - return x - - def get_cached_state( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - ) -> Tuple[List[Tensor], List[Tensor], Optional[Tensor]]: - cached_state = self.get_incremental_state(incremental_state, "cached_state") - assert cached_state is not None - prev_hiddens_ = cached_state["prev_hiddens"] - assert prev_hiddens_ is not None - prev_cells_ = cached_state["prev_cells"] - assert prev_cells_ is not None - prev_hiddens = [prev_hiddens_[i] for i in range(self.num_layers)] - prev_cells = [prev_cells_[j] for j in range(self.num_layers)] - input_feed = cached_state[ - "input_feed" - ] # can be None for decoder-only language models - return prev_hiddens, prev_cells, input_feed - - def reorder_incremental_state( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - new_order: Tensor, - ): - if incremental_state is None or len(incremental_state) == 0: - return - prev_hiddens, prev_cells, input_feed = self.get_cached_state(incremental_state) - prev_hiddens = [p.index_select(0, new_order) for p in prev_hiddens] - prev_cells = [p.index_select(0, new_order) for p in prev_cells] - if input_feed is not None: - input_feed = input_feed.index_select(0, new_order) - cached_state_new = torch.jit.annotate( - Dict[str, Optional[Tensor]], - { - "prev_hiddens": torch.stack(prev_hiddens), - "prev_cells": torch.stack(prev_cells), - "input_feed": input_feed, - }, - ) - self.set_incremental_state(incremental_state, "cached_state", cached_state_new), - return - - def max_positions(self): - """Maximum output length supported by the decoder.""" - return self.max_target_positions - - def make_generation_fast_(self, need_attn=False, **kwargs): - self.need_attn = need_attn - - -def Embedding(num_embeddings, embedding_dim, padding_idx): - m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.uniform_(m.weight, -0.1, 0.1) - nn.init.constant_(m.weight[padding_idx], 0) - return m - - -def LSTM(input_size, hidden_size, **kwargs): - m = nn.LSTM(input_size, hidden_size, **kwargs) - for name, param in m.named_parameters(): - if "weight" in name or "bias" in name: - param.data.uniform_(-0.1, 0.1) - return m - - -def LSTMCell(input_size, hidden_size, **kwargs): - m = nn.LSTMCell(input_size, hidden_size, **kwargs) - for name, param in m.named_parameters(): - if "weight" in name or "bias" in name: - param.data.uniform_(-0.1, 0.1) - return m - - -def Linear(in_features, out_features, bias=True, dropout=0.0): - """Linear layer (input: N x T x C)""" - m = nn.Linear(in_features, out_features, bias=bias) - m.weight.data.uniform_(-0.1, 0.1) - if bias: - m.bias.data.uniform_(-0.1, 0.1) - return m - - -@register_model_architecture("lstm", "lstm") -def base_architecture(args): - args.dropout = getattr(args, "dropout", 0.1) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_freeze_embed = getattr(args, "encoder_freeze_embed", False) - args.encoder_hidden_size = getattr( - args, "encoder_hidden_size", args.encoder_embed_dim - ) - args.encoder_layers = 
getattr(args, "encoder_layers", 1) - args.encoder_bidirectional = getattr(args, "encoder_bidirectional", False) - args.encoder_dropout_in = getattr(args, "encoder_dropout_in", args.dropout) - args.encoder_dropout_out = getattr(args, "encoder_dropout_out", args.dropout) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_freeze_embed = getattr(args, "decoder_freeze_embed", False) - args.decoder_hidden_size = getattr( - args, "decoder_hidden_size", args.decoder_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 1) - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 512) - args.decoder_attention = getattr(args, "decoder_attention", "1") - args.decoder_dropout_in = getattr(args, "decoder_dropout_in", args.dropout) - args.decoder_dropout_out = getattr(args, "decoder_dropout_out", args.dropout) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) - args.adaptive_softmax_cutoff = getattr( - args, "adaptive_softmax_cutoff", "10000,50000,200000" - ) - - -@register_model_architecture("lstm", "lstm_wiseman_iwslt_de_en") -def lstm_wiseman_iwslt_de_en(args): - args.dropout = getattr(args, "dropout", 0.1) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) - args.encoder_dropout_in = getattr(args, "encoder_dropout_in", 0) - args.encoder_dropout_out = getattr(args, "encoder_dropout_out", 0) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256) - args.decoder_dropout_in = getattr(args, "decoder_dropout_in", 0) - args.decoder_dropout_out = getattr(args, "decoder_dropout_out", args.dropout) - base_architecture(args) - - -@register_model_architecture("lstm", "lstm_luong_wmt_en_de") -def lstm_luong_wmt_en_de(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1000) - args.encoder_layers = getattr(args, "encoder_layers", 4) - args.encoder_dropout_out = getattr(args, "encoder_dropout_out", 0) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1000) - args.decoder_layers = getattr(args, "decoder_layers", 4) - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 1000) - args.decoder_dropout_out = getattr(args, "decoder_dropout_out", 0) - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lstm_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lstm_lm.py deleted file mode 100644 index 454f0ac36fab78bf02a8e2f07ed9607d1da87e34..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/lstm_lm.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from fairseq import utils -from fairseq.models import ( - FairseqLanguageModel, - register_model, - register_model_architecture, -) -from fairseq.models.lstm import Embedding, LSTMDecoder - - -DEFAULT_MAX_TARGET_POSITIONS = 1e5 - - -@register_model("lstm_lm") -class LSTMLanguageModel(FairseqLanguageModel): - def __init__(self, decoder): - super().__init__(decoder) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--dropout', type=float, metavar='D', - help='dropout probability') - parser.add_argument('--decoder-embed-dim', type=int, metavar='N', - help='decoder embedding dimension') - parser.add_argument('--decoder-embed-path', type=str, metavar='STR', - help='path to pre-trained decoder embedding') - parser.add_argument('--decoder-hidden-size', type=int, metavar='N', - help='decoder hidden size') - parser.add_argument('--decoder-layers', type=int, metavar='N', - help='number of decoder layers') - parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', - help='decoder output embedding dimension') - parser.add_argument('--decoder-attention', type=str, metavar='BOOL', - help='decoder attention') - parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', - help='comma separated list of adaptive softmax cutoff points. ' - 'Must be used with adaptive_loss criterion') - parser.add_argument('--residuals', default=False, - action='store_true', - help='applying residuals between LSTM layers') - - # Granular dropout settings (if not specified these default to --dropout) - parser.add_argument('--decoder-dropout-in', type=float, metavar='D', - help='dropout probability for decoder input embedding') - parser.add_argument('--decoder-dropout-out', type=float, metavar='D', - help='dropout probability for decoder output') - parser.add_argument('--share-decoder-input-output-embed', default=False, - action='store_true', - help='share decoder input and output embeddings') - # fmt: on - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_architecture(args) - - if getattr(args, "max_target_positions", None) is not None: - max_target_positions = args.max_target_positions - else: - max_target_positions = getattr( - args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS - ) - - def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim): - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) - embed_dict = utils.parse_embedding(embed_path) - utils.print_embed_overlap(embed_dict, dictionary) - return utils.load_embedding(embed_dict, dictionary, embed_tokens) - - pretrained_decoder_embed = None - if args.decoder_embed_path: - pretrained_decoder_embed = load_pretrained_embedding_from_file( - args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim - ) - - if args.share_decoder_input_output_embed: - # double check all parameters combinations are valid - if task.source_dictionary != task.target_dictionary: - raise ValueError( - "--share-decoder-input-output-embeddings requires a joint dictionary" - ) - - if args.decoder_embed_dim != args.decoder_out_embed_dim: - raise ValueError( - "--share-decoder-input-output-embeddings requires " - "--decoder-embed-dim to match --decoder-out-embed-dim" - ) - - decoder = LSTMDecoder( - dictionary=task.dictionary, - embed_dim=args.decoder_embed_dim, - 
hidden_size=args.decoder_hidden_size, - out_embed_dim=args.decoder_out_embed_dim, - num_layers=args.decoder_layers, - dropout_in=args.decoder_dropout_in, - dropout_out=args.decoder_dropout_out, - attention=False, # decoder-only language model doesn't support attention - encoder_output_units=0, - pretrained_embed=pretrained_decoder_embed, - share_input_output_embed=args.share_decoder_input_output_embed, - adaptive_softmax_cutoff=( - utils.eval_str_list(args.adaptive_softmax_cutoff, type=int) - if args.criterion == "adaptive_loss" - else None - ), - max_target_positions=max_target_positions, - residuals=args.residuals, - ) - - return cls(decoder) - - -@register_model_architecture("lstm_lm", "lstm_lm") -def base_architecture(args): - args.dropout = getattr(args, "dropout", 0.1) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_hidden_size = getattr( - args, "decoder_hidden_size", args.decoder_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 1) - args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 512) - args.decoder_attention = getattr(args, "decoder_attention", "0") - args.decoder_dropout_in = getattr(args, "decoder_dropout_in", args.dropout) - args.decoder_dropout_out = getattr(args, "decoder_dropout_out", args.dropout) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.adaptive_softmax_cutoff = getattr( - args, "adaptive_softmax_cutoff", "10000,50000,200000" - ) - args.residuals = getattr(args, "residuals", False) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/masked_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/masked_lm.py deleted file mode 100644 index c786de9125551f7247618b0a1d0867477894c755..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/masked_lm.py +++ /dev/null @@ -1,403 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderModel, - register_model, - register_model_architecture, -) -from fairseq.modules import ( - LayerNorm, - SinusoidalPositionalEmbedding, - TransformerSentenceEncoder, -) -from fairseq.modules.transformer_sentence_encoder import init_bert_params - - -logger = logging.getLogger(__name__) - - -@register_model("masked_lm") -class MaskedLMModel(FairseqEncoderModel): - """ - Class for training a Masked Language Model. It also supports an - additional sentence level prediction if the sent-loss argument is set. - """ - - def __init__(self, args, encoder): - super().__init__(encoder) - self.args = args - - # if specified then apply bert initialization on the model. 
We need - # to explictly call this to make sure that the output embeddings - # and projection layers are also correctly initialized - if getattr(args, "apply_bert_init", False): - self.apply(init_bert_params) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # Arguments related to dropout - parser.add_argument( - "--dropout", type=float, metavar="D", help="dropout probability" - ) - parser.add_argument( - "--attention-dropout", - type=float, - metavar="D", - help="dropout probability for" " attention weights", - ) - parser.add_argument( - "--act-dropout", - type=float, - metavar="D", - help="dropout probability after" " activation in FFN", - ) - - # Arguments related to hidden states and self-attention - parser.add_argument( - "--encoder-ffn-embed-dim", - type=int, - metavar="N", - help="encoder embedding dimension for FFN", - ) - parser.add_argument( - "--encoder-layers", type=int, metavar="N", help="num encoder layers" - ) - parser.add_argument( - "--encoder-attention-heads", - type=int, - metavar="N", - help="num encoder attention heads", - ) - - # Arguments related to input and output embeddings - parser.add_argument( - "--encoder-embed-dim", - type=int, - metavar="N", - help="encoder embedding dimension", - ) - parser.add_argument( - "--share-encoder-input-output-embed", - action="store_true", - help="share encoder input" " and output embeddings", - ) - parser.add_argument( - "--encoder-learned-pos", - action="store_true", - help="use learned positional embeddings in the encoder", - ) - parser.add_argument( - "--no-token-positional-embeddings", - action="store_true", - help="if set, disables positional embeddings" " (outside self attention)", - ) - parser.add_argument( - "--num-segment", type=int, metavar="N", help="num segment in the input" - ) - parser.add_argument( - "--max-positions", type=int, help="number of positional embeddings to learn" - ) - - # Arguments related to sentence level prediction - parser.add_argument( - "--sentence-class-num", - type=int, - metavar="N", - help="number of classes for sentence task", - ) - parser.add_argument( - "--sent-loss", - action="store_true", - help="if set," " calculate sentence level predictions", - ) - - # Arguments related to parameter initialization - parser.add_argument( - "--apply-bert-init", - action="store_true", - help="use custom param initialization for BERT", - ) - - # misc params - parser.add_argument( - "--activation-fn", - choices=utils.get_available_activation_fns(), - help="activation function to use", - ) - parser.add_argument( - "--pooler-activation-fn", - choices=utils.get_available_activation_fns(), - help="Which activation function to use for pooler layer.", - ) - parser.add_argument( - "--encoder-normalize-before", - action="store_true", - help="apply layernorm before each encoder block", - ) - - def forward(self, src_tokens, segment_labels=None, **kwargs): - return self.encoder(src_tokens, segment_labels=segment_labels, **kwargs) - - def max_positions(self): - return self.encoder.max_positions - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - # make sure all arguments are present in older models - base_architecture(args) - - if not hasattr(args, "max_positions"): - args.max_positions = args.tokens_per_sample - - logger.info(args) - - encoder = MaskedLMEncoder(args, task.dictionary) - return cls(args, encoder) - - -class MaskedLMEncoder(FairseqEncoder): - """ - Encoder for Masked Language Modelling. 
- """ - - def __init__(self, args, dictionary): - super().__init__(dictionary) - - self.padding_idx = dictionary.pad() - self.vocab_size = dictionary.__len__() - self.max_positions = args.max_positions - - self.sentence_encoder = TransformerSentenceEncoder( - padding_idx=self.padding_idx, - vocab_size=self.vocab_size, - num_encoder_layers=args.encoder_layers, - embedding_dim=args.encoder_embed_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=args.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.act_dropout, - max_seq_len=self.max_positions, - num_segments=args.num_segment, - use_position_embeddings=not args.no_token_positional_embeddings, - encoder_normalize_before=args.encoder_normalize_before, - apply_bert_init=args.apply_bert_init, - activation_fn=args.activation_fn, - learned_pos_embedding=args.encoder_learned_pos, - ) - - self.share_input_output_embed = args.share_encoder_input_output_embed - self.embed_out = None - self.sentence_projection_layer = None - self.sentence_out_dim = args.sentence_class_num - self.lm_output_learned_bias = None - - # Remove head is set to true during fine-tuning - self.load_softmax = not getattr(args, "remove_head", False) - - self.masked_lm_pooler = nn.Linear( - args.encoder_embed_dim, args.encoder_embed_dim - ) - self.pooler_activation = utils.get_activation_fn(args.pooler_activation_fn) - - self.lm_head_transform_weight = nn.Linear( - args.encoder_embed_dim, args.encoder_embed_dim - ) - self.activation_fn = utils.get_activation_fn(args.activation_fn) - self.layer_norm = LayerNorm(args.encoder_embed_dim) - - self.lm_output_learned_bias = None - if self.load_softmax: - self.lm_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size)) - - if not self.share_input_output_embed: - self.embed_out = nn.Linear( - args.encoder_embed_dim, self.vocab_size, bias=False - ) - - if args.sent_loss: - self.sentence_projection_layer = nn.Linear( - args.encoder_embed_dim, self.sentence_out_dim, bias=False - ) - - def forward(self, src_tokens, segment_labels=None, masked_tokens=None, **unused): - """ - Forward pass for Masked LM encoder. This first computes the token - embedding using the token embedding matrix, position embeddings (if - specified) and segment embeddings (if specified). - - Here we assume that the sentence representation corresponds to the - output of the classification_token (see bert_task or cross_lingual_lm - task for more details). - Args: - - src_tokens: B x T matrix representing sentences - - segment_labels: B x T matrix representing segment label for tokens - Returns: - - a tuple of the following: - - logits for predictions in format B x T x C to be used in - softmax afterwards - - a dictionary of additional data, where 'pooled_output' contains - the representation for classification_token and 'inner_states' - is a list of internal model states used to compute the - predictions (similar in ELMO). 'sentence_logits' - is the prediction logit for NSP task and is only computed if - this is specified in the input arguments. 
- """ - - inner_states, sentence_rep = self.sentence_encoder( - src_tokens, - segment_labels=segment_labels, - ) - - x = inner_states[-1].transpose(0, 1) - # project masked tokens only - if masked_tokens is not None: - x = x[masked_tokens, :] - x = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(x))) - - pooled_output = self.pooler_activation(self.masked_lm_pooler(sentence_rep)) - - # project back to size of vocabulary - if self.share_input_output_embed and hasattr( - self.sentence_encoder.embed_tokens, "weight" - ): - x = F.linear(x, self.sentence_encoder.embed_tokens.weight) - elif self.embed_out is not None: - x = self.embed_out(x) - if self.lm_output_learned_bias is not None: - x = x + self.lm_output_learned_bias - sentence_logits = None - if self.sentence_projection_layer: - sentence_logits = self.sentence_projection_layer(pooled_output) - - return x, { - "inner_states": inner_states, - "pooled_output": pooled_output, - "sentence_logits": sentence_logits, - } - - def max_positions(self): - """Maximum output length supported by the encoder.""" - return self.max_positions - - def upgrade_state_dict_named(self, state_dict, name): - if isinstance( - self.sentence_encoder.embed_positions, SinusoidalPositionalEmbedding - ): - state_dict[ - name + ".sentence_encoder.embed_positions._float_tensor" - ] = torch.FloatTensor(1) - if not self.load_softmax: - for k in list(state_dict.keys()): - if ( - "embed_out.weight" in k - or "sentence_projection_layer.weight" in k - or "lm_output_learned_bias" in k - ): - del state_dict[k] - return state_dict - - -@register_model_architecture("masked_lm", "masked_lm") -def base_architecture(args): - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.act_dropout = getattr(args, "act_dropout", 0.0) - - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.share_encoder_input_output_embed = getattr( - args, "share_encoder_input_output_embed", False - ) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.num_segment = getattr(args, "num_segment", 2) - - args.sentence_class_num = getattr(args, "sentence_class_num", 2) - args.sent_loss = getattr(args, "sent_loss", False) - - args.apply_bert_init = getattr(args, "apply_bert_init", False) - - args.activation_fn = getattr(args, "activation_fn", "relu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - - -@register_model_architecture("masked_lm", "bert_base") -def bert_base_architecture(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.share_encoder_input_output_embed = getattr( - args, "share_encoder_input_output_embed", True - ) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True) - args.num_segment = getattr(args, "num_segment", 2) - - args.encoder_layers = getattr(args, "encoder_layers", 12) - - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) - args.encoder_ffn_embed_dim = getattr(args, 
"encoder_ffn_embed_dim", 3072) - - args.sentence_class_num = getattr(args, "sentence_class_num", 2) - args.sent_loss = getattr(args, "sent_loss", True) - - args.apply_bert_init = getattr(args, "apply_bert_init", True) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) - base_architecture(args) - - -@register_model_architecture("masked_lm", "bert_large") -def bert_large_architecture(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_layers = getattr(args, "encoder_layers", 24) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - bert_base_architecture(args) - - -@register_model_architecture("masked_lm", "xlm_base") -def xlm_architecture(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.share_encoder_input_output_embed = getattr( - args, "share_encoder_input_output_embed", True - ) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True) - args.num_segment = getattr(args, "num_segment", 1) - - args.encoder_layers = getattr(args, "encoder_layers", 6) - - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - - args.sent_loss = getattr(args, "sent_loss", False) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - args.apply_bert_init = getattr(args, "apply_bert_init", True) - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/model_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/model_utils.py deleted file mode 100644 index 732d66b1d5f695151c26d29eb7f6b53179c269f1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/model_utils.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from typing import List, Optional - -import torch -from torch import Tensor - - -@torch.jit.script -def script_skip_tensor_list(x: List[Tensor], mask): - res = [xi[mask] if xi.size(0) == mask.size(0) else xi[:, mask] for xi in x] - outputs = [] - for i, t in enumerate(res): - if t.numel() != 0: - outputs.append(t) - else: - outputs.append(x[i]) - return outputs - - -@torch.jit.script -def script_skip_tensor(x: Tensor, mask): - # None case - if x.size(0) == 0: - return x - res = x[mask] if x.size(0) == mask.size(0) else x[:, mask] - if res.numel() == 0: - return x - else: - return res - - -@torch.jit.script -def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int): - """ - Expand 2D/3D tensor on dim=1 - """ - if x is None: - return None - - assert x.dim() == 2 or x.dim() == 3 - assert trg_dim >= x.size(1), (trg_dim, x.size()) - if trg_dim == x.size(1): - return x - - dims = [x.size(0), trg_dim - x.size(1)] - if x.dim() == 3: - dims.append(x.size(2)) - x = torch.cat([x, torch.zeros(dims).to(x).fill_(padding_idx)], 1) - - return x - - -@torch.jit.script -def coalesce(x: Optional[Tensor], y: Tensor) -> Tensor: - return x if x is not None else y - - -@torch.jit.script -def fill_tensors( - x: Optional[Tensor], mask, y: Optional[Tensor], padding_idx: int -) -> Optional[Tensor]: - """ - Filling tensor x with y at masked positions (dim=0). - """ - if x is None or x.size()[0] == 0 or y is None: - return x - assert x.dim() == y.dim() and mask.size(0) == x.size(0) - assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2)) - - n_selected = mask.sum() - if n_selected == 0: - return x - assert n_selected == y.size(0) - if n_selected == x.size(0): - return y - - if x.size(1) < y.size(1): - x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx) - x[mask] = y - elif x.size(1) > y.size(1): - x[mask] = torch.tensor(padding_idx).type_as(x) - if x.dim() == 2: - x[mask, : y.size(1)] = y - else: - x[mask, : y.size(1), :] = y - else: - x[mask] = y - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/multilingual_transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/multilingual_transformer.py deleted file mode 100644 index e3fbbd5710dfb10b16f5495c9131fa42b11544be..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/multilingual_transformer.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from collections import OrderedDict - -from fairseq import utils -from fairseq.models import ( - FairseqMultiModel, - register_model, - register_model_architecture, -) -from fairseq.models.transformer import ( - Embedding, - TransformerDecoder, - TransformerEncoder, - TransformerModel, - base_architecture, -) - - -@register_model("multilingual_transformer") -class MultilingualTransformerModel(FairseqMultiModel): - """Train Transformer models for multiple language pairs simultaneously. - - Requires `--task multilingual_translation`. - - We inherit all arguments from TransformerModel and assume that all language - pairs use a single Transformer architecture. In addition, we provide several - options that are specific to the multilingual setting. 
- - Args: - --share-encoder-embeddings: share encoder embeddings across all source languages - --share-decoder-embeddings: share decoder embeddings across all target languages - --share-encoders: share all encoder params (incl. embeddings) across all source languages - --share-decoders: share all decoder params (incl. embeddings) across all target languages - """ - - def __init__(self, encoders, decoders): - super().__init__(encoders, decoders) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - TransformerModel.add_args(parser) - parser.add_argument( - "--share-encoder-embeddings", - action="store_true", - help="share encoder embeddings across languages", - ) - parser.add_argument( - "--share-decoder-embeddings", - action="store_true", - help="share decoder embeddings across languages", - ) - parser.add_argument( - "--share-encoders", - action="store_true", - help="share encoders across languages", - ) - parser.add_argument( - "--share-decoders", - action="store_true", - help="share decoders across languages", - ) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - from fairseq.tasks.multilingual_translation import MultilingualTranslationTask - - assert isinstance(task, MultilingualTranslationTask) - - # make sure all arguments are present in older models - base_multilingual_architecture(args) - - if not hasattr(args, "max_source_positions"): - args.max_source_positions = 1024 - if not hasattr(args, "max_target_positions"): - args.max_target_positions = 1024 - - src_langs = [lang_pair.split("-")[0] for lang_pair in task.model_lang_pairs] - tgt_langs = [lang_pair.split("-")[1] for lang_pair in task.model_lang_pairs] - - if args.share_encoders: - args.share_encoder_embeddings = True - if args.share_decoders: - args.share_decoder_embeddings = True - - def build_embedding(dictionary, embed_dim, path=None): - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - emb = Embedding(num_embeddings, embed_dim, padding_idx) - # if provided, load from preloaded dictionaries - if path: - embed_dict = utils.parse_embedding(path) - utils.load_embedding(embed_dict, dictionary, emb) - return emb - - # build shared embeddings (if applicable) - shared_encoder_embed_tokens, shared_decoder_embed_tokens = None, None - if args.share_all_embeddings: - if args.encoder_embed_dim != args.decoder_embed_dim: - raise ValueError( - "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" - ) - if args.decoder_embed_path and ( - args.decoder_embed_path != args.encoder_embed_path - ): - raise ValueError( - "--share-all-embeddings not compatible with --decoder-embed-path" - ) - shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( - dicts=task.dicts, - langs=task.langs, - embed_dim=args.encoder_embed_dim, - build_embedding=build_embedding, - pretrained_embed_path=args.encoder_embed_path, - ) - shared_decoder_embed_tokens = shared_encoder_embed_tokens - args.share_decoder_input_output_embed = True - else: - if args.share_encoder_embeddings: - shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( - dicts=task.dicts, - langs=src_langs, - embed_dim=args.encoder_embed_dim, - build_embedding=build_embedding, - pretrained_embed_path=args.encoder_embed_path, - ) - if args.share_decoder_embeddings: - shared_decoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( - dicts=task.dicts, - langs=tgt_langs, - embed_dim=args.decoder_embed_dim, - 
build_embedding=build_embedding, - pretrained_embed_path=args.decoder_embed_path, - ) - - # encoders/decoders for each language - lang_encoders, lang_decoders = {}, {} - - def get_encoder(lang): - if lang not in lang_encoders: - if shared_encoder_embed_tokens is not None: - encoder_embed_tokens = shared_encoder_embed_tokens - else: - encoder_embed_tokens = build_embedding( - task.dicts[lang], - args.encoder_embed_dim, - args.encoder_embed_path, - ) - lang_encoders[lang] = cls._get_module_class( - True, args, task.dicts[lang], encoder_embed_tokens, src_langs - ) - return lang_encoders[lang] - - def get_decoder(lang): - if lang not in lang_decoders: - if shared_decoder_embed_tokens is not None: - decoder_embed_tokens = shared_decoder_embed_tokens - else: - decoder_embed_tokens = build_embedding( - task.dicts[lang], - args.decoder_embed_dim, - args.decoder_embed_path, - ) - lang_decoders[lang] = cls._get_module_class( - False, args, task.dicts[lang], decoder_embed_tokens, tgt_langs - ) - return lang_decoders[lang] - - # shared encoders/decoders (if applicable) - shared_encoder, shared_decoder = None, None - if args.share_encoders: - shared_encoder = get_encoder(src_langs[0]) - if args.share_decoders: - shared_decoder = get_decoder(tgt_langs[0]) - - encoders, decoders = OrderedDict(), OrderedDict() - for lang_pair, src, tgt in zip(task.model_lang_pairs, src_langs, tgt_langs): - encoders[lang_pair] = ( - shared_encoder if shared_encoder is not None else get_encoder(src) - ) - decoders[lang_pair] = ( - shared_decoder if shared_decoder is not None else get_decoder(tgt) - ) - - return MultilingualTransformerModel(encoders, decoders) - - @classmethod - def _get_module_class(cls, is_encoder, args, lang_dict, embed_tokens, langs): - module_class = TransformerEncoder if is_encoder else TransformerDecoder - return module_class(args, lang_dict, embed_tokens) - - def load_state_dict(self, state_dict, strict=True, args=None): - state_dict_subset = state_dict.copy() - for k, _ in state_dict.items(): - assert k.startswith("models.") - lang_pair = k.split(".")[1] - if lang_pair not in self.models: - del state_dict_subset[k] - super().load_state_dict(state_dict_subset, strict=strict, args=args) - - -@register_model_architecture("multilingual_transformer", "multilingual_transformer") -def base_multilingual_architecture(args): - base_architecture(args) - args.share_encoder_embeddings = getattr(args, "share_encoder_embeddings", False) - args.share_decoder_embeddings = getattr(args, "share_decoder_embeddings", False) - args.share_encoders = getattr(args, "share_encoders", False) - args.share_decoders = getattr(args, "share_decoders", False) - - -@register_model_architecture( - "multilingual_transformer", "multilingual_transformer_iwslt_de_en" -) -def multilingual_transformer_iwslt_de_en(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) - args.decoder_layers = getattr(args, "decoder_layers", 6) - base_multilingual_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/__init__.py 
b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/__init__.py deleted file mode 100644 index 05fe822487c3bcde8346648d5826f1669c6bc1ca..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -"""isort:skip_file""" - -from .fairseq_nat_model import * -from .nonautoregressive_transformer import * -from .nat_crf_transformer import * -from .iterative_nonautoregressive_transformer import * -from .cmlm_transformer import * -from .levenshtein_transformer import * -from .insertion_transformer import * diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/cmlm_transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/cmlm_transformer.py deleted file mode 100644 index c876e9453c101c00bd8e93e6e6f1fb48dc26f993..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/cmlm_transformer.py +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -This file implements: -Ghazvininejad, Marjan, et al. -"Constant-time machine translation with conditional masked language models." -arXiv preprint arXiv:1904.09324 (2019). -""" - -from fairseq.models import register_model, register_model_architecture -from fairseq.models.nat import NATransformerModel -from fairseq.utils import new_arange - - -def _skeptical_unmasking(output_scores, output_masks, p): - sorted_index = output_scores.sort(-1)[1] - boundary_len = ( - (output_masks.sum(1, keepdim=True).type_as(output_scores) - 2) * p - ).long() - skeptical_mask = new_arange(output_masks) < boundary_len - return skeptical_mask.scatter(1, sorted_index, skeptical_mask) - - -@register_model("cmlm_transformer") -class CMLMNATransformerModel(NATransformerModel): - @staticmethod - def add_args(parser): - NATransformerModel.add_args(parser) - - def forward( - self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs - ): - assert not self.decoder.src_embedding_copy, "do not support embedding copy." 
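        # The training forward pass below runs in three stages: encode the source,
        # predict the target length from the encoder output, and decode all target
        # positions in parallel, computing the word-prediction loss only at the
        # masked (unk) positions of prev_output_tokens.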
- - # encoding - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - # length prediction - length_out = self.decoder.forward_length( - normalize=False, encoder_out=encoder_out - ) - length_tgt = self.decoder.forward_length_prediction( - length_out, encoder_out, tgt_tokens - ) - - # decoding - word_ins_out = self.decoder( - normalize=False, - prev_output_tokens=prev_output_tokens, - encoder_out=encoder_out, - ) - word_ins_mask = prev_output_tokens.eq(self.unk) - - return { - "word_ins": { - "out": word_ins_out, - "tgt": tgt_tokens, - "mask": word_ins_mask, - "ls": self.args.label_smoothing, - "nll_loss": True, - }, - "length": { - "out": length_out, - "tgt": length_tgt, - "factor": self.decoder.length_loss_factor, - }, - } - - def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs): - - step = decoder_out.step - max_step = decoder_out.max_step - - output_tokens = decoder_out.output_tokens - output_scores = decoder_out.output_scores - history = decoder_out.history - - # execute the decoder - output_masks = output_tokens.eq(self.unk) - _scores, _tokens = self.decoder( - normalize=True, - prev_output_tokens=output_tokens, - encoder_out=encoder_out, - ).max(-1) - output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) - output_scores.masked_scatter_(output_masks, _scores[output_masks]) - - if history is not None: - history.append(output_tokens.clone()) - - # skeptical decoding (depend on the maximum decoding steps.) - if (step + 1) < max_step: - skeptical_mask = _skeptical_unmasking( - output_scores, output_tokens.ne(self.pad), 1 - (step + 1) / max_step - ) - - output_tokens.masked_fill_(skeptical_mask, self.unk) - output_scores.masked_fill_(skeptical_mask, 0.0) - - if history is not None: - history.append(output_tokens.clone()) - - return decoder_out._replace( - output_tokens=output_tokens, - output_scores=output_scores, - attn=None, - history=history, - ) - - -@register_model_architecture("cmlm_transformer", "cmlm_transformer") -def cmlm_base_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.activation_fn = getattr(args, "activation_fn", "relu") - args.dropout = getattr(args, "dropout", 0.1) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - 
args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", True) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.apply_bert_init = getattr(args, "apply_bert_init", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - # --- special arguments --- - args.sg_length_pred = getattr(args, "sg_length_pred", False) - args.pred_length_offset = getattr(args, "pred_length_offset", False) - args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) - args.ngram_predictor = getattr(args, "ngram_predictor", 1) - args.src_embedding_copy = getattr(args, "src_embedding_copy", False) - - -@register_model_architecture("cmlm_transformer", "cmlm_transformer_wmt_en_de") -def cmlm_wmt_en_de(args): - cmlm_base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/fairseq_nat_model.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/fairseq_nat_model.py deleted file mode 100644 index 1dbc29d0f49697329f50bbea9ee15bda0010f069..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/fairseq_nat_model.py +++ /dev/null @@ -1,159 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -from fairseq.models.transformer import ( - TransformerDecoder, - TransformerEncoder, - TransformerModel, -) -from fairseq.modules.transformer_sentence_encoder import init_bert_params - - -def ensemble_encoder(func): - def wrapper(self, *args, **kwargs): - if self.ensemble_models is None or len(self.ensemble_models) == 1: - return func(self, *args, **kwargs) - encoder_outs = [func(model, *args, **kwargs) for model in self.ensemble_models] - _encoder_out = encoder_outs[0] - - def stack(key): - outs = [getattr(e, key) for e in encoder_outs] - return torch.stack(outs, -1) if outs[0] is not None else None - - return _encoder_out._replace( - encoder_out=stack("encoder_out"), - encoder_embedding=stack("encoder_embedding"), - encoder_states=stack("encoder_states"), - ) - - return wrapper - - -def ensemble_decoder(func): - def wrapper(self, normalize=False, encoder_out=None, *args, **kwargs): - if self.ensemble_models is None or len(self.ensemble_models) == 1: - return func( - self, normalize=normalize, encoder_out=encoder_out, *args, **kwargs - ) - - action_outs = [ - func( - model, - normalize=normalize, - encoder_out=encoder_out._replace( - encoder_out=encoder_out.encoder_out[:, :, :, i] - ), - *args, - **kwargs - ) - for i, model in enumerate(self.ensemble_models) - ] - - if not isinstance(action_outs[0], tuple): # return multiple values - action_outs = [[a] for a in action_outs] - else: - action_outs = [list(a) for a in action_outs] - - ensembled_outs = [] - for i in range(len(action_outs[0])): - if i == 0 and normalize: - ensembled_outs += [ - torch.logsumexp( - torch.stack([a[i] for a in action_outs], -1), dim=-1 - ) - - math.log(len(self.ensemble_models)) - ] - elif action_outs[0][i] is not None: - ensembled_outs += 
[torch.stack([a[i] for a in action_outs], -1)] - else: - ensembled_outs += [None] - - if len(ensembled_outs) == 1: - return ensembled_outs[0] - return tuple(ensembled_outs) - - return wrapper - - -class FairseqNATModel(TransformerModel): - """ - Abstract class for all nonautoregressive-based models - """ - - def __init__(self, args, encoder, decoder): - super().__init__(args, encoder, decoder) - self.tgt_dict = decoder.dictionary - self.bos = decoder.dictionary.bos() - self.eos = decoder.dictionary.eos() - self.pad = decoder.dictionary.pad() - self.unk = decoder.dictionary.unk() - - self.ensemble_models = None - - @property - def allow_length_beam(self): - return False - - @property - def allow_ensemble(self): - return True - - def enable_ensemble(self, models): - self.encoder.ensemble_models = [m.encoder for m in models] - self.decoder.ensemble_models = [m.decoder for m in models] - - @staticmethod - def add_args(parser): - TransformerModel.add_args(parser) - parser.add_argument( - "--apply-bert-init", - action="store_true", - help="use custom param initialization for BERT", - ) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - decoder = FairseqNATDecoder(args, tgt_dict, embed_tokens) - if getattr(args, "apply_bert_init", False): - decoder.apply(init_bert_params) - return decoder - - @classmethod - def build_encoder(cls, args, src_dict, embed_tokens): - encoder = FairseqNATEncoder(args, src_dict, embed_tokens) - if getattr(args, "apply_bert_init", False): - encoder.apply(init_bert_params) - return encoder - - def forward_encoder(self, encoder_inputs): - return self.encoder(*encoder_inputs) - - def forward_decoder(self, *args, **kwargs): - return NotImplementedError - - def initialize_output_tokens(self, *args, **kwargs): - return NotImplementedError - - def forward(self, *args, **kwargs): - return NotImplementedError - - -class FairseqNATEncoder(TransformerEncoder): - def __init__(self, args, dictionary, embed_tokens): - super().__init__(args, dictionary, embed_tokens) - self.ensemble_models = None - - @ensemble_encoder - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - - -class FairseqNATDecoder(TransformerDecoder): - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): - super().__init__(args, dictionary, embed_tokens, no_encoder_attn) - self.ensemble_models = None diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/insertion_transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/insertion_transformer.py deleted file mode 100644 index bc28000f59a3b9e8098f9fe710cc8335d39eea3e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/insertion_transformer.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
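
A minimal standalone sketch of the log-space averaging that the ensemble_decoder wrapper above performs when normalize=True: per-model log-probabilities are stacked on a new trailing dimension and combined with logsumexp minus log(N). Shapes and names below are illustrative only, not taken from fairseq.

import math
import torch

def average_log_probs(per_model_log_probs):
    # per_model_log_probs: list of (B, T, V) tensors, each already log_softmax-ed.
    # Stacking on a new last dim and taking logsumexp - log(N) is the same
    # log-space mean used by the ensemble_decoder wrapper above.
    stacked = torch.stack(per_model_log_probs, dim=-1)              # (B, T, V, N)
    return torch.logsumexp(stacked, dim=-1) - math.log(len(per_model_log_probs))

# toy usage with two "models"
lp1 = torch.log_softmax(torch.randn(2, 5, 10), dim=-1)
lp2 = torch.log_softmax(torch.randn(2, 5, 10), dim=-1)
avg = average_log_probs([lp1, lp2])                                 # still a valid log-distribution over V
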
- -import numpy as np -import torch -import torch.nn.functional as F -from fairseq.models import register_model, register_model_architecture -from fairseq.models.nat import ( - FairseqNATModel, - LevenshteinTransformerDecoder, - LevenshteinTransformerModel, - ensemble_decoder, -) -from fairseq.models.transformer import Linear -from fairseq.modules.transformer_sentence_encoder import init_bert_params -from fairseq.utils import new_arange - - -class NegativeDistanceScore(object): - def __init__(self): - - # pre-compute some values - self.scores = {} - - self.scores[0.5] = self.compute_score_full(50, 0.5) - self.scores[1.0] = self.compute_score_full(50, 1.0) - self.scores[2.0] = self.compute_score_full(50, 2.0) - - def __call__(self, i, L, tau): - if (tau is None) or (tau > 1000): - return 1 / L - - if tau in self.scores: - if L < self.scores[tau].shape[0]: - return self.scores[tau][L - 1, i] - return self.compute_score(L, tau)[i] - - def compute_score(self, L, tau): - s = np.array([-abs(L / 2 - i) / tau for i in range(L)]) - s = np.exp(s - s.max()) - return s / s.sum() - - def compute_score_full(self, L, tau): - s = -abs(np.arange(0, L - 1)[:, None] / 2 - np.arange(L)[None, :]) / tau - s = np.tril(s, 0) + np.triu(s - float("inf"), 1) - s = np.exp(s - s.max(1, keepdims=True)) - return s / s.sum(1, keepdims=True) - - -neg_scorer = NegativeDistanceScore() - - -def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx, vocab_size, tau=None): - try: - from fairseq import libnat - except ImportError as e: - import sys - - sys.stderr.write("ERROR: missing libnat. run `pip install --editable .`\n") - raise e - - B = in_tokens.size(0) - T = in_tokens.size(1) - V = vocab_size - - with torch.cuda.device_of(in_tokens): - in_tokens_list = [ - [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) - ] - out_tokens_list = [ - [t for t in s if t != padding_idx] - for i, s in enumerate(out_tokens.tolist()) - ] - - full_labels = libnat.suggested_ed2_path( - in_tokens_list, out_tokens_list, padding_idx - ) - insert_labels = [a[:-1] for a in full_labels] - - # numericalize1 - insert_label_tensors = in_tokens.new_zeros(B * (T - 1) * V).float() - insert_index, insert_labels = zip( - *[ - (w + (j + i * (T - 1)) * V, neg_scorer(k, len(label), tau)) - for i, labels in enumerate(insert_labels) - for j, label in enumerate(labels[1:-1]) - for k, w in enumerate(label) - ] - ) # HACK 1:-1 - insert_index, insert_labels = [ - torch.tensor(list(a), device=in_tokens.device) - for a in [insert_index, insert_labels] - ] - insert_label_tensors.scatter_(0, insert_index.long(), insert_labels) - insert_label_tensors = insert_label_tensors.view(B, T - 1, V) - - return insert_label_tensors - - -def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, padding_idx): - - padding_masks = in_tokens[:, 1:].eq(padding_idx) - word_ins_scores.masked_fill_(padding_masks, 0.0) - word_ins_pred.masked_fill_(padding_masks, padding_idx) - - in_coords = new_arange(in_tokens).type_as(in_scores) - - # shift all padding predictions to infinite - out_coords = (in_coords[:, 1:] - 0.5).masked_fill( - word_ins_pred.eq(padding_idx), float("inf") - ) - out_coords = torch.cat([in_coords, out_coords], 1).sort(-1)[1] - out_tokens = torch.cat([in_tokens, word_ins_pred], 1).gather(1, out_coords) - out_scores = torch.cat([in_scores, word_ins_scores], 1).gather(1, out_coords) - return out_tokens, out_scores - - -@register_model("insertion_transformer") -class InsertionTransformerModel(LevenshteinTransformerModel): - 
def __init__(self, args, encoder, decoder): - super().__init__(args, encoder, decoder) - - @staticmethod - def add_args(parser): - FairseqNATModel.add_args(parser) - parser.add_argument("--label-tau", default=None, type=float) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - decoder = InsertionTransformerDecoder(args, tgt_dict, embed_tokens) - if getattr(args, "apply_bert_init", False): - decoder.apply(init_bert_params) - return decoder - - def forward( - self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs - ): - - assert tgt_tokens is not None, "forward function only supports training." - - # encoding - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - - # generate training labels for insertion - word_ins_out = self.decoder.forward_word_ins( - normalize=False, - prev_output_tokens=prev_output_tokens, - encoder_out=encoder_out, - ) - - word_ins_tgt = _get_ins_targets( - prev_output_tokens, - tgt_tokens, - self.pad, - self.unk, - len(self.tgt_dict), - tau=self.decoder.label_tau, - ).type_as(word_ins_out) - word_ins_masks = prev_output_tokens[:, 1:].ne(self.pad) - - return { - "word_ins": { - "out": word_ins_out, - "tgt": word_ins_tgt, - "mask": word_ins_masks, - "ls": self.args.label_smoothing, - "nll_loss": True, - } - } - - def forward_decoder( - self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs - ): - - output_tokens = decoder_out.output_tokens - output_scores = decoder_out.output_scores - history = decoder_out.history - - # TODO: decoding for InsertionTransformer - word_ins_score = self.decoder.forward_word_ins( - normalize=True, prev_output_tokens=output_tokens, encoder_out=encoder_out - ) - - if eos_penalty > 0.0: - word_ins_score[:, :, self.pad] -= eos_penalty - word_ins_score, word_ins_pred = word_ins_score.max(-1) - output_tokens, output_scores = _apply_ins_words( - output_tokens, output_scores, word_ins_pred, word_ins_score, self.pad - ) - - # delete some unnecessary paddings - cut_off = output_tokens.ne(self.pad).sum(1).max() - output_tokens = output_tokens[:, :cut_off] - output_scores = output_scores[:, :cut_off] - - if history is not None: - history.append(output_tokens.clone()) - - return decoder_out._replace( - output_tokens=output_tokens, - output_scores=output_scores, - attn=None, - history=history, - ) - - -class InsertionTransformerDecoder(LevenshteinTransformerDecoder): - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): - # use the TransformerDecoder's __init__ - super(LevenshteinTransformerDecoder, self).__init__( - args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn - ) - - self.dictionary = dictionary - self.bos = dictionary.bos() - self.unk = dictionary.unk() - self.eos = dictionary.eos() - self.pool_out = Linear(self.output_embed_dim * 2, self.output_embed_dim) - - self.label_tau = getattr(args, "label_tau", None) - - @ensemble_decoder - def forward_word_ins(self, normalize, encoder_out, prev_output_tokens): - features = self.extract_features(prev_output_tokens, encoder_out=encoder_out)[0] - features = self.pool_out( - torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) - ) - decoder_out = self.output_layer(features) - return F.log_softmax(decoder_out, -1) if normalize else decoder_out - - def forward_mask_ins(self, *args, **kwargs): - raise NotImplementedError - - def forward_word_del(self, *args, **kwargs): - raise NotImplementedError - - -@register_model_architecture("insertion_transformer", "insertion_transformer") -def 
insertion_base_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.activation_fn = getattr(args, "activation_fn", "relu") - args.dropout = getattr(args, "dropout", 0.1) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.apply_bert_init = getattr(args, "apply_bert_init", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - # special for insertion transformer - args.label_tau = getattr(args, "label_tau", None) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py deleted file mode 100644 index bc39509980a80eb8c21e0bfdb304649ad3acc4d0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/iterative_nonautoregressive_transformer.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
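
The insertion transformer's _apply_ins_words above interleaves newly predicted words between the tokens already in the hypothesis by giving each insertion a fractional coordinate (between its two neighbours), pushing unused slots to +inf, and sorting. A simplified, self-contained sketch of that coordinate trick, with a hypothetical function name and toy token ids:

import torch

def interleave_insertions(in_tokens, ins_tokens, pad_idx):
    # in_tokens:  (B, T)   current hypothesis (assumed pad-free for this toy version)
    # ins_tokens: (B, T-1) one candidate per insertion slot; pad_idx marks "insert nothing"
    B, T = in_tokens.size()
    in_coords = torch.arange(T, dtype=torch.float, device=in_tokens.device).expand(B, T)
    # a token inserted into slot i gets coordinate i + 0.5, i.e. between tokens i and i+1;
    # empty slots get +inf so they sort after every real token
    ins_coords = (in_coords[:, :-1] + 0.5).masked_fill(ins_tokens.eq(pad_idx), float("inf"))
    order = torch.cat([in_coords, ins_coords], dim=1).sort(dim=-1)[1]
    return torch.cat([in_tokens, ins_tokens], dim=1).gather(1, order)

# toy ids; insert token 8 into the first slot only (1 = pad)
print(interleave_insertions(torch.tensor([[0, 7, 9, 2]]),
                            torch.tensor([[8, 1, 1]]), pad_idx=1))
# tensor([[0, 8, 7, 9, 2, 1, 1]])
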
- -import torch -from fairseq.models import register_model, register_model_architecture -from fairseq.models.nat import NATransformerModel - - -def _sequential_poisoning(s, V, beta=0.33, bos=2, eos=3, pad=1): - # s: input batch - # V: vocabulary size - rand_words = torch.randint(low=4, high=V, size=s.size(), device=s.device) - choices = torch.rand(size=s.size(), device=s.device) - choices.masked_fill_((s == pad) | (s == bos) | (s == eos), 1) - - replace = choices < beta / 3 - repeat = (choices >= beta / 3) & (choices < beta * 2 / 3) - swap = (choices >= beta * 2 / 3) & (choices < beta) - safe = choices >= beta - - for i in range(s.size(1) - 1): - rand_word = rand_words[:, i] - next_word = s[:, i + 1] - self_word = s[:, i] - - replace_i = replace[:, i] - swap_i = swap[:, i] & (next_word != 3) - repeat_i = repeat[:, i] & (next_word != 3) - safe_i = safe[:, i] | ((next_word == 3) & (~replace_i)) - - s[:, i] = ( - self_word * (safe_i | repeat_i).long() - + next_word * swap_i.long() - + rand_word * replace_i.long() - ) - s[:, i + 1] = ( - next_word * (safe_i | replace_i).long() - + self_word * (swap_i | repeat_i).long() - ) - return s - - -def gumbel_noise(input, TINY=1e-8): - return ( - input.new_zeros(*input.size()) - .uniform_() - .add_(TINY) - .log_() - .neg_() - .add_(TINY) - .log_() - .neg_() - ) - - -@register_model("iterative_nonautoregressive_transformer") -class IterNATransformerModel(NATransformerModel): - @staticmethod - def add_args(parser): - NATransformerModel.add_args(parser) - parser.add_argument( - "--train-step", - type=int, - help="number of refinement iterations during training", - ) - parser.add_argument( - "--dae-ratio", - type=float, - help="the probability of switching to the denoising auto-encoder loss", - ) - parser.add_argument( - "--stochastic-approx", - action="store_true", - help="sampling from the decoder as the inputs for next iteration", - ) - - @classmethod - def build_model(cls, args, task): - model = super().build_model(args, task) - model.train_step = getattr(args, "train_step", 4) - model.dae_ratio = getattr(args, "dae_ratio", 0.5) - model.stochastic_approx = getattr(args, "stochastic_approx", False) - return model - - def forward( - self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs - ): - - B, T = prev_output_tokens.size() - - # encoding - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - - # length prediction - length_out = self.decoder.forward_length( - normalize=False, encoder_out=encoder_out - ) - length_tgt = self.decoder.forward_length_prediction( - length_out, encoder_out, tgt_tokens - ) - - # decoding - word_ins_outs, word_ins_tgts, word_ins_masks = [], [], [] - for t in range(self.train_step): - word_ins_out = self.decoder( - normalize=False, - prev_output_tokens=prev_output_tokens, - encoder_out=encoder_out, - step=t, - ) - word_ins_tgt = tgt_tokens - word_ins_mask = word_ins_tgt.ne(self.pad) - - word_ins_outs.append(word_ins_out) - word_ins_tgts.append(word_ins_tgt) - word_ins_masks.append(word_ins_mask) - - if t < (self.train_step - 1): - # prediction for next iteration - if self.stochastic_approx: - word_ins_prediction = ( - word_ins_out + gumbel_noise(word_ins_out) - ).max(-1)[1] - else: - word_ins_prediction = word_ins_out.max(-1)[1] - - prev_output_tokens = prev_output_tokens.masked_scatter( - word_ins_mask, word_ins_prediction[word_ins_mask] - ) - - if self.dae_ratio > 0: - # we do not perform denoising for the first iteration - corrputed = ( - torch.rand(size=(B,), 
device=prev_output_tokens.device) - < self.dae_ratio - ) - corrputed_tokens = _sequential_poisoning( - tgt_tokens[corrputed], - len(self.tgt_dict), - 0.33, - self.bos, - self.eos, - self.pad, - ) - prev_output_tokens[corrputed] = corrputed_tokens - - # concat everything - word_ins_out = torch.cat(word_ins_outs, 0) - word_ins_tgt = torch.cat(word_ins_tgts, 0) - word_ins_mask = torch.cat(word_ins_masks, 0) - - return { - "word_ins": { - "out": word_ins_out, - "tgt": word_ins_tgt, - "mask": word_ins_mask, - "ls": self.args.label_smoothing, - "nll_loss": True, - }, - "length": { - "out": length_out, - "tgt": length_tgt, - "factor": self.decoder.length_loss_factor, - }, - } - - -@register_model_architecture( - "iterative_nonautoregressive_transformer", "iterative_nonautoregressive_transformer" -) -def inat_base_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.activation_fn = getattr(args, "activation_fn", "relu") - args.dropout = getattr(args, "dropout", 0.1) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.apply_bert_init = getattr(args, "apply_bert_init", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - # --- special arguments --- - args.sg_length_pred = getattr(args, "sg_length_pred", False) - args.pred_length_offset = getattr(args, "pred_length_offset", False) - args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) - args.ngram_predictor = getattr(args, "ngram_predictor", 1) - args.src_embedding_copy = getattr(args, "src_embedding_copy", False) - - args.train_step = getattr(args, "train_step", 4) - args.dae_ratio = getattr(args, "dae_ratio", 0.5) - args.stochastic_approx = getattr(args, "stochastic_approx", False) - - -@register_model_architecture( - "iterative_nonautoregressive_transformer", - 
"iterative_nonautoregressive_transformer_wmt_en_de", -) -def iter_nat_wmt_en_de(args): - inat_base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/levenshtein_transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/levenshtein_transformer.py deleted file mode 100644 index f7a3f003ca780bc557f17a5ab2b336e13211ac0e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/levenshtein_transformer.py +++ /dev/null @@ -1,502 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq.iterative_refinement_generator import DecoderOut -from fairseq.models import register_model, register_model_architecture -from fairseq.models.nat import FairseqNATDecoder, FairseqNATModel, ensemble_decoder -from fairseq.models.transformer import Embedding, TransformerDecoderLayer -from fairseq.modules.transformer_sentence_encoder import init_bert_params - -from .levenshtein_utils import ( - _apply_del_words, - _apply_ins_masks, - _apply_ins_words, - _fill, - _get_del_targets, - _get_ins_targets, - _skip, - _skip_encoder_out, -) - - -@register_model("levenshtein_transformer") -class LevenshteinTransformerModel(FairseqNATModel): - @property - def allow_length_beam(self): - return False - - @staticmethod - def add_args(parser): - FairseqNATModel.add_args(parser) - parser.add_argument( - "--early-exit", - default="6,6,6", - type=str, - help="number of decoder layers before word_del, mask_ins, word_ins", - ) - parser.add_argument( - "--no-share-discriminator", - action="store_true", - help="separate parameters for discriminator", - ) - parser.add_argument( - "--no-share-maskpredictor", - action="store_true", - help="separate parameters for mask-predictor", - ) - parser.add_argument( - "--share-discriminator-maskpredictor", - action="store_true", - help="share the parameters for both mask-predictor and discriminator", - ) - parser.add_argument( - "--sampling-for-deletion", - action="store_true", - help="instead of argmax, use sampling to predict the tokens", - ) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - decoder = LevenshteinTransformerDecoder(args, tgt_dict, embed_tokens) - if getattr(args, "apply_bert_init", False): - decoder.apply(init_bert_params) - return decoder - - def forward( - self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs - ): - - assert tgt_tokens is not None, "forward function only supports training." 
- - # encoding - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - - # generate training labels for insertion - masked_tgt_masks, masked_tgt_tokens, mask_ins_targets = _get_ins_targets( - prev_output_tokens, tgt_tokens, self.pad, self.unk - ) - mask_ins_targets = mask_ins_targets.clamp(min=0, max=255) # for safe prediction - mask_ins_masks = prev_output_tokens[:, 1:].ne(self.pad) - - mask_ins_out, _ = self.decoder.forward_mask_ins( - normalize=False, - prev_output_tokens=prev_output_tokens, - encoder_out=encoder_out, - ) - word_ins_out, _ = self.decoder.forward_word_ins( - normalize=False, - prev_output_tokens=masked_tgt_tokens, - encoder_out=encoder_out, - ) - - # make online prediction - if self.decoder.sampling_for_deletion: - word_predictions = torch.multinomial( - F.softmax(word_ins_out, -1).view(-1, word_ins_out.size(-1)), 1 - ).view(word_ins_out.size(0), -1) - else: - word_predictions = F.log_softmax(word_ins_out, dim=-1).max(2)[1] - - word_predictions.masked_scatter_( - ~masked_tgt_masks, tgt_tokens[~masked_tgt_masks] - ) - - # generate training labels for deletion - word_del_targets = _get_del_targets(word_predictions, tgt_tokens, self.pad) - word_del_out, _ = self.decoder.forward_word_del( - normalize=False, - prev_output_tokens=word_predictions, - encoder_out=encoder_out, - ) - word_del_masks = word_predictions.ne(self.pad) - - return { - "mask_ins": { - "out": mask_ins_out, - "tgt": mask_ins_targets, - "mask": mask_ins_masks, - "ls": 0.01, - }, - "word_ins": { - "out": word_ins_out, - "tgt": tgt_tokens, - "mask": masked_tgt_masks, - "ls": self.args.label_smoothing, - "nll_loss": True, - }, - "word_del": { - "out": word_del_out, - "tgt": word_del_targets, - "mask": word_del_masks, - }, - } - - def forward_decoder( - self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs - ): - - output_tokens = decoder_out.output_tokens - output_scores = decoder_out.output_scores - attn = decoder_out.attn - history = decoder_out.history - - bsz = output_tokens.size(0) - if max_ratio is None: - max_lens = torch.zeros_like(output_tokens).fill_(255) - else: - if encoder_out.encoder_padding_mask is None: - max_src_len = encoder_out.encoder_out.size(0) - src_lens = encoder_out.encoder_out.new(bsz).fill_(max_src_len) - else: - src_lens = (~encoder_out.encoder_padding_mask).sum(1) - max_lens = (src_lens * max_ratio).clamp(min=10).long() - - # delete words - # do not delete tokens if it is - can_del_word = output_tokens.ne(self.pad).sum(1) > 2 - if can_del_word.sum() != 0: # we cannot delete, skip - word_del_score, word_del_attn = self.decoder.forward_word_del( - normalize=True, - prev_output_tokens=_skip(output_tokens, can_del_word), - encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_del_word), - ) - word_del_pred = word_del_score.max(-1)[1].bool() - - _tokens, _scores, _attn = _apply_del_words( - output_tokens[can_del_word], - output_scores[can_del_word], - word_del_attn, - word_del_pred, - self.pad, - self.bos, - self.eos, - ) - output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad) - output_scores = _fill(output_scores, can_del_word, _scores, 0) - attn = _fill(attn, can_del_word, _attn, 0.0) - - if history is not None: - history.append(output_tokens.clone()) - - # insert placeholders - can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens - if can_ins_mask.sum() != 0: - mask_ins_score, _ = self.decoder.forward_mask_ins( - normalize=True, - prev_output_tokens=_skip(output_tokens, can_ins_mask), - 
encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_ins_mask), - ) - if eos_penalty > 0.0: - mask_ins_score[:, :, 0] = mask_ins_score[:, :, 0] - eos_penalty - mask_ins_pred = mask_ins_score.max(-1)[1] - mask_ins_pred = torch.min( - mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred) - ) - - _tokens, _scores = _apply_ins_masks( - output_tokens[can_ins_mask], - output_scores[can_ins_mask], - mask_ins_pred, - self.pad, - self.unk, - self.eos, - ) - output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad) - output_scores = _fill(output_scores, can_ins_mask, _scores, 0) - - if history is not None: - history.append(output_tokens.clone()) - - # insert words - can_ins_word = output_tokens.eq(self.unk).sum(1) > 0 - if can_ins_word.sum() != 0: - word_ins_score, word_ins_attn = self.decoder.forward_word_ins( - normalize=True, - prev_output_tokens=_skip(output_tokens, can_ins_word), - encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_ins_word), - ) - word_ins_score, word_ins_pred = word_ins_score.max(-1) - _tokens, _scores = _apply_ins_words( - output_tokens[can_ins_word], - output_scores[can_ins_word], - word_ins_pred, - word_ins_score, - self.unk, - ) - - output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad) - output_scores = _fill(output_scores, can_ins_word, _scores, 0) - attn = _fill(attn, can_ins_word, word_ins_attn, 0.0) - - if history is not None: - history.append(output_tokens.clone()) - - # delete some unnecessary paddings - cut_off = output_tokens.ne(self.pad).sum(1).max() - output_tokens = output_tokens[:, :cut_off] - output_scores = output_scores[:, :cut_off] - attn = None if attn is None else attn[:, :cut_off, :] - - return decoder_out._replace( - output_tokens=output_tokens, - output_scores=output_scores, - attn=attn, - history=history, - ) - - def initialize_output_tokens(self, encoder_out, src_tokens): - initial_output_tokens = src_tokens.new_zeros(src_tokens.size(0), 2) - initial_output_tokens[:, 0] = self.bos - initial_output_tokens[:, 1] = self.eos - - initial_output_scores = initial_output_tokens.new_zeros( - *initial_output_tokens.size() - ).type_as(encoder_out.encoder_out) - - return DecoderOut( - output_tokens=initial_output_tokens, - output_scores=initial_output_scores, - attn=None, - step=0, - max_step=0, - history=None, - ) - - -class LevenshteinTransformerDecoder(FairseqNATDecoder): - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): - super().__init__( - args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn - ) - self.dictionary = dictionary - self.bos = dictionary.bos() - self.unk = dictionary.unk() - self.eos = dictionary.eos() - self.sampling_for_deletion = getattr(args, "sampling_for_deletion", False) - self.embed_mask_ins = Embedding(256, self.output_embed_dim * 2, None) - self.embed_word_del = Embedding(2, self.output_embed_dim, None) - - # del_word, ins_mask, ins_word - self.early_exit = [int(i) for i in args.early_exit.split(",")] - assert len(self.early_exit) == 3 - - # copy layers for mask-predict/deletion - self.layers_msk = None - if getattr(args, "no_share_maskpredictor", False): - self.layers_msk = nn.ModuleList( - [ - TransformerDecoderLayer(args, no_encoder_attn) - for _ in range(self.early_exit[1]) - ] - ) - self.layers_del = None - if getattr(args, "no_share_discriminator", False): - self.layers_del = nn.ModuleList( - [ - TransformerDecoderLayer(args, no_encoder_attn) - for _ in range(self.early_exit[0]) - ] - ) - - if getattr(args, 
"share_discriminator_maskpredictor", False): - assert getattr( - args, "no_share_discriminator", False - ), "must set saperate discriminator" - self.layers_msk = self.layers_del - - def extract_features( - self, - prev_output_tokens, - encoder_out=None, - early_exit=None, - layers=None, - **unused - ): - """ - Similar to *forward* but only return features. - Inputs: - prev_output_tokens: Tensor(B, T) - encoder_out: a dictionary of hidden states and masks - - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - a dictionary with any model-specific outputs - the LevenshteinTransformer decoder has full-attention to all generated tokens - """ - # embed positions - positions = ( - self.embed_positions(prev_output_tokens) - if self.embed_positions is not None - else None - ) - - # embed tokens and positions - x = self.embed_scale * self.embed_tokens(prev_output_tokens) - if self.project_in_dim is not None: - x = self.project_in_dim(x) - - if positions is not None: - x += positions - x = self.dropout_module(x) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - attn = None - inner_states = [x] - - # decoder layers - decoder_padding_mask = prev_output_tokens.eq(self.padding_idx) - layers = self.layers if layers is None else layers - early_exit = len(layers) if early_exit is None else early_exit - for _, layer in enumerate(layers[:early_exit]): - x, attn, _ = layer( - x, - encoder_out.encoder_out if encoder_out is not None else None, - encoder_out.encoder_padding_mask if encoder_out is not None else None, - self_attn_mask=None, - self_attn_padding_mask=decoder_padding_mask, - ) - inner_states.append(x) - - if self.layer_norm: - x = self.layer_norm(x) - - # T x B x C -> B x T x C - x = x.transpose(0, 1) - - if self.project_out_dim is not None: - x = self.project_out_dim(x) - - return x, {"attn": attn, "inner_states": inner_states} - - @ensemble_decoder - def forward_mask_ins(self, normalize, encoder_out, prev_output_tokens, **unused): - features, extra = self.extract_features( - prev_output_tokens, - encoder_out=encoder_out, - early_exit=self.early_exit[1], - layers=self.layers_msk, - **unused - ) - features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) - decoder_out = F.linear(features_cat, self.embed_mask_ins.weight) - if normalize: - return F.log_softmax(decoder_out, -1), extra["attn"] - return decoder_out, extra["attn"] - - @ensemble_decoder - def forward_word_ins(self, normalize, encoder_out, prev_output_tokens, **unused): - features, extra = self.extract_features( - prev_output_tokens, - encoder_out=encoder_out, - early_exit=self.early_exit[2], - layers=self.layers, - **unused - ) - decoder_out = self.output_layer(features) - if normalize: - return F.log_softmax(decoder_out, -1), extra["attn"] - return decoder_out, extra["attn"] - - @ensemble_decoder - def forward_word_del(self, normalize, encoder_out, prev_output_tokens, **unused): - features, extra = self.extract_features( - prev_output_tokens, - encoder_out=encoder_out, - early_exit=self.early_exit[0], - layers=self.layers_del, - **unused - ) - decoder_out = F.linear(features, self.embed_word_del.weight) - if normalize: - return F.log_softmax(decoder_out, -1), extra["attn"] - return decoder_out, extra["attn"] - - -@register_model_architecture("levenshtein_transformer", "levenshtein_transformer") -def levenshtein_base_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - 
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.activation_fn = getattr(args, "activation_fn", "relu") - args.dropout = getattr(args, "dropout", 0.1) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.apply_bert_init = getattr(args, "apply_bert_init", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.sampling_for_deletion = getattr(args, "sampling_for_deletion", False) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - args.early_exit = getattr(args, "early_exit", "6,6,6") - args.no_share_discriminator = getattr(args, "no_share_discriminator", False) - args.no_share_maskpredictor = getattr(args, "no_share_maskpredictor", False) - args.share_discriminator_maskpredictor = getattr( - args, "share_discriminator_maskpredictor", False - ) - args.no_share_last_layer = getattr(args, "no_share_last_layer", False) - - -@register_model_architecture( - "levenshtein_transformer", "levenshtein_transformer_wmt_en_de" -) -def levenshtein_transformer_wmt_en_de(args): - levenshtein_base_architecture(args) - - -# similar parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017) -@register_model_architecture( - "levenshtein_transformer", "levenshtein_transformer_vaswani_wmt_en_de_big" -) -def levenshtein_transformer_vaswani_wmt_en_de_big(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.dropout = getattr(args, "dropout", 0.3) - levenshtein_base_architecture(args) - - -# default parameters used in tensor2tensor implementation -@register_model_architecture( - 
"levenshtein_transformer", "levenshtein_transformer_wmt_en_de_big" -) -def levenshtein_transformer_wmt_en_de_big_t2t(args): - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.1) - levenshtein_transformer_vaswani_wmt_en_de_big(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/levenshtein_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/levenshtein_utils.py deleted file mode 100644 index 375a98c2e11354de085f0a7926f407bd1a6a2ad4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/levenshtein_utils.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -from fairseq.utils import new_arange - - -# -------------- Helper Functions --------------------------------------------------- # - - -def load_libnat(): - try: - from fairseq import libnat_cuda - - return libnat_cuda, True - - except ImportError as e: - print(str(e) + "... fall back to CPU version") - - try: - from fairseq import libnat - - return libnat, False - - except ImportError as e: - import sys - - sys.stderr.write( - "ERROR: missing libnat_cuda. run `python setup.py build_ext --inplace`\n" - ) - raise e - - -def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): - libnat, use_cuda = load_libnat() - - def _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx): - in_masks = in_tokens.ne(padding_idx) - out_masks = out_tokens.ne(padding_idx) - mask_ins_targets, masked_tgt_masks = libnat.generate_insertion_labels( - out_tokens.int(), - libnat.levenshtein_distance( - in_tokens.int(), - out_tokens.int(), - in_masks.sum(1).int(), - out_masks.sum(1).int(), - ), - ) - masked_tgt_masks = masked_tgt_masks.bool() & out_masks - mask_ins_targets = mask_ins_targets.type_as(in_tokens)[ - :, 1 : in_masks.size(1) - ].masked_fill_(~in_masks[:, 1:], 0) - masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx) - return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets - - def _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx): - in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) - - in_tokens_list = [ - [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) - ] - out_tokens_list = [ - [t for t in s if t != padding_idx] - for i, s in enumerate(out_tokens.tolist()) - ] - - full_labels = libnat.suggested_ed2_path( - in_tokens_list, out_tokens_list, padding_idx - ) - mask_inputs = [ - [len(c) if c[0] != padding_idx else 0 for c in a[:-1]] for a in full_labels - ] - - # generate labels - masked_tgt_masks = [] - for mask_input in mask_inputs: - mask_label = [] - for beam_size in mask_input[1:-1]: # HACK 1:-1 - mask_label += [0] + [1 for _ in range(beam_size)] - masked_tgt_masks.append( - mask_label + [0 for _ in range(out_seq_len - len(mask_label))] - ) - mask_ins_targets = [ - mask_input[1:-1] - + [0 for _ in range(in_seq_len - 1 - len(mask_input[1:-1]))] - for mask_input in mask_inputs - ] - - # transform to tensor - masked_tgt_masks = torch.tensor( - 
masked_tgt_masks, device=out_tokens.device - ).bool() - mask_ins_targets = torch.tensor(mask_ins_targets, device=in_tokens.device) - masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx) - return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets - - if use_cuda: - return _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx) - return _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx) - - -def _get_del_targets(in_tokens, out_tokens, padding_idx): - libnat, use_cuda = load_libnat() - - def _get_del_targets_cuda(in_tokens, out_tokens, padding_idx): - in_masks = in_tokens.ne(padding_idx) - out_masks = out_tokens.ne(padding_idx) - - word_del_targets = libnat.generate_deletion_labels( - in_tokens.int(), - libnat.levenshtein_distance( - in_tokens.int(), - out_tokens.int(), - in_masks.sum(1).int(), - out_masks.sum(1).int(), - ), - ) - word_del_targets = word_del_targets.type_as(in_tokens).masked_fill_( - ~in_masks, 0 - ) - return word_del_targets - - def _get_del_targets_cpu(in_tokens, out_tokens, padding_idx): - out_seq_len = out_tokens.size(1) - with torch.cuda.device_of(in_tokens): - in_tokens_list = [ - [t for t in s if t != padding_idx] - for i, s in enumerate(in_tokens.tolist()) - ] - out_tokens_list = [ - [t for t in s if t != padding_idx] - for i, s in enumerate(out_tokens.tolist()) - ] - - full_labels = libnat.suggested_ed2_path( - in_tokens_list, out_tokens_list, padding_idx - ) - word_del_targets = [b[-1] for b in full_labels] - word_del_targets = [ - labels + [0 for _ in range(out_seq_len - len(labels))] - for labels in word_del_targets - ] - - # transform to tensor - word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device) - return word_del_targets - - if use_cuda: - return _get_del_targets_cuda(in_tokens, out_tokens, padding_idx) - return _get_del_targets_cpu(in_tokens, out_tokens, padding_idx) - - -def _apply_ins_masks( - in_tokens, in_scores, mask_ins_pred, padding_idx, unk_idx, eos_idx -): - - in_masks = in_tokens.ne(padding_idx) - in_lengths = in_masks.sum(1) - - # HACK: hacky way to shift all the paddings to eos first. 
- in_tokens.masked_fill_(~in_masks, eos_idx) - mask_ins_pred.masked_fill_(~in_masks[:, 1:], 0) - - out_lengths = in_lengths + mask_ins_pred.sum(1) - out_max_len = out_lengths.max() - out_masks = new_arange(out_lengths, out_max_len)[None, :] < out_lengths[:, None] - - reordering = (mask_ins_pred + in_masks[:, 1:].long()).cumsum(1) - out_tokens = ( - in_tokens.new_zeros(in_tokens.size(0), out_max_len) - .fill_(padding_idx) - .masked_fill_(out_masks, unk_idx) - ) - out_tokens[:, 0] = in_tokens[:, 0] - out_tokens.scatter_(1, reordering, in_tokens[:, 1:]) - - out_scores = None - if in_scores is not None: - in_scores.masked_fill_(~in_masks, 0) - out_scores = in_scores.new_zeros(*out_tokens.size()) - out_scores[:, 0] = in_scores[:, 0] - out_scores.scatter_(1, reordering, in_scores[:, 1:]) - - return out_tokens, out_scores - - -def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, unk_idx): - word_ins_masks = in_tokens.eq(unk_idx) - out_tokens = in_tokens.masked_scatter(word_ins_masks, word_ins_pred[word_ins_masks]) - - if in_scores is not None: - out_scores = in_scores.masked_scatter( - word_ins_masks, word_ins_scores[word_ins_masks] - ) - else: - out_scores = None - - return out_tokens, out_scores - - -def _apply_del_words( - in_tokens, in_scores, in_attn, word_del_pred, padding_idx, bos_idx, eos_idx -): - # apply deletion to a tensor - in_masks = in_tokens.ne(padding_idx) - bos_eos_masks = in_tokens.eq(bos_idx) | in_tokens.eq(eos_idx) - - max_len = in_tokens.size(1) - word_del_pred.masked_fill_(~in_masks, 1) - word_del_pred.masked_fill_(bos_eos_masks, 0) - - reordering = new_arange(in_tokens).masked_fill_(word_del_pred, max_len).sort(1)[1] - - out_tokens = in_tokens.masked_fill(word_del_pred, padding_idx).gather(1, reordering) - - out_scores = None - if in_scores is not None: - out_scores = in_scores.masked_fill(word_del_pred, 0).gather(1, reordering) - - out_attn = None - if in_attn is not None: - _mask = word_del_pred[:, :, None].expand_as(in_attn) - _reordering = reordering[:, :, None].expand_as(in_attn) - out_attn = in_attn.masked_fill(_mask, 0.0).gather(1, _reordering) - - return out_tokens, out_scores, out_attn - - -def _skip(x, mask): - """ - Getting sliced (dim=0) tensor by mask. Supporting tensor and list/dict of tensors. - """ - if isinstance(x, int): - return x - - if x is None: - return None - - if isinstance(x, torch.Tensor): - if x.size(0) == mask.size(0): - return x[mask] - elif x.size(1) == mask.size(0): - return x[:, mask] - - if isinstance(x, list): - return [_skip(x_i, mask) for x_i in x] - - if isinstance(x, dict): - return {k: _skip(v, mask) for k, v in x.items()} - - raise NotImplementedError - - -def _skip_encoder_out(encoder, encoder_out, mask): - if not mask.any(): - return encoder_out - else: - return encoder.reorder_encoder_out( - encoder_out, mask.nonzero(as_tuple=False).squeeze() - ) - - -def _fill(x, mask, y, padding_idx): - """ - Filling tensor x with y at masked positions (dim=0). 
- """ - if x is None: - return y - assert x.dim() == y.dim() and mask.size(0) == x.size(0) - assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2)) - n_selected = mask.sum() - assert n_selected == y.size(0) - - if n_selected == x.size(0): - return y - - if x.size(1) < y.size(1): - dims = [x.size(0), y.size(1) - x.size(1)] - if x.dim() == 3: - dims.append(x.size(2)) - x = torch.cat([x, x.new_zeros(*dims).fill_(padding_idx)], 1) - x[mask] = y - elif x.size(1) > y.size(1): - x[mask] = padding_idx - if x.dim() == 2: - x[mask, : y.size(1)] = y - else: - x[mask, : y.size(1), :] = y - else: - x[mask] = y - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nat_crf_transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nat_crf_transformer.py deleted file mode 100644 index d4b3cd931ceb077eb30db73df1d5d6cd714a86c2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nat_crf_transformer.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -from fairseq.models import register_model, register_model_architecture -from fairseq.models.nat import NATransformerModel, base_architecture -from fairseq.modules import DynamicCRF - - -@register_model("nacrf_transformer") -class NACRFTransformerModel(NATransformerModel): - def __init__(self, args, encoder, decoder): - super().__init__(args, encoder, decoder) - self.crf_layer = DynamicCRF( - num_embedding=len(self.tgt_dict), - low_rank=args.crf_lowrank_approx, - beam_size=args.crf_beam_approx, - ) - - @property - def allow_ensemble(self): - return False - - @staticmethod - def add_args(parser): - NATransformerModel.add_args(parser) - parser.add_argument( - "--crf-lowrank-approx", - type=int, - help="the dimension of low-rank approximation of transition", - ) - parser.add_argument( - "--crf-beam-approx", - type=int, - help="the beam size for apporixmating the normalizing factor", - ) - parser.add_argument( - "--word-ins-loss-factor", - type=float, - help="weights on NAT loss used to co-training with CRF loss.", - ) - - def forward( - self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs - ): - # encoding - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - - # length prediction - length_out = self.decoder.forward_length( - normalize=False, encoder_out=encoder_out - ) - length_tgt = self.decoder.forward_length_prediction( - length_out, encoder_out, tgt_tokens - ) - - # decoding - word_ins_out = self.decoder( - normalize=False, - prev_output_tokens=prev_output_tokens, - encoder_out=encoder_out, - ) - word_ins_tgt, word_ins_mask = tgt_tokens, tgt_tokens.ne(self.pad) - - # compute the log-likelihood of CRF - crf_nll = -self.crf_layer(word_ins_out, word_ins_tgt, word_ins_mask) - crf_nll = (crf_nll / word_ins_mask.type_as(crf_nll).sum(-1)).mean() - - return { - "word_ins": { - "out": word_ins_out, - "tgt": word_ins_tgt, - "mask": word_ins_mask, - "ls": self.args.label_smoothing, - "nll_loss": True, - "factor": self.args.word_ins_loss_factor, - }, - "word_crf": {"loss": crf_nll}, - "length": { - "out": length_out, - "tgt": length_tgt, - "factor": self.decoder.length_loss_factor, - }, - } - - def forward_decoder(self, decoder_out, encoder_out, 
decoding_format=None, **kwargs): - output_tokens = decoder_out.output_tokens - output_scores = decoder_out.output_scores - history = decoder_out.history - - # execute the decoder and get emission scores - output_masks = output_tokens.ne(self.pad) - word_ins_out = self.decoder( - normalize=False, prev_output_tokens=output_tokens, encoder_out=encoder_out - ) - - # run viterbi decoding through CRF - _scores, _tokens = self.crf_layer.forward_decoder(word_ins_out, output_masks) - output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) - output_scores.masked_scatter_(output_masks, _scores[output_masks]) - if history is not None: - history.append(output_tokens.clone()) - - return decoder_out._replace( - output_tokens=output_tokens, - output_scores=output_scores, - attn=None, - history=history, - ) - - -@register_model_architecture("nacrf_transformer", "nacrf_transformer") -def nacrf_base_architecture(args): - args.crf_lowrank_approx = getattr(args, "crf_lowrank_approx", 32) - args.crf_beam_approx = getattr(args, "crf_beam_approx", 64) - args.word_ins_loss_factor = getattr(args, "word_ins_loss_factor", 0.5) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nonautoregressive_ensembles.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nonautoregressive_ensembles.py deleted file mode 100644 index 46bb8aac4370815616704de928322880c929b59e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nonautoregressive_ensembles.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
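
The _skip / _skip_encoder_out / _fill helpers shown earlier (and reused by the ensemble wrappers that follow) implement a recurring pattern: run a decoding step only on the batch rows that still need work, then scatter the possibly widened result back into the full batch. A condensed sketch of that pattern, assuming the step never shortens a sequence; the function name and toy step below are hypothetical:

import torch

def run_on_active_rows(tokens, active_mask, step_fn, pad_idx):
    # tokens: (B, T) batch; active_mask: (B,) bool; step_fn: (b, T) -> (b, T2) with T2 >= T
    if not active_mask.any():                        # nothing to do for this batch
        return tokens
    updated = step_fn(tokens[active_mask])           # like _skip(...): slice out the active rows
    if updated.size(1) > tokens.size(1):             # like _fill(...): widen the batch before writing back
        extra = tokens.new_full((tokens.size(0), updated.size(1) - tokens.size(1)), pad_idx)
        tokens = torch.cat([tokens, extra], dim=1)
    tokens[active_mask] = updated
    return tokens

# toy usage: duplicate the last token of the active row only (1 = pad)
toks = torch.tensor([[0, 7, 9, 2], [0, 5, 2, 1]])
toks = run_on_active_rows(toks, torch.tensor([True, False]),
                          lambda x: torch.cat([x, x[:, -1:]], dim=1), pad_idx=1)
# -> [[0, 7, 9, 2, 2], [0, 5, 2, 1, 1]]
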
- -import math - -import torch -import torch.nn.functional as F -from fairseq.models.nat import ( - _apply_del_words, - _apply_ins_masks, - _apply_ins_words, - _fill, - _skip, - _skip_encoder_out, -) - - -class _EnsembleModelEncoder(object): - def __init__(self, models): - self.models = models - - def reorder_encoder_out(self, encoder_outs, new_order): - encoder_outs = [ - model.encoder.reorder_encoder_out(encoder_out, new_order) - for model, encoder_out in zip(self.models, encoder_outs) - ] - return encoder_outs - - -class BasicEnsembleModel(torch.nn.Module): - """A wrapper around an ensemble of models.""" - - def __init__(self, models): - super().__init__() - self.models = torch.nn.ModuleList(models) - self.bos = self.models[0].decoder.dictionary.bos() - self.eos = self.models[0].decoder.dictionary.eos() - self.pad = self.models[0].decoder.dictionary.pad() - self.unk = self.models[0].decoder.dictionary.unk() - self.encoder = _EnsembleModelEncoder(self.models) - - def has_encoder(self): - return hasattr(self.models[0], "encoder") - - def max_decoder_positions(self): - return min(m.max_decoder_positions() for m in self.models) - - @torch.no_grad() - def forward_encoder(self, encoder_input): - if not self.has_encoder(): - return None - return [model.forward_encoder(encoder_input) for model in self.models] - - @torch.no_grad() - def forward_decoder(self, *inputs): - raise NotImplementedError - - def initialize_output_tokens(self, *inputs): - raise NotImplementedError - - -class EnsembleLevT(BasicEnsembleModel): - """A wrapper around an ensemble of models.""" - - def __init__(self, models): - super().__init__(models) - - @torch.no_grad() - def forward_decoder( - self, decoder_out, encoder_outs, eos_penalty=0.0, max_ratio=None, **kwargs - ): - # LevT ensembling - # A pipeline of three steps: deletion, placeholder, and word insertion. - # We need to average scores in each step in a pipeline way because of dependence. 
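The per-step score averaging mentioned in the comment above happens in log space: each model's emission scores are turned into log-probabilities with `log_softmax`, and the ensemble average of probabilities is then `logsumexp(...) - log(K)`. A minimal, self-contained check of that identity (toy shapes, random scores):

```python
import math
import torch
import torch.nn.functional as F

# K = 3 models' emission scores for a (batch=2, length=5, vocab=7) toy setup.
scores = [torch.randn(2, 5, 7) for _ in range(3)]
log_probs = [F.log_softmax(s, dim=2) for s in scores]

# Average in log space, as done by the deletion / placeholder-insertion /
# word-insertion steps that follow.
avg = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log(len(log_probs))

# Equivalent to averaging the probabilities directly.
ref = torch.stack([lp.exp() for lp in log_probs], dim=0).mean(0).log()
assert torch.allclose(avg, ref, atol=1e-6)
```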
- # deletion - output_tokens = decoder_out.output_tokens - output_scores = decoder_out.output_scores - attn = decoder_out.attn - - bsz = output_tokens.size(0) - if max_ratio is None: - max_lens = output_tokens.new().fill_(255) - else: - if encoder_outs[0].encoder_padding_mask is None: - src_lens = ( - encoder_outs[0] - .encoder_out.new(bsz) - .fill_(encoder_outs[0].encoder_out.size(1)) - ) - else: - src_lens = (~encoder_outs[0].encoder_padding_mask).sum(1) - max_lens = (src_lens * max_ratio).clamp(min=10).long() - - # delete words - # do not delete tokens if it is - can_del_word = output_tokens.ne(self.pad).sum(1) > 2 - if can_del_word.sum() != 0: # we cannot delete, skip - output_tokens, output_scores, attn = self.forward_word_del( - encoder_outs, - output_tokens, - output_scores, - attn, - can_del_word, - ) - - # insert placeholders - can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens - if can_ins_mask.sum() != 0: - output_tokens, output_scores = self.forward_mask_ins( - encoder_outs, - output_tokens, - output_scores, - can_ins_mask, - eos_penalty, - max_lens, - ) - - # insert words - can_ins_word = output_tokens.eq(self.unk).sum(1) > 0 - if can_ins_word.sum() != 0: - output_tokens, output_scores, attn = self.forward_word_ins( - encoder_outs, - output_tokens, - output_scores, - attn, - can_ins_word, - ) - - # delete some unnecessary paddings - cut_off = output_tokens.ne(self.pad).sum(1).max() - output_tokens = output_tokens[:, :cut_off] - output_scores = output_scores[:, :cut_off] - attn = None if attn is None else attn[:, :cut_off, :] - return decoder_out._replace( - output_tokens=output_tokens, - output_scores=output_scores, - attn=attn, - history=None, - ) - - def forward_word_del( - self, encoder_outs, output_tokens, output_scores, attn, can_del_word - ): - word_del_score_avg = [] - word_del_attn_avg = [] - for model, encoder_out in zip(self.models, encoder_outs): - word_del_out, word_del_attn = model.decoder.forward_word_del( - _skip(output_tokens, can_del_word), - _skip_encoder_out(model.encoder, encoder_out, can_del_word), - ) - word_del_score = F.log_softmax(word_del_out, 2) - word_del_score_avg.append(word_del_score) - word_del_attn_avg.append(word_del_attn) - word_del_score_avg = torch.logsumexp( - torch.stack(word_del_score_avg, dim=0), dim=0 - ) - math.log(len(self.models)) - word_del_pred = word_del_score_avg.max(-1)[1].bool() - if word_del_attn_avg[0] is not None: - word_del_attn_avg = torch.stack(word_del_attn_avg, dim=0) / len(self.models) - else: - word_del_attn_avg = None - - _tokens, _scores, _attn = _apply_del_words( - output_tokens[can_del_word], - output_scores[can_del_word], - word_del_attn_avg, - word_del_pred, - self.pad, - self.bos, - self.eos, - ) - output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad) - output_scores = _fill(output_scores, can_del_word, _scores, 0) - attn = _fill(attn, can_del_word, _attn, 0.0) - return output_tokens, output_scores, attn - - def forward_mask_ins( - self, - encoder_outs, - output_tokens, - output_scores, - can_ins_mask, - eos_penalty, - max_lens, - ): - mask_ins_score_avg = [] - for model, encoder_out in zip(self.models, encoder_outs): - mask_ins_out, _ = model.decoder.forward_mask_ins( - _skip(output_tokens, can_ins_mask), - _skip_encoder_out(model.encoder, encoder_out, can_ins_mask), - ) - mask_ins_score = F.log_softmax(mask_ins_out, 2) - if eos_penalty > 0.0: - mask_ins_score[:, :, 0] -= eos_penalty - mask_ins_score_avg.append(mask_ins_score) - mask_ins_score_avg = torch.logsumexp( - 
torch.stack(mask_ins_score_avg, dim=0), dim=0 - ) - math.log(len(self.models)) - mask_ins_pred = mask_ins_score_avg.max(-1)[1] - mask_ins_pred = torch.min( - mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred) - ) - _tokens, _scores = _apply_ins_masks( - output_tokens[can_ins_mask], - output_scores[can_ins_mask], - mask_ins_pred, - self.pad, - self.unk, - self.eos, - ) - output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad) - output_scores = _fill(output_scores, can_ins_mask, _scores, 0) - return output_tokens, output_scores - - def forward_word_ins( - self, encoder_outs, output_tokens, output_scores, attn, can_ins_word - ): - word_ins_score_avg = [] - word_ins_attn_avg = [] - for model, encoder_out in zip(self.models, encoder_outs): - word_ins_out, word_ins_attn = model.decoder.forward_word_ins( - _skip(output_tokens, can_ins_word), - _skip_encoder_out(model.encoder, encoder_out, can_ins_word), - ) - word_ins_score = F.log_softmax(word_ins_out, 2) - word_ins_score_avg.append(word_ins_score) - word_ins_attn_avg.append(word_ins_attn) - word_ins_score_avg = torch.logsumexp( - torch.stack(word_ins_score_avg, dim=0), dim=0 - ) - math.log(len(self.models)) - if word_ins_attn_avg[0] is not None: - word_ins_attn_avg = torch.stack(word_ins_attn_avg, dim=0) / len(self.models) - else: - word_ins_attn_avg = None - word_ins_score_max, word_ins_pred = word_ins_score_avg.max(-1) - - _tokens, _scores = _apply_ins_words( - output_tokens[can_ins_word], - output_scores[can_ins_word], - word_ins_pred, - word_ins_score_max, - self.unk, - ) - - output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad) - output_scores = _fill(output_scores, can_ins_word, _scores, 0) - attn = _fill(attn, can_ins_word, word_ins_attn, 0.0) - return output_tokens, output_scores, attn - - def initialize_output_tokens(self, encoder_outs, src_tokens): - # LevT doesn't do length prediction. - return self.models[0].initialize_output_tokens(encoder_outs[0], src_tokens) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nonautoregressive_transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nonautoregressive_transformer.py deleted file mode 100644 index 735297fc290786a73617352d0c47ed72edef8e84..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/nat/nonautoregressive_transformer.py +++ /dev/null @@ -1,440 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
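One detail of `forward_mask_ins()` above worth spelling out: the predicted number of placeholders per insertion slot is clipped so that no hypothesis can grow beyond its per-sentence length budget. A toy version of that broadcasted `torch.min`, with invented numbers (in the model the budgets come from `src_lens * max_ratio`):

```python
import torch

# Predicted placeholder insertions per slot for two sentences (made-up values).
mask_ins_pred = torch.tensor([[3, 0, 5],
                              [1, 4, 2]])
# Per-sentence length budgets (made-up values).
max_lens = torch.tensor([2, 10])

capped = torch.min(mask_ins_pred, max_lens[:, None].expand_as(mask_ins_pred))
print(capped)  # tensor([[2, 0, 2],
               #         [1, 4, 2]])
```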
- -import torch -import torch.nn.functional as F -from fairseq import utils -from fairseq.iterative_refinement_generator import DecoderOut -from fairseq.models import register_model, register_model_architecture -from fairseq.models.nat import FairseqNATDecoder, FairseqNATModel, ensemble_decoder -from fairseq.models.transformer import Embedding -from fairseq.modules.transformer_sentence_encoder import init_bert_params - - -def _mean_pooling(enc_feats, src_masks): - # enc_feats: T x B x C - # src_masks: B x T or None - if src_masks is None: - enc_feats = enc_feats.mean(0) - else: - src_masks = (~src_masks).transpose(0, 1).type_as(enc_feats) - enc_feats = ( - (enc_feats / src_masks.sum(0)[None, :, None]) * src_masks[:, :, None] - ).sum(0) - return enc_feats - - -def _argmax(x, dim): - return (x == x.max(dim, keepdim=True)[0]).type_as(x) - - -def _uniform_assignment(src_lens, trg_lens): - max_trg_len = trg_lens.max() - steps = (src_lens.float() - 1) / (trg_lens.float() - 1) # step-size - # max_trg_len - index_t = utils.new_arange(trg_lens, max_trg_len).float() - index_t = steps[:, None] * index_t[None, :] # batch_size X max_trg_len - index_t = torch.round(index_t).long().detach() - return index_t - - -@register_model("nonautoregressive_transformer") -class NATransformerModel(FairseqNATModel): - @property - def allow_length_beam(self): - return True - - @staticmethod - def add_args(parser): - FairseqNATModel.add_args(parser) - - # length prediction - parser.add_argument( - "--src-embedding-copy", - action="store_true", - help="copy encoder word embeddings as the initial input of the decoder", - ) - parser.add_argument( - "--pred-length-offset", - action="store_true", - help="predicting the length difference between the target and source sentences", - ) - parser.add_argument( - "--sg-length-pred", - action="store_true", - help="stop the gradients back-propagated from the length predictor", - ) - parser.add_argument( - "--length-loss-factor", - type=float, - help="weights on the length prediction loss", - ) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - decoder = NATransformerDecoder(args, tgt_dict, embed_tokens) - if getattr(args, "apply_bert_init", False): - decoder.apply(init_bert_params) - return decoder - - def forward( - self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs - ): - # encoding - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - - # length prediction - length_out = self.decoder.forward_length( - normalize=False, encoder_out=encoder_out - ) - length_tgt = self.decoder.forward_length_prediction( - length_out, encoder_out, tgt_tokens - ) - - # decoding - word_ins_out = self.decoder( - normalize=False, - prev_output_tokens=prev_output_tokens, - encoder_out=encoder_out, - ) - - return { - "word_ins": { - "out": word_ins_out, - "tgt": tgt_tokens, - "mask": tgt_tokens.ne(self.pad), - "ls": self.args.label_smoothing, - "nll_loss": True, - }, - "length": { - "out": length_out, - "tgt": length_tgt, - "factor": self.decoder.length_loss_factor, - }, - } - - def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs): - step = decoder_out.step - output_tokens = decoder_out.output_tokens - output_scores = decoder_out.output_scores - history = decoder_out.history - - # execute the decoder - output_masks = output_tokens.ne(self.pad) - _scores, _tokens = self.decoder( - normalize=True, - prev_output_tokens=output_tokens, - encoder_out=encoder_out, - step=step, - ).max(-1) - - 
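The `_uniform_assignment()` helper defined at the top of this file maps each target position to a source position by uniformly stretching or shrinking the source sequence; `utils.new_arange` is essentially an `arange` created on the same device as its argument. A standalone re-implementation with two worked examples (inputs are assumed toy lengths):

```python
import torch

def uniform_assignment(src_lens, trg_lens):
    # Target position i copies source position round(i * (S - 1) / (T - 1)).
    max_trg_len = int(trg_lens.max())
    steps = (src_lens.float() - 1) / (trg_lens.float() - 1)   # per-sentence step size
    index_t = torch.arange(max_trg_len, dtype=torch.float)
    return torch.round(steps[:, None] * index_t[None, :]).long()

print(uniform_assignment(torch.tensor([5]), torch.tensor([3])))  # tensor([[0, 2, 4]])
print(uniform_assignment(torch.tensor([3]), torch.tensor([5])))  # tensor([[0, 0, 1, 2, 2]])
```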
output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) - output_scores.masked_scatter_(output_masks, _scores[output_masks]) - if history is not None: - history.append(output_tokens.clone()) - - return decoder_out._replace( - output_tokens=output_tokens, - output_scores=output_scores, - attn=None, - history=history, - ) - - def initialize_output_tokens(self, encoder_out, src_tokens): - # length prediction - length_tgt = self.decoder.forward_length_prediction( - self.decoder.forward_length(normalize=True, encoder_out=encoder_out), - encoder_out=encoder_out, - ) - - max_length = length_tgt.clamp_(min=2).max() - idx_length = utils.new_arange(src_tokens, max_length) - - initial_output_tokens = src_tokens.new_zeros( - src_tokens.size(0), max_length - ).fill_(self.pad) - initial_output_tokens.masked_fill_( - idx_length[None, :] < length_tgt[:, None], self.unk - ) - initial_output_tokens[:, 0] = self.bos - initial_output_tokens.scatter_(1, length_tgt[:, None] - 1, self.eos) - - initial_output_scores = initial_output_tokens.new_zeros( - *initial_output_tokens.size() - ).type_as(encoder_out.encoder_out) - - return DecoderOut( - output_tokens=initial_output_tokens, - output_scores=initial_output_scores, - attn=None, - step=0, - max_step=0, - history=None, - ) - - def regenerate_length_beam(self, decoder_out, beam_size): - output_tokens = decoder_out.output_tokens - length_tgt = output_tokens.ne(self.pad).sum(1) - length_tgt = ( - length_tgt[:, None] - + utils.new_arange(length_tgt, 1, beam_size) - - beam_size // 2 - ) - length_tgt = length_tgt.view(-1).clamp_(min=2) - max_length = length_tgt.max() - idx_length = utils.new_arange(length_tgt, max_length) - - initial_output_tokens = output_tokens.new_zeros( - length_tgt.size(0), max_length - ).fill_(self.pad) - initial_output_tokens.masked_fill_( - idx_length[None, :] < length_tgt[:, None], self.unk - ) - initial_output_tokens[:, 0] = self.bos - initial_output_tokens.scatter_(1, length_tgt[:, None] - 1, self.eos) - - initial_output_scores = initial_output_tokens.new_zeros( - *initial_output_tokens.size() - ).type_as(decoder_out.output_scores) - - return decoder_out._replace( - output_tokens=initial_output_tokens, output_scores=initial_output_scores - ) - - -class NATransformerDecoder(FairseqNATDecoder): - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): - super().__init__( - args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn - ) - self.dictionary = dictionary - self.bos = dictionary.bos() - self.unk = dictionary.unk() - self.eos = dictionary.eos() - - self.encoder_embed_dim = args.encoder_embed_dim - self.sg_length_pred = getattr(args, "sg_length_pred", False) - self.pred_length_offset = getattr(args, "pred_length_offset", False) - self.length_loss_factor = getattr(args, "length_loss_factor", 0.1) - self.src_embedding_copy = getattr(args, "src_embedding_copy", False) - self.embed_length = Embedding(256, self.encoder_embed_dim, None) - - @ensemble_decoder - def forward(self, normalize, encoder_out, prev_output_tokens, step=0, **unused): - features, _ = self.extract_features( - prev_output_tokens, - encoder_out=encoder_out, - embedding_copy=(step == 0) & self.src_embedding_copy, - ) - decoder_out = self.output_layer(features) - return F.log_softmax(decoder_out, -1) if normalize else decoder_out - - @ensemble_decoder - def forward_length(self, normalize, encoder_out): - enc_feats = encoder_out.encoder_out # T x B x C - src_masks = encoder_out.encoder_padding_mask # B x T or None - enc_feats = 
_mean_pooling(enc_feats, src_masks) - if self.sg_length_pred: - enc_feats = enc_feats.detach() - length_out = F.linear(enc_feats, self.embed_length.weight) - return F.log_softmax(length_out, -1) if normalize else length_out - - def extract_features( - self, - prev_output_tokens, - encoder_out=None, - early_exit=None, - embedding_copy=False, - **unused - ): - """ - Similar to *forward* but only return features. - - Inputs: - prev_output_tokens: Tensor(B, T) - encoder_out: a dictionary of hidden states and masks - - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - a dictionary with any model-specific outputs - the LevenshteinTransformer decoder has full-attention to all generated tokens - """ - # embedding - if embedding_copy: - src_embd = encoder_out.encoder_embedding - src_mask = encoder_out.encoder_padding_mask - src_mask = ( - ~src_mask - if src_mask is not None - else prev_output_tokens.new_ones(*src_embd.size()[:2]).bool() - ) - - x, decoder_padding_mask = self.forward_embedding( - prev_output_tokens, - self.forward_copying_source( - src_embd, src_mask, prev_output_tokens.ne(self.padding_idx) - ), - ) - - else: - - x, decoder_padding_mask = self.forward_embedding(prev_output_tokens) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - attn = None - inner_states = [x] - - # decoder layers - for i, layer in enumerate(self.layers): - - # early exit from the decoder. - if (early_exit is not None) and (i >= early_exit): - break - - x, attn, _ = layer( - x, - encoder_out.encoder_out if encoder_out is not None else None, - encoder_out.encoder_padding_mask if encoder_out is not None else None, - self_attn_mask=None, - self_attn_padding_mask=decoder_padding_mask, - ) - inner_states.append(x) - - if self.layer_norm: - x = self.layer_norm(x) - - # T x B x C -> B x T x C - x = x.transpose(0, 1) - - if self.project_out_dim is not None: - x = self.project_out_dim(x) - - return x, {"attn": attn, "inner_states": inner_states} - - def forward_embedding(self, prev_output_tokens, states=None): - # embed positions - positions = ( - self.embed_positions(prev_output_tokens) - if self.embed_positions is not None - else None - ) - - # embed tokens and positions - if states is None: - x = self.embed_scale * self.embed_tokens(prev_output_tokens) - if self.project_in_dim is not None: - x = self.project_in_dim(x) - else: - x = states - - if positions is not None: - x += positions - x = self.dropout_module(x) - decoder_padding_mask = prev_output_tokens.eq(self.padding_idx) - return x, decoder_padding_mask - - def forward_copying_source(self, src_embeds, src_masks, tgt_masks): - length_sources = src_masks.sum(1) - length_targets = tgt_masks.sum(1) - mapped_inputs = _uniform_assignment(length_sources, length_targets).masked_fill( - ~tgt_masks, 0 - ) - copied_embedding = torch.gather( - src_embeds, - 1, - mapped_inputs.unsqueeze(-1).expand( - *mapped_inputs.size(), src_embeds.size(-1) - ), - ) - return copied_embedding - - def forward_length_prediction(self, length_out, encoder_out, tgt_tokens=None): - enc_feats = encoder_out.encoder_out # T x B x C - src_masks = encoder_out.encoder_padding_mask # B x T or None - if self.pred_length_offset: - if src_masks is None: - src_lengs = enc_feats.new_ones(enc_feats.size(1)).fill_( - enc_feats.size(0) - ) - else: - src_lengs = (~src_masks).transpose(0, 1).type_as(enc_feats).sum(0) - src_lengs = src_lengs.long() - - if tgt_tokens is not None: - # obtain the length target - tgt_lengs = tgt_tokens.ne(self.padding_idx).sum(1).long() - if 
self.pred_length_offset: - length_tgt = tgt_lengs - src_lengs + 128 - else: - length_tgt = tgt_lengs - length_tgt = length_tgt.clamp(min=0, max=255) - - else: - # predict the length target (greedy for now) - # TODO: implementing length-beam - pred_lengs = length_out.max(-1)[1] - if self.pred_length_offset: - length_tgt = pred_lengs - 128 + src_lengs - else: - length_tgt = pred_lengs - - return length_tgt - - -@register_model_architecture( - "nonautoregressive_transformer", "nonautoregressive_transformer" -) -def base_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.activation_fn = getattr(args, "activation_fn", "relu") - args.dropout = getattr(args, "dropout", 0.1) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.apply_bert_init = getattr(args, "apply_bert_init", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - # --- special arguments --- - args.sg_length_pred = getattr(args, "sg_length_pred", False) - args.pred_length_offset = getattr(args, "pred_length_offset", False) - args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) - args.src_embedding_copy = getattr(args, "src_embedding_copy", False) - - -@register_model_architecture( - "nonautoregressive_transformer", "nonautoregressive_transformer_wmt_en_de" -) -def nonautoregressive_transformer_wmt_en_de(args): - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/__init__.py deleted file mode 100644 index 56579e591566e014d99ed5a283ee7135257f054c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/__init__.py +++ 
/dev/null @@ -1,9 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .hub_interface import * # noqa -from .model import * # noqa -from .model_camembert import * # noqa -from .model_xlmr import * # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/alignment_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/alignment_utils.py deleted file mode 100644 index ccc7f74cb94d5b8baa2d4e9dfd44f653d47ee43e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/alignment_utils.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from collections import Counter -from typing import List - -import torch - - -def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]): - """ - Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy). - - Args: - roberta (RobertaHubInterface): RoBERTa instance - bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)` - other_tokens (List[str]): other tokens of shape `(T_words)` - - Returns: - List[str]: mapping from *other_tokens* to corresponding *bpe_tokens*. - """ - assert bpe_tokens.dim() == 1 - assert bpe_tokens[0] == 0 - - def clean(text): - return text.strip() - - # remove whitespaces to simplify alignment - bpe_tokens = [roberta.task.source_dictionary.string([x]) for x in bpe_tokens] - bpe_tokens = [ - clean(roberta.bpe.decode(x) if x not in {"", ""} else x) for x in bpe_tokens - ] - other_tokens = [clean(str(o)) for o in other_tokens] - - # strip leading - bpe_tokens = bpe_tokens[1:] - assert "".join(bpe_tokens) == "".join(other_tokens) - - # create alignment from every word to a list of BPE tokens - alignment = [] - bpe_toks = filter(lambda item: item[1] != "", enumerate(bpe_tokens, start=1)) - j, bpe_tok = next(bpe_toks) - for other_tok in other_tokens: - bpe_indices = [] - while True: - if other_tok.startswith(bpe_tok): - bpe_indices.append(j) - other_tok = other_tok[len(bpe_tok) :] - try: - j, bpe_tok = next(bpe_toks) - except StopIteration: - j, bpe_tok = None, None - elif bpe_tok.startswith(other_tok): - # other_tok spans multiple BPE tokens - bpe_indices.append(j) - bpe_tok = bpe_tok[len(other_tok) :] - other_tok = "" - else: - raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok)) - if other_tok == "": - break - assert len(bpe_indices) > 0 - alignment.append(bpe_indices) - assert len(alignment) == len(other_tokens) - - return alignment - - -def align_features_to_words(roberta, features, alignment): - """ - Align given features to words. - - Args: - roberta (RobertaHubInterface): RoBERTa instance - features (torch.Tensor): features to align of shape `(T_bpe x C)` - alignment: alignment between BPE tokens and words returned by - func:`align_bpe_to_words`. 
- """ - assert features.dim() == 2 - - bpe_counts = Counter(j for bpe_indices in alignment for j in bpe_indices) - assert bpe_counts[0] == 0 # shouldn't be aligned - denom = features.new([bpe_counts.get(j, 1) for j in range(len(features))]) - weighted_features = features / denom.unsqueeze(-1) - - output = [weighted_features[0]] - largest_j = -1 - for bpe_indices in alignment: - output.append(weighted_features[bpe_indices].sum(dim=0)) - largest_j = max(largest_j, *bpe_indices) - for j in range(largest_j + 1, len(features)): - output.append(weighted_features[j]) - output = torch.stack(output) - assert torch.all(torch.abs(output.sum(dim=0) - features.sum(dim=0)) < 1e-4) - return output - - -def spacy_nlp(): - if getattr(spacy_nlp, "_nlp", None) is None: - try: - from spacy.lang.en import English - - spacy_nlp._nlp = English() - except ImportError: - raise ImportError("Please install spacy with: pip install spacy") - return spacy_nlp._nlp - - -def spacy_tokenizer(): - if getattr(spacy_tokenizer, "_tokenizer", None) is None: - try: - nlp = spacy_nlp() - spacy_tokenizer._tokenizer = nlp.Defaults.create_tokenizer(nlp) - except ImportError: - raise ImportError("Please install spacy with: pip install spacy") - return spacy_tokenizer._tokenizer diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/hub_interface.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/hub_interface.py deleted file mode 100644 index 526823bd1ffd27269493c8807cb248d49997bc51..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/hub_interface.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.data import encoders - - -class RobertaHubInterface(nn.Module): - """A simple PyTorch Hub interface to RoBERTa. - - Usage: https://github.com/pytorch/fairseq/tree/master/examples/roberta - """ - - def __init__(self, args, task, model): - super().__init__() - self.args = args - self.task = task - self.model = model - - self.bpe = encoders.build_bpe(args) - - # this is useful for determining the device - self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) - - @property - def device(self): - return self._float_tensor.device - - def encode( - self, sentence: str, *addl_sentences, no_separator=False - ) -> torch.LongTensor: - """ - BPE-encode a sentence (or multiple sentences). - - Every sequence begins with a beginning-of-sentence (``) symbol. - Every sentence ends with an end-of-sentence (``) and we use an - extra end-of-sentence (``) as a separator. - - Example (single sentence): ` a b c ` - Example (sentence pair): ` d e f 1 2 3 ` - - The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE - requires leading spaces. 
For example:: - - >>> roberta.encode('Hello world').tolist() - [0, 31414, 232, 2] - >>> roberta.encode(' world').tolist() - [0, 232, 2] - >>> roberta.encode('world').tolist() - [0, 8331, 2] - """ - bpe_sentence = " " + self.bpe.encode(sentence) + " " - for s in addl_sentences: - bpe_sentence += " " if not no_separator else "" - bpe_sentence += " " + self.bpe.encode(s) + " " - tokens = self.task.source_dictionary.encode_line( - bpe_sentence, append_eos=False, add_if_not_exist=False - ) - return tokens.long() - - def decode(self, tokens: torch.LongTensor): - assert tokens.dim() == 1 - tokens = tokens.numpy() - if tokens[0] == self.task.source_dictionary.bos(): - tokens = tokens[1:] # remove - eos_mask = tokens == self.task.source_dictionary.eos() - doc_mask = eos_mask[1:] & eos_mask[:-1] - sentences = np.split(tokens, doc_mask.nonzero()[0] + 1) - sentences = [ - self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences - ] - if len(sentences) == 1: - return sentences[0] - return sentences - - def extract_features( - self, tokens: torch.LongTensor, return_all_hiddens: bool = False - ) -> torch.Tensor: - if tokens.dim() == 1: - tokens = tokens.unsqueeze(0) - if tokens.size(-1) > self.model.max_positions(): - raise ValueError( - "tokens exceeds maximum length: {} > {}".format( - tokens.size(-1), self.model.max_positions() - ) - ) - features, extra = self.model( - tokens.to(device=self.device), - features_only=True, - return_all_hiddens=return_all_hiddens, - ) - if return_all_hiddens: - # convert from T x B x C -> B x T x C - inner_states = extra["inner_states"] - return [inner_state.transpose(0, 1) for inner_state in inner_states] - else: - return features # just the last layer's features - - def register_classification_head( - self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs - ): - self.model.register_classification_head( - name, num_classes=num_classes, embedding_size=embedding_size, **kwargs - ) - - def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False): - features = self.extract_features(tokens.to(device=self.device)) - logits = self.model.classification_heads[head](features) - if return_logits: - return logits - return F.log_softmax(logits, dim=-1) - - def extract_features_aligned_to_words( - self, sentence: str, return_all_hiddens: bool = False - ) -> torch.Tensor: - """Extract RoBERTa features, aligned to spaCy's word-level tokenizer.""" - from fairseq.models.roberta import alignment_utils - from spacy.tokens import Doc - - nlp = alignment_utils.spacy_nlp() - tokenizer = alignment_utils.spacy_tokenizer() - - # tokenize both with GPT-2 BPE and spaCy - bpe_toks = self.encode(sentence) - spacy_toks = tokenizer(sentence) - spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence)] - alignment = alignment_utils.align_bpe_to_words(self, bpe_toks, spacy_toks_ws) - - # extract features and align them - features = self.extract_features( - bpe_toks, return_all_hiddens=return_all_hiddens - ) - features = features.squeeze(0) - aligned_feats = alignment_utils.align_features_to_words( - self, features, alignment - ) - - # wrap in spaCy Doc - doc = Doc( - nlp.vocab, - words=[""] + [x.text for x in spacy_toks] + [""], - spaces=[True] - + [x.endswith(" ") for x in spacy_toks_ws[:-1]] - + [True, False], - ) - assert len(doc) == aligned_feats.size(0) - doc.user_token_hooks["vector"] = lambda token: aligned_feats[token.i] - return doc - - def fill_mask(self, masked_input: str, topk: int = 5): - masked_token = "" - assert ( - 
masked_token in masked_input and masked_input.count(masked_token) == 1 - ), "Please add one {0} token for the input, eg: 'He is a {0} guy'".format( - masked_token - ) - - text_spans = masked_input.split(masked_token) - text_spans_bpe = ( - (" {0} ".format(masked_token)) - .join([self.bpe.encode(text_span.rstrip()) for text_span in text_spans]) - .strip() - ) - tokens = self.task.source_dictionary.encode_line( - " " + text_spans_bpe + " ", - append_eos=False, - add_if_not_exist=False, - ) - - masked_index = (tokens == self.task.mask_idx).nonzero() - if tokens.dim() == 1: - tokens = tokens.unsqueeze(0) - - with utils.model_eval(self.model): - features, extra = self.model( - tokens.long().to(device=self.device), - features_only=False, - return_all_hiddens=False, - ) - logits = features[0, masked_index, :].squeeze() - prob = logits.softmax(dim=0) - values, index = prob.topk(k=topk, dim=0) - topk_predicted_token_bpe = self.task.source_dictionary.string(index) - - topk_filled_outputs = [] - for index, predicted_token_bpe in enumerate( - topk_predicted_token_bpe.split(" ") - ): - predicted_token = self.bpe.decode(predicted_token_bpe) - # Quick hack to fix https://github.com/pytorch/fairseq/issues/1306 - if predicted_token_bpe.startswith("\u2581"): - predicted_token = " " + predicted_token - if " {0}".format(masked_token) in masked_input: - topk_filled_outputs.append( - ( - masked_input.replace( - " {0}".format(masked_token), predicted_token - ), - values[index].item(), - predicted_token, - ) - ) - else: - topk_filled_outputs.append( - ( - masked_input.replace(masked_token, predicted_token), - values[index].item(), - predicted_token, - ) - ) - return topk_filled_outputs - - def disambiguate_pronoun(self, sentence: str) -> bool: - """ - Usage:: - - >>> disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.') - True - - >>> disambiguate_pronoun('The trophy would not fit in the brown suitcase because [it] was too big.') - 'The trophy' - """ - assert hasattr( - self.task, "disambiguate_pronoun" - ), "roberta.disambiguate_pronoun() requires a model trained with the WSC task." - with utils.model_eval(self.model): - return self.task.disambiguate_pronoun( - self.model, sentence, use_cuda=self.device.type == "cuda" - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model.py deleted file mode 100644 index d56496f803d2cd66e102b069358d73166a7e482d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model.py +++ /dev/null @@ -1,524 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -RoBERTa: A Robustly Optimized BERT Pretraining Approach. 
-""" - -import logging - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderModel, - register_model, - register_model_architecture, -) -from fairseq.modules import LayerNorm, TransformerSentenceEncoder -from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ -from fairseq.modules.transformer_sentence_encoder import init_bert_params - -from .hub_interface import RobertaHubInterface - - -logger = logging.getLogger(__name__) - - -@register_model("roberta") -class RobertaModel(FairseqEncoderModel): - @classmethod - def hub_models(cls): - return { - "roberta.base": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz", - "roberta.large": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz", - "roberta.large.mnli": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz", - "roberta.large.wsc": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz", - } - - def __init__(self, args, encoder): - super().__init__(encoder) - self.args = args - - # We follow BERT's random weight initialization - self.apply(init_bert_params) - - self.classification_heads = nn.ModuleDict() - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - parser.add_argument( - "--encoder-layers", type=int, metavar="L", help="num encoder layers" - ) - parser.add_argument( - "--encoder-embed-dim", - type=int, - metavar="H", - help="encoder embedding dimension", - ) - parser.add_argument( - "--encoder-ffn-embed-dim", - type=int, - metavar="F", - help="encoder embedding dimension for FFN", - ) - parser.add_argument( - "--encoder-attention-heads", - type=int, - metavar="A", - help="num encoder attention heads", - ) - parser.add_argument( - "--activation-fn", - choices=utils.get_available_activation_fns(), - help="activation function to use", - ) - parser.add_argument( - "--pooler-activation-fn", - choices=utils.get_available_activation_fns(), - help="activation function to use for pooler layer", - ) - parser.add_argument( - "--encoder-normalize-before", - action="store_true", - help="apply layernorm before each encoder block", - ) - parser.add_argument( - "--dropout", type=float, metavar="D", help="dropout probability" - ) - parser.add_argument( - "--attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights", - ) - parser.add_argument( - "--activation-dropout", - type=float, - metavar="D", - help="dropout probability after activation in FFN", - ) - parser.add_argument( - "--pooler-dropout", - type=float, - metavar="D", - help="dropout probability in the masked_lm pooler layers", - ) - parser.add_argument( - "--max-positions", type=int, help="number of positional embeddings to learn" - ) - parser.add_argument( - "--load-checkpoint-heads", - action="store_true", - help="(re-)register and load heads when loading checkpoints", - ) - # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) - parser.add_argument( - "--encoder-layerdrop", - type=float, - metavar="D", - default=0, - help="LayerDrop probability for encoder", - ) - parser.add_argument( - "--encoder-layers-to-keep", - default=None, - help="which layers to *keep* when pruning as a comma-separated list", - ) - # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) - parser.add_argument( - "--quant-noise-pq", - type=float, - 
metavar="D", - default=0, - help="iterative PQ quantization noise at training time", - ) - parser.add_argument( - "--quant-noise-pq-block-size", - type=int, - metavar="D", - default=8, - help="block size of quantization noise at training time", - ) - parser.add_argument( - "--quant-noise-scalar", - type=float, - metavar="D", - default=0, - help="scalar quantization noise and scalar quantization at training time", - ) - parser.add_argument( - "--untie-weights-roberta", - action="store_true", - help="Untie weights between embeddings and classifiers in RoBERTa", - ) - parser.add_argument( - "--spectral-norm-classification-head", - action="store_true", - default=False, - help="Apply spectral normalization on the classification head", - ) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present - base_architecture(args) - - if not hasattr(args, "max_positions"): - args.max_positions = args.tokens_per_sample - - encoder = RobertaEncoder(args, task.source_dictionary) - return cls(args, encoder) - - def forward( - self, - src_tokens, - features_only=False, - return_all_hiddens=False, - classification_head_name=None, - **kwargs - ): - if classification_head_name is not None: - features_only = True - - x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs) - - if classification_head_name is not None: - x = self.classification_heads[classification_head_name](x) - return x, extra - - def get_normalized_probs(self, net_output, log_probs, sample=None): - """Get normalized probabilities (or log probs) from a net's output.""" - logits = net_output[0].float() - if log_probs: - return F.log_softmax(logits, dim=-1) - else: - return F.softmax(logits, dim=-1) - - def register_classification_head( - self, name, num_classes=None, inner_dim=None, **kwargs - ): - """Register a classification head.""" - if name in self.classification_heads: - prev_num_classes = self.classification_heads[name].out_proj.out_features - prev_inner_dim = self.classification_heads[name].dense.out_features - if num_classes != prev_num_classes or inner_dim != prev_inner_dim: - logger.warning( - 're-registering head "{}" with num_classes {} (prev: {}) ' - "and inner_dim {} (prev: {})".format( - name, num_classes, prev_num_classes, inner_dim, prev_inner_dim - ) - ) - self.classification_heads[name] = RobertaClassificationHead( - input_dim=self.args.encoder_embed_dim, - inner_dim=inner_dim or self.args.encoder_embed_dim, - num_classes=num_classes, - activation_fn=self.args.pooler_activation_fn, - pooler_dropout=self.args.pooler_dropout, - q_noise=self.args.quant_noise_pq, - qn_block_size=self.args.quant_noise_pq_block_size, - do_spectral_norm=self.args.spectral_norm_classification_head, - ) - - @property - def supported_targets(self): - return {"self"} - - @classmethod - def from_pretrained( - cls, - model_name_or_path, - checkpoint_file="model.pt", - data_name_or_path=".", - bpe="gpt2", - **kwargs - ): - from fairseq import hub_utils - - x = hub_utils.from_pretrained( - model_name_or_path, - checkpoint_file, - data_name_or_path, - archive_map=cls.hub_models(), - bpe=bpe, - load_checkpoint_heads=True, - **kwargs, - ) - cls.upgrade_args(x["args"]) - - logger.info(x["args"]) - return RobertaHubInterface(x["args"], x["task"], x["models"][0]) - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." 
if name != "" else "" - - # rename decoder -> encoder before upgrading children modules - for k in list(state_dict.keys()): - if k.startswith(prefix + "decoder"): - new_k = prefix + "encoder" + k[len(prefix + "decoder") :] - state_dict[new_k] = state_dict[k] - del state_dict[k] - - # upgrade children modules - super().upgrade_state_dict_named(state_dict, name) - - # Handle new classification heads present in the state dict. - current_head_names = ( - [] - if not hasattr(self, "classification_heads") - else self.classification_heads.keys() - ) - keys_to_delete = [] - for k in state_dict.keys(): - if not k.startswith(prefix + "classification_heads."): - continue - - head_name = k[len(prefix + "classification_heads.") :].split(".")[0] - num_classes = state_dict[ - prefix + "classification_heads." + head_name + ".out_proj.weight" - ].size(0) - inner_dim = state_dict[ - prefix + "classification_heads." + head_name + ".dense.weight" - ].size(0) - - if getattr(self.args, "load_checkpoint_heads", False): - if head_name not in current_head_names: - self.register_classification_head(head_name, num_classes, inner_dim) - else: - if head_name not in current_head_names: - logger.warning( - "deleting classification head ({}) from checkpoint " - "not present in current model: {}".format(head_name, k) - ) - keys_to_delete.append(k) - elif ( - num_classes - != self.classification_heads[head_name].out_proj.out_features - or inner_dim - != self.classification_heads[head_name].dense.out_features - ): - logger.warning( - "deleting classification head ({}) from checkpoint " - "with different dimensions than current model: {}".format( - head_name, k - ) - ) - keys_to_delete.append(k) - for k in keys_to_delete: - del state_dict[k] - - # Copy any newly-added classification heads into the state dict - # with their current weights. - if hasattr(self, "classification_heads"): - cur_state = self.classification_heads.state_dict() - for k, v in cur_state.items(): - if prefix + "classification_heads." + k not in state_dict: - logger.info("Overwriting " + prefix + "classification_heads." + k) - state_dict[prefix + "classification_heads." 
+ k] = v - - -class RobertaLMHead(nn.Module): - """Head for masked language modeling.""" - - def __init__(self, embed_dim, output_dim, activation_fn, weight=None): - super().__init__() - self.dense = nn.Linear(embed_dim, embed_dim) - self.activation_fn = utils.get_activation_fn(activation_fn) - self.layer_norm = LayerNorm(embed_dim) - - if weight is None: - weight = nn.Linear(embed_dim, output_dim, bias=False).weight - self.weight = weight - self.bias = nn.Parameter(torch.zeros(output_dim)) - - def forward(self, features, masked_tokens=None, **kwargs): - # Only project the masked tokens while training, - # saves both memory and computation - if masked_tokens is not None: - features = features[masked_tokens, :] - - x = self.dense(features) - x = self.activation_fn(x) - x = self.layer_norm(x) - # project back to size of vocabulary with bias - x = F.linear(x, self.weight) + self.bias - return x - - -class RobertaClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - - def __init__( - self, - input_dim, - inner_dim, - num_classes, - activation_fn, - pooler_dropout, - q_noise=0, - qn_block_size=8, - do_spectral_norm=False, - ): - super().__init__() - self.dense = nn.Linear(input_dim, inner_dim) - self.activation_fn = utils.get_activation_fn(activation_fn) - self.dropout = nn.Dropout(p=pooler_dropout) - self.out_proj = apply_quant_noise_( - nn.Linear(inner_dim, num_classes), q_noise, qn_block_size - ) - if do_spectral_norm: - if q_noise != 0: - raise NotImplementedError( - "Attempting to use Spectral Normalization with Quant Noise. This is not officially supported" - ) - self.out_proj = torch.nn.utils.spectral_norm(self.out_proj) - - def forward(self, features, **kwargs): - x = features[:, 0, :] # take token (equiv. to [CLS]) - x = self.dropout(x) - x = self.dense(x) - x = self.activation_fn(x) - x = self.dropout(x) - x = self.out_proj(x) - return x - - -class RobertaEncoder(FairseqEncoder): - """RoBERTa encoder.""" - - def __init__(self, args, dictionary): - super().__init__(dictionary) - self.args = args - - if args.encoder_layers_to_keep: - args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) - - self.sentence_encoder = TransformerSentenceEncoder( - padding_idx=dictionary.pad(), - vocab_size=len(dictionary), - num_encoder_layers=args.encoder_layers, - embedding_dim=args.encoder_embed_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=args.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - layerdrop=args.encoder_layerdrop, - max_seq_len=args.max_positions, - num_segments=0, - encoder_normalize_before=True, - apply_bert_init=True, - activation_fn=args.activation_fn, - q_noise=args.quant_noise_pq, - qn_block_size=args.quant_noise_pq_block_size, - ) - args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False) - - self.lm_head = RobertaLMHead( - embed_dim=args.encoder_embed_dim, - output_dim=len(dictionary), - activation_fn=args.activation_fn, - weight=( - self.sentence_encoder.embed_tokens.weight - if not args.untie_weights_roberta - else None - ), - ) - - def forward( - self, - src_tokens, - features_only=False, - return_all_hiddens=False, - masked_tokens=None, - **unused - ): - """ - Args: - src_tokens (LongTensor): input tokens of shape `(batch, src_len)` - features_only (bool, optional): skip LM head and just return - features. If True, the output will be of shape - `(batch, src_len, embed_dim)`. 
- return_all_hiddens (bool, optional): also return all of the - intermediate hidden states (default: False). - - Returns: - tuple: - - the LM output of shape `(batch, src_len, vocab)` - - a dictionary of additional data, where 'inner_states' - is a list of hidden states. Note that the hidden - states have shape `(src_len, batch, vocab)`. - """ - x, extra = self.extract_features( - src_tokens, return_all_hiddens=return_all_hiddens - ) - if not features_only: - x = self.output_layer(x, masked_tokens=masked_tokens) - return x, extra - - def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs): - inner_states, _ = self.sentence_encoder( - src_tokens, - last_state_only=not return_all_hiddens, - token_embeddings=kwargs.get("token_embeddings", None), - ) - features = inner_states[-1].transpose(0, 1) # T x B x C -> B x T x C - return features, {"inner_states": inner_states if return_all_hiddens else None} - - def output_layer(self, features, masked_tokens=None, **unused): - return self.lm_head(features, masked_tokens) - - def max_positions(self): - """Maximum output length supported by the encoder.""" - return self.args.max_positions - - -@register_model_architecture("roberta", "roberta") -def base_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") - - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) - args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) - args.spectral_norm_classification_head = getattr( - args, "spectral_nrom_classification_head", False - ) - - -@register_model_architecture("roberta", "roberta_base") -def roberta_base_architecture(args): - base_architecture(args) - - -@register_model_architecture("roberta", "roberta_large") -def roberta_large_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 24) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - base_architecture(args) - - -@register_model_architecture("roberta", "xlm") -def xlm_architecture(args): - args.encoder_layers = getattr(args, "encoder_layers", 16) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1280) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1280 * 4) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model_camembert.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model_camembert.py deleted file mode 100644 index 46447546fafb4a0a887b481022cac07631047c80..0000000000000000000000000000000000000000 --- 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model_camembert.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -CamemBERT: a Tasty French Language Model -""" - -from fairseq.models import register_model - -from .hub_interface import RobertaHubInterface -from .model import RobertaModel - - -@register_model("camembert") -class CamembertModel(RobertaModel): - @classmethod - def hub_models(cls): - return { - "camembert": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz", - "camembert.v0": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz", - "camembert-base": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz", - "camembert-large": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-large.tar.gz", - "camembert-base-ccnet": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet.tar.gz", - "camembert-base-ccnet-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet-4gb.tar.gz", - "camembert-base-wikipedia-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-wikipedia-4gb.tar.gz", - "camembert-base-oscar-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-oscar-4gb.tar.gz", - } - - @classmethod - def from_pretrained( - cls, - model_name_or_path, - checkpoint_file="model.pt", - data_name_or_path=".", - bpe="sentencepiece", - **kwargs - ): - from fairseq import hub_utils - - x = hub_utils.from_pretrained( - model_name_or_path, - checkpoint_file, - data_name_or_path, - archive_map=cls.hub_models(), - bpe=bpe, - load_checkpoint_heads=True, - **kwargs, - ) - return RobertaHubInterface(x["args"], x["task"], x["models"][0]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model_xlmr.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model_xlmr.py deleted file mode 100644 index 5886880f73bd1e2176c49e3d491a7d46eb3d9322..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/roberta/model_xlmr.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
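Putting the RoBERTa pieces above together: `from_pretrained()` returns the hub interface defined in hub_interface.py, which exposes `encode`, `decode`, `extract_features` and `fill_mask`. A rough usage sketch; the checkpoint directory is a placeholder and the model is assumed to be available locally:

```python
from fairseq.models.roberta import RobertaModel

# Placeholder path; from_pretrained returns a RobertaHubInterface.
roberta = RobertaModel.from_pretrained(
    "/path/to/roberta.base",
    checkpoint_file="model.pt",
    data_name_or_path=".",
    bpe="gpt2",
)
roberta.eval()  # disable dropout

tokens = roberta.encode("Hello world!")       # 1-D LongTensor, BOS ... EOS
features = roberta.extract_features(tokens)   # (1, T, C) last-layer features
print(roberta.decode(tokens))                 # "Hello world!"
```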
-""" -Unsupervised Cross-lingual Representation Learning at Scale -""" - -from fairseq.models import register_model - -from .hub_interface import RobertaHubInterface -from .model import RobertaModel - - -@register_model("xlmr") -class XLMRModel(RobertaModel): - @classmethod - def hub_models(cls): - return { - "xlmr.base": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz", - "xlmr.large": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz", - } - - @classmethod - def from_pretrained( - cls, - model_name_or_path, - checkpoint_file="model.pt", - data_name_or_path=".", - bpe="sentencepiece", - **kwargs - ): - from fairseq import hub_utils - - x = hub_utils.from_pretrained( - model_name_or_path, - checkpoint_file, - data_name_or_path, - archive_map=cls.hub_models(), - bpe=bpe, - load_checkpoint_heads=True, - **kwargs, - ) - return RobertaHubInterface(x["args"], x["task"], x["models"][0]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/__init__.py deleted file mode 100644 index 5d7f59b3a6708665952bc2ebb57852d290b40339..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .berard import * # noqa -from .s2t_transformer import * # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/berard.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/berard.py deleted file mode 100644 index 5aef030fdeb52883dd1b1bebae89da173bdb7cc7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/berard.py +++ /dev/null @@ -1,637 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ============================================================================ -#!/usr/bin/env python3 - -from ast import literal_eval -from typing import List, Tuple - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import checkpoint_utils, utils -from fairseq.data.data_utils import lengths_to_padding_mask -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqIncrementalDecoder, - register_model, - register_model_architecture, -) - - -@register_model("s2t_berard") -class BerardModel(FairseqEncoderDecoderModel): - """Implementation of a model similar to https://arxiv.org/abs/1802.04200 - - Paper title: End-to-End Automatic Speech Translation of Audiobooks - An implementation is available in tensorflow at - https://github.com/eske/seq2seq - Relevant files in this implementation are the config - (https://github.com/eske/seq2seq/blob/master/config/LibriSpeech/AST.yaml) - and the model code - (https://github.com/eske/seq2seq/blob/master/translate/models.py). - The encoder and decoder try to be close to the original implementation. - The attention is an MLP as in Bahdanau et al. - (https://arxiv.org/abs/1409.0473). - There is no state initialization by averaging the encoder outputs. - """ - - def __init__(self, encoder, decoder): - super().__init__(encoder, decoder) - - @staticmethod - def add_args(parser): - parser.add_argument( - "--input-layers", - type=str, - metavar="EXPR", - help="List of linear layer dimensions. These " - "layers are applied to the input features and " - "are followed by tanh and possibly dropout.", - ) - parser.add_argument( - "--dropout", - type=float, - metavar="D", - help="Dropout probability to use in the encoder/decoder. " - "Note that this parameters control dropout in various places, " - "there is no fine-grained control for dropout for embeddings " - "vs LSTM layers for example.", - ) - parser.add_argument( - "--in-channels", - type=int, - metavar="N", - help="Number of encoder input channels. " "Typically value is 1.", - ) - parser.add_argument( - "--conv-layers", - type=str, - metavar="EXPR", - help="List of conv layers " "(format: (channels, kernel, stride)).", - ) - parser.add_argument( - "--num-blstm-layers", - type=int, - metavar="N", - help="Number of encoder bi-LSTM layers.", - ) - parser.add_argument( - "--lstm-size", type=int, metavar="N", help="LSTM hidden size." 
- ) - parser.add_argument( - "--decoder-embed-dim", - type=int, - metavar="N", - help="Embedding dimension of the decoder target tokens.", - ) - parser.add_argument( - "--decoder-hidden-dim", - type=int, - metavar="N", - help="Decoder LSTM hidden dimension.", - ) - parser.add_argument( - "--decoder-num-layers", - type=int, - metavar="N", - help="Number of decoder LSTM layers.", - ) - parser.add_argument( - "--attention-dim", - type=int, - metavar="N", - help="Hidden layer dimension in MLP attention.", - ) - parser.add_argument( - "--output-layer-dim", - type=int, - metavar="N", - help="Hidden layer dim for linear layer prior to output projection.", - ) - parser.add_argument( - "--load-pretrained-encoder-from", - type=str, - metavar="STR", - help="model to take encoder weights from (for initialization)", - ) - parser.add_argument( - "--load-pretrained-decoder-from", - type=str, - metavar="STR", - help="model to take decoder weights from (for initialization)", - ) - - @classmethod - def build_encoder(cls, args, task): - encoder = BerardEncoder( - input_layers=literal_eval(args.input_layers), - conv_layers=literal_eval(args.conv_layers), - in_channels=args.input_channels, - input_feat_per_channel=args.input_feat_per_channel, - num_blstm_layers=args.num_blstm_layers, - lstm_size=args.lstm_size, - dropout=args.dropout, - ) - if getattr(args, "load_pretrained_encoder_from", None): - encoder = checkpoint_utils.load_pretrained_component_from_model( - component=encoder, checkpoint=args.load_pretrained_encoder_from - ) - return encoder - - @classmethod - def build_decoder(cls, args, task): - decoder = LSTMDecoder( - dictionary=task.target_dictionary, - embed_dim=args.decoder_embed_dim, - num_layers=args.decoder_num_layers, - hidden_size=args.decoder_hidden_dim, - dropout=args.dropout, - encoder_output_dim=2 * args.lstm_size, # bidirectional - attention_dim=args.attention_dim, - output_layer_dim=args.output_layer_dim, - ) - if getattr(args, "load_pretrained_decoder_from", None): - decoder = checkpoint_utils.load_pretrained_component_from_model( - component=decoder, checkpoint=args.load_pretrained_decoder_from - ) - return decoder - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - encoder = cls.build_encoder(args, task) - decoder = cls.build_decoder(args, task) - - return cls(encoder, decoder) - - def get_normalized_probs(self, net_output, log_probs, sample=None): - # net_output['encoder_out'] is a (B, T, D) tensor - lprobs = super().get_normalized_probs(net_output, log_probs, sample) - # lprobs is a (B, T, D) tensor - lprobs.batch_first = True - return lprobs - - -class BerardEncoder(FairseqEncoder): - def __init__( - self, - input_layers: List[int], - conv_layers: List[Tuple[int]], - in_channels: int, - input_feat_per_channel: int, - num_blstm_layers: int, - lstm_size: int, - dropout: float, - ): - """ - Args: - input_layers: list of linear layer dimensions. These layers are - applied to the input features and are followed by tanh and - possibly dropout. - conv_layers: list of conv2d layer configurations. A configuration is - a tuple (out_channels, conv_kernel_size, stride). - in_channels: number of input channels. - input_feat_per_channel: number of input features per channel. These - are speech features, typically 40 or 80. - num_blstm_layers: number of bidirectional LSTM layers. - lstm_size: size of the LSTM hidden (and cell) size. - dropout: dropout probability. 
Dropout can be applied after the - linear layers and LSTM layers but not to the convolutional - layers. - """ - super().__init__(None) - - self.input_layers = nn.ModuleList() - in_features = input_feat_per_channel - for out_features in input_layers: - if dropout > 0: - self.input_layers.append( - nn.Sequential( - nn.Linear(in_features, out_features), nn.Dropout(p=dropout) - ) - ) - else: - self.input_layers.append(nn.Linear(in_features, out_features)) - in_features = out_features - - self.in_channels = in_channels - self.input_dim = input_feat_per_channel - self.conv_kernel_sizes_and_strides = [] - self.conv_layers = nn.ModuleList() - lstm_input_dim = input_layers[-1] - for conv_layer in conv_layers: - out_channels, conv_kernel_size, conv_stride = conv_layer - self.conv_layers.append( - nn.Conv2d( - in_channels, - out_channels, - conv_kernel_size, - stride=conv_stride, - padding=conv_kernel_size // 2, - ) - ) - self.conv_kernel_sizes_and_strides.append((conv_kernel_size, conv_stride)) - in_channels = out_channels - lstm_input_dim //= conv_stride - - lstm_input_dim *= conv_layers[-1][0] - self.lstm_size = lstm_size - self.num_blstm_layers = num_blstm_layers - self.lstm = nn.LSTM( - input_size=lstm_input_dim, - hidden_size=lstm_size, - num_layers=num_blstm_layers, - dropout=dropout, - bidirectional=True, - ) - self.output_dim = 2 * lstm_size # bidirectional - if dropout > 0: - self.dropout = nn.Dropout(p=dropout) - else: - self.dropout = None - - def forward(self, src_tokens, src_lengths=None, **kwargs): - """ - Args - src_tokens: padded tensor (B, T, C * feat) - src_lengths: tensor of original lengths of input utterances (B,) - """ - bsz, max_seq_len, _ = src_tokens.size() - # (B, C, T, feat) - x = ( - src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim) - .transpose(1, 2) - .contiguous() - ) - - for input_layer in self.input_layers: - x = input_layer(x) - x = torch.tanh(x) - - for conv_layer in self.conv_layers: - x = conv_layer(x) - - bsz, _, output_seq_len, _ = x.size() - - # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) -> - # (T, B, C * feat) - x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1) - - input_lengths = src_lengths.clone() - for k, s in self.conv_kernel_sizes_and_strides: - p = k // 2 - input_lengths = (input_lengths.float() + 2 * p - k) / s + 1 - input_lengths = input_lengths.floor().long() - - packed_x = nn.utils.rnn.pack_padded_sequence(x, input_lengths) - - h0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_() - c0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_() - packed_outs, _ = self.lstm(packed_x, (h0, c0)) - - # unpack outputs and apply dropout - x, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_outs) - if self.dropout is not None: - x = self.dropout(x) - - encoder_padding_mask = ( - lengths_to_padding_mask(output_lengths).to(src_tokens.device).t() - ) - - return { - "encoder_out": x, # (T, B, C) - "encoder_padding_mask": encoder_padding_mask, # (T, B) - } - - def reorder_encoder_out(self, encoder_out, new_order): - encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( - 1, new_order - ) - encoder_out["encoder_padding_mask"] = encoder_out[ - "encoder_padding_mask" - ].index_select(1, new_order) - return encoder_out - - -class MLPAttention(nn.Module): - """The original attention from Badhanau et al. (2014) - - https://arxiv.org/abs/1409.0473, based on a Multi-Layer Perceptron. 
- The attention score between position i in the encoder and position j in the - decoder is: alpha_ij = V_a * tanh(W_ae * enc_i + W_ad * dec_j + b_a) - """ - - def __init__(self, decoder_hidden_state_dim, context_dim, attention_dim): - super().__init__() - - self.context_dim = context_dim - self.attention_dim = attention_dim - # W_ae and b_a - self.encoder_proj = nn.Linear(context_dim, self.attention_dim, bias=True) - # W_ad - self.decoder_proj = nn.Linear( - decoder_hidden_state_dim, self.attention_dim, bias=False - ) - # V_a - self.to_scores = nn.Linear(self.attention_dim, 1, bias=False) - - def forward(self, decoder_state, source_hids, encoder_padding_mask): - """The expected input dimensions are: - decoder_state: bsz x decoder_hidden_state_dim - source_hids: src_len x bsz x context_dim - encoder_padding_mask: src_len x bsz - """ - src_len, bsz, _ = source_hids.size() - # (src_len*bsz) x context_dim (to feed through linear) - flat_source_hids = source_hids.view(-1, self.context_dim) - # (src_len*bsz) x attention_dim - encoder_component = self.encoder_proj(flat_source_hids) - # src_len x bsz x attention_dim - encoder_component = encoder_component.view(src_len, bsz, self.attention_dim) - # 1 x bsz x attention_dim - decoder_component = self.decoder_proj(decoder_state).unsqueeze(0) - # Sum with broadcasting and apply the non linearity - # src_len x bsz x attention_dim - hidden_att = torch.tanh( - (decoder_component + encoder_component).view(-1, self.attention_dim) - ) - # Project onto the reals to get attentions scores (src_len x bsz) - attn_scores = self.to_scores(hidden_att).view(src_len, bsz) - - # Mask + softmax (src_len x bsz) - if encoder_padding_mask is not None: - attn_scores = ( - attn_scores.float() - .masked_fill_(encoder_padding_mask, float("-inf")) - .type_as(attn_scores) - ) # FP16 support: cast to float and back - # srclen x bsz - normalized_masked_attn_scores = F.softmax(attn_scores, dim=0) - - # Sum weighted sources (bsz x context_dim) - attn_weighted_context = ( - source_hids * normalized_masked_attn_scores.unsqueeze(2) - ).sum(dim=0) - - return attn_weighted_context, normalized_masked_attn_scores - - -class LSTMDecoder(FairseqIncrementalDecoder): - def __init__( - self, - dictionary, - embed_dim, - num_layers, - hidden_size, - dropout, - encoder_output_dim, - attention_dim, - output_layer_dim, - ): - """ - Args: - dictionary: target text dictionary. - embed_dim: embedding dimension for target tokens. - num_layers: number of LSTM layers. - hidden_size: hidden size for LSTM layers. - dropout: dropout probability. Dropout can be applied to the - embeddings, the LSTM layers, and the context vector. - encoder_output_dim: encoder output dimension (hidden size of - encoder LSTM). - attention_dim: attention dimension for MLP attention. - output_layer_dim: size of the linear layer prior to output - projection. 
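MLPAttention.forward above is the additive (Bahdanau-style) scorer from its docstring: project encoder states with W_ae (plus bias b_a), project the decoder state with W_ad, sum, apply tanh, reduce to one score per source position with V_a, then softmax over the source axis and take the weighted sum of encoder states. A self-contained sketch of the same computation with made-up sizes (names W_ae, W_ad, V_a follow the docstring; the dimensions are illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F

src_len, bsz, context_dim, dec_dim, attn_dim = 7, 2, 16, 8, 12
source_hids = torch.randn(src_len, bsz, context_dim)   # encoder outputs, T x B x C
decoder_state = torch.randn(bsz, dec_dim)               # current decoder hidden state

W_ae = nn.Linear(context_dim, attn_dim, bias=True)      # encoder projection (+ b_a)
W_ad = nn.Linear(dec_dim, attn_dim, bias=False)         # decoder projection
V_a = nn.Linear(attn_dim, 1, bias=False)                # score vector

# alpha_ij = V_a * tanh(W_ae * enc_i + W_ad * dec_j + b_a), softmaxed over source positions
scores = V_a(torch.tanh(W_ae(source_hids) + W_ad(decoder_state).unsqueeze(0))).squeeze(-1)  # T x B
weights = F.softmax(scores, dim=0)                           # normalize over the source axis
context = (source_hids * weights.unsqueeze(-1)).sum(dim=0)   # B x C weighted sum of encoder states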
- """ - super().__init__(dictionary) - self.num_layers = num_layers - self.hidden_size = hidden_size - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - self.embed_tokens = nn.Embedding(num_embeddings, embed_dim, padding_idx) - if dropout > 0: - self.dropout = nn.Dropout(p=dropout) - else: - self.dropout = None - - self.layers = nn.ModuleList() - for layer_id in range(num_layers): - input_size = embed_dim if layer_id == 0 else encoder_output_dim - self.layers.append( - nn.LSTMCell(input_size=input_size, hidden_size=hidden_size) - ) - - self.context_dim = encoder_output_dim - self.attention = MLPAttention( - decoder_hidden_state_dim=hidden_size, - context_dim=encoder_output_dim, - attention_dim=attention_dim, - ) - - self.deep_output_layer = nn.Linear( - hidden_size + encoder_output_dim + embed_dim, output_layer_dim - ) - self.output_projection = nn.Linear(output_layer_dim, num_embeddings) - - def forward( - self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs - ): - encoder_padding_mask = encoder_out["encoder_padding_mask"] - encoder_outs = encoder_out["encoder_out"] - - if incremental_state is not None: - prev_output_tokens = prev_output_tokens[:, -1:] - bsz, seqlen = prev_output_tokens.size() - - srclen = encoder_outs.size(0) - - # embed tokens - embeddings = self.embed_tokens(prev_output_tokens) - x = embeddings - if self.dropout is not None: - x = self.dropout(x) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - # initialize previous states (or get from cache during incremental - # generation) - cached_state = utils.get_incremental_state( - self, incremental_state, "cached_state" - ) - if cached_state is not None: - prev_hiddens, prev_cells = cached_state - else: - prev_hiddens = [encoder_out["encoder_out"].mean(dim=0)] * self.num_layers - prev_cells = [x.new_zeros(bsz, self.hidden_size)] * self.num_layers - - attn_scores = x.new_zeros(bsz, srclen) - attention_outs = [] - outs = [] - for j in range(seqlen): - input = x[j, :, :] - attention_out = None - for i, layer in enumerate(self.layers): - # the previous state is one layer below except for the bottom - # layer where the previous state is the state emitted by the - # top layer - hidden, cell = layer( - input, - ( - prev_hiddens[(i - 1) % self.num_layers], - prev_cells[(i - 1) % self.num_layers], - ), - ) - if self.dropout is not None: - hidden = self.dropout(hidden) - prev_hiddens[i] = hidden - prev_cells[i] = cell - if attention_out is None: - attention_out, attn_scores = self.attention( - hidden, encoder_outs, encoder_padding_mask - ) - if self.dropout is not None: - attention_out = self.dropout(attention_out) - attention_outs.append(attention_out) - input = attention_out - - # collect the output of the top layer - outs.append(hidden) - - # cache previous states (no-op except during incremental generation) - utils.set_incremental_state( - self, incremental_state, "cached_state", (prev_hiddens, prev_cells) - ) - - # collect outputs across time steps - x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size) - attention_outs_concat = torch.cat(attention_outs, dim=0).view( - seqlen, bsz, self.context_dim - ) - - # T x B x C -> B x T x C - x = x.transpose(0, 1) - attention_outs_concat = attention_outs_concat.transpose(0, 1) - - # concat LSTM output, attention output and embedding - # before output projection - x = torch.cat((x, attention_outs_concat, embeddings), dim=2) - x = self.deep_output_layer(x) - x = torch.tanh(x) - if self.dropout is not None: - x = self.dropout(x) - # 
project back to size of vocabulary - x = self.output_projection(x) - - # to return the full attn_scores tensor, we need to fix the decoder - # to account for subsampling input frames - # return x, attn_scores - return x, None - - def reorder_incremental_state(self, incremental_state, new_order): - super().reorder_incremental_state(incremental_state, new_order) - cached_state = utils.get_incremental_state( - self, incremental_state, "cached_state" - ) - if cached_state is None: - return - - def reorder_state(state): - if isinstance(state, list): - return [reorder_state(state_i) for state_i in state] - return state.index_select(0, new_order) - - new_state = tuple(map(reorder_state, cached_state)) - utils.set_incremental_state(self, incremental_state, "cached_state", new_state) - - -@register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard") -def berard(args): - """The original version: "End-to-End Automatic Speech Translation of - Audiobooks" (https://arxiv.org/abs/1802.04200) - """ - args.input_layers = getattr(args, "input_layers", "[256, 128]") - args.conv_layers = getattr(args, "conv_layers", "[(16, 3, 2), (16, 3, 2)]") - args.num_blstm_layers = getattr(args, "num_blstm_layers", 3) - args.lstm_size = getattr(args, "lstm_size", 256) - args.dropout = getattr(args, "dropout", 0.2) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 128) - args.decoder_num_layers = getattr(args, "decoder_num_layers", 2) - args.decoder_hidden_dim = getattr(args, "decoder_hidden_dim", 512) - args.attention_dim = getattr(args, "attention_dim", 512) - args.output_layer_dim = getattr(args, "output_layer_dim", 128) - args.load_pretrained_encoder_from = getattr( - args, "load_pretrained_encoder_from", None - ) - args.load_pretrained_decoder_from = getattr( - args, "load_pretrained_decoder_from", None - ) - - -@register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_256_3_3") -def berard_256_3_3(args): - """Used in - * "Harnessing Indirect Training Data for End-to-End Automatic Speech - Translation: Tricks of the Trade" (https://arxiv.org/abs/1909.06515) - * "CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus" - (https://arxiv.org/pdf/2002.01320.pdf) - * "Self-Supervised Representations Improve End-to-End Speech Translation" - (https://arxiv.org/abs/2006.12124) - """ - args.decoder_num_layers = getattr(args, "decoder_num_layers", 3) - berard(args) - - -@register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_512_3_2") -def berard_512_3_2(args): - args.num_blstm_layers = getattr(args, "num_blstm_layers", 3) - args.lstm_size = getattr(args, "lstm_size", 512) - args.dropout = getattr(args, "dropout", 0.3) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) - args.decoder_num_layers = getattr(args, "decoder_num_layers", 2) - args.decoder_hidden_dim = getattr(args, "decoder_hidden_dim", 1024) - args.attention_dim = getattr(args, "attention_dim", 512) - args.output_layer_dim = getattr(args, "output_layer_dim", 256) - berard(args) - - -@register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_512_5_3") -def berard_512_5_3(args): - args.num_blstm_layers = getattr(args, "num_blstm_layers", 5) - args.lstm_size = getattr(args, "lstm_size", 512) - args.dropout = getattr(args, "dropout", 0.3) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) - args.decoder_num_layers = getattr(args, "decoder_num_layers", 3) - args.decoder_hidden_dim = getattr(args, "decoder_hidden_dim", 1024) - args.attention_dim = 
getattr(args, "attention_dim", 512) - args.output_layer_dim = getattr(args, "output_layer_dim", 256) - berard(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/s2t_transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/s2t_transformer.py deleted file mode 100644 index cf1097f8a4324e0ecd7f4ba98437d292e27dbcc8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/speech_to_text/s2t_transformer.py +++ /dev/null @@ -1,504 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
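Each @register_model_architecture function above only fills in attributes the user has not already set on the command line, via getattr(args, name, default), and the named variants (e.g. s2t_berard_256_3_3) override a few fields before delegating to berard(). A quick sketch of that precedence with a bare Namespace (function and values here are hypothetical, only the getattr pattern is from the code above):

from argparse import Namespace

def berard_defaults(args):
    # mirrors the getattr(...) pattern used by the architecture functions
    args.lstm_size = getattr(args, "lstm_size", 256)
    args.dropout = getattr(args, "dropout", 0.2)

args = Namespace(dropout=0.3)        # value "from the command line"
berard_defaults(args)
print(args.lstm_size, args.dropout)  # 256 0.3 -> defaults never clobber user settings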
-# ============================================================================ -#!/usr/bin/env python3 - -import logging -import math -from typing import Dict, List, Optional, Tuple - -import torch -import torch.nn as nn -from fairseq import checkpoint_utils, utils -from fairseq.data.data_utils import lengths_to_padding_mask -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderDecoderModel, - register_model, - register_model_architecture, -) -from fairseq.models.fairseq_encoder import EncoderOut -from fairseq.models.transformer import Embedding, TransformerDecoder -from fairseq.modules import ( - FairseqDropout, - LayerNorm, - PositionalEmbedding, - TransformerEncoderLayer, -) -from torch import Tensor - - -logger = logging.getLogger(__name__) - - -class Conv1dSubsampler(nn.Module): - """Convolutional subsampler: a stack of 1D convolution (along temporal - dimension) followed by non-linear activation via gated linear units - (https://arxiv.org/abs/1911.08460) - - Args: - in_channels (int): the number of input channels - mid_channels (int): the number of intermediate channels - out_channels (int): the number of output channels - kernel_sizes (List[int]): the kernel size for each convolutional layer - """ - - def __init__( - self, - in_channels: int, - mid_channels: int, - out_channels: int, - kernel_sizes: List[int] = (3, 3), - ): - super(Conv1dSubsampler, self).__init__() - self.n_layers = len(kernel_sizes) - self.conv_layers = nn.ModuleList( - nn.Conv1d( - in_channels if i == 0 else mid_channels // 2, - mid_channels if i < self.n_layers - 1 else out_channels * 2, - k, - stride=2, - padding=k // 2, - ) - for i, k in enumerate(kernel_sizes) - ) - - def get_out_seq_lens_tensor(self, in_seq_lens_tensor): - out = in_seq_lens_tensor.clone() - for _ in range(self.n_layers): - out = ((out.float() - 1) / 2 + 1).floor().long() - return out - - def forward(self, src_tokens, src_lengths): - bsz, in_seq_len, _ = src_tokens.size() # B x T x (C x D) - x = src_tokens.transpose(1, 2).contiguous() # -> B x (C x D) x T - for conv in self.conv_layers: - x = conv(x) - x = nn.functional.glu(x, dim=1) - _, _, out_seq_len = x.size() - x = x.transpose(1, 2).transpose(0, 1).contiguous() # -> T x B x (C x D) - return x, self.get_out_seq_lens_tensor(src_lengths) - - -@register_model("s2t_transformer") -class S2TTransformerModel(FairseqEncoderDecoderModel): - """Adapted Transformer model (https://arxiv.org/abs/1706.03762) for - speech-to-text tasks. The Transformer encoder/decoder remains the same. 
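Each stride-2 convolution in Conv1dSubsampler roughly halves the time axis, and get_out_seq_lens_tensor tracks that per utterance: with kernel size k, stride 2 and padding k // 2 (k = 3 by default), every layer maps a length L to floor((L - 1) / 2 + 1). A small check with the default two layers (the input lengths are made up):

import torch

def subsampled_lengths(lengths, n_layers=2):
    # same recurrence as Conv1dSubsampler.get_out_seq_lens_tensor
    out = lengths.clone()
    for _ in range(n_layers):
        out = ((out.float() - 1) / 2 + 1).floor().long()
    return out

print(subsampled_lengths(torch.tensor([100, 37, 8])))  # tensor([25, 10,  2])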
- A trainable input subsampler is prepended to the Transformer encoder to - project inputs into the encoder dimension as well as downsample input - sequence for computational efficiency.""" - - def __init__(self, encoder, decoder): - super().__init__(encoder, decoder) - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # input - parser.add_argument( - "--conv-kernel-sizes", - type=str, - metavar="N", - help="kernel sizes of Conv1d subsampling layers", - ) - parser.add_argument( - "--conv-channels", - type=int, - metavar="N", - help="# of channels in Conv1d subsampling layers", - ) - # Transformer - parser.add_argument( - "--activation-fn", - type=str, - default="relu", - choices=utils.get_available_activation_fns(), - help="activation function to use", - ) - parser.add_argument( - "--dropout", type=float, metavar="D", help="dropout probability" - ) - parser.add_argument( - "--attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights", - ) - parser.add_argument( - "--activation-dropout", - "--relu-dropout", - type=float, - metavar="D", - help="dropout probability after activation in FFN.", - ) - parser.add_argument( - "--encoder-embed-dim", - type=int, - metavar="N", - help="encoder embedding dimension", - ) - parser.add_argument( - "--encoder-ffn-embed-dim", - type=int, - metavar="N", - help="encoder embedding dimension for FFN", - ) - parser.add_argument( - "--encoder-layers", type=int, metavar="N", help="num encoder layers" - ) - parser.add_argument( - "--encoder-attention-heads", - type=int, - metavar="N", - help="num encoder attention heads", - ) - parser.add_argument( - "--encoder-normalize-before", - action="store_true", - help="apply layernorm before each encoder block", - ) - parser.add_argument( - "--decoder-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension", - ) - parser.add_argument( - "--decoder-ffn-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension for FFN", - ) - parser.add_argument( - "--decoder-layers", type=int, metavar="N", help="num decoder layers" - ) - parser.add_argument( - "--decoder-attention-heads", - type=int, - metavar="N", - help="num decoder attention heads", - ) - parser.add_argument( - "--decoder-normalize-before", - action="store_true", - help="apply layernorm before each decoder block", - ) - parser.add_argument( - "--layernorm-embedding", - action="store_true", - help="add layernorm to embedding", - ) - parser.add_argument( - "--no-scale-embedding", - action="store_true", - help="if True, dont scale embeddings", - ) - parser.add_argument( - "--load-pretrained-encoder-from", - type=str, - metavar="STR", - help="model to take encoder weights from (for initialization)", - ) - - @classmethod - def build_encoder(cls, args): - encoder = S2TTransformerEncoder(args) - if getattr(args, "load_pretrained_encoder_from", None): - encoder = checkpoint_utils.load_pretrained_component_from_model( - component=encoder, checkpoint=args.load_pretrained_encoder_from - ) - logger.info( - f"loaded pretrained encoder from: " - f"{args.load_pretrained_encoder_from}" - ) - return encoder - - @classmethod - def build_decoder(cls, args, task, embed_tokens): - return TransformerDecoderScriptable(args, task.target_dictionary, embed_tokens) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_architecture(args) - - def build_embedding(dictionary, 
embed_dim): - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - return Embedding(num_embeddings, embed_dim, padding_idx) - - decoder_embed_tokens = build_embedding( - task.target_dictionary, args.decoder_embed_dim - ) - encoder = cls.build_encoder(args) - decoder = cls.build_decoder(args, task, decoder_embed_tokens) - return cls(encoder, decoder) - - def get_normalized_probs( - self, - net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], - log_probs: bool, - sample: Optional[Dict[str, Tensor]] = None, - ): - # net_output['encoder_out'] is a (B, T, D) tensor - lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample) - lprobs.batch_first = True - return lprobs - - def forward(self, src_tokens, src_lengths, prev_output_tokens): - """ - The forward method inherited from the base class has a **kwargs - argument in its input, which is not supported in torchscript. This - method overrites the forward method definition without **kwargs. - """ - encoder_out = self.encoder(src_tokens=src_tokens, src_lengths=src_lengths) - decoder_out = self.decoder( - prev_output_tokens=prev_output_tokens, encoder_out=encoder_out - ) - return decoder_out - - -class S2TTransformerEncoder(FairseqEncoder): - """Speech-to-text Transformer encoder that consists of input subsampler and - Transformer encoder.""" - - def __init__(self, args): - super().__init__(None) - - self.dropout_module = FairseqDropout( - p=args.dropout, module_name=self.__class__.__name__ - ) - self.embed_scale = math.sqrt(args.encoder_embed_dim) - if args.no_scale_embedding: - self.embed_scale = 1.0 - self.padding_idx = 1 - - self.subsample = Conv1dSubsampler( - args.input_feat_per_channel * args.input_channels, - args.conv_channels, - args.encoder_embed_dim, - [int(k) for k in args.conv_kernel_sizes.split(",")], - ) - - self.embed_positions = PositionalEmbedding( - args.max_source_positions, args.encoder_embed_dim, self.padding_idx - ) - - self.transformer_layers = nn.ModuleList( - [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)] - ) - if args.encoder_normalize_before: - self.layer_norm = LayerNorm(args.encoder_embed_dim) - else: - self.layer_norm = None - - def forward(self, src_tokens, src_lengths): - x, input_lengths = self.subsample(src_tokens, src_lengths) - x = self.embed_scale * x - - encoder_padding_mask = lengths_to_padding_mask(input_lengths) - positions = self.embed_positions(encoder_padding_mask).transpose(0, 1) - x += positions - x = self.dropout_module(x) - - for layer in self.transformer_layers: - x = layer(x, encoder_padding_mask) - - if not encoder_padding_mask.any(): - encoder_padding_mask = None - - if self.layer_norm is not None: - x = self.layer_norm(x) - - return EncoderOut( - encoder_out=x, - encoder_padding_mask=encoder_padding_mask, - encoder_embedding=None, - encoder_states=None, - src_tokens=None, - src_lengths=None, - ) - - @torch.jit.export - def reorder_encoder_out(self, encoder_out: EncoderOut, new_order): - """ - Since encoder_padding_mask and encoder_embedding are both of type - Optional[Tensor] in EncoderOut, they need to be copied as local - variables for Torchscript Optional refinement - """ - - encoder_padding_mask: Optional[Tensor] = encoder_out.encoder_padding_mask - encoder_embedding: Optional[Tensor] = encoder_out.encoder_embedding - - new_encoder_out = ( - encoder_out.encoder_out - if encoder_out.encoder_out is None - else encoder_out.encoder_out.index_select(1, new_order) - ) - - new_encoder_padding_mask = ( - encoder_padding_mask - 
if encoder_padding_mask is None - else encoder_padding_mask.index_select(0, new_order) - ) - - new_encoder_embedding = ( - encoder_embedding - if encoder_embedding is None - else encoder_embedding.index_select(0, new_order) - ) - - encoder_states = encoder_out.encoder_states - if encoder_states is not None: - for idx, state in enumerate(encoder_states): - encoder_states[idx] = state.index_select(1, new_order) - - return EncoderOut( - encoder_out=new_encoder_out, # T x B x C - encoder_padding_mask=new_encoder_padding_mask, # B x T - encoder_embedding=new_encoder_embedding, # B x T x C - encoder_states=encoder_states, # List[T x B x C] - src_tokens=None, - src_lengths=None, - ) - - -class TransformerDecoderScriptable(TransformerDecoder): - def extract_features( - self, - prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - full_context_alignment: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - ): - # call scriptable method from parent class - x, _ = self.extract_features_scriptable( - prev_output_tokens, - encoder_out, - incremental_state, - full_context_alignment, - alignment_layer, - alignment_heads, - ) - return x, None - - -@register_model_architecture(model_name="s2t_transformer", arch_name="s2t_transformer") -def base_architecture(args): - # Convolutional subsampler - args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5") - args.conv_channels = getattr(args, "conv_channels", 1024) - # Transformer - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", args.dropout) - args.activation_dropout = getattr(args, "activation_dropout", args.dropout) - args.activation_fn = getattr(args, "activation_fn", "relu") - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - args.no_scale_embedding = getattr(args, "no_scale_embedding", False) - args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) - - -@register_model_architecture("s2t_transformer", 
"s2t_transformer_s") -def s2t_transformer_s(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) - args.dropout = getattr(args, "dropout", 0.1) - base_architecture(args) - - -@register_model_architecture("s2t_transformer", "s2t_transformer_sp") -def s2t_transformer_sp(args): - args.encoder_layers = getattr(args, "encoder_layers", 16) - s2t_transformer_s(args) - - -@register_model_architecture("s2t_transformer", "s2t_transformer_m") -def s2t_transformer_m(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512 * 4) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.dropout = getattr(args, "dropout", 0.15) - base_architecture(args) - - -@register_model_architecture("s2t_transformer", "s2t_transformer_mp") -def s2t_transformer_mp(args): - args.encoder_layers = getattr(args, "encoder_layers", 16) - s2t_transformer_m(args) - - -@register_model_architecture("s2t_transformer", "s2t_transformer_l") -def s2t_transformer_l(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024 * 4) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.dropout = getattr(args, "dropout", 0.2) - base_architecture(args) - - -@register_model_architecture("s2t_transformer", "s2t_transformer_lp") -def s2t_transformer_lp(args): - args.encoder_layers = getattr(args, "encoder_layers", 16) - s2t_transformer_l(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer.py deleted file mode 100644 index 5bf2c44f0c99f73f336b510167f4becb4e109d48..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer.py +++ /dev/null @@ -1,1042 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from typing import Any, Dict, List, Optional, Tuple - -import torch -import torch.nn as nn -from fairseq import utils -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqIncrementalDecoder, - register_model, - register_model_architecture, -) -from fairseq.models.fairseq_encoder import EncoderOut -from fairseq.modules import ( - AdaptiveSoftmax, - FairseqDropout, - LayerDropModuleList, - LayerNorm, - PositionalEmbedding, - SinusoidalPositionalEmbedding, - TransformerDecoderLayer, - TransformerEncoderLayer, -) -from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ -from torch import Tensor - -s_len = 0 -DEFAULT_MAX_SOURCE_POSITIONS = 1024 -DEFAULT_MAX_TARGET_POSITIONS = 1024 - - -@register_model("transformer") -class TransformerModel(FairseqEncoderDecoderModel): - """ - Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017) - `_. 
- - Args: - encoder (TransformerEncoder): the encoder - decoder (TransformerDecoder): the decoder - - The Transformer model provides the following named architectures and - command-line arguments: - - .. argparse:: - :ref: fairseq.models.transformer_parser - :prog: - """ - - @classmethod - def hub_models(cls): - # fmt: off - - def moses_subword(path): - return { - 'path': path, - 'tokenizer': 'moses', - 'bpe': 'subword_nmt', - } - - def moses_fastbpe(path): - return { - 'path': path, - 'tokenizer': 'moses', - 'bpe': 'fastbpe', - } - - return { - 'transformer.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2'), - 'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2', - 'transformer.wmt18.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz'), - 'transformer.wmt19.en-de': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz'), - 'transformer.wmt19.en-ru': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz'), - 'transformer.wmt19.de-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz'), - 'transformer.wmt19.ru-en': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz'), - 'transformer.wmt19.en-de.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz'), - 'transformer.wmt19.en-ru.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz'), - 'transformer.wmt19.de-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz'), - 'transformer.wmt19.ru-en.single_model': moses_fastbpe('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz'), - } - # fmt: on - - def __init__(self, args, encoder, decoder): - super().__init__(encoder, decoder) - self.args = args - self.supports_align_args = True - - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--activation-fn', - choices=utils.get_available_activation_fns(), - help='activation function to use') - parser.add_argument('--dropout', type=float, metavar='D', - help='dropout probability') - parser.add_argument('--attention-dropout', type=float, metavar='D', - help='dropout probability for attention weights') - parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', - help='dropout probability after activation in FFN.') - parser.add_argument('--encoder-embed-path', type=str, metavar='STR', - help='path to pre-trained encoder embedding') - parser.add_argument('--encoder-embed-dim', type=int, metavar='N', - help='encoder embedding dimension') - parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', - help='encoder embedding dimension for FFN') - parser.add_argument('--encoder-layers', type=int, metavar='N', - help='num encoder layers') - parser.add_argument('--encoder-attention-heads', type=int, metavar='N', - help='num encoder attention heads') - parser.add_argument('--encoder-normalize-before', action='store_true', - help='apply layernorm before each encoder block') - parser.add_argument('--encoder-learned-pos', action='store_true', - help='use learned positional embeddings in the 
encoder') - parser.add_argument('--decoder-embed-path', type=str, metavar='STR', - help='path to pre-trained decoder embedding') - parser.add_argument('--decoder-embed-dim', type=int, metavar='N', - help='decoder embedding dimension') - parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', - help='decoder embedding dimension for FFN') - parser.add_argument('--decoder-layers', type=int, metavar='N', - help='num decoder layers') - parser.add_argument('--decoder-attention-heads', type=int, metavar='N', - help='num decoder attention heads') - parser.add_argument('--decoder-learned-pos', action='store_true', - help='use learned positional embeddings in the decoder') - parser.add_argument('--decoder-normalize-before', action='store_true', - help='apply layernorm before each decoder block') - parser.add_argument('--decoder-output-dim', type=int, metavar='N', - help='decoder output dimension (extra linear layer ' - 'if different from decoder embed dim') - parser.add_argument('--share-decoder-input-output-embed', action='store_true', - help='share decoder input and output embeddings') - parser.add_argument('--share-all-embeddings', action='store_true', - help='share encoder, decoder and output embeddings' - ' (requires shared dictionary and embed dim)') - parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', - help='if set, disables positional embeddings (outside self attention)') - parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', - help='comma separated list of adaptive softmax cutoff points. ' - 'Must be used with adaptive_loss criterion'), - parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', - help='sets adaptive softmax dropout for the tail projections') - parser.add_argument('--layernorm-embedding', action='store_true', - help='add layernorm to embedding') - parser.add_argument('--no-scale-embedding', action='store_true', - help='if True, dont scale embeddings') - # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) - parser.add_argument('--no-cross-attention', default=False, action='store_true', - help='do not perform cross-attention') - parser.add_argument('--cross-self-attention', default=False, action='store_true', - help='perform cross+self-attention') - # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) - parser.add_argument('--encoder-layerdrop', type=float, metavar='D', default=0, - help='LayerDrop probability for encoder') - parser.add_argument('--decoder-layerdrop', type=float, metavar='D', default=0, - help='LayerDrop probability for decoder') - parser.add_argument('--encoder-layers-to-keep', default=None, - help='which layers to *keep* when pruning as a comma-separated list') - parser.add_argument('--decoder-layers-to-keep', default=None, - help='which layers to *keep* when pruning as a comma-separated list') - # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) - parser.add_argument('--quant-noise-pq', type=float, metavar='D', default=0, - help='iterative PQ quantization noise at training time') - parser.add_argument('--quant-noise-pq-block-size', type=int, metavar='D', default=8, - help='block size of quantization noise at training time') - parser.add_argument('--quant-noise-scalar', type=float, metavar='D', default=0, - help='scalar quantization noise and scalar quantization at training time') - # fmt: on - - @classmethod - def build_model(cls, args, task): - """Build a 
new model instance.""" - - # make sure all arguments are present in older models - base_architecture(args) - - if args.encoder_layers_to_keep: - args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) - if args.decoder_layers_to_keep: - args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) - - if getattr(args, "max_source_positions", None) is None: - args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS - if getattr(args, "max_target_positions", None) is None: - args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS - - src_dict, tgt_dict = task.source_dictionary, task.target_dictionary - - if args.share_all_embeddings: - if src_dict != tgt_dict: - raise ValueError("--share-all-embeddings requires a joined dictionary") - if args.encoder_embed_dim != args.decoder_embed_dim: - raise ValueError( - "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" - ) - if args.decoder_embed_path and ( - args.decoder_embed_path != args.encoder_embed_path - ): - raise ValueError( - "--share-all-embeddings not compatible with --decoder-embed-path" - ) - encoder_embed_tokens = cls.build_embedding( - args, src_dict, args.encoder_embed_dim, args.encoder_embed_path - ) - decoder_embed_tokens = encoder_embed_tokens - args.share_decoder_input_output_embed = True - else: - encoder_embed_tokens = cls.build_embedding( - args, src_dict, args.encoder_embed_dim, args.encoder_embed_path - ) - decoder_embed_tokens = cls.build_embedding( - args, tgt_dict, args.decoder_embed_dim, args.decoder_embed_path - ) - - encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens) - decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) - return cls(args, encoder, decoder) - - @classmethod - def build_embedding(cls, args, dictionary, embed_dim, path=None): - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - - emb = Embedding(num_embeddings, embed_dim, padding_idx) - # if provided, load from preloaded dictionaries - if path: - embed_dict = utils.parse_embedding(path) - utils.load_embedding(embed_dict, dictionary, emb) - return emb - - @classmethod - def build_encoder(cls, args, src_dict, embed_tokens): - return TransformerEncoder(args, src_dict, embed_tokens) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - return TransformerDecoder( - args, - tgt_dict, - embed_tokens, - no_encoder_attn=getattr(args, "no_cross_attention", False), - ) - - # TorchScript doesn't support optional arguments with variable length (**kwargs). - # Current workaround is to add union of all arguments in child classes. - def forward( - self, - src_tokens, - src_lengths, - prev_output_tokens, - return_all_hiddens: bool = True, - features_only: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - ): - """ - Run the forward pass for an encoder-decoder model. - - Copied from the base class, but without ``**kwargs``, - which are not supported by TorchScript. 
- """ - encoder_out = self.encoder( - src_tokens, src_lengths=src_lengths, return_all_hiddens=return_all_hiddens - ) - decoder_out = self.decoder( - prev_output_tokens, - encoder_out=encoder_out, - features_only=features_only, - alignment_layer=alignment_layer, - alignment_heads=alignment_heads, - src_lengths=src_lengths, - return_all_hiddens=return_all_hiddens, - ) - return decoder_out - - # Since get_normalized_probs is in the Fairseq Model which is not scriptable, - # I rewrite the get_normalized_probs from Base Class to call the - # helper function in the Base Class. - @torch.jit.export - def get_normalized_probs( - self, - net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], - log_probs: bool, - sample: Optional[Dict[str, Tensor]] = None, - ): - """Get normalized probabilities (or log probs) from a net's output.""" - return self.get_normalized_probs_scriptable(net_output, log_probs, sample) - - -class TransformerEncoder(FairseqEncoder): - """ - Transformer encoder consisting of *args.encoder_layers* layers. Each layer - is a :class:`TransformerEncoderLayer`. - - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): encoding dictionary - embed_tokens (torch.nn.Embedding): input embedding - """ - - def __init__(self, args, dictionary, embed_tokens): - super().__init__(dictionary) - self.register_buffer("version", torch.Tensor([3])) - self.encoder_attention_heads = args.encoder_attention_heads - self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - self.encoder_layerdrop = args.encoder_layerdrop - - embed_dim = embed_tokens.embedding_dim - self.padding_idx = embed_tokens.padding_idx - self.max_source_positions = args.max_source_positions - - self.embed_tokens = embed_tokens - - self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) - - self.embed_positions = ( - PositionalEmbedding( - args.max_source_positions, - embed_dim, - self.padding_idx, - learned=args.encoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - - if getattr(args, "layernorm_embedding", False): - self.layernorm_embedding = LayerNorm(embed_dim) - else: - self.layernorm_embedding = None - - if not args.adaptive_input and args.quant_noise_pq > 0: - self.quant_noise = apply_quant_noise_( - nn.Linear(embed_dim, embed_dim, bias=False), - args.quant_noise_pq, - args.quant_noise_pq_block_size, - ) - else: - self.quant_noise = None - - if self.encoder_layerdrop > 0.0: - self.layers = LayerDropModuleList(p=self.encoder_layerdrop) - else: - self.layers = nn.ModuleList([]) - self.layers.extend( - [self.build_encoder_layer(args) for i in range(args.encoder_layers)] - ) - self.num_layers = len(self.layers) - - if args.encoder_normalize_before: - self.layer_norm = LayerNorm(embed_dim) - else: - self.layer_norm = None - - def build_encoder_layer(self, args): - return TransformerEncoderLayer(args) - - def forward_embedding( - self, src_tokens, token_embedding: Optional[torch.Tensor] = None - ): - # embed tokens and positions - if token_embedding is None: - token_embedding = self.embed_tokens(src_tokens) - x = embed = self.embed_scale * token_embedding - if self.embed_positions is not None: - x = embed + self.embed_positions(src_tokens) - if self.layernorm_embedding is not None: - x = self.layernorm_embedding(x) - x = self.dropout_module(x) - if self.quant_noise is not None: - x = self.quant_noise(x) - return x, embed - - def forward( - self, - src_tokens, - src_lengths, - 
return_all_hiddens: bool = False, - token_embeddings: Optional[torch.Tensor] = None, - ): - """ - Args: - src_tokens (LongTensor): tokens in the source language of shape - `(batch, src_len)` - src_lengths (torch.LongTensor): lengths of each source sentence of - shape `(batch)` - return_all_hiddens (bool, optional): also return all of the - intermediate hidden states (default: False). - token_embeddings (torch.Tensor, optional): precomputed embeddings - default `None` will recompute embeddings - - Returns: - namedtuple: - - **encoder_out** (Tensor): the last encoder layer's output of - shape `(src_len, batch, embed_dim)` - - **encoder_padding_mask** (ByteTensor): the positions of - padding elements of shape `(batch, src_len)` - - **encoder_embedding** (Tensor): the (scaled) embedding lookup - of shape `(batch, src_len, embed_dim)` - - **encoder_states** (List[Tensor]): all intermediate - hidden states of shape `(src_len, batch, embed_dim)`. - Only populated if *return_all_hiddens* is True. - """ - x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) - - # B x T x C -> T x B x C - bsz, tgt_len, embed_dim = x.size() - global s_len - s_len = src_tokens.size()[1] - - # compute padding mask - encoder_padding_mask = src_tokens.eq(self.padding_idx) - encoder_padding_mask = (encoder_padding_mask.to(torch.float16) * -65504).unsqueeze(1).unsqueeze(2) - encoder_padding_mask = encoder_padding_mask.repeat(1,self.encoder_attention_heads, - tgt_len, 1).clone().npu_format_cast(29) - encoder_states = [] if return_all_hiddens else None - if len(x.shape) == 3: - x = x.view(-1, x.shape[2]).clone().npu_format_cast(29) - else: - x = x.npu_format_cast(29) - # encoder layers - for layer in self.layers: - x = layer(x, encoder_padding_mask, bsz, tgt_len, s_len) - if return_all_hiddens: - assert encoder_states is not None - encoder_states.append(x) - - if self.layer_norm is not None: - x = self.layer_norm(x) - encoder_padding_mask = encoder_padding_mask[:,0,0,:] - return EncoderOut( - encoder_out=x, # T x B x C - encoder_padding_mask=encoder_padding_mask, # B x T - encoder_embedding=encoder_embedding, # B x T x C - encoder_states=encoder_states, # List[T x B x C] - src_tokens=None, - src_lengths=None, - ) - - @torch.jit.export - def reorder_encoder_out(self, encoder_out: EncoderOut, new_order): - """ - Reorder encoder output according to *new_order*. 
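The NPU-adapted encoder forward above replaces the usual boolean key_padding_mask with a pre-expanded additive mask: padded positions become -65504 (the most negative finite float16 value) so they vanish after the softmax, and the mask is tiled to (batch, heads, tgt_len, src_len) before the npu_format_cast. A framework-agnostic sketch of that construction (token values, padding_idx and head count are illustrative):

import torch

padding_idx, heads = 1, 4
src_tokens = torch.tensor([[5, 6, 7, 1], [8, 9, 1, 1]])   # B x T, 1 = pad
tgt_len = src_tokens.size(1)

bool_mask = src_tokens.eq(padding_idx)                               # B x T, True at pads
additive = (bool_mask.to(torch.float16) * -65504).unsqueeze(1).unsqueeze(2)
additive = additive.repeat(1, heads, tgt_len, 1)                     # B x H x T x T
print(additive.shape)     # torch.Size([2, 4, 4, 4])
print(additive[0, 0, 0])  # tensor([0., 0., 0., -65504.], dtype=torch.float16)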
- - Args: - encoder_out: output from the ``forward()`` method - new_order (LongTensor): desired order - - Returns: - *encoder_out* rearranged according to *new_order* - """ - """ - Since encoder_padding_mask and encoder_embedding are both of type - Optional[Tensor] in EncoderOut, they need to be copied as local - variables for Torchscript Optional refinement - """ - encoder_padding_mask: Optional[Tensor] = encoder_out.encoder_padding_mask - encoder_embedding: Optional[Tensor] = encoder_out.encoder_embedding - - new_encoder_out = ( - encoder_out.encoder_out - if encoder_out.encoder_out is None - else encoder_out.encoder_out.index_select(0, new_order) - ) - new_encoder_padding_mask = ( - encoder_padding_mask - if encoder_padding_mask is None - else encoder_padding_mask.index_select(0, new_order) - ) - new_encoder_embedding = ( - encoder_embedding - if encoder_embedding is None - else encoder_embedding.index_select(0, new_order) - ) - src_tokens = encoder_out.src_tokens - if src_tokens is not None: - src_tokens = src_tokens.index_select(0, new_order) - - src_lengths = encoder_out.src_lengths - if src_lengths is not None: - src_lengths = src_lengths.index_select(0, new_order) - - encoder_states = encoder_out.encoder_states - if encoder_states is not None: - for idx, state in enumerate(encoder_states): - encoder_states[idx] = state.index_select(1, new_order) - - return EncoderOut( - encoder_out=new_encoder_out, # T x B x C - encoder_padding_mask=new_encoder_padding_mask, # B x T - encoder_embedding=new_encoder_embedding, # B x T x C - encoder_states=encoder_states, # List[T x B x C] - src_tokens=src_tokens, # B x T - src_lengths=src_lengths, # B x 1 - ) - - def max_positions(self): - """Maximum input length supported by the encoder.""" - if self.embed_positions is None: - return self.max_source_positions - return min(self.max_source_positions, self.embed_positions.max_positions) - - def upgrade_state_dict_named(self, state_dict, name): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): - weights_key = "{}.embed_positions.weights".format(name) - if weights_key in state_dict: - print("deleting {0}".format(weights_key)) - del state_dict[weights_key] - state_dict[ - "{}.embed_positions._float_tensor".format(name) - ] = torch.FloatTensor(1) - for i in range(self.num_layers): - # update layer norms - self.layers[i].upgrade_state_dict_named( - state_dict, "{}.layers.{}".format(name, i) - ) - - version_key = "{}.version".format(name) - if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: - # earlier checkpoints did not normalize after the stack of layers - self.layer_norm = None - self.normalize = False - state_dict[version_key] = torch.Tensor([1]) - return state_dict - - -class TransformerDecoder(FairseqIncrementalDecoder): - """ - Transformer decoder consisting of *args.decoder_layers* layers. Each layer - is a :class:`TransformerDecoderLayer`. - - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): decoding dictionary - embed_tokens (torch.nn.Embedding): output embedding - no_encoder_attn (bool, optional): whether to attend to encoder outputs - (default: False). 
- """ - - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): - self.args = args - super().__init__(dictionary) - self.register_buffer("version", torch.Tensor([3])) - self._future_mask = torch.empty(0) - self.decoder_attention_heads = args.decoder_attention_heads - self.dropout_module = FairseqDropout( - args.dropout, module_name=self.__class__.__name__ - ) - self.decoder_layerdrop = args.decoder_layerdrop - self.share_input_output_embed = args.share_decoder_input_output_embed - - input_embed_dim = embed_tokens.embedding_dim - embed_dim = args.decoder_embed_dim - self.embed_dim = embed_dim - self.output_embed_dim = args.decoder_output_dim - - self.padding_idx = embed_tokens.padding_idx - self.max_target_positions = args.max_target_positions - - self.embed_tokens = embed_tokens - - self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) - - if not args.adaptive_input and args.quant_noise_pq > 0: - self.quant_noise = apply_quant_noise_( - nn.Linear(embed_dim, embed_dim, bias=False), - args.quant_noise_pq, - args.quant_noise_pq_block_size, - ) - else: - self.quant_noise = None - - self.project_in_dim = ( - Linear(input_embed_dim, embed_dim, bias=False) - if embed_dim != input_embed_dim - else None - ) - - self.embed_positions = ( - PositionalEmbedding( - args.max_target_positions, - embed_dim, - self.padding_idx, - learned=args.decoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - - if getattr(args, "layernorm_embedding", False): - self.layernorm_embedding = LayerNorm(embed_dim) - else: - self.layernorm_embedding = None - - self.cross_self_attention = getattr(args, "cross_self_attention", False) - - if self.decoder_layerdrop > 0.0: - self.layers = LayerDropModuleList(p=self.decoder_layerdrop) - else: - self.layers = nn.ModuleList([]) - self.layers.extend( - [ - self.build_decoder_layer(args, no_encoder_attn) - for _ in range(args.decoder_layers) - ] - ) - self.num_layers = len(self.layers) - - if args.decoder_normalize_before and not getattr( - args, "no_decoder_final_norm", False - ): - self.layer_norm = LayerNorm(embed_dim) - else: - self.layer_norm = None - - self.project_out_dim = ( - Linear(embed_dim, self.output_embed_dim, bias=False) - if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights - else None - ) - - self.adaptive_softmax = None - self.output_projection = None - if args.adaptive_softmax_cutoff is not None: - self.adaptive_softmax = AdaptiveSoftmax( - len(dictionary), - self.output_embed_dim, - utils.eval_str_list(args.adaptive_softmax_cutoff, type=int), - dropout=args.adaptive_softmax_dropout, - adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, - factor=args.adaptive_softmax_factor, - tie_proj=args.tie_adaptive_proj, - ) - elif self.share_input_output_embed: - self.output_projection = nn.Linear( - self.embed_tokens.weight.shape[1], - self.embed_tokens.weight.shape[0], - bias=False, - ) - self.output_projection.weight = self.embed_tokens.weight - else: - self.output_projection = nn.Linear( - self.output_embed_dim, len(dictionary), bias=False - ) - nn.init.normal_( - self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5 - ) - - def build_decoder_layer(self, args, no_encoder_attn=False): - return TransformerDecoderLayer(args, no_encoder_attn) - - def forward( - self, - prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - features_only: bool = False, - 
full_context_alignment: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - src_lengths: Optional[Any] = None, - return_all_hiddens: bool = False, - ): - """ - Args: - prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing - encoder_out (optional): output from the encoder, used for - encoder-side attention - incremental_state (dict): dictionary used for storing state during - :ref:`Incremental decoding` - features_only (bool, optional): only return features without - applying output layer (default: False). - full_context_alignment (bool, optional): don't apply - auto-regressive mask to self-attention (default: False). - - Returns: - tuple: - - the decoder's output of shape `(batch, tgt_len, vocab)` - - a dictionary with any model-specific outputs - """ - x, extra = self.extract_features( - prev_output_tokens, - encoder_out=encoder_out, - incremental_state=incremental_state, - full_context_alignment=full_context_alignment, - alignment_layer=alignment_layer, - alignment_heads=alignment_heads, - ) - if not features_only: - x = self.output_layer(x) - return x, extra - - def extract_features( - self, - prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - full_context_alignment: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - ): - return self.extract_features_scriptable( - prev_output_tokens, - encoder_out, - incremental_state, - full_context_alignment, - alignment_layer, - alignment_heads, - ) - - """ - A scriptable subclass of this class has an extract_features method and calls - super().extract_features, but super() is not supported in torchscript. Aa copy of - this function is made to be used in the subclass instead. - """ - - def extract_features_scriptable( - self, - prev_output_tokens, - encoder_out: Optional[EncoderOut] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - full_context_alignment: bool = False, - alignment_layer: Optional[int] = None, - alignment_heads: Optional[int] = None, - ): - """ - Similar to *forward* but only return features. - - Includes several features from "Jointly Learning to Align and - Translate with Transformer Models" (Garg et al., EMNLP 2019). - - Args: - full_context_alignment (bool, optional): don't apply - auto-regressive mask to self-attention (default: False). - alignment_layer (int, optional): return mean alignment over - heads at this layer (default: last layer). - alignment_heads (int, optional): only average alignment over - this many heads (default: all heads). 
- - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - a dictionary with any model-specific outputs - """ - if alignment_layer is None: - alignment_layer = self.num_layers - 1 - - # embed positions - positions = ( - self.embed_positions( - prev_output_tokens, incremental_state=incremental_state - ) - if self.embed_positions is not None - else None - ) - - if incremental_state is not None: - prev_output_tokens = prev_output_tokens[:, -1:] - if positions is not None: - positions = positions[:, -1:] - - # embed tokens and positions - x = self.embed_scale * self.embed_tokens(prev_output_tokens) - - if self.quant_noise is not None: - x = self.quant_noise(x) - - if self.project_in_dim is not None: - x = self.project_in_dim(x) - - if positions is not None: - x += positions - - if self.layernorm_embedding is not None: - x = self.layernorm_embedding(x) - - x = self.dropout_module(x) - - # B x T x C -> T x B x C - bsz, tgt_len, embed_dim = x.size() - - self_attn_padding_mask: Optional[Tensor] = None - if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): - self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) - self_attn_padding_mask = (self_attn_padding_mask.to(torch.float16) * -65504).unsqueeze(1).unsqueeze(2) - self_attn_padding_mask = self_attn_padding_mask.repeat(1, self.decoder_attention_heads, - tgt_len, 1).clone().npu_format_cast( - 29) - - if encoder_out is not None: - encoder_padding_mask = encoder_out.encoder_padding_mask.unsqueeze(1).unsqueeze(2)\ - .repeat(1, self.decoder_attention_heads, tgt_len, 1) - if len(encoder_out.encoder_out.shape) == 3: - encoder_out_ = encoder_out.encoder_out.view(-1, encoder_out.encoder_out.shape[2]).clone().npu_format_cast(29) - else: - encoder_out_ = encoder_out.encoder_out.npu_format_cast(29) - if len(x.shape) == 3: - x = x.view(-1, x.shape[2]).clone().npu_format_cast(29) - else: - x = x.npu_format_cast(29) - # decoder layers - attn: Optional[Tensor] = None - inner_states: List[Optional[Tensor]] = [x] - for idx, layer in enumerate(self.layers): - if incremental_state is None and not full_context_alignment: - self_attn_mask = self.buffered_future_mask(x, tgt_len) - if self_attn_padding_mask is not None: - self_attn_padding_mask = self_attn_padding_mask + self_attn_mask - else: - self_attn_padding_mask = self_attn_mask.unsqueeze(0).unsqueeze(1).repeat(bsz, self.decoder_attention_heads, - 1, 1).clone().npu_format_cast( - 29) - - x, layer_attn, _ = layer( - x, bsz, tgt_len, s_len, - encoder_out_ if encoder_out is not None else None, - encoder_padding_mask if encoder_out is not None else None, - incremental_state, - self_attn_mask=None, - self_attn_padding_mask=self_attn_padding_mask, - need_attn=bool((idx == alignment_layer)), - need_head_weights=bool((idx == alignment_layer)), - ) - inner_states.append(x) - if layer_attn is not None and idx == alignment_layer: - attn = layer_attn.float().to(x) - - if attn is not None: - if alignment_heads is not None: - attn = attn[:alignment_heads] - - # average probabilities over heads - attn = attn.mean(dim=0) - - if self.layer_norm is not None: - x = self.layer_norm(x) - - - if self.project_out_dim is not None: - x = self.project_out_dim(x) - - return x, {"attn": [attn], "inner_states": inner_states} - - def output_layer(self, features): - """Project features to the vocabulary size.""" - if self.adaptive_softmax is None: - # project back to size of vocabulary - return self.output_projection(features) - else: - return features - - def 
max_positions(self): - """Maximum output length supported by the decoder.""" - if self.embed_positions is None: - return self.max_target_positions - return min(self.max_target_positions, self.embed_positions.max_positions) - - def buffered_future_mask(self, tensor, tgt_len): - dim = tgt_len - # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround. - if ( - self._future_mask.size(0) == 0 - or (not self._future_mask.device == tensor.device) - or self._future_mask.size(0) < dim - ): - self._future_mask = torch.triu( - utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1 - ) - self._future_mask = self._future_mask.to(tensor) - return self._future_mask[:dim, :dim] - - def upgrade_state_dict_named(self, state_dict, name): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): - weights_key = "{}.embed_positions.weights".format(name) - if weights_key in state_dict: - del state_dict[weights_key] - state_dict[ - "{}.embed_positions._float_tensor".format(name) - ] = torch.FloatTensor(1) - - if f"{name}.output_projection.weight" not in state_dict: - if self.share_input_output_embed: - embed_out_key = f"{name}.embed_tokens.weight" - else: - embed_out_key = f"{name}.embed_out" - if embed_out_key in state_dict: - state_dict[f"{name}.output_projection.weight"] = state_dict[ - embed_out_key - ] - if not self.share_input_output_embed: - del state_dict[embed_out_key] - - for i in range(self.num_layers): - # update layer norms - layer_norm_map = { - "0": "self_attn_layer_norm", - "1": "encoder_attn_layer_norm", - "2": "final_layer_norm", - } - for old, new in layer_norm_map.items(): - for m in ("weight", "bias"): - k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m) - if k in state_dict: - state_dict[ - "{}.layers.{}.{}.{}".format(name, i, new, m) - ] = state_dict[k] - del state_dict[k] - - version_key = "{}.version".format(name) - if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: - # earlier checkpoints did not normalize after the stack of layers - self.layer_norm = None - self.normalize = False - state_dict[version_key] = torch.Tensor([1]) - - return state_dict - - -def Embedding(num_embeddings, embedding_dim, padding_idx): - m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) - nn.init.constant_(m.weight[padding_idx], 0) - return m - - -def Linear(in_features, out_features, bias=True): - m = nn.Linear(in_features, out_features, bias) - nn.init.xavier_uniform_(m.weight) - if bias: - nn.init.constant_(m.bias, 0.0) - return m - - -@register_model_architecture("transformer", "transformer") -def base_architecture(args): - args.encoder_embed_path = getattr(args, "encoder_embed_path", None) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) - args.decoder_embed_path = getattr(args, "decoder_embed_path", None) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) - args.decoder_ffn_embed_dim = getattr( - args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim - ) - 
args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - args.activation_fn = getattr(args, "activation_fn", "relu") - args.dropout = getattr(args, "dropout", 0.1) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.share_all_embeddings = getattr(args, "share_all_embeddings", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.adaptive_input = getattr(args, "adaptive_input", False) - args.no_cross_attention = getattr(args, "no_cross_attention", False) - args.cross_self_attention = getattr(args, "cross_self_attention", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - args.no_scale_embedding = getattr(args, "no_scale_embedding", False) - args.layernorm_embedding = getattr(args, "layernorm_embedding", False) - args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) - - -@register_model_architecture("transformer", "transformer_iwslt_de_en") -def transformer_iwslt_de_en(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) - args.encoder_layers = getattr(args, "encoder_layers", 6) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) - args.decoder_layers = getattr(args, "decoder_layers", 6) - base_architecture(args) - - -@register_model_architecture("transformer", "transformer_wmt_en_de") -def transformer_wmt_en_de(args): - base_architecture(args) - - -# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017) -@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big") -def transformer_vaswani_wmt_en_de_big(args): - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.dropout = getattr(args, "dropout", 0.3) - base_architecture(args) - - -@register_model_architecture("transformer", "transformer_vaswani_wmt_en_fr_big") -def transformer_vaswani_wmt_en_fr_big(args): - args.dropout = getattr(args, "dropout", 0.1) - transformer_vaswani_wmt_en_de_big(args) - - -@register_model_architecture("transformer", "transformer_wmt_en_de_big") -def 
transformer_wmt_en_de_big(args): - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - transformer_vaswani_wmt_en_de_big(args) - - -# default parameters used in tensor2tensor implementation -@register_model_architecture("transformer", "transformer_wmt_en_de_big_t2t") -def transformer_wmt_en_de_big_t2t(args): - args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.1) - transformer_vaswani_wmt_en_de_big(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_align.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_align.py deleted file mode 100644 index eaf585bd10e630ae6cd89920f197cd165f55ad58..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_align.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.models import register_model, register_model_architecture -from fairseq.models.transformer import ( - TransformerModel, - base_architecture, - transformer_wmt_en_de_big, -) - - -@register_model("transformer_align") -class TransformerAlignModel(TransformerModel): - """ - See "Jointly Learning to Align and Translate with Transformer - Models" (Garg et al., EMNLP 2019). - """ - - def __init__(self, encoder, decoder, args): - super().__init__(args, encoder, decoder) - self.alignment_heads = args.alignment_heads - self.alignment_layer = args.alignment_layer - self.full_context_alignment = args.full_context_alignment - - @staticmethod - def add_args(parser): - # fmt: off - super(TransformerAlignModel, TransformerAlignModel).add_args(parser) - parser.add_argument('--alignment-heads', type=int, metavar='D', - help='Number of cross attention heads per layer to supervised with alignments') - parser.add_argument('--alignment-layer', type=int, metavar='D', - help='Layer number which has to be supervised. 
0 corresponding to the bottommost layer.') - parser.add_argument('--full-context-alignment', action='store_true', - help='Whether or not alignment is supervised conditioned on the full target context.') - # fmt: on - - @classmethod - def build_model(cls, args, task): - # set any default arguments - transformer_align(args) - - transformer_model = TransformerModel.build_model(args, task) - return TransformerAlignModel( - transformer_model.encoder, transformer_model.decoder, args - ) - - def forward(self, src_tokens, src_lengths, prev_output_tokens): - encoder_out = self.encoder(src_tokens, src_lengths) - return self.forward_decoder(prev_output_tokens, encoder_out) - - def forward_decoder( - self, - prev_output_tokens, - encoder_out=None, - incremental_state=None, - features_only=False, - **extra_args, - ): - attn_args = { - "alignment_layer": self.alignment_layer, - "alignment_heads": self.alignment_heads, - } - decoder_out = self.decoder(prev_output_tokens, encoder_out, **attn_args) - - if self.full_context_alignment: - attn_args["full_context_alignment"] = self.full_context_alignment - _, alignment_out = self.decoder( - prev_output_tokens, - encoder_out, - features_only=True, - **attn_args, - **extra_args, - ) - decoder_out[1]["attn"] = alignment_out["attn"] - - return decoder_out - - -@register_model_architecture("transformer_align", "transformer_align") -def transformer_align(args): - args.alignment_heads = getattr(args, "alignment_heads", 1) - args.alignment_layer = getattr(args, "alignment_layer", 4) - args.full_context_alignment = getattr(args, "full_context_alignment", False) - base_architecture(args) - - -@register_model_architecture("transformer_align", "transformer_wmt_en_de_big_align") -def transformer_wmt_en_de_big_align(args): - args.alignment_heads = getattr(args, "alignment_heads", 1) - args.alignment_layer = getattr(args, "alignment_layer", 4) - transformer_wmt_en_de_big(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_from_pretrained_xlm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_from_pretrained_xlm.py deleted file mode 100644 index 236d9942e1fb0238cc92e2b4f160520b5cdd6504..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_from_pretrained_xlm.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
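A minimal sketch of the two-pass decoding used by TransformerAlignModel.forward_decoder above, assuming only a decoder callable with the same keyword interface; forward_decoder_sketch and its parameter names are illustrative and not part of the original file. The first pass produces the logits, and an optional full-context, features-only pass overwrites the "attn" entry that the alignment supervision reads.

from typing import Any, Dict, Optional, Tuple

def forward_decoder_sketch(
    decoder,                      # any callable behaving like TransformerDecoder.forward
    prev_output_tokens,
    encoder_out: Optional[Any],
    alignment_layer: int,
    alignment_heads: int,
    full_context_alignment: bool,
) -> Tuple[Any, Dict[str, Any]]:
    # pass the alignment options straight through to the decoder
    attn_args = {"alignment_layer": alignment_layer, "alignment_heads": alignment_heads}
    logits, extra = decoder(prev_output_tokens, encoder_out, **attn_args)
    if full_context_alignment:
        # second, features-only pass without the causal mask; only its attention is kept
        _, align_extra = decoder(
            prev_output_tokens,
            encoder_out,
            features_only=True,
            full_context_alignment=True,
            **attn_args,
        )
        extra["attn"] = align_extra["attn"]
    return logits, extra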
- -import os -from typing import Any, Dict - -from fairseq import checkpoint_utils -from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary -from fairseq.models import register_model, register_model_architecture -from fairseq.models.transformer import ( - TransformerDecoder, - TransformerEncoder, - TransformerModel, - base_architecture as transformer_base_architecture, -) - - -@register_model("transformer_from_pretrained_xlm") -class TransformerFromPretrainedXLMModel(TransformerModel): - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - TransformerModel.add_args(parser) - parser.add_argument( - "--pretrained-xlm-checkpoint", - type=str, - metavar="STR", - help="XLM model to use for initializing transformer encoder and/or decoder", - ) - parser.add_argument( - "--init-encoder-only", - action="store_true", - help="if set, don't load the XLM weights and embeddings into decoder", - ) - parser.add_argument( - "--init-decoder-only", - action="store_true", - help="if set, don't load the XLM weights and embeddings into encoder", - ) - - @classmethod - def build_model(self, args, task, cls_dictionary=MaskedLMDictionary): - assert hasattr(args, "pretrained_xlm_checkpoint"), ( - "You must specify a path for --pretrained-xlm-checkpoint to use " - "--arch transformer_from_pretrained_xlm" - ) - assert isinstance(task.source_dictionary, cls_dictionary) and isinstance( - task.target_dictionary, cls_dictionary - ), ( - "You should use a MaskedLMDictionary when using --arch " - "transformer_from_pretrained_xlm because the pretrained XLM model " - "was trained using data binarized with MaskedLMDictionary. " - "For translation, you may want to use --task " - "translation_from_pretrained_xlm" - ) - assert not ( - getattr(args, "init_encoder_only", False) - and getattr(args, "init_decoder_only", False) - ), "Only one of --init-encoder-only and --init-decoder-only can be set." - return super().build_model(args, task) - - @classmethod - def build_encoder(cls, args, src_dict, embed_tokens): - return TransformerEncoderFromPretrainedXLM(args, src_dict, embed_tokens) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - return TransformerDecoderFromPretrainedXLM(args, tgt_dict, embed_tokens) - - -def upgrade_state_dict_with_xlm_weights( - state_dict: Dict[str, Any], pretrained_xlm_checkpoint: str -) -> Dict[str, Any]: - """ - Load XLM weights into a Transformer encoder or decoder model. - - Args: - state_dict: state dict for either TransformerEncoder or - TransformerDecoder - pretrained_xlm_checkpoint: checkpoint to load XLM weights from - - Raises: - AssertionError: If architecture (num layers, attention heads, etc.) - does not match between the current Transformer encoder or - decoder and the pretrained_xlm_checkpoint - """ - if not os.path.exists(pretrained_xlm_checkpoint): - raise IOError("Model file not found: {}".format(pretrained_xlm_checkpoint)) - - state = checkpoint_utils.load_checkpoint_to_cpu(pretrained_xlm_checkpoint) - xlm_state_dict = state["model"] - for key in xlm_state_dict.keys(): - - for search_key in ["embed_tokens", "embed_positions", "layers"]: - if search_key in key: - subkey = key[key.find(search_key) :] - assert subkey in state_dict, ( - "{} Transformer encoder / decoder " - "state_dict does not contain {}. 
Cannot " - "load {} from pretrained XLM checkpoint " - "{} into Transformer.".format( - str(state_dict.keys()), subkey, key, pretrained_xlm_checkpoint - ) - ) - - state_dict[subkey] = xlm_state_dict[key] - return state_dict - - -class TransformerEncoderFromPretrainedXLM(TransformerEncoder): - def __init__(self, args, dictionary, embed_tokens): - super().__init__(args, dictionary, embed_tokens) - if getattr(args, "init_decoder_only", False): - # Don't load XLM weights for encoder if --init-decoder-only - return - - assert hasattr(args, "pretrained_xlm_checkpoint"), ( - "--pretrained-xlm-checkpoint must be specified to load Transformer " - "encoder from pretrained XLM" - ) - xlm_loaded_state_dict = upgrade_state_dict_with_xlm_weights( - state_dict=self.state_dict(), - pretrained_xlm_checkpoint=args.pretrained_xlm_checkpoint, - ) - self.load_state_dict(xlm_loaded_state_dict, strict=True) - - -class TransformerDecoderFromPretrainedXLM(TransformerDecoder): - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): - super().__init__(args, dictionary, embed_tokens, no_encoder_attn) - if getattr(args, "init_encoder_only", False): - # Don't load XLM weights for decoder if --init-encoder-only - return - assert hasattr(args, "pretrained_xlm_checkpoint"), ( - "--pretrained-xlm-checkpoint must be specified to load Transformer " - "decoder from pretrained XLM" - ) - - xlm_loaded_state_dict = upgrade_state_dict_with_xlm_weights( - state_dict=self.state_dict(), - pretrained_xlm_checkpoint=args.pretrained_xlm_checkpoint, - ) - self.load_state_dict(xlm_loaded_state_dict, strict=True) - - -@register_model_architecture( - "transformer_from_pretrained_xlm", "transformer_from_pretrained_xlm" -) -def base_architecture(args): - transformer_base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_lm.py deleted file mode 100644 index 22b17f06ee4c7d22c27c0a390ffb00a87acef64a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/transformer_lm.py +++ /dev/null @@ -1,393 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- - -from dataclasses import dataclass, field -from typing import Optional - -from fairseq import options, utils -from fairseq.dataclass import ChoiceEnum, FairseqDataclass -from fairseq.models import ( - FairseqLanguageModel, - register_model, - register_model_architecture, -) -from fairseq.models.transformer import Embedding, TransformerDecoder -from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder -from omegaconf import II - - -DEFAULT_MAX_TARGET_POSITIONS = 1024 - - -@dataclass -class TransformerLanguageModelConfig(FairseqDataclass): - activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( - default="relu", metadata={"help": "activation function to use"} - ) - dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) - attention_dropout: float = field( - default=0.0, metadata={"help": "dropout probability for attention weights"} - ) - activation_dropout: float = field( - default=0.0, metadata={"help": "dropout probability after activation in FFN."} - ) - relu_dropout: float = field( - default=0.0, metadata={"help": "dropout probability after activation in FFN."} - ) - decoder_embed_dim: int = field( - default=512, metadata={"help": "decoder embedding dimension"} - ) - decoder_output_dim: int = field( - default=512, metadata={"help": "decoder output dimension"} - ) - decoder_input_dim: int = field( - default=512, metadata={"help": "decoder input dimension"} - ) - decoder_ffn_embed_dim: int = field( - default=2048, metadata={"help": "decoder embedding dimension for FFN"} - ) - decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"}) - decoder_attention_heads: int = field( - default=8, metadata={"help": "num decoder attention heads"} - ) - decoder_normalize_before: bool = field( - default=False, metadata={"help": "apply layernorm before each decoder block"} - ) - no_decoder_final_norm: bool = field( - default=False, - metadata={"help": "don't add an extra layernorm after the last decoder block"}, - ) - adaptive_softmax_cutoff: Optional[str] = field( - default=None, - metadata={ - "help": "comma separated list of adaptive softmax cutoff points. 
" - "Must be used with adaptive_loss criterion" - }, - ) - adaptive_softmax_dropout: float = field( - default=0, - metadata={"help": "sets adaptive softmax dropout for the tail projections"}, - ) - adaptive_softmax_factor: float = field( - default=4, metadata={"help": "adaptive input factor"} - ) - no_token_positional_embeddings: bool = field( - default=False, - metadata={ - "help": "if set, disables positional embeddings (outside self attention)" - }, - ) - share_decoder_input_output_embed: bool = field( - default=False, metadata={"help": "share decoder input and output embeddings"} - ) - character_embeddings: bool = field( - default=False, - metadata={ - "help": "if set, uses character embedding convolutions to produce token embeddings" - }, - ) - character_filters: str = field( - default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]", - metadata={"help": "size of character embeddings"}, - ) - character_embedding_dim: int = field( - default=4, metadata={"help": "size of character embeddings"} - ) - char_embedder_highway_layers: int = field( - default=2, - metadata={"help": "number of highway layers for character token embeddder"}, - ) - adaptive_input: bool = field( - default=False, metadata={"help": "if set, uses adaptive input"} - ) - adaptive_input_factor: float = field( - default=4, metadata={"help": "adaptive input factor"} - ) - adaptive_input_cutoff: Optional[str] = field( - default=None, - metadata={"help": "comma separated list of adaptive input cutoff points."}, - ) - tie_adaptive_weights: bool = field( - default=False, - metadata={ - "help": "if set, ties the weights of adaptive softmax and adaptive input" - }, - ) - tie_adaptive_proj: bool = field( - default=False, - metadata={ - "help": "if set, ties the projection weights of adaptive softmax and adaptive input" - }, - ) - decoder_learned_pos: bool = field( - default=False, - metadata={"help": "use learned positional embeddings in the decoder"}, - ) - decoder_layerdrop: float = field( - default=0.0, metadata={"help": "LayerDrop probability for decoder"} - ) - decoder_layers_to_keep: Optional[str] = field( - default=None, - metadata={ - "help": "which layers to *keep* when pruning as a comma-separated list" - }, - ) - layernorm_embedding: bool = field( - default=False, metadata={"help": "add layernorm to embedding"} - ) - no_scale_embedding: bool = field( - default=False, metadata={"help": "if True, dont scale embeddings"} - ) - quant_noise_pq: float = field( - default=0.0, - metadata={"help": "iterative PQ quantization noise at training time"}, - ) - quant_noise_pq_block_size: int = field( - default=8, - metadata={"help": "block size of quantization noise at training time"}, - ) - # TODO common var add to parent - quant_noise_scalar: float = field( - default=0.0, - metadata={ - "help": "scalar quantization noise and scalar quantization at training time" - }, - ) - add_bos_token: bool = II("task.add_bos_token") - tokens_per_sample: int = II("task.tokens_per_sample") - max_target_positions: Optional[int] = II("task.max_target_positions") - tpu: bool = II("params.common.tpu") - - -@register_model("transformer_lm", dataclass=TransformerLanguageModelConfig) -class TransformerLanguageModel(FairseqLanguageModel): - @classmethod - def hub_models(cls): - def moses_fastbpe(path): - return {"path": path, "tokenizer": "moses", "bpe": "fastbpe"} - - return { - "transformer_lm.gbw.adaptive_huge": "https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2", - "transformer_lm.wiki103.adaptive": 
"https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2", - "transformer_lm.wmt19.en": moses_fastbpe( - "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.bz2" - ), - "transformer_lm.wmt19.de": moses_fastbpe( - "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.bz2" - ), - "transformer_lm.wmt19.ru": moses_fastbpe( - "https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.bz2" - ), - } - - def __init__(self, decoder): - super().__init__(decoder) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_lm_architecture(args) - - if args.decoder_layers_to_keep: - args.decoder_layers = len(args.decoder_layers_to_keep.split(",")) - - if getattr(args, "max_target_positions", None) is None: - args.max_target_positions = getattr( - args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS - ) - - if args.character_embeddings: - embed_tokens = CharacterTokenEmbedder( - task.source_dictionary, - eval(args.character_filters), - args.character_embedding_dim, - args.decoder_embed_dim, - args.char_embedder_highway_layers, - ) - elif args.adaptive_input: - embed_tokens = AdaptiveInput( - len(task.source_dictionary), - task.source_dictionary.pad(), - args.decoder_input_dim, - args.adaptive_input_factor, - args.decoder_embed_dim, - options.eval_str_list(args.adaptive_input_cutoff, type=int), - args.quant_noise_pq, - args.quant_noise_pq_block_size, - ) - else: - embed_tokens = cls.build_embedding( - args, task.source_dictionary, args.decoder_input_dim - ) - - if args.tie_adaptive_weights: - assert args.adaptive_input - assert args.adaptive_input_factor == args.adaptive_softmax_factor - assert ( - args.adaptive_softmax_cutoff == args.adaptive_input_cutoff - ), "{} != {}".format( - args.adaptive_softmax_cutoff, args.adaptive_input_cutoff - ) - assert args.decoder_input_dim == args.decoder_output_dim - - decoder = TransformerDecoder( - args, task.target_dictionary, embed_tokens, no_encoder_attn=True - ) - return cls(decoder) - - @classmethod - def build_embedding(cls, args, dictionary, embed_dim, path=None): - embed_tokens = Embedding(len(dictionary), embed_dim, dictionary.pad()) - return embed_tokens - - -@register_model_architecture("transformer_lm", "transformer_lm") -def base_lm_architecture(args): - # backward compatibility for older model checkpoints - if hasattr(args, "no_tie_adaptive_proj"): - # previous models defined --no-tie-adaptive-proj, so use the existence of - # that option to determine if this is an "old" model checkpoint - args.no_decoder_final_norm = True # old models always set this to True - if args.no_tie_adaptive_proj is False: - args.tie_adaptive_proj = True - if hasattr(args, "decoder_final_norm"): - args.no_decoder_final_norm = not args.decoder_final_norm - - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.0) - - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048) - args.decoder_layers = getattr(args, "decoder_layers", 6) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) - args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4) - args.decoder_learned_pos = getattr(args, 
"decoder_learned_pos", False) - args.activation_fn = getattr(args, "activation_fn", "relu") - - args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) - args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None) - args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) - args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8) - args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0) - - args.add_bos_token = getattr(args, "add_bos_token", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - args.character_embeddings = getattr(args, "character_embeddings", False) - - args.decoder_output_dim = getattr( - args, "decoder_output_dim", args.decoder_embed_dim - ) - args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) - - # Model training is not stable without this - args.decoder_normalize_before = True - args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", False) - - args.adaptive_input = getattr(args, "adaptive_input", False) - args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4) - args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None) - - args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) - args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False) - - args.no_scale_embedding = getattr(args, "no_scale_embedding", False) - args.layernorm_embedding = getattr(args, "layernorm_embedding", False) - - -@register_model_architecture("transformer_lm", "transformer_lm_big") -def transformer_lm_big(args): - args.decoder_layers = getattr(args, "decoder_layers", 12) - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - base_lm_architecture(args) - - -@register_model_architecture("transformer_lm", "transformer_lm_wiki103") -@register_model_architecture("transformer_lm", "transformer_lm_baevski_wiki103") -def transformer_lm_baevski_wiki103(args): - args.decoder_layers = getattr(args, "decoder_layers", 16) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) - args.dropout = getattr(args, "dropout", 0.3) - args.adaptive_input = getattr(args, "adaptive_input", True) - args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", True) - args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", "20000,60000") - args.adaptive_softmax_cutoff = getattr( - args, "adaptive_softmax_cutoff", "20000,60000" - ) - args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0.2) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.1) - args.no_decoder_final_norm = getattr(args, "no_decoder_final_norm", True) - args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", True) - transformer_lm_big(args) - - -@register_model_architecture("transformer_lm", "transformer_lm_gbw") -@register_model_architecture("transformer_lm", "transformer_lm_baevski_gbw") -def transformer_lm_baevski_gbw(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.no_decoder_final_norm = 
getattr(args, "no_decoder_final_norm", True) - transformer_lm_big(args) - - -@register_model_architecture("transformer_lm", "transformer_lm_gpt") -def transformer_lm_gpt(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 768) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 3072) - args.decoder_layers = getattr(args, "decoder_layers", 12) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 12) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_fn = getattr(args, "activation_fn", "gelu") - base_lm_architecture(args) - - -@register_model_architecture("transformer_lm", "transformer_lm_gpt2_small") -def transformer_lm_gpt2_small(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_layers = getattr(args, "decoder_layers", 24) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_fn = getattr(args, "activation_fn", "gelu") - base_lm_architecture(args) - - -@register_model_architecture("transformer_lm", "transformer_lm_gpt2_medium") -def transformer_lm_gpt2_medium(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1280) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 5120) - args.decoder_layers = getattr(args, "decoder_layers", 36) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 20) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_fn = getattr(args, "activation_fn", "gelu") - base_lm_architecture(args) - - -@register_model_architecture("transformer_lm", "transformer_lm_gpt2_big") -def transformer_lm_gpt2_big(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1600) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 6400) - args.decoder_layers = getattr(args, "decoder_layers", 48) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 25) - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_fn = getattr(args, "activation_fn", "gelu") - base_lm_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/__init__.py deleted file mode 100644 index 06cec18183ca14cd534d14558e8b44e25f3e69d5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from .wav2vec import * # noqa -from .wav2vec2 import * # noqa -from .wav2vec2_asr import * # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec.py deleted file mode 100644 index 772995b526fe87d4f53badca09aa5aa3a0662412..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec.py +++ /dev/null @@ -1,735 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import math -import sys - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq.models import BaseFairseqModel, register_model, register_model_architecture -from fairseq.modules import ( - Fp32GroupNorm, - Fp32LayerNorm, - GumbelVectorQuantizer, - KmeansVectorQuantizer, - TransposeLast, -) -from fairseq.utils import buffered_arange - - -logger = logging.getLogger(__name__) - - -@register_model("wav2vec") -class Wav2VecModel(BaseFairseqModel): - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - parser.add_argument( - "--prediction-steps", - type=int, - metavar="N", - help="number of steps ahead to predict", - ) - parser.add_argument( - "--sample-distance", - type=int, - metavar="N", - help="sample distance from target. does not work properly with cross-sampling", - ) - parser.add_argument( - "--cross-sample-negatives", - type=int, - metavar="N", - help="num of cross sampled negatives", - ) - parser.add_argument( - "--num-negatives", type=int, metavar="N", help="number of negative examples" - ) - parser.add_argument( - "--conv-feature-layers", - type=str, - metavar="EXPR", - help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]", - ) - parser.add_argument( - "--conv-aggregator-layers", - type=str, - metavar="EXPR", - help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]", - ) - parser.add_argument( - "--dropout", - type=float, - metavar="D", - help="dropout to apply within the model", - ) - parser.add_argument( - "--dropout-features", - type=float, - metavar="D", - help="dropout to apply to the features", - ) - parser.add_argument( - "--dropout-agg", - type=float, - metavar="D", - help="dropout to apply after aggregation step", - ) - parser.add_argument( - "--encoder", type=str, choices=["cnn"], help="type of encoder to use" - ) - parser.add_argument( - "--aggregator", - type=str, - choices=["cnn", "gru"], - help="type of aggregator to use", - ) - parser.add_argument( - "--gru-dim", type=int, metavar="N", help="GRU dimensionality" - ) - - parser.add_argument( - "--no-conv-bias", - action="store_true", - help="if set, does not learn bias for conv layers", - ) - parser.add_argument( - "--agg-zero-pad", - action="store_true", - help="if set, zero pads in aggregator instead of repl pad", - ) - - parser.add_argument( - "--skip-connections-feat", - action="store_true", - help="if set, adds skip connections to the feature extractor", - ) - parser.add_argument( - "--skip-connections-agg", - action="store_true", - help="if set, adds skip connections to the aggregator", - ) - parser.add_argument( - "--residual-scale", - type=float, - metavar="D", - help="scales residual by sqrt(value)", - ) - - 
parser.add_argument( - "--log-compression", - action="store_true", - help="if set, adds a log compression to feature extractor", - ) - - parser.add_argument( - "--balanced-classes", - action="store_true", - help="if set, loss is scaled to balance for number of negatives", - ) - - parser.add_argument( - "--project-features", - choices=["none", "same", "new"], - help="if not none, features are projected using the (same or new) aggregator", - ) - - parser.add_argument( - "--non-affine-group-norm", - action="store_true", - help="if set, group norm is not affine", - ) - - parser.add_argument( - "--offset", - help="if set, introduces an offset from target to predictions. " - 'if set to "auto", it is computed automatically from the receptive field', - ) - - parser.add_argument( - "--activation", - type=str, - choices=["relu", "gelu"], - help="which activation function to use", - ) - - parser.add_argument( - "--vq-type", - type=str, - choices=["none", "gumbel", "kmeans"], - help="which type of quantizer to use", - ) - parser.add_argument( - "--vq-vars", - type=int, - metavar="N", - help="if set, project to this many vector quantized variables per group", - ) - parser.add_argument( - "--vq-groups", - type=int, - metavar="N", - help="number of groups of latent variables", - ) - parser.add_argument( - "--vq-dim", - type=int, - metavar="N", - help="uses this dimensionality for quantized vectors", - ) - parser.add_argument( - "--vq-depth", - type=int, - metavar="N", - help="number of layers for vq weight projection", - ) - parser.add_argument( - "--combine-groups", - action="store_true", - help="if set, variables are shared among groups", - ) - parser.add_argument( - "--vq-temp", - type=str, - metavar="TEMP", - help="temperature for latent variable sampling with gumbel softmax. 
should be a tuple of 3 values (start, end, decay)", - ) - parser.add_argument( - "--vq-gamma", - type=float, - metavar="D", - help="gamma parameter for kmeans style vector quantization", - ) - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_wav2vec_architecture(args) - - model = Wav2VecModel(args) - logger.info(model) - return model - - def __init__(self, args): - super().__init__() - - self.prediction_steps = args.prediction_steps - offset = args.offset - - if args.activation == "relu": - activation = nn.ReLU() - elif args.activation == "gelu": - activation = nn.GELU() - else: - raise Exception("unknown activation " + args.activation) - - if args.encoder == "cnn": - feature_enc_layers = eval(args.conv_feature_layers) - self.feature_extractor = ConvFeatureExtractionModel( - conv_layers=feature_enc_layers, - dropout=0.0, - log_compression=args.log_compression, - skip_connections=args.skip_connections_feat, - residual_scale=args.residual_scale, - non_affine_group_norm=args.non_affine_group_norm, - activation=activation, - ) - embed = feature_enc_layers[-1][0] - else: - raise Exception("unknown encoder type " + args.encoder) - - self.vector_quantizer = None - if args.vq_type == "gumbel": - self.vector_quantizer = GumbelVectorQuantizer( - dim=embed, - num_vars=args.vq_vars, - temp=eval(args.vq_temp), - groups=args.vq_groups, - combine_groups=args.combine_groups, - vq_dim=args.vq_dim if args.vq_dim > 0 else embed, - time_first=False, - activation=activation, - weight_proj_depth=args.vq_depth, - weight_proj_factor=2, - ) - elif args.vq_type == "kmeans": - self.vector_quantizer = KmeansVectorQuantizer( - dim=embed, - num_vars=args.vq_vars, - groups=args.vq_groups, - combine_groups=args.combine_groups, - vq_dim=args.vq_dim if args.vq_dim > 0 else embed, - time_first=False, - gamma=args.vq_gamma, - ) - else: - assert ( - args.vq_type == "none" or args.vq_type is None - ), "Unknown quantizer type" - - if args.offset == "auto": - assert args.encoder == "cnn" - jin = 0 - rin = 0 - for _, k, stride in feature_enc_layers: - if rin == 0: - rin = k - rin = rin + (k - 1) * jin - if jin == 0: - jin = stride - else: - jin *= stride - offset = math.ceil(rin / jin) - - offset = int(offset) - - def make_aggregator(): - if args.aggregator == "cnn": - agg_layers = eval(args.conv_aggregator_layers) - agg_dim = agg_layers[-1][0] - feature_aggregator = ConvAggegator( - conv_layers=agg_layers, - embed=embed, - dropout=args.dropout, - skip_connections=args.skip_connections_agg, - residual_scale=args.residual_scale, - non_affine_group_norm=args.non_affine_group_norm, - conv_bias=not args.no_conv_bias, - zero_pad=args.agg_zero_pad, - activation=activation, - ) - elif args.aggregator == "gru": - agg_dim = args.gru_dim - feature_aggregator = nn.Sequential( - TransposeLast(), - nn.GRU( - input_size=embed, - hidden_size=agg_dim, - num_layers=1, - dropout=args.dropout, - ), - TransposeLast(deconstruct_idx=0), - ) - else: - raise Exception("unknown aggregator type " + args.aggregator) - - return feature_aggregator, agg_dim - - self.feature_aggregator, agg_dim = make_aggregator() - - self.wav2vec_predictions = Wav2VecPredictionsModel( - in_dim=agg_dim, - out_dim=embed, - prediction_steps=args.prediction_steps, - n_negatives=args.num_negatives, - cross_sample_negatives=args.cross_sample_negatives, - sample_distance=args.sample_distance, - dropout=args.dropout, - offset=offset, - balanced_classes=args.balanced_classes, - 
infonce=args.infonce, - ) - - self.dropout_feats = nn.Dropout(p=args.dropout_features) - self.dropout_agg = nn.Dropout(p=args.dropout_agg) - - if args.project_features == "none": - self.project_features = None - elif args.project_features == "same": - self.project_features = self.feature_aggregator - elif args.project_features == "new": - self.project_features, _ = make_aggregator() - - def forward(self, source): - result = {} - - features = self.feature_extractor(source) - if self.vector_quantizer: - q_res = self.vector_quantizer(features) - features = q_res["x"] - for k in q_res.keys(): - if k != "x": - result[k] = q_res[k] - - x = self.dropout_feats(features) - x = self.feature_aggregator(x) - x = self.dropout_agg(x) - - if self.project_features is not None: - features = self.project_features(features) - x, targets = self.wav2vec_predictions(x, features) - result["cpc_logits"] = x - result["cpc_targets"] = targets - - return result - - def upgrade_state_dict_named(self, state_dict, name): - super().upgrade_state_dict_named(state_dict, name) - - def max_positions(self): - """Maximum length supported by the model.""" - return sys.maxsize - - def get_logits(self, net_output): - logits = net_output["cpc_logits"] - return logits - - def get_targets(self, sample, net_output): - t = net_output["cpc_targets"] - if isinstance(t, tuple): - t = t[0] - return t.contiguous() - - def get_target_weights(self, targets, net_output): - targets = net_output["cpc_targets"] - if isinstance(targets, tuple) and targets[-1] is not None: - return targets[-1] - return None - - def get_extra_losses(self, net_output): - loss = None - if "prob_perplexity" in net_output: - loss = net_output["num_vars"] - net_output["prob_perplexity"] - elif "kmeans_loss" in net_output: - loss = net_output["kmeans_loss"] - - return loss - - -def norm_block(is_layer_norm, dim, affine=True): - if is_layer_norm: - mod = nn.Sequential( - TransposeLast(), - Fp32LayerNorm(dim, elementwise_affine=affine), - TransposeLast(), - ) - else: - mod = Fp32GroupNorm(1, dim, affine=affine) - - return mod - - -class ConvFeatureExtractionModel(nn.Module): - def __init__( - self, - conv_layers, - dropout, - log_compression, - skip_connections, - residual_scale, - non_affine_group_norm, - activation, - ): - super().__init__() - - def block(n_in, n_out, k, stride): - return nn.Sequential( - nn.Conv1d(n_in, n_out, k, stride=stride, bias=False), - nn.Dropout(p=dropout), - norm_block( - is_layer_norm=False, dim=n_out, affine=not non_affine_group_norm - ), - activation, - ) - - in_d = 1 - self.conv_layers = nn.ModuleList() - for dim, k, stride in conv_layers: - self.conv_layers.append(block(in_d, dim, k, stride)) - in_d = dim - - self.log_compression = log_compression - self.skip_connections = skip_connections - self.residual_scale = math.sqrt(residual_scale) - - def forward(self, x): - # BxT -> BxCxT - x = x.unsqueeze(1) - - for conv in self.conv_layers: - residual = x - x = conv(x) - if self.skip_connections and x.size(1) == residual.size(1): - tsz = x.size(2) - r_tsz = residual.size(2) - residual = residual[..., :: r_tsz // tsz][..., :tsz] - x = (x + residual) * self.residual_scale - - if self.log_compression: - x = x.abs() - x = x + 1 - x = x.log() - - return x - - -class ZeroPad1d(nn.Module): - def __init__(self, pad_left, pad_right): - super().__init__() - self.pad_left = pad_left - self.pad_right = pad_right - - def forward(self, x): - return F.pad(x, (self.pad_left, self.pad_right)) - - -class ConvAggegator(nn.Module): - def __init__( - self, - 
conv_layers, - embed, - dropout, - skip_connections, - residual_scale, - non_affine_group_norm, - conv_bias, - zero_pad, - activation, - ): - super().__init__() - - def block(n_in, n_out, k, stride): - # padding dims only really make sense for stride = 1 - ka = k // 2 - kb = ka - 1 if k % 2 == 0 else ka - - pad = ( - ZeroPad1d(ka + kb, 0) if zero_pad else nn.ReplicationPad1d((ka + kb, 0)) - ) - - return nn.Sequential( - pad, - nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias), - nn.Dropout(p=dropout), - norm_block(False, n_out, affine=not non_affine_group_norm), - activation, - ) - - in_d = embed - self.conv_layers = nn.ModuleList() - self.residual_proj = nn.ModuleList() - for dim, k, stride in conv_layers: - if in_d != dim and skip_connections: - self.residual_proj.append(nn.Conv1d(in_d, dim, 1, bias=False)) - else: - self.residual_proj.append(None) - - self.conv_layers.append(block(in_d, dim, k, stride)) - in_d = dim - self.conv_layers = nn.Sequential(*self.conv_layers) - self.skip_connections = skip_connections - self.residual_scale = math.sqrt(residual_scale) - - def forward(self, x): - for rproj, conv in zip(self.residual_proj, self.conv_layers): - residual = x - x = conv(x) - if self.skip_connections: - if rproj is not None: - residual = rproj(residual) - x = (x + residual) * self.residual_scale - return x - - -class Wav2VecPredictionsModel(nn.Module): - def __init__( - self, - in_dim, - out_dim, - prediction_steps, - n_negatives, - cross_sample_negatives, - sample_distance, - dropout, - offset, - balanced_classes, - infonce, - ): - super().__init__() - - self.n_negatives = n_negatives - self.cross_sample_negatives = cross_sample_negatives - self.sample_distance = sample_distance - self.project_to_steps = nn.ConvTranspose2d( - in_dim, out_dim, (1, prediction_steps) - ) - self.dropout = nn.Dropout(p=dropout) - self.offset = offset - self.balanced_classes = balanced_classes - self.infonce = infonce - - def sample_negatives(self, y): - bsz, fsz, tsz = y.shape - - y = y.transpose(0, 1) # BCT -> CBT - y = y.contiguous().view(fsz, -1) # CBT => C(BxT) - - cross_high = tsz * bsz - high = tsz if self.sample_distance is None else min(tsz, self.sample_distance) - assert high > 1 - - neg_idxs = torch.randint(low=0, high=high, size=(bsz, self.n_negatives * tsz)) - - with torch.no_grad(): - if self.n_negatives > 0: - tszs = ( - buffered_arange(tsz) - .unsqueeze(-1) - .expand(-1, self.n_negatives) - .flatten() - ) - - neg_idxs = torch.randint( - low=0, high=high - 1, size=(bsz, self.n_negatives * tsz) - ) - neg_idxs[neg_idxs >= tszs] += 1 - - if self.cross_sample_negatives > 0: - tszs = ( - buffered_arange(tsz) - .unsqueeze(-1) - .expand(-1, self.cross_sample_negatives) - .flatten() - ) - - cross_neg_idxs = torch.randint( - low=0, - high=cross_high - 1, - size=(bsz, self.cross_sample_negatives * tsz), - ) - cross_neg_idxs[cross_neg_idxs >= tszs] += 1 - - if self.n_negatives > 0: - for i in range(1, bsz): - neg_idxs[i] += i * high - else: - neg_idxs = cross_neg_idxs - - if self.cross_sample_negatives > 0 and self.n_negatives > 0: - neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1) - - negs = y[..., neg_idxs.view(-1)] - negs = negs.view( - fsz, bsz, self.n_negatives + self.cross_sample_negatives, tsz - ).permute( - 2, 1, 0, 3 - ) # to NxBxCxT - - return negs - - def forward(self, x, y): - - x = x.unsqueeze(-1) - x = self.project_to_steps(x) # BxCxTxS - x = self.dropout(x) - - negatives = self.sample_negatives(y) - y = y.unsqueeze(0) - targets = torch.cat([y, negatives], dim=0) # Copies x 
B x C x T - - copies = targets.size(0) - bsz, dim, tsz, steps = x.shape - steps = min(steps, tsz - self.offset) - - predictions = x.new( - bsz * copies * (tsz - self.offset + 1) * steps - - ((steps + 1) * steps // 2) * copies * bsz - ) - if self.infonce: - labels = predictions.new_full( - (predictions.shape[0] // copies,), 0, dtype=torch.long - ) - else: - labels = torch.zeros_like(predictions) - weights = ( - torch.full_like(labels, 1 / self.n_negatives) - if self.balanced_classes and not self.infonce - else None - ) - - start = end = 0 - for i in range(steps): - offset = i + self.offset - end = start + (tsz - offset) * bsz * copies - if self.infonce: - predictions[start:end] = torch.einsum( - "bct,nbct->tbn", x[..., :-offset, i], targets[..., offset:] - ).flatten() - else: - pos_num = (end - start) // copies - predictions[start:end] = torch.einsum( - "bct,nbct->nbt", x[..., :-offset, i], targets[..., offset:] - ).flatten() - labels[start : start + pos_num] = 1.0 - if weights is not None: - weights[start : start + pos_num] = 1.0 - start = end - assert end == predictions.numel(), "{} != {}".format(end, predictions.numel()) - - if self.infonce: - predictions = predictions.view(-1, copies) - else: - if weights is not None: - labels = (labels, weights) - - return predictions, labels - - -@register_model_architecture("wav2vec", "wav2vec") -def base_wav2vec_architecture(args): - conv_feature_layers = "[(512, 10, 5)]" - conv_feature_layers += " + [(512, 8, 4)]" - conv_feature_layers += " + [(512, 4, 2)] * 3" - args.conv_feature_layers = getattr(args, "conv_feature_layers", conv_feature_layers) - - args.conv_aggregator_layers = getattr( - args, "conv_aggregator_layers", "[(512, 3, 1)] * 9" - ) - - args.prediction_steps = getattr(args, "prediction_steps", 12) - args.num_negatives = getattr(args, "num_negatives", 1) - args.sample_distance = getattr(args, "sample_distance", None) - args.cross_sample_negatives = getattr(args, "cross_sample_negatives", 0) - - args.dropout = getattr(args, "dropout", 0.0) - args.dropout_features = getattr(args, "dropout_features", 0.0) - args.dropout_agg = getattr(args, "dropout_agg", 0.0) - args.encoder = getattr(args, "encoder", "cnn") - args.aggregator = getattr(args, "aggregator", "cnn") - - args.skip_connections_feat = getattr(args, "skip_connections_feat", False) - args.skip_connections_agg = getattr(args, "skip_connections_agg", False) - args.residual_scale = getattr(args, "residual_scale", 0.5) - - args.gru_dim = getattr(args, "gru_dim", 512) - - args.no_conv_bias = getattr(args, "no_conv_bias", False) - args.agg_zero_pad = getattr(args, "agg_zero_pad", False) - - args.log_compression = getattr(args, "log_compression", False) - - args.balanced_classes = getattr(args, "balanced_classes", False) - args.infonce = getattr(args, "infonce", False) - args.project_features = getattr(args, "project_features", "none") - - args.non_affine_group_norm = getattr(args, "non_affine_group_norm", False) - - args.offset = getattr(args, "offset", "auto") - - args.activation = getattr(args, "activation", "relu") - - args.vq_type = getattr(args, "vq_type", "none") - args.vq_vars = getattr(args, "vq_vars", 320) - args.vq_groups = getattr(args, "vq_groups", 2) - args.vq_dim = getattr(args, "vq_dim", 0) - args.vq_depth = getattr(args, "vq_depth", 1) - args.combine_groups = getattr(args, "combine_groups", False) - args.vq_temp = getattr(args, "vq_temp", "(2.0, 0.5, 0.999995)") - args.vq_gamma = getattr(args, "vq_gamma", 0.25) diff --git 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec2.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec2.py deleted file mode 100644 index 6a0f787601bcbcd682d410a2cd64a40645fdbeb2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec2.py +++ /dev/null @@ -1,1029 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import math -from typing import List, Tuple - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.data.data_utils import compute_mask_indices -from fairseq.models import BaseFairseqModel, register_model, register_model_architecture -from fairseq.modules import ( - Fp32GroupNorm, - Fp32LayerNorm, - GradMultiply, - GumbelVectorQuantizer, - LayerNorm, - MultiheadAttention, - SamePad, - TransposeLast, -) -from fairseq.modules.transformer_sentence_encoder import init_bert_params -from fairseq.utils import buffered_arange - - -@register_model("wav2vec2") -class Wav2Vec2Model(BaseFairseqModel): - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - - parser.add_argument( - "--extractor-mode", - choices=["default", "layer_norm"], - help="mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with --normalize)", - ) - - parser.add_argument( - "--encoder-layers", - type=int, - metavar="L", - help="num encoder layers in the transformer", - ) - parser.add_argument( - "--encoder-embed-dim", - type=int, - metavar="H", - help="encoder embedding dimension", - ) - parser.add_argument( - "--encoder-ffn-embed-dim", - type=int, - metavar="F", - help="encoder embedding dimension for FFN", - ) - parser.add_argument( - "--encoder-attention-heads", - type=int, - metavar="A", - help="num encoder attention heads", - ) - parser.add_argument( - "--activation-fn", - choices=utils.get_available_activation_fns(), - help="activation function to use", - ) - - parser.add_argument( - "--dropout", - type=float, - metavar="D", - help="dropout probability for the transformer", - ) - - parser.add_argument( - "--attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights", - ) - - parser.add_argument( - "--activation-dropout", - type=float, - metavar="D", - help="dropout probability after activation in FFN", - ) - - parser.add_argument( - "--final-dim", - type=int, - metavar="D", - help="project final representations and targets to this many dimensions", - ) - - parser.add_argument( - "--layer-norm-first", - action="store_true", - help="apply layernorm first in the transformer", - ) - - parser.add_argument( - "--encoder-layerdrop", - type=float, - help="probability of dropping a tarnsformer layer", - ) - - parser.add_argument( - "--conv-feature-layers", - type=str, - metavar="EXPR", - help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]", - ) - - parser.add_argument( - "--logit-temp", type=float, help="temperature to divide logits by" - ) - - parser.add_argument( - "--quantize-targets", action="store_true", help="use quantized targets" - ) - - parser.add_argument( - 
"--quantize-input", action="store_true", help="use quantized inputs" - ) - - parser.add_argument( - "--same-quantizer", - action="store_true", - help="use same quantizer for inputs and targets", - ) - - parser.add_argument( - "--feature-grad-mult", - type=float, - help="multiply feature extractor var grads by this", - ) - - parser.add_argument( - "--latent-vars", - type=int, - metavar="N", - help="number of latent variables V in each group of the codebook", - ) - - parser.add_argument( - "--latent-groups", - type=int, - metavar="N", - help="number of groups G of latent variables in the codebook", - ) - - parser.add_argument( - "--latent-dim", - type=int, - metavar="N", - help="if set, uses this dimensionality for latent variables. otherwise uses final_dim / latent_groups", - ) - - parser.add_argument("--mask-length", type=int, help="mask length") - - parser.add_argument( - "--mask-prob", type=float, help="probability of replacing a token with mask" - ) - - parser.add_argument( - "--mask-selection", - type=str, - choices=["static", "uniform", "normal", "poisson"], - help="how to choose masks", - ) - - parser.add_argument( - "--mask-other", - type=float, - help="secondary mask argument (used for more complex distributions), see help in compute_mask_indices", - ) - - parser.add_argument( - "--no-mask-overlap", - action="store_true", - help="whether to allow masks to overlap", - ) - - parser.add_argument( - "--mask-min-space", - type=int, - help="min space between spans (if no overlap is enabled)", - ) - - parser.add_argument( - "--mask-channel-length", - type=int, - help="repeat the mask indices multiple times", - ) - - parser.add_argument( - "--mask-channel-prob", - type=float, - help="probability of replacing a token with mask", - ) - - parser.add_argument( - "--mask-channel-selection", - type=str, - choices=["static", "uniform", "normal", "poisson"], - help="how to choose masks", - ) - - parser.add_argument( - "--mask-channel-other", - type=float, - help="secondary mask argument (used for more complex distributions), see help in compute_mask_indices", - ) - - parser.add_argument( - "--no-mask-channel-overlap", - action="store_true", - help="whether to allow masks to overlap", - ) - - parser.add_argument( - "--mask-channel-min-space", - type=int, - help="min space between spans (if no overlap is enabled)", - ) - - parser.add_argument( - "--dropout-input", - type=float, - metavar="D", - help="dropout to apply to the input (after feat extr)", - ) - - parser.add_argument( - "--dropout-features", - type=float, - metavar="D", - help="dropout to apply to the features (after feat extr)", - ) - - parser.add_argument( - "--num-negatives", type=int, metavar="N", help="number of negative examples" - ) - - parser.add_argument( - "--negatives-from-everywhere", - action="store_true", - help="sample negatives from everywhere, not just masked states", - ) - - parser.add_argument( - "--cross-sample-negatives", - type=int, - metavar="N", - help="num of cross sampled negatives", - ) - - parser.add_argument( - "--codebook-negatives", - type=int, - metavar="N", - help="num of codebook sampled negatives", - ) - - parser.add_argument( - "--conv-pos", - type=int, - metavar="N", - help="number of filters for convolutional positional embeddings", - ) - - parser.add_argument( - "--conv-pos-groups", - type=int, - metavar="N", - help="number of groups for convolutional positional embedding", - ) - - parser.add_argument( - "--latent-temp", - type=str, - metavar="D", - help="temperature for latent variable sampling. 
can be tuple of 3 values (start, end, decay)", - ) - - parser.add_argument( - "--target-glu", action="store_true", help="adds projection + glu to targets" - ) - - parser.add_argument( - "--conv-bias", action="store_true", help="include bias in conv encoder" - ) - - def __init__(self, args): - super().__init__() - self.args = args - - feature_enc_layers = eval(args.conv_feature_layers) - self.embed = feature_enc_layers[-1][0] - - self.feature_extractor = ConvFeatureExtractionModel( - conv_layers=feature_enc_layers, - dropout=0.0, - mode=args.extractor_mode, - conv_bias=args.conv_bias, - ) - - self.post_extract_proj = ( - nn.Linear(self.embed, args.encoder_embed_dim) - if self.embed != args.encoder_embed_dim and not args.quantize_input - else None - ) - - self.mask_prob = args.mask_prob - self.mask_selection = args.mask_selection - self.mask_other = args.mask_other - self.mask_length = args.mask_length - self.no_mask_overlap = args.no_mask_overlap - self.mask_min_space = args.mask_min_space - - self.mask_channel_prob = args.mask_channel_prob - self.mask_channel_selection = args.mask_channel_selection - self.mask_channel_other = args.mask_channel_other - self.mask_channel_length = args.mask_channel_length - self.no_mask_channel_overlap = args.no_mask_channel_overlap - self.mask_channel_min_space = args.mask_channel_min_space - - self.dropout_input = nn.Dropout(args.dropout_input) - self.dropout_features = nn.Dropout(args.dropout_features) - - self.feature_grad_mult = args.feature_grad_mult - - self.quantizer = None - self.input_quantizer = None - - self.n_negatives = args.num_negatives - self.cross_sample_negatives = args.cross_sample_negatives - self.codebook_negatives = args.codebook_negatives - self.negatives_from_everywhere = args.negatives_from_everywhere - - self.logit_temp = args.logit_temp - - final_dim = args.final_dim if args.final_dim > 0 else args.encoder_embed_dim - - if args.quantize_targets: - vq_dim = args.latent_dim if args.latent_dim > 0 else final_dim - self.quantizer = GumbelVectorQuantizer( - dim=self.embed, - num_vars=args.latent_vars, - temp=eval(args.latent_temp), - groups=args.latent_groups, - combine_groups=False, - vq_dim=vq_dim, - time_first=True, - ) - self.project_q = nn.Linear(vq_dim, final_dim) - else: - self.project_q = nn.Linear(self.embed, final_dim) - - if args.quantize_input: - if args.same_quantizer and self.quantizer is not None: - vq_dim = final_dim - self.input_quantizer = self.quantizer - else: - vq_dim = ( - args.latent_dim if args.latent_dim > 0 else args.encoder_embed_dim - ) - self.input_quantizer = GumbelVectorQuantizer( - dim=self.embed, - num_vars=args.latent_vars, - temp=eval(args.latent_temp), - groups=args.latent_groups, - combine_groups=False, - vq_dim=vq_dim, - time_first=True, - ) - self.project_inp = nn.Linear(vq_dim, args.encoder_embed_dim) - - self.mask_emb = nn.Parameter( - torch.FloatTensor(args.encoder_embed_dim).uniform_() - ) - - self.encoder = TransformerEncoder(args) - self.layer_norm = LayerNorm(self.embed) - - self.target_glu = None - if args.target_glu: - self.target_glu = nn.Sequential( - nn.Linear(final_dim, final_dim * 2), nn.GLU() - ) - - self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim) - - def upgrade_state_dict_named(self, state_dict, name): - super().upgrade_state_dict_named(state_dict, name) - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - return state_dict - - @classmethod - def build_model(cls, args, task=None): - """Build a new model instance.""" - - # make sure all arguments 
are present - base_architecture(args) - - return cls(args) - - def apply_mask(self, x, padding_mask): - B, T, C = x.shape - if self.mask_prob > 0: - mask_indices = compute_mask_indices( - (B, T), - padding_mask, - self.mask_prob, - self.mask_length, - self.mask_selection, - self.mask_other, - min_masks=2, - no_overlap=self.no_mask_overlap, - min_space=self.mask_min_space, - ) - mask_indices = torch.from_numpy(mask_indices).to(x.device) - x[mask_indices] = self.mask_emb - else: - mask_indices = None - - if self.mask_channel_prob > 0: - mask_channel_indices = compute_mask_indices( - (B, C), - None, - self.mask_channel_prob, - self.mask_channel_length, - self.mask_channel_selection, - self.mask_channel_other, - no_overlap=self.no_mask_channel_overlap, - min_space=self.mask_channel_min_space, - ) - mask_channel_indices = ( - torch.from_numpy(mask_channel_indices) - .to(x.device) - .unsqueeze(1) - .expand(-1, T, -1) - ) - x[mask_channel_indices] = 0 - - return x, mask_indices - - def sample_negatives(self, y, num): - - if self.n_negatives == 0 and self.cross_sample_negatives == 0: - return y.new(0) - - bsz, tsz, fsz = y.shape - y = y.view(-1, fsz) # BTC => (BxT)C - - cross_high = tsz * bsz - high = tsz - with torch.no_grad(): - assert high > 1, f"{bsz,tsz,fsz}" - - if self.n_negatives > 0: - tszs = ( - buffered_arange(num) - .unsqueeze(-1) - .expand(-1, self.n_negatives) - .flatten() - ) - - neg_idxs = torch.randint( - low=0, high=high - 1, size=(bsz, self.n_negatives * num) - ) - neg_idxs[neg_idxs >= tszs] += 1 - - if self.cross_sample_negatives > 0: - tszs = ( - buffered_arange(num) - .unsqueeze(-1) - .expand(-1, self.cross_sample_negatives) - .flatten() - ) - - cross_neg_idxs = torch.randint( - low=0, - high=cross_high - 1, - size=(bsz, self.cross_sample_negatives * num), - ) - cross_neg_idxs[cross_neg_idxs >= tszs] += 1 - - if self.n_negatives > 0: - for i in range(1, bsz): - neg_idxs[i] += i * high - else: - neg_idxs = cross_neg_idxs - - if self.cross_sample_negatives > 0 and self.n_negatives > 0: - neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1) - - negs = y[neg_idxs.view(-1)] - negs = negs.view( - bsz, num, self.n_negatives + self.cross_sample_negatives, fsz - ).permute( - 2, 0, 1, 3 - ) # to NxBxTxC - return negs, neg_idxs - - def compute_preds(self, x, y, negatives): - - neg_is_pos = (y == negatives).all(-1) - y = y.unsqueeze(0) - targets = torch.cat([y, negatives], dim=0) - - logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x) - - logits /= self.logit_temp - - if neg_is_pos.any(): - logits[1:][neg_is_pos] = float("-inf") - - return logits - - def forward(self, source, padding_mask=None, mask=True, features_only=False): - - if self.feature_grad_mult > 0: - features = self.feature_extractor(source) - if self.feature_grad_mult != 1.0: - features = GradMultiply.apply(features, self.feature_grad_mult) - else: - with torch.no_grad(): - features = self.feature_extractor(source) - - features_pen = features.float().pow(2).mean() - - features = features.transpose(1, 2) - features = self.layer_norm(features) - unmasked_features = features.clone() - - if padding_mask is not None: - extra = padding_mask.size(1) % features.size(1) - if extra > 0: - padding_mask = padding_mask[:, :-extra] - padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1) - padding_mask = padding_mask.all(-1) - - if self.post_extract_proj is not None: - features = self.post_extract_proj(features) - - features = self.dropout_input(features) - unmasked_features = 
self.dropout_features(unmasked_features) - - num_vars = None - code_ppl = None - prob_ppl = None - curr_temp = None - - if self.input_quantizer: - q = self.input_quantizer(features, produce_targets=False) - features = q["x"] - num_vars = q["num_vars"] - code_ppl = q["code_perplexity"] - prob_ppl = q["prob_perplexity"] - curr_temp = q["temp"] - features = self.project_inp(features) - - if mask: - x, mask_indices = self.apply_mask(features, padding_mask) - if mask_indices is not None: - y = unmasked_features[mask_indices].view( - unmasked_features.size(0), -1, unmasked_features.size(-1) - ) - else: - y = unmasked_features - else: - x = features - y = unmasked_features - mask_indices = None - - x = self.encoder(x, padding_mask=padding_mask) - - if features_only: - return {"x": x, "padding_mask": padding_mask} - - if self.quantizer: - q = self.quantizer(y, produce_targets=False) - y = q["x"] - num_vars = q["num_vars"] - code_ppl = q["code_perplexity"] - prob_ppl = q["prob_perplexity"] - curr_temp = q["temp"] - - y = self.project_q(y) - - if self.negatives_from_everywhere: - neg_cands, *_ = self.quantizer(unmasked_features, produce_targets=False) - negs, _ = self.sample_negatives(neg_cands, y.size(1)) - negs = self.project_q(negs) - - else: - negs, _ = self.sample_negatives(y, y.size(1)) - - if self.codebook_negatives > 0: - cb_negs = self.quantizer.sample_from_codebook( - y.size(0) * y.size(1), self.codebook_negatives - ) - cb_negs = cb_negs.view( - self.codebook_negatives, y.size(0), y.size(1), -1 - ) # order doesnt matter - cb_negs = self.project_q(cb_negs) - negs = torch.cat([negs, cb_negs], dim=0) - else: - y = self.project_q(y) - - if self.negatives_from_everywhere: - negs, _ = self.sample_negatives(unmasked_features, y.size(1)) - negs = self.project_q(negs) - else: - negs, _ = self.sample_negatives(y, y.size(1)) - - x = x[mask_indices].view(x.size(0), -1, x.size(-1)) - - if self.target_glu: - y = self.target_glu(y) - negs = self.target_glu(negs) - - x = self.final_proj(x) - x = self.compute_preds(x, y, negs) - - result = {"x": x, "padding_mask": padding_mask, "features_pen": features_pen} - - if prob_ppl is not None: - result["prob_perplexity"] = prob_ppl - result["code_perplexity"] = code_ppl - result["num_vars"] = num_vars - result["temp"] = curr_temp - - return result - - def quantize(self, x): - assert self.quantizer is not None - x = self.feature_extractor(x) - x = x.transpose(1, 2) - x = self.layer_norm(x) - return self.quantizer.forward_idx(x) - - def extract_features(self, source, padding_mask, mask=False): - res = self.forward(source, padding_mask, mask=mask, features_only=True) - return res["x"], res["padding_mask"] - - def get_logits(self, net_output): - logits = net_output["x"] - logits = logits.transpose(0, 2) - logits = logits.reshape(-1, logits.size(-1)) - return logits - - def get_targets(self, sample, net_output, expand_steps=True): - x = net_output["x"] - return x.new_zeros(x.size(1) * x.size(2), dtype=torch.long) - - def get_extra_losses(self, net_output): - pen = [] - - if "prob_perplexity" in net_output: - pen.append( - (net_output["num_vars"] - net_output["prob_perplexity"]) - / net_output["num_vars"] - ) - - if "features_pen" in net_output: - pen.append(net_output["features_pen"]) - - return pen - - def remove_pretraining_modules(self): - self.quantizer = None - self.project_q = None - self.target_glu = None - self.final_proj = None - - -class ConvFeatureExtractionModel(nn.Module): - def __init__( - self, - conv_layers: List[Tuple[int, int, int]], - dropout: float 
= 0.0, - mode: str = "default", - conv_bias: bool = False, - ): - super().__init__() - - assert mode in {"default", "layer_norm"} - - def block( - n_in, - n_out, - k, - stride, - is_layer_norm=False, - is_group_norm=False, - conv_bias=False, - ): - def make_conv(): - conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias) - nn.init.kaiming_normal_(conv.weight) - return conv - - assert ( - is_layer_norm and is_group_norm - ) == False, "layer norm and group norm are exclusive" - - if is_layer_norm: - return nn.Sequential( - make_conv(), - nn.Dropout(p=dropout), - nn.Sequential( - TransposeLast(), - Fp32LayerNorm(dim, elementwise_affine=True), - TransposeLast(), - ), - nn.GELU(), - ) - elif is_group_norm: - return nn.Sequential( - make_conv(), - nn.Dropout(p=dropout), - Fp32GroupNorm(dim, dim, affine=True), - nn.GELU(), - ) - else: - return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU()) - - in_d = 1 - self.conv_layers = nn.ModuleList() - for i, cl in enumerate(conv_layers): - assert len(cl) == 3, "invalid conv definition: " + str(cl) - (dim, k, stride) = cl - - self.conv_layers.append( - block( - in_d, - dim, - k, - stride, - is_layer_norm=mode == "layer_norm", - is_group_norm=mode == "default" and i == 0, - conv_bias=conv_bias, - ) - ) - in_d = dim - - def forward(self, x): - - # BxT -> BxCxT - x = x.unsqueeze(1) - - for conv in self.conv_layers: - x = conv(x) - - return x - - -class TransformerEncoder(nn.Module): - def __init__(self, args): - super().__init__() - - self.dropout = args.dropout - self.embedding_dim = args.encoder_embed_dim - - self.pos_conv = nn.Conv1d( - self.embedding_dim, - self.embedding_dim, - kernel_size=args.conv_pos, - padding=args.conv_pos // 2, - groups=args.conv_pos_groups, - ) - dropout = 0 - std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) - nn.init.normal_(self.pos_conv.weight, mean=0, std=std) - nn.init.constant_(self.pos_conv.bias, 0) - - self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) - self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU()) - - self.layers = nn.ModuleList( - [ - TransformerSentenceEncoderLayer( - embedding_dim=self.embedding_dim, - ffn_embedding_dim=args.encoder_ffn_embed_dim, - num_attention_heads=args.encoder_attention_heads, - dropout=self.dropout, - attention_dropout=args.attention_dropout, - activation_dropout=args.activation_dropout, - activation_fn=args.activation_fn, - layer_norm_first=args.layer_norm_first, - ) - for _ in range(args.encoder_layers) - ] - ) - - self.layer_norm_first = args.layer_norm_first - self.layer_norm = LayerNorm(self.embedding_dim) - self.layerdrop = args.encoder_layerdrop - - self.apply(init_bert_params) - - def forward(self, x, padding_mask=None): - x = self.extract_features(x, padding_mask) - - if self.layer_norm_first: - x = self.layer_norm(x) - - return x - - def extract_features(self, x, padding_mask=None): - - if padding_mask is not None: - x[padding_mask] = 0 - - x_conv = self.pos_conv(x.transpose(1, 2)) - x_conv = x_conv.transpose(1, 2) - x += x_conv - - if not self.layer_norm_first: - x = self.layer_norm(x) - - x = F.dropout(x, p=self.dropout, training=self.training) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - layer_results = [] - for i, layer in enumerate(self.layers): - dropout_probability = np.random.random() - if not self.training or (dropout_probability > self.layerdrop): - x, z = layer(x, self_attn_padding_mask=padding_mask, need_weights=False) - layer_results.append(x) - - # T x B 
x C -> B x T x C - x = x.transpose(0, 1) - - return x - - def max_positions(self): - """Maximum output length supported by the encoder.""" - return self.args.max_positions - - def upgrade_state_dict_named(self, state_dict, name): - """Upgrade a (possibly old) state dict for new versions of fairseq.""" - return state_dict - - -class TransformerSentenceEncoderLayer(nn.Module): - """ - Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained - models. - """ - - def __init__( - self, - embedding_dim: float = 768, - ffn_embedding_dim: float = 3072, - num_attention_heads: float = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - layer_norm_first: bool = False, - ) -> None: - - super().__init__() - # Initialize parameters - self.embedding_dim = embedding_dim - self.dropout = dropout - self.activation_dropout = activation_dropout - - # Initialize blocks - self.activation_fn = utils.get_activation_fn(activation_fn) - self.self_attn = MultiheadAttention( - self.embedding_dim, - num_attention_heads, - dropout=attention_dropout, - self_attention=True, - ) - - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(self.activation_dropout) - self.dropout3 = nn.Dropout(dropout) - - self.layer_norm_first = layer_norm_first - - # layer norm associated with the self attention layer - self.self_attn_layer_norm = LayerNorm(self.embedding_dim) - self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) - self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) - - # layer norm associated with the position wise feed-forward NN - self.final_layer_norm = LayerNorm(self.embedding_dim) - - def forward( - self, - x: torch.Tensor, - self_attn_mask: torch.Tensor = None, - self_attn_padding_mask: torch.Tensor = None, - need_weights: bool = False, - att_args=None, - ): - """ - LayerNorm is applied either before or after the self-attention/ffn - modules similar to the original Transformer imlementation. 
- """ - residual = x - - if self.layer_norm_first: - x = self.self_attn_layer_norm(x) - x, attn = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=self_attn_padding_mask, - need_weights=False, - attn_mask=self_attn_mask, - ) - x = self.dropout1(x) - x = residual + x - - residual = x - x = self.final_layer_norm(x) - x = self.activation_fn(self.fc1(x)) - x = self.dropout2(x) - x = self.fc2(x) - x = self.dropout3(x) - x = residual + x - else: - x, attn = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=self_attn_padding_mask, - need_weights=need_weights, - ) - - x = self.dropout1(x) - x = residual + x - - x = self.self_attn_layer_norm(x) - - residual = x - x = self.activation_fn(self.fc1(x)) - x = self.dropout2(x) - x = self.fc2(x) - x = self.dropout3(x) - x = residual + x - x = self.final_layer_norm(x) - - return x, attn - - -@register_model_architecture("wav2vec2", "wav2vec2") -def base_architecture(args): - args.extractor_mode = getattr(args, "extractor_mode", "default") - - args.encoder_layers = getattr(args, "encoder_layers", 12) - args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) - args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) - args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) - - args.activation_fn = getattr(args, "activation_fn", "gelu") - - args.dropout = getattr(args, "dropout", 0.1) - args.attention_dropout = getattr(args, "attention_dropout", 0.1) - args.activation_dropout = getattr(args, "activation_dropout", 0.0) - - args.final_dim = getattr(args, "final_dim", 0) - - args.layer_norm_first = getattr(args, "layer_norm_first", False) - args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) - - conv_feature_layers = "[(512, 10, 5)]" - conv_feature_layers += " + [(512, 8, 4)]" - conv_feature_layers += " + [(512, 4, 2)] * 3" - conv_feature_layers += " + [(512, 1, 1)]" - args.conv_feature_layers = getattr(args, "conv_feature_layers", conv_feature_layers) - - args.logit_temp = getattr(args, "logit_temp", 0.1) - - args.quantize_targets = getattr(args, "quantize_targets", False) - args.quantize_input = getattr(args, "quantize_input", False) - args.same_quantizer = getattr(args, "same_quantizer", False) - - args.feature_grad_mult = getattr(args, "feature_grad_mult", 1.0) - - args.latent_vars = getattr(args, "latent_vars", 320) - args.latent_groups = getattr(args, "latent_groups", 2) - args.latent_dim = getattr(args, "latent_dim", 0) - - args.mask_length = getattr(args, "mask_length", 10) - args.mask_prob = getattr(args, "mask_prob", 0.65) - args.mask_selection = getattr(args, "mask_selection", "static") - args.mask_other = getattr(args, "mask_other", 0) - args.no_mask_overlap = getattr(args, "no_mask_overlap", False) - args.mask_min_space = getattr(args, "mask_min_space", 1) - - args.mask_channel_length = getattr(args, "mask_channel_length", 10) - args.mask_channel_prob = getattr(args, "mask_channel_prob", 0) - args.mask_channel_selection = getattr(args, "mask_channel_selection", "static") - args.mask_channel_other = getattr(args, "mask_channel_other", 0) - args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False) - args.mask_channel_min_space = getattr(args, "mask_channel_min_space", 1) - - args.dropout_input = getattr(args, "dropout_input", 0) - args.dropout_features = getattr(args, "dropout_features", 0) - - args.num_negatives = getattr(args, "num_negatives", 100) - args.negatives_from_everywhere = getattr(args, "negatives_from_everywhere", False) - 
args.cross_sample_negatives = getattr(args, "cross_sample_negatives", 0) - args.codebook_negatives = getattr(args, "codebook_negatives", 0) - - args.conv_pos = getattr(args, "conv_pos", 128) - args.conv_pos_groups = getattr(args, "conv_pos_groups", 16) - - args.latent_temp = getattr(args, "latent_temp", "(2,0.5,0.999995)") - - args.target_glu = getattr(args, "target_glu", False) - - args.conv_bias = getattr(args, "conv_bias", False) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec2_asr.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec2_asr.py deleted file mode 100644 index 52ca9a8007b3e6236c7ac23bfa573990e549d15d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/models/wav2vec/wav2vec2_asr.py +++ /dev/null @@ -1,675 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import contextlib -import copy -import math - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import checkpoint_utils, tasks, utils -from fairseq.models import ( - BaseFairseqModel, - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqIncrementalDecoder, - register_model, - register_model_architecture, -) -from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerDecoderLayer - - -def add_common_args(parser): - parser.add_argument("--w2v-path", help="path to wav2vec 2.0 model") - parser.add_argument( - "--no-pretrained-weights", - action="store_true", - help="if true, does not load pretrained weights", - ) - parser.add_argument( - "--dropout-input", - type=float, - metavar="D", - help="dropout to apply to the input (after feat extr)", - ) - parser.add_argument( - "--final-dropout", - type=float, - metavar="D", - help="dropout after transformer and before final projection", - ) - parser.add_argument( - "--apply-mask", action="store_true", help="apply masking during fine-tuning" - ) - parser.add_argument( - "--dropout", - type=float, - metavar="D", - help="dropout probability inside wav2vec 2.0 model", - ) - parser.add_argument( - "--attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights inside wav2vec 2.0 model", - ) - parser.add_argument( - "--activation-dropout", - "--relu-dropout", - type=float, - metavar="D", - help="dropout probability after activation in FFN inside wav2vec 2.0 model", - ) - - parser.add_argument( - "--mask-length", type=int, help="repeat the mask indices multiple times" - ) - - parser.add_argument( - "--mask-prob", type=float, help="probability of replacing a token with mask" - ) - - parser.add_argument( - "--mask-selection", - type=str, - choices=["static", "uniform", "normal", "poisson"], - help="how to choose masks", - ) - - parser.add_argument( - "--mask-other", - type=float, - help="stdev of the mask length in case of 'normal' selection strategy", - ) - - parser.add_argument( - "--no-mask-overlap", - action="store_true", - help="whether to allow masks to overlap", - ) - - parser.add_argument( - "--mask-channel-length", type=int, help="repeat the mask indices multiple times" - ) - - parser.add_argument( - "--mask-channel-prob", - type=float, - help="probability of replacing a token with mask", - ) - - parser.add_argument( - "--mask-channel-selection", - 
type=str, - choices=["static", "uniform", "normal", "poisson"], - help="how to choose masks", - ) - - parser.add_argument( - "--mask-channel-other", - type=float, - help="stdev of the mask length in case of 'normal' selection strategy", - ) - - parser.add_argument( - "--no-mask-channel-overlap", - action="store_true", - help="whether to allow masks to overlap", - ) - - parser.add_argument( - "--freeze-finetune-updates", - default=0, - type=int, - help="dont finetune wav2vec for this many updates", - ) - - parser.add_argument( - "--feature-grad-mult", - default=None, - type=float, - help="reset feature grad mult in wav2vec 2.0 to this", - ) - - parser.add_argument( - "--layerdrop", - default=0.0, - type=float, - help="probability of dropping a layer in wav2vec 2.0", - ) - - -@register_model("wav2vec_ctc") -class Wav2VecCtc(BaseFairseqModel): - @staticmethod - def add_args(parser): - """Add model-specific arguments to the parser.""" - add_common_args(parser) - - def __init__(self, w2v_encoder, args): - super().__init__() - self.w2v_encoder = w2v_encoder - self.args = args - - def upgrade_state_dict_named(self, state_dict, name): - super().upgrade_state_dict_named(state_dict, name) - return state_dict - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - base_architecture(args) - w2v_encoder = Wav2VecEncoder(args, task.target_dictionary) - return cls(w2v_encoder, args) - - def get_normalized_probs(self, net_output, log_probs): - """Get normalized probabilities (or log probs) from a net's output.""" - - logits = net_output["encoder_out"] - if log_probs: - return utils.log_softmax(logits.float(), dim=-1) - else: - return utils.softmax(logits.float(), dim=-1) - - def forward(self, **kwargs): - x = self.w2v_encoder(**kwargs) - return x - - # def max_positions(self): - # return None - - -@register_model("wav2vec_seq2seq") -class TransformerModel(FairseqEncoderDecoderModel): - def __init__(self, args, encoder, decoder): - super().__init__(encoder, decoder) - - @staticmethod - def add_args(parser): - add_common_args(parser) - - parser.add_argument( - "--decoder-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension", - ) - parser.add_argument( - "--decoder-ffn-embed-dim", - type=int, - metavar="N", - help="decoder embedding dimension for FFN", - ) - parser.add_argument( - "--decoder-layers", type=int, metavar="N", help="num decoder layers" - ) - parser.add_argument( - "--decoder-layerdrop", - type=float, - metavar="D", - help="decoder layerdrop chance", - ) - parser.add_argument( - "--decoder-attention-heads", - type=int, - metavar="N", - help="num decoder attention heads", - ) - parser.add_argument( - "--decoder-learned-pos", - action="store_true", - help="use learned positional embeddings in the decoder", - ) - parser.add_argument( - "--decoder-normalize-before", - action="store_true", - help="apply layernorm before each decoder block", - ) - parser.add_argument( - "--no-token-positional-embeddings", - default=False, - action="store_true", - help="if set, disables positional embeddings (outside self attention)", - ) - - parser.add_argument( - "--decoder-dropout", - type=float, - metavar="D", - help="dropout probability in the decoder", - ) - parser.add_argument( - "--decoder-attention-dropout", - type=float, - metavar="D", - help="dropout probability for attention weights inside the decoder", - ) - parser.add_argument( - "--decoder-activation-dropout", - type=float, - metavar="D", - help="dropout probability after activation in FFN inside 
the decoder", - ) - - # fmt: on - - @classmethod - def build_model(cls, args, task): - """Build a new model instance.""" - - # make sure all arguments are present in older models - base_architecture(args) - - if not hasattr(args, "max_source_positions"): - args.max_source_positions = 2048 - if not hasattr(args, "max_target_positions"): - args.max_target_positions = 2048 - - src_dict, tgt_dict = task.source_dictionary, task.target_dictionary - - def build_embedding(dictionary, embed_dim): - num_embeddings = len(dictionary) - padding_idx = dictionary.pad() - emb = Embedding(num_embeddings, embed_dim, padding_idx) - return emb - - decoder_embed_tokens = build_embedding(tgt_dict, args.decoder_embed_dim) - - encoder = cls.build_encoder(args) - decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) - return TransformerModel(args, encoder, decoder) - - @classmethod - def build_encoder(cls, args): - return Wav2VecEncoder(args) - - @classmethod - def build_decoder(cls, args, tgt_dict, embed_tokens): - return TransformerDecoder(args, tgt_dict, embed_tokens) - - def forward(self, **kwargs): - encoder_out = self.encoder(tbc=False, **kwargs) - decoder_out = self.decoder(encoder_out=encoder_out, **kwargs) - return decoder_out - - def upgrade_state_dict_named(self, state_dict, name): - super().upgrade_state_dict_named(state_dict, name) - return state_dict - - -class Wav2VecEncoder(FairseqEncoder): - def __init__(self, args, tgt_dict=None): - self.apply_mask = args.apply_mask - - arg_overrides = { - "dropout": args.dropout, - "activation_dropout": args.activation_dropout, - "dropout_input": args.dropout_input, - "attention_dropout": args.attention_dropout, - "mask_length": args.mask_length, - "mask_prob": args.mask_prob, - "mask_selection": args.mask_selection, - "mask_other": args.mask_other, - "no_mask_overlap": args.no_mask_overlap, - "mask_channel_length": args.mask_channel_length, - "mask_channel_prob": args.mask_channel_prob, - "mask_channel_selection": args.mask_channel_selection, - "mask_channel_other": args.mask_channel_other, - "no_mask_channel_overlap": args.no_mask_channel_overlap, - "encoder_layerdrop": args.layerdrop, - "feature_grad_mult": args.feature_grad_mult, - } - - if getattr(args, "w2v_args", None) is None: - state = checkpoint_utils.load_checkpoint_to_cpu( - args.w2v_path, arg_overrides - ) - w2v_args = state["args"] - else: - state = None - w2v_args = args.w2v_args - - assert ( - args.normalize == w2v_args.normalize - ), "Fine-tuning works best when data normalization is the same" - - w2v_args.data = args.data - task = tasks.setup_task(w2v_args) - model = task.build_model(w2v_args) - - if state is not None and not args.no_pretrained_weights: - model.load_state_dict(state["model"], strict=True) - - model.remove_pretraining_modules() - - super().__init__(task.source_dictionary) - - d = w2v_args.encoder_embed_dim - - self.w2v_model = model - - self.final_dropout = nn.Dropout(args.final_dropout) - self.freeze_finetune_updates = args.freeze_finetune_updates - self.num_updates = 0 - - if tgt_dict is not None: - self.proj = Linear(d, len(tgt_dict)) - elif getattr(args, "decoder_embed_dim", d) != d: - self.proj = Linear(d, args.decoder_embed_dim) - else: - self.proj = None - - def set_num_updates(self, num_updates): - """Set the number of parameters updates.""" - super().set_num_updates(num_updates) - self.num_updates = num_updates - - def forward(self, source, padding_mask, tbc=True, **kwargs): - - w2v_args = { - "source": source, - "padding_mask": padding_mask, - "mask": 
self.apply_mask and self.training, - } - - ft = self.freeze_finetune_updates <= self.num_updates - - with torch.no_grad() if not ft else contextlib.ExitStack(): - x, padding_mask = self.w2v_model.extract_features(**w2v_args) - - if tbc: - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - x = self.final_dropout(x) - - if self.proj: - x = self.proj(x) - - return { - "encoder_out": x, # T x B x C - "encoder_padding_mask": padding_mask, # B x T - "padding_mask": padding_mask, - } - - def reorder_encoder_out(self, encoder_out, new_order): - if encoder_out["encoder_out"] is not None: - encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( - 1, new_order - ) - if encoder_out["encoder_padding_mask"] is not None: - encoder_out["encoder_padding_mask"] = encoder_out[ - "encoder_padding_mask" - ].index_select(0, new_order) - return encoder_out - - def max_positions(self): - """Maximum input length supported by the encoder.""" - return None - - def upgrade_state_dict_named(self, state_dict, name): - return state_dict - - -class TransformerDecoder(FairseqIncrementalDecoder): - """ - Transformer decoder consisting of *args.decoder_layers* layers. Each layer - is a :class:`TransformerDecoderLayer`. - - Args: - args (argparse.Namespace): parsed command-line arguments - dictionary (~fairseq.data.Dictionary): decoding dictionary - embed_tokens (torch.nn.Embedding): output embedding - no_encoder_attn (bool, optional): whether to attend to encoder outputs - (default: False). - """ - - def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): - super().__init__(dictionary) - - self.dropout = args.decoder_dropout - self.share_input_output_embed = args.share_decoder_input_output_embed - - input_embed_dim = embed_tokens.embedding_dim - embed_dim = args.decoder_embed_dim - self.output_embed_dim = args.decoder_embed_dim - args.encoder_embed_dim = embed_dim - - self.layerdrop = args.decoder_layerdrop - - padding_idx = embed_tokens.padding_idx - self.max_target_positions = args.max_target_positions - - self.embed_tokens = embed_tokens - self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim - - self.project_in_dim = ( - Linear(input_embed_dim, embed_dim, bias=False) - if embed_dim != input_embed_dim - else None - ) - - self.embed_positions = ( - PositionalEmbedding( - args.max_target_positions, - embed_dim, - padding_idx, - learned=args.decoder_learned_pos, - ) - if not args.no_token_positional_embeddings - else None - ) - - args = copy.deepcopy(args) - args.dropout = args.decoder_dropout - args.attention_dropout = args.decoder_attention_dropout - args.activation_dropout = args.decoder_activation_dropout - - self.layers = nn.ModuleList([]) - self.layers.extend( - [ - TransformerDecoderLayer(args, no_encoder_attn) - for _ in range(args.decoder_layers) - ] - ) - - if not self.share_input_output_embed: - self.embed_out = nn.Parameter( - torch.Tensor(len(dictionary), self.output_embed_dim) - ) - nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) - - if args.decoder_normalize_before and not getattr( - args, "no_decoder_final_norm", False - ): - self.layer_norm = LayerNorm(embed_dim) - else: - self.layer_norm = None - - def forward( - self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused - ): - """ - Args: - prev_output_tokens (LongTensor): previous decoder outputs of shape - `(batch, tgt_len)`, for teacher forcing - encoder_out (Tensor, optional): output from the encoder, used for - encoder-side attention - 
incremental_state (dict): dictionary used for storing state during - :ref:`Incremental decoding` - - Returns: - tuple: - - the decoder's output of shape `(batch, tgt_len, vocab)` - - a dictionary with any model-specific outputs - """ - prev_output_tokens = prev_output_tokens.long() - x, extra = self.extract_features( - prev_output_tokens, encoder_out, incremental_state - ) - x = self.output_layer(x) - return x, extra - - def extract_features( - self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused - ): - """ - Similar to *forward* but only return features. - - Returns: - tuple: - - the decoder's features of shape `(batch, tgt_len, embed_dim)` - - a dictionary with any model-specific outputs - """ - - # embed positions - positions = ( - self.embed_positions( - prev_output_tokens, incremental_state=incremental_state - ) - if self.embed_positions is not None - else None - ) - - if incremental_state is not None: - prev_output_tokens = prev_output_tokens[:, -1:] - if positions is not None: - positions = positions[:, -1:] - - # embed tokens and positions - x = self.embed_scale * self.embed_tokens(prev_output_tokens) - - if self.project_in_dim is not None: - x = self.project_in_dim(x) - - if positions is not None: - x += positions - x = F.dropout(x, p=self.dropout, training=self.training) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - attn = None - - inner_states = [x] - - # decoder layers - for layer in self.layers: - dropout_probability = np.random.random() - if not self.training or (dropout_probability > self.layerdrop): - x, attn, _ = layer( - x, - encoder_out["encoder_out"] if encoder_out is not None else None, - encoder_out["encoder_padding_mask"] - if encoder_out is not None - else None, - incremental_state, - self_attn_mask=self.buffered_future_mask(x) - if incremental_state is None - else None, - ) - inner_states.append(x) - - if self.layer_norm: - x = self.layer_norm(x) - - # T x B x C -> B x T x C - x = x.transpose(0, 1) - - return x, {"attn": attn, "inner_states": inner_states} - - def output_layer(self, features, **kwargs): - """Project features to the vocabulary size.""" - # project back to size of vocabulary - if self.share_input_output_embed: - return F.linear(features, self.embed_tokens.weight) - else: - return F.linear(features, self.embed_out) - - def max_positions(self): - """Maximum output length supported by the decoder.""" - if self.embed_positions is None: - return self.max_target_positions - return min(self.max_target_positions, self.embed_positions.max_positions) - - def buffered_future_mask(self, tensor): - dim = tensor.size(0) - if ( - not hasattr(self, "_future_mask") - or self._future_mask is None - or self._future_mask.device != tensor.device - or self._future_mask.size(0) < dim - ): - self._future_mask = torch.triu( - utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 - ) - return self._future_mask[:dim, :dim] - - def upgrade_state_dict_named(self, state_dict, name): - return state_dict - - -def Embedding(num_embeddings, embedding_dim, padding_idx): - m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) - nn.init.constant_(m.weight[padding_idx], 0) - return m - - -def Linear(in_features, out_features, bias=True): - m = nn.Linear(in_features, out_features, bias) - nn.init.xavier_uniform_(m.weight) - if bias: - nn.init.constant_(m.bias, 0.0) - return m - - -@register_model_architecture("wav2vec_ctc", "wav2vec_ctc") -def base_architecture(args): - 
args.no_pretrained_weights = getattr(args, "no_pretrained_weights", False) - args.dropout_input = getattr(args, "dropout_input", 0) - args.final_dropout = getattr(args, "final_dropout", 0) - args.apply_mask = getattr(args, "apply_mask", False) - args.dropout = getattr(args, "dropout", 0) - args.attention_dropout = getattr(args, "attention_dropout", 0) - args.activation_dropout = getattr(args, "activation_dropout", 0) - - args.mask_length = getattr(args, "mask_length", 10) - args.mask_prob = getattr(args, "mask_prob", 0.5) - args.mask_selection = getattr(args, "mask_selection", "static") - args.mask_other = getattr(args, "mask_other", 0) - args.no_mask_overlap = getattr(args, "no_mask_overlap", False) - args.mask_channel_length = getattr(args, "mask_channel_length", 10) - args.mask_channel_prob = getattr(args, "mask_channel_prob", 0.5) - args.mask_channel_selection = getattr(args, "mask_channel_selection", "static") - args.mask_channel_other = getattr(args, "mask_channel_other", 0) - args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False) - - args.freeze_finetune_updates = getattr(args, "freeze_finetune_updates", 0) - args.feature_grad_mult = getattr(args, "feature_grad_mult", 0) - args.layerdrop = getattr(args, "layerdrop", 0.0) - - -@register_model_architecture("wav2vec_seq2seq", "wav2vec_seq2seq") -def seq2seq_architecture(args): - args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) - args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) - args.decoder_layers = getattr(args, "decoder_layers", 10) - args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) - args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) - args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) - args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) - args.no_token_positional_embeddings = getattr( - args, "no_token_positional_embeddings", False - ) - args.decoder_dropout = getattr(args, "decoder_dropout", 0) - args.decoder_attention_dropout = getattr(args, "decoder_attention_dropout", 0) - args.decoder_activation_dropout = getattr(args, "decoder_activation_dropout", 0) - args.share_decoder_input_output_embed = getattr( - args, "share_decoder_input_output_embed", False - ) - - base_architecture(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/__init__.py deleted file mode 100644 index 4b7fcb95e1873b58edca5853fa3e058fde1eb1b2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/__init__.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-"""isort:skip_file""" - -from .adaptive_input import AdaptiveInput -from .adaptive_softmax import AdaptiveSoftmax -from .beamable_mm import BeamableMM -from .character_token_embedder import CharacterTokenEmbedder -from .conv_tbc import ConvTBC -from .cross_entropy import cross_entropy -from .downsampled_multihead_attention import DownsampledMultiHeadAttention -from .dynamic_convolution import DynamicConv, DynamicConv1dTBC -from .dynamic_crf_layer import DynamicCRF -from .fairseq_dropout import FairseqDropout, NpuFairseqDropout -from .fp32_group_norm import Fp32GroupNorm -from .gelu import gelu, gelu_accurate -from .grad_multiply import GradMultiply -from .gumbel_vector_quantizer import GumbelVectorQuantizer -from .kmeans_vector_quantizer import KmeansVectorQuantizer -from .layer_drop import LayerDropModuleList -from .layer_norm import Fp32LayerNorm, LayerNorm -from .learned_positional_embedding import LearnedPositionalEmbedding -from .lightweight_convolution import LightweightConv, LightweightConv1dTBC -from .linearized_convolution import LinearizedConvolution -from .multihead_attention import MultiheadAttention -from .positional_embedding import PositionalEmbedding -from .same_pad import SamePad -from .scalar_bias import ScalarBias -from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding -from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer -from .transformer_sentence_encoder import TransformerSentenceEncoder -from .transpose_last import TransposeLast -from .unfold import unfold1d -from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer -from .vggblock import VGGBlock - -__all__ = [ - "AdaptiveInput", - "AdaptiveSoftmax", - "BeamableMM", - "CharacterTokenEmbedder", - "ConvTBC", - "cross_entropy", - "DownsampledMultiHeadAttention", - "DynamicConv1dTBC", - "DynamicConv", - "DynamicCRF", - "FairseqDropout", - "NpuFairseqDropout", - "Fp32GroupNorm", - "Fp32LayerNorm", - "gelu", - "gelu_accurate", - "GradMultiply", - "GumbelVectorQuantizer", - "KmeansVectorQuantizer", - "LayerDropModuleList", - "LayerNorm", - "LearnedPositionalEmbedding", - "LightweightConv1dTBC", - "LightweightConv", - "LinearizedConvolution", - "MultiheadAttention", - "PositionalEmbedding", - "SamePad", - "ScalarBias", - "SinusoidalPositionalEmbedding", - "TransformerSentenceEncoderLayer", - "TransformerSentenceEncoder", - "TransformerDecoderLayer", - "TransformerEncoderLayer", - "TransposeLast", - "VGGBlock", - "unfold1d", -] diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/adaptive_input.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/adaptive_input.py deleted file mode 100644 index 446534a9f8b87337a4dd752944ea386ff7cf7965..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/adaptive_input.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- - -from typing import List - -import torch -from fairseq.modules.quant_noise import quant_noise -from torch import nn - - -class AdaptiveInput(nn.Module): - def __init__( - self, - vocab_size: int, - padding_idx: int, - initial_dim: int, - factor: float, - output_dim: int, - cutoff: List[int], - q_noise: float = 0, - qn_block_size: int = 8, - ): - super().__init__() - - if vocab_size > cutoff[-1]: - cutoff = cutoff + [vocab_size] - else: - assert ( - vocab_size == cutoff[-1] - ), "cannot specify cutoff larger than vocab size" - - self.cutoff = cutoff - self.embedding_dim = output_dim - self.padding_idx = padding_idx - - self.embeddings = nn.ModuleList() - for i in range(len(self.cutoff)): - prev = self.cutoff[i - 1] if i > 0 else 0 - size = self.cutoff[i] - prev - dim = int(initial_dim // (factor ** i)) - seq = nn.Sequential( - nn.Embedding(size, dim, self.padding_idx), - quant_noise( - nn.Linear(dim, output_dim, bias=False), q_noise, qn_block_size - ), - ) - - self.embeddings.append(seq) - self.padding_idx = None - self.padding_idx = padding_idx - - def init_weights(m): - if isinstance(m, nn.Embedding): - nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5) - nn.init.constant_(m.weight[padding_idx], 0) - elif hasattr(m, "weight"): - nn.init.xavier_uniform_(m.weight) - - self.apply(init_weights) - - self.register_buffer("_float_tensor", torch.FloatTensor(1)) - - def weights_for_band(self, band: int): - return self.embeddings[band][0].weight, self.embeddings[band][1].weight - - def forward(self, input: torch.Tensor): - result = self._float_tensor.new(input.shape + (self.embedding_dim,)) - for i in range(len(self.cutoff)): - mask = input.lt(self.cutoff[i]) - if i > 0: - mask.mul_(input.ge(self.cutoff[i - 1])) - chunk_input = input[mask] - self.cutoff[i - 1] - else: - chunk_input = input[mask] - if mask.any(): - result[mask] = self.embeddings[i](chunk_input) - return result diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/adaptive_softmax.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/adaptive_softmax.py deleted file mode 100644 index ae0c77ba0f6ee98501306d66cbc4a948b4ade0f7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/adaptive_softmax.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import functools -import operator - -import torch -import torch.nn.functional as F -from fairseq.modules.fairseq_dropout import FairseqDropout -from fairseq.modules.quant_noise import quant_noise -from torch import nn - - -class TiedLinear(nn.Module): - def __init__(self, weight, transpose): - super().__init__() - self.weight = weight - self.transpose = transpose - - def forward(self, input): - return F.linear(input, self.weight.t() if self.transpose else self.weight) - - -class TiedHeadModule(nn.Module): - def __init__(self, weights, input_dim, num_classes, q_noise, qn_block_size): - super().__init__() - tied_emb, _ = weights - self.num_words, emb_dim = tied_emb.size() - - self.word_proj = quant_noise( - TiedLinear(tied_emb, transpose=False), q_noise, qn_block_size - ) - if input_dim != emb_dim: - self.word_proj = nn.Sequential( - quant_noise( - nn.Linear(input_dim, emb_dim, bias=False), q_noise, qn_block_size - ), - self.word_proj, - ) - - self.class_proj = quant_noise( - nn.Linear(input_dim, num_classes, bias=False), q_noise, qn_block_size - ) - self.out_dim = self.num_words + num_classes - - self.register_buffer("_float_tensor", torch.FloatTensor(1)) - - def forward(self, input): - inp_sz = functools.reduce(operator.mul, input.shape[:-1], 1) - out = self._float_tensor.new(inp_sz, self.out_dim) - out[:, : self.num_words] = self.word_proj(input.view(inp_sz, -1)) - out[:, self.num_words :] = self.class_proj(input.view(inp_sz, -1)) - return out - - -class AdaptiveSoftmax(nn.Module): - """ - This is an implementation of the efficient softmax approximation for - graphical processing units (GPU), described in the paper "Efficient softmax - approximation for GPUs" (http://arxiv.org/abs/1609.04309). - """ - - def __init__( - self, - vocab_size, - input_dim, - cutoff, - dropout, - factor=4.0, - adaptive_inputs=None, - tie_proj=False, - q_noise=0, - qn_block_size=8, - ): - super().__init__() - - if vocab_size > cutoff[-1]: - cutoff = cutoff + [vocab_size] - else: - assert ( - vocab_size == cutoff[-1] - ), "cannot specify cutoff larger than vocab size" - - output_dim = cutoff[0] + len(cutoff) - 1 - - self.vocab_size = vocab_size - self.cutoff = cutoff - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.input_dim = input_dim - self.factor = factor - self.q_noise = q_noise - self.qn_block_size = qn_block_size - - self.lsm = nn.LogSoftmax(dim=1) - - if adaptive_inputs is not None: - self.head = TiedHeadModule( - adaptive_inputs.weights_for_band(0), - input_dim, - len(cutoff) - 1, - self.q_noise, - self.qn_block_size, - ) - else: - self.head = quant_noise( - nn.Linear(input_dim, output_dim, bias=False), - self.q_noise, - self.qn_block_size, - ) - - self._make_tail(adaptive_inputs, tie_proj) - - def init_weights(m): - if ( - hasattr(m, "weight") - and not isinstance(m, TiedLinear) - and not isinstance(m, TiedHeadModule) - ): - nn.init.xavier_uniform_(m.weight) - - self.apply(init_weights) - - self.register_buffer("version", torch.LongTensor([1])) - - def _make_tail(self, adaptive_inputs=None, tie_proj=False): - self.tail = nn.ModuleList() - for i in range(len(self.cutoff) - 1): - dim = int(self.input_dim // self.factor ** (i + 1)) - - tied_emb, tied_proj = ( - adaptive_inputs.weights_for_band(i + 1) - if adaptive_inputs is not None - else (None, None) - ) - - if tied_proj is not None: - if tie_proj: - proj = quant_noise( - TiedLinear(tied_proj, transpose=True), - self.q_noise, - self.qn_block_size, - ) - else: - proj = quant_noise( - 
nn.Linear(tied_proj.size(0), tied_proj.size(1), bias=False), - self.q_noise, - self.qn_block_size, - ) - else: - proj = quant_noise( - nn.Linear(self.input_dim, dim, bias=False), - self.q_noise, - self.qn_block_size, - ) - - if tied_emb is None: - out_proj = nn.Linear( - dim, self.cutoff[i + 1] - self.cutoff[i], bias=False - ) - else: - out_proj = TiedLinear(tied_emb, transpose=False) - - m = nn.Sequential( - proj, - nn.Dropout(self.dropout_module.p), - quant_noise(out_proj, self.q_noise, self.qn_block_size), - ) - - self.tail.append(m) - - def upgrade_state_dict_named(self, state_dict, name): - version_name = name + ".version" - if version_name not in state_dict: - raise Exception("This version of the model is no longer supported") - - def adapt_target(self, target): - """ - In order to be efficient, the AdaptiveSoftMax does not compute the - scores for all the word of the vocabulary for all the examples. It is - thus necessary to call the method adapt_target of the AdaptiveSoftMax - layer inside each forward pass. - """ - - target = target.view(-1) - new_target = [target.clone()] - target_idxs = [] - - for i in range(len(self.cutoff) - 1): - mask = target.ge(self.cutoff[i]).mul(target.lt(self.cutoff[i + 1])) - new_target[0][mask] = self.cutoff[0] + i - - if mask.any(): - target_idxs.append(mask.nonzero(as_tuple=False).squeeze(1)) - new_target.append(target[mask].add(-self.cutoff[i])) - else: - target_idxs.append(None) - new_target.append(None) - - return new_target, target_idxs - - def forward(self, input, target): - """ - Args: - input: (b x t x d) - target: (b x t) - Returns: - 2 lists: output for each cutoff section and new targets by cut off - """ - - input = input.contiguous().view(-1, input.size(-1)) - input = self.dropout_module(input) - - new_target, target_idxs = self.adapt_target(target) - output = [self.head(input)] - - for i in range(len(target_idxs)): - if target_idxs[i] is not None: - output.append(self.tail[i](input.index_select(0, target_idxs[i]))) - else: - output.append(None) - - return output, new_target - - def get_log_prob(self, input, target): - """ - Computes the log probabilities for all the words of the vocabulary, - given a 2D tensor of hidden vectors. 
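The adapt_target docstring above explains why the remapping is needed; to make the bookkeeping concrete, here is a standalone restatement with toy numbers (illustrative cutoffs, not a fairseq call): every target that falls in tail band i is collapsed to head class cutoff[0] + i, and cutoff[i] is subtracted to obtain its band-relative index for the tail classifier.

import torch

cutoff = [5, 20, 100]                 # head covers ids 0..4 plus 2 tail classes
target = torch.tensor([3, 7, 42])

head_target = target.clone()
tail_targets = []
for i in range(len(cutoff) - 1):
    mask = (target >= cutoff[i]) & (target < cutoff[i + 1])
    head_target[mask] = cutoff[0] + i               # collapse band to one head class
    tail_targets.append(target[mask] - cutoff[i])   # band-relative ids

print(head_target.tolist())                  # [3, 5, 6]
print([t.tolist() for t in tail_targets])    # [[2], [22]]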
- """ - - bsz, length, dim = input.size() - input = input.contiguous().view(-1, dim) - - if target is not None: - _, target_idxs = self.adapt_target(target) - else: - target_idxs = None - - head_y = self.head(input) - log_probs = head_y.new_zeros(input.size(0), self.vocab_size) - - head_sz = self.cutoff[0] + len(self.tail) - log_probs[:, :head_sz] = self.lsm(head_y) - tail_priors = log_probs[:, self.cutoff[0] : head_sz].clone() - - for i in range(len(self.tail)): - start = self.cutoff[i] - end = self.cutoff[i + 1] - - if target_idxs is None: - tail_out = log_probs[:, start:end] - tail_out.copy_(self.tail[i](input)) - log_probs[:, start:end] = self.lsm(tail_out).add_( - tail_priors[:, i, None] - ) - elif target_idxs[i] is not None: - idxs = target_idxs[i] - tail_out = log_probs[idxs, start:end] - tail_out.copy_(self.tail[i](input[idxs])) - log_probs[idxs, start:end] = self.lsm(tail_out).add_( - tail_priors[idxs, i, None] - ) - - log_probs = log_probs.view(bsz, length, -1) - return log_probs diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/beamable_mm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/beamable_mm.py deleted file mode 100644 index eff1a4607f600c71210e6b914985dc48731aae86..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/beamable_mm.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn - - -class BeamableMM(nn.Module): - """This module provides an optimized MM for beam decoding with attention. - - It leverage the fact that the source-side of the input is replicated beam - times and the target-side of the input is of width one. This layer speeds up - inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)} - with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}. 
- """ - - def __init__(self, beam_size=None): - super(BeamableMM, self).__init__() - self.beam_size = beam_size - - def forward(self, input1, input2): - if ( - not self.training - and self.beam_size is not None # test mode - and input1.dim() == 3 # beam size is set - and input1.size(1) # only support batched input - == 1 # single time step update - ): - bsz, beam = input1.size(0), self.beam_size - - # bsz x 1 x nhu --> bsz/beam x beam x nhu - input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1) - - # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu - input2 = input2.unfold(0, beam, beam)[:, :, :, 0] - - # use non batched operation if bsz = beam - if input1.size(0) == 1: - output = torch.mm(input1[0, :, :], input2[0, :, :]) - else: - output = input1.bmm(input2) - return output.view(bsz, 1, -1) - else: - return input1.bmm(input2) - - def set_beam_size(self, beam_size): - self.beam_size = beam_size diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/character_token_embedder.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/character_token_embedder.py deleted file mode 100644 index 181221b61b9f76453b67e3b848b198620dce912c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/character_token_embedder.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -from typing import List, Tuple - -import torch -import torch.nn.functional as F -from fairseq.data import Dictionary -from torch import nn - - -CHAR_PAD_IDX = 0 -CHAR_EOS_IDX = 257 - - -logger = logging.getLogger(__name__) - - -class CharacterTokenEmbedder(torch.nn.Module): - def __init__( - self, - vocab: Dictionary, - filters: List[Tuple[int, int]], - char_embed_dim: int, - word_embed_dim: int, - highway_layers: int, - max_char_len: int = 50, - char_inputs: bool = False, - ): - super(CharacterTokenEmbedder, self).__init__() - - self.onnx_trace = False - self.embedding_dim = word_embed_dim - self.max_char_len = max_char_len - self.char_embeddings = nn.Embedding(257, char_embed_dim, padding_idx=0) - self.symbol_embeddings = nn.Parameter(torch.FloatTensor(2, word_embed_dim)) - self.eos_idx, self.unk_idx = 0, 1 - self.char_inputs = char_inputs - - self.convolutions = nn.ModuleList() - for width, out_c in filters: - self.convolutions.append( - nn.Conv1d(char_embed_dim, out_c, kernel_size=width) - ) - - last_dim = sum(f[1] for f in filters) - - self.highway = Highway(last_dim, highway_layers) if highway_layers > 0 else None - - self.projection = nn.Linear(last_dim, word_embed_dim) - - assert ( - vocab is not None or char_inputs - ), "vocab must be set if not using char inputs" - self.vocab = None - if vocab is not None: - self.set_vocab(vocab, max_char_len) - - self.reset_parameters() - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - def set_vocab(self, vocab, max_char_len): - word_to_char = torch.LongTensor(len(vocab), max_char_len) - - truncated = 0 - for i in range(len(vocab)): - if i < vocab.nspecial: - char_idxs = [0] * max_char_len - else: - chars = vocab[i].encode() - # +1 for padding - char_idxs = [c + 1 for c in chars] + [0] * (max_char_len - len(chars)) - if len(char_idxs) > max_char_len: - truncated += 1 - char_idxs = char_idxs[:max_char_len] - word_to_char[i] = 
torch.LongTensor(char_idxs) - - if truncated > 0: - logger.info( - "truncated {} words longer than {} characters".format( - truncated, max_char_len - ) - ) - - self.vocab = vocab - self.word_to_char = word_to_char - - @property - def padding_idx(self): - return Dictionary().pad() if self.vocab is None else self.vocab.pad() - - def reset_parameters(self): - nn.init.xavier_normal_(self.char_embeddings.weight) - nn.init.xavier_normal_(self.symbol_embeddings) - nn.init.xavier_uniform_(self.projection.weight) - - nn.init.constant_( - self.char_embeddings.weight[self.char_embeddings.padding_idx], 0.0 - ) - nn.init.constant_(self.projection.bias, 0.0) - - def forward( - self, - input: torch.Tensor, - ): - if self.char_inputs: - chars = input.view(-1, self.max_char_len) - pads = chars[:, 0].eq(CHAR_PAD_IDX) - eos = chars[:, 0].eq(CHAR_EOS_IDX) - if eos.any(): - if self.onnx_trace: - chars = torch.where(eos.unsqueeze(1), chars.new_zeros(1), chars) - else: - chars[eos] = 0 - - unk = None - else: - flat_words = input.view(-1) - chars = self.word_to_char[flat_words.type_as(self.word_to_char)].type_as( - input - ) - pads = flat_words.eq(self.vocab.pad()) - eos = flat_words.eq(self.vocab.eos()) - unk = flat_words.eq(self.vocab.unk()) - - word_embs = self._convolve(chars) - if self.onnx_trace: - if pads.any(): - word_embs = torch.where( - pads.unsqueeze(1), word_embs.new_zeros(1), word_embs - ) - if eos.any(): - word_embs = torch.where( - eos.unsqueeze(1), self.symbol_embeddings[self.eos_idx], word_embs - ) - if unk is not None and unk.any(): - word_embs = torch.where( - unk.unsqueeze(1), self.symbol_embeddings[self.unk_idx], word_embs - ) - else: - if pads.any(): - word_embs[pads] = 0 - if eos.any(): - word_embs[eos] = self.symbol_embeddings[self.eos_idx] - if unk is not None and unk.any(): - word_embs[unk] = self.symbol_embeddings[self.unk_idx] - - return word_embs.view(input.size()[:2] + (-1,)) - - def _convolve( - self, - char_idxs: torch.Tensor, - ): - char_embs = self.char_embeddings(char_idxs) - char_embs = char_embs.transpose(1, 2) # BTC -> BCT - - conv_result = [] - - for conv in self.convolutions: - x = conv(char_embs) - x, _ = torch.max(x, -1) - x = F.relu(x) - conv_result.append(x) - - x = torch.cat(conv_result, dim=-1) - - if self.highway is not None: - x = self.highway(x) - x = self.projection(x) - - return x - - -class Highway(torch.nn.Module): - """ - A `Highway layer `_. - Adopted from the AllenNLP implementation. - """ - - def __init__(self, input_dim: int, num_layers: int = 1): - super(Highway, self).__init__() - self.input_dim = input_dim - self.layers = nn.ModuleList( - [nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)] - ) - self.activation = nn.ReLU() - - self.reset_parameters() - - def reset_parameters(self): - for layer in self.layers: - # As per comment in AllenNLP: - # We should bias the highway layer to just carry its input forward. We do that by - # setting the bias on `B(x)` to be positive, because that means `g` will be biased to - # be high, so we will carry the input forward. The bias on `B(x)` is the second half - # of the bias vector in each Linear layer. 
- nn.init.constant_(layer.bias[self.input_dim :], 1) - - nn.init.constant_(layer.bias[: self.input_dim], 0) - nn.init.xavier_normal_(layer.weight) - - def forward(self, x: torch.Tensor): - for layer in self.layers: - projection = layer(x) - proj_x, gate = projection.chunk(2, dim=-1) - proj_x = self.activation(proj_x) - gate = torch.sigmoid(gate) - x = gate * x + (gate.new_tensor([1]) - gate) * proj_x - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/conv_tbc.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/conv_tbc.py deleted file mode 100644 index 2dc46c4b9baf93c54234df0c61e8e7fd6390ee63..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/conv_tbc.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -from torch.nn.modules.utils import _single - - -class ConvTBC(torch.nn.Module): - """1D convolution over an input of shape (time x batch x channel) - - The implementation uses gemm to perform the convolution. This implementation - is faster than cuDNN for small kernel sizes. - """ - - def __init__(self, in_channels, out_channels, kernel_size, padding=0): - super(ConvTBC, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = _single(kernel_size) - self.padding = _single(padding) - - self.weight = torch.nn.Parameter( - torch.Tensor(self.kernel_size[0], in_channels, out_channels) - ) - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) - - def forward(self, input): - return torch.conv_tbc( - input.contiguous(), self.weight, self.bias, self.padding[0] - ) - - def __repr__(self): - s = ( - "{name}({in_channels}, {out_channels}, kernel_size={kernel_size}" - ", padding={padding}" - ) - if self.bias is None: - s += ", bias=False" - s += ")" - return s.format(name=self.__class__.__name__, **self.__dict__) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/cross_entropy.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/cross_entropy.py deleted file mode 100644 index 0d2beb44bbf8522ea4002fa6108c41e6a4b5ccc6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/cross_entropy.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
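The Highway layer deleted above biases its gate so the block initially carries its input forward; compactly, each layer emits a transform half and a gate half and returns g * x + (1 - g) * relu(proj). A standalone sketch with illustrative sizes:

import torch
import torch.nn as nn

dim = 4
layer = nn.Linear(dim, dim * 2)            # [transform | gate] halves
nn.init.constant_(layer.bias[dim:], 1)     # bias the gate toward carrying x

x = torch.randn(3, dim)
proj, gate = layer(x).chunk(2, dim=-1)
g = torch.sigmoid(gate)
y = g * x + (1 - g) * torch.relu(proj)
print(y.shape)                              # torch.Size([3, 4])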
- -import logging - -import torch -import torch.nn.functional as F - - -logger = logging.getLogger(__name__) - - -def _cross_entropy_pytorch(logits, target, ignore_index=None, reduction="mean"): - lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) - return F.nll_loss( - lprobs, - target, - ignore_index=ignore_index, - reduction=reduction, - ) - - -try: - import xentropy_cuda - from apex.contrib import xentropy - - logger.info("using fused cross entropy") - - def cross_entropy(logits, target, ignore_index=-100, reduction="mean"): - if logits.device == torch.device("cpu"): - return _cross_entropy_pytorch(logits, target, ignore_index, reduction) - else: - half_to_float = logits.dtype == torch.half - losses = xentropy.SoftmaxCrossEntropyLoss.apply( - logits, - target, - 0.0, - ignore_index, - half_to_float, - ) - if reduction == "sum": - return losses.sum() - elif reduction == "mean": - if ignore_index >= 0: - return losses.sum() / target.ne(ignore_index).sum() - else: - return losses.mean() - elif reduction == "none": - return losses - else: - raise NotImplementedError - - -except ImportError: - - def cross_entropy(logits, target, ignore_index=-100, reduction="mean"): - return _cross_entropy_pytorch(logits, target, ignore_index, reduction) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/cuda_utils.cu b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/cuda_utils.cu deleted file mode 100644 index 516f1d92440e9e2c092f122e45d81b45cb135602..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/cuda_utils.cu +++ /dev/null @@ -1,203 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - - -template -constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { - return (a + b - 1) / b; -} - - -template -__inline__ __device__ -void zeroSharedMem(scalar_t* data) { - /* - Given an array of length FS + SB, zero out the first padding_l and last - (FS - padding_l) values in the array - */ - - int tid = threadIdx.x; - - if (FS < SB) { - - // zero all if we have enough threads in a block to do all of them - if (tid < padding_l || tid > SB - FS + padding_l - 1) { - data[tid] = scalar_t(0.0); - } - } else { - - // otherwise zero out one block at a time - const int numIterations = divUp(FS, SB); - for (int i = 0; i < numIterations; i++) { - int offset = i * SB; - if (tid + offset < padding_l) { - data[tid + offset] = scalar_t(0.0); - } else if (tid + offset < FS) { - data[SB + tid + offset] = scalar_t(0.0); - } - } - } -} - -template -__inline__ __device__ -scalar_t warpReduce(scalar_t data) { - /* - Reduce an array within each warp. After processing all values in warp will - caontain the sum of all original values in that warp. - - data - pointer to data to reduce - */ - data += __shfl_xor_sync(SHFL_MASK, data, 16); - data += __shfl_xor_sync(SHFL_MASK, data, 8); - data += __shfl_xor_sync(SHFL_MASK, data, 4); - data += __shfl_xor_sync(SHFL_MASK, data, 2); - data += __shfl_xor_sync(SHFL_MASK, data, 1); - return data; -} - -template -__inline__ __device__ -scalar_t blockReduce(scalar_t data) { - /* - Reduce an entire array on the block level. After processing, the - first value in the array will contain the reduced sum. 
- - data - pointer to data to reduce - */ - - static __shared__ scalar_t warpSum[32]; - const int tid = threadIdx.x; - int wid = tid / 32; - int lane = tid % 32; - - __syncthreads(); - - // reduce each warp then write to shared memory - scalar_t sum = warpReduce(data); - if (lane == 0) { - warpSum[wid] = sum; - } - - __syncthreads(); - - scalar_t v; - // perform final sum of partial warp sums - if (tid < blockDim.x / 32) { - v = warpSum[lane]; - } else { - v = scalar_t(0.0); - } - - if (wid == 0) { - v = warpReduce(v); - } - __syncthreads(); - - return v; -} - -void checkCudaStatus(cudaError_t status, int lineNumber = -1) { - - if (status != cudaSuccess) { - std::cout << cudaGetErrorString(status) - << " at line " << lineNumber << std::endl; - std::cout << "Exiting" << std::endl; - exit(1); - } -} - -template -__device__ -void load_input_to_shared(const scalar_t* input, // global memory - int inputOffset, int sequenceLength, - int iteration, int numIterations, - bool no_prev, scalar_t* output /* shared memory */) { - /* - Load a block size of input into shared memory with - right and left overhang of total size FS. If previously - loaded memory, overlap will be shifted over to reduce - global memory access - - input - pointer to start of channel sequence - inputOffset - how far in the sequence to start loading - sequenceLength - total length of sequence - iteration - which block of sequence we are loading - numIterations - total number of blocks to load - no_prev - whether to load the whole block if the previous block - wasn't loaded - output - shared memory to write input to - */ - - const int tid = threadIdx.x; - - // Load the left "overhang" of input - if (iteration > 0) { - if (padding_l < SB) { - - // load all at once - if (tid < padding_l) { - output[tid] = (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB]; - } - } else { - - // load in chunks of size SB - int numIterations = divUp(padding_l, SB); - for (int i = 0; i < numIterations; i++) { - int offset = i * SB; - if ((tid + offset) < padding_l) { - output[tid + offset] = (no_prev) ? input[inputOffset - padding_l + tid + offset] : output[tid + offset + SB]; - } - } - } - } - - // Load the right "overhang" of input - if (iteration < (numIterations - 1)) { - const int elementsLeft = sequenceLength - (iteration+1) * SB; - - if ((FS - padding_l) < SB) { - - // load all at once - if (tid < (FS - padding_l)) { - output[padding_l + SB + tid] = (tid < elementsLeft) ? input[inputOffset + SB + tid] : scalar_t(0.0); - } - } else { - - // load in chunks of size SB - int numIterations = divUp(FS - padding_l, SB); - for (int i = 0; i < numIterations; i++) { - int offset = i * SB; - if ((tid + offset) < (FS - padding_l)) { - output[padding_l + SB + tid + offset] = ((tid + offset) < elementsLeft) ? input[inputOffset + SB + tid + offset] : scalar_t(0.0); - } - } - } - } - - // We should also clear out the right "overhang" - if (iteration == (numIterations - 1)) { - if ((FS - padding_l) < SB) { - - // clear out all at once - if (tid < (FS - padding_l)) { - output[padding_l + SB + tid] = scalar_t(0.0); - } - } else { - - // clear in chunks of size SB - int numIterations = divUp(FS - padding_l, SB); - for (int i = 0; i < numIterations; i++) { - int offset = i * SB; - if ((tid + offset) < (FS - padding_l)) { - output[padding_l + SB + tid + offset] = scalar_t(0.0); - } - } - } - } - output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) ? 
input[inputOffset + tid] : scalar_t(0.0); -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/downsampled_multihead_attention.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/downsampled_multihead_attention.py deleted file mode 100644 index 2cdece3f7fca2b830eb72999ce93f58667ed595b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/downsampled_multihead_attention.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -# - -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq.modules.fairseq_dropout import FairseqDropout -from fairseq.modules.scalar_bias import scalar_bias - - -class SingleHeadAttention(nn.Module): - """ - Single-head attention that supports Gating and Downsampling - """ - - def __init__( - self, - out_channels, - embed_dim, - head_dim, - head_index, - dropout=0.0, - bias=True, - project_input=True, - gated=False, - downsample=False, - num_heads=1, - ): - super().__init__() - self.embed_dim = embed_dim - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.head_index = head_index - self.head_dim = head_dim - self.project_input = project_input - self.gated = gated - self.downsample = downsample - self.num_heads = num_heads - self.projection = None - - k_layers = [] - v_layers = [] - if self.downsample: - k_layers.append(Downsample(self.head_index)) - v_layers.append(Downsample(self.head_index)) - out_proj_size = self.head_dim - else: - out_proj_size = self.head_dim * self.num_heads - if self.gated: - k_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias)) - self.in_proj_q = GatedLinear(self.embed_dim, out_proj_size, bias=bias) - v_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias)) - else: - k_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias)) - self.in_proj_q = Linear(self.embed_dim, out_proj_size, bias=bias) - v_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias)) - - self.in_proj_k = nn.Sequential(*k_layers) - self.in_proj_v = nn.Sequential(*v_layers) - - if self.downsample: - self.out_proj = Linear(out_proj_size, self.head_dim, bias=bias) - else: - self.out_proj = Linear(out_proj_size, out_channels, bias=bias) - - self.scaling = self.head_dim ** -0.5 - - def forward( - self, - query, - key, - value, - mask_future_timesteps=False, - key_padding_mask=None, - use_scalar_bias=False, - ): - """Input shape: Time x Batch x Channel - Self-attention can be implemented by passing in the same arguments for - query, key and value. Future timesteps can be masked with the - `mask_future_timesteps` argument. Padding elements can be excluded from - the key by passing a binary ByteTensor (`key_padding_mask`) with shape: - batch x src_len, where padding elements are indicated by 1s. 
- """ - src_len, bsz, out_channels = key.size() - tgt_len = query.size(0) - assert list(query.size()) == [tgt_len, bsz, out_channels] - assert key.size() == value.size() - - if key_padding_mask is not None: - assert key_padding_mask.size(0) == bsz - assert key_padding_mask.size(1) == src_len - - if self.downsample: - size = bsz - else: - size = bsz * self.num_heads - - k = key - v = value - q = query - if self.project_input: - q = self.in_proj_q(q) - k = self.in_proj_k(k) - v = self.in_proj_v(v) - src_len = k.size()[0] - q *= self.scaling - - if not self.downsample: - q = q.view(tgt_len, size, self.head_dim) - k = k.view(src_len, size, self.head_dim) - v = v.view(src_len, size, self.head_dim) - - q = q.transpose(0, 1) - k = k.transpose(0, 1) - v = v.transpose(0, 1) - - attn_weights = torch.bmm(q, k.transpose(1, 2)) - if mask_future_timesteps: - assert ( - query.size() == key.size() - ), "mask_future_timesteps only applies to self-attention" - attn_weights *= torch.tril( - attn_weights.data.new([1]).expand(tgt_len, tgt_len).clone(), - diagonal=-1, - )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0) - attn_weights += torch.triu( - attn_weights.data.new([-math.inf]).expand(tgt_len, tgt_len).clone(), - diagonal=0, - )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0) - tgt_size = tgt_len - if use_scalar_bias: - attn_weights = scalar_bias(attn_weights, 2) - v = scalar_bias(v, 1) - tgt_size += 1 - - if key_padding_mask is not None: - # don't attend to padding symbols - if key_padding_mask.max() > 0: - if self.downsample: - attn_weights = attn_weights.view(bsz, 1, tgt_len, src_len) - else: - attn_weights = attn_weights.view( - size, self.num_heads, tgt_len, src_len - ) - attn_weights = attn_weights.masked_fill( - key_padding_mask.unsqueeze(1).unsqueeze(2), - -math.inf, - ) - attn_weights = attn_weights.view(size, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) - attn_weights = self.dropout_module(attn_weights) - - attn = torch.bmm(attn_weights, v) - if self.downsample: - attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.head_dim) - else: - attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim) - - attn = self.out_proj(attn) - - return attn, attn_weights - - -class DownsampledMultiHeadAttention(nn.ModuleList): - """ - Multi-headed attention with Gating and Downsampling - """ - - def __init__( - self, - out_channels, - embed_dim, - num_heads, - dropout=0.0, - bias=True, - project_input=True, - gated=False, - downsample=False, - ): - self.embed_dim = embed_dim - self.num_heads = num_heads - self.head_dim = embed_dim // num_heads - self.downsample = downsample - self.gated = gated - self.project_input = project_input - assert self.head_dim * num_heads == embed_dim - - if self.downsample: - attention_heads = [] - for index in range(self.num_heads): - attention_heads.append( - SingleHeadAttention( - out_channels, - self.embed_dim, - self.head_dim, - index, - dropout, - bias, - self.project_input, - self.gated, - self.downsample, - self.num_heads, - ) - ) - super().__init__(modules=attention_heads) - self.out_proj = Linear(embed_dim, out_channels, bias=bias) - else: - # either we have a list of attention heads, or just one attention head - # if not being downsampled, we can do the heads with one linear layer instead of separate ones - super().__init__() - self.attention_module = SingleHeadAttention( - out_channels, - self.embed_dim, - self.head_dim, - 1, - dropout, - bias, - self.project_input, - self.gated, - 
self.downsample, - self.num_heads, - ) - - def forward( - self, - query, - key, - value, - mask_future_timesteps=False, - key_padding_mask=None, - use_scalar_bias=False, - ): - src_len, bsz, embed_dim = key.size() - tgt_len = query.size(0) - assert embed_dim == self.embed_dim - assert list(query.size()) == [tgt_len, bsz, embed_dim] - assert key.size() == value.size() - - tgt_size = tgt_len - if use_scalar_bias: - tgt_size += 1 - - attn = [] - attn_weights = [] - if self.downsample: - for attention_head_number in range(self.num_heads): - # call the forward of each attention head - _attn, _attn_weight = self[attention_head_number]( - query, - key, - value, - mask_future_timesteps, - key_padding_mask, - use_scalar_bias, - ) - attn.append(_attn) - attn_weights.append(_attn_weight) - full_attn = torch.cat(attn, dim=2) - full_attn = self.out_proj(full_attn) - return full_attn, attn_weights[0].clone() - else: - _attn, _attn_weight = self.attention_module( - query, - key, - value, - mask_future_timesteps, - key_padding_mask, - use_scalar_bias, - ) - attn.append(_attn) - attn_weights.append(_attn_weight) - full_attn = torch.cat(attn, dim=2) - full_attn_weights = torch.cat(attn_weights) - full_attn_weights = full_attn_weights.view( - bsz, self.num_heads, tgt_size, src_len - ) - full_attn_weights = full_attn_weights.sum(dim=1) / self.num_heads - return full_attn, full_attn_weights - - -class Downsample(nn.Module): - """ - Selects every nth element, where n is the index - """ - - def __init__(self, index): - super().__init__() - self.index = index - - def forward(self, x): - return x[:: self.index + 1] - - -def Linear(in_features, out_features, dropout=0.0, bias=True): - """Weight-normalized Linear layer (input: B x T x C)""" - m = nn.Linear(in_features, out_features, bias=bias) - m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features)) - m.bias.data.zero_() - return nn.utils.weight_norm(m) - - -def GatedLinear(in_features, out_features, dropout=0.0, bias=True): - """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units""" - return nn.Sequential( - Linear(in_features, out_features * 4, dropout, bias), - nn.GLU(), - Linear(out_features * 2, out_features * 2, dropout, bias), - nn.GLU(), - Linear(out_features, out_features, dropout, bias), - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamic_convolution.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamic_convolution.py deleted file mode 100644 index 5999a0453973166e65ae22fe49c0c4143a253bcc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamic_convolution.py +++ /dev/null @@ -1,304 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
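As a shape sanity check on the GatedLinear helper removed above: each nn.GLU halves the last dimension, so the stack walks 4*out -> 2*out -> 2*out -> out -> out. A standalone sketch with plain nn.Linear standing in for the weight-normalized helper (sizes are illustrative):

import torch
import torch.nn as nn

in_features, out_features = 8, 6
gated = nn.Sequential(
    nn.Linear(in_features, out_features * 4),
    nn.GLU(),                                       # 24 -> 12
    nn.Linear(out_features * 2, out_features * 2),
    nn.GLU(),                                       # 12 -> 6
    nn.Linear(out_features, out_features),
)
x = torch.randn(4, 10, in_features)                 # B x T x C
print(gated(x).shape)                               # torch.Size([4, 10, 6])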
- -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state -from fairseq.modules.fairseq_dropout import FairseqDropout - -from .unfold import unfold1d - - -def DynamicConv( - input_size, - kernel_size=1, - padding_l=None, - num_heads=1, - weight_dropout=0.0, - weight_softmax=False, - renorm_padding=False, - bias=False, - conv_bias=False, - query_size=None, - in_proj=False, -): - if torch.cuda.is_available(): - try: - from fairseq.modules.dynamicconv_layer import DynamicconvLayer - - return DynamicconvLayer( - input_size, - kernel_size=kernel_size, - padding_l=padding_l, - num_heads=num_heads, - weight_dropout=weight_dropout, - weight_softmax=weight_softmax, - bias=bias, - ) - except ImportError as e: - print(e) - return DynamicConv1dTBC( - input_size, - kernel_size=kernel_size, - padding_l=padding_l, - num_heads=num_heads, - weight_dropout=weight_dropout, - weight_softmax=weight_softmax, - bias=bias, - ) - - -def Linear(in_features, out_features, bias=True): - m = nn.Linear(in_features, out_features, bias) - nn.init.xavier_uniform_(m.weight) - if bias: - nn.init.constant_(m.bias, 0.0) - return m - - -@with_incremental_state -class DynamicConv1dTBC(nn.Module): - """Dynamic lightweight convolution taking T x B x C inputs - Args: - input_size: # of channels of the input - kernel_size: convolution channels - padding_l: padding to the left when using "same" padding - num_heads: number of heads used. The weight is of shape (num_heads, 1, kernel_size) - weight_dropout: the drop rate of the DropConnect to drop the weight - weight_softmax: normalize the weight with softmax before the convolution - renorm_padding: re-normalize the filters to ignore the padded part (only the non-padding parts sum up to 1) - bias: use bias - conv_bias: bias of the convolution - query_size: specified when feeding a different input as the query - in_proj: project the input and generate the filter together - - Shape: - Input: TxBxC, i.e. (timesteps, batch_size, input_size) - Output: TxBxC, i.e. 
(timesteps, batch_size, input_size) - - Attributes: - weight: the learnable weights of the module of shape - `(num_heads, 1, kernel_size)` - bias: the learnable bias of the module of shape `(input_size)` - """ - - def __init__( - self, - input_size, - kernel_size=1, - padding_l=None, - num_heads=1, - weight_dropout=0.0, - weight_softmax=False, - renorm_padding=False, - bias=False, - conv_bias=False, - query_size=None, - in_proj=False, - ): - super().__init__() - self.input_size = input_size - self.query_size = input_size if query_size is None else query_size - self.kernel_size = kernel_size - self.padding_l = padding_l - self.num_heads = num_heads - self.weight_dropout_module = FairseqDropout( - weight_dropout, module_name=self.__class__.__name__ - ) - self.weight_softmax = weight_softmax - self.renorm_padding = renorm_padding - - if in_proj: - self.weight_linear = Linear( - self.input_size, self.input_size + num_heads * kernel_size * 1 - ) - else: - self.weight_linear = Linear( - self.query_size, num_heads * kernel_size * 1, bias=bias - ) - if conv_bias: - self.conv_bias = nn.Parameter(torch.Tensor(input_size)) - else: - self.conv_bias = None - self.reset_parameters() - - @property - def in_proj(self): - return ( - self.weight_linear.out_features - == self.input_size + self.num_heads * self.kernel_size - ) - - def reset_parameters(self): - self.weight_linear.reset_parameters() - if self.conv_bias is not None: - nn.init.constant_(self.conv_bias, 0.0) - - def forward(self, x, incremental_state=None, query=None, unfold=None): - """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C - args: - x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size) - incremental_state: A dict to keep the state - unfold: unfold the input or not. If not, we use the matrix trick instead - query: use the specified query to predict the conv filters - """ - unfold = ( - x.size(0) > 512 if unfold is None else unfold - ) # use unfold mode as default for long sequence to save memory - unfold = unfold or (incremental_state is not None) - assert query is None or not self.in_proj - - if query is None: - query = x - if unfold: - output = self._forward_unfolded(x, incremental_state, query) - else: - output = self._forward_expanded(x, incremental_state, query) - - if self.conv_bias is not None: - output = output + self.conv_bias.view(1, 1, -1) - return output - - def _forward_unfolded(self, x, incremental_state, query): - """The conventional implementation of convolutions. 
- Unfolding the input by having a window shifting to the right.""" - T, B, C = x.size() - K, H = self.kernel_size, self.num_heads - R = C // H - assert R * H == C == self.input_size - - if self.in_proj: - proj = self.weight_linear(x) - x = proj.narrow(2, 0, self.input_size).contiguous() - weight = ( - proj.narrow(2, self.input_size, H * K).contiguous().view(T * B * H, -1) - ) - else: - weight = self.weight_linear(query).view(T * B * H, -1) - - # renorm_padding is only implemented in _forward_expanded - assert not self.renorm_padding or incremental_state is not None - - if incremental_state is not None: - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is None: - input_buffer = x.new() - x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) - if self.kernel_size > 1: - self._set_input_buffer( - incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :] - ) - x_unfold = x_unfold.view(T * B * H, R, -1) - else: - padding_l = self.padding_l - if K > T and padding_l == K - 1: - weight = weight.narrow(1, K - T, T) - K, padding_l = T, T - 1 - # unfold the input: T x B x C --> T' x B x C x K - x_unfold = unfold1d(x, K, padding_l, 0) - x_unfold = x_unfold.view(T * B * H, R, K) - - if self.weight_softmax and not self.renorm_padding: - weight = F.softmax(weight, dim=1) - weight = weight.narrow(1, 0, K) - - if incremental_state is not None: - weight = weight[:, -x_unfold.size(2) :] - K = weight.size(1) - - if self.weight_softmax and self.renorm_padding: - weight = F.softmax(weight, dim=1) - - weight = self.weight_dropout_module(weight, inplace=False) - - output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T*B*H x R x 1 - output = output.view(T, B, C) - return output - - def _forward_expanded(self, x, incremental_stat, query): - """Turn the convolution filters into band matrices and do matrix multiplication. - This is faster when the sequence is short, but less memory efficient. - This is not used in the decoder during inference. 
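For intuition about the _forward_unfolded path above: every position predicts its own length-K filter, the input is unfolded into length-K windows, and each window is contracted with its filter by a batched matmul. A minimal standalone sketch of the causal case (padding_l = K - 1) with one filter per channel; this is an illustration, not the fairseq kernel:

import torch
import torch.nn.functional as F

T, B, C, K = 6, 2, 4, 3
x = torch.randn(T, B, C)                                    # timesteps x batch x channels
weights = torch.softmax(torch.randn(T * B * C, K), dim=1)   # per-position filters

x_pad = F.pad(x, (0, 0, 0, 0, K - 1, 0))      # left-pad the time axis: (T+K-1, B, C)
windows = x_pad.unfold(0, K, 1)               # (T, B, C, K) sliding windows
out = torch.bmm(windows.reshape(T * B * C, 1, K), weights.unsqueeze(2))
print(out.view(T, B, C).shape)                # torch.Size([6, 2, 4])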
- """ - T, B, C = x.size() - K, H = self.kernel_size, self.num_heads - R = C // H - assert R * H == C == self.input_size - if self.in_proj: - proj = self.weight_linear(x) - x = proj.narrow(2, 0, self.input_size).contiguous() - weight = ( - proj.narrow(2, self.input_size, H * K).contiguous().view(T * B * H, -1) - ) - else: - weight = self.weight_linear(query).view(T * B * H, -1) - - if not self.renorm_padding: - if self.weight_softmax: - weight = F.softmax(weight, dim=1) - weight = self.weight_dropout_module(weight, inplace=False) - weight = weight.narrow(1, 0, K).contiguous() - weight = weight.view(T, B * H, K).transpose(0, 1) - - x = x.view(T, B * H, R).transpose(0, 1) - if self.weight_softmax and self.renorm_padding: - # turn the convolution filters into band matrices - weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf")) - weight_expanded.as_strided( - (B * H, T, K), (T * (T + K - 1), T + K, 1) - ).copy_(weight) - weight_expanded = weight_expanded.narrow(2, self.padding_l, T) - # normalize the weight over valid positions like self-attention - weight_expanded = F.softmax(weight_expanded, dim=2) - weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False) - else: - P = self.padding_l - # For efficieny, we cut the kernel size and reduce the padding when the kernel is larger than the length - if K > T and P == K - 1: - weight = weight.narrow(2, K - T, T) - K, P = T, T - 1 - # turn the convolution filters into band matrices - weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False) - weight_expanded.as_strided( - (B * H, T, K), (T * (T + K - 1), T + K, 1) - ).copy_(weight) - weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T - output = torch.bmm(weight_expanded, x) - output = output.transpose(0, 1).contiguous().view(T, B, C) - return output - - def reorder_incremental_state(self, incremental_state, new_order): - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is not None: - input_buffer = input_buffer.index_select(1, new_order) - self._set_input_buffer(incremental_state, input_buffer) - - def _get_input_buffer(self, incremental_state): - return utils.get_incremental_state(self, incremental_state, "input_buffer") - - def _set_input_buffer(self, incremental_state, new_buffer): - return utils.set_incremental_state( - self, incremental_state, "input_buffer", new_buffer - ) - - def extra_repr(self): - s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, conv_bias={}, renorm_padding={}, in_proj={}".format( - self.input_size, - self.kernel_size, - self.padding_l, - self.num_heads, - self.weight_softmax, - self.conv_bias is not None, - self.renorm_padding, - self.in_proj, - ) - - if self.query_size != self.input_size: - s += ", query_size={}".format(self.query_size) - if self.weight_dropout_module.p > 0.0: - s += ", weight_dropout={}".format(self.weight_dropout_module.p) - return s diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamic_crf_layer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamic_crf_layer.py deleted file mode 100644 index 8fcc6b8d2672d2eacc6d01b9688bac44d5e1ce26..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamic_crf_layer.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -This file is to re-implemented the low-rank and beam approximation of CRF layer -Proposed by: - -Sun, Zhiqing, et al. -Fast Structured Decoding for Sequence Models -https://arxiv.org/abs/1910.11555 - -The CRF implementation is mainly borrowed from -https://github.com/kmkurn/pytorch-crf/blob/master/torchcrf/__init__.py - -""" - -import numpy as np -import torch -import torch.nn as nn - - -def logsumexp(x, dim=1): - return torch.logsumexp(x.float(), dim=dim).type_as(x) - - -class DynamicCRF(nn.Module): - """Dynamic CRF layer is used to approximate the traditional - Conditional Random Fields (CRF) - $P(y | x) = 1/Z(x) exp(sum_i s(y_i, x) + sum_i t(y_{i-1}, y_i, x))$ - - where in this function, we assume the emition scores (s) are given, - and the transition score is a |V| x |V| matrix $M$ - - in the following two aspects: - (1) it used a low-rank approximation for the transition matrix: - $M = E_1 E_2^T$ - (2) it used a beam to estimate the normalizing factor Z(x) - """ - - def __init__(self, num_embedding, low_rank=32, beam_size=64): - super().__init__() - - self.E1 = nn.Embedding(num_embedding, low_rank) - self.E2 = nn.Embedding(num_embedding, low_rank) - - self.vocb = num_embedding - self.rank = low_rank - self.beam = beam_size - - def extra_repr(self): - return "vocab_size={}, low_rank={}, beam_size={}".format( - self.vocb, self.rank, self.beam - ) - - def forward(self, emissions, targets, masks, beam=None): - """ - Compute the conditional log-likelihood of a sequence of target tokens given emission scores - - Args: - emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output - ``(batch_size, seq_len, vocab_size)``. We assume batch-first - targets (`~torch.LongTensor`): Sequence of target token indices - ``(batch_size, seq_len) - masks (`~torch.ByteTensor`): Mask tensor with the same size as targets - - Returns: - `~torch.Tensor`: approximated log-likelihood - """ - numerator = self._compute_score(emissions, targets, masks) - denominator = self._compute_normalizer(emissions, targets, masks, beam) - return numerator - denominator - - def forward_decoder(self, emissions, masks=None, beam=None): - """ - Find the most likely output sequence using Viterbi algorithm. - - Args: - emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output - ``(batch_size, seq_len, vocab_size)``. We assume batch-first - masks (`~torch.ByteTensor`): Mask tensor with the same size as targets - - Returns: - `~torch.LongTensor`: decoded sequence from the CRF model - """ - return self._viterbi_decode(emissions, masks, beam) - - def _compute_score(self, emissions, targets, masks=None): - batch_size, seq_len = targets.size() - emission_scores = emissions.gather(2, targets[:, :, None])[:, :, 0] # B x T - transition_scores = (self.E1(targets[:, :-1]) * self.E2(targets[:, 1:])).sum(2) - - scores = emission_scores - scores[:, 1:] += transition_scores - - if masks is not None: - scores = scores * masks.type_as(scores) - return scores.sum(-1) - - def _compute_normalizer(self, emissions, targets=None, masks=None, beam=None): - # HACK: we include "target" which is a hueristic for training - # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?) 
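The DynamicCRF docstring above describes the low-rank factorization of the transition matrix, M = E1 E2^T; in _compute_score this appears as a dot product between consecutive tag embeddings, so the full |V| x |V| matrix is never materialized. A standalone sketch with illustrative sizes:

import torch
import torch.nn as nn

vocab, low_rank = 50, 8
E1, E2 = nn.Embedding(vocab, low_rank), nn.Embedding(vocab, low_rank)

targets = torch.tensor([[3, 17, 42, 5]])                         # B x T gold tags
pairwise = (E1(targets[:, :-1]) * E2(targets[:, 1:])).sum(-1)    # B x (T-1) transition scores
print(pairwise.shape)                                             # torch.Size([1, 3])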
- - beam = beam if beam is not None else self.beam - batch_size, seq_len = emissions.size()[:2] - if targets is not None: - _emissions = emissions.scatter(2, targets[:, :, None], np.float("inf")) - beam_targets = _emissions.topk(beam, 2)[1] - beam_emission_scores = emissions.gather(2, beam_targets) - else: - beam_emission_scores, beam_targets = emissions.topk(beam, 2) - beam_transition_score1 = self.E1(beam_targets[:, :-1]) # B x (T-1) x K x D - beam_transition_score2 = self.E2(beam_targets[:, 1:]) # B x (T-1) x K x D - beam_transition_matrix = torch.bmm( - beam_transition_score1.view(-1, beam, self.rank), - beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2), - ) - beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam) - - # compute the normalizer in the log-space - score = beam_emission_scores[:, 0] # B x K - for i in range(1, seq_len): - next_score = score[:, :, None] + beam_transition_matrix[:, i - 1] - next_score = logsumexp(next_score, dim=1) + beam_emission_scores[:, i] - - if masks is not None: - score = torch.where(masks[:, i : i + 1], next_score, score) - else: - score = next_score - - # Sum (log-sum-exp) over all possible tags - return logsumexp(score, dim=1) - - def _viterbi_decode(self, emissions, masks=None, beam=None): - # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?) - - beam = beam if beam is not None else self.beam - batch_size, seq_len = emissions.size()[:2] - beam_emission_scores, beam_targets = emissions.topk(beam, 2) - beam_transition_score1 = self.E1(beam_targets[:, :-1]) # B x (T-1) x K x D - beam_transition_score2 = self.E2(beam_targets[:, 1:]) # B x (T-1) x K x D - beam_transition_matrix = torch.bmm( - beam_transition_score1.view(-1, beam, self.rank), - beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2), - ) - beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam) - - traj_tokens, traj_scores = [], [] - finalized_tokens, finalized_scores = [], [] - - # compute the normalizer in the log-space - score = beam_emission_scores[:, 0] # B x K - dummy = ( - torch.arange(beam, device=score.device).expand(*score.size()).contiguous() - ) - - for i in range(1, seq_len): - traj_scores.append(score) - _score = score[:, :, None] + beam_transition_matrix[:, i - 1] - _score, _index = _score.max(dim=1) - _score = _score + beam_emission_scores[:, i] - - if masks is not None: - score = torch.where(masks[:, i : i + 1], _score, score) - index = torch.where(masks[:, i : i + 1], _index, dummy) - else: - score, index = _score, _index - traj_tokens.append(index) - - # now running the back-tracing and find the best - best_score, best_index = score.max(dim=1) - finalized_tokens.append(best_index[:, None]) - finalized_scores.append(best_score[:, None]) - - for idx, scs in zip(reversed(traj_tokens), reversed(traj_scores)): - previous_index = finalized_tokens[-1] - finalized_tokens.append(idx.gather(1, previous_index)) - finalized_scores.append(scs.gather(1, previous_index)) - - finalized_tokens.reverse() - finalized_tokens = torch.cat(finalized_tokens, 1) - finalized_tokens = beam_targets.gather(2, finalized_tokens[:, :, None])[:, :, 0] - - finalized_scores.reverse() - finalized_scores = torch.cat(finalized_scores, 1) - finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1] - - return finalized_scores, finalized_tokens diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/__init__.py 
b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/__init__.py deleted file mode 100644 index 22dc6f403d2a0ecdb1b9e7e69ed96bd560e93b2c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .dynamicconv_layer import DynamicconvLayer # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py deleted file mode 100644 index 9304f99eb8169a614f39babc830c84cac80e080b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/cuda_function_gen.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -def gen_forward(): - - kernels = [3, 5, 7, 15, 31, 63, 127, 255] - blocks = [32, 64, 128, 256] - - head = """ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "dynamicconv_cuda.cuh" - -std::vector dynamicconv_cuda_forward(at::Tensor input, at::Tensor weight, int padding_l) { - - at::DeviceGuard g(input.device()); - const auto minibatch = input.size(0); - const auto numFeatures = input.size(1); - const auto sequenceLength = input.size(2); - - const auto numHeads = weight.size(1); - const auto filterSize = weight.size(2); - - const auto numFiltersInBlock = numFeatures / numHeads; - const dim3 blocks(minibatch, numFeatures); - - auto output = at::zeros_like(input); - auto stream = at::cuda::getCurrentCUDAStream(); -""" - - switch = """ - switch(filterSize) { -""" - - case_k = """ - case {k}: -""" - - main_block = """ - if (padding_l == {pad}) {{ - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "dynamicconv_forward", ([&] {{ - dynamicconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t> - <<>>( - input.data(), - weight.data(), - minibatch, - sequenceLength, - numFeatures, - numFiltersInBlock, - numHeads, - output.data()); - }})); - }} else -""" - - bad_padding = """ - { - std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl; - } - break;\n -""" - - end = """ - default: - std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl; - } - - return {output}; -} -""" - - with open("dynamicconv_cuda_forward.cu", "w") as forward: - forward.write(head) - forward.write(switch) - for k in kernels: - b_size = 32 - for b in blocks: - if b > k: - b_size = b - break - forward.write(case_k.format(k=k)) - for pad in [k // 2, k - 1]: - forward.write(main_block.format(k=k, b_size=b_size, pad=pad)) - forward.write(bad_padding) - forward.write(end) - - -def gen_backward(): - - kernels = [3, 5, 7, 15, 31, 63, 127, 255] - thresh = [512, 512, 512, 512, 512, 380, 256, 256] - min_block = [64, 64, 64, 64, 64, 64, 128, 256] - seqs = [32 * x for x in [1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] - - head = """ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "dynamicconv_cuda.cuh" - -std::vector dynamicconv_cuda_backward(at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor weight) { - - at::DeviceGuard g(input.device()); - const auto minibatch = input.size(0); - const auto numFeatures = input.size(1); - const auto sequenceLength = input.size(2); - - const auto numHeads = weight.size(1); - const auto filterSize = weight.size(2); - - const auto numFiltersInBlock = numFeatures / numHeads; - auto numChunks = 1; - - auto gradInput = at::zeros_like(input); - auto gradWeight = at::zeros_like(weight); - auto stream = at::cuda::getCurrentCUDAStream(); - - dim3 blocks(minibatch, numHeads, numChunks); -""" - - sequence_if = """ - if (sequenceLength < {seq}) {{ - switch(filterSize) {{ -""" - - case_k = """ - case {k}: -""" - - chunks_reset = """ - numChunks = int(ceilf(sequenceLength/float({b_size}))); - blocks = dim3(minibatch, numHeads, numChunks); -""" - - main_block = """ - if (padding_l == {p}) {{ - AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.scalar_type(), "dynamicconv_backward", ([&] {{ - dynamicconv_backward_kernel<{k}, {b_size}, {p}, scalar_t> - <<>>( - gradOutput.data(), - input.data(), - weight.data(), - minibatch, - sequenceLength, - numFeatures, - numFiltersInBlock, - numHeads, - gradWeight.data(), - gradInput.data()); - }})); - }} else -""" - - bad_padding = """ - { - std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl; - } - break;\n -""" - - bad_filter = """ - default: - std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl; - } -""" - - con_else = """ - } else -""" - - final_else = """ - { - switch(filterSize) { -""" - - last_return = """ - } - return {gradInput, gradWeight}; -} -""" - - with open("dynamicconv_cuda_backward.cu", "w") as backward: - backward.write(head) - for seq in seqs: - backward.write(sequence_if.format(seq=seq)) - for k, t, m in zip(kernels, thresh, min_block): - backward.write(case_k.format(k=k)) - if seq <= t: - b_size = seq - else: - b_size = m - backward.write(chunks_reset.format(b_size=b_size)) - for p in [k // 2, k - 1]: - backward.write(main_block.format(k=k, b_size=b_size, p=p)) - backward.write(bad_padding) - backward.write(bad_filter) - backward.write(con_else) - backward.write(final_else) - for k, m in zip(kernels, min_block): - backward.write(case_k.format(k=k)) - backward.write(chunks_reset.format(b_size=m)) - for p in [k // 2, k - 1]: - backward.write(main_block.format(k=k, b_size=m, p=p)) - backward.write(bad_padding) - backward.write(bad_filter) - backward.write(last_return) - - -if __name__ == "__main__": - gen_forward() - gen_backward() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp deleted file mode 100644 index ebd4df0e9608d769f31eadc6e0b487505f11b279..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cpp +++ /dev/null @@ -1,56 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -std::vector dynamicconv_cuda_forward( - at::Tensor input, - at::Tensor filters, - int padding_l); - -std::vector dynamicconv_cuda_backward( - at::Tensor gradOutput, - int padding_l, - at::Tensor input, - at::Tensor filters); - - -#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -std::vector dynamicconv_forward( - at::Tensor input, - at::Tensor filters, - int padding_l) { - - CHECK_INPUT(input); - CHECK_INPUT(filters); - - return dynamicconv_cuda_forward(input, filters, - padding_l); -} - -std::vector dynamicconv_backward( - at::Tensor gradOutput, - int padding_l, - at::Tensor input, - at::Tensor filters) { - - CHECK_INPUT(gradOutput); - CHECK_INPUT(input); - CHECK_INPUT(filters); - - return dynamicconv_cuda_backward(gradOutput, padding_l, - input, filters); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &dynamicconv_forward, "dynamicconv forward (CUDA)"); - m.def("backward", &dynamicconv_backward, "dynamicconv backward (CUDA)"); -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh deleted file mode 100644 index 2196259433aefc88f96cd5bbcae57740a9a8c2dc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh +++ /dev/null @@ -1,51 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define SHFL_MASK 0xffffffff - -template -__global__ -void dynamicconv_forward_kernel(const scalar_t* input, - const scalar_t* weight, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - scalar_t* output); - -template -__global__ -void dynamicconv_backward_kernel( - const scalar_t* gradOutput, // B * C * T - const scalar_t* input, // B * C * T - const scalar_t* weight, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - scalar_t* gradWeight, - scalar_t* gradInput); // B * H * k * T diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu deleted file mode 100644 index 300d35b6478080a9594a22e335988c321d43127f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu +++ /dev/null @@ -1,168 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include "dynamicconv_cuda.cuh" -#include "dynamicconv_cuda_forward.cu" -#include "dynamicconv_cuda_backward.cu" -#include "../cuda_utils.cu" - -// FS is filter size and kernels are specialized for filter sizes -template -__global__ -void dynamicconv_forward_kernel(const scalar_t* input, - const scalar_t* weight, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - scalar_t* output) { - assert(blockDim.x == SB); - - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int featureIdx = blockIdx.y; - const int head = featureIdx / numFiltersInBlock; - - const int IOOffset = batchIdx * numFeatures * sequenceLength - + featureIdx * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - scalar_t* outputFeature = &output[IOOffset]; - - scalar_t filter[FS]; - - __shared__ scalar_t tempInput[SB + FS]; - zeroSharedMem(tempInput); - - const int numIterations = divUp(sequenceLength, SB); - - for (int i = 0; i < numIterations; ++i) { - __syncthreads(); - const int inputOffset = i * SB; - load_input_to_shared(inputFeature, inputOffset, - sequenceLength, i, - numIterations, false, tempInput); - __syncthreads(); - if (inputOffset + tid < sequenceLength) { - - #pragma unroll - for (int k = 0; k < FS; ++k) { - const int filterOffset = batchIdx * numHeads * FS * sequenceLength - + head * FS * sequenceLength - + k * sequenceLength - + i * SB + tid; - filter[k] = weight[filterOffset]; - } - - scalar_t out = scalar_t(0.0); - #pragma unroll - for (int k = 0; k < FS; ++k) { - out += filter[k] * tempInput[tid + k]; - } - - outputFeature[inputOffset + tid] = out; - - } - } -} - -template -__global__ -void dynamicconv_backward_kernel( - const scalar_t* gradOutput, // B * C * T - const scalar_t* input, // B * C * T - const scalar_t* weight, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - scalar_t* gradWeight, - scalar_t* gradInput) { // B * H * k * T - - assert(blockDim.x == SB); - - // each block operates on a single batch and filter head - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int headIdx = blockIdx.y; - const int chunkIdx = blockIdx.z; - - const int numChunks = divUp(sequenceLength, SB); - const int inputOffset = chunkIdx * SB; - - // initialize shared memory for output gradient and input - __shared__ scalar_t tempGradOutput[SB + FS]; - __shared__ scalar_t tempInput[SB + FS]; - const int padding = FS - padding_l - 1; - - zeroSharedMem(tempGradOutput); - zeroSharedMem(tempInput); - - // initialize local filter and weight gradient sum arrays - scalar_t tempGradSum[FS]; - scalar_t bfilter[FS]; - for (int k = 0; k < FS; ++k) { - tempGradSum[k] = scalar_t(0.0); - - int idxOffset = inputOffset + tid + k - padding; - if (idxOffset >= 0 && idxOffset < sequenceLength) { - int bfilterOffset = batchIdx * numHeads * FS * sequenceLength - + headIdx * FS * sequenceLength - + (FS - k - 1) * sequenceLength - + idxOffset; - bfilter[k] = weight[bfilterOffset]; - } else { - bfilter[k] = scalar_t(0.0); - } - } - - - // iterate over filter block - for (int featureIdx = 0; featureIdx < numFiltersInBlock; ++featureIdx) { - __syncthreads(); - - // load input and output gradient for this channel and chunk - const int IOOffset = batchIdx * numFeatures * sequenceLength - + (headIdx * numFiltersInBlock + featureIdx) * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - const scalar_t* gradOutputFeature = &gradOutput[IOOffset]; - scalar_t* 
gradInputFeature = &gradInput[IOOffset]; - - load_input_to_shared(gradOutputFeature, inputOffset, - sequenceLength, chunkIdx, - numChunks, true, tempGradOutput); - load_input_to_shared(inputFeature, inputOffset, - sequenceLength, chunkIdx, - numChunks, true, tempInput); - __syncthreads(); - - // sum input and weight gradients - scalar_t out = scalar_t(0.0); - #pragma unroll - for (int k = 0; k < FS; ++k) { - tempGradSum[k] += tempInput[tid + k] * tempGradOutput[tid + padding]; - out += bfilter[k] * tempGradOutput[tid + k]; - } - - if (inputOffset + tid < sequenceLength) { - gradInputFeature[inputOffset + tid] = out; - } - } - - const int gradOffset = batchIdx * numHeads * FS * sequenceLength - + headIdx * FS * sequenceLength; - scalar_t *gradWeightFeature = &gradWeight[gradOffset]; - - // write weight gradient - if (inputOffset + tid < sequenceLength) { - for (int k = 0; k < FS; ++k) { - const int outputOffset = k * sequenceLength + inputOffset + tid; - gradWeightFeature[outputOffset] = tempGradSum[k]; - } - } -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py deleted file mode 100644 index 4a683d2690d5e3058192afb1b3f4c1f3e2c41352..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import dynamicconv_cuda -import torch -import torch.nn.functional as F -from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state -from fairseq.modules.fairseq_dropout import FairseqDropout -from fairseq.modules.unfold import unfold1d -from torch import nn -from torch.autograd import Function - - -class dynamicconvFunction(Function): - @staticmethod - def forward(ctx, x, weights, padding_l): - ctx.padding_l = padding_l - outputs = dynamicconv_cuda.forward(x, weights, padding_l) - variables = [x, weights] - ctx.save_for_backward(*variables) - return outputs[0] - - @staticmethod - def backward(ctx, grad_output): - outputs = dynamicconv_cuda.backward( - grad_output.contiguous(), ctx.padding_l, *ctx.saved_tensors - ) - grad_input, grad_weights = outputs - return grad_input, grad_weights, None - - -@with_incremental_state -class DynamicconvLayer(nn.Module): - def __init__( - self, - input_size, - kernel_size=1, - padding_l=None, - weight_softmax=False, - num_heads=1, - weight_dropout=0.0, - bias=False, - renorm_padding=False, - conv_bias=False, - query_size=None, - ): - - super(DynamicconvLayer, self).__init__() - self.input_size = input_size - self.query_size = input_size if query_size is None else query_size - self.kernel_size = kernel_size - self.padding_l = padding_l - self.num_heads = num_heads - self.weight_softmax = weight_softmax - self.weight_dropout_module = FairseqDropout( - weight_dropout, module_name=self.__class__.__name__ - ) - self.renorm_padding = renorm_padding - self.bias = bias - - self.weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias) - if conv_bias: - self.conv_bias = nn.Parameter(torch.Tensor(input_size)) - else: - self.conv_bias = None - self.reset_parameters() - - def 
reset_parameters(self): - nn.init.xavier_uniform_(self.weight_linear.weight) - if self.conv_bias is not None: - nn.init.constant_(self.conv_bias, 0.0) - nn.init.constant_(self.weight_linaer.bias, 0.0) - - def forward(self, x, incremental_state=None, query=None, unfold=None): - - T, B, C = x.size() - K, H = self.kernel_size, self.num_heads - # R = C // H - - # during inference time, incremental BMM is faster - if incremental_state is not None: - unfold = ( - x.size(0) > 512 if unfold is None else unfold - ) # use unfold mode as default for long sequence to save memory - unfold = unfold or (incremental_state is not None) - assert query is None - - if query is None: - query = x - if unfold: - output = self._forward_unfolded(x, incremental_state, query) - else: - output = self._forward_expanded(x, incremental_state, query) - - if self.conv_bias is not None: - output = output + self.conv_bias.view(1, 1, -1) - - return output - - # during training time, use CUDA kernel - else: - weight = self.weight_linear(x).view(T, B, H, K) - if self.weight_softmax: - weight = F.softmax(weight, dim=-1) - if self.weight_dropout_module.p: - weight = self.weight_dropout_module(weight) - - weight = weight.permute(1, 2, 3, 0).contiguous() - self.filters = weight - x = x.permute(1, 2, 0).contiguous() - output = dynamicconvFunction.apply(x, weight, self.padding_l).permute( - 2, 0, 1 - ) - if self.conv_bias is not None: - output = output + self.conv_bias.view(1, 1, -1) - return output - - def reorder_incremental_state(self, incremental_state, new_order): - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is not None: - input_buffer = input_buffer.index_select(1, new_order) - self._set_input_buffer(incremental_state, input_buffer) - - def _get_input_buffer(self, incremental_state): - return utils.get_incremental_state(self, incremental_state, "input_buffer") - - def _set_input_buffer(self, incremental_state, new_buffer): - return utils.set_incremental_state( - self, incremental_state, "input_buffer", new_buffer - ) - - def _forward_unfolded(self, x, incremental_state, query): - """The conventional implementation of convolutions. 
- Unfolding the input by having a window shifting to the right.""" - T, B, C = x.size() - K, H = self.kernel_size, self.num_heads - R = C // H - assert R * H == C == self.input_size - - weight = self.weight_linear(query).view(T * B * H, -1) - - # renorm_padding is only implemented in _forward_expanded - assert not self.renorm_padding or incremental_state is not None - - if incremental_state is not None: - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is None: - input_buffer = x.new() - x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) - if self.kernel_size > 1: - self._set_input_buffer( - incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :] - ) - x_unfold = x_unfold.view(T * B * H, R, -1) - else: - padding_l = self.padding_l - if K > T and padding_l == K - 1: - weight = weight.narrow(1, K - T, T) - K, padding_l = T, T - 1 - # unfold the input: T x B x C --> T' x B x C x K - x_unfold = unfold1d(x, K, padding_l, 0) - x_unfold = x_unfold.view(T * B * H, R, K) - - if self.weight_softmax and not self.renorm_padding: - weight = F.softmax(weight, dim=1) - weight = weight.narrow(1, 0, K) - - if incremental_state is not None: - weight = weight[:, -x_unfold.size(2) :] - K = weight.size(1) - - if self.weight_softmax and self.renorm_padding: - weight = F.softmax(weight, dim=1) - - weight = self.weight_dropout_module(weight, inplace=False) - - output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T*B*H x R x 1 - output = output.view(T, B, C) - return output - - def _forward_expanded(self, x, incremental_stat, query): - """Turn the convolution filters into band matrices and do matrix multiplication. - This is faster when the sequence is short, but less memory efficient. - This is not used in the decoder during inference. 
- """ - T, B, C = x.size() - K, H = self.kernel_size, self.num_heads - R = C // H - assert R * H == C == self.input_size - weight = self.weight_linear(query).view(T * B * H, -1) - - if not self.renorm_padding: - if self.weight_softmax: - weight = F.softmax(weight, dim=1) - weight = self.weight_dropout_module(weight, inplace=False) - weight = weight.narrow(1, 0, K).contiguous() - weight = weight.view(T, B * H, K).transpose(0, 1) - - x = x.view(T, B * H, R).transpose(0, 1) - if self.weight_softmax and self.renorm_padding: - # turn the convolution filters into band matrices - weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf")) - weight_expanded.as_strided( - (B * H, T, K), (T * (T + K - 1), T + K, 1) - ).copy_(weight) - weight_expanded = weight_expanded.narrow(2, self.padding_l, T) - # normalize the weight over valid positions like self-attention - weight_expanded = F.softmax(weight_expanded, dim=2) - weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False) - else: - P = self.padding_l - # For efficieny, we cut the kernel size and reduce the padding when the kernel is larger than the length - if K > T and P == K - 1: - weight = weight.narrow(2, K - T, T) - K, P = T, T - 1 - # turn the convolution filters into band matrices - weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False) - weight_expanded.as_strided( - (B * H, T, K), (T * (T + K - 1), T + K, 1) - ).copy_(weight) - weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T - output = torch.bmm(weight_expanded, x) - output = output.transpose(0, 1).contiguous().view(T, B, C) - return output diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp deleted file mode 100644 index 6d589533000e1e75a26dcdc7bf423442feb63656..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// BSD 3-Clause License -// -// Copyright (c) 2017 xxxx -// All rights reserved. -// Copyright 2021 Huawei Technologies Co., Ltd -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
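The _forward_expanded path above leans on an as_strided trick that is easy to miss: the (T, K) per-position filters are scattered into a (T, T + K - 1) band matrix so that a single bmm reproduces the padded convolution. A tiny self-contained illustration with toy shapes (nothing here is taken from the layer itself):

import torch

T, K, P = 5, 3, 1                                  # toy sequence length, kernel size, left padding
w = torch.arange(1.0, T * K + 1).view(1, T, K)     # per-position filters, batch of one
band = w.new_zeros(1, T, T + K - 1)
# row t of this strided view aliases columns t .. t+K-1 of the band matrix
band.as_strided((1, T, K), (T * (T + K - 1), T + K, 1)).copy_(w)
band = band.narrow(2, P, T)                        # keep the T columns aligned with x
x = torch.randn(1, T, 1)
out = torch.bmm(band, x)

# same result computed naively, position by position
manual = torch.zeros(T)
for t in range(T):
    for k in range(K):
        if 0 <= t + k - P < T:
            manual[t] += w[0, t, k] * x[0, t + k - P, 0]
assert torch.allclose(out[0, :, 0], manual)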
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// ============================================================================ - -#include -#include - -std::vector dynamicconv_cpu_forward( - float* input, - float* filters, - int padding_l); - -std::vector dynamicconv_cpu_backward( - float* gradOutput, - int padding_l, - float* input, - float* filters); - -std::vector dynamicconv_forward( - float* input, - float* filters, - int padding_l) { - - return dynamicconv_cpu_forward(input, filters, padding_l); -} - -std::vector dynamicconv_backward( - float* gradOutput, - int padding_l, - float* input, - float* filters) { - - return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)"); - m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)"); -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/setup.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/setup.py deleted file mode 100644 index 6a21f7e2ee0840a3b251522275a0b32a856951d7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/dynamicconv_layer/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - - -setup( - name="dynamicconv_layer", - ext_modules=[ - CUDAExtension( - name="dynamicconv_cuda", - sources=[ - "dynamicconv_cuda.cpp", - "dynamicconv_cuda_kernel.cu", - ], - ), - ], - cmdclass={"build_ext": BuildExtension}, -) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/fairseq_dropout.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/fairseq_dropout.py deleted file mode 100644 index c82f175c41acc889dd31fb03495d4987181c2f6e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/fairseq_dropout.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
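The setup.py above builds the extension through torch.utils.cpp_extension; a typical workflow is to run `python setup.py install` (or `build_ext --inplace`) from that directory, after which dynamicconv_cuda becomes importable. A minimal smoke test is sketched below, under the assumptions that a CUDA device is available and that the requested filter size and padding are among the generated specializations (e.g. K = 3 with padding_l = 1); shapes follow the kernel comments.

import torch
import dynamicconv_cuda                               # built by the setup.py above

B, C, T, H, K = 2, 8, 16, 2, 3
x = torch.randn(B, C, T, device="cuda").contiguous()
w = torch.softmax(torch.randn(B, H, K, T, device="cuda"), dim=2).contiguous()
out = dynamicconv_cuda.forward(x, w, K // 2)[0]       # padding_l = K // 2
print(out.shape)                                      # torch.Size([2, 8, 16])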
- -import logging -from typing import List, Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -logger = logging.getLogger(__name__) - -def get_dropout_class(): - try: - from torch import npu_dropout_do_mask - return NpuFairseqDropout - except: - return FairseqDropout - -class FairseqDropout(nn.Module): - def __init__(self, p, module_name=None): - super().__init__() - self.p = p - self.module_name = module_name - self.apply_during_inference = False - - def forward(self, x, inplace: bool = False): - if self.training or self.apply_during_inference: - return F.dropout(x, p=self.p, training=True, inplace=inplace) - else: - return x - - def make_generation_fast_( - self, - name: str, - retain_dropout: bool = False, - retain_dropout_modules: Optional[List[str]] = None, - **kwargs - ): - if retain_dropout: - if retain_dropout_modules is not None and self.module_name is None: - logger.warning( - "Cannot enable dropout during inference for module {} " - "because module_name was not set".format(name) - ) - elif ( - retain_dropout_modules is None # if None, apply to all modules - or self.module_name in retain_dropout_modules - ): - logger.info( - "Enabling dropout during inference for module: {}".format(name) - ) - self.apply_during_inference = True - else: - logger.info("Disabling dropout for module: {}".format(name)) - -class DropOutTask: - def __init__(self, shape, dtype, device, p): - self.shape = shape - self.dtype = dtype - self.device = device - self.p = p - self.request_count = 0 - self.mask_queue = [] - -class NpuFairseqDropout(torch.nn.Dropout): - task_dict = {} - dropout_stream = None - - def __init__(self, p, module_name=None): - super().__init__(p) - self.module_name = module_name - - def forward(self, x): - if isinstance(x, torch.Tensor): - shape = x.shape - dtype = x.dtype - device = x.device - do_mask_flag = True - return_obj = x - elif isinstance(x, list): - shape, dtype, device = x - do_mask_flag = False - return_obj = None - else: - raise RuntimeError("input type error!") - - if self.p == 0: - return return_obj - key = (shape, dtype, device, self.p) - if key not in NpuFairseqDropout.task_dict: - dropout_task = DropOutTask(shape, dtype, device, self.p) - dropout_task.request_count += 1 - NpuFairseqDropout.task_dict[key] = dropout_task - return return_obj - elif not NpuFairseqDropout.task_dict[key].mask_queue: - NpuFairseqDropout.task_dict[key].request_count += 1 - return return_obj - else: - mask, event = NpuFairseqDropout.task_dict[key].mask_queue.pop(0) - if do_mask_flag: - return torch.npu_dropout_do_mask(x, mask, self.p)[0] - else: - return mask - - @classmethod - def enable_dropout_ensemble(cls, model): - if cls.dropout_stream is None: - cls.dropout_stream = torch.npu.Stream() - - def wait_stream_hook_func(): - def hook_function(module, inputs): - torch.npu.current_stream().wait_stream(cls.dropout_stream) - return hook_function - model.register_forward_pre_hook(wait_stream_hook_func()) - - def mask_gen_hook_func(): - def hook_function(module, inputs, outputs): - with torch.npu.stream(cls.dropout_stream): - with torch.no_grad(): - for _, task in cls.task_dict.items(): - if len(task.mask_queue) < task.request_count: - for j in range(task.request_count - len(task.mask_queue)): - mask = torch.npu_dropout_gen_mask(task.shape, p=task.p, dtype=task.dtype, - device=task.device) - event = None - task.mask_queue.append((mask, event)) - return hook_function - - model.register_forward_hook(mask_gen_hook_func()) diff --git 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/fp32_group_norm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/fp32_group_norm.py deleted file mode 100644 index d03aac022e30c8c14a600062d1d86429504ba003..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/fp32_group_norm.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Layer norm done in fp32 (for fp16 training) -""" - -import torch.nn as nn -import torch.nn.functional as F - - -class Fp32GroupNorm(nn.GroupNorm): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def forward(self, input): - output = F.group_norm( - input.float(), - self.num_groups, - self.weight.float() if self.weight is not None else None, - self.bias.float() if self.bias is not None else None, - self.eps, - ) - return output.type_as(input) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/gelu.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/gelu.py deleted file mode 100644 index a60c15a5a1110dad9dd3f01302d38ca4459038cc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/gelu.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with -the corresponding GitHub repo: https://github.com/hendrycks/GELUs -""" - -import math - -import torch -import torch.nn as nn - - -def gelu_accurate(x): - if not hasattr(gelu_accurate, "_a"): - gelu_accurate._a = math.sqrt(2 / math.pi) - return ( - 0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))) - ) - - -def gelu(x: torch.Tensor) -> torch.Tensor: - return torch.fast_gelu(x).type_as(x) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/grad_multiply.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/grad_multiply.py deleted file mode 100644 index 08d15f55dfda9c61a1cf8641ea31424fe1d97f57..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/grad_multiply.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
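Two details in gelu.py above are worth a note: gelu() dispatches to torch.fast_gelu, which appears to be an accelerator-specific op in this port, while gelu_accurate is the tanh approximation from Hendrycks and Gimpel. A standalone numerical check of that approximation against the exact erf-based GELU (independent of the module):

import math
import torch

def gelu_exact(x):
    return 0.5 * x * (1 + torch.erf(x / math.sqrt(2)))

def gelu_tanh(x):                                  # same formula as gelu_accurate above
    a = math.sqrt(2 / math.pi)
    return 0.5 * x * (1 + torch.tanh(a * (x + 0.044715 * torch.pow(x, 3))))

x = torch.linspace(-4, 4, steps=1001)
print((gelu_tanh(x) - gelu_exact(x)).abs().max())  # small; the two curves track each other closely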
- -import torch - - -class GradMultiply(torch.autograd.Function): - @staticmethod - def forward(ctx, x, scale): - ctx.scale = scale - res = x.new(x) - return res - - @staticmethod - def backward(ctx, grad): - return grad * ctx.scale, None diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/gumbel_vector_quantizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/gumbel_vector_quantizer.py deleted file mode 100644 index 47657bb0ab70864a3f7a0b00c226ccc9fc527fa3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/gumbel_vector_quantizer.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class GumbelVectorQuantizer(nn.Module): - def __init__( - self, - dim, - num_vars, - temp, - groups, - combine_groups, - vq_dim, - time_first, - activation=nn.GELU(), - weight_proj_depth=1, - weight_proj_factor=1, - ): - """Vector quantization using gumbel softmax - - Args: - dim: input dimension (channels) - num_vars: number of quantized vectors per group - temp: temperature for training. this should be a tuple of 3 elements: (start, stop, decay factor) - groups: number of groups for vector quantization - combine_groups: whether to use the vectors for all groups - vq_dim: dimensionality of the resulting quantized vector - time_first: if true, expect input in BxTxC format, otherwise in BxCxT - activation: what activation to use (should be a module). this is only used if weight_proj_depth is > 1 - weight_proj_depth: number of layers (with activation in between) to project input before computing logits - weight_proj_factor: this is used only if weight_proj_depth is > 1. 
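One line in grad_multiply.py above deserves an illustration: GradMultiply is the identity in the forward pass but scales whatever gradient flows back through it. A usage sketch (the 0.1 factor is illustrative, not a value taken from this repository):

import torch
from fairseq.modules.grad_multiply import GradMultiply   # the class defined above

x = torch.randn(4, requires_grad=True)
y = GradMultiply.apply(x, 0.1)        # y holds the same values as x
y.sum().backward()
print(x.grad)                         # 0.1 everywhere: only the gradient was scaled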
scales the inner dimensionality of - projections by this factor - """ - super().__init__() - - self.groups = groups - self.combine_groups = combine_groups - self.input_dim = dim - self.num_vars = num_vars - self.time_first = time_first - - assert ( - vq_dim % groups == 0 - ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation" - - var_dim = vq_dim // groups - num_groups = groups if not combine_groups else 1 - - self.vars = nn.Parameter(torch.FloatTensor(1, num_groups * num_vars, var_dim)) - nn.init.uniform_(self.vars) - - if weight_proj_depth > 1: - - def block(input_dim, output_dim): - return nn.Sequential(nn.Linear(input_dim, output_dim), activation) - - inner_dim = self.input_dim * weight_proj_factor - self.weight_proj = nn.Sequential( - *[ - block(self.input_dim if i == 0 else inner_dim, inner_dim) - for i in range(weight_proj_depth - 1) - ], - nn.Linear(inner_dim, groups * num_vars), - ) - else: - self.weight_proj = nn.Linear(self.input_dim, groups * num_vars) - nn.init.normal_(self.weight_proj.weight, mean=0, std=1) - nn.init.zeros_(self.weight_proj.bias) - - assert len(temp) == 3, temp - - self.max_temp, self.min_temp, self.temp_decay = temp - self.curr_temp = self.max_temp - self.codebook_indices = None - - def set_num_updates(self, num_updates): - self.curr_temp = max( - self.max_temp * self.temp_decay ** num_updates, self.min_temp - ) - - def get_codebook_indices(self): - if self.codebook_indices is None: - from itertools import product - - p = [range(self.num_vars)] * self.groups - inds = list(product(*p)) - self.codebook_indices = torch.tensor( - inds, dtype=torch.long, device=self.vars.device - ).flatten() - - if not self.combine_groups: - self.codebook_indices = self.codebook_indices.view( - self.num_vars ** self.groups, -1 - ) - for b in range(1, self.groups): - self.codebook_indices[:, b] += self.num_vars * b - self.codebook_indices = self.codebook_indices.flatten() - return self.codebook_indices - - def codebook(self): - indices = self.get_codebook_indices() - return ( - self.vars.squeeze(0) - .index_select(0, indices) - .view(self.num_vars ** self.groups, -1) - ) - - def sample_from_codebook(self, b, n): - indices = self.get_codebook_indices() - indices = indices.view(-1, self.groups) - cb_size = indices.size(0) - assert ( - n < cb_size - ), f"sample size {n} is greater than size of codebook {cb_size}" - sample_idx = torch.randint(low=0, high=cb_size, size=(b * n,)) - indices = indices[sample_idx] - - z = self.vars.squeeze(0).index_select(0, indices.flatten()).view(b, n, -1) - return z - - def to_codebook_index(self, indices): - res = indices.new_full(indices.shape[:-1], 0) - for i in range(self.groups): - exponent = self.groups - i - 1 - res += indices[..., i] * (self.num_vars ** exponent) - return res - - def forward_idx(self, x): - res = self.forward(x, produce_targets=True) - return res["x"], res["targets"] - - def forward(self, x, produce_targets=False): - - result = {"num_vars": self.num_vars * self.groups} - - if not self.time_first: - x = x.transpose(1, 2) - - bsz, tsz, fsz = x.shape - x = x.reshape(-1, fsz) - x = self.weight_proj(x) - x = x.view(bsz * tsz * self.groups, -1) - - _, k = x.max(-1) - hard_x = ( - x.new_zeros(*x.shape) - .scatter_(-1, k.view(-1, 1), 1.0) - .view(bsz * tsz, self.groups, -1) - ) - hard_probs = torch.mean(hard_x.float(), dim=0) - result["code_perplexity"] = torch.exp( - -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) - ).sum() - - avg_probs = torch.softmax( - x.view(bsz * tsz, self.groups, -1).float(), 
dim=-1 - ).mean(dim=0) - result["prob_perplexity"] = torch.exp( - -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1) - ).sum() - - result["temp"] = self.curr_temp - - if self.training: - x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=True).type_as(x) - else: - x = hard_x - - x = x.view(bsz * tsz, -1) - - vars = self.vars - if self.combine_groups: - vars = vars.repeat(1, self.groups, 1) - - if produce_targets: - result["targets"] = ( - x.view(bsz * tsz * self.groups, -1) - .argmax(dim=-1) - .view(bsz, tsz, self.groups) - .detach() - ) - - x = x.unsqueeze(-1) * vars - x = x.view(bsz * tsz, self.groups, self.num_vars, -1) - x = x.sum(-2) - x = x.view(bsz, tsz, -1) - - if not self.time_first: - x = x.transpose(1, 2) # BTC -> BCT - - result["x"] = x - - return result diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/kmeans_vector_quantizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/kmeans_vector_quantizer.py deleted file mode 100644 index 040db1e83e775a3bb59d5263d22aae9276a83f22..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/kmeans_vector_quantizer.py +++ /dev/null @@ -1,127 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -from fairseq.modules import Fp32GroupNorm - - -class KmeansVectorQuantizer(nn.Module): - def __init__( - self, dim, num_vars, groups, combine_groups, vq_dim, time_first, gamma=0.25 - ): - """Vector quantization using straight pass-through estimator (i.e. kmeans) - - Args: - dim: input dimension (channels) - num_vars: number of quantized vectors per group - groups: number of groups for vector quantization - combine_groups: whether to use the vectors for all groups - vq_dim: dimensionality of the resulting quantized vector - time_first: if true, expect input in BxTxC format, otherwise in BxCxT - gamma: commitment loss coefficient - """ - super().__init__() - - self.groups = groups - self.combine_groups = combine_groups - self.input_dim = dim - self.num_vars = num_vars - self.vq_dim = vq_dim - self.time_first = time_first - - assert ( - vq_dim % groups == 0 - ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation" - - self.var_dim = vq_dim // groups - num_groups = groups if not combine_groups else 1 - - self.embedding = nn.Parameter( - 0.01 * torch.randn(num_vars, num_groups, self.var_dim) - ) - self.projection = nn.Sequential( - nn.Conv1d(dim, dim, kernel_size=1, groups=groups, bias=False), - Fp32GroupNorm(groups, dim), - ) - self.gamma = gamma - self.mse_mean = nn.MSELoss(reduction="mean") - - def _pass_grad(self, x, y): - """Manually set gradient for backward pass. - for y = f(x), ensure that during the backward pass, - dL/dy = dL/dx regardless of f(x). - Returns: - y, with the gradient forced to be dL/dy = dL/dx. 
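The _pass_grad docstring above describes a straight-through estimator. As a standalone illustration with toy numbers (nothing here comes from the quantizer itself), the expression evaluates to y while the gradient is routed to x unchanged:

import torch

x = torch.tensor([1.0, 2.0], requires_grad=True)
y = torch.tensor([0.9, 2.3])                   # stand-in for the nearest codeword
out = y.detach() + (x - x.detach())
print(out)                                     # the values of y
out.sum().backward()
print(x.grad)                                  # tensor([1., 1.]): dL/dx equals dL/dout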
- """ - - return y.detach() + (x - x.detach()) - - @property - def expand_embedding(self): - if self.combine_groups: - return self.embedding.expand(self.num_vars, self.groups, self.var_dim) - return self.embedding - - def forward_idx(self, x): - res = self.forward(x, produce_targets=True) - return res["x"], res["targets"] - - def forward(self, x, produce_targets=False): - - result = {"num_vars": self.num_vars} - - if self.time_first: - x = x.transpose(1, 2) - - bsz, fsz, tsz = x.shape - - ze = self.projection(x) - ze_ = ze.view(bsz, self.groups, self.var_dim, tsz).permute(0, 3, 1, 2) - d = ( - (ze_.unsqueeze(0) - self.expand_embedding.unsqueeze(1).unsqueeze(1)) - .view(self.num_vars, bsz, tsz, self.groups, -1) - .norm(dim=-1, p=2) - ) - idx = d.argmin(dim=0) - zq = ( - torch.stack( - [ - self.expand_embedding[idx[..., group], group] - for group in range(self.groups) - ], - dim=-2, - ) - .view(bsz, tsz, self.groups * self.var_dim) - .permute(0, 2, 1) - ) - assert ze.shape == zq.shape, (ze.shape, zq.shape) - x = self._pass_grad(ze, zq) - - hard_x = ( - idx.new_zeros(bsz * tsz * self.groups, self.num_vars) - .scatter_(-1, idx.view(-1, 1), 1.0) - .view(bsz * tsz, self.groups, -1) - ) - hard_probs = torch.mean(hard_x.float(), dim=0) - result["code_perplexity"] = torch.exp( - -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) - ).sum() - - if produce_targets: - result["targets"] = idx - - if self.time_first: - x = x.transpose(1, 2) # BCT -> BTC - result["x"] = x - - ze = ze.float() - zq = zq.float() - latent_loss = self.mse_mean(zq, ze.detach()) - commitment_loss = self.mse_mean(ze, zq.detach()) - - result["kmeans_loss"] = latent_loss + self.gamma * commitment_loss - - return result diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/layer_drop.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/layer_drop.py deleted file mode 100644 index 8961d8bcbc492c40c6b30973234416ce5a414f5a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/layer_drop.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -LayerDrop as described in https://arxiv.org/abs/1909.11556. -""" - -import torch -import torch.nn as nn - - -class LayerDropModuleList(nn.ModuleList): - """ - A LayerDrop implementation based on :class:`torch.nn.ModuleList`. - - We refresh the choice of which layers to drop every time we iterate - over the LayerDropModuleList instance. During evaluation we always - iterate over all layers. 
- - Usage:: - - layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3]) - for layer in layers: # this might iterate over layers 1 and 3 - x = layer(x) - for layer in layers: # this might iterate over all layers - x = layer(x) - for layer in layers: # this might not iterate over any layers - x = layer(x) - - Args: - p (float): probability of dropping out each layer - modules (iterable, optional): an iterable of modules to add - """ - - def __init__(self, p, modules=None): - super().__init__(modules) - self.p = p - - def __iter__(self): - dropout_probs = torch.empty(len(self)).uniform_() - for i, m in enumerate(super().__iter__()): - if not self.training or (dropout_probs[i] > self.p): - yield m diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/layer_norm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/layer_norm.py deleted file mode 100644 index fa0926d847b824af0a8c8159d21748035faf833f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/layer_norm.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -try: - from apex.normalization import FusedLayerNorm as _FusedLayerNorm - - has_fused_layernorm = True - - class FusedLayerNorm(_FusedLayerNorm): - @torch.jit.unused - def forward(self, x): - if not x.is_cuda: - return super().forward(x) - else: - with torch.cuda.device(x.device): - return super().forward(x) - - -except ImportError: - has_fused_layernorm = False - - -def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False): - if torch.jit.is_scripting(): - export = True - if not export and torch.cuda.is_available() and has_fused_layernorm: - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - return NpuLayerNorm(normalized_shape, eps, elementwise_affine) - -class NpuLayerNorm(nn.LayerNorm): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - def forward(self, input): - return torch.layer_norm( - input * 0.5, - self.normalized_shape, - self.weight.reshape(-1), - self.bias.reshape(-1), - self.eps, - ) -class Fp32LayerNorm(nn.LayerNorm): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def forward(self, input): - output = F.layer_norm( - input.float(), - self.normalized_shape, - self.weight.float() if self.weight is not None else None, - self.bias.float() if self.bias is not None else None, - self.eps, - ) - return output.type_as(input) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/learned_positional_embedding.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/learned_positional_embedding.py deleted file mode 100644 index 378d0f707183dd344dbb9288dda394b11053acf0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/learned_positional_embedding.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
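For context on LearnedPositionalEmbedding below: it expects position ids that skip padding tokens, which fairseq derives with utils.make_positions. The helper's assumed behavior is sketched here for illustration only (the reimplementation and its name are an editorial assumption, not code from this repository):

import torch

def make_positions_sketch(tokens, padding_idx):
    # positions count up from padding_idx + 1 for real tokens; pads keep padding_idx
    mask = tokens.ne(padding_idx).int()
    return (torch.cumsum(mask, dim=1) * mask).long() + padding_idx

tokens = torch.tensor([[5, 6, 7, 1, 1]])       # 1 plays the role of the pad index here
print(make_positions_sketch(tokens, padding_idx=1))
# tensor([[2, 3, 4, 1, 1]])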
- -from typing import Dict, Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from torch import Tensor - - -class LearnedPositionalEmbedding(nn.Embedding): - """ - This module learns positional embeddings up to a fixed maximum size. - Padding ids are ignored by either offsetting based on padding_idx - or by setting padding_idx to None and ensuring that the appropriate - position ids are passed to the forward function. - """ - - def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): - super().__init__(num_embeddings, embedding_dim, padding_idx) - self.onnx_trace = False - if self.padding_idx is not None: - self.max_positions = self.num_embeddings - self.padding_idx - 1 - else: - self.max_positions = self.num_embeddings - - def forward( - self, - input: Tensor, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - positions: Optional[Tensor] = None, - ): - """Input is expected to be of size [bsz x seqlen].""" - assert (positions is None) or ( - self.padding_idx is None - ), "If positions is pre-computed then padding_idx should not be set." - - if positions is None: - if incremental_state is not None: - # positions is the same for every token when decoding a single step - # Without the int() cast, it doesn't work in some cases when exporting to ONNX - positions = torch.zeros( - (1, 1), device=input.device, dtype=input.dtype - ).fill_(int(self.padding_idx + input.size(1))) - else: - positions = utils.make_positions( - input, self.padding_idx, onnx_trace=self.onnx_trace - ) - return F.embedding( - positions, - self.weight, - self.padding_idx, - self.max_norm, - self.norm_type, - self.scale_grad_by_freq, - self.sparse, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/__init__.py deleted file mode 100644 index 3b2a99c1227f827768911e5e22e79f6865ffbfd3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .lightconv_layer import LightconvLayer # noqa diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/cuda_function_gen.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/cuda_function_gen.py deleted file mode 100644 index a25433dd8edae2f0b52d7d0eeeb829cabc6b4b89..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/cuda_function_gen.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -def gen_forward(): - - kernels = [3, 5, 7, 15, 31, 63, 127, 255] - seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] - - head = """ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "lightconv_cuda.cuh" - -std::vector lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l) { - - at::DeviceGuard g(input.device()); - const auto minibatch = input.size(0); - const auto numFeatures = input.size(1); - const auto sequenceLength = input.size(2); - - const auto numHeads = filters.size(0); - const auto filterSize = filters.size(1); - - const auto numFiltersInBlock = numFeatures / numHeads; - - const dim3 blocks(minibatch, numFeatures); - - auto output = at::zeros_like(input); - auto stream = at::cuda::getCurrentCUDAStream(); -""" - - sequence_if = """ - if (sequenceLength <= {seq}) {{ - switch(filterSize) {{ -""" - - case_k = """ - case {k}: -""" - - main_block = """ - if (padding_l == {pad}) {{ - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_forward", ([&] {{ - lightconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t> - <<>>( - input.data(), - filters.data(), - minibatch, - sequenceLength, - numFeatures, - numFiltersInBlock, - output.data()); - }})); - }} else -""" - - bad_padding = """ - { - std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl; - } - break; -""" - - bad_filter = """ - default: - std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl; - } -""" - - con_else = """ - } else -""" - - final_else = """ - { - switch(filterSize) { -""" - - final_return = """ - } - - return {output}; -} -""" - - with open("lightconv_cuda_forward.cu", "w") as forward: - forward.write(head) - for seq in seqs: - forward.write(sequence_if.format(seq=seq)) - for k in kernels: - forward.write(case_k.format(k=k)) - for pad in [k // 2, k - 1]: - forward.write(main_block.format(k=k, b_size=seq, pad=pad)) - forward.write(bad_padding) - forward.write(bad_filter) - forward.write(con_else) - - forward.write(final_else) - for k in kernels: - forward.write(case_k.format(k=k)) - for pad in [k // 2, k - 1]: - forward.write(main_block.format(k=k, b_size=seq, pad=pad)) - forward.write(bad_padding) - forward.write(bad_filter) - forward.write(final_return) - - -def gen_backward(): - - head = """ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
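The gen_forward writer above emits one branch per (sequence bucket, filter size, padding) combination. Conceptually, the generated forward file performs the selection sketched below; this is a Python paraphrase with a hypothetical helper name, not part of the generator itself.

def pick_forward_specialization(sequence_length, filter_size, padding_l):
    # mirrors the kernels/seqs lists and the padding options {k // 2, k - 1} used above
    seqs = [32 * x for x in range(1, 17)]
    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
    if filter_size not in kernels or padding_l not in (filter_size // 2, filter_size - 1):
        return None                                 # generated code prints a warning and skips the pass
    for seq in seqs:
        if sequence_length <= seq:
            return (filter_size, seq, padding_l)    # <FS, SB, padding_l> template arguments
    return (filter_size, seqs[-1], padding_l)       # trailing else-branch reuses the last bucket

print(pick_forward_specialization(100, 7, 3))       # (7, 128, 3)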
- */ - -#include "lightconv_cuda.cuh" - -std::vector lightconv_cuda_backward( - at::Tensor gradOutput, - int padding_l, - at::Tensor input, - at::Tensor filters) { - - // gradWrtInput - const int minibatch = input.size(0); - const int numFeatures = input.size(1); - const int sequenceLength = input.size(2); - - const int numHeads = filters.size(0); - const int filterSize = filters.size(1); - - const dim3 gradBlocks(minibatch, numFeatures); - const dim3 weightGradFirstpassShortBlocks(minibatch, numHeads); - const dim3 weightGradSecondpassBlocks(numHeads, filterSize); - - const int numFiltersInBlock = numFeatures / numHeads; - - auto gradInput = at::zeros_like(input); - auto gradFilters = at::zeros_like(filters); - - at::DeviceGuard g(input.device()); - auto stream = at::cuda::getCurrentCUDAStream(); - - switch(filterSize) { -""" - - sequence_if = """ - if (sequenceLength <= {seq}) {{ -""" - - case_k = """ - case {k}: -""" - - main_block = """ - if (padding_l == {p}) {{ - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_backward", ([&] {{ - lightconv_grad_wrt_input_kernel<{k}, {b_size}, {p}, scalar_t> - <<>>( - gradOutput.data(), - filters.data(), - minibatch, - sequenceLength, - numFeatures, - numFiltersInBlock, - gradInput.data()); - -""" - - weight_grad_short = """ - at::Tensor tempSumGradFilters = at::zeros({{minibatch, numHeads, filterSize}}, input.options().dtype(at::kFloat)); - lightconv_grad_wrt_weights_firstpass_short_kernel<{k}, {b_size}, {p}, scalar_t> - <<>>( - input.data(), - gradOutput.data(), - minibatch, - sequenceLength, - numFeatures, - numFiltersInBlock, - numHeads, - tempSumGradFilters.data() - ); - - lightconv_grad_wrt_weights_secondpass_short_kernel<{k}, {b_size}, scalar_t> - <<>>( - tempSumGradFilters.data(), - minibatch, - numFiltersInBlock, - gradFilters.data() - ); - }})); - }} else -""" - - weight_grad = """ - at::Tensor tempSumGradFilters = at::zeros({{minibatch, numFeatures, filterSize}}, input.options().dtype(at::kFloat)); - lightconv_grad_wrt_weights_firstpass_kernel<{k}, {b_size}, {p}, scalar_t> - <<>>( - input.data(), - gradOutput.data(), - minibatch, - sequenceLength, - numFeatures, - numFiltersInBlock, - tempSumGradFilters.data() - ); - - lightconv_grad_wrt_weights_secondpass_kernel<{k}, {b_size}, scalar_t> - <<>>( - tempSumGradFilters.data(), - minibatch, - numFiltersInBlock, - gradFilters.data() - ); - }})); - }} else -""" - - bad_padding = """ - { - std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl; - } -""" - - breakout = """ - break; -""" - - bad_filter = """ - default: - std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl; -""" - - con_else = """ - } else -""" - - final_else = """ - { - switch(filterSize) { -""" - - last_return = """ - } - return {gradInput, gradFilters}; -} -""" - - kernels = [3, 5, 7, 15, 31, 63, 127, 255] - seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] - thresh = [32, 32, 64, 128, 256, -1, -1, -1] - max_mem = [-1, -1, -1, -1, -1, 192, 96, 64] - - with open("lightconv_cuda_backward.cu", "w") as backward: - backward.write(head) - for (k, t, mem) in zip(kernels, thresh, max_mem): - backward.write(case_k.format(k=k)) - for seq in seqs: - if (t == -1 or seq <= t) and (mem == -1 or seq < mem): - backward.write(sequence_if.format(seq=seq)) - for p in [k // 2, k - 1]: - backward.write(main_block.format(k=k, b_size=seq, p=p)) - backward.write(weight_grad_short.format(k=k, b_size=seq, p=p)) - 
backward.write(bad_padding) - else: - for p in [k // 2, k - 1]: - backward.write(main_block.format(k=k, b_size=32, p=p)) - backward.write(weight_grad.format(k=k, b_size=32, p=p)) - backward.write(bad_padding) - backward.write(breakout) - break - backward.write(con_else) - backward.write(bad_filter) - backward.write(last_return) - - -if __name__ == "__main__": - gen_forward() - gen_backward() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda.cpp b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda.cpp deleted file mode 100644 index 4bf6b5ad365d604bd91eda384bb422857b640744..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include - -std::vector lightconv_cuda_forward( - at::Tensor input, - at::Tensor filters, - int padding_l); - -std::vector lightconv_cuda_backward( - at::Tensor gradOutput, - int padding_l, - at::Tensor input, - at::Tensor filters); - - -#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -std::vector lightconv_forward( - at::Tensor input, - at::Tensor filters, - int padding_l) { - - CHECK_INPUT(input); - CHECK_INPUT(filters); - - return lightconv_cuda_forward(input, filters, padding_l); -} - -std::vector lightconv_backward( - at::Tensor gradOutput, - int padding_l, - at::Tensor input, - at::Tensor filters) { - - CHECK_INPUT(gradOutput); - CHECK_INPUT(input); - CHECK_INPUT(filters); - - return lightconv_cuda_backward(gradOutput, padding_l, input, filters); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &lightconv_forward, "lighconv forward (CUDA)"); - m.def("backward", &lightconv_backward, "lighconv backward (CUDA)"); -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda.cuh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda.cuh deleted file mode 100644 index 3cae57b68fc96872a5047a7a0d081b78456e8fae..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda.cuh +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#define SHFL_MASK 0xffffffff - -template -__global__ -void lightconv_forward_kernel(const scalar_t* input, - const scalar_t* filters, - int minibatch, int sequenceLength, - int numFeatures, int numFiltersInBlock, - scalar_t* output); - -template -__global__ -void lightconv_grad_wrt_input_kernel( - const scalar_t* input, - const scalar_t* filters, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - scalar_t* output); - -template -__global__ -void lightconv_grad_wrt_weights_firstpass_short_kernel( - const scalar_t* input, - const scalar_t* gradInput, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - float* output); - -template -__global__ -void lightconv_grad_wrt_weights_secondpass_short_kernel( - const float* input, - const int minibatch, - const int numFiltersInBlock, - scalar_t* output); - -template -__global__ -void lightconv_grad_wrt_weights_firstpass_kernel( - const scalar_t* input, - const scalar_t* gradInput, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - float* output); - -template -__global__ -void lightconv_grad_wrt_weights_secondpass_kernel( - const float* input, - const int minibatch, - const int numFiltersInBlock, - scalar_t* output); - diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu deleted file mode 100644 index 8ee83a56c89754c2abbe717b269d07ca9e64eef2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu +++ /dev/null @@ -1,375 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
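Before the kernel implementations that follow, a pure-PyTorch sketch of what lightweight convolution computes may help: unlike the dynamic variant, the filters have shape (H, K) and are shared across the batch and across time, with the C // H channels of each head reusing one filter. This is an illustrative reference only, and the function name is made up.

import torch
import torch.nn.functional as F

def lightconv_reference(x, filters, padding_l):
    # x: (B, C, T) activations; filters: (H, K), one (optionally softmax-normalized)
    # filter per head. Assumed equivalent of lightconv_cuda_forward, for illustration.
    B, C, T = x.shape
    H, K = filters.shape
    x_pad = F.pad(x, (padding_l, K - 1 - padding_l))
    x_unfold = x_pad.unfold(-1, K, 1).view(B, H, C // H, T, K)
    w = filters.view(1, H, 1, 1, K)
    return (x_unfold * w).sum(-1).reshape(B, C, T)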
- */ - -#include "lightconv_cuda.cuh" -#include "lightconv_cuda_forward.cu" -#include "lightconv_cuda_backward.cu" -#include "../cuda_utils.cu" - -template -__global__ -void lightconv_forward_kernel(const scalar_t* input, - const scalar_t* filters, - int minibatch, int sequenceLength, - int numFeatures, int numFiltersInBlock, - scalar_t* output) { - - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int featureIdx = blockIdx.y; - const int filterIdx = featureIdx / numFiltersInBlock; - - const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - scalar_t* outputFeature = &output[IOOffset]; - const scalar_t* inputFilter = &filters[filterIdx * FS]; - - assert(blockDim.x == SB); - - scalar_t filter[FS]; - #pragma unroll - for (int i = 0; i < FS; ++i) { - filter[i] = inputFilter[i]; - } - - __shared__ scalar_t temp[SB + FS]; - zeroSharedMem(temp); - - const int numIterations = divUp(sequenceLength, SB); - - for (int i = 0; i < numIterations; ++i) { - // Read input into shared memory - const int inputOffset = i * SB; - - load_input_to_shared(inputFeature, inputOffset, sequenceLength, - i, numIterations, (numIterations == 1), temp); - - __syncthreads(); - - scalar_t out = 0; - #pragma unroll - for (int j = 0; j < FS; ++j) { - out += filter[j] * temp[tid + j]; - } - - // Write output - const int outputOffset = inputOffset; - if ((outputOffset + tid) < sequenceLength) { - outputFeature[outputOffset + tid] = out; - } - - __syncthreads(); - } -} - -template -__global__ -void lightconv_grad_wrt_input_kernel( - const scalar_t* input, - const scalar_t* filters, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - scalar_t* output) { - - // input grad kernel is similar to forward kernel - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int featureIdx = blockIdx.y; - const int filterIdx = featureIdx / numFiltersInBlock; - - const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - scalar_t* outputFeature = &output[IOOffset]; - const scalar_t* inputFilter = &filters[filterIdx * FS]; - - assert(blockDim.x == SB); - - scalar_t filter[FS]; - - // The only change is loading the filter in reverse - #pragma unroll - for (int i = 0; i < FS; ++i) { - filter[i] = inputFilter[FS - i - 1]; - } - - __shared__ scalar_t temp[SB + FS]; - const int padding = FS - padding_l - 1; - zeroSharedMem(temp); - - __syncthreads(); - - const int numIterations = divUp(sequenceLength, SB); - - for (int i = 0; i < numIterations; ++i) { - // Read input into shared memory - const int inputOffset = i * SB; - - load_input_to_shared(inputFeature, inputOffset, sequenceLength, - i, numIterations, false, temp); - - __syncthreads(); - - scalar_t out = 0; - #pragma unroll - for (int j = 0; j < FS; ++j) { - out += filter[j] * temp[tid + j]; - } - - // Write output - const int outputOffset = inputOffset; - if ((outputOffset + tid) < sequenceLength) { - outputFeature[outputOffset + tid] = out; - } - - __syncthreads(); - } -} - -// This is by far the most expensive kernel in terms of time taken. 
-// Can be 16x slower than the forward or grad_wrt_input when filter size is 31 -template -__global__ -void lightconv_grad_wrt_weights_firstpass_short_kernel( - const scalar_t* input, - const scalar_t* gradInput, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - float* output) { - - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int filterIdx = blockIdx.y; - - const int numIterations = divUp(sequenceLength, SB); - - float* tempOutputGradWeight = &output[filterIdx * FS * minibatch]; - - assert(blockDim.x == SB); - - __shared__ scalar_t tempInput[SB + FS]; - __shared__ scalar_t tempGradInput[SB + FS]; - - // local weight accumulation - float accumWeights[FS]; - - // Initialize memory - for (int i = 0; i < FS; ++i) { - accumWeights[i] = float(0.0); - } - - - // loop over each sequence within filterblock - for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock; ++idxInFilterBlock) { - - const int featureOffset = batchIdx * numFeatures * sequenceLength + (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength; - const scalar_t* inputFeature = &input[featureOffset]; - const scalar_t* gradInputFeature = &gradInput[featureOffset]; - - zeroSharedMem(tempInput); - zeroSharedMem(tempGradInput); - __syncthreads(); - - for (int i = 0; i < numIterations; ++i) { - - const int inputOffset = i * SB; - - load_input_to_shared(inputFeature, inputOffset, sequenceLength, - i, numIterations, false, tempInput); - load_input_to_shared(gradInputFeature, inputOffset, sequenceLength, - i, numIterations, false, tempGradInput); - - __syncthreads(); - - const int gradIndex = (FS/2) + tid; - scalar_t tempGrad = tempGradInput[gradIndex]; - - #pragma unroll - for (int j = 0; j < FS; j++) { - const int inputIndex = tid + j; - accumWeights[j] += tempInput[inputIndex] * tempGrad; - } - - __syncthreads(); - - } - - } - - // Row-major sum - for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) { - - float temp; - if (tid < sequenceLength) { - temp = accumWeights[filterWeightIdx]; - } else { - temp = float(0.0); - } - - const int outputOffset = filterWeightIdx * minibatch + batchIdx; - - temp = blockReduce(temp); - - if (tid == 0) { - tempOutputGradWeight[outputOffset] = temp; - } - } -} - -template -__global__ -void lightconv_grad_wrt_weights_secondpass_short_kernel( - const float* input, - const int minibatch, - const int numFiltersInBlock, - scalar_t* output) { - - assert(blockDim.x == SB); - - const int tid = threadIdx.x; - - const int filterIdx = blockIdx.x; - const int filterWeightIdx = blockIdx.y; - - const int inputOffset = filterIdx * FS * minibatch + - filterWeightIdx * minibatch; - const float* tempInput = &input[inputOffset]; - - // read into shared memory for reduction - int readIndex = tid; - - float sum = 0.0; - while (readIndex < minibatch) { - sum += tempInput[readIndex]; - readIndex += SB; - } - - float temp = blockReduce(sum); - - if (tid == 0) { - output[blockIdx.x * FS + blockIdx.y] = temp; - } -} - -// This is by far the most expensive kernel in terms of time taken. 
-// Can be 16x slower than the forward or grad_wrt_input when filter size is 31 -template -__global__ -void lightconv_grad_wrt_weights_firstpass_kernel( - const scalar_t* input, - const scalar_t* gradInput, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - float* output) { - - assert(blockDim.x == SB); - - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int featureIdx = blockIdx.y; - const int filterIdx = featureIdx / numFiltersInBlock; - const int idxInFilterBlock = featureIdx % numFiltersInBlock; - - const int numIterations = divUp(sequenceLength, SB); - - float temp; - - __shared__ scalar_t tempInput[SB + FS]; - __shared__ scalar_t tempGradInput[SB + FS]; - zeroSharedMem(tempInput); - zeroSharedMem(tempGradInput); - __syncthreads(); - - float accumWeights[FS]; - - for (int i = 0; i < FS; ++i) { - accumWeights[i] = float(0.0); - } - - const int IOOffset = batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - const scalar_t* gradInputFeature = &gradInput[IOOffset]; - float* tempOutputGradWeight = &output[filterIdx * FS * minibatch * numFiltersInBlock]; - - for (int i = 0; i < numIterations; ++i) { - const int inputOffset = i * SB; - - load_input_to_shared(inputFeature, inputOffset, sequenceLength, - i, numIterations, false, tempInput); - load_input_to_shared(gradInputFeature, inputOffset, sequenceLength, - i, numIterations, false, tempGradInput); - __syncthreads(); - - #pragma unroll - for (int j = 0; j < FS; ++j) { - accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS/2)]; - } - - __syncthreads(); - } - - // Row-major sum - for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) { - - // Write to shared memory before reduction - if (tid < sequenceLength) { - temp = accumWeights[filterWeightIdx]; - } else { - temp = float(0.0); - } - - temp = blockReduce(temp); - - const int outputOffset = filterWeightIdx * minibatch * numFiltersInBlock + - batchIdx * numFiltersInBlock + - idxInFilterBlock; - - if (tid == 0) { - tempOutputGradWeight[outputOffset] = temp; - } - } -} - -template -__global__ -void lightconv_grad_wrt_weights_secondpass_kernel( - const float* input, - const int minibatch, - const int numFiltersInBlock, - scalar_t* output) { - - assert(blockDim.x == SB); - const int tid = threadIdx.x; - - // What is the id within a minibatch - const int filterIdx = blockIdx.x; - const int filterWeightIdx = blockIdx.y; - - const int inputOffset = filterIdx * FS * minibatch * numFiltersInBlock + - filterWeightIdx * minibatch * numFiltersInBlock; - const float* tempInput = &input[inputOffset]; - - int readIndex = tid; - - float sum = float(0.0); - while (readIndex < (minibatch * numFiltersInBlock)) { - sum += tempInput[readIndex]; - readIndex += SB; - } - - float temp = blockReduce(sum); - - if (tid == 0) { - output[blockIdx.x * FS + blockIdx.y] = temp; - } -} diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_layer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_layer.py deleted file mode 100644 index e7e597f4749c591b057d776aacec39b44d99c037..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/lightconv_layer.py +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import lightconv_cuda -import torch -import torch.nn.functional as F -from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state -from fairseq.modules.fairseq_dropout import FairseqDropout -from torch import nn -from torch.autograd import Function - - -class lightconvFunction(Function): - @staticmethod - def forward(ctx, x, weights, padding_l): - ctx.padding_l = padding_l - outputs = lightconv_cuda.forward(x, weights, padding_l) - variables = [x, weights] - ctx.save_for_backward(*variables) - return outputs[0] - - @staticmethod - def backward(ctx, grad_output): - outputs = lightconv_cuda.backward( - grad_output.contiguous(), ctx.padding_l, *ctx.saved_tensors - ) - grad_input, grad_weights = outputs - return grad_input, grad_weights, None - - -@with_incremental_state -class LightconvLayer(nn.Module): - def __init__( - self, - input_size, - kernel_size=1, - padding_l=None, - weight_softmax=False, - num_heads=1, - weight_dropout=0.0, - bias=False, - ): - super(LightconvLayer, self).__init__() - self.input_size = input_size - self.kernel_size = kernel_size - self.padding_l = padding_l - self.num_heads = num_heads - self.weight_softmax = weight_softmax - self.weight_dropout_module = FairseqDropout( - weight_dropout, module_name=self.__class__.__name__ - ) - - self.weight = nn.Parameter(torch.Tensor(num_heads, kernel_size)) - if bias: - self.bias = nn.Parameter(torch.Tensor(input_size)) - else: - self.bias = None - self.reset_parameters() - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." if name != "" else "" - for k, v in state_dict.items(): - if k.endswith(prefix + "weight"): - if v.dim() == 3 and v.size(1) == 1: - state_dict[k] = v.squeeze(1) - - def reset_parameters(self): - nn.init.xavier_uniform_(self.weight) - if self.bias is not None: - nn.init.constant_(self.bias, 0.0) - - def forward(self, x, incremental_state=None): - - # during inference time, incremental BMM is faster - if incremental_state is not None: - T, B, C = x.size() - K, H = self.kernel_size, self.num_heads - R = C // H - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is None: - input_buffer = x.new() - x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) - if self.kernel_size > 1: - self._set_input_buffer( - incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :] - ) - x_unfold = x_unfold.view(T * B * H, R, -1) - - weight = self.weight - if self.weight_softmax: - weight = F.softmax(weight.float(), dim=1).type_as(weight) - - weight = weight[:, -x_unfold.size(2) :] - - K = weight.size(1) - - weight = ( - weight.view(1, H, K) - .expand(T * B, H, K) - .contiguous() - .view(T * B * H, K, 1) - ) - - weight = self.weight_dropout_module(weight) - output = torch.bmm(x_unfold, weight) # T*B*H x R x 1 - output = output.view(T, B, C) - return output - - # during training time, use CUDA kernel - else: - x = x.permute(1, 2, 0).contiguous() - weight = self.weight - if self.weight_softmax: - weight = F.softmax(self.weight, -1) - if self.weight_dropout_module.p: - weight = self.weight_dropout_module(weight) - return lightconvFunction.apply(x, weight, self.padding_l).permute(2, 0, 1) - - def reorder_incremental_state(self, incremental_state, new_order): - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is not None: - input_buffer = input_buffer.index_select(1, new_order) - 
self._set_input_buffer(incremental_state, input_buffer) - - def _get_input_buffer(self, incremental_state): - return utils.get_incremental_state(self, incremental_state, "input_buffer") - - def _set_input_buffer(self, incremental_state, new_buffer): - return utils.set_incremental_state( - self, incremental_state, "input_buffer", new_buffer - ) - - def half(self): - return self._apply(lambda t: t.half() if t.is_floating_point() else t) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/setup.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/setup.py deleted file mode 100644 index 052635be79b466d0ad56cf5cf607bd10c2297ecf..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightconv_layer/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - - -setup( - name="lightconv_layer", - ext_modules=[ - CUDAExtension( - "lightconv_cuda", - [ - "lightconv_cuda.cpp", - "lightconv_cuda_kernel.cu", - ], - ), - ], - cmdclass={"build_ext": BuildExtension}, -) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightweight_convolution.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightweight_convolution.py deleted file mode 100644 index ec11a9507951c9e8f3564753841dd9c74a4900e0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/lightweight_convolution.py +++ /dev/null @@ -1,310 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -import torch.nn.functional as F -from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state -from fairseq.modules.fairseq_dropout import FairseqDropout -from fairseq.modules.unfold import unfold1d - - -def LightweightConv( - input_size, - kernel_size=1, - padding_l=None, - num_heads=1, - weight_dropout=0.0, - weight_softmax=False, - bias=False, -): - if torch.cuda.is_available(): - try: - from fairseq.modules.lightconv_layer import LightconvLayer - - return LightconvLayer( - input_size, - kernel_size=kernel_size, - padding_l=padding_l, - num_heads=num_heads, - weight_dropout=weight_dropout, - weight_softmax=weight_softmax, - bias=bias, - ) - except ImportError as e: - print(e) - return LightweightConv1dTBC( - input_size, - kernel_size=kernel_size, - padding_l=padding_l, - num_heads=num_heads, - weight_dropout=weight_dropout, - weight_softmax=weight_softmax, - bias=bias, - ) - - -class LightweightConv1d(nn.Module): - """Lightweight Convolution assuming the input is BxCxT - This is just an example that explains LightConv clearer than the TBC version. - We don't use this module in the model. - - Args: - input_size: # of channels of the input and output - kernel_size: convolution channels - padding: padding - num_heads: number of heads used. 
The weight is of shape - `(num_heads, 1, kernel_size)` - weight_softmax: normalize the weight with softmax before the convolution - - Shape: - Input: BxCxT, i.e. (batch_size, input_size, timesteps) - Output: BxCxT, i.e. (batch_size, input_size, timesteps) - - Attributes: - weight: the learnable weights of the module of shape - `(num_heads, 1, kernel_size)` - bias: the learnable bias of the module of shape `(input_size)` - """ - - def __init__( - self, - input_size, - kernel_size=1, - padding=0, - num_heads=1, - weight_softmax=False, - bias=False, - weight_dropout=0.0, - ): - super().__init__() - self.input_size = input_size - self.kernel_size = kernel_size - self.num_heads = num_heads - self.padding = padding - self.weight_softmax = weight_softmax - self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size)) - - if bias: - self.bias = nn.Parameter(torch.Tensor(input_size)) - else: - self.bias = None - self.weight_dropout_module = FairseqDropout( - weight_dropout, module_name=self.__class__.__name__ - ) - self.reset_parameters() - - def reset_parameters(self): - nn.init.xavier_uniform_(self.weight) - if self.bias is not None: - nn.init.constant_(self.bias, 0.0) - - def forward(self, input): - """ - input size: B x C x T - output size: B x C x T - """ - B, C, T = input.size() - H = self.num_heads - - weight = self.weight - if self.weight_softmax: - weight = F.softmax(weight, dim=-1) - - weight = self.weight_dropout_module(weight) - # Merge every C/H entries into the batch dimension (C = self.input_size) - # B x C x T -> (B * C/H) x H x T - # One can also expand the weight to C x 1 x K by a factor of C/H - # and do not reshape the input instead, which is slow though - input = input.view(-1, H, T) - output = F.conv1d(input, weight, padding=self.padding, groups=self.num_heads) - output = output.view(B, C, T) - if self.bias is not None: - output = output + self.bias.view(1, -1, 1) - - return output - - -@with_incremental_state -class LightweightConv1dTBC(nn.Module): - """Lightweight Convolution assuming the input is TxBxC - Args: - input_size: # of channels of the input - kernel_size: convolution channels - padding_l: padding to the left when using "same" padding - num_heads: number of heads used. The weight is of shape (num_heads, 1, kernel_size) - weight_dropout: the drop rate of the DropConnect to drop the weight - weight_softmax: normalize the weight with softmax before the convolution - bias: use bias - - Shape: - Input: TxBxC, i.e. (timesteps, batch_size, input_size) - Output: TxBxC, i.e. 
(timesteps, batch_size, input_size) - - Attributes: - weight: the learnable weights of the module of shape - `(num_heads, 1, kernel_size)` - bias: the learnable bias of the module of shape `(input_size)` - """ - - def __init__( - self, - input_size, - kernel_size=1, - padding_l=None, - num_heads=1, - weight_dropout=0.0, - weight_softmax=False, - bias=False, - ): - super().__init__() - self.input_size = input_size - self.kernel_size = kernel_size - self.padding_l = padding_l - self.num_heads = num_heads - self.weight_dropout_module = FairseqDropout( - weight_dropout, module_name=self.__class__.__name__ - ) - self.weight_softmax = weight_softmax - - self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size)) - if bias: - self.bias = nn.Parameter(torch.Tensor(input_size)) - else: - self.bias = None - - self.reset_parameters() - self.onnx_trace = False - - def reset_parameters(self): - nn.init.xavier_uniform_(self.weight) - if self.bias is not None: - nn.init.constant_(self.bias, 0.0) - - def forward(self, x, incremental_state=None, unfold=False): - """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C - args: - x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size) - incremental_state: A dict to keep the state - unfold: unfold the input or not. If not, we use the matrix trick instead - """ - unfold = unfold or (incremental_state is not None) - - if unfold: - output = self._forward_unfolded(x, incremental_state) - else: - output = self._forward_expanded(x, incremental_state) - - if self.bias is not None: - output = output + self.bias.view(1, 1, -1) - return output - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - def _forward_unfolded(self, x, incremental_state): - """The conventional implementation of convolutions. - Unfolding the input by having a window shifting to the right.""" - T, B, C = x.size() - K, H = self.kernel_size, self.num_heads - R = C // H - assert R * H == C == self.input_size - - weight = self.weight.view(H, K) - if incremental_state is not None: - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is None: - input_buffer = x.new() - x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) - if self.kernel_size > 1: - self._set_input_buffer( - incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :] - ) - x_unfold = x_unfold.view(T * B * H, R, -1) - else: - # unfold the input: T x B x C --> T' x B x C x K - x_unfold = unfold1d(x, self.kernel_size, self.padding_l, 0) - x_unfold = x_unfold.view(T * B * H, R, K) - - if self.weight_softmax: - weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as( - weight - ) - - if incremental_state is not None: - weight = weight[:, -x_unfold.size(2) :] - K = weight.size(1) - - weight = ( - weight.view(1, H, K).expand(T * B, H, K).contiguous().view(T * B * H, K, 1) - ) - - weight = self.weight_dropout_module(weight) - output = torch.bmm(x_unfold, weight) # T*B*H x R x 1 - output = output.view(T, B, C) - return output - - def _forward_expanded(self, x, incremental_state): - """Turn the convolution filters into band matrices and do matrix multiplication. - This is faster when the sequence is short, but less memory efficient. - This is not used in the decoder during inference. 
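Before the band-matrix implementation that follows, a minimal sketch of what a lightweight convolution computes, using the grouped-convolution formulation of LightweightConv1d above. All sizes (B, C, T, H, K) are illustrative assumptions, not values from this module:

import torch
import torch.nn.functional as F

B, C, T, H, K = 2, 8, 10, 4, 3          # illustrative sizes; C must be divisible by H
padding = K // 2
x = torch.randn(B, C, T)
weight = F.softmax(torch.randn(H, 1, K), dim=-1)   # one normalized kernel per head

# Fold channels into the batch dimension and run a grouped conv so that the
# H kernels are shared across all channels, as described for LightweightConv1d.
y = F.conv1d(x.view(-1, H, T), weight, padding=padding, groups=H).view(B, C, T)
print(y.shape)   # torch.Size([2, 8, 10])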
- """ - T, B, C = x.size() - K, H = self.kernel_size, self.num_heads - R = C // H - assert R * H == C == self.input_size - - weight = self.weight.view(H, K) - if self.weight_softmax: - weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as( - weight - ) - weight = weight.view(1, H, K).expand(T * B, H, K).contiguous() - weight = weight.view(T, B * H, K).transpose(0, 1) - - x = x.view(T, B * H, R).transpose(0, 1) - P = self.padding_l - if K > T and P == K - 1: - weight = weight.narrow(2, K - T, T) - K, P = T, T - 1 - # turn the convolution filters into band matrices - weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False) - weight_expanded.as_strided((B * H, T, K), (T * (T + K - 1), T + K, 1)).copy_( - weight - ) - weight_expanded = weight_expanded.narrow(2, P, T) - weight_expanded = self.weight_dropout_module(weight_expanded) - - output = torch.bmm(weight_expanded, x) - output = output.transpose(0, 1).contiguous().view(T, B, C) - return output - - def reorder_incremental_state(self, incremental_state, new_order): - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is not None: - input_buffer = input_buffer.index_select(1, new_order) - self._set_input_buffer(incremental_state, input_buffer) - - def _get_input_buffer(self, incremental_state): - return utils.get_incremental_state(self, incremental_state, "input_buffer") - - def _set_input_buffer(self, incremental_state, new_buffer): - return utils.set_incremental_state( - self, incremental_state, "input_buffer", new_buffer - ) - - def extra_repr(self): - s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, bias={}".format( - self.input_size, - self.kernel_size, - self.padding_l, - self.num_heads, - self.weight_softmax, - self.bias is not None, - ) - if self.weight_dropout_module.p > 0.0: - s += ", weight_dropout={}".format(self.weight_dropout_module.p) - return s diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/linearized_convolution.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/linearized_convolution.py deleted file mode 100644 index 09a8f201c0218f461f44ca57b3352328e4efb936..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/linearized_convolution.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn.functional as F -from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state - -from .conv_tbc import ConvTBC - - -@with_incremental_state -class LinearizedConvolution(ConvTBC): - """An optimized version of nn.Conv1d. - - At training time, this module uses ConvTBC, which is an optimized version - of Conv1d. At inference time, it optimizes incremental generation (i.e., - one time step at a time) by replacing the convolutions with linear layers. - Note that the input order changes from training to inference. 
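A standalone sketch of the idea described in this docstring, assuming a plain causal Conv1d rather than ConvTBC: during single-step decoding, the convolution over a rolling buffer of the last `kernel_size` frames reduces to one linear layer whose weight is the flattened kernel. All names and sizes here are illustrative assumptions, not code from this file:

import torch
import torch.nn.functional as F

batch, in_channels, out_channels, kernel_size = 2, 8, 16, 3   # illustrative sizes

conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size)

# Flatten the kernel into a linear weight whose columns follow the
# (time step, channel) layout of the rolling buffer below.
linear_weight = conv.weight.permute(0, 2, 1).reshape(out_channels, -1)

def step(buffer, frame):
    """Shift the buffer, append one (batch, in_channels) frame, emit one output frame."""
    buffer = torch.cat([buffer[:, 1:, :], frame.unsqueeze(1)], dim=1)
    out = F.linear(buffer.reshape(batch, -1), linear_weight, conv.bias)
    return buffer, out

buffer = torch.zeros(batch, kernel_size, in_channels)
frames = torch.randn(5, batch, in_channels)
outputs = []
for frame in frames:
    buffer, out = step(buffer, frame)
    outputs.append(out)

# Once the buffer is full, the incremental output matches a full convolution
# over the last `kernel_size` frames.
full = torch.stack(list(frames), dim=2)              # (batch, in_channels, T)
reference = F.conv1d(full, conv.weight, conv.bias)   # (batch, out_channels, T - K + 1)
print(torch.allclose(outputs[-1], reference[:, :, -1], atol=1e-5))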
- """ - - def __init__(self, in_channels, out_channels, kernel_size, **kwargs): - super().__init__(in_channels, out_channels, kernel_size, **kwargs) - self._linearized_weight = None - self.register_backward_hook(self._clear_linearized_weight) - - def state_dict(self, destination=None, prefix="", keep_vars=False): - state = ConvTBC.state_dict(self, destination, prefix, keep_vars=keep_vars) - # don't store redundant _linearized_weight in checkpoints - if prefix + "_linearized_weight" in state: - del state[prefix + "_linearized_weight"] - return state - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." if name != "" else "" - if prefix + "_linearized_weight" in state_dict: - del state_dict[prefix + "_linearized_weight"] - - def forward(self, input, incremental_state=None): - """ - Args: - incremental_state: Used to buffer signal; if not None, then input is - expected to contain a single frame. If the input order changes - between time steps, call reorder_incremental_state. - Input: - Time x Batch x Channel during training - Batch x Time x Channel during inference - """ - if incremental_state is None: - output = super().forward(input) - if self.kernel_size[0] > 1 and self.padding[0] > 0: - # remove future timesteps added by padding - output = output[: -self.padding[0], :, :] - return output - - # reshape weight - weight = self._get_linearized_weight() - kw = self.kernel_size[0] - - bsz = input.size(0) # input: bsz x len x dim - if kw > 1: - input = input.data - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is None: - input_buffer = input.new(bsz, kw, input.size(2)).zero_() - self._set_input_buffer(incremental_state, input_buffer) - else: - # shift buffer - input_buffer[:, :-1, :] = input_buffer[:, 1:, :].clone() - # append next input - input_buffer[:, -1, :] = input[:, -1, :] - input = input_buffer - with torch.no_grad(): - output = F.linear(input.view(bsz, -1), weight, self.bias) - return output.view(bsz, 1, -1) - - def reorder_incremental_state(self, incremental_state, new_order): - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is not None: - input_buffer = input_buffer.index_select(0, new_order) - self._set_input_buffer(incremental_state, input_buffer) - - def _get_input_buffer(self, incremental_state): - return utils.get_incremental_state(self, incremental_state, "input_buffer") - - def _set_input_buffer(self, incremental_state, new_buffer): - return utils.set_incremental_state( - self, incremental_state, "input_buffer", new_buffer - ) - - def _get_linearized_weight(self): - if self._linearized_weight is None: - kw = self.kernel_size[0] - weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() - assert weight.size() == (self.out_channels, kw, self.in_channels) - self._linearized_weight = torch.nn.Parameter( - weight.view(self.out_channels, -1) - ) - return self._linearized_weight - - def _clear_linearized_weight(self, *args): - self._linearized_weight = None diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/multihead_attention.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/multihead_attention.py deleted file mode 100644 index 5d061ce0ca3cec518aa4cb1c209e8c5aca308706..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/multihead_attention.py +++ /dev/null @@ -1,510 +0,0 @@ -# Copyright (c) Facebook, Inc. 
and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from typing import Dict, Optional, Tuple - -import torch -import torch.nn.functional as F -from fairseq import utils -from fairseq.incremental_decoding_utils import with_incremental_state -from fairseq.modules.fairseq_dropout import NpuFairseqDropout, get_dropout_class -from fairseq.modules.quant_noise import quant_noise -from torch import Tensor, nn -from torch.nn import Parameter - -dropout_class = get_dropout_class() - - -class NpuLinear(nn.Linear): - def forward(self, input): - input_shape = input.size() - if input.dim() == 3: - input = input.view(-1, self.in_features) - return torch.npu_linear(input,self.weight, self.bias).view(input_shape[0], - input_shape[1], - self.out_features) - elif input.dim() == 2: - return torch.npu_linear(input, self.weight,self.bias) - else: - raise RuntimeError('not support this dim') - -class MHAConfig: - use_fussion_mha = False - - @classmethod - def set_fussion(cls): - try: - from torch import npu_multi_head_attention - cls.use_fussion_mha = True - except: - cls.use_fussion_mha = False - -class MatmulApply(torch.autograd.Function): - @staticmethod - def forward(ctx, self, mat2): - ctx.save_for_backward(self, mat2) - result = torch.matmul(self, mat2.transpose(-2,-1)) - return result.detach() - @staticmethod - def backward(ctx, grad): - self, mat2 = ctx.saved_tensors - self_grad = torch.npu_bmmV2(grad, mat2, []) - mat2_grad = torch.npu_bmmV2(grad.transpose(-2, -1), self, []) - return self_grad, mat2_grad - -def Matmul_transpose(tensor1, tensor2): - return MatmulApply.apply(tensor1, tensor2) - -@with_incremental_state -class MultiheadAttention(nn.Module): - """Multi-headed attention. - - See "Attention Is All You Need" for more details. 
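For reference, the computation this class implements, before the NPU-specific layout and fusion tricks that follow, is standard multi-head scaled dot-product attention. A minimal sketch with purely illustrative sizes, not code from this file:

import torch
import torch.nn.functional as F

bsz, num_heads, tgt_len, src_len, head_dim = 2, 4, 5, 7, 16   # illustrative sizes
embed_dim = num_heads * head_dim

q = torch.randn(bsz, num_heads, tgt_len, head_dim) * head_dim ** -0.5  # scaled queries
k = torch.randn(bsz, num_heads, src_len, head_dim)
v = torch.randn(bsz, num_heads, src_len, head_dim)

# key_padding_mask: True marks padded source positions that must not be attended to.
key_padding_mask = torch.zeros(bsz, src_len, dtype=torch.bool)

attn_weights = torch.matmul(q, k.transpose(-2, -1))                      # (bsz, heads, tgt, src)
attn_weights = attn_weights.masked_fill(key_padding_mask[:, None, None, :], float("-inf"))
attn_probs = F.softmax(attn_weights, dim=-1)
attn = torch.matmul(attn_probs, v)                                       # (bsz, heads, tgt, head_dim)
attn = attn.transpose(1, 2).reshape(bsz, tgt_len, embed_dim)             # merge heads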
- """ - - def __init__( - self, - embed_dim, - num_heads, - kdim=None, - vdim=None, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - self_attention=False, - encoder_decoder_attention=False, - q_noise=0.0, - qn_block_size=8, - ): - super().__init__() - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim - - self.num_heads = num_heads - self.dropout_module = dropout_class( - dropout, module_name=self.__class__.__name__ - ) - self.dropout_prob = dropout - - self.use_dropout_optim = (dropout_class is NpuFairseqDropout) - - self.head_dim = embed_dim // num_heads - assert ( - self.head_dim * num_heads == self.embed_dim - ), "embed_dim must be divisible by num_heads" - self.scaling = self.head_dim ** -0.5 - - self.self_attention = self_attention - self.encoder_decoder_attention = encoder_decoder_attention - - assert not self.self_attention or self.qkv_same_dim, ( - "Self-attention requires query, key and " "value to be of the same size" - ) - - self.k_proj = quant_noise( - NpuLinear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size - ) - self.v_proj = quant_noise( - NpuLinear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size - ) - self.q_proj = quant_noise( - NpuLinear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size - ) - - self.out_proj = quant_noise( - NpuLinear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size - ) - - if add_bias_kv: - self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) - self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) - else: - self.bias_k = self.bias_v = None - - self.add_zero_attn = add_zero_attn - - self.reset_parameters() - - self.onnx_trace = False - self.tpu = False - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - def prepare_for_tpu_(self, **kwargs): - self.tpu = True - - def reset_parameters(self): - if self.qkv_same_dim: - # Empirically observed the convergence to be much better with - # the scaled initialization - nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) - nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) - nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) - else: - nn.init.xavier_uniform_(self.k_proj.weight) - nn.init.xavier_uniform_(self.v_proj.weight) - nn.init.xavier_uniform_(self.q_proj.weight) - - nn.init.xavier_uniform_(self.out_proj.weight) - if self.out_proj.bias is not None: - nn.init.constant_(self.out_proj.bias, 0.0) - if self.bias_k is not None: - nn.init.xavier_normal_(self.bias_k) - if self.bias_v is not None: - nn.init.xavier_normal_(self.bias_v) - - def transpose_for_scores(self, x): - new_x_shape = (self.batch_size, self.squence_length) + (self.num_attention_heads, self.attention_head_size) - return x.npu_confusion_transpose((0, 2, 1, 3), new_x_shape, False) - - def forward( - self, - query, - key: Optional[Tensor], - value: Optional[Tensor], bsz, tgt_len, s_len, - key_padding_mask: Optional[Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - need_weights: bool = True, - static_kv: bool = False, - attn_mask: Optional[Tensor] = None, - before_softmax: bool = False, - need_head_weights: bool = False, - ) -> Tuple[Tensor, Optional[Tensor]]: - """Input shape: Time x Batch x Channel - - Args: - key_padding_mask (ByteTensor, optional): mask to exclude - keys that are pads, of shape `(batch, src_len)`, where - padding 
elements are indicated by 1s. - need_weights (bool, optional): return the attention weights, - averaged over heads (default: False). - attn_mask (ByteTensor, optional): typically used to - implement causal attention, where the mask prevents the - attention from looking forward in time (default: None). - before_softmax (bool, optional): return the raw attention - weights and values before the attention softmax. - need_head_weights (bool, optional): return the attention - weights for each head. Implies *need_weights*. Default: - return the average attention weights over all heads. - """ - if MHAConfig.use_fussion_mha: - attn = self.multi_attn(query, key, value, key_padding_mask, bsz, tgt_len) - return attn, None - else: - return self.ori_attn(query, key, value, bsz, tgt_len, key_padding_mask, incremental_state, - need_weights, static_kv, attn_mask, before_softmax, need_head_weights) - - def ori_attn(self, query, key, value, bsz, tgt_len, key_padding_mask, incremental_state, - need_weights, static_kv, attn_mask, before_softmax, need_head_weights): - if need_head_weights: - need_weights = True - - embed_dim = query.size()[-1] - assert embed_dim == self.embed_dim - assert list(query.size()) == [tgt_len * bsz, embed_dim] - - if incremental_state is not None: - saved_state = self._get_input_buffer(incremental_state) - if saved_state is not None and "prev_key" in saved_state: - # previous time steps are cached - no need to recompute - # key and value if they are static - if static_kv: - assert self.encoder_decoder_attention and not self.self_attention - key = value = None - else: - saved_state = None - - if self.self_attention: - q = self.q_proj(query) - k = self.k_proj(query) - v = self.v_proj(query) - elif self.encoder_decoder_attention: - # encoder-decoder attention - q = self.q_proj(query) - if key is None: - assert value is None - k = v = None - else: - k = self.k_proj(key) - v = self.v_proj(key) - - else: - assert key is not None and value is not None - q = self.q_proj(query) - k = self.k_proj(key) - v = self.v_proj(value) - q *= self.scaling - - if self.bias_k is not None: - assert self.bias_v is not None - k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) - v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) - if attn_mask is not None: - attn_mask = torch.cat( - [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 - ) - if key_padding_mask is not None: - key_padding_mask = torch.cat( - [ - key_padding_mask, - key_padding_mask.new_zeros(key_padding_mask.size(0), 1), - ], - dim=1, - ) - new_shape = (bsz, tgt_len) + (self.num_heads, self.head_dim) - if k is not None: - key_shape = (bsz, k.size(0) // bsz) + (self.num_heads, self.head_dim) - q = q.npu_confusion_transpose((0, 2, 1, 3), new_shape, False) - - if k is not None: - k = k.npu_confusion_transpose((0, 2, 1, 3), key_shape, False) - if v is not None: - v = v.npu_confusion_transpose((0, 2, 1, 3), key_shape, False) - - if saved_state is not None: - # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) - if "prev_key" in saved_state: - prev_key = saved_state["prev_key"] - assert prev_key is not None - if static_kv: - k = prev_key - else: - assert k is not None - k = torch.cat([prev_key, k], dim=2) - if "prev_value" in saved_state: - prev_value = saved_state["prev_value"] - assert prev_value is not None - if static_kv: - v = prev_value - else: - assert v is not None - v = torch.cat([prev_value, v], dim=2) - prev_key_padding_mask: Optional[Tensor] = None - if "prev_key_padding_mask" in saved_state: - 
prev_key_padding_mask = saved_state["prev_key_padding_mask"] - assert k is not None and v is not None - key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( - key_padding_mask=key_padding_mask, - prev_key_padding_mask=prev_key_padding_mask, - batch_size=bsz, - src_len=k.size(2), - static_kv=static_kv, - ) - - saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim) - saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim) - saved_state["prev_key_padding_mask"] = key_padding_mask - # In this branch incremental_state is never None - assert incremental_state is not None - incremental_state = self._set_input_buffer(incremental_state, saved_state) - assert k is not None - src_len = k.size(2) - - # This is part of a workaround to get around fork/join parallelism - # not supporting Optional types. - if key_padding_mask is not None and key_padding_mask.dim() == 0: - key_padding_mask = None - - if key_padding_mask is not None: - assert key_padding_mask.size(0) == bsz - assert key_padding_mask.size(3) == src_len - - if self.add_zero_attn: - assert v is not None - src_len += 1 - k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) - v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) - if attn_mask is not None: - attn_mask = torch.cat( - [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 - ) - if key_padding_mask is not None: - key_padding_mask = torch.cat( - [ - key_padding_mask, - torch.zeros(key_padding_mask.size(0), 1).type_as( - key_padding_mask - ), - ], - dim=1, - ) - - attn_weights = Matmul_transpose(q, k) - attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) - - assert list(attn_weights.size()) == [bsz, self.num_heads, tgt_len, src_len] - - if attn_mask is not None: - attn_mask = attn_mask.unsqueeze(0) - if self.onnx_trace: - attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) - attn_weights += attn_mask - - if key_padding_mask is not None: - # don't attend to padding symbols - attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) - attn_weights = attn_weights + key_padding_mask - - if before_softmax: - return attn_weights, v - - attn_weights_float = utils.softmax( - attn_weights, dim=-1, onnx_trace=self.onnx_trace - ) - attn_weights = attn_weights_float.to(attn_weights.dtype) - attn_probs = self.dropout_module(attn_weights) - - assert v is not None - attn = torch.matmul(attn_probs, v) - assert list(attn.size()) == [bsz, self.num_heads, tgt_len, self.head_dim] - if self.onnx_trace and attn.size(1) == 1: - # when ONNX tracing a single decoder step (sequence length == 1) - # the transpose is a no-op copy before view, thus unnecessary - attn = attn.contiguous().view(tgt_len, bsz, embed_dim) - else: - attn =attn.npu_confusion_transpose((0, 2, 1, 3), - (attn.size()[0]* attn.size()[2], embed_dim), - True) - attn = self.out_proj(attn) - attn_weights: Optional[Tensor] = None - if need_weights: - attn_weights = attn_weights_float.view( - bsz, self.num_heads, tgt_len, src_len - ).transpose(1, 0) - if not need_head_weights: - # average attention weights over heads - attn_weights = attn_weights.mean(dim=0) - - return attn, attn_weights - - def multi_attn(self, query, key, value, key_padding_mask, bsz, tgt_len): - src_len = key.size(0) // bsz - if self.use_dropout_optim: - dropout_mask = self.dropout_module([(bsz, self.num_heads, tgt_len, src_len), query.dtype, query.device]) - else: - dropout_mask = None - attn = torch.npu_multi_head_attention(query, key, value, 
self.q_proj.weight, - self.k_proj.weight, self.v_proj.weight, - key_padding_mask, self.out_proj.weight, - self.q_proj.bias, self.k_proj.bias, self.v_proj.bias, - self.out_proj.bias, dropout_mask, self.num_heads, - self.head_dim, src_len, tgt_len, self.dropout_prob, True) - return attn[0] - - @staticmethod - def _append_prev_key_padding_mask( - key_padding_mask: Optional[Tensor], - prev_key_padding_mask: Optional[Tensor], - batch_size: int, - src_len: int, - static_kv: bool, - ) -> Optional[Tensor]: - # saved key padding masks have shape (bsz, seq_len) - if prev_key_padding_mask is not None and static_kv: - new_key_padding_mask = prev_key_padding_mask - elif prev_key_padding_mask is not None and key_padding_mask is not None: - new_key_padding_mask = torch.cat( - [prev_key_padding_mask.half(), key_padding_mask.half()], dim=3 - ) - # During incremental decoding, as the padding token enters and - # leaves the frame, there will be a time when prev or current - # is None - elif prev_key_padding_mask is not None: - filler = torch.zeros( - (batch_size, key_padding_mask.size(1),key_padding_mask.size(2), - src_len - prev_key_padding_mask.size(3)), - device=prev_key_padding_mask.device, - ) - new_key_padding_mask = torch.cat( - [prev_key_padding_mask.half(), filler.half()], dim=3 - ) - elif key_padding_mask is not None: - filler = torch.zeros( - (batch_size, key_padding_mask.size(1),key_padding_mask.size(2) - , src_len - key_padding_mask.size(3)), - device=key_padding_mask.device, - ) - new_key_padding_mask = torch.cat( - [filler.half(), key_padding_mask.half()], dim=3 - ) - else: - new_key_padding_mask = prev_key_padding_mask - return new_key_padding_mask - - @torch.jit.export - def reorder_incremental_state( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - new_order: Tensor, - ): - """Reorder buffered internal state (for incremental generation).""" - input_buffer = self._get_input_buffer(incremental_state) - if input_buffer is not None: - for k in input_buffer.keys(): - input_buffer_k = input_buffer[k] - if input_buffer_k is not None: - if self.encoder_decoder_attention and input_buffer_k.size( - 0 - ) == new_order.size(0): - break - input_buffer[k] = input_buffer_k.index_select(0, new_order) - incremental_state = self._set_input_buffer(incremental_state, input_buffer) - return incremental_state - - def _get_input_buffer( - self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] - ) -> Dict[str, Optional[Tensor]]: - result = self.get_incremental_state(incremental_state, "attn_state") - if result is not None: - return result - else: - empty_result: Dict[str, Optional[Tensor]] = {} - return empty_result - - def _set_input_buffer( - self, - incremental_state: Dict[str, Dict[str, Optional[Tensor]]], - buffer: Dict[str, Optional[Tensor]], - ): - return self.set_incremental_state(incremental_state, "attn_state", buffer) - - def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): - return attn_weights - - def upgrade_state_dict_named(self, state_dict, name): - prefix = name + "." 
if name != "" else "" - items_to_add = {} - keys_to_remove = [] - for k in state_dict.keys(): - if k.endswith(prefix + "in_proj_weight"): - # in_proj_weight used to be q + k + v with same dimensions - dim = int(state_dict[k].shape[0] / 3) - items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] - items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim] - items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :] - - keys_to_remove.append(k) - - k_bias = prefix + "in_proj_bias" - if k_bias in state_dict.keys(): - dim = int(state_dict[k].shape[0] / 3) - items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim] - items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][ - dim : 2 * dim - ] - items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :] - - keys_to_remove.append(prefix + "in_proj_bias") - - for k in keys_to_remove: - del state_dict[k] - - for key, value in items_to_add.items(): - state_dict[key] = value diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/positional_embedding.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/positional_embedding.py deleted file mode 100644 index 8e94e35edb46bf9dea911fe74577d8ecbe9b5ff1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/positional_embedding.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch.nn as nn - -from .learned_positional_embedding import LearnedPositionalEmbedding -from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding - - -def PositionalEmbedding( - num_embeddings: int, - embedding_dim: int, - padding_idx: int, - learned: bool = False, -): - if learned: - # if padding_idx is specified then offset the embedding ids by - # this index and adjust num_embeddings appropriately - # TODO: The right place for this offset would be inside - # LearnedPositionalEmbedding. Move this there for a cleaner implementation. - if padding_idx is not None: - num_embeddings = num_embeddings + padding_idx + 1 - m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) - nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) - if padding_idx is not None: - nn.init.constant_(m.weight[padding_idx], 0) - else: - m = SinusoidalPositionalEmbedding( - embedding_dim, - padding_idx, - init_size=num_embeddings + padding_idx + 1, - ) - return m diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quant_noise.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quant_noise.py deleted file mode 100644 index d777dfbb6c1bf6a9b769dfdaec35d5ef084c8a8b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quant_noise.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import torch -import torch.nn as nn - - -def quant_noise(module, p, block_size): - """ - Wraps modules and applies quantization noise to the weights for - subsequent quantization with Iterative Product Quantization as - described in "Training with Quantization Noise for Extreme Model Compression" - - Args: - - module: nn.Module - - p: amount of Quantization Noise - - block_size: size of the blocks for subsequent quantization with iPQ - - Remarks: - - Module weights must have the right sizes wrt the block size - - Only Linear, Embedding and Conv2d modules are supported for the moment - - For more detail on how to quantize by blocks with convolutional weights, - see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks" - - We implement the simplest form of noise here as stated in the paper - which consists in randomly dropping blocks - """ - - # if no quantization noise, don't register hook - if p <= 0: - return module - - # supported modules - assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) - - # test whether module.weight has the right sizes wrt block_size - is_conv = module.weight.ndim == 4 - - # 2D matrix - if not is_conv: - assert ( - module.weight.size(1) % block_size == 0 - ), "Input features must be a multiple of block sizes" - - # 4D matrix - else: - # 1x1 convolutions - if module.kernel_size == (1, 1): - assert ( - module.in_channels % block_size == 0 - ), "Input channels must be a multiple of block sizes" - # regular convolutions - else: - k = module.kernel_size[0] * module.kernel_size[1] - assert k % block_size == 0, "Kernel size must be a multiple of block size" - - def _forward_pre_hook(mod, input): - # no noise for evaluation - if mod.training: - if not is_conv: - # gather weight and sizes - weight = mod.weight - in_features = weight.size(1) - out_features = weight.size(0) - - # split weight matrix into blocks and randomly drop selected blocks - mask = torch.zeros( - in_features // block_size * out_features, device=weight.device - ) - mask.bernoulli_(p) - mask = mask.repeat_interleave(block_size, -1).view(-1, in_features) - - else: - # gather weight and sizes - weight = mod.weight - in_channels = mod.in_channels - out_channels = mod.out_channels - - # split weight matrix into blocks and randomly drop selected blocks - if mod.kernel_size == (1, 1): - mask = torch.zeros( - int(in_channels // block_size * out_channels), - device=weight.device, - ) - mask.bernoulli_(p) - mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels) - else: - mask = torch.zeros( - weight.size(0), weight.size(1), device=weight.device - ) - mask.bernoulli_(p) - mask = ( - mask.unsqueeze(2) - .unsqueeze(3) - .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]) - ) - - # scale weights and apply mask - mask = mask.to( - torch.bool - ) # x.bool() is not currently supported in TorchScript - s = 1 / (1 - p) - mod.weight.data = s * weight.masked_fill(mask, 0) - - module.register_forward_pre_hook(_forward_pre_hook) - return module diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/__init__.py 
b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/__init__.py deleted file mode 100644 index 5b10b51b1b0ca21aaec96344f86a0ab9df0c22f8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .utils import SizeTracker, quantize_model_ # NOQA diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/em.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/em.py deleted file mode 100644 index 6f15c3e46bd052b1e00929e7ece9355fb03846c7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/em.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import os -import random -from collections import Counter - -import torch - - -class EM: - """ - EM algorithm used to quantize the columns of W to minimize - - ||W - W_hat||^2 - - Args: - - W: weight matrix of size (in_features x out_features) - - n_iter: number of k-means iterations - - n_centroids: number of centroids (size of codebook) - - eps: for cluster reassignment when an empty cluster is found - - max_tentatives for cluster reassignment when an empty cluster is found - - verbose: print error after each iteration - - Remarks: - - If one cluster is empty, the most populated cluster is split into - two clusters - - All the relevant dimensions are specified in the code - """ - - def __init__( - self, W, n_centroids=256, n_iter=20, eps=1e-6, max_tentatives=30, verbose=True - ): - self.W = W - self.n_centroids = n_centroids - self.n_iter = n_iter - self.eps = eps - self.max_tentatives = max_tentatives - self.verbose = verbose - self.centroids = torch.Tensor() - self.assignments = torch.Tensor() - self.objective = [] - - def initialize_centroids(self): - """ - Initializes the centroids by sampling random columns from W. - """ - - in_features, out_features = self.W.size() - indices = torch.randint( - low=0, high=out_features, size=(self.n_centroids,) - ).long() - self.centroids = self.W[:, indices].t() # (n_centroids x in_features) - - def step(self, i): - """ - There are two standard steps for each iteration: expectation (E) and - minimization (M). The E-step (assignment) is performed with an exhaustive - search and the M-step (centroid computation) is performed with - the exact solution. 
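A compact sketch of the two steps described above, run on a random weight matrix with illustrative sizes (assumptions, not values taken from this class):

import torch

in_features, out_features, n_centroids = 8, 100, 4          # illustrative sizes
W = torch.randn(in_features, out_features)

# Initialize centroids from random columns of W, as in initialize_centroids().
idx = torch.randint(0, out_features, (n_centroids,))
centroids = W[:, idx].t().contiguous()                       # (n_centroids, in_features)

for _ in range(10):
    # E-step: assign every column of W to its nearest centroid (exhaustive search).
    distances = (W[None, :, :] - centroids[:, :, None]).norm(p=2, dim=1)   # (n_centroids, out_features)
    assignments = distances.argmin(dim=0)

    # M-step: exact update, each centroid becomes the mean of its assigned columns.
    # (Empty clusters are simply skipped here; the class above instead splits the
    # most populated cluster.)
    for c in range(n_centroids):
        members = W[:, assignments == c]
        if members.numel() > 0:
            centroids[c] = members.mean(dim=1)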
- - Args: - - i: step number - - Remarks: - - The E-step heavily uses PyTorch broadcasting to speed up computations - and reduce the memory overhead - """ - - # assignments (E-step) - distances = self.compute_distances() # (n_centroids x out_features) - self.assignments = torch.argmin(distances, dim=0) # (out_features) - n_empty_clusters = self.resolve_empty_clusters() - - # centroids (M-step) - for k in range(self.n_centroids): - W_k = self.W[:, self.assignments == k] # (in_features x size_of_cluster_k) - self.centroids[k] = W_k.mean(dim=1) # (in_features) - - # book-keeping - obj = (self.centroids[self.assignments].t() - self.W).norm(p=2).item() - self.objective.append(obj) - if self.verbose: - logging.info( - f"Iteration: {i},\t" - f"objective: {obj:.6f},\t" - f"resolved empty clusters: {n_empty_clusters}" - ) - - def resolve_empty_clusters(self): - """ - If one cluster is empty, the most populated cluster is split into - two clusters by shifting the respective centroids. This is done - iteratively for a fixed number of tentatives. - """ - - # empty clusters - counts = Counter(map(lambda x: x.item(), self.assignments)) - empty_clusters = set(range(self.n_centroids)) - set(counts.keys()) - n_empty_clusters = len(empty_clusters) - - tentatives = 0 - while len(empty_clusters) > 0: - # given an empty cluster, find most populated cluster and split it into two - k = random.choice(list(empty_clusters)) - m = counts.most_common(1)[0][0] - e = torch.randn_like(self.centroids[m]) * self.eps - self.centroids[k] = self.centroids[m].clone() - self.centroids[k] += e - self.centroids[m] -= e - - # recompute assignments - distances = self.compute_distances() # (n_centroids x out_features) - self.assignments = torch.argmin(distances, dim=0) # (out_features) - - # check for empty clusters - counts = Counter(map(lambda x: x.item(), self.assignments)) - empty_clusters = set(range(self.n_centroids)) - set(counts.keys()) - - # increment tentatives - if tentatives == self.max_tentatives: - logging.info( - f"Could not resolve all empty clusters, {len(empty_clusters)} remaining" - ) - raise EmptyClusterResolveError - tentatives += 1 - - return n_empty_clusters - - def compute_distances(self): - """ - For every centroid m, computes - - ||M - m[None, :]||_2 - - Remarks: - - We rely on PyTorch's broadcasting to speed up computations - and reduce the memory overhead - - Without chunking, the sizes in the broadcasting are modified as: - (n_centroids x n_samples x out_features) -> (n_centroids x out_features) - - The broadcasting computation is automatically chunked so that - the tensors fit into the memory of the GPU - """ - - nb_centroids_chunks = 1 - - while True: - try: - return torch.cat( - [ - (self.W[None, :, :] - centroids_c[:, :, None]).norm(p=2, dim=1) - for centroids_c in self.centroids.chunk( - nb_centroids_chunks, dim=0 - ) - ], - dim=0, - ) - except RuntimeError: - nb_centroids_chunks *= 2 - - def assign(self): - """ - Assigns each column of W to its closest centroid, thus essentially - performing the E-step in train(). - - Remarks: - - The function must be called after train() or after loading - centroids using self.load(), otherwise it will return empty tensors - """ - - distances = self.compute_distances() # (n_centroids x out_features) - self.assignments = torch.argmin(distances, dim=0) # (out_features) - - def save(self, path, layer): - """ - Saves centroids and assignments. 
- - Args: - - path: folder used to save centroids and assignments - """ - - torch.save(self.centroids, os.path.join(path, "{}_centroids.pth".format(layer))) - torch.save( - self.assignments, os.path.join(path, "{}_assignments.pth".format(layer)) - ) - torch.save(self.objective, os.path.join(path, "{}_objective.pth".format(layer))) - - def load(self, path, layer): - """ - Loads centroids and assignments from a given path - - Args: - - path: folder use to load centroids and assignments - """ - - self.centroids = torch.load( - os.path.join(path, "{}_centroids.pth".format(layer)) - ) - self.assignments = torch.load( - os.path.join(path, "{}_assignments.pth".format(layer)) - ) - self.objective = torch.load( - os.path.join(path, "{}_objective.pth".format(layer)) - ) - - -class EmptyClusterResolveError(Exception): - pass diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/__init__.py deleted file mode 100644 index b67c8e8ad691aa01e9e10e904d69d94595387668..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .qconv import PQConv2d # NOQA -from .qemb import PQEmbedding # NOQA -from .qlinear import PQLinear # NOQA diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qconv.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qconv.py deleted file mode 100644 index d15ec192e8cda6265a198e583a9bf7fb194dd129..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qconv.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.nn.modules.utils import _pair - - -class PQConv2d(nn.Module): - """ - Quantized counterpart of nn.Conv2d module. Stores the centroid, the assignments - and the non-quantized biases. The full weight is re-instantiated at each forward - pass and autograd automatically computes the gradients with respect to the - centroids. - - Args: - - centroids: centroids of size n_centroids x block_size - - assignments: assignments of the centroids to the subvectors - of size self.out_channels x n_blocks - - bias: the non-quantized bias, must be either torch.Tensor or None - - Remarks: - - We refer the reader to the official documentation of the nn.Conv2d module - for the other arguments and the behavior of the module. - - Performance tests on GPU show that this implementation is 10% slower than - the non-quantized nn.Conv2d module for a standard training loop. - - During the backward, the gradients are averaged by cluster and not summed. - This explains the hook registered to the centroids. 
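The remark about averaging rather than summing gradients can be illustrated with a small standalone tensor hook; sizes and names below are assumptions, not this module's code:

import torch
import torch.nn as nn

n_centroids, block_size, n_blocks = 4, 3, 10                  # illustrative sizes
centroids = nn.Parameter(torch.randn(n_centroids, block_size))
assignments = torch.randint(0, n_centroids, (n_blocks,))
# clamp(min=1) only keeps this toy example finite for unused centroids
counts = torch.bincount(assignments, minlength=n_centroids).float().clamp(min=1)

# Autograd sums the gradient over every block that uses a centroid; dividing by
# the per-centroid usage count turns that sum into an average.
centroids.register_hook(lambda grad: grad / counts[:, None])

full_weight = centroids[assignments]     # re-instantiated weight, (n_blocks, block_size)
full_weight.sum().backward()
print(centroids.grad)                    # rows of used centroids are all ones, not their usage counts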
- """ - - def __init__( - self, - centroids, - assignments, - bias, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode="zeros", - ): - super(PQConv2d, self).__init__() - self.block_size = centroids.size(1) - self.n_centroids = centroids.size(0) - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = _pair(kernel_size) - self.stride = _pair(stride) - self.padding = _pair(padding) - self.dilation = _pair(dilation) - self.groups = groups - self.padding_mode = padding_mode - # check compatibility - if in_channels // groups * np.prod(self.kernel_size) % self.block_size != 0: - raise ValueError("Wrong PQ sizes") - if len(assignments) % out_channels != 0: - raise ValueError("Wrong PQ sizes") - if in_channels % groups != 0: - raise ValueError("in_channels must be divisible by groups") - if out_channels % groups != 0: - raise ValueError("out_channels must be divisible by groups") - # define parameters - self.centroids = nn.Parameter(centroids, requires_grad=True) - self.register_buffer("assignments", assignments) - self.register_buffer("counts", torch.bincount(assignments).type_as(centroids)) - if bias is not None: - self.bias = nn.Parameter(bias) - else: - self.register_parameter("bias", None) - # register hook for averaging gradients per centroids instead of summing - self.centroids.register_hook(lambda x: x / self.counts[:, None]) - - @property - def weight(self): - return ( - self.centroids[self.assignments] - .reshape(-1, self.out_channels, self.block_size) - .permute(1, 0, 2) - .reshape( - self.out_channels, self.in_channels // self.groups, *self.kernel_size - ) - ) - - def forward(self, x): - return F.conv2d( - x, - self.weight, - self.bias, - self.stride, - self.padding, - self.dilation, - self.groups, - ) - - def extra_repr(self): - s = "{in_channels}, {out_channels}, kernel_size={kernel_size}, stride={stride}" - if self.padding != (0,) * len(self.padding): - s += ", padding={padding}" - if self.dilation != (1,) * len(self.dilation): - s += ", dilation={dilation}" - if self.groups != 1: - s += ", groups={groups}" - if self.bias is None: - s += ", bias=False" - if self.padding_mode != "zeros": - s += ", padding_mode={padding_mode}" - s += ", n_centroids={n_centroids}, block_size={block_size}" - return s.format(**self.__dict__) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qemb.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qemb.py deleted file mode 100644 index 3a74ad3c4c7c9d3203d26e7885864ba578951bfe..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qemb.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class PQEmbedding(nn.Module): - """ - Quantized counterpart of nn.Embedding module. Stores the centroids and - the assignments. The full weight is re-instantiated at each forward - pass. 
- - Args: - - centroids: centroids of size n_centroids x block_size - - assignments: assignments of the centroids to the subvectors - of size self.out_features x n_blocks - - bias: the non-quantized bias - - Remarks: - - We refer the reader to the official documentation of the nn.Embedding module - for the other arguments and the behavior of the module - - Performance tests on GPU show that this implementation is 10% slower than - the non-quantized nn.Embedding module for a standard training loop. - """ - - def __init__( - self, - centroids, - assignments, - num_embeddings, - embedding_dim, - padding_idx=None, - max_norm=None, - norm_type=2.0, - scale_grad_by_freq=False, - sparse=False, - _weight=None, - ): - super(PQEmbedding, self).__init__() - self.block_size = centroids.size(1) - self.n_centroids = centroids.size(0) - self.num_embeddings = num_embeddings - self.embedding_dim = embedding_dim - if padding_idx is not None: - if padding_idx > 0: - assert ( - padding_idx < self.num_embeddings - ), "Padding_idx must be within num_embeddings" - elif padding_idx < 0: - assert ( - padding_idx >= -self.num_embeddings - ), "Padding_idx must be within num_embeddings" - padding_idx = self.num_embeddings + padding_idx - self.padding_idx = padding_idx - self.max_norm = max_norm - self.norm_type = norm_type - self.scale_grad_by_freq = scale_grad_by_freq - self.sparse = sparse - # check compatibility - if self.embedding_dim % self.block_size != 0: - raise ValueError("Wrong PQ sizes") - if len(assignments) % self.num_embeddings != 0: - raise ValueError("Wrong PQ sizes") - # define parameters - self.centroids = nn.Parameter(centroids, requires_grad=True) - self.register_buffer("assignments", assignments) - self.register_buffer("counts", torch.bincount(assignments).type_as(centroids)) - - @property - def weight(self): - return ( - self.centroids[self.assignments] - .reshape(-1, self.num_embeddings, self.block_size) - .permute(1, 0, 2) - .flatten(1, 2) - ) - - def forward(self, input): - return F.embedding( - input, - self.weight, - self.padding_idx, - self.max_norm, - self.norm_type, - self.scale_grad_by_freq, - self.sparse, - ) - - def extra_repr(self): - s = "{num_embeddings}, {embedding_dim}" - if self.padding_idx is not None: - s += ", padding_idx={padding_idx}" - if self.max_norm is not None: - s += ", max_norm={max_norm}" - if self.norm_type != 2: - s += ", norm_type={norm_type}" - if self.scale_grad_by_freq is not False: - s += ", scale_grad_by_freq={scale_grad_by_freq}" - if self.sparse is not False: - s += ", sparse=True" - s += ", n_centroids={n_centroids}, block_size={block_size}" - - return s.format(**self.__dict__) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qlinear.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qlinear.py deleted file mode 100644 index 9bdd25a8685bb7c7b32e1f02372aaeb26d8ba53a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/modules/qlinear.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class PQLinear(nn.Module): - """ - Quantized counterpart of nn.Linear module. 
Stores the centroid, the assignments - and the non-quantized biases. The full weight is re-instantiated at each forward - pass. - - Args: - - centroids: centroids of size n_centroids x block_size - - assignments: assignments of the centroids to the subvectors - of size self.out_features x n_blocks - - bias: the non-quantized bias - - Remarks: - - We refer the reader to the official documentation of the nn.Linear module - for the other arguments and the behavior of the module - - Performance tests on GPU show that this implementation is 15% slower than - the non-quantized nn.Linear module for a standard training loop. - """ - - def __init__(self, centroids, assignments, bias, in_features, out_features): - super(PQLinear, self).__init__() - self.block_size = centroids.size(1) - self.n_centroids = centroids.size(0) - self.in_features = in_features - self.out_features = out_features - # check compatibility - if self.in_features % self.block_size != 0: - raise ValueError("Wrong PQ sizes") - if len(assignments) % self.out_features != 0: - raise ValueError("Wrong PQ sizes") - # define parameters - self.centroids = nn.Parameter(centroids, requires_grad=True) - self.register_buffer("assignments", assignments) - self.register_buffer("counts", torch.bincount(assignments).type_as(centroids)) - if bias is not None: - self.bias = nn.Parameter(bias) - else: - self.register_parameter("bias", None) - - @property - def weight(self): - return ( - self.centroids[self.assignments] - .reshape(-1, self.out_features, self.block_size) - .permute(1, 0, 2) - .flatten(1, 2) - ) - - def forward(self, x): - return F.linear( - x, - self.weight, - self.bias, - ) - - def extra_repr(self): - return f"in_features={self.in_features},\ - out_features={self.out_features},\ - n_centroids={self.n_centroids},\ - block_size={self.block_size},\ - bias={self.bias is not None}" diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/pq.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/pq.py deleted file mode 100644 index eddc2eb34602403f10979f54cd23a45bc2f104d5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/pq.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .em import EM, EmptyClusterResolveError - - -class PQ(EM): - """ - Quantizes the layer weights W with the standard Product Quantization - technique. This learns a codebook of codewords or centroids of size - block_size from W. For further reference on using PQ to quantize - neural networks, see "And the Bit Goes Down: Revisiting the Quantization - of Neural Networks", Stock et al., ICLR 2020. - - PQ is performed in two steps: - (1) The matrix W (weights or fully-connected or convolutional layer) - is reshaped to (block_size, -1). - - If W is fully-connected (2D), its columns are split into - blocks of size block_size. - - If W is convolutional (4D), its filters are split along the - spatial dimension. - (2) We apply the standard EM/k-means algorithm to the resulting reshaped matrix. 
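(Aside: a toy check of step (1) for the fully-connected case, with hypothetical sizes, mirroring the reshape that _reshape below performs. A 4 x 6 Linear weight with block_size 3 yields 4 * (6 / 3) = 8 subvectors of length 3, i.e. a (3, 8) matrix whose columns are then clustered by k-means in step (2).)

import torch

W = torch.randn(4, 6)                                        # hypothetical Linear weight
blocks = W.reshape(4, -1, 3).permute(2, 1, 0).flatten(1, 2)  # same ops as _reshape below
print(blocks.shape)                                          # torch.Size([3, 8])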
- - Args: - - W: weight matrix to quantize of size (in_features x out_features) - - block_size: size of the blocks (subvectors) - - n_centroids: number of centroids - - n_iter: number of k-means iterations - - eps: for cluster reassignment when an empty cluster is found - - max_tentatives for cluster reassignment when an empty cluster is found - - verbose: print information after each iteration - - Remarks: - - block_size be compatible with the shape of W - """ - - def __init__( - self, - W, - block_size, - n_centroids=256, - n_iter=20, - eps=1e-6, - max_tentatives=30, - verbose=True, - ): - self.block_size = block_size - W_reshaped = self._reshape(W) - super(PQ, self).__init__( - W_reshaped, - n_centroids=n_centroids, - n_iter=n_iter, - eps=eps, - max_tentatives=max_tentatives, - verbose=verbose, - ) - - def _reshape(self, W): - """ - Reshapes the matrix W as expained in step (1). - """ - - # fully connected: by convention the weight has size out_features x in_features - if len(W.size()) == 2: - self.out_features, self.in_features = W.size() - assert ( - self.in_features % self.block_size == 0 - ), "Linear: n_blocks must be a multiple of in_features" - return ( - W.reshape(self.out_features, -1, self.block_size) - .permute(2, 1, 0) - .flatten(1, 2) - ) - - # convolutional: we reshape along the spatial dimension - elif len(W.size()) == 4: - self.out_channels, self.in_channels, self.k_h, self.k_w = W.size() - assert ( - self.in_channels * self.k_h * self.k_w - ) % self.block_size == 0, ( - "Conv2d: n_blocks must be a multiple of in_channels * k_h * k_w" - ) - return ( - W.reshape(self.out_channels, -1, self.block_size) - .permute(2, 1, 0) - .flatten(1, 2) - ) - # not implemented - else: - raise NotImplementedError(W.size()) - - def encode(self): - """ - Performs self.n_iter EM steps. - """ - - self.initialize_centroids() - for i in range(self.n_iter): - try: - self.step(i) - except EmptyClusterResolveError: - break - - def decode(self): - """ - Returns the encoded full weight matrix. Must be called after - the encode function. - """ - - # fully connected case - if "k_h" not in self.__dict__: - return ( - self.centroids[self.assignments] - .reshape(-1, self.out_features, self.block_size) - .permute(1, 0, 2) - .flatten(1, 2) - ) - - # convolutional case - else: - return ( - self.centroids[self.assignments] - .reshape(-1, self.out_channels, self.block_size) - .permute(1, 0, 2) - .reshape(self.out_channels, self.in_channels, self.k_h, self.k_w) - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/utils.py deleted file mode 100644 index 03b15e4b1b58c9a1e6d42052b3bd5457df9a6e2e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/pq/utils.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
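(Aside: before the helper functions below, a minimal usage sketch of the PQ class defined above. Sizes are hypothetical, the import path simply mirrors this file tree, and the EM loop is assumed to run on plain CPU tensors. With the default 256 centroids and int8-indexable assignments, the stored index costs log2(256) / block_size bits per weight plus a small fp16 codebook, which is what SizeTracker below accounts for.)

import torch
from fairseq.modules.quantization.pq.pq import PQ   # path as laid out in this tree

W = torch.randn(16, 8)                               # hypothetical Linear weight (out x in)
quantizer = PQ(W, block_size=4, n_centroids=4, n_iter=5, verbose=False)
quantizer.encode()                                   # EM / k-means on the reshaped blocks
W_hat = quantizer.decode()                           # reconstructed weight, same shape as W
print((W - W_hat).norm() / W.norm())                 # relative reconstruction error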
- -import logging -import re -from operator import attrgetter, itemgetter - -import numpy as np -import torch.distributed as dist -import torch.nn as nn - -from .modules import PQConv2d, PQEmbedding, PQLinear -from .pq import PQ - - -def quantize_model_( - model, - size_tracker, - layers_to_quantize, - block_sizes_config, - n_centroids_config, - step=0, - n_iter=15, - eps=1e-6, - max_tentatives=100, - verbose=True, -): - """ - Quantize a model in-place by stages. All the targeted - layers are replaced by their quantized counterpart, - and the model is ready for the finetuning of the - centroids in a standard training loop (no modifications - required). Note that we do not quantize biases. - - Args: - - model: a nn.Module - - size_tracker: useful for tracking quatization statistics - - layers_to_quantize: a list containing regexps for - filtering the layers to quantize at each stage according - to their name (as in model.named_parameters()) - - block_sizes_config: dict like - { - 'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}), - 'Linear': ('in_features', {'*': 8}) - } - For instance, all conv2d layers with kernel size 3x3 have - a block size of 9 and all Linear layers are quantized with - a block size of 8, irrespective of their size. - - n_centroids_config: dict like - { - 'Conv2d': ('kernel_size', {'*': 256}), - 'Linear': ('in_features', {'*': 256}) - } - For instance, all conv2d layers are quantized with 256 centroids - - step: the layers to quantize inplace corresponding - to layers_to_quantize[step] - """ - - quantized_layers = get_layers(model, layers_to_quantize[step]) - - for layer in quantized_layers: - - # book-keeping - is_master_process = (not dist.is_initialized()) or ( - dist.is_initialized() and dist.get_rank() == 0 - ) - verbose = verbose and is_master_process - - # get block size and centroids - module = attrgetter(layer)(model) - block_size = get_param(module, layer, block_sizes_config) - n_centroids = get_param(module, layer, n_centroids_config) - if verbose: - logging.info( - f"Quantizing layer {layer} with block size {block_size} and {n_centroids} centroids" - ) - - # quantize layer - weight = module.weight.data.clone() - is_bias = "bias" in [x[0] for x in module.named_parameters()] - bias = module.bias.data.clone() if is_bias else None - quantizer = PQ( - weight, - block_size, - n_centroids=n_centroids, - n_iter=n_iter, - eps=eps, - max_tentatives=max_tentatives, - verbose=verbose, - ) - - # quantization performed on all GPUs with same seed - quantizer.encode() - centroids = quantizer.centroids.contiguous() - assignments = quantizer.assignments.contiguous() - - # broadcast results to make sure weights are up-to-date - if dist.is_initialized(): - dist.broadcast(centroids, 0) - dist.broadcast(assignments, 0) - - # instantiate the quantized counterpart - if isinstance(module, nn.Linear): - out_features, in_features = map( - lambda k: module.__dict__[k], ["out_features", "in_features"] - ) - quantized_module = PQLinear( - centroids, assignments, bias, in_features, out_features - ) - elif isinstance(module, nn.Embedding): - num_embeddings, embedding_dim = map( - lambda k: module.__dict__[k], ["num_embeddings", "embedding_dim"] - ) - quantized_module = PQEmbedding( - centroids, assignments, num_embeddings, embedding_dim - ) - elif isinstance(module, nn.Conv2d): - out_channels, in_channels, kernel_size = map( - lambda k: module.__dict__[k], - ["out_channels", "in_channels", "kernel_size"], - ) - stride, padding, dilation, groups, padding_mode = map( - lambda k: 
module.__dict__[k], - ["stride", "padding", "dilation", "groups", "padding_mode"], - ) - - quantized_module = PQConv2d( - centroids, - assignments, - bias, - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - padding_mode=padding_mode, - ) - else: - raise ValueError(f"Module {module} not yet supported for quantization") - - # replace layer by its quantized counterpart - attrsetter(layer)(model, quantized_module) - - # update statistics - size_tracker.update(weight, block_size, n_centroids) - - # return name of quantized layers - return quantized_layers - - -def get_layers(model, filter_regexp): - """ - Filters out the layers according to a regexp. Note that - we omit biases. - - Args: - - model: a nn.Module - - filter_regexp: a regexp to filter the layers to keep - according to their name in model.named_parameters(). - For instance, the regexp: - - down_layers\\.[123456]\\.(conv[12]|identity\\.conv)) - - is keeping blocks down_layers from 1 to 6, and inside - each block is keeping conv1, conv2 and identity.conv. - - Remarks: - - We add (module\\.)? at the beginning of the regexp to - account for the possible use of nn.parallel.DataParallel - """ - - # get all parameter names - all_layers = map(itemgetter(0), model.named_parameters()) - - # remove biases - all_layers = filter(lambda x: "bias" not in x, all_layers) - - # remove .weight in all other names (or .weight_orig is spectral norm) - all_layers = map(lambda x: x.replace(".weight_orig", ""), all_layers) - all_layers = map(lambda x: x.replace(".weight", ""), all_layers) - - # return filtered layers - filter_regexp = "(module\\.)?" + "(" + filter_regexp + ")" - r = re.compile(filter_regexp) - - return list(filter(r.match, all_layers)) - - -def get_param(module, layer_name, param_config): - """ - Given a quantization configuration, get the right parameter - for the module to be quantized. - - Args: - - module: a nn.Module - - layer_name: the name of the layer - - param_config: a dict like - { - 'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}), - 'Linear': ('in_features', {'*': 8}) - } - For instance, all conv2d layers with kernel size 3x3 have - a block size of 9 and all Linear layers are quantized with - a block size of 8, irrespective of their size. - - Remarks: - - if 'fuzzy_name' is passed as a parameter, layers whose layer_name - include 'fuzzy_name' will be assigned the given parameter. - In the following example, conv.expand layers will have a block - size of 9 while conv.reduce will have a block size of 4 and all - other layers will have a block size of 2. 
- { - 'Conv2d': ('fuzzy_name', {'expand': 9, 'reduce': 4, '*': 2}), - 'Linear': ('fuzzy_name', {'classifier': 8, 'projection': 4}) - } - - """ - - layer_type = module.__class__.__name__ - - if layer_type not in param_config: - raise KeyError(f"Layer type {layer_type} not in config for layer {module}") - - feature, params = param_config[module.__class__.__name__] - - if feature != "fuzzy_name": - feature_value = str(getattr(module, feature)) - if feature_value not in params: - if "*" in params: - feature_value = "*" - else: - raise KeyError( - f"{feature}={feature_value} not in config for layer {module}" - ) - else: - feature_values = [name for name in params if name in layer_name] - if len(feature_values) == 0: - if "*" in params: - feature_value = "*" - else: - raise KeyError(f"name={layer_name} not in config for {module}") - else: - feature_value = feature_values[0] - - return params[feature_value] - - -class SizeTracker(object): - """ - Class to keep track of the compressed network size with iPQ. - - Args: - - model: a nn.Module - - Remarks: - - The compressed size is the sum of three components - for each layer in the network: - (1) Storing the centroids given by iPQ in fp16 - (2) Storing the assignments of the blocks in int8 - (3) Storing all non-compressed elements such as biases - - This cost in only valid if we use 256 centroids (then - indexing can indeed by done with int8). - """ - - def __init__(self, model): - self.model = model - self.size_non_compressed_model = self.compute_size() - self.size_non_quantized = self.size_non_compressed_model - self.size_index = 0 - self.size_centroids = 0 - self.n_quantized_layers = 0 - - def compute_size(self): - """ - Computes the size of the model (in MB). - """ - - res = 0 - for _, p in self.model.named_parameters(): - res += p.numel() - return res * 4 / 1024 / 1024 - - def update(self, W, block_size, n_centroids): - """ - Updates the running statistics when quantizing a new layer. - """ - - # bits per weights - bits_per_weight = np.log2(n_centroids) / block_size - self.n_quantized_layers += 1 - - # size of indexing the subvectors of size block_size (in MB) - size_index_layer = bits_per_weight * W.numel() / 8 / 1024 / 1024 - self.size_index += size_index_layer - - # size of the centroids stored in float16 (in MB) - size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024 - self.size_centroids += size_centroids_layer - - # size of non-compressed layers, e.g. LayerNorms or biases (in MB) - size_uncompressed_layer = W.numel() * 4 / 1024 / 1024 - self.size_non_quantized -= size_uncompressed_layer - - def __repr__(self): - size_compressed = ( - self.size_index + self.size_centroids + self.size_non_quantized - ) - compression_ratio = self.size_non_compressed_model / size_compressed # NOQA - return ( - f"Non-compressed model size: {self.size_non_compressed_model:.2f} MB. 
" - f"After quantizing {self.n_quantized_layers} layers, size " - f"(indexing + centroids + other): {self.size_index:.2f} MB + " - f"{self.size_centroids:.2f} MB + {self.size_non_quantized:.2f} MB = " - f"{size_compressed:.2f} MB, compression ratio: {compression_ratio:.2f}x" - ) - - -def attrsetter(*items): - def resolve_attr(obj, attr): - attrs = attr.split(".") - head = attrs[:-1] - tail = attrs[-1] - - for name in head: - obj = getattr(obj, name) - return obj, tail - - def g(obj, val): - for attr in items: - resolved_obj, resolved_attr = resolve_attr(obj, attr) - setattr(resolved_obj, resolved_attr, val) - - return g diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/quantization_options.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/quantization_options.py deleted file mode 100644 index b46d682c0edaeaaf2a230e51d50da2a32d4bda98..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/quantization_options.py +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -def parse_config_yaml(yaml_data): - # Initialize to default options. - quantization_options = { - "n_centroids": { - "Linear": ["in_features", {"*": 256}], - "Embedding": ["embedding_dim", {"*": 256}], - }, - "block_sizes": { - "Linear": ["fuzzy_name", {"fc": 8, "attn": 4, "emb": 4}], - "Embedding": ["fuzzy_name", {"emb": 8}], - }, - "layers_to_quantize": [ - "decoder\\.layers\\.\\d+\\.fc[12]", - "decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01]", - "decoder\\.layers\\.\\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj)", - ], - } - - if "n_centroids" in yaml_data: - quantization_options["n_centroids"] = { - layer: convert_yaml_to_tuple(layer_data) - for layer, layer_data in yaml_data["n_centroids"].items() - } - if "block_sizes" in yaml_data: - quantization_options["block_sizes"] = { - layer: convert_yaml_to_tuple(layer_data) - for layer, layer_data in yaml_data["block_sizes"].items() - } - if "layers_to_quantize" in yaml_data: - quantization_options["layers_to_quantize"] = yaml_data["layers_to_quantize"] - - return quantization_options - - -def convert_yaml_to_tuple(yaml_dictionary): - """Converts a yaml dictionary with two keys: `key` and `value` into a two - argument tuple of those values.""" - return (yaml_dictionary["key"], yaml_dictionary["value"]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/__init__.py deleted file mode 100644 index 143834f3d036780eb6844c82f0c6f2d10cfe2f61..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from .utils import quantize_model_ # NOQA diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/__init__.py deleted file mode 100644 index 8031d9cdb23f2bc72596f8bc9cfa4965f96e3e6c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from .qact import ActivationQuantizer # NOQA -from .qconv import IntConv2d # NOQA -from .qemb import IntEmbedding # NOQA -from .qlinear import IntLinear # NOQA diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qact.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qact.py deleted file mode 100644 index c5dd1d63362423ab0cfc381dddabb547a3b44c72..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qact.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch - -from ..ops import emulate_int - - -class ActivationQuantizer: - """ - Fake scalar quantization of the activations using a forward hook. - - Args: - - module. 
a nn.Module for which we quantize the *post-activations* - - p: proportion of activations to quantize, set by default to 1 - - update_step: to recompute quantization parameters - - bits: number of bits for quantization - - method: choose among {"tensor", "histogram", "channel"} - - clamp_threshold: to prevent gradients overflow - - Remarks: - - Parameters scale and zero_point are recomputed every update_step - forward pass to reduce the overhead - - For the list of quantization methods and number of bits, see ops.py - - To remove the hook from the module, simply call self.handle.remove() - - At test time, the activations are fully quantized - - We use the straight-through estimator so that the gradients - back-propagate nicely in the network, this is implemented with - the detach() trick - - The activations are hard-clamped in [-clamp_threshold, clamp_threshold] - to prevent overflow during the backward pass - """ - - def __init__( - self, - module, - p=1, - update_step=1000, - bits=8, - method="histogram", - clamp_threshold=5, - ): - self.module = module - self.p = p - self.update_step = update_step - self.counter = 0 - self.bits = bits - self.method = method - self.clamp_threshold = clamp_threshold - self.handle = None - self.register_hook() - - def register_hook(self): - # forward hook - def quantize_hook(module, x, y): - - # update parameters every 1000 iterations - if self.counter % self.update_step == 0: - self.scale = None - self.zero_point = None - self.counter += 1 - - # train with QuantNoise and evaluate the fully quantized network - p = self.p if self.module.training else 1 - - # quantize activations - y_q, self.scale, self.zero_point = emulate_int( - y.detach(), - bits=self.bits, - method=self.method, - scale=self.scale, - zero_point=self.zero_point, - ) - - # mask to apply noise - mask = torch.zeros_like(y) - mask.bernoulli_(1 - p) - noise = (y_q - y).masked_fill(mask.bool(), 0) - - # using straight-through estimator (STE) - clamp_low = -self.scale * self.zero_point - clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) - return torch.clamp(y, clamp_low.item(), clamp_high.item()) + noise.detach() - - # register hook - self.handle = self.module.register_forward_hook(quantize_hook) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qconv.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qconv.py deleted file mode 100644 index 83788c6f71fd41e61fd115681a22d53ce8b8362c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qconv.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn.functional as F -from torch.nn.modules.conv import _ConvNd -from torch.nn.modules.utils import _pair - -from ..ops import emulate_int - - -class IntConv2d(_ConvNd): - """ - Quantized counterpart of the nn.Conv2d module that applies QuantNoise during training. 
- - Args: - - standard nn.Conv2d parameters - - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights) - - bits: number of bits - - method: choose among {"tensor", "histogram", "channel"} - - update_step: recompute scale and zero_point every update_steps iterations - - Remarks: - - We use the straight-thgourh estimator so that the gradients - back-propagate nicely in the network, this is implemented with - the detach() trick - - Parameters scale and zero_point are recomputed every update_step - forward pass to reduce the overhead - - At test time, the weights are fully quantized - """ - - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - padding_mode="zeros", - p=0, - bits=8, - method="histogram", - update_step=1000, - ): - kernel_size = _pair(kernel_size) - stride = _pair(stride) - padding = _pair(padding) - dilation = _pair(dilation) - super(IntConv2d, self).__init__( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - False, - _pair(0), - groups, - bias, - padding_mode, - ) - - # quantization parameters - self.p = p - self.bits = bits - self.method = method - self.update_step = update_step - self.counter = 0 - - def _conv_forward(self, input, weight): - if self.padding_mode != "zeros": - return F.conv2d( - F.pad(input, self._padding_repeated_twice, mode=self.padding_mode), - weight, - self.bias, - self.stride, - _pair(0), - self.dilation, - self.groups, - ) - return F.conv2d( - input, - weight, - self.bias, - self.stride, - self.padding, - self.dilation, - self.groups, - ) - - def forward(self, input): - # train with QuantNoise and evaluate the fully quantized network - p = self.p if self.training else 1 - - # update parameters every 100 iterations - if self.counter % self.update_step == 0: - self.scale = None - self.zero_point = None - self.counter += 1 - - # quantize weight - weight_quantized, self.scale, self.zero_point = emulate_int( - self.weight.detach(), - bits=self.bits, - method=self.method, - scale=self.scale, - zero_point=self.zero_point, - ) - - # mask to apply noise - mask = torch.zeros_like(self.weight) - mask.bernoulli_(1 - p) - noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0) - - # using straight-through estimator (STE) - clamp_low = -self.scale * self.zero_point - clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) - weight = ( - torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) - + noise.detach() - ) - - # return output - output = self._conv_forward(input, weight) - return output - - def extra_repr(self): - return ( - "in_channels={}, out_channels={}, kernel_size={}, stride={}, " - "padding={}, dilation={}, groups={}, bias={}, quant_noise={}, " - "bits={}, method={}".format( - self.in_channels, - self.out_channels, - self.kernel_size, - self.stride, - self.padding, - self.dilation, - self.groups, - self.bias is not None, - self.p, - self.bits, - self.method, - ) - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qemb.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qemb.py deleted file mode 100644 index d6cf06e5872cb86e5c2e726153c7a80c78db9d1e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qemb.py +++ /dev/null @@ -1,147 +0,0 
@@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..ops import emulate_int - - -class IntEmbedding(nn.Module): - """ - Quantized counterpart of the nn.Embedding module that applies QuantNoise during training. - - Args: - - num_embeddings: number of tokens - - embedding_dim: embedding dimension - - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights) - - bits: number of bits - - method: choose among {"tensor", "histogram", "channel"} - - update_step: recompute scale and zero_point every update_steps iterations - - Remarks: - - We use the straight-through estimator so that the gradients - back-propagate nicely in the network, this is implemented with - the detach() trick - - Parameters scale and zero_point are recomputed every update_step - forward pass to reduce the overhead - - At test time, the weights are fully quantized - """ - - def __init__( - self, - num_embeddings, - embedding_dim, - padding_idx=None, - max_norm=None, - norm_type=2.0, - scale_grad_by_freq=False, - sparse=False, - _weight=None, - p=0, - update_step=1000, - bits=8, - method="histogram", - ): - super(IntEmbedding, self).__init__() - self.num_embeddings = num_embeddings - self.embedding_dim = embedding_dim - if padding_idx is not None: - if padding_idx > 0: - assert ( - padding_idx < self.num_embeddings - ), "Padding_idx must be within num_embeddings" - elif padding_idx < 0: - assert ( - padding_idx >= -self.num_embeddings - ), "Padding_idx must be within num_embeddings" - padding_idx = self.num_embeddings + padding_idx - self.padding_idx = padding_idx - self.max_norm = max_norm - self.norm_type = norm_type - self.scale_grad_by_freq = scale_grad_by_freq - if _weight is None: - self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim)) - self.reset_parameters() - else: - assert list(_weight.shape) == [ - num_embeddings, - embedding_dim, - ], "Shape of weight does not match num_embeddings and embedding_dim" - self.weight = nn.Parameter(_weight) - self.sparse = sparse - - # quantization parameters - self.p = p - self.bits = bits - self.method = method - self.update_step = update_step - self.counter = 0 - - def reset_parameters(self): - nn.init.normal_(self.weight) - if self.padding_idx is not None: - with torch.no_grad(): - self.weight[self.padding_idx].fill_(0) - - def forward(self, input): - # train with QuantNoise and evaluate the fully quantized network - p = self.p if self.training else 1 - - # update parameters every 1000 iterations - if self.counter % self.update_step == 0: - self.scale = None - self.zero_point = None - self.counter += 1 - - # quantize weight - weight_quantized, self.scale, self.zero_point = emulate_int( - self.weight.detach(), - bits=self.bits, - method=self.method, - scale=self.scale, - zero_point=self.zero_point, - ) - - # mask to apply noise - mask = torch.zeros_like(self.weight) - mask.bernoulli_(1 - p) - noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0) - - # using straight-through estimator (STE) - clamp_low = -self.scale * self.zero_point - clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) - weight = ( - torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) - + noise.detach() - ) - - # return output - output = F.embedding( - input, - weight, - self.padding_idx, - self.max_norm, - self.norm_type, - 
self.scale_grad_by_freq, - self.sparse, - ) - return output - - def extra_repr(self): - s = "{num_embeddings}, {embedding_dim}" - if self.padding_idx is not None: - s += ", padding_idx={padding_idx}" - if self.max_norm is not None: - s += ", max_norm={max_norm}" - if self.norm_type != 2: - s += ", norm_type={norm_type}" - if self.scale_grad_by_freq is not False: - s += ", scale_grad_by_freq={scale_grad_by_freq}" - if self.sparse is not False: - s += ", sparse=True" - s += "quant_noise={p}, bits={bits}, method={method}" - return s.format(**self.__dict__) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qlinear.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qlinear.py deleted file mode 100644 index 9db1559386bce286301d31435851dc4ea76687a5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/modules/qlinear.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.nn as nn -import torch.nn.functional as F - -from ..ops import emulate_int - - -class IntLinear(nn.Module): - """ - Quantized counterpart of the nn.Linear module that applies QuantNoise during training. - - Args: - - in_features: input features - - out_features: output features - - bias: bias or not - - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights) - - bits: number of bits - - method: choose among {"tensor", "histogram", "channel"} - - update_step: recompute scale and zero_point every update_steps iterations - - Remarks: - - We use the straight-through estimator so that the gradients - back-propagate nicely in the network, this is implemented with - the detach() trick. 
- - Parameters scale and zero_point are recomputed every update_step - forward pass to reduce the overhead - - At test time, the weights are fully quantized - """ - - def __init__( - self, - in_features, - out_features, - bias=True, - p=0, - update_step=3000, - bits=8, - method="histogram", - ): - super(IntLinear, self).__init__() - self.in_features = int(in_features) - self.out_features = int(out_features) - self.weight = torch.nn.Parameter(torch.Tensor(out_features, in_features)) - self.chosen_bias = bias - if self.chosen_bias: - self.bias = torch.nn.Parameter(torch.Tensor(out_features)) - else: - self.register_parameter("bias", None) - self.reset_parameters() - - # quantization parameters - self.p = p - self.bits = bits - self.method = method - self.update_step = update_step - self.counter = 0 - - def reset_parameters(self): - nn.init.xavier_uniform_(self.weight) - if self.chosen_bias: - nn.init.constant_(self.bias, 0.0) - return - - def forward(self, input): - # train with QuantNoise and evaluate the fully quantized network - p = self.p if self.training else 1 - - # update parameters every 100 iterations - if self.counter % self.update_step == 0: - self.scale = None - self.zero_point = None - self.counter += 1 - - # quantize weight - weight_quantized, self.scale, self.zero_point = emulate_int( - self.weight.detach(), - bits=self.bits, - method=self.method, - scale=self.scale, - zero_point=self.zero_point, - ) - - # mask to apply noise - mask = torch.zeros_like(self.weight) - mask.bernoulli_(1 - p) - noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0) - - # using straight-through estimator (STE) - clamp_low = -self.scale * self.zero_point - clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) - weight = ( - torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) - + noise.detach() - ) - - # return output - output = F.linear(input, weight, self.bias) - return output - - def extra_repr(self): - return "in_features={}, out_features={}, bias={}, quant_noise={}, bits={}, method={}".format( - self.in_features, - self.out_features, - self.bias is not None, - self.p, - self.bits, - self.method, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/ops.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/ops.py deleted file mode 100644 index 2a855159be2795bdad45f1365e202d9abd26433b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/ops.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
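(Aside: the Int* modules above all share the same forward-pass recipe. The sketch below is a hedged, self-contained re-implementation of that QuantNoise step, not the module code itself, and it omits the [clamp_low, clamp_high] clamping of the weights: quantize a detached copy of the weights, overwrite only a random fraction p of them, and let detach() route the gradients back to the original weights (straight-through estimator).)

import torch

def quant_noise_step(weight, weight_quantized, p):
    # Apply quantization noise to a fraction p of the weights; gradients still
    # flow to `weight` because only the noise term is detached (STE).
    mask = torch.zeros_like(weight)
    mask.bernoulli_(1 - p)                              # 1 -> keep original, 0 -> quantize
    noise = (weight_quantized - weight).masked_fill(mask.bool(), 0)
    return weight + noise.detach()

# Hypothetical toy usage with a crude min/max fake-int8 quantization of the weights.
w = torch.randn(4, 4, requires_grad=True)
scale = (w.max() - w.min()) / 255
w_q = (torch.round((w - w.min()) / scale) * scale + w.min()).detach()
quant_noise_step(w, w_q, p=0.5).sum().backward()        # gradients land on w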
- -import torch - - -def emulate_int(w, bits, method, scale=None, zero_point=None): - q = globals()[f"emulate_int{bits}_{method}"] - return q(w, scale=scale, zero_point=zero_point) - - -def quantize(w, scale, zero_point): - return ( - torch.clamp(torch.round(w / scale + zero_point), 0, 255) - zero_point - ) * scale - - -def emulate_int8_histogram(w, scale=None, zero_point=None): - if scale is None: - obs = torch.quantization.observer.HistogramObserver() - _ = obs(w.float()) - scale, zero_point = obs.calculate_qparams() - scale = scale.cuda().type_as(w) - zero_point = zero_point.cuda().type_as(w) - return quantize(w, scale, zero_point), scale, zero_point - - -def emulate_int8_channel(w, scale=None, zero_point=None): - if scale is None: - obs = torch.quantization.observer.PerChannelMinMaxObserver( - ch_axis=-1, qscheme=torch.per_channel_symmetric - ) - _ = obs(w) - scale, zero_point, ch_axis = obs.get_qparams() - scale = scale.cuda().type_as(w) - zero_point = zero_point.cuda().type_as(w) - return quantize(w, scale, zero_point), scale, zero_point - - -def emulate_int8_tensor(w, scale=None, zero_point=None): - if scale is None: - obs = torch.quantization.observer.MinMaxObserver() - _ = obs(w) - scale, zero_point = obs.calculate_qparams() - scale = scale.cuda().type_as(w) - zero_point = zero_point.cuda().type_as(w) - return quantize(w, scale, zero_point), scale, zero_point diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/utils.py deleted file mode 100644 index 32cf616568160004bd97a673f2d85923974c1fae..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/quantization/scalar/utils.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -from operator import attrgetter - -import torch.distributed as dist -import torch.nn as nn - -from ..pq.utils import attrsetter, get_layers -from .modules import ActivationQuantizer, IntConv2d, IntEmbedding, IntLinear - - -MAPPING = {nn.Linear: IntLinear, nn.Embedding: IntEmbedding, nn.Conv2d: IntConv2d} - - -def quantize_model_(model, p=0.2, bits=8, update_step=3000): - """ - Replaces all modules with their scalar quantized counterpart and - registers hooks to quantize the post-ativations of those modules. 
- - Args: - - model: a nn.Module - - p: amount of noise (0 for no noise, 1 to quantize all the weights/activations) - - bits: number of bits - - update_step: update quantization parameters every update_step steps - """ - - # quantize all layers - quantized_layers = get_layers(model, "(.*?)") - - for layer in quantized_layers: - - # book-keeping - is_master_process = (not dist.is_initialized()) or ( - dist.is_initialized() and dist.get_rank() == 0 - ) - - # recover module - module = attrgetter(layer)(model) - if is_master_process: - logging.info( - f"Quantizing layer {layer} with bits={bits} and QuantNoise={p}" - ) - - # quantization params - q_params = { - "p": p, - "update_step": update_step, - "bits": bits, - "method": "histogram", - "counter": 0, - } - - # instantiate the quantized counterpart - if isinstance(module, tuple(MAPPING.keys())): - QuantizedModule = MAPPING[module.__class__] - quantized_module = QuantizedModule.__new__(QuantizedModule) - params = module.__dict__ - params.update(q_params) - quantized_module.__dict__.update(params) - - else: - if is_master_process: - logging.info(f"Module {module} not yet supported for quantization") - continue - - # activation quantization - a_q = ActivationQuantizer(quantized_module, p=0, bits=bits, method="histogram") - - # replace layer by its quantized counterpart - attrsetter(layer)(model, quantized_module) - - # return name of quantized layers - return quantized_layers diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/same_pad.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/same_pad.py deleted file mode 100644 index b46f94d6357888bde46035d8fcd57ceff5d24a88..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/same_pad.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -from torch import nn - - -class SamePad(nn.Module): - def __init__(self, kernel_size): - super().__init__() - self.remove = kernel_size % 2 == 0 - - def forward(self, x): - if self.remove: - x = x[:, :, :-1] - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/scalar_bias.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/scalar_bias.py deleted file mode 100644 index c96247c75914fabb8a2b7ff731bb82b588f72690..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/scalar_bias.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
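(Aside: pointing back to ops.py above, the fake-int8 round trip used by all of these modules reduces to a single formula. The sketch below reuses that formula but computes a hypothetical min/max-based scale and zero point instead of the observer-based calibration in emulate_int8_*.)

import torch

def fake_int8(w, scale, zero_point):
    # Same round trip as quantize() in ops.py: map to [0, 255], round, map back.
    return (torch.clamp(torch.round(w / scale + zero_point), 0, 255) - zero_point) * scale

w = torch.randn(3, 3)
scale = (w.max() - w.min()) / 255                 # hypothetical min/max calibration
zero_point = torch.round(-w.min() / scale)
w_deq = fake_int8(w, scale, zero_point)
print((w - w_deq).abs().max())                    # error on the order of `scale`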
-# - -import torch - - -class ScalarBias(torch.autograd.Function): - """ - Adds a vector of scalars, used in self-attention mechanism to allow - the model to optionally attend to this vector instead of the past - """ - - @staticmethod - def forward(ctx, input, dim, bias_init): - size = list(input.size()) - size[dim] += 1 - output = input.new(*size).fill_(bias_init) - output.narrow(dim, 1, size[dim] - 1).copy_(input) - ctx.dim = dim - return output - - @staticmethod - def backward(ctx, grad): - return grad.narrow(ctx.dim, 1, grad.size(ctx.dim) - 1), None, None - - -def scalar_bias(input, dim, bias_init=0): - return ScalarBias.apply(input, dim, bias_init) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sinusoidal_positional_embedding.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sinusoidal_positional_embedding.py deleted file mode 100644 index 857830faf7cb64950021947e2c5babcb906c48d3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sinusoidal_positional_embedding.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from typing import Any, Optional - -import torch -import torch.onnx.operators -from fairseq import utils -from torch import Tensor, nn - - -class SinusoidalPositionalEmbedding(nn.Module): - """This module produces sinusoidal positional embeddings of any length. - - Padding symbols are ignored. - """ - - def __init__(self, embedding_dim, padding_idx, init_size=1024): - super().__init__() - self.embedding_dim = embedding_dim - self.padding_idx = padding_idx - self.weights = SinusoidalPositionalEmbedding.get_embedding( - init_size, embedding_dim, padding_idx - ) - self.onnx_trace = False - self.register_buffer("_float_tensor", torch.FloatTensor(1)) - self.max_positions = int(1e5) - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - @staticmethod - def get_embedding( - num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None - ): - """Build sinusoidal embeddings. - - This matches the implementation in tensor2tensor, but differs slightly - from the description in Section 3.5 of "Attention Is All You Need". 
- """ - half_dim = embedding_dim // 2 - emb = math.log(10000) / (half_dim - 1) - emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) - emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze( - 1 - ) * emb.unsqueeze(0) - emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view( - num_embeddings, -1 - ) - if embedding_dim % 2 == 1: - # zero pad - emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) - if padding_idx is not None: - emb[padding_idx, :] = 0 - return emb - - def forward( - self, - input, - incremental_state: Optional[Any] = None, - timestep: Optional[Tensor] = None, - positions: Optional[Any] = None, - ): - """Input is expected to be of size [bsz x seqlen].""" - bspair = torch.onnx.operators.shape_as_tensor(input) - bsz, seq_len = bspair[0], bspair[1] - max_pos = self.padding_idx + 1 + seq_len - if self.weights is None or max_pos > self.weights.size(0): - # recompute/expand embeddings if needed - self.weights = SinusoidalPositionalEmbedding.get_embedding( - max_pos, self.embedding_dim, self.padding_idx - ) - self.weights = self.weights.to(self._float_tensor) - - if incremental_state is not None: - # positions is the same for every token when decoding a single step - pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len - if self.onnx_trace: - return ( - self.weights.index_select(index=self.padding_idx + pos, dim=0) - .unsqueeze(1) - .repeat(bsz, 1, 1) - ) - return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) - - positions = utils.make_positions( - input, self.padding_idx, onnx_trace=self.onnx_trace - ) - if self.onnx_trace: - flat_embeddings = self.weights.detach().index_select(0, positions.view(-1)) - embedding_shape = torch.cat( - (bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long)) - ) - embeddings = torch.onnx.operators.reshape_from_tensor_shape( - flat_embeddings, embedding_shape - ) - return embeddings - return ( - self.weights.index_select(0, positions.view(-1)) - .view(bsz, seq_len, -1) - .detach() - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_multihead_attention.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_multihead_attention.py deleted file mode 100644 index 3cbd9d6785886e319aab0601517e27df733b6f97..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_multihead_attention.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch - -from .multihead_attention import MultiheadAttention - - -class SparseMultiheadAttention(MultiheadAttention): - """Sparse Multi-Headed Attention. - - "Generating Long Sequences with Sparse Transformers". Implements - fixed factorized self attention, where l=stride and c=expressivity. - A(1) includes all words in the stride window and A(2) takes a summary of c - words from the end of each stride window. - If is_bidirectional=False, we do not include any words past the current word, - as in the paper. 
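(Aside: a generic illustration of the fixed factorized pattern just described, not the exact index bookkeeping of the methods below. With stride l and expressivity c, position i may attend within its own stride window, A(1), and to the last c summary positions of every window in range, A(2).)

def fixed_pattern_allowed(i, l, c, bidirectional=False, seq_len=None):
    # Indices j that position i may attend to under a fixed factorized pattern.
    horizon = seq_len if bidirectional else i + 1
    window = set(range((i // l) * l, min((i // l) * l + l, horizon)))   # A(1): own stride window
    summaries = {j for j in range(horizon) if j % l >= l - c}           # A(2): per-window summaries
    return sorted(window | summaries)

print(fixed_pattern_allowed(9, l=4, c=1))   # [3, 7, 8, 9] for stride 4, expressivity 1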
- """ - - def __init__( - self, - embed_dim, - num_heads, - kdim=None, - vdim=None, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - self_attention=False, - encoder_decoder_attention=False, - stride=32, - expressivity=8, - is_bidirectional=True, - ): - - super().__init__( - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - self_attention, - encoder_decoder_attention, - ) - - self.is_bidirectional = is_bidirectional - self.stride = stride - self.expressivity = expressivity - assert self.stride > 0 and self.stride >= self.expressivity - - # Used for Ai(2) calculations - beginning of [l-c, l] range - def compute_checkpoint(self, word_index): - if word_index % self.stride == 0 and word_index != 0: - checkpoint_index = word_index - self.expressivity - else: - checkpoint_index = ( - math.floor(word_index / self.stride) * self.stride - + self.stride - - self.expressivity - ) - return checkpoint_index - - # Computes Ai(2) - def compute_subset_summaries(self, absolute_max): - checkpoint_index = self.compute_checkpoint(0) - subset_two = set() - while checkpoint_index <= absolute_max - 1: - summary = set( - range( - checkpoint_index, - min(checkpoint_index + self.expressivity + 1, absolute_max), - ) - ) - subset_two = subset_two.union(summary) - checkpoint_index = self.compute_checkpoint(checkpoint_index + self.stride) - return subset_two - - # Sparse Transformer Fixed Attention Pattern: https://arxiv.org/pdf/1904.10509.pdf - def compute_fixed_attention_subset(self, word_index, tgt_len): - # +1s account for range function; [min, max) -> [min, max] - if not self.is_bidirectional: - absolute_max = word_index + 1 - else: - absolute_max = tgt_len - - # Subset 1 - whole window - rounded_index = ( - math.floor((word_index + self.stride) / self.stride) * self.stride - ) - if word_index % self.stride == 0 and word_index != 0: - subset_one = set( - range(word_index - self.stride, min(absolute_max, word_index + 1)) - ) - else: - subset_one = set( - range( - max(0, rounded_index - self.stride), - min(absolute_max, rounded_index + 1), - ) - ) - - # Subset 2 - summary per window - # If bidirectional, subset 2 is the same for every index - subset_two = set() - if not self.is_bidirectional: - subset_two = self.compute_subset_summaries(absolute_max) - - return subset_one.union(subset_two) - - # Compute sparse mask - if bidirectional, can pre-compute and store - def buffered_sparse_mask(self, tensor, tgt_len, src_len): - assert tgt_len > self.stride - sparse_mask = torch.empty((tgt_len, src_len)).float().fill_(float("-inf")) - - # If bidirectional, subset 2 is the same for every index - subset_summaries = set() - if self.is_bidirectional: - subset_summaries = self.compute_subset_summaries(tgt_len) - - for i in range(tgt_len): - fixed_attention_subset = self.compute_fixed_attention_subset(i, tgt_len) - fixed_attention_subset = fixed_attention_subset.union(subset_summaries) - included_word_indices = torch.LongTensor(list(fixed_attention_subset)) - sparse_mask[i].index_fill_(0, included_word_indices, 0) - return sparse_mask.type_as(tensor) - - def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz): - sparse_mask = self.buffered_sparse_mask(attn_weights, tgt_len, src_len) - sparse_mask = sparse_mask.unsqueeze(0).expand( - bsz * self.num_heads, tgt_len, src_len - ) - attn_weights += sparse_mask diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_transformer_sentence_encoder.py 
b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_transformer_sentence_encoder.py deleted file mode 100644 index f41ec09327fe80b50d20674e7482794ce45c531c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_transformer_sentence_encoder.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch.nn as nn -from fairseq.modules import TransformerSentenceEncoder -from fairseq.modules.sparse_transformer_sentence_encoder_layer import ( - SparseTransformerSentenceEncoderLayer, -) - - -class SparseTransformerSentenceEncoder(TransformerSentenceEncoder): - """ - Sparse implementation of the TransformerSentenceEncoder - - see SparseMultiheadAttention - """ - - def __init__( - self, - padding_idx: int, - vocab_size: int, - num_encoder_layers: int = 6, - embedding_dim: int = 768, - ffn_embedding_dim: int = 3072, - num_attention_heads: int = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - max_seq_len: int = 256, - num_segments: int = 2, - use_position_embeddings: bool = True, - offset_positions_by_padding: bool = True, - encoder_normalize_before: bool = False, - apply_bert_init: bool = False, - activation_fn: str = "relu", - learned_pos_embedding: bool = True, - embed_scale: float = None, - freeze_embeddings: bool = False, - n_trans_layers_to_freeze: int = 0, - export: bool = False, - is_bidirectional: bool = True, - stride: int = 32, - expressivity: int = 8, - ) -> None: - - super().__init__( - padding_idx, - vocab_size, - num_encoder_layers, - embedding_dim, - ffn_embedding_dim, - num_attention_heads, - dropout, - attention_dropout, - activation_dropout, - max_seq_len, - num_segments, - use_position_embeddings, - offset_positions_by_padding, - encoder_normalize_before, - apply_bert_init, - activation_fn, - learned_pos_embedding, - embed_scale, - freeze_embeddings, - n_trans_layers_to_freeze, - export, - ) - - self.layers = nn.ModuleList( - [ - SparseTransformerSentenceEncoderLayer( - embedding_dim=self.embedding_dim, - ffn_embedding_dim=ffn_embedding_dim, - num_attention_heads=num_attention_heads, - dropout=dropout, - attention_dropout=attention_dropout, - activation_dropout=activation_dropout, - activation_fn=activation_fn, - export=export, - is_bidirectional=is_bidirectional, - stride=stride, - expressivity=expressivity, - ) - for _ in range(num_encoder_layers) - ] - ) - - def freeze_module_params(m): - if m is not None: - for p in m.parameters(): - p.requires_grad = False - - for layer in range(n_trans_layers_to_freeze): - freeze_module_params(self.layers[layer]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py deleted file mode 100644 index d95da59c2471bfa858fd627605196d7f41f9ec12..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/sparse_transformer_sentence_encoder_layer.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.modules import TransformerSentenceEncoderLayer -from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention - - -class SparseTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer): - """ - Implements a Sprase Transformer Encoder Layer (see SparseMultiheadAttention) - """ - - def __init__( - self, - embedding_dim: int = 768, - ffn_embedding_dim: int = 3072, - num_attention_heads: int = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - export: bool = False, - is_bidirectional: bool = True, - stride: int = 32, - expressivity: int = 8, - ) -> None: - - super().__init__( - embedding_dim, - ffn_embedding_dim, - num_attention_heads, - dropout, - attention_dropout, - activation_dropout, - activation_fn, - export, - ) - - self.self_attn = SparseMultiheadAttention( - self.embedding_dim, - num_attention_heads, - dropout=attention_dropout, - add_bias_kv=False, - add_zero_attn=False, - self_attention=True, - is_bidirectional=is_bidirectional, - stride=stride, - expressivity=expressivity, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_layer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_layer.py deleted file mode 100644 index b889b7cfc448e5fade7735ac2a50c364d1345a88..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_layer.py +++ /dev/null @@ -1,432 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Dict, List, Optional - -import torch -import torch.nn as nn -from fairseq import utils -from fairseq.modules import LayerNorm, MultiheadAttention -from fairseq.modules.fairseq_dropout import get_dropout_class -from fairseq.modules.quant_noise import quant_noise -from torch import Tensor - -dropout_class = get_dropout_class() - -class NpuLinear(torch.nn.Linear): - def forward(self, input): - return torch.npu_linear(input, self.weight, self.bias) - -class TransformerEncoderLayer(nn.Module): - """Encoder layer block. - - In the original paper each operation (multi-head attention or FFN) is - postprocessed with: `dropout -> add residual -> layernorm`. In the - tensor2tensor code they suggest that learning is more robust when - preprocessing each layer with layernorm and postprocessing with: - `dropout -> add residual`. We default to the approach in the paper, but the - tensor2tensor approach can be enabled by setting - *args.encoder_normalize_before* to ``True``. 
- - Args: - args (argparse.Namespace): parsed command-line arguments - """ - - def __init__(self, args): - super().__init__() - self.embed_dim = args.encoder_embed_dim - self.quant_noise = getattr(args, "quant_noise_pq", 0) - self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8) - self.self_attn = self.build_self_attention(self.embed_dim, args) - self.self_attn_layer_norm = LayerNorm(self.embed_dim) - self.dropout_module = dropout_class( - args.dropout, module_name=self.__class__.__name__ - ) - self.activation_fn = utils.get_activation_fn( - activation=getattr(args, "activation_fn", "relu") - ) - activation_dropout_p = getattr(args, "activation_dropout", 0) - if activation_dropout_p == 0: - # for backwards compatibility with models that use args.relu_dropout - activation_dropout_p = getattr(args, "relu_dropout", 0) - self.activation_dropout_module = dropout_class( - float(activation_dropout_p), module_name=self.__class__.__name__ - ) - self.normalize_before = args.encoder_normalize_before - self.fc1 = self.build_fc1( - self.embed_dim, - args.encoder_ffn_embed_dim, - self.quant_noise, - self.quant_noise_block_size, - ) - self.fc2 = self.build_fc2( - args.encoder_ffn_embed_dim, - self.embed_dim, - self.quant_noise, - self.quant_noise_block_size, - ) - - self.final_layer_norm = LayerNorm(self.embed_dim) - - def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): - return quant_noise( - nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size - ) - - def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): - return quant_noise( - nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size - ) - - def build_self_attention(self, embed_dim, args): - return MultiheadAttention( - embed_dim, - args.encoder_attention_heads, - dropout=args.attention_dropout, - self_attention=True, - q_noise=self.quant_noise, - qn_block_size=self.quant_noise_block_size, - ) - - def residual_connection(self, x, residual): - return residual + x - - def upgrade_state_dict_named(self, state_dict, name): - """ - Rename layer norm states from `...layer_norms.0.weight` to - `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to - `...final_layer_norm.weight` - """ - layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"} - for old, new in layer_norm_map.items(): - for m in ("weight", "bias"): - k = "{}.layer_norms.{}.{}".format(name, old, m) - if k in state_dict: - state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k] - del state_dict[k] - - def forward(self, x, encoder_padding_mask, bsz, tgt_len, s_len, attn_mask: Optional[Tensor] = None): - """ - Args: - x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_padding_mask (ByteTensor): binary ByteTensor of shape - `(batch, seq_len)` where padding elements are indicated by ``1``. - attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`, - where `tgt_len` is the length of output and `src_len` is the - length of input, though here both are equal to `seq_len`. - `attn_mask[tgt_i, src_j] = 1` means that when calculating the - embedding for `tgt_i`, we exclude (mask out) `src_j`. This is - useful for strided self-attention. 
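The docstring above defines attn_mask[tgt_i, src_j] = 1 as "mask out src_j when computing tgt_i". Inside the layer this binary mask is turned into an additive one, with masked positions set to a large negative value rather than -inf so that a fully masked query row still produces finite softmax weights. A minimal self-contained sketch of that conversion, using a causal mask as the example (the helper name is illustrative, not part of fairseq):

import torch

def to_additive_mask(binary_mask):
    # 1 -> -1e8 (excluded), 0 -> 0.0 (kept); -1e8 instead of -inf avoids NaNs
    # when every source position of a query happens to be masked.
    return binary_mask.float().masked_fill(binary_mask.to(torch.bool), -1e8)

seq_len = 4
causal = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)   # 1 above the diagonal = future positions
scores = torch.randn(seq_len, seq_len) + to_additive_mask(causal)  # added to raw attention scores
print(scores.softmax(dim=-1))                                    # future positions get ~0 probability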
- - Returns: - encoded output of shape `(seq_len, batch, embed_dim)` - """ - # anything in original attn_mask = 1, becomes -1e8 - # anything in original attn_mask = 0, becomes 0 - # Note that we cannot use -inf here, because at some edge cases, - # the attention weight (before softmax) for some padded element in query - # will become -inf, which results in NaN in model parameters - if attn_mask is not None: - attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8) - - residual = x - - if self.normalize_before: - x = self.self_attn_layer_norm(x) - - x, _ = self.self_attn( - query=x, - key=x, - value=x, bsz=bsz, tgt_len=tgt_len,s_len=s_len, - key_padding_mask=encoder_padding_mask, - attn_mask=attn_mask, - ) - x = self.dropout_module(x) - x = self.residual_connection(x, residual) - if not self.normalize_before: - x = self.self_attn_layer_norm(x) - - residual = x - if self.normalize_before: - x = self.final_layer_norm(x) - - x = self.activation_fn(self.fc1(x)) - x = self.activation_dropout_module(x) - x = self.fc2(x) - x = self.dropout_module(x) - x = self.residual_connection(x, residual) - if not self.normalize_before: - x = self.final_layer_norm(x) - return x - - -class TransformerDecoderLayer(nn.Module): - """Decoder layer block. - - In the original paper each operation (multi-head attention, encoder - attention or FFN) is postprocessed with: `dropout -> add residual -> - layernorm`. In the tensor2tensor code they suggest that learning is more - robust when preprocessing each layer with layernorm and postprocessing with: - `dropout -> add residual`. We default to the approach in the paper, but the - tensor2tensor approach can be enabled by setting - *args.decoder_normalize_before* to ``True``. - - Args: - args (argparse.Namespace): parsed command-line arguments - no_encoder_attn (bool, optional): whether to attend to encoder outputs - (default: False). - """ - - def __init__( - self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False - ): - super().__init__() - self.embed_dim = args.decoder_embed_dim - self.dropout_module = dropout_class( - args.dropout, module_name=self.__class__.__name__ - ) - self.quant_noise = getattr(args, "quant_noise_pq", 0) - self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8) - - self.cross_self_attention = getattr(args, "cross_self_attention", False) - - self.self_attn = self.build_self_attention( - self.embed_dim, - args, - add_bias_kv=add_bias_kv, - add_zero_attn=add_zero_attn, - ) - - self.activation_fn = utils.get_activation_fn( - activation=str(args.activation_fn) - if getattr(args, "activation_fn", None) is not None - else "relu" - ) - activation_dropout_p = getattr(args, "activation_dropout", 0) - if activation_dropout_p == 0: - # for backwards compatibility with models that use args.relu_dropout - activation_dropout_p = getattr(args, "relu_dropout", 0) - self.activation_dropout_module = dropout_class( - float(activation_dropout_p), module_name=self.__class__.__name__ - ) - self.normalize_before = args.decoder_normalize_before - - # use layerNorm rather than FusedLayerNorm for exporting. - # char_inputs can be used to determint this. 
- # TODO remove this once we update apex with the fix - export = getattr(args, "char_inputs", False) - self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) - - if no_encoder_attn: - self.encoder_attn = None - self.encoder_attn_layer_norm = None - else: - self.encoder_attn = self.build_encoder_attention(self.embed_dim, args) - self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) - - self.fc1 = self.build_fc1( - self.embed_dim, - args.decoder_ffn_embed_dim, - self.quant_noise, - self.quant_noise_block_size, - ) - self.fc2 = self.build_fc2( - args.decoder_ffn_embed_dim, - self.embed_dim, - self.quant_noise, - self.quant_noise_block_size, - ) - - self.final_layer_norm = LayerNorm(self.embed_dim, export=export) - self.need_attn = True - - self.onnx_trace = False - - def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): - return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) - - def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): - return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) - - def build_self_attention( - self, embed_dim, args, add_bias_kv=False, add_zero_attn=False - ): - return MultiheadAttention( - embed_dim, - args.decoder_attention_heads, - dropout=args.attention_dropout, - add_bias_kv=add_bias_kv, - add_zero_attn=add_zero_attn, - self_attention=not getattr(args, "cross_self_attention", False), - q_noise=self.quant_noise, - qn_block_size=self.quant_noise_block_size, - ) - - def build_encoder_attention(self, embed_dim, args): - return MultiheadAttention( - embed_dim, - args.decoder_attention_heads, - kdim=getattr(args, "encoder_embed_dim", None), - vdim=getattr(args, "encoder_embed_dim", None), - dropout=args.attention_dropout, - encoder_decoder_attention=True, - q_noise=self.quant_noise, - qn_block_size=self.quant_noise_block_size, - ) - - def prepare_for_onnx_export_(self): - self.onnx_trace = True - - def residual_connection(self, x, residual): - return residual + x - - def forward( - self, - x, bsz, tgt_len, s_len, - encoder_out: Optional[torch.Tensor] = None, - encoder_padding_mask: Optional[torch.Tensor] = None, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, - prev_self_attn_state: Optional[List[torch.Tensor]] = None, - prev_attn_state: Optional[List[torch.Tensor]] = None, - self_attn_mask: Optional[torch.Tensor] = None, - self_attn_padding_mask: Optional[torch.Tensor] = None, - need_attn: bool = False, - need_head_weights: bool = False, - ): - """ - Args: - x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` - encoder_padding_mask (ByteTensor, optional): binary - ByteTensor of shape `(batch, src_len)` where padding - elements are indicated by ``1``. - need_attn (bool, optional): return attention weights - need_head_weights (bool, optional): return attention weights - for each head (default: return average over heads). 
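Viewed at a distance, the decoder layer forward below is three residual sub-blocks, masked self-attention, encoder-decoder (cross) attention, and the feed-forward network, each wrapped in the same "maybe-LayerNorm -> sublayer -> dropout -> residual -> maybe-LayerNorm" pattern that decoder_normalize_before toggles between pre-norm and post-norm. A compressed sketch of that control flow; the sublayer callables here are placeholders, not the fairseq attention modules:

import torch
import torch.nn as nn

def residual_block(x, sublayer, layer_norm, dropout, normalize_before):
    # Shared wrapper: pre-norm applies LN before the sublayer,
    # post-norm applies it after the residual add.
    residual = x
    if normalize_before:
        x = layer_norm(x)
    x = dropout(sublayer(x))
    x = residual + x
    if not normalize_before:
        x = layer_norm(x)
    return x

dim = 8
self_attn = nn.Linear(dim, dim)    # stand-in for masked self-attention
cross_attn = nn.Linear(dim, dim)   # stand-in for encoder-decoder attention
ffn = nn.Sequential(nn.Linear(dim, 4 * dim), nn.ReLU(), nn.Linear(4 * dim, dim))
norms = [nn.LayerNorm(dim) for _ in range(3)]
drop = nn.Dropout(0.1)

x = torch.randn(5, 2, dim)         # (seq_len, batch, embed_dim)
for sublayer, ln in zip((self_attn, cross_attn, ffn), norms):
    x = residual_block(x, sublayer, ln, drop, normalize_before=True)
print(x.shape)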
- - Returns: - encoded output of shape `(seq_len, batch, embed_dim)` - """ - if need_head_weights: - need_attn = True - - residual = x - - if self.normalize_before: - x = self.self_attn_layer_norm(x) - - if prev_self_attn_state is not None: - prev_key, prev_value = prev_self_attn_state[:2] - saved_state: Dict[str, Optional[Tensor]] = { - "prev_key": prev_key, - "prev_value": prev_value, - } - if len(prev_self_attn_state) >= 3: - saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] - assert incremental_state is not None - self.self_attn._set_input_buffer(incremental_state, saved_state) - _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state) - if self.cross_self_attention and not ( - incremental_state is not None - and _self_attn_input_buffer is not None - and "prev_key" in _self_attn_input_buffer - ): - if self_attn_mask is not None: - assert encoder_out is not None - self_attn_mask = torch.cat( - (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1 - ) - if self_attn_padding_mask is not None: - if encoder_padding_mask is None: - assert encoder_out is not None - encoder_padding_mask = self_attn_padding_mask.new_zeros( - encoder_out.size(1), encoder_out.size(0) - ) - self_attn_padding_mask = torch.cat( - (encoder_padding_mask, self_attn_padding_mask), dim=1 - ) - assert encoder_out is not None - y = torch.cat((encoder_out, x), dim=0) - else: - y = x - - x, attn = self.self_attn( - query=x, - key=y, - value=y, bsz=bsz, tgt_len=tgt_len, s_len=s_len, - key_padding_mask=self_attn_padding_mask, - incremental_state=incremental_state, - need_weights=False, - attn_mask=self_attn_mask, - ) - x = self.dropout_module(x) - x = self.residual_connection(x, residual) - if not self.normalize_before: - x = self.self_attn_layer_norm(x) - - if self.encoder_attn is not None and encoder_out is not None: - residual = x - if self.normalize_before: - x = self.encoder_attn_layer_norm(x) - if prev_attn_state is not None: - prev_key, prev_value = prev_attn_state[:2] - saved_state: Dict[str, Optional[Tensor]] = { - "prev_key": prev_key, - "prev_value": prev_value, - } - if len(prev_attn_state) >= 3: - saved_state["prev_key_padding_mask"] = prev_attn_state[2] - assert incremental_state is not None - self.encoder_attn._set_input_buffer(incremental_state, saved_state) - - x, attn = self.encoder_attn( - query=x, - key=encoder_out, - value=encoder_out, bsz=bsz, tgt_len=tgt_len, s_len=s_len, - key_padding_mask=encoder_padding_mask, - incremental_state=incremental_state, - static_kv=True, - need_weights=need_attn or (not self.training and self.need_attn), - need_head_weights=need_head_weights, - ) - x = self.dropout_module(x) - x = self.residual_connection(x, residual) - if not self.normalize_before: - x = self.encoder_attn_layer_norm(x) - - residual = x - if self.normalize_before: - x = self.final_layer_norm(x) - - x = self.activation_fn(self.fc1(x)) - x = self.activation_dropout_module(x) - x = self.fc2(x) - x = self.dropout_module(x) - x = self.residual_connection(x, residual) - if not self.normalize_before: - x = self.final_layer_norm(x) - if self.onnx_trace and incremental_state is not None: - saved_state = self.self_attn._get_input_buffer(incremental_state) - assert saved_state is not None - if self_attn_padding_mask is not None: - self_attn_state = [ - saved_state["prev_key"], - saved_state["prev_value"], - saved_state["prev_key_padding_mask"], - ] - else: - self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]] - return x, attn, self_attn_state - return 
x, attn, None - - def make_generation_fast_(self, need_attn: bool = False, **kwargs): - self.need_attn = need_attn - - -def Linear(in_features, out_features, bias=True): - m = nn.Linear(in_features, out_features, bias) - nn.init.xavier_uniform_(m.weight) - if bias: - nn.init.constant_(m.bias, 0.0) - return m diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_sentence_encoder.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_sentence_encoder.py deleted file mode 100644 index 208488f562b64e79360fd4deb9c627b6ff1e53ba..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_sentence_encoder.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional, Tuple - -import torch -import torch.nn as nn -from fairseq.modules import ( - FairseqDropout, - LayerDropModuleList, - LayerNorm, - MultiheadAttention, - PositionalEmbedding, - TransformerSentenceEncoderLayer, -) -from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ - - -def init_bert_params(module): - """ - Initialize the weights specific to the BERT Model. - This overrides the default initializations depending on the specified arguments. - 1. If normal_init_linear_weights is set then weights of linear - layer will be initialized using the normal distribution and - bais will be set to the specified value. - 2. If normal_init_embed_weights is set then weights of embedding - layer will be initialized using the normal distribution. - 3. If normal_init_proj_weights is set then weights of - in_project_weight for MultiHeadAttention initialized using - the normal distribution (to be validated). - """ - - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=0.02) - if module.bias is not None: - module.bias.data.zero_() - if isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=0.02) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - if isinstance(module, MultiheadAttention): - module.q_proj.weight.data.normal_(mean=0.0, std=0.02) - module.k_proj.weight.data.normal_(mean=0.0, std=0.02) - module.v_proj.weight.data.normal_(mean=0.0, std=0.02) - - -class TransformerSentenceEncoder(nn.Module): - """ - Implementation for a Bi-directional Transformer based Sentence Encoder used - in BERT/XLM style pre-trained models. - - This first computes the token embedding using the token embedding matrix, - position embeddings (if specified) and segment embeddings - (if specified). After applying the specified number of - TransformerEncoderLayers, it outputs all the internal states of the - encoder as well as the final representation associated with the first - token (usually CLS token). - - Input: - - tokens: B x T matrix representing sentences - - segment_labels: B x T matrix representing segment label for tokens - - Output: - - a tuple of the following: - - a list of internal model states used to compute the - predictions where each tensor has shape T x B x C - - sentence representation associated with first input token - in format B x C. 
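The encoder's contract is easiest to see with concrete shapes: tokens come in as B x T, every inner state comes out as T x B x C, and the sentence representation is the first time step, B x C. A hedged usage sketch, assuming the class is importable from fairseq.modules as exported by the package and leaving every constructor argument other than padding_idx and vocab_size at the defaults listed in __init__ below:

import torch
from fairseq.modules import TransformerSentenceEncoder

encoder = TransformerSentenceEncoder(padding_idx=1, vocab_size=1000)
tokens = torch.randint(2, 1000, (4, 16))      # B x T, avoiding the pad index
inner_states, sentence_rep = encoder(tokens)

print(inner_states[-1].shape)   # T x B x C, e.g. torch.Size([16, 4, 768])
print(sentence_rep.shape)       # B x C,     e.g. torch.Size([4, 768])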
- """ - - def __init__( - self, - padding_idx: int, - vocab_size: int, - num_encoder_layers: int = 6, - embedding_dim: int = 768, - ffn_embedding_dim: int = 3072, - num_attention_heads: int = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - layerdrop: float = 0.0, - max_seq_len: int = 256, - num_segments: int = 2, - use_position_embeddings: bool = True, - offset_positions_by_padding: bool = True, - encoder_normalize_before: bool = False, - apply_bert_init: bool = False, - activation_fn: str = "relu", - learned_pos_embedding: bool = True, - embed_scale: float = None, - freeze_embeddings: bool = False, - n_trans_layers_to_freeze: int = 0, - export: bool = False, - traceable: bool = False, - q_noise: float = 0.0, - qn_block_size: int = 8, - ) -> None: - - super().__init__() - self.padding_idx = padding_idx - self.vocab_size = vocab_size - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.layerdrop = layerdrop - self.max_seq_len = max_seq_len - self.embedding_dim = embedding_dim - self.num_segments = num_segments - self.use_position_embeddings = use_position_embeddings - self.apply_bert_init = apply_bert_init - self.learned_pos_embedding = learned_pos_embedding - self.traceable = traceable - self.tpu = False # whether we're on TPU - - self.embed_tokens = self.build_embedding( - self.vocab_size, self.embedding_dim, self.padding_idx - ) - self.embed_scale = embed_scale - - if q_noise > 0: - self.quant_noise = apply_quant_noise_( - nn.Linear(self.embedding_dim, self.embedding_dim, bias=False), - q_noise, - qn_block_size, - ) - else: - self.quant_noise = None - - self.segment_embeddings = ( - nn.Embedding(self.num_segments, self.embedding_dim, padding_idx=None) - if self.num_segments > 0 - else None - ) - - self.embed_positions = ( - PositionalEmbedding( - self.max_seq_len, - self.embedding_dim, - padding_idx=(self.padding_idx if offset_positions_by_padding else None), - learned=self.learned_pos_embedding, - ) - if self.use_position_embeddings - else None - ) - - if self.layerdrop > 0.0: - self.layers = LayerDropModuleList(p=self.layerdrop) - else: - self.layers = nn.ModuleList([]) - self.layers.extend( - [ - self.build_transformer_sentence_encoder_layer( - embedding_dim=self.embedding_dim, - ffn_embedding_dim=ffn_embedding_dim, - num_attention_heads=num_attention_heads, - dropout=self.dropout_module.p, - attention_dropout=attention_dropout, - activation_dropout=activation_dropout, - activation_fn=activation_fn, - export=export, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - for _ in range(num_encoder_layers) - ] - ) - - if encoder_normalize_before: - self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) - else: - self.emb_layer_norm = None - - # Apply initialization of model params after building the model - if self.apply_bert_init: - self.apply(init_bert_params) - - def freeze_module_params(m): - if m is not None: - for p in m.parameters(): - p.requires_grad = False - - if freeze_embeddings: - freeze_module_params(self.embed_tokens) - freeze_module_params(self.segment_embeddings) - freeze_module_params(self.embed_positions) - freeze_module_params(self.emb_layer_norm) - - for layer in range(n_trans_layers_to_freeze): - freeze_module_params(self.layers[layer]) - - def build_embedding(self, vocab_size, embedding_dim, padding_idx): - return nn.Embedding(vocab_size, embedding_dim, padding_idx) - - def build_transformer_sentence_encoder_layer( - self, - embedding_dim, - 
ffn_embedding_dim, - num_attention_heads, - dropout, - attention_dropout, - activation_dropout, - activation_fn, - export, - q_noise, - qn_block_size, - ): - return TransformerSentenceEncoderLayer( - embedding_dim=embedding_dim, - ffn_embedding_dim=ffn_embedding_dim, - num_attention_heads=num_attention_heads, - dropout=dropout, - attention_dropout=attention_dropout, - activation_dropout=activation_dropout, - activation_fn=activation_fn, - export=export, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - - def prepare_for_tpu_(self, **kwargs): - self.tpu = True - - def forward( - self, - tokens: torch.Tensor, - segment_labels: torch.Tensor = None, - last_state_only: bool = False, - positions: Optional[torch.Tensor] = None, - token_embeddings: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: - - # compute padding mask. This is needed for multi-head attention - padding_mask = tokens.eq(self.padding_idx) - if not self.traceable and not self.tpu and not padding_mask.any(): - padding_mask = None - - if token_embeddings is not None: - x = token_embeddings - else: - x = self.embed_tokens(tokens) - - if self.embed_scale is not None: - x = x * self.embed_scale - - if self.embed_positions is not None: - x = x + self.embed_positions(tokens, positions=positions) - - if self.segment_embeddings is not None and segment_labels is not None: - x = x + self.segment_embeddings(segment_labels) - - if self.quant_noise is not None: - x = self.quant_noise(x) - - if self.emb_layer_norm is not None: - x = self.emb_layer_norm(x) - - x = self.dropout_module(x) - - # account for padding while computing the representation - if padding_mask is not None: - x = x * (1 - padding_mask.unsqueeze(-1).type_as(x)) - - # B x T x C -> T x B x C - x = x.transpose(0, 1) - - inner_states = [] - if not last_state_only: - inner_states.append(x) - - for layer in self.layers: - x, _ = layer(x, self_attn_padding_mask=padding_mask) - if not last_state_only: - inner_states.append(x) - - sentence_rep = x[0, :, :] - - if last_state_only: - inner_states = [x] - - if self.traceable: - return torch.stack(inner_states), sentence_rep - else: - return inner_states, sentence_rep diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_sentence_encoder_layer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_sentence_encoder_layer.py deleted file mode 100644 index 3589c60fe6843c549cfcb94a26cd27bad1fd8033..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transformer_sentence_encoder_layer.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Callable, Optional - -import torch -import torch.nn as nn -from fairseq import utils -from fairseq.modules import LayerNorm, MultiheadAttention -from fairseq.modules.fairseq_dropout import FairseqDropout -from fairseq.modules.quant_noise import quant_noise - - -class TransformerSentenceEncoderLayer(nn.Module): - """ - Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained - models. 
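Before the stacked layers run, the sentence-encoder forward above composes the input representation additively (token plus position plus optional segment embeddings), zeroes out padded time steps so they cannot leak into later statistics, and flips B x T x C into the T x B x C layout the attention modules expect. A minimal sketch of just that embedding-and-masking stage, with toy embedding tables standing in for the real ones:

import torch
import torch.nn as nn

vocab, max_len, dim, pad = 100, 32, 16, 1
embed_tokens = nn.Embedding(vocab, dim, padding_idx=pad)
embed_positions = nn.Embedding(max_len, dim)

tokens = torch.tensor([[5, 7, 9, pad], [11, 13, pad, pad]])   # B x T
padding_mask = tokens.eq(pad)                                  # B x T, True at pads

positions = torch.arange(tokens.size(1)).expand_as(tokens)     # toy absolute positions
x = embed_tokens(tokens) + embed_positions(positions)          # B x T x C

# Zero out padded positions, mirroring x * (1 - padding_mask) above.
x = x * (1 - padding_mask.unsqueeze(-1).type_as(x))

x = x.transpose(0, 1)                                          # T x B x C for the layers
print(x.shape, padding_mask.shape)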
- """ - - def __init__( - self, - embedding_dim: int = 768, - ffn_embedding_dim: int = 3072, - num_attention_heads: int = 8, - dropout: float = 0.1, - attention_dropout: float = 0.1, - activation_dropout: float = 0.1, - activation_fn: str = "relu", - export: bool = False, - q_noise: float = 0.0, - qn_block_size: int = 8, - init_fn: Callable = None, - ) -> None: - super().__init__() - - if init_fn is not None: - init_fn() - - # Initialize parameters - self.embedding_dim = embedding_dim - self.dropout_module = FairseqDropout( - dropout, module_name=self.__class__.__name__ - ) - self.activation_dropout_module = FairseqDropout( - activation_dropout, module_name=self.__class__.__name__ - ) - - # Initialize blocks - self.activation_fn = utils.get_activation_fn(activation_fn) - self.self_attn = self.build_self_attention( - self.embedding_dim, - num_attention_heads, - dropout=attention_dropout, - self_attention=True, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - - # layer norm associated with the self attention layer - self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export) - - self.fc1 = self.build_fc1( - self.embedding_dim, - ffn_embedding_dim, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - self.fc2 = self.build_fc2( - ffn_embedding_dim, - self.embedding_dim, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - - # layer norm associated with the position wise feed-forward NN - self.final_layer_norm = LayerNorm(self.embedding_dim, export=export) - - def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): - return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) - - def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): - return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) - - def build_self_attention( - self, - embed_dim, - num_attention_heads, - dropout, - self_attention, - q_noise, - qn_block_size, - ): - return MultiheadAttention( - embed_dim, - num_attention_heads, - dropout=dropout, - self_attention=True, - q_noise=q_noise, - qn_block_size=qn_block_size, - ) - - def forward( - self, - x: torch.Tensor, - self_attn_mask: Optional[torch.Tensor] = None, - self_attn_padding_mask: Optional[torch.Tensor] = None, - ): - """ - LayerNorm is applied either before or after the self-attention/ffn - modules similar to the original Transformer implementation. - """ - residual = x - x, attn = self.self_attn( - query=x, - key=x, - value=x, - key_padding_mask=self_attn_padding_mask, - need_weights=False, - attn_mask=self_attn_mask, - ) - x = self.dropout_module(x) - x = residual + x - x = self.self_attn_layer_norm(x) - - residual = x - x = self.activation_fn(self.fc1(x)) - x = self.activation_dropout_module(x) - x = self.fc2(x) - x = self.dropout_module(x) - x = residual + x - x = self.final_layer_norm(x) - return x, attn diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transpose_last.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transpose_last.py deleted file mode 100644 index e578b3ec5097bfac5c976b207ea46bec1d9bd4f5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/transpose_last.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
-""" -transpose last 2 dimensions of the input -""" - -import torch.nn as nn - - -class TransposeLast(nn.Module): - def __init__(self, deconstruct_idx=None): - super().__init__() - self.deconstruct_idx = deconstruct_idx - - def forward(self, x): - if self.deconstruct_idx is not None: - x = x[self.deconstruct_idx] - return x.transpose(-2, -1) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/unfold.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/unfold.py deleted file mode 100644 index 138272f1ef4f673b29e36aed4531106f7ce95968..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/unfold.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch.nn.functional as F - - -def unfold1d(x, kernel_size, padding_l, pad_value=0): - """unfold T x B x C to T x B x C x K""" - if kernel_size > 1: - T, B, C = x.size() - x = F.pad( - x, (0, 0, 0, 0, padding_l, kernel_size - 1 - padding_l), value=pad_value - ) - x = x.as_strided((T, B, C, kernel_size), (B * C, C, 1, B * C)) - else: - x = x.unsqueeze(3) - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/vggblock.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/vggblock.py deleted file mode 100644 index ee5ee19a34816c7350c21fba7c4907fec8ca7a61..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/modules/vggblock.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from __future__ import absolute_import, division, print_function, unicode_literals - -from collections.abc import Iterable -from itertools import repeat - -import torch -import torch.nn as nn - - -def _pair(v): - if isinstance(v, Iterable): - assert len(v) == 2, "len(v) != 2" - return v - return tuple(repeat(v, 2)) - - -def infer_conv_output_dim(conv_op, input_dim, sample_inchannel): - sample_seq_len = 200 - sample_bsz = 10 - x = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim) - # N x C x H x W - # N: sample_bsz, C: sample_inchannel, H: sample_seq_len, W: input_dim - x = conv_op(x) - # N x C x H x W - x = x.transpose(1, 2) - # N x H x C x W - bsz, seq = x.size()[:2] - per_channel_dim = x.size()[3] - # bsz: N, seq: H, CxW the rest - return x.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim - - -class VGGBlock(torch.nn.Module): - """ - VGG motibated cnn module https://arxiv.org/pdf/1409.1556.pdf - - Args: - in_channels: (int) number of input channels (typically 1) - out_channels: (int) number of output channels - conv_kernel_size: convolution channels - pooling_kernel_size: the size of the pooling window to take a max over - num_conv_layers: (int) number of convolution layers - input_dim: (int) input dimension - conv_stride: the stride of the convolving kernel. - Can be a single number or a tuple (sH, sW) Default: 1 - padding: implicit paddings on both sides of the input. - Can be a single number or a tuple (padH, padW). Default: None - layer_norm: (bool) if layer norm is going to be applied. 
Default: False - - Shape: - Input: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features) - Output: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features) - """ - - def __init__( - self, - in_channels, - out_channels, - conv_kernel_size, - pooling_kernel_size, - num_conv_layers, - input_dim, - conv_stride=1, - padding=None, - layer_norm=False, - ): - assert ( - input_dim is not None - ), "Need input_dim for LayerNorm and infer_conv_output_dim" - super(VGGBlock, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.conv_kernel_size = _pair(conv_kernel_size) - self.pooling_kernel_size = _pair(pooling_kernel_size) - self.num_conv_layers = num_conv_layers - self.padding = ( - tuple(e // 2 for e in self.conv_kernel_size) - if padding is None - else _pair(padding) - ) - self.conv_stride = _pair(conv_stride) - - self.layers = nn.ModuleList() - for layer in range(num_conv_layers): - conv_op = nn.Conv2d( - in_channels if layer == 0 else out_channels, - out_channels, - self.conv_kernel_size, - stride=self.conv_stride, - padding=self.padding, - ) - self.layers.append(conv_op) - if layer_norm: - conv_output_dim, per_channel_dim = infer_conv_output_dim( - conv_op, input_dim, in_channels if layer == 0 else out_channels - ) - self.layers.append(nn.LayerNorm(per_channel_dim)) - input_dim = per_channel_dim - self.layers.append(nn.ReLU()) - - if self.pooling_kernel_size is not None: - pool_op = nn.MaxPool2d(kernel_size=self.pooling_kernel_size, ceil_mode=True) - self.layers.append(pool_op) - self.total_output_dim, self.output_dim = infer_conv_output_dim( - pool_op, input_dim, out_channels - ) - - def forward(self, x): - for i, _ in enumerate(self.layers): - x = self.layers[i](x) - return x diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/nan_detector.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/nan_detector.py deleted file mode 100644 index faa8031d4666c9ba9837919fe1c884dacf47ac3a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/nan_detector.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
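VGGBlock above stacks Conv2d layers (with optional per-channel LayerNorm) followed by a max-pool, and relies on infer_conv_output_dim to discover the flattened output dimension by pushing a dummy batch through the ops. A hedged usage sketch on log-mel-style speech features; the 80-dimensional feature size and the import path (taken from the deleted file's location) are assumptions for illustration:

import torch
from fairseq.modules.vggblock import VGGBlock

block = VGGBlock(
    in_channels=1, out_channels=32,
    conv_kernel_size=3, pooling_kernel_size=2,
    num_conv_layers=2, input_dim=80, layer_norm=True,
)

x = torch.randn(8, 1, 200, 80)   # B x C x T x feat, as in the Shape note above
y = block(x)
print(y.shape)                   # pooling (ceil mode) roughly halves T and feat
print(block.output_dim, block.total_output_dim)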
- -import logging - -import torch - - -logger = logging.getLogger(__name__) - - -class NanDetector: - """ - Detects the first NaN or Inf in forward and/or backward pass and logs, together with the module name - """ - - def __init__(self, model, forward=True, backward=True): - self.bhooks = [] - self.fhooks = [] - self.forward = forward - self.backward = backward - self.named_parameters = list(model.named_parameters()) - self.reset() - - for name, mod in model.named_modules(): - mod.__module_name = name - self.add_hooks(mod) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, exc_traceback): - # Dump out all model gnorms to enable better debugging - norm = {} - gradients = {} - for name, param in self.named_parameters: - if param.grad is not None: - grad_norm = torch.norm(param.grad.data, p=2, dtype=torch.float32) - norm[name] = grad_norm.item() - if torch.isnan(grad_norm).any() or torch.isinf(grad_norm).any(): - gradients[name] = param.grad.data - if len(gradients) > 0: - logger.info("Detected nan/inf grad norm, dumping norms...") - logger.info(f"norms: {norm}") - logger.info(f"gradients: {gradients}") - - self.close() - - def add_hooks(self, module): - if self.forward: - self.fhooks.append(module.register_forward_hook(self.fhook_fn)) - if self.backward: - self.bhooks.append(module.register_backward_hook(self.bhook_fn)) - - def reset(self): - self.has_printed_f = False - self.has_printed_b = False - - def _detect(self, tensor, name, backward): - err = None - if ( - torch.is_floating_point(tensor) - # single value tensors (like the loss) will not provide much info - and tensor.numel() >= 2 - ): - with torch.no_grad(): - if torch.isnan(tensor).any(): - err = "NaN" - elif torch.isinf(tensor).any(): - err = "Inf" - if err is not None: - err = f"{err} detected in output of {name}, shape: {tensor.shape}, {'backward' if backward else 'forward'}" - return err - - def _apply(self, module, inp, x, backward): - if torch.is_tensor(x): - if isinstance(inp, tuple) and len(inp) > 0: - inp = inp[0] - err = self._detect(x, module.__module_name, backward) - if err is not None: - if torch.is_tensor(inp) and not backward: - err += ( - f" input max: {inp.max().item()}, input min: {inp.min().item()}" - ) - - has_printed_attr = "has_printed_b" if backward else "has_printed_f" - logger.warning(err) - setattr(self, has_printed_attr, True) - elif isinstance(x, dict): - for v in x.values(): - self._apply(module, inp, v, backward) - elif isinstance(x, list) or isinstance(x, tuple): - for v in x: - self._apply(module, inp, v, backward) - - def fhook_fn(self, module, inp, output): - if not self.has_printed_f: - self._apply(module, inp, output, backward=False) - - def bhook_fn(self, module, inp, output): - if not self.has_printed_b: - self._apply(module, inp, output, backward=True) - - def close(self): - for hook in self.fhooks + self.bhooks: - hook.remove() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/__init__.py deleted file mode 100644 index ab45abfd1f19a7d2adb237e59815d9d5b37fae9c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
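NanDetector above registers forward and backward hooks on every submodule and, on exit, dumps gradient norms whenever it saw a NaN or Inf, which makes it natural to wrap a single suspicious training step as a context manager. A minimal usage sketch; the toy model and loss are placeholders, only the NanDetector call matches the deleted module:

import torch
import torch.nn as nn
from fairseq.nan_detector import NanDetector

model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 1))
x, target = torch.randn(4, 16), torch.randn(4, 1)

with NanDetector(model, forward=True, backward=True):
    loss = nn.functional.mse_loss(model(x), target)
    loss.backward()   # any NaN/Inf in activations or grads is logged with the module name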
-"""isort:skip_file""" - -import importlib -import os -from argparse import Namespace -from typing import Union - -from fairseq import registry -from fairseq.optim.bmuf import FairseqBMUF # noqa -from fairseq.optim.fairseq_optimizer import ( # noqa - FairseqOptimizer, - LegacyFairseqOptimizer, -) -from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer -from fairseq.optim.shard import shard_ -from omegaconf import DictConfig - - -__all__ = [ - "FairseqOptimizer", - "FP16Optimizer", - "MemoryEfficientFP16Optimizer", - "shard_", -] - - -( - _build_optimizer, - register_optimizer, - OPTIMIZER_REGISTRY, - OPTIMIZER_DATACLASS_REGISTRY, -) = registry.setup_registry("--optimizer", base_class=FairseqOptimizer, required=True) - - -def build_optimizer( - optimizer_cfg: Union[DictConfig, Namespace], params, *extra_args, **extra_kwargs -): - if all(isinstance(p, dict) for p in params): - params = [t for p in params for t in p.values()] - return _build_optimizer(optimizer_cfg, params, *extra_args, **extra_kwargs) - - -# automatically import any Python files in the optim/ directory -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - file_name = file[: file.find(".py")] - importlib.import_module("fairseq.optim." + file_name) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adadelta.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adadelta.py deleted file mode 100644 index f1a21549770f0904a6a40a42ff7eb52811f1bfbe..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adadelta.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch.optim - -from . import LegacyFairseqOptimizer, register_optimizer - - -@register_optimizer("adadelta") -class Adadelta(LegacyFairseqOptimizer): - def __init__(self, args, params): - super().__init__(args) - self._optimizer = torch.optim.Adadelta(params, **self.optimizer_config) - - @staticmethod - def add_args(parser): - """Add optimizer-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--adadelta-rho', type=float, default=0.9, metavar='RHO', - help='coefficient used for computing a running average of squared gradients') - parser.add_argument('--adadelta-eps', type=float, default=1e-6, metavar='EPS', - help='term added to the denominator to improve numerical stability') - parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', - help='weight decay') - parser.add_argument('--anneal-eps', action='store_true', help='flag to anneal eps') - # fmt: on - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. 
- """ - return { - "lr": self.args.lr[0], - "rho": self.args.adadelta_rho, - "eps": self.args.adadelta_eps, - "weight_decay": self.args.weight_decay, - } - - @property - def supports_flat_params(self): - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adafactor.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adafactor.py deleted file mode 100644 index 91745ce10e183479f8cb552f2a1a91834e2a61ed..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adafactor.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -import torch -import torch.optim - -from . import LegacyFairseqOptimizer, register_optimizer - - -@register_optimizer("adafactor") -class FairseqAdafactor(LegacyFairseqOptimizer): - def __init__(self, args, params): - super().__init__(args) - self._optimizer = Adafactor(params, **self.optimizer_config) - - @staticmethod - def add_args(parser): - """Add optimizer-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--adafactor-eps', default='(1e-30, 1e-3)', metavar="E", - help='epsilons for Adafactor optimizer') - parser.add_argument('--clip-threshold', type=float, default=1.0, metavar="C", - help='threshold for clipping update root mean square') - parser.add_argument('--decay-rate', type=float, default=-0.8, metavar="D", - help='decay rate of the second moment estimator') - parser.add_argument('--beta1', type=float, default=None, metavar="B", - help='beta for first moment estimator. Optional') - parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', - help='weight decay') - parser.add_argument('--scale-parameter', action='store_true', - help='scale learning rate by root mean square of parameter') - parser.add_argument('--relative-step', action='store_true', - help='set learning rate to inverse square root of timestep,' - 'otherwise use external learning rate') - parser.add_argument('--warmup-init', action='store_true', - help='use relative step for warm-up learning rate schedule') - # fmt: on - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. - Note : Convergence issues empirically observed with fp16 on. - Might require search for appropriate configuration. - """ - return { - "lr": self.args.lr[0], - "eps": eval(self.args.adafactor_eps), - "clip_threshold": self.args.clip_threshold, - "decay_rate": self.args.decay_rate, - "beta1": self.args.beta1, - "weight_decay": self.args.weight_decay, - "scale_parameter": self.args.scale_parameter, # defaults to False - "relative_step": self.args.relative_step, # defaults to False - "warmup_init": self.args.warmup_init, - } - - -class Adafactor(torch.optim.Optimizer): - """Implements Adafactor algorithm. - - This implementation is based on: - `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` - (see https://arxiv.org/abs/1804.04235) - - Note that this optimizer internally adjusts the learning rate - depending on the *scale_parameter*, *relative_step* and - *warmup_init* options. 
To use a manual (external) learning rate - schedule you should set `scale_parameter=False` and - `relative_step=False`. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): external learning rate (default: None) - eps (tuple[float, float]): regularization constans for square gradient - and parameter scale respectively (default: (1e-30, 1e-3)) - clip_threshold (float): threshold of root mean square of - final gradient update (default: 1.0) - decay_rate (float): coefficient used to compute running averages of square - gradient (default: -0.8) - beta1 (float): coefficient used for computing running averages of gradient - (default: None) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - scale_parameter (bool): if True, learning rate is scaled by root mean square of - parameter (default: True) - relative_step (bool): if True, time-dependent learning rate is computed - instead of external learning rate (default: True) - warmup_init (bool): time-dependent learning rate computation depends on - whether warm-up initialization is being used (default: False) - """ - - def __init__( - self, - params, - lr=None, - eps=(1e-30, 1e-3), - clip_threshold=1.0, - decay_rate=-0.8, - beta1=None, - weight_decay=0.0, - scale_parameter=True, - relative_step=True, - warmup_init=False, - ): - if lr is not None and relative_step: - raise ValueError("Cannot combine manual lr and relative_step options") - if warmup_init and not relative_step: - raise ValueError("warmup_init requires relative_step=True") - - defaults = dict( - lr=lr, - eps=eps, - clip_threshold=clip_threshold, - decay_rate=decay_rate, - beta1=beta1, - weight_decay=weight_decay, - scale_parameter=scale_parameter, - relative_step=relative_step, - warmup_init=warmup_init, - ) - super(Adafactor, self).__init__(params, defaults) - - @property - def supports_memory_efficient_fp16(self): - return True - - @property - def supports_flat_params(self): - return False - - def _get_lr(self, param_group, param_state): - rel_step_sz = param_group["lr"] - if param_group["relative_step"]: - min_step = ( - 1e-6 * param_state["step"] if param_group["warmup_init"] else 1e-2 - ) - rel_step_sz = min(min_step, 1.0 / math.sqrt(param_state["step"])) - param_scale = 1.0 - if param_group["scale_parameter"]: - param_scale = max(param_group["eps"][1], param_state["RMS"]) - return param_scale * rel_step_sz - - def _get_options(self, param_group, param_shape): - factored = len(param_shape) >= 2 - use_first_moment = param_group["beta1"] is not None - return factored, use_first_moment - - def _rms(self, tensor): - return tensor.norm(2) / (tensor.numel() ** 0.5) - - def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col): - r_factor = ( - (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)) - .rsqrt_() - .unsqueeze(-1) - ) - c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt() - return torch.mul(r_factor, c_factor) - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
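Adafactor's memory saving comes from the factored branch of step(): for a matrix parameter it keeps only per-row and per-column running averages of the squared gradient and reconstructs the full second-moment estimate as an outer product, which is exactly what _approx_sq_grad does above. A small numeric sketch of that reconstruction for a single step, outside the optimizer and skipping the running-average decay (beta2t) for clarity:

import torch

torch.manual_seed(0)
grad = torch.randn(4, 6)
sq = grad ** 2 + 1e-30                     # eps[0] regularization, as in step()

row = sq.mean(dim=-1)                      # shape (4,)  -> plays the role of exp_avg_sq_row
col = sq.mean(dim=-2)                      # shape (6,)  -> plays the role of exp_avg_sq_col

# Reconstruction used by _approx_sq_grad: outer product of the two factors.
r_factor = (row / row.mean(dim=-1, keepdim=True)).rsqrt().unsqueeze(-1)   # (4, 1)
c_factor = col.unsqueeze(-2).rsqrt()                                       # (1, 6)
approx_rsqrt_v = r_factor * c_factor        # approximates 1 / sqrt(E[grad ** 2])

update = approx_rsqrt_v * grad              # the factored update before clipping and lr scaling
print(update.shape)                         # same shape as the parameter, (4, 6)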
- """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group["params"]: - if p.grad is None: - continue - grad = p.grad.data - if grad.dtype in {torch.float16, torch.bfloat16}: - grad = grad.float() - if grad.is_sparse: - raise RuntimeError("Adafactor does not support sparse gradients.") - - state = self.state[p] - grad_shape = grad.shape - - factored, use_first_moment = self._get_options(group, grad_shape) - # State Initialization - if len(state) == 0: - state["step"] = 0 - - if use_first_moment: - # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like(grad) - if factored: - state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad) - state["exp_avg_sq_col"] = torch.zeros( - grad_shape[:-2] + grad_shape[-1:] - ).to(grad) - else: - state["exp_avg_sq"] = torch.zeros_like(grad) - - state["RMS"] = 0 - else: - if use_first_moment: - state["exp_avg"] = state["exp_avg"].to(grad) - if factored: - state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad) - state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad) - else: - state["exp_avg_sq"] = state["exp_avg_sq"].to(grad) - - p_data_fp32 = p.data - if p.data.dtype in {torch.float16, torch.bfloat16}: - p_data_fp32 = p_data_fp32.float() - - state["step"] += 1 - state["RMS"] = self._rms(p_data_fp32) - group["lr"] = self._get_lr(group, state) - - beta2t = 1.0 - math.pow(state["step"], group["decay_rate"]) - update = (grad ** 2) + group["eps"][0] - if factored: - exp_avg_sq_row = state["exp_avg_sq_row"] - exp_avg_sq_col = state["exp_avg_sq_col"] - - exp_avg_sq_row.mul_(beta2t).add_( - update.mean(dim=-1), alpha=1.0 - beta2t - ) - exp_avg_sq_col.mul_(beta2t).add_( - update.mean(dim=-2), alpha=1.0 - beta2t - ) - - # Approximation of exponential moving average of square of gradient - update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col) - update.mul_(grad) - else: - exp_avg_sq = state["exp_avg_sq"] - - exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t) - update = exp_avg_sq.rsqrt().mul_(grad) - - update.div_( - (self._rms(update) / group["clip_threshold"]).clamp_(min=1.0) - ) - update.mul_(group["lr"]) - - if use_first_moment: - exp_avg = state["exp_avg"] - exp_avg.mul_(group["beta1"]).add_(update, alpha=1 - group["beta1"]) - update = exp_avg - - if group["weight_decay"] != 0: - p_data_fp32.add_( - p_data_fp32, alpha=-group["weight_decay"] * group["lr"] - ) - - p_data_fp32.add_(-update) - - if p.data.dtype in {torch.float16, torch.bfloat16}: - p.data.copy_(p_data_fp32) - - return loss diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adagrad.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adagrad.py deleted file mode 100644 index a79b6c39da2dd73286b8a12122d87365d0e526f7..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adagrad.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch.optim - -from . 
import LegacyFairseqOptimizer, register_optimizer - - -@register_optimizer("adagrad") -class Adagrad(LegacyFairseqOptimizer): - def __init__(self, args, params): - super().__init__(args) - self._optimizer = torch.optim.Adagrad(params, **self.optimizer_config) - - @staticmethod - def add_args(parser): - """Add optimizer-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', - help='weight decay') - # fmt: on - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. - """ - return { - "lr": self.args.lr[0], - "weight_decay": self.args.weight_decay, - } - - @property - def supports_flat_params(self): - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adam.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adam.py deleted file mode 100644 index c26cac1fb2a73bf34326416ae502f4b4c141111c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adam.py +++ /dev/null @@ -1,211 +0,0 @@ - -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import math -from dataclasses import dataclass, field -from typing import List - -import torch -import torch.distributed as dist -import torch.optim -from fairseq.dataclass import FairseqDataclass -from fairseq.optim import FairseqOptimizer, register_optimizer -from fairseq.optim.fused_adam import get_fused_adam_class -from omegaconf import II - - -logger = logging.getLogger(__name__) - - -@dataclass -class FairseqAdamConfig(FairseqDataclass): - adam_betas: str = field( - default="(0.9, 0.999)", metadata={"help": "betas for Adam optimizer"} - ) - adam_eps: float = field( - default=1e-8, metadata={"help": "epsilon for Adam optimizer"} - ) - weight_decay: float = field(default=0.0, metadata={"help": "weight decay"}) - use_old_adam: bool = field( - default=False, metadata={"help": "Use fairseq.optim.adam.Adam"} - ) - # TODO common vars below in parent - tpu: bool = II("params.common.tpu") - lr: List[float] = II("params.optimization.lr") - - -@register_optimizer("adam", dataclass=FairseqAdamConfig) -class FairseqAdam(FairseqOptimizer): - """Adam optimizer for fairseq. - - Important note: this optimizer corresponds to the "AdamW" variant of - Adam in its weight decay behavior. As such, it is most closely - analogous to torch.optim.AdamW from PyTorch. 
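The note above that FairseqAdam follows the "AdamW" weight-decay behavior is the key difference from classic L2 regularization: the decay is applied directly to the parameters, scaled by the learning rate, instead of being folded into the gradient before the moment estimates. A minimal sketch contrasting the two on plain tensors; in real Adam the grad term below is replaced by the bias-corrected moment ratio, which is precisely where the two behaviors diverge:

import torch

lr, wd = 1e-3, 0.01
p = torch.randn(5)
grad = torch.randn(5)

# Classic L2: decay enters the gradient, so it also passes through the
# moment estimates and gets rescaled by the adaptive step size.
p_l2 = p - lr * (grad + wd * p)

# Decoupled (AdamW-style), as described for FairseqAdam above: the decay
# is applied straight to the weights, independent of the adaptive scaling.
p_adamw = p - lr * grad - lr * wd * p

print(p_l2, p_adamw)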
- """ - - def __init__(self, args, params): - super().__init__(args) - fused_adam_cls = get_fused_adam_class() - use_fused_adam = ( - not getattr(args, "use_old_adam", False) - and fused_adam_cls is not None - and torch.cuda.is_available() - ) - if getattr(args, "tpu", False): - # on TPUs we use the Adam defined here, since it - # automatically casts gradients to FP32 - self._optimizer = Adam(params, **self.optimizer_config) - elif use_fused_adam: - logger.info("using FusedAdam") - self._optimizer = fused_adam_cls(params, **self.optimizer_config) - else: - self._optimizer = Adam(params, **self.optimizer_config) - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. - """ - return { - "lr": self.args.lr[0], - "betas": eval(self.args.adam_betas), - "eps": self.args.adam_eps, - "weight_decay": self.args.weight_decay, - } - - def average_params(self): - """Reduce Params is only used during BMUF distributed training.""" - state_dict = self.optimizer.state_dict() - total_gpus = float(dist.get_world_size()) - - for _, value in state_dict["state"].items(): - value["exp_avg"] /= total_gpus - value["exp_avg_sq"] /= total_gpus - dist.all_reduce(value["exp_avg"], op=dist.ReduceOp.SUM) - dist.all_reduce(value["exp_avg_sq"], op=dist.ReduceOp.SUM) - - -class Adam(torch.optim.Optimizer): - """Implements Adam algorithm. - - This implementation is modified from torch.optim.Adam based on: - `Fixed Weight Decay Regularization in Adam` - (see https://arxiv.org/abs/1711.05101) - - It has been proposed in `Adam: A Method for Stochastic Optimization`_. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__( - self, - params, - lr=1e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, - amsgrad=False, - ): - defaults = dict( - lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad - ) - super(Adam, self).__init__(params, defaults) - - @property - def supports_memory_efficient_fp16(self): - return True - - @property - def supports_flat_params(self): - return True - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group["params"]: - if p.grad is None: - continue - grad = p.grad.data - if grad.dtype in {torch.float16, torch.bfloat16}: - grad = grad.float() - if grad.is_sparse: - raise RuntimeError( - "Adam does not support sparse gradients, please consider SparseAdam instead" - ) - amsgrad = group.get("amsgrad", False) - - p_data_fp32 = p.data - if p.data.dtype in {torch.float16, torch.bfloat16}: - p_data_fp32 = p_data_fp32.float() - - state = self.state[p] - - # State initialization - if len(state) == 0: - state["step"] = 0 - # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like(p_data_fp32) - # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) - if amsgrad: - # Maintains max of all exp. moving avg. of sq. grad. values - state["max_exp_avg_sq"] = torch.zeros_like(p_data_fp32) - else: - state["exp_avg"] = state["exp_avg"].to(p_data_fp32) - state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32) - if amsgrad: - state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to( - p_data_fp32 - ) - - exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] - if amsgrad: - max_exp_avg_sq = state["max_exp_avg_sq"] - beta1, beta2 = group["betas"] - - state["step"] += 1 - bias_correction1 = 1 - beta1 ** state["step"] - bias_correction2 = 1 - beta2 ** state["step"] - step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1 - # Decay the first and second moment running average coefficient - p_data_fp32, exp_avg, exp_avg_sq = \ - torch.npu_bert_apply_adam(p_data_fp32, exp_avg, - exp_avg_sq, group["lr"], beta1, - beta2, group["eps"], grad, 0.0, 0.0, - group["weight_decay"], step_size, adam_mode=1) - - if p.data.dtype in {torch.float16, torch.bfloat16}: - p.data.copy_(p_data_fp32) - return loss - diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adamax.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adamax.py deleted file mode 100644 index 577a68816692710b008f9088bdf7fa45868c64a0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/adamax.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -import torch.optim - -from . 
import LegacyFairseqOptimizer, register_optimizer - - -@register_optimizer("adamax") -class FairseqAdamax(LegacyFairseqOptimizer): - def __init__(self, args, params): - super().__init__(args) - self._optimizer = Adamax(params, **self.optimizer_config) - - @staticmethod - def add_args(parser): - """Add optimizer-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--adamax-betas', default='(0.9, 0.999)', metavar='B', - help='betas for Adam optimizer') - parser.add_argument('--adamax-eps', type=float, default=1e-8, metavar='D', - help='epsilon for Adam optimizer') - parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', - help='weight decay') - parser.add_argument('--no-bias-correction', default=False, action='store_true', - help='disable bias correction') - # fmt: on - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. - """ - return { - "lr": self.args.lr[0], - "betas": eval(self.args.adamax_betas), - "eps": self.args.adamax_eps, - "weight_decay": self.args.weight_decay, - "bias_correction": not self.args.no_bias_correction, - } - - -class Adamax(torch.optim.Optimizer): - """Implements Adamax algorithm (a variant of Adam based on infinity norm). - - It has been proposed in `Adam: A Method for Stochastic Optimization`__. - - Compared to the version in PyTorch, this version implements a fix for weight decay. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - lr (float, optional): learning rate (default: 2e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square - eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - bias_correction (bool, optional): enable bias correction (default: True) - - __ https://arxiv.org/abs/1412.6980 - """ - - def __init__( - self, - params, - lr=2e-3, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, - bias_correction=True, - ): - if not 0.0 <= lr: - raise ValueError("Invalid learning rate: {}".format(lr)) - if not 0.0 <= eps: - raise ValueError("Invalid epsilon value: {}".format(eps)) - if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) - if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) - if not 0.0 <= weight_decay: - raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - - defaults = dict( - lr=lr, - betas=betas, - eps=eps, - weight_decay=weight_decay, - bias_correction=bias_correction, - ) - super(Adamax, self).__init__(params, defaults) - - @property - def supports_memory_efficient_fp16(self): - return True - - @property - def supports_flat_params(self): - return True - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. 
- """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - for p in group["params"]: - if p.grad is None: - continue - grad = p.grad.data.float() - if grad.is_sparse: - raise RuntimeError("Adamax does not support sparse gradients") - - p_data_fp32 = p.data - if p.data.dtype in {torch.float16, torch.bfloat16}: - p_data_fp32 = p_data_fp32.float() - - state = self.state[p] - - # State initialization - if len(state) == 0: - state["step"] = 0 - state["exp_avg"] = torch.zeros_like(p_data_fp32) - state["exp_inf"] = torch.zeros_like(p_data_fp32) - else: - state["exp_avg"] = state["exp_avg"].to(p_data_fp32) - state["exp_inf"] = state["exp_inf"].to(p_data_fp32) - - exp_avg, exp_inf = state["exp_avg"], state["exp_inf"] - beta1, beta2 = group["betas"] - eps = group["eps"] - - state["step"] += 1 - - # Update biased first moment estimate. - exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - - # Update the exponentially weighted infinity norm. - torch.max( - exp_inf.mul_(beta2), - grad.abs_(), - out=exp_inf, - ) - - step_size = group["lr"] - if group["bias_correction"]: - bias_correction = 1 - beta1 ** state["step"] - step_size /= bias_correction - - if group["weight_decay"] != 0: - p_data_fp32.add_( - p_data_fp32, alpha=-group["weight_decay"] * group["lr"] - ) - - p_data_fp32.addcdiv_(exp_avg, exp_inf.add(eps), value=-step_size) - - if p.data.dtype in {torch.float16, torch.bfloat16}: - p.data.copy_(p_data_fp32) - - return loss diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/bmuf.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/bmuf.py deleted file mode 100644 index 3312f81103490f8414487e54c6e0a4a5b2aa5de3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/bmuf.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from dataclasses import dataclass, field - -import torch -import torch.distributed as dist -from fairseq.dataclass import FairseqDataclass -from fairseq.dataclass.utils import gen_parser_from_dataclass -from fairseq.optim.fairseq_optimizer import FairseqOptimizer -from omegaconf import II - - -@dataclass -class FairseqBMUFConfig(FairseqDataclass): - block_lr: float = field( - default=1, metadata={"help": "block learning rate for bmuf"} - ) - block_momentum: float = field( - default=0.875, metadata={"help": "block momentum for bmuf"} - ) - global_sync_iter: int = field( - default=50, metadata={"help": "Iteration for syncing global model"} - ) - warmup_iterations: int = field( - default=500, metadata={"help": "warmup iterations for model to broadcast"} - ) - use_nbm: bool = field( - default=False, - metadata={"help": "Specify whether you want to use classical BM / Nesterov BM"}, - ) - average_sync: bool = field( - default=False, - metadata={ - "help": "Specify whether you want to average the local momentum after each sync" - }, - ) - distributed_world_size: int = II( - "params.distributed_training.distributed_world_size" - ) - - -class FairseqBMUF(FairseqOptimizer): - """ - Implements incremental block distributed data parallelism similar to - https://ieeexplore.ieee.org/document/7472805 - - Paper title: Scalable training of deep learning machines by incremental - block training with intra-block parallel optimization and blockwise - model-update filtering - """ - - def __init__(self, args, optimizer): - - super().__init__(args) - self._optimizer = optimizer - self._num_updates = 0 - self.sync_iter = self.args.global_sync_iter - self.block_momentum = self.args.block_momentum - self.block_lr = self.args.block_lr - self._reset_local_data() - self.warmup_iteration = self.args.warmup_iterations - self.use_nbm = self.args.use_nbm - self.initial_state = self._optimizer.state_dict() - self.average_sync = self.args.average_sync - self.world_size = self.args.distributed_world_size - - @staticmethod - def add_args(parser): - """Add optimizer-specific arguments to the parser.""" - gen_parser_from_dataclass(parser, FairseqBMUFConfig()) - - @property - def optimizer(self): - return self._optimizer.optimizer - - @property - def optimizer_config(self): - return self._optimizer.optimizer_config - - def get_lr(self): - return self._optimizer.get_lr() - - def set_lr(self, lr): - self._optimizer.set_lr(lr) - - def state_dict(self): - return self._optimizer.state_dict() - - def load_state_dict(self, state_dict, optimizer_overrides=None): - self._optimizer.load_state_dict(state_dict, optimizer_overrides) - self.initial_state = self._optimizer.state_dict() - - def multiply_grads(self, c): - """Multiplies grads by a constant *c*.""" - self._optimizer.multiply_grads(c) - - def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): - """Clips gradient norm.""" - return self._optimizer.clip_grad_norm(max_norm, aggregate_norm_fn) - - def average_params(self): - self._optimizer.average_params() - - def _block_sync(self): - if self.world_size <= 1: - return - # Update the global model using local models from all GPUs - # (Step-1) Calculate grad between previously synced model and - # currrent local model - if self.block_momentum != 0: - self._calc_grad() - - # (Step-2) Average gradient from all GPUs - self._avg_grad_from_all_gpus() - - # (Step-3) Calculate global momentum and update the global model - if self.block_momentum != 0: - self._update_global_model() - - # (Step-4) Average local optimizer params - if 
self.average_sync: - self.average_params() - - def _is_warmup_end(self): - # Check whether train iterations is equal to warmup iter - if self.get_num_updates() == self.warmup_iteration: - return True - return False - - def _is_bmuf_iter(self): - # Check whether train iterations is equal to bmuf sync iter - if (self.get_num_updates() > self.warmup_iteration) and ( - self.get_num_updates() % self.sync_iter == 0 - ): - return True - return False - - def _warmup_sync(self, root_rank=0): - if self.world_size <= 1: - return - # Broadcast the local model to all gpus - for param in self.params: - dist.broadcast(param.data, src=root_rank) - - # Update local optimizer state - if self.average_sync: - self._optimizer.average_params() - else: - self._optimizer.load_state_dict(self.initial_state) - - self._reset_local_data() - - def step(self, closure=None): - """Performs a single optimization step.""" - self._optimizer.step(closure) - self.set_num_updates(self.get_num_updates() + 1) - if self._is_warmup_end(): - self._warmup_sync() - elif self._is_bmuf_iter(): - self._block_sync() - - def zero_grad(self): - """Clears the gradients of all optimized parameters.""" - self._optimizer.zero_grad() - - def get_num_updates(self): - """Get the number of parameters updates.""" - return self._num_updates - - def set_num_updates(self, num_updates): - """Set the number of parameters updates.""" - self._num_updates = num_updates - - @torch.no_grad() - def _reset_local_data(self): - # (Step-0) Initialize global momentum parameters and store global copy on each gpu - self.global_params = [torch.zeros_like(p.data) for p in self.params] - self.smoothed_grads = [p.data.new_zeros(p.data.size()) for p in self.params] - self.grads = [p.data.new_zeros(p.data.size()) for p in self.params] - - # saving the global model locally for calculating gradient during bmuf sync - for param, global_param in zip(self.params, self.global_params): - global_param.copy_(param.data) - - @torch.no_grad() - def _calc_grad(self): - # global_params is basically the global copy from the previously finished - # synchronisation. param.data is local parameter after block_sync_freq - # for the local gpu. so grad is difference between previously synced - # model and currrent local model. - for index, (param, global_param) in enumerate( - zip(self.params, self.global_params) - ): - self.grads[index] = global_param - param.data - - def _avg_grad_from_all_gpus(self): - for index, param in enumerate(self.params): - sync_para = param.data if self.block_momentum == 0 else self.grads[index] - sync_para /= float(dist.get_world_size()) - dist.all_reduce(sync_para, op=dist.ReduceOp.SUM) - - @torch.no_grad() - def _update_global_model(self): - for index, (param, global_param, smoothed_grad, grad) in enumerate( - zip( - self.params, - self.global_params, - self.smoothed_grads, - # all gpus would share the same value of smoothed_grad, since it is - # always computed on synchronized gradients. - self.grads, - ) - ): - # global_param is basically last syncrhornized parameter. though - # smoothed_grad is local, all processes will have same value of - # smoothed_grad and hence param is globally synchronized copy. 
- # smoothed_grad(t) = BM * smoothed_grad(t-1) + BM_lr * grad(t) - smoothed_grad = self.block_momentum * smoothed_grad + self.block_lr * grad - param.data.copy_(global_param - smoothed_grad) - - # A Nesterov momentum here is to do a partial weight update before - # calculating the gradient - if self.use_nbm: - param.data.copy_(param.data - self.block_momentum * smoothed_grad) - - # backup for the next synchronization. - self.smoothed_grads[index] = smoothed_grad - global_param.copy_(param.data) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/dynamic_loss_scaler.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/dynamic_loss_scaler.py deleted file mode 100644 index f8c957bb591190b16c0a30864f5b4bf9d5d0d9dc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/dynamic_loss_scaler.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -class DynamicLossScaler(object): - def __init__( - self, - init_scale=2.0 ** 15, - scale_factor=2.0, - scale_window=2000, - tolerance=0.05, - threshold=None, - min_loss_scale=1e-4, - ): - self.loss_scale = init_scale - self.scale_factor = scale_factor - self.scale_window = scale_window - self.tolerance = tolerance - self.threshold = threshold - self._iter = 0 - self._last_overflow_iter = -1 - self._last_rescale_iter = -1 - self._overflows_since_rescale = 0 - self.min_loss_scale = min_loss_scale - - def scale(self, outputs): - return self.loss_scale * outputs - - def update(self): - if (self._iter - self._last_overflow_iter) % self.scale_window == 0 and self.loss_scale <= 16: - self.loss_scale *= self.scale_factor - self._last_rescale_iter = self._iter - self._iter += 1 - - def _decrease_loss_scale(self): - self.loss_scale /= self.scale_factor - if self.threshold is not None: - self.loss_scale = max(self.loss_scale, self.threshold) - - def check_overflow(self, grad_norm): - # detect inf and nan - if grad_norm == float("inf") or grad_norm != grad_norm: - # overflow has occured - prev_scale = self.loss_scale - iter_since_rescale = self._iter - self._last_rescale_iter - - self._last_overflow_iter = self._iter - self._overflows_since_rescale += 1 - pct_overflow = self._overflows_since_rescale / float(iter_since_rescale) - if pct_overflow >= self.tolerance: - self._decrease_loss_scale() - self._last_rescale_iter = self._iter - self._overflows_since_rescale = 0 - - if self.loss_scale <= self.min_loss_scale: - # Use FloatingPointError as an uncommon error that parent - # functions can safely catch to stop training. - self.loss_scale = prev_scale - raise FloatingPointError( - ( - "Minimum loss scale reached ({}). Your loss is probably exploding. " - "Try lowering the learning rate, using gradient clipping or " - "increasing the batch size." 
- ).format(self.min_loss_scale) - ) - - self._iter += 1 - raise OverflowError("setting loss scale to: " + str(self.loss_scale)) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fairseq_optimizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fairseq_optimizer.py deleted file mode 100644 index 8a10399a8b413c4188dbe7c8d51f5353348d835b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fairseq_optimizer.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -from fairseq import utils -from fairseq.dataclass.utils import gen_parser_from_dataclass - - -class FairseqOptimizer(object): - def __init__(self, args): - super().__init__() - self.args = args - - @classmethod - def add_args(cls, parser): - """Add optimizer-specific arguments to the parser.""" - dc = getattr(cls, "__dataclass", None) - if dc is not None: - gen_parser_from_dataclass(parser, dc()) - - @property - def optimizer(self): - """Return a torch.optim.optimizer.Optimizer instance.""" - if not hasattr(self, "_optimizer"): - raise NotImplementedError - if not isinstance(self._optimizer, torch.optim.Optimizer): - raise ValueError("_optimizer must be an instance of torch.optim.Optimizer") - return self._optimizer - - @optimizer.setter - def optimizer(self, optimizer): - """Reset optimizer instance.""" - if not hasattr(self, "_optimizer"): - raise NotImplementedError - if not isinstance(self._optimizer, torch.optim.Optimizer): - raise ValueError("_optimizer must be an instance of torch.optim.Optimizer") - self._optimizer = optimizer - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. - """ - raise NotImplementedError - - @property - def params(self): - """Return an iterable of the parameters held by the optimizer.""" - for param_group in self.param_groups: - for p in param_group["params"]: - yield p - - @property - def param_groups(self): - return self.optimizer.param_groups - - def __getstate__(self): - return self._optimizer.__getstate__() - - def get_lr(self): - """Return the current learning rate.""" - return self.param_groups[0]["lr"] - - def set_lr(self, lr): - """Set the learning rate.""" - for param_group in self.param_groups: - param_group["lr"] = lr - - def state_dict(self): - """Return the optimizer's state dict.""" - return self.optimizer.state_dict() - - def load_state_dict(self, state_dict, optimizer_overrides=None): - """Load an optimizer state dict. - - In general we should prefer the configuration of the existing optimizer - instance (e.g., learning rate) over that found in the state_dict. This - allows us to resume training from a checkpoint using a new set of - optimizer args. - """ - self.optimizer.load_state_dict(state_dict) - - if optimizer_overrides is not None and len(optimizer_overrides) > 0: - # override learning rate, momentum, etc. with latest values - for group in self.param_groups: - group.update(optimizer_overrides) - - def backward(self, loss): - """Computes the sum of gradients of the given tensor w.r.t. 
graph leaves.""" - loss.backward() - - def multiply_grads(self, c): - """Multiplies grads by a constant *c*.""" - for p in self.params: - if p.grad is not None: - p.grad.data.mul_(c) - - def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): - """Clips gradient norm.""" - return utils.clip_grad_norm_(self.params, max_norm, aggregate_norm_fn) - - def step(self, closure=None, scale=1.0): - """Performs a single optimization step.""" - if self.supports_step_with_scale: - self.optimizer.step(closure, scale=scale) - else: - if scale != 1.0: - self.multiply_grads(1.0 / scale) - self.optimizer.step(closure) - - def zero_grad(self): - """Clears the gradients of all optimized parameters.""" - for p in self.params: - p.grad = None - self.optimizer.zero_grad() - - @property - def supports_memory_efficient_fp16(self): - if hasattr(self.optimizer, "supports_memory_efficient_fp16"): - return self.optimizer.supports_memory_efficient_fp16 - return False - - @property - def supports_step_with_scale(self): - if hasattr(self.optimizer, "supports_step_with_scale"): - return self.optimizer.supports_step_with_scale - return False - - @property - def supports_flat_params(self): - """ - Whether the optimizer supports collapsing of the model - parameters/gradients into a single contiguous Tensor. - """ - if hasattr(self.optimizer, "supports_flat_params"): - return self.optimizer.supports_flat_params - return False - - def average_params(self): - pass - - -class LegacyFairseqOptimizer(FairseqOptimizer): - def __init__(self, args): - self.args = args diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fp16_optimizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fp16_optimizer.py deleted file mode 100644 index dd2d7155d8f929aced9dd41510d8c7668baa2482..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fp16_optimizer.py +++ /dev/null @@ -1,492 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
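FairseqOptimizer.step above accepts a scale argument so that FP16 wrappers can hand it already-scaled gradients: if the wrapped torch optimizer advertises supports_step_with_scale the division is delegated to it, otherwise the gradients are first multiplied by 1/scale via multiply_grads and a normal step is taken. A rough, self-contained sketch of that fallback path follows; scaled_step and the SGD usage are illustrative stand-ins, not fairseq code.

import torch

def scaled_step(optimizer, parameters, scale=1.0):
    """Mirror of the fallback branch: unscale the grads, then take a plain step."""
    if scale != 1.0:
        for p in parameters:
            if p.grad is not None:
                p.grad.data.mul_(1.0 / scale)
    optimizer.step()

params = [torch.nn.Parameter(torch.randn(4))]
opt = torch.optim.SGD(params, lr=0.1)

loss_scale = 128.0
loss = (params[0] ** 2).sum() * loss_scale  # stand-in for a loss scaled before backward()
loss.backward()
scaled_step(opt, params, scale=loss_scale)  # equivalent to stepping on unscaled grads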
- -from collections import defaultdict -from itertools import chain - -import torch -from fairseq import optim, utils - -from .dynamic_loss_scaler import DynamicLossScaler -from apex.contrib.combine_tensors import combine_npu - -class _FP16OptimizerMixin(object): - def __init__(self, *args, **kwargs): - # forward __init__ call to the next class in mro(method resolution order) - super().__init__(*args, **kwargs) - self._multiply_factor = 1.0 - - @property - def has_flat_params(self): - return torch.is_tensor(self.fp32_params) or ( - isinstance(self.fp32_params, dict) - and all(torch.is_tensor(t) for t in self.fp32_params.values()) - ) - - @classmethod - @torch.no_grad() - def build_fp32_params(cls, args, params, flatten=True): - # create FP32 copy of parameters and grads - cls.fp32_tmp_params = dict() - if flatten: - is_pipeline_parallel = getattr( - args, "pipeline_model_parallel", False - ) and getattr(args, "distributed_no_spawn", False) - - devices = [torch.npu.current_device()] - if is_pipeline_parallel: - devices = list(set(args.pipeline_devices)) - fp32_params = {} - for device in devices: - cls.fp32_tmp_params[device] = [] - for idx, p in enumerate(params): - cls.fp32_tmp_params[device].append(p.data.float()) - fp32_params[device] = combine_npu(cls.fp32_tmp_params[device]) - - fp32_params[device].grad = torch.zeros_like(fp32_params[device].data) - return fp32_params - else: - fp32_params = [] - for p in params: - p32 = torch.nn.Parameter(p.data.float()) - p32.grad = torch.zeros_like(p32.data) - fp32_params.append(p32) - return fp32_params - - def state_dict(self): - """Return the optimizer's state dict.""" - state_dict = self.fp32_optimizer.state_dict() - if self.scaler is not None: - state_dict["loss_scale"] = self.scaler.loss_scale - return state_dict - - def load_state_dict(self, state_dict, optimizer_overrides=None): - """Load an optimizer state dict. - - In general we should prefer the configuration of the existing optimizer - instance (e.g., learning rate) over that found in the state_dict. This - allows us to resume training from a checkpoint using a new set of - optimizer args. - """ - if "loss_scale" in state_dict and self.scaler is not None: - self.scaler.loss_scale = state_dict["loss_scale"] - self.fp32_optimizer.load_state_dict(state_dict, optimizer_overrides) - - def backward(self, loss): - """Computes the sum of gradients of the given tensor w.r.t. graph leaves. - - Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this - function additionally dynamically scales the loss to avoid gradient - underflow. 
- """ - if self.scaler is not None: - loss = self.scaler.scale(loss) - loss.backward() - self._needs_sync = True - - def _sync_fp16_grads_to_fp32(self): - if self._needs_sync: - # copy FP16 grads to FP32 - if self.has_flat_params: - devices = list(self.fp32_params.keys()) - if not self.combine_grads_flag: - device_params_dict = defaultdict(list) - for p in self.fp16_params: - if p.requires_grad: - device_params_dict[p.device.index].append(p) - for device in devices: - device_params = device_params_dict[device] - fp16_grads_list = [] - fp16_params_list = [] - for p in device_params: - grad_data = ( - p.grad.data - if p.grad is not None - else p.data.new_zeros(p.data.shape) - ) - fp16_grads_list.append(grad_data) - fp16_params_list.append(p.data) - self.fp16_tmp_grads[device] = combine_npu(fp16_grads_list) - self.fp16_tmp_params[device] = combine_npu(fp16_params_list) - self.fp32_params[device].grad.data.copy_(self.fp16_tmp_grads[device]) - self.combine_grads_flag = True - else: - for device in devices: - self.fp32_params[device].grad.data.copy_(self.fp16_tmp_grads[device]) - - else: - for p, p32 in zip(self.fp16_params, self.fp32_params): - if not p.requires_grad: - continue - if p.grad is not None: - p32.grad.data.copy_(p.grad.data) - else: - p32.grad = torch.zeros_like(p.data, dtype=torch.float) - - self._needs_sync = False - - def _sync_fp32_params_to_fp16(self): - # copy FP32 params back into FP16 model - if self.has_flat_params: - devices = list(self.fp32_params.keys()) - if not self.combine_grads_flag: - device_params_dict = defaultdict(list) - for p in self.fp16_params: - device_params_dict[p.device.index].append(p) - for device in devices: - device_params = device_params_dict[device] - for idx, p in enumerate(device_params): - p.data.copy_(self.fp32_tmp_params[device][idx].data) - else: - for device in devices: - self.fp16_tmp_params[device].data.copy_(self.fp32_params[device]) - else: - for p, p32 in zip(self.fp16_params, self.fp32_params): - if not p.requires_grad: - continue - p.data.copy_(p32.data) - - def _unscale_grads(self): - self._sync_fp16_grads_to_fp32() - if self._multiply_factor != 1.0: - self.fp32_optimizer.multiply_grads(self._multiply_factor) - self._multiply_factor = 1.0 - - def multiply_grads(self, c): - """Multiplies grads by a constant ``c``.""" - self._multiply_factor *= c - - def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): - """Clips gradient norm and updates dynamic loss scaler.""" - self._sync_fp16_grads_to_fp32() - - grad_norm = self._multiply_factor * self.fp32_optimizer.clip_grad_norm( - 0, aggregate_norm_fn - ) - - if self.scaler is not None: - if max_norm > 0.0: - if grad_norm > max_norm: - self._multiply_factor *= max_norm / grad_norm - - elif max_norm > 0.0: - clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1) - self._multiply_factor *= clip_coef - - return grad_norm - - def step(self, closure=None): - """Performs a single optimization step.""" - self._sync_fp16_grads_to_fp32() - - if getattr(self, "supports_step_with_scale", False): - self.fp32_optimizer.step(closure, scale=(1.0 / self._multiply_factor)) - else: - self._unscale_grads() - self.fp32_optimizer.step(closure) - - if self.scaler is not None: - self.scaler.update() - - self._sync_fp32_params_to_fp16() - - def zero_grad(self): - """Clears the gradients of all optimized parameters.""" - if self.combine_grads_flag: - devices = list(self.fp16_tmp_grads.keys()) - for device in devices: - self.fp16_tmp_grads[device].zero_() - else: - for p in self.fp16_params: - if p.grad is not 
None: - p.grad.zero_() - if self.has_flat_params: - if torch.is_tensor(self.fp32_params): - self.fp32_params.grad.zero_() - elif isinstance(self.fp32_params, dict): - for fp32_params in self.fp32_params.values(): - fp32_params.grad.zero_() - else: - raise ("self.fp32_params must be a tensor or dict") - else: - for p32 in self.fp32_params: - if p32.grad is None: - p32.grad.zero_() - self._needs_sync = False - - if self.scaler is not None: - self._multiply_factor = 1.0 / float(self.scaler.loss_scale) - - -class FP16Optimizer(_FP16OptimizerMixin, optim.FairseqOptimizer): - """ - Wrap an *optimizer* to support FP16 (mixed precision) training. - """ - - def __init__(self, args, params, fp32_optimizer, fp32_params): - super().__init__(args) - self.fp16_params = params - self.fp32_optimizer = fp32_optimizer - self.fp32_params = fp32_params - self.fp16_tmp_grads = dict() - self.fp16_tmp_params = dict() - self.combine_grads_flag = False - - if getattr(args, "fp16_scale_window", None) is None: - if len(args.update_freq) > 1: - raise ValueError( - "--fp16-scale-window must be given explicitly when using a " - "custom --update-freq schedule" - ) - data_parallel_size = int( - args.distributed_world_size / args.model_parallel_size - ) - scale_window = int(2 ** 14 / data_parallel_size / args.update_freq[0]) - else: - scale_window = args.fp16_scale_window - - if not getattr(args, "bf16", False): - self.scaler = DynamicLossScaler( - init_scale=0.00390625, - scale_window=scale_window, - tolerance=args.fp16_scale_tolerance, - threshold=args.threshold_loss_scale, - min_loss_scale=args.min_loss_scale, - ) - else: - # disable loss scaling for bfloat16 - self.scaler = None - - @classmethod - def build_optimizer(cls, args, params): - """ - Args: - args (argparse.Namespace): fairseq args - params (iterable): iterable of parameters to optimize - """ - flatten = not getattr(args, "fp16_no_flatten_grads", False) - if getattr(args, "bf16", False): - flatten = False # mixed precision is faster on TPUs without flat grads - fp32_params = cls.build_fp32_params(args, params, flatten=flatten) - if flatten: - fp32_optimizer = optim.build_optimizer(args, [fp32_params]) - else: - fp32_optimizer = optim.build_optimizer(args, fp32_params) - if flatten and not fp32_optimizer.supports_flat_params: - raise RuntimeError( - "chosen optimizer does not support flat params, " - "please set --fp16-no-flatten-grads" - ) - return cls(args, params, fp32_optimizer, fp32_params) - - @property - def optimizer(self): - return self.fp32_optimizer.optimizer - - @optimizer.setter - def optimizer(self, optimizer): - self.fp32_optimizer.optimizer = optimizer - - @property - def optimizer_config(self): - return self.fp32_optimizer.optimizer_config - - def get_lr(self): - return self.fp32_optimizer.get_lr() - - def set_lr(self, lr): - self.fp32_optimizer.set_lr(lr) - - -class _MemoryEfficientFP16OptimizerMixin(object): - def __init__(self, *args, **kwargs): - # forward __init__ call to the next class in MRO (method resolution order) - super().__init__(*args, **kwargs) - self._multiply_factor = 1.0 - - @property - def has_flat_params(self): - return False - - def state_dict(self): - """Return the optimizer's state dict.""" - state_dict = self.wrapped_optimizer.state_dict() - if self.scaler is not None: - state_dict["loss_scale"] = self.scaler.loss_scale - return state_dict - - def load_state_dict(self, state_dict, optimizer_overrides=None): - """Load an optimizer state dict. 
- - In general we should prefer the configuration of the existing optimizer - instance (e.g., learning rate) over that found in the state_dict. This - allows us to resume training from a checkpoint using a new set of - optimizer args. - """ - if "loss_scale" in state_dict and self.scaler is not None: - self.scaler.loss_scale = state_dict["loss_scale"] - - self.wrapped_optimizer.load_state_dict(state_dict, optimizer_overrides) - - # Hack: PyTorch automatically casts the optimizer state to match the - # type of the current parameters. But with --memory-efficient-fp16 the - # params are FP16 while the optimizer state is FP32 and we don't want - # to cast. A workaround is to manually copy back the original state - # after the optimizer has been loaded. - if not getattr(self.optimizer, "disable_mem_eff_fp16_loading_hack", False): - groups = self.optimizer.param_groups - saved_groups = state_dict["param_groups"] - id_map = { - old_id: p - for old_id, p in zip( - chain(*(g["params"] for g in saved_groups)), - chain(*(g["params"] for g in groups)), - ) - } - for k, v in state_dict["state"].items(): - if k in id_map: - param = id_map[k] - self.optimizer.state[param] = v - - def backward(self, loss): - """Computes the sum of gradients of the given tensor w.r.t. graph leaves. - - Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this - function additionally dynamically scales the loss to avoid gradient - underflow. - """ - if self.scaler is not None: - loss = self.scaler.scale(loss) - loss.backward() - - def _unscale_grads(self): - if self._multiply_factor != 1.0: - self.wrapped_optimizer.multiply_grads(self._multiply_factor) - self._multiply_factor = 1.0 - - def multiply_grads(self, c): - """Multiplies grads by a constant *c*.""" - self._multiply_factor *= c - - def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): - """Clips gradient norm and updates dynamic loss scaler.""" - max_norm = float(max_norm) - grad_norm = self._multiply_factor * self.wrapped_optimizer.clip_grad_norm( - 0, aggregate_norm_fn - ) - - if self.scaler is not None: - grad_norm_cpu = float(grad_norm) - if grad_norm_cpu > max_norm > 0.0: - self._multiply_factor *= max_norm / grad_norm_cpu - - # detect overflow and adjust loss scale - self.scaler.check_overflow(grad_norm_cpu) - elif max_norm > 0.0: - clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1) - self._multiply_factor *= clip_coef - - return grad_norm - - def step(self, closure=None): - """Performs a single optimization step.""" - if getattr(self, "supports_step_with_scale", False): - # NOTE(msb) optimizer divides by scale factor - self.wrapped_optimizer.step(closure, scale=(1.0 / self._multiply_factor)) - else: - self._unscale_grads() - self.wrapped_optimizer.step(closure) - - if self.scaler is not None: - self.scaler.update() - - def zero_grad(self): - """Clears the gradients of all optimized parameters.""" - self.wrapped_optimizer.zero_grad() - if self.scaler is not None: - self._multiply_factor = 1.0 / float(self.scaler.loss_scale) - else: - self._multiply_factor = 1.0 - - -class MemoryEfficientFP16Optimizer( - _MemoryEfficientFP16OptimizerMixin, optim.FairseqOptimizer -): - """ - Wrap an *optimizer* to support FP16 (mixed precision) training. - - Compared to :class:`fairseq.optim.FP16Optimizer`, this version does not - maintain an FP32 copy of the model. We instead expect the optimizer to - convert the gradients to FP32 internally and sync the results back to the - FP16 model params. 
This significantly reduces memory usage but slightly - increases the time spent in the optimizer. - - Since this wrapper depends on specific functionality in the wrapped - optimizer (i.e., on-the-fly conversion of grads to FP32), only certain - optimizers can be wrapped. This is determined by the - *supports_memory_efficient_fp16* property. - """ - - def __init__(self, args, params, optimizer): - if not optimizer.supports_memory_efficient_fp16: - raise ValueError( - "Unsupported optimizer: {}".format(optimizer.__class__.__name__) - ) - - super().__init__(args) - self.wrapped_optimizer = optimizer - - if getattr(args, "fp16_scale_window", None) is None: - if len(args.update_freq) > 1: - raise ValueError( - "--fp16-scale-window must be given explicitly when using a " - "custom --update-freq schedule" - ) - data_parallel_size = int( - args.distributed_world_size / args.model_parallel_size - ) - scale_window = 2 ** 14 / data_parallel_size / args.update_freq[0] - else: - scale_window = args.fp16_scale_window - - if not getattr(args, "bf16", False): - self.scaler = DynamicLossScaler( - init_scale=args.fp16_init_scale, - scale_window=scale_window, - tolerance=args.fp16_scale_tolerance, - threshold=args.threshold_loss_scale, - min_loss_scale=args.min_loss_scale, - ) - else: - # disable loss scaling for bfloat16 - self.scaler = None - - @classmethod - def build_optimizer(cls, args, params): - """ - Args: - args (argparse.Namespace): fairseq args - params (iterable): iterable of parameters to optimize - """ - fp16_optimizer = optim.build_optimizer(args, params) - return cls(args, params, fp16_optimizer) - - @property - def optimizer(self): - return self.wrapped_optimizer.optimizer - - @optimizer.setter - def optimizer(self, optimizer): - self.wrapped_optimizer.optimizer = optimizer - - @property - def optimizer_config(self): - return self.wrapped_optimizer.optimizer_config - - def get_lr(self): - return self.wrapped_optimizer.get_lr() - - def set_lr(self, lr): - self.wrapped_optimizer.set_lr(lr) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fused_adam.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fused_adam.py deleted file mode 100644 index 1780f9c0bbaedf67e033febf0a332672d8912785..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fused_adam.py +++ /dev/null @@ -1,348 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import types - -import torch - - -def get_fused_adam_class(): - """ - Look for the FusedAdam optimizer from apex. We first try to load the - "contrib" interface, which is a bit faster than the main interface, - but is technically deprecated. - """ - try: - # The "deprecated" interface in recent versions of apex is a bit - # faster than the main interface, since we don't use the apex - # optimizer. This can be installed by passing the - # `--deprecated_fused_adam` option when building apex. 
- global fused_adam_cuda - import importlib - - fused_adam_cuda = importlib.import_module("fused_adam_cuda") - return FusedAdamV1 - except ImportError: - try: - # fallback to the newer interface - from apex.optimizers import FusedAdam as _FusedAdam # noqa - from apex.multi_tensor_apply import multi_tensor_applier - - if multi_tensor_applier.available: - return FusedAdamV2 - except ImportError: - pass - return None - - -class FusedAdamV1(torch.optim.Optimizer): - """ - Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via - ``python setup.py install --cuda_ext --cpp_ext``. - - It has been proposed in `Adam: A Method for Stochastic Optimization`_. - - Compared to the original version in Apex, the fairseq version casts grads - and params to FP32 internally to support ``--memory-efficient-fp16``. - - Arguments: - params (iterable): iterable of parameters to optimize or dicts defining - parameter groups. - lr (float, optional): learning rate. (default: 1e-3) - betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square. (default: (0.9, 0.999)) - eps (float, optional): term added to the denominator to improve - numerical stability. (default: 1e-8) - weight_decay (float, optional): weight decay (L2 penalty) (default: 0) - amsgrad (boolean, optional): whether to use the AMSGrad variant of this - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) NOT SUPPORTED in FusedAdam! - eps_inside_sqrt (boolean, optional): in the 'update parameters' step, - adds eps to the bias-corrected second moment estimate before - evaluating square root instead of adding it to the square root of - second moment estimate as in the original paper. (default: False) - .. _Adam: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ - """ - - def __init__( - self, - params, - lr=1e-3, - bias_correction=True, - betas=(0.9, 0.999), - eps=1e-8, - eps_inside_sqrt=False, - weight_decay=0.0, - max_grad_norm=0.0, - amsgrad=False, - ): - global fused_adam_cuda - import importlib - - fused_adam_cuda = importlib.import_module("fused_adam_cuda") - - if amsgrad: - raise RuntimeError("FusedAdam does not support the AMSGrad variant.") - defaults = { - "lr": lr, - "bias_correction": bias_correction, - "betas": betas, - "eps": eps, - "weight_decay": weight_decay, - "max_grad_norm": max_grad_norm, - } - super().__init__(params, defaults) - self.eps_mode = 0 if eps_inside_sqrt else 1 - - @property - def supports_memory_efficient_fp16(self): - return True - - @property - def supports_flat_params(self): - return True - - @property - def supports_step_with_scale(self): - return True - - def step(self, closure=None, grads=None, scale=1.0, grad_norms=None): - """Performs a single optimization step. - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - grads (list of tensors, optional): weight gradient to use for the - optimizer update. If gradients have type torch.half, parameters - are expected to be in type torch.float. (default: None) - output params (list of tensors, optional): A reduced precision copy - of the updated weights written out in addition to the regular - updated weights. Have to be of same type as gradients. (default: None) - scale (float, optional): factor to divide gradient tensor values - by before applying to weights. 
(default: 1) - """ - loss = None - if closure is not None: - loss = closure() - - if grads is None: - grads_group = [None] * len(self.param_groups) - # backward compatibility - # assuming a list/generator of parameter means single group - elif isinstance(grads, types.GeneratorType): - grads_group = [grads] - elif type(grads[0]) != list: - grads_group = [grads] - else: - grads_group = grads - - if grad_norms is None: - grad_norms = [None] * len(self.param_groups) - - for group, grads_this_group, grad_norm in zip( - self.param_groups, grads_group, grad_norms - ): - if grads_this_group is None: - grads_this_group = [None] * len(group["params"]) - - # compute combined scale factor for this group - combined_scale = scale - if group.get("max_grad_norm", 0) > 0: - # norm is in fact norm*scale - clip = ((grad_norm / scale) + 1e-6) / group["max_grad_norm"] - if clip > 1: - combined_scale = clip * scale - - bias_correction = 1 if group.get("bias_correction", 1) else 0 - - for p, grad in zip(group["params"], grads_this_group): - # note: p.grad should not ever be set for correct - # operation of mixed precision optimizer that sometimes - # sends None gradients - if p.grad is None and grad is None: - continue - if grad is None: - grad = p.grad.data - if grad.is_sparse: - raise RuntimeError( - "FusedAdam does not support sparse gradients, " - "please consider SparseAdam instead" - ) - - p_data_fp32 = p.data.float() - - state = self.state[p] - - # State initialization - if len(state) == 0: - state["step"] = 0 - # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like(p_data_fp32) - # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) - else: - state["exp_avg"] = state["exp_avg"].to(p_data_fp32) - state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32) - - exp_avg = state["exp_avg"] - exp_avg_sq = state["exp_avg_sq"] - beta1, beta2 = group["betas"] - - state["step"] += 1 - - out_p = p.data - with torch.cuda.device(p.device): - fused_adam_cuda.adam( - p_data_fp32, - out_p, - exp_avg, - exp_avg_sq, - grad, - group["lr"], - beta1, - beta2, - group["eps"], - combined_scale, - state["step"], - self.eps_mode, - bias_correction, - group["weight_decay"], - ) - - return loss - - -try: - from apex.optimizers import FusedAdam - from apex.multi_tensor_apply import multi_tensor_applier - - class FusedAdamV2(FusedAdam): - """ - Compared to the original version in Apex, the fairseq version casts grads - and params to FP32 internally to support ``--memory-efficient-fp16``. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if not hasattr(self, "multi_tensor_adam"): - raise Exception( - "Apex installation is outdated. Please install an updated version of apex." 
- ) - - @property - def supports_memory_efficient_fp16(self): - return True - - @property - def supports_flat_params(self): - return True - - def step( - self, - closure=None, - grads=None, - output_params=None, - scale=None, - grad_norms=None, - ): - """Performs a single optimization step.""" - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - bias_correction = 1 if group["bias_correction"] else 0 - beta1, beta2 = group["betas"] - - # assume same step across group now to simplify things - # per parameter step can be easily support by making it tensor, or pass list into kernel - if "step" in group: - group["step"] += 1 - else: - group["step"] = 1 - - # create lists for multi-tensor apply - g_16, p_16, orig_p_16, m_16, v_16 = [], [], [], [], [] - g_32, p_32, m_32, v_32 = [], [], [], [] - - for p in group["params"]: - if p.grad is None: - continue - if p.grad.data.is_sparse: - raise RuntimeError( - "FusedAdam does not support sparse gradients, " - "please consider SparseAdam instead" - ) - - state = self.state[p] - # State initialization - if len(state) == 0: - # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like(p.data, dtype=torch.float) - # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like( - p.data, dtype=torch.float - ) - else: - state["exp_avg"] = state["exp_avg"].to( - device=p.data.device, dtype=torch.float - ) - state["exp_avg_sq"] = state["exp_avg_sq"].to( - device=p.data.device, dtype=torch.float - ) - - if p.dtype == torch.float16: - g_16.append(p.grad.data.float()) - p_16.append(p.data.float()) - orig_p_16.append(p.data) - m_16.append(state["exp_avg"]) - v_16.append(state["exp_avg_sq"]) - elif p.dtype == torch.float32: - g_32.append(p.grad.data) - p_32.append(p.data) - m_32.append(state["exp_avg"]) - v_32.append(state["exp_avg_sq"]) - else: - raise RuntimeError("FusedAdam only support fp16 and fp32.") - - with torch.cuda.device(p.device): - if len(g_16) > 0: - multi_tensor_applier( - self.multi_tensor_adam, - self._dummy_overflow_buf, - [g_16, p_16, m_16, v_16], - group["lr"], - beta1, - beta2, - group["eps"], - group["step"], - self.adam_w_mode, - bias_correction, - group["weight_decay"], - ) - for orig_p, p in zip(orig_p_16, p_16): - orig_p.copy_(p.data) - if len(g_32) > 0: - multi_tensor_applier( - self.multi_tensor_adam, - self._dummy_overflow_buf, - [g_32, p_32, m_32, v_32], - group["lr"], - beta1, - beta2, - group["eps"], - group["step"], - self.adam_w_mode, - bias_correction, - group["weight_decay"], - ) - - return loss - - -except ImportError: - pass diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fused_lamb.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fused_lamb.py deleted file mode 100644 index f4f2bdb0c6c65f7758509b6d4d2f2c48cb6e8b4f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/fused_lamb.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
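In FusedAdamV1.step above, loss scaling and gradient clipping are folded into a single divisor that the fused kernel applies to every gradient: grad_norm arrives pre-multiplied by the loss scale, so the clip coefficient is computed on grad_norm / scale and, when it exceeds 1, the combined scale becomes clip * scale. A small stand-alone sketch of that arithmetic, with the helper name combined_scale chosen only for illustration:

def combined_scale(scale, grad_norm, max_grad_norm, eps=1e-6):
    """Divisor applied to raw (still loss-scaled) gradients before the Adam update."""
    if grad_norm is None or max_grad_norm <= 0:
        return scale
    clip = ((grad_norm / scale) + eps) / max_grad_norm
    return clip * scale if clip > 1 else scale

# loss scale 128, true grad norm 8.0 (so the reported norm is 8 * 128 = 1024),
# clipping at max_grad_norm=1.0: gradients are divided by roughly 8 * 128,
# which clips the unscaled update back to about unit norm.
print(combined_scale(128.0, 1024.0, 1.0))  # ~1024.0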
- -from fairseq.optim import LegacyFairseqOptimizer, register_optimizer - - -@register_optimizer("lamb") -class FairseqLAMB(LegacyFairseqOptimizer): - """LAMB optimizer.""" - - def __init__(self, args, params): - super().__init__(args) - try: - from apex.optimizers import FusedLAMB - - self._optimizer = FusedLAMB(params, **self.optimizer_config) - except ImportError: - raise ImportError("Please install apex to use LAMB optimizer") - - @staticmethod - def add_args(parser): - """Add optimizer-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--lamb-betas', default='(0.9, 0.999)', metavar='B', - help='betas for LAMB optimizer') - parser.add_argument('--lamb-eps', type=float, default=1e-8, metavar='D', - help='epsilon for LAMB optimizer') - parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', - help='weight decay') - # fmt: on - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. - """ - return { - "lr": self.args.lr[0], - "betas": eval(self.args.lamb_betas), - "eps": self.args.lamb_eps, - "weight_decay": self.args.weight_decay, - } - - @property - def supports_flat_params(self): - return False diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/__init__.py deleted file mode 100644 index 7b72c257840fccc7baf2dcc6390e1106a12d3d2f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -"""isort:skip_file""" - -import importlib -import os -from argparse import Namespace -from typing import Union - -from fairseq import registry -from fairseq.optim.lr_scheduler.fairseq_lr_scheduler import ( # noqa - FairseqLRScheduler, - LegacyFairseqLRScheduler, -) -from omegaconf import DictConfig - - -( - build_lr_scheduler_, - register_lr_scheduler, - LR_SCHEDULER_REGISTRY, - LR_SCHEDULER_DATACLASS_REGISTRY, -) = registry.setup_registry( - "--lr-scheduler", base_class=FairseqLRScheduler, default="fixed" -) - - -def build_lr_scheduler(lr_scheduler_cfg: Union[DictConfig, Namespace], optimizer): - return build_lr_scheduler_(lr_scheduler_cfg, optimizer) - - -# automatically import any Python files in the optim/lr_scheduler/ directory -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - file_name = file[: file.find(".py")] - importlib.import_module("fairseq.optim.lr_scheduler." 
+ file_name) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py deleted file mode 100644 index 98d557504febcf4d4713f21227079d2b9dcf52d1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/cosine_lr_scheduler.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from dataclasses import dataclass, field -from typing import List - -from fairseq.dataclass import FairseqDataclass -from omegaconf import II - -from . import FairseqLRScheduler, register_lr_scheduler - - -@dataclass -class CosineConfig(FairseqDataclass): - warmup_updates: int = field( - default=0, - metadata={"help": "warmup the learning rate linearly for the first N updates"}, - ) - warmup_init_lr: float = field( - default=-1, - metadata={ - "help": "initial learning rate during warmup phase; default is args.lr" - }, - ) - max_lr: float = field( - default=1.0, metadata={"help": "max learning rate, must be more than args.lr"} - ) - t_mult: float = field( - default=1.0, metadata={"help": "factor to grow the length of each period"} - ) - lr_period_updates: float = field( - default=-1, metadata={"help": "initial number of updates per period"} - ) - lr_shrink: float = field( - default=0.1, metadata={"help": "shrink factor for annealing"} - ) - # TODO common var for parent class - lr: List[float] = II("params.optimization.lr") - max_update: int = II("params.optimization.max_update") - - -@register_lr_scheduler("cosine", dataclass=CosineConfig) -class CosineSchedule(FairseqLRScheduler): - """Assign LR based on a cyclical schedule that follows the cosine function. - - See https://arxiv.org/pdf/1608.03983.pdf for details. - - We also support a warmup phase where we linearly increase the learning rate - from some initial learning rate (``--warmup-init-lr``) until the configured - max learning rate (``--max-lr``). - - During warmup:: - - lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates) - lr = lrs[update_num] - - After warmup:: - - lr = lr_min + 0.5*(lr_max - lr_min)*(1 + cos(t_curr / t_i)) - - where ``t_curr`` is current percentage of updates within the current period - range and ``t_i`` is the current period range, which is scaled by ``t_mul`` - after every iteration. - """ - - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - if len(args.lr) > 1: - raise ValueError( - "Cannot use a fixed learning rate schedule with cosine." - " Consider --lr-scheduler=fixed instead." 
- ) - - warmup_end_lr = args.max_lr - if args.warmup_init_lr < 0: - args.warmup_init_lr = args.lr[0] - - self.min_lr = args.lr[0] - self.max_lr = args.max_lr - - assert self.max_lr > self.min_lr, "max_lr must be more than lr" - - self.t_mult = args.t_mult - self.period = args.lr_period_updates - - if self.period <= 0: - assert ( - args.max_update >= 0 - ), "Either --max_update or --lr-period-updates must be set" - self.period = args.max_update - args.warmup_updates - - if args.warmup_updates > 0: - # linearly warmup for the first args.warmup_updates - self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates - else: - self.lr_step = 1 - - self.warmup_updates = args.warmup_updates - self.lr_shrink = args.lr_shrink - - # initial learning rate - self.lr = args.warmup_init_lr - self.optimizer.set_lr(self.lr) - - def step(self, epoch, val_loss=None): - """Update the learning rate at the end of the given epoch.""" - super().step(epoch, val_loss) - # we don't change the learning rate at epoch boundaries - return self.optimizer.get_lr() - - def step_update(self, num_updates): - """Update the learning rate after each update.""" - if num_updates < self.args.warmup_updates: - self.lr = self.args.warmup_init_lr + num_updates * self.lr_step - else: - curr_updates = num_updates - self.args.warmup_updates - if self.t_mult != 1: - i = math.floor( - math.log( - 1 - curr_updates / self.period * (1 - self.t_mult), self.t_mult - ) - ) - t_i = self.t_mult ** i * self.period - t_curr = ( - curr_updates - - (1 - self.t_mult ** i) / (1 - self.t_mult) * self.period - ) - else: - i = math.floor(curr_updates / self.period) - t_i = self.period - t_curr = curr_updates - (self.period * i) - - lr_shrink = self.lr_shrink ** i - min_lr = self.min_lr * lr_shrink - max_lr = self.max_lr * lr_shrink - - self.lr = min_lr + 0.5 * (max_lr - min_lr) * ( - 1 + math.cos(math.pi * t_curr / t_i) - ) - - self.optimizer.set_lr(self.lr) - return self.lr diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py deleted file mode 100644 index e025374893ad1971c8fcaff45e61a2f6deb0378d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from argparse import Namespace - -from fairseq.dataclass.utils import gen_parser_from_dataclass - -from .. 
import FairseqOptimizer - - -class FairseqLRScheduler(object): - def __init__(self, args, optimizer): - super().__init__() - if not isinstance(optimizer, FairseqOptimizer): - raise ValueError("optimizer must be an instance of FairseqOptimizer") - self.args = args - self.optimizer = optimizer - self.best = None - - @classmethod - def add_args(cls, parser): - """Add arguments to the parser for this LR scheduler.""" - dc = getattr(cls, "__dataclass", None) - if dc is not None: - gen_parser_from_dataclass(parser, dc()) - - def state_dict(self): - """Return the LR scheduler state dict.""" - return {"best": self.best} - - def load_state_dict(self, state_dict): - """Load an LR scheduler state dict.""" - self.best = state_dict["best"] - - def step_begin_epoch(self, epoch): - """Update the learning rate at the beginning of the given epoch.""" - pass - - def step(self, epoch, val_loss=None): - """Update the learning rate at the end of the given epoch.""" - if val_loss is not None: - if self.best is None: - self.best = val_loss - else: - self.best = min(self.best, val_loss) - - def step_update(self, num_updates): - """Update the learning rate after each update.""" - return self.optimizer.get_lr() - - -class LegacyFairseqLRScheduler(FairseqLRScheduler): - def __init__(self, args: Namespace, optimizer): - if not isinstance(optimizer, FairseqOptimizer): - raise ValueError("optimizer must be an instance of FairseqOptimizer") - self.args = args - self.optimizer = optimizer - self.best = None diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/fixed_schedule.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/fixed_schedule.py deleted file mode 100644 index e91ba86f8ccdf9141a562885903db3dc454bdab3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/fixed_schedule.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . 
import LegacyFairseqLRScheduler, register_lr_scheduler - - -@register_lr_scheduler("fixed") -class FixedSchedule(LegacyFairseqLRScheduler): - """Decay the LR on a fixed schedule.""" - - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - - # set defaults - args.warmup_updates = getattr(args, "warmup_updates", 0) or 0 - - self.lr = args.lr[0] - if args.warmup_updates > 0: - self.warmup_factor = 1.0 / args.warmup_updates - else: - self.warmup_factor = 1 - - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - # fmt: off - parser.add_argument('--force-anneal', '--fa', type=int, metavar='N', - help='force annealing at specified epoch (epochs start at 1)') - parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', - help='shrink factor for annealing, lr_new = (lr * lr_shrink)') - parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', - help='warmup the learning rate linearly for the first N updates') - # fmt: on - - def state_dict(self): - return {"lr": self.lr} - - def load_state_dict(self, state_dict): - if "lr" in state_dict: - self.lr = state_dict["lr"] - - def get_next_lr(self, epoch): - lrs = self.args.lr - if self.args.force_anneal is None or epoch < self.args.force_anneal: - # use fixed LR schedule - next_lr = lrs[min(epoch - 1, len(lrs) - 1)] - else: - # annneal based on lr_shrink - next_lr = lrs[-1] * self.args.lr_shrink ** ( - epoch + 1 - self.args.force_anneal - ) - return next_lr - - def step_begin_epoch(self, epoch): - """Update the learning rate at the beginning of the given epoch.""" - self.lr = self.get_next_lr(epoch) - self.optimizer.set_lr(self.warmup_factor * self.lr) - return self.optimizer.get_lr() - - def step_update(self, num_updates): - """Update the learning rate after each update.""" - if self.args.warmup_updates > 0 and num_updates < self.args.warmup_updates: - self.warmup_factor = (num_updates + 1) / float(self.args.warmup_updates) - self.optimizer.set_lr(self.warmup_factor * self.lr) - else: - self.optimizer.set_lr(self.lr) - return self.optimizer.get_lr() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py deleted file mode 100644 index d27261ad482f8714f546919b6c899cd1da129ab1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from dataclasses import dataclass, field -from typing import List - -from fairseq.dataclass import FairseqDataclass -from omegaconf import II - -from . 
import FairseqLRScheduler, register_lr_scheduler - - -@dataclass -class InverseSquareRootScheduleConfig(FairseqDataclass): - warmup_updates: int = field( - default=4000, - metadata={"help": "warmup the learning rate linearly for the first N updates"}, - ) - warmup_init_lr: float = field( - default=-1, - metadata={ - "help": "initial learning rate during warmup phase; default is args.lr" - }, - ) - # TODO common vars at parent class - lr: List[float] = II("params.optimization.lr") - - -@register_lr_scheduler("inverse_sqrt", dataclass=InverseSquareRootScheduleConfig) -class InverseSquareRootSchedule(FairseqLRScheduler): - """Decay the LR based on the inverse square root of the update number. - - We also support a warmup phase where we linearly increase the learning rate - from some initial learning rate (``--warmup-init-lr``) until the configured - learning rate (``--lr``). Thereafter we decay proportional to the number of - updates, with a decay factor set to align with the configured learning rate. - - During warmup:: - - lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates) - lr = lrs[update_num] - - After warmup:: - - decay_factor = args.lr * sqrt(args.warmup_updates) - lr = decay_factor / sqrt(update_num) - """ - - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - if len(args.lr) > 1: - raise ValueError( - "Cannot use a fixed learning rate schedule with inverse_sqrt." - " Consider --lr-scheduler=fixed instead." - ) - warmup_end_lr = args.lr[0] - if args.warmup_init_lr < 0: - args.warmup_init_lr = 0 if args.warmup_updates > 0 else warmup_end_lr - - # linearly warmup for the first args.warmup_updates - self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates - - # then, decay prop. to the inverse square root of the update number - self.decay_factor = warmup_end_lr * args.warmup_updates ** 0.5 - - # initial learning rate - self.lr = args.warmup_init_lr - self.optimizer.set_lr(self.lr) - - def step(self, epoch, val_loss=None): - """Update the learning rate at the end of the given epoch.""" - super().step(epoch, val_loss) - # we don't change the learning rate at epoch boundaries - return self.optimizer.get_lr() - - def step_update(self, num_updates): - """Update the learning rate after each update.""" - if num_updates < self.args.warmup_updates: - self.lr = self.args.warmup_init_lr + num_updates * self.lr_step - else: - self.lr = self.decay_factor * num_updates ** -0.5 - self.optimizer.set_lr(self.lr) - return self.lr diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py deleted file mode 100644 index 63adc740a9a910e7b85b05cd7263140101714603..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from . 
import LegacyFairseqLRScheduler, register_lr_scheduler - - -@register_lr_scheduler("polynomial_decay") -class PolynomialDecaySchedule(LegacyFairseqLRScheduler): - """Decay the LR on a fixed schedule.""" - - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - - # set defaults - args.warmup_updates = getattr(args, "warmup_updates", 0) or 0 - - self.lr = args.lr[0] - if args.warmup_updates > 0: - self.warmup_factor = 1.0 / args.warmup_updates - else: - self.warmup_factor = 1 - self.end_learning_rate = args.end_learning_rate - self.total_num_update = args.total_num_update - self.power = args.power - self.optimizer.set_lr(self.warmup_factor * self.lr) - - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - parser.add_argument( - "--force-anneal", - "--fa", - type=int, - metavar="N", - help="force annealing at specified epoch", - ) - parser.add_argument( - "--warmup-updates", - default=0, - type=int, - metavar="N", - help="warmup the learning rate linearly for the first N updates", - ) - parser.add_argument("--end-learning-rate", default=0.0, type=float) - parser.add_argument("--power", default=1.0, type=float) - parser.add_argument("--total-num-update", default=1000000, type=int) - - def get_next_lr(self, epoch): - lrs = self.args.lr - if self.args.force_anneal is None or epoch < self.args.force_anneal: - # use fixed LR schedule - next_lr = lrs[min(epoch, len(lrs) - 1)] - else: - # annneal based on lr_shrink - next_lr = self.optimizer.get_lr() - return next_lr - - def step_begin_epoch(self, epoch): - """Update the learning rate at the beginning of the given epoch.""" - self.lr = self.get_next_lr(epoch) - self.optimizer.set_lr(self.warmup_factor * self.lr) - return self.optimizer.get_lr() - - def step_update(self, num_updates): - """Update the learning rate after each update.""" - if self.args.warmup_updates > 0 and num_updates <= self.args.warmup_updates: - self.warmup_factor = num_updates / float(self.args.warmup_updates) - lr = self.warmup_factor * self.lr - elif num_updates >= self.total_num_update: - lr = self.end_learning_rate - else: - warmup = self.args.warmup_updates - lr_range = self.lr - self.end_learning_rate - pct_remaining = 1 - (num_updates - warmup) / ( - self.total_num_update - warmup - ) - lr = lr_range * pct_remaining ** (self.power) + self.end_learning_rate - self.optimizer.set_lr(lr) - return self.optimizer.get_lr() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py deleted file mode 100644 index 82bb36efe9bd56a872cad76f90fc1490796dd704..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch.optim.lr_scheduler - -from . import LegacyFairseqLRScheduler, register_lr_scheduler - - -@register_lr_scheduler("reduce_lr_on_plateau") -class ReduceLROnPlateau(LegacyFairseqLRScheduler): - """ - Decay the LR by a factor every time the validation loss plateaus. 
- Also comes with optional warmup phase, where we linearly increase - the learning rate from some initial learning rate - (``--warmup-init-lr``) until the configured learning rate - (``--lr``). Thereafter the lr is adjusted according to original - reduce_on_plateau scheme. - - During warmup:: - - lrs = torch.linspace( - args.warmup_init_lr, args.lr, args.warmup_updates - ) - lr = lrs[update_num] - """ - - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - if len(args.lr) > 1: - raise ValueError( - "Cannot use a fixed learning rate schedule with reduce_lr_on_plateau." - " Consider --lr-scheduler=fixed instead." - ) - self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( - self.optimizer.optimizer, - patience=args.lr_patience, - factor=args.lr_shrink, - mode="max" if args.maximize_best_checkpoint_metric else "min", - threshold=args.lr_threshold, - ) - warmup_end_lr = args.lr[0] - # if no warm up, sets initial lr to be args.lr[0] - if args.warmup_init_lr < 0: - args.warmup_init_lr = 0 if args.warmup_updates > 0 else warmup_end_lr - - # linearly warmup for the first args.warmup_updates - if args.warmup_updates > 0: - self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates - - # this flag is either set from arg when no warm up, or set by - # step_update() when warmup finishes - self.warmup_end = True if args.warmup_updates <= 0 else False - - # initial learning rate - # this self.lr is used only during init and/or warm up period - self.lr = args.warmup_init_lr - self.optimizer.set_lr(self.lr) - - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - # fmt: off - parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', - help='shrink factor for annealing, lr_new = (lr * lr_shrink)') - parser.add_argument('--lr-threshold', default=1e-4, type=float, metavar='LT', - help='threshold for measuring the new optimum, ' - 'to only focus on significant changes') - parser.add_argument('--lr-patience', default=0, type=int, - help='number of epochs with no improvement after which ' - 'learning rate will be reduced') - parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', - help='warmup the learning rate linearly for the first N updates') - parser.add_argument('--warmup-init-lr', default=-1, type=float, metavar='LR', - help='initial learning rate during warmup phase; default is args.lr') - # fmt: on - - def state_dict(self): - """Return the LR scheduler state dict.""" - return { - "best": self.lr_scheduler.best, - "last_epoch": self.lr_scheduler.last_epoch, - } - - def load_state_dict(self, state_dict): - """Load an LR scheduler state dict.""" - self.lr_scheduler.best = state_dict["best"] - if "last_epoch" in state_dict: - self.lr_scheduler.last_epoch = state_dict["last_epoch"] - - def step(self, epoch, val_loss=None): - """ - Update the learning rate at the end of the given epoch if warmup - finishes otherwise no update of lr on epoch boundaries - """ - if val_loss is not None and self.warmup_end is True: - self.lr_scheduler.step(val_loss) - else: - self.lr_scheduler.last_epoch = epoch - return self.optimizer.get_lr() - - def step_update(self, num_updates): - """ - Update the learning rate after each update.""" - # if there is warmup - if self.args.warmup_updates > 0: - if num_updates <= self.args.warmup_updates: - self.lr = self.args.warmup_init_lr + num_updates * self.lr_step - self.optimizer.set_lr(self.lr) - else: - if self.warmup_end is False: - self.warmup_end = True - 
# else do nothing - return self.optimizer.get_lr() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py deleted file mode 100644 index c573237f1193b4fa1eaca3c94c2da7260c39b441..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/tri_stage_lr_scheduler.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -from . import LegacyFairseqLRScheduler, register_lr_scheduler - - -@register_lr_scheduler("tri_stage") -class TriStageLRSchedule(LegacyFairseqLRScheduler): - """Tristage learning rate schedulr - - Implement the learning rate scheduler in https://arxiv.org/pdf/1904.08779.pdf - - Similar to inverse_squre_root scheduler, but tri_stage learning rate employs - three stages LR scheduling: - - - warmup stage, starting from `lr` * `init_lr_scale`, linearly - increased to `lr` in `warmup_steps` iterations - - - hold stage, after `warmup_steps`, keep the LR as `lr` for `hold_steps` - iterations - - - decay stage, after hold stage, decay LR exponetially to - `lr` * `final_lr_scale` in `decay_steps`; - after that LR is keep as `final_lr_scale` * `lr` - - During warmup:: - - init_lr = args.init_lr_scale * args.lr - lrs = torch.linspace(init_lr, args.lr, args.warmup_steps) - lr = lrs[update_num] - - During hold:: - - lr = args.lr - - During decay:: - - decay_factor = - math.log(args.final_lr_scale) / args.decay_steps - lr = args.lr * exp(- (update_num - warmup_steps - decay_steps) * decay_factor) - - After that:: - - lr = args.lr * args.final_lr_scale - """ - - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - if len(args.lr) > 1: - raise ValueError( - "Cannot use a fixed learning rate schedule with tri-stage lr." - " Consider --lr-scheduler=fixed instead." - ) - - # calculate LR at each point - self.peak_lr = args.lr[0] - self.init_lr = args.init_lr_scale * args.lr[0] - self.final_lr = args.final_lr_scale * args.lr[0] - - # remember the steps at each stage - self.warmup_steps = args.warmup_steps - self.hold_steps = args.hold_steps - self.decay_steps = args.decay_steps - - self.warmup_rate = ( - (self.peak_lr - self.init_lr) / self.warmup_steps - if self.warmup_steps != 0 - else 0 - ) - self.decay_factor = -math.log(args.final_lr_scale) / args.decay_steps - - # initial learning rate - self.lr = self.init_lr - self.optimizer.set_lr(self.lr) - - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - # fmt: off - parser.add_argument( - '--warmup-steps', - default=4000, - type=int, - metavar='N', - help='warmup the learning rate linearly for the first N updates' - ) - parser.add_argument( - '--hold-steps', - default=20000, - type=int, - metavar='N', - help='steps in hold stage.' 
- ) - parser.add_argument( - '--decay-steps', - default=60000, - type=int, - metavar='N', - help='steps in decay stages' - ) - parser.add_argument( - '--init-lr-scale', - default=0.01, - type=float, - help=""" - initial learning rate scale during warmup phase; default is 0.01""") - parser.add_argument( - '--final-lr-scale', - default=0.01, - type=float, - help="final learning rate scale; default to 0.01" - ) - # fmt: on - - def _decide_stage(self, update_step): - """ - return stage, and the corresponding steps within the current stage - """ - if update_step < self.warmup_steps: - # warmup state - return 0, update_step - - offset = self.warmup_steps - - if update_step < offset + self.hold_steps: - # hold stage - return 1, update_step - offset - - offset += self.hold_steps - - if update_step <= offset + self.decay_steps: - # decay stage - return 2, update_step - offset - - offset += self.decay_steps - - # still here ? constant lr stage - return 3, update_step - offset - - def step(self, epoch, val_loss=None): - """Update the learning rate at the end of the given epoch.""" - super().step(epoch, val_loss) - # we don't change the learning rate at epoch boundaries - return self.optimizer.get_lr() - - def step_update(self, num_updates): - """Update the learning rate after each update.""" - stage, steps_in_stage = self._decide_stage(num_updates) - if stage == 0: - self.lr = self.init_lr + self.warmup_rate * steps_in_stage - elif stage == 1: - self.lr = self.peak_lr - elif stage == 2: - self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) - elif stage == 3: - self.lr = self.final_lr - else: - raise ValueError("Undefined stage") - - self.optimizer.set_lr(self.lr) - - return self.lr diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py deleted file mode 100644 index 0f3193f2b8342b1d9bf3a79a7a9d1d42eeddd1c4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math - -from . import LegacyFairseqLRScheduler, register_lr_scheduler - - -@register_lr_scheduler("triangular") -class TriangularSchedule(LegacyFairseqLRScheduler): - """Assign LR based on a triangular cyclical schedule. - - See https://arxiv.org/pdf/1506.01186.pdf for details. - """ - - def __init__(self, args, optimizer): - super().__init__(args, optimizer) - if len(args.lr) > 1: - raise ValueError( - "Cannot use a fixed learning rate schedule with triangular." - " Consider --lr-scheduler=fixed instead." 
- ) - - lr = args.lr[0] - - assert args.max_lr > lr, "max_lr must be more than lr" - self.min_lr = lr - self.max_lr = args.max_lr - self.stepsize = args.lr_period_updates // 2 - self.lr_shrink = args.lr_shrink - self.shrink_min = args.shrink_min - - # initial learning rate - self.lr = self.min_lr - self.optimizer.set_lr(self.lr) - - @staticmethod - def add_args(parser): - """Add arguments to the parser for this LR scheduler.""" - # fmt: off - parser.add_argument('--max-lr', required=True, type=float, metavar='LR', - help='max learning rate, must be more than args.lr') - parser.add_argument('--lr-period-updates', default=5000, type=float, metavar='LR', - help='initial number of updates per period (cycle length)') - parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', - help='shrink factor for annealing') - parser.add_argument('--shrink-min', action='store_true', - help='if set, also shrinks min lr') - # fmt: on - - def step(self, epoch, val_loss=None): - """Update the learning rate at the end of the given epoch.""" - super().step(epoch, val_loss) - # we don't change the learning rate at epoch boundaries - return self.optimizer.get_lr() - - def step_update(self, num_updates): - """Update the learning rate after each update.""" - cycle = math.floor(num_updates / (2 * self.stepsize)) - - lr_shrink = self.lr_shrink ** cycle - max_lr = self.max_lr * lr_shrink - if self.shrink_min: - min_lr = self.min_lr * lr_shrink - else: - min_lr = self.min_lr - - x = abs(num_updates / self.stepsize - 2 * (cycle + 1) + 1) - self.lr = min_lr + (max_lr - min_lr) * max(0, (1 - x)) - - self.optimizer.set_lr(self.lr) - return self.lr diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/nag.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/nag.py deleted file mode 100644 index 58d2f3560fc142a9e10fbeca60c063d261a2085f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/nag.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from dataclasses import dataclass, field -from typing import List - -import torch -from fairseq.dataclass import FairseqDataclass -from omegaconf import II -from torch.optim.optimizer import Optimizer, required - -from . import FairseqOptimizer, register_optimizer - - -@dataclass -class FairseqNAGConfig(FairseqDataclass): - momentum: float = field(default=0.99, metadata={"help": "momentum factor"}) - weight_decay: float = field(default=0.0, metadata={"help": "weight decay"}) - # TODO common vars in parent class - lr: List[float] = II("params.optimization.lr") - - -@register_optimizer("nag", dataclass=FairseqNAGConfig) -class FairseqNAG(FairseqOptimizer): - def __init__(self, args, params): - super().__init__(args) - self._optimizer = NAG(params, **self.optimizer_config) - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. 
- """ - return { - "lr": self.args.lr[0], - "momentum": self.args.momentum, - "weight_decay": self.args.weight_decay, - } - - -class NAG(Optimizer): - def __init__(self, params, lr=required, momentum=0, weight_decay=0): - defaults = dict(lr=lr, lr_old=lr, momentum=momentum, weight_decay=weight_decay) - super(NAG, self).__init__(params, defaults) - - @property - def supports_memory_efficient_fp16(self): - return True - - @property - def supports_flat_params(self): - return True - - def step(self, closure=None): - """Performs a single optimization step. - - Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. - """ - loss = None - if closure is not None: - loss = closure() - - for group in self.param_groups: - weight_decay = group["weight_decay"] - momentum = group["momentum"] - lr = group["lr"] - lr_old = group.get("lr_old", lr) - lr_correct = lr / lr_old - - for p in group["params"]: - if p.grad is None: - continue - - p_data_fp32 = p.data - if p_data_fp32.dtype in {torch.float16, torch.bfloat16}: - p_data_fp32 = p_data_fp32.float() - - d_p = p.grad.data.float() - param_state = self.state[p] - if "momentum_buffer" not in param_state: - param_state["momentum_buffer"] = torch.zeros_like(d_p) - else: - param_state["momentum_buffer"] = param_state["momentum_buffer"].to( - d_p - ) - - buf = param_state["momentum_buffer"] - - if weight_decay != 0: - p_data_fp32.mul_(1 - lr * weight_decay) - p_data_fp32.add_(buf, alpha=momentum * momentum * lr_correct) - p_data_fp32.add_(d_p, alpha=-(1 + momentum) * lr) - - buf.mul_(momentum * lr_correct).add_(d_p, alpha=-lr) - - if p.data.dtype in {torch.float16, torch.bfloat16}: - p.data.copy_(p_data_fp32) - - group["lr_old"] = lr - - return loss diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/sgd.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/sgd.py deleted file mode 100644 index 8e34fb99a18fff12ab76be5894a84cbbb2f48176..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/sgd.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch.optim - -from . import LegacyFairseqOptimizer, register_optimizer - - -@register_optimizer("sgd") -class SGD(LegacyFairseqOptimizer): - def __init__(self, args, params): - super().__init__(args) - self._optimizer = torch.optim.SGD(params, **self.optimizer_config) - - @staticmethod - def add_args(parser): - """Add optimizer-specific arguments to the parser.""" - # fmt: off - parser.add_argument('--momentum', default=0.0, type=float, metavar='M', - help='momentum factor') - parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', - help='weight decay') - # fmt: on - - @property - def optimizer_config(self): - """ - Return a kwarg dictionary that will be used to override optimizer - args stored in checkpoints. This allows us to load a checkpoint and - resume training using a different set of optimizer args, e.g., with a - different learning rate. 
- """ - return { - "lr": self.args.lr[0], - "momentum": self.args.momentum, - "weight_decay": self.args.weight_decay, - } - - @property - def supports_flat_params(self): - return True diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/shard.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/shard.py deleted file mode 100644 index a035a1c1f93d3ce3eae93f8d80216af8ecf9620a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/optim/shard.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -try: - from fairscale.optim import OSS - - _has_fairscale = True -except ImportError: - _has_fairscale = False - - -def shard_(args, optimizer, group): - if not _has_fairscale: - raise ImportError( - "\n\nPlease install the fairscale package:" "\n\n pip install fairscale" - ) - - class FairseqOSS(OSS): - @property - def disable_mem_eff_fp16_loading_hack(self): - return True - - def __getattr__(self, name): - if name.startswith("supports") and hasattr(self.optim, name): - return getattr(self.optim, name) - raise AttributeError( - "'FairseqOSS' object has no attribute {0!r}".format(name) - ) - - torch_optimizer = optimizer.optimizer - optim_cls = type(torch_optimizer) - - optimizer.optimizer = FairseqOSS( - torch_optimizer.param_groups, - optim_cls, - group=group, - **optimizer.optimizer_config - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/options.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/options.py deleted file mode 100644 index 1a24fccaec4af9d91a6d49ca6b03bce936a650cf..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/options.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -from typing import Callable, List, Optional - -import torch -from fairseq import utils -from fairseq.data.indexed_dataset import get_available_dataset_impl -from fairseq.dataclass.data_class import ( - CheckpointParams, - CommonEvalParams, - CommonParams, - DatasetParams, - DistributedTrainingParams, - EvalLMParams, - OptimizationParams, -) -from fairseq.dataclass.utils import gen_parser_from_dataclass - -# this import is for backward compatibility -from fairseq.utils import csv_str_list, eval_bool, eval_str_dict, eval_str_list # noqa - - -def get_preprocessing_parser(default_task="translation"): - parser = get_parser("Preprocessing", default_task) - add_preprocess_args(parser) - return parser - - -def get_training_parser(default_task="translation"): - parser = get_parser("Trainer", default_task) - add_dataset_args(parser, train=True) - add_distributed_training_args(parser) - add_model_args(parser) - add_optimization_args(parser) - add_checkpoint_args(parser) - return parser - - -def get_generation_parser(interactive=False, default_task="translation"): - parser = get_parser("Generation", default_task) - add_dataset_args(parser, gen=True) - add_distributed_training_args(parser, default_world_size=1) - add_generation_args(parser) - if interactive: - add_interactive_args(parser) - return parser - - -def get_interactive_generation_parser(default_task="translation"): - return get_generation_parser(interactive=True, default_task=default_task) - - -def get_eval_lm_parser(default_task="language_modeling"): - parser = get_parser("Evaluate Language Model", default_task) - add_dataset_args(parser, gen=True) - add_distributed_training_args(parser, default_world_size=1) - add_eval_lm_args(parser) - return parser - - -def get_validation_parser(default_task=None): - parser = get_parser("Validation", default_task) - add_dataset_args(parser, train=True) - add_distributed_training_args(parser, default_world_size=1) - group = parser.add_argument_group("Evaluation") - gen_parser_from_dataclass(group, CommonEvalParams()) - return parser - - -def parse_args_and_arch( - parser: argparse.ArgumentParser, - input_args: List[str] = None, - parse_known: bool = False, - suppress_defaults: bool = False, - modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] = None, -): - """ - Args: - parser (ArgumentParser): the parser - input_args (List[str]): strings to parse, defaults to sys.argv - parse_known (bool): only parse known arguments, similar to - `ArgumentParser.parse_known_args` - suppress_defaults (bool): parse while ignoring all default values - modify_parser (Optional[Callable[[ArgumentParser], None]]): - function to modify the parser, e.g., to set default values - """ - if suppress_defaults: - # Parse args without any default values. This requires us to parse - # twice, once to identify all the necessary task/model args, and a second - # time with all defaults set to None. 
- args = parse_args_and_arch( - parser, - input_args=input_args, - parse_known=parse_known, - suppress_defaults=False, - ) - suppressed_parser = argparse.ArgumentParser(add_help=False, parents=[parser]) - suppressed_parser.set_defaults(**{k: None for k, v in vars(args).items()}) - args = suppressed_parser.parse_args(input_args) - return argparse.Namespace( - **{k: v for k, v in vars(args).items() if v is not None} - ) - - from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_CONFIG_REGISTRY, MODEL_REGISTRY - - # Before creating the true parser, we need to import optional user module - # in order to eagerly import custom tasks, optimizers, architectures, etc. - usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False) - usr_parser.add_argument("--user-dir", default=None) - usr_args, _ = usr_parser.parse_known_args(input_args) - utils.import_user_module(usr_args) - - if modify_parser is not None: - modify_parser(parser) - - # The parser doesn't know about model/criterion/optimizer-specific args, so - # we parse twice. First we parse the model/criterion/optimizer, then we - # parse a second time after adding the *-specific arguments. - # If input_args is given, we will parse those args instead of sys.argv. - args, _ = parser.parse_known_args(input_args) - - # Add model-specific args to parser. - if hasattr(args, "arch"): - model_specific_group = parser.add_argument_group( - "Model-specific configuration", - # Only include attributes which are explicitly given as command-line - # arguments or which have default values. - argument_default=argparse.SUPPRESS, - ) - if args.arch in ARCH_MODEL_REGISTRY: - ARCH_MODEL_REGISTRY[args.arch].add_args(model_specific_group) - elif args.arch in MODEL_REGISTRY: - MODEL_REGISTRY[args.arch].add_args(model_specific_group) - else: - raise RuntimeError() - - # Add *-specific args to parser. - from fairseq.registry import REGISTRIES - - for registry_name, REGISTRY in REGISTRIES.items(): - choice = getattr(args, registry_name, None) - if choice is not None: - cls = REGISTRY["registry"][choice] - if hasattr(cls, "add_args"): - cls.add_args(parser) - if hasattr(args, "task"): - from fairseq.tasks import TASK_REGISTRY - - TASK_REGISTRY[args.task].add_args(parser) - if getattr(args, "use_bmuf", False): - # hack to support extra args for block distributed data parallelism - from fairseq.optim.bmuf import FairseqBMUF - - FairseqBMUF.add_args(parser) - - # Modify the parser a second time, since defaults may have been reset - if modify_parser is not None: - modify_parser(parser) - - # Parse a second time. - if parse_known: - args, extra = parser.parse_known_args(input_args) - else: - args = parser.parse_args(input_args) - extra = None - # Post-process args. 
- if ( - hasattr(args, "batch_size_valid") and args.batch_size_valid is None - ) or not hasattr(args, "batch_size_valid"): - args.batch_size_valid = args.batch_size - if hasattr(args, "max_tokens_valid") and args.max_tokens_valid is None: - args.max_tokens_valid = args.max_tokens - if getattr(args, "memory_efficient_fp16", False): - args.fp16 = True - if getattr(args, "memory_efficient_bf16", False): - args.bf16 = True - args.tpu = getattr(args, "tpu", False) - args.bf16 = getattr(args, "bf16", False) - if args.bf16: - args.tpu = True - if args.tpu and args.fp16: - raise ValueError("Cannot combine --fp16 and --tpu, use --bf16 on TPUs") - - if getattr(args, "seed", None) is None: - args.seed = 1 # default seed for training - args.no_seed_provided = True - else: - args.no_seed_provided = False - - # Apply architecture configuration. - if hasattr(args, "arch") and args.arch in ARCH_CONFIG_REGISTRY: - ARCH_CONFIG_REGISTRY[args.arch](args) - - if parse_known: - return args, extra - else: - return args - - -def get_parser(desc, default_task="translation"): - # Before creating the true parser, we need to import optional user module - # in order to eagerly import custom tasks, optimizers, architectures, etc. - usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False) - usr_parser.add_argument("--user-dir", default=None) - usr_args, _ = usr_parser.parse_known_args() - utils.import_user_module(usr_args) - - parser = argparse.ArgumentParser(allow_abbrev=False) - gen_parser_from_dataclass(parser, CommonParams()) - - from fairseq.registry import REGISTRIES - - for registry_name, REGISTRY in REGISTRIES.items(): - parser.add_argument( - "--" + registry_name.replace("_", "-"), - default=REGISTRY["default"], - choices=REGISTRY["registry"].keys(), - ) - - # Task definitions can be found under fairseq/tasks/ - from fairseq.tasks import TASK_REGISTRY - - parser.add_argument( - "--task", - metavar="TASK", - default=default_task, - choices=TASK_REGISTRY.keys(), - help="task", - ) - # fmt: on - return parser - - -def add_preprocess_args(parser): - group = parser.add_argument_group("Preprocessing") - # fmt: off - group.add_argument("-s", "--source-lang", default=None, metavar="SRC", - help="source language") - group.add_argument("-t", "--target-lang", default=None, metavar="TARGET", - help="target language") - group.add_argument("--trainpref", metavar="FP", default=None, - help="train file prefix") - group.add_argument("--validpref", metavar="FP", default=None, - help="comma separated, valid file prefixes") - group.add_argument("--testpref", metavar="FP", default=None, - help="comma separated, test file prefixes") - group.add_argument("--align-suffix", metavar="FP", default=None, - help="alignment file suffix") - group.add_argument("--destdir", metavar="DIR", default="data-bin", - help="destination dir") - group.add_argument("--thresholdtgt", metavar="N", default=0, type=int, - help="map words appearing less than threshold times to unknown") - group.add_argument("--thresholdsrc", metavar="N", default=0, type=int, - help="map words appearing less than threshold times to unknown") - group.add_argument("--tgtdict", metavar="FP", - help="reuse given target dictionary") - group.add_argument("--srcdict", metavar="FP", - help="reuse given source dictionary") - group.add_argument("--nwordstgt", metavar="N", default=-1, type=int, - help="number of target words to retain") - group.add_argument("--nwordssrc", metavar="N", default=-1, type=int, - help="number of source words to retain") - 
group.add_argument("--alignfile", metavar="ALIGN", default=None, - help="an alignment file (optional)") - parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap', - choices=get_available_dataset_impl(), - help='output dataset implementation') - group.add_argument("--joined-dictionary", action="store_true", - help="Generate joined dictionary") - group.add_argument("--only-source", action="store_true", - help="Only process the source language") - group.add_argument("--padding-factor", metavar="N", default=8, type=int, - help="Pad dictionary size to be multiple of N") - group.add_argument("--workers", metavar="N", default=1, type=int, - help="number of parallel workers") - # fmt: on - return parser - - -def add_dataset_args(parser, train=False, gen=False): - group = parser.add_argument_group("dataset_data_loading") - gen_parser_from_dataclass(group, DatasetParams()) - # fmt: on - return group - - -def add_distributed_training_args(parser, default_world_size=None): - group = parser.add_argument_group("distributed_training") - if default_world_size is None: - default_world_size = max(1, torch.cuda.device_count()) - gen_parser_from_dataclass( - group, DistributedTrainingParams(distributed_world_size=default_world_size) - ) - return group - - -def add_optimization_args(parser): - group = parser.add_argument_group("optimization") - # fmt: off - gen_parser_from_dataclass(group, OptimizationParams()) - # fmt: on - return group - - -def add_checkpoint_args(parser): - group = parser.add_argument_group("checkpoint") - # fmt: off - gen_parser_from_dataclass(group, CheckpointParams()) - # fmt: on - return group - - -def add_common_eval_args(group): - gen_parser_from_dataclass(group, CommonEvalParams()) - - -def add_eval_lm_args(parser): - group = parser.add_argument_group("LM Evaluation") - add_common_eval_args(group) - gen_parser_from_dataclass(group, EvalLMParams()) - - -def add_generation_args(parser): - group = parser.add_argument_group("Generation") - add_common_eval_args(group) - # fmt: off - group.add_argument('--beam', default=5, type=int, metavar='N', - help='beam size') - group.add_argument('--nbest', default=1, type=int, metavar='N', - help='number of hypotheses to output') - group.add_argument('--max-len-a', default=0, type=float, metavar='N', - help=('generate sequences of maximum length ax + b, ' - 'where x is the source length')) - group.add_argument('--max-len-b', default=200, type=int, metavar='N', - help=('generate sequences of maximum length ax + b, ' - 'where x is the source length')) - group.add_argument('--min-len', default=1, type=float, metavar='N', - help=('minimum generation length')) - group.add_argument('--match-source-len', default=False, action='store_true', - help=('generations should match the source length')) - group.add_argument('--no-early-stop', action='store_true', - help='deprecated') - group.add_argument('--unnormalized', action='store_true', - help='compare unnormalized hypothesis scores') - group.add_argument('--no-beamable-mm', action='store_true', - help='don\'t use BeamableMM in attention layers') - group.add_argument('--lenpen', default=1, type=float, - help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences') - group.add_argument('--unkpen', default=0, type=float, - help='unknown word penalty: <0 produces more unks, >0 produces fewer') - group.add_argument('--replace-unk', nargs='?', const=True, default=None, - help='perform unknown replacement (optionally with alignment dictionary)') - group.add_argument('--sacrebleu', 
action='store_true', - help='score with sacrebleu') - group.add_argument('--score-reference', action='store_true', - help='just score the reference translation') - group.add_argument('--prefix-size', default=0, type=int, metavar='PS', - help='initialize generation by target prefix of given length') - group.add_argument('--no-repeat-ngram-size', default=0, type=int, metavar='N', - help='ngram blocking such that this size ngram cannot be repeated in the generation') - group.add_argument('--sampling', action='store_true', - help='sample hypotheses instead of using beam search') - group.add_argument('--sampling-topk', default=-1, type=int, metavar='PS', - help='sample from top K likely next words instead of all words') - group.add_argument('--sampling-topp', default=-1.0, type=float, metavar='PS', - help='sample from the smallest set whose cumulative probability mass exceeds p for next words') - group.add_argument('--constraints', const="ordered", nargs="?", choices=["ordered", "unordered"], - help='enables lexically constrained decoding') - group.add_argument('--temperature', default=1., type=float, metavar='N', - help='temperature for generation') - group.add_argument('--diverse-beam-groups', default=-1, type=int, metavar='N', - help='number of groups for Diverse Beam Search') - group.add_argument('--diverse-beam-strength', default=0.5, type=float, metavar='N', - help='strength of diversity penalty for Diverse Beam Search') - group.add_argument('--diversity-rate', default=-1.0, type=float, metavar='N', - help='strength of diversity penalty for Diverse Siblings Search') - group.add_argument('--print-alignment', action='store_true', - help='if set, uses attention feedback to compute and print alignment to source tokens') - group.add_argument('--print-step', action='store_true') - - group.add_argument('--lm-path', default=None, type=str, metavar='PATH', - help='path to lm checkpoint for lm fusion') - group.add_argument('--lm-weight', default=0.0, type=float, metavar='N', - help='weight for lm probs for lm fusion') - - # arguments for iterative refinement generator - group.add_argument('--iter-decode-eos-penalty', default=0.0, type=float, metavar='N', - help='if > 0.0, it penalized early-stopping in decoding.') - group.add_argument('--iter-decode-max-iter', default=10, type=int, metavar='N', - help='maximum iterations for iterative refinement.') - group.add_argument('--iter-decode-force-max-iter', action='store_true', - help='if set, run exact the maximum number of iterations without early stop') - group.add_argument('--iter-decode-with-beam', default=1, type=int, metavar='N', - help='if > 1, model will generate translations varying by the lengths.') - group.add_argument('--iter-decode-with-external-reranker', action='store_true', - help='if set, the last checkpoint are assumed to be a reranker to rescore the translations'), - group.add_argument('--retain-iter-history', action='store_true', - help='if set, decoding returns the whole history of iterative refinement') - group.add_argument('--retain-dropout', action='store_true', - help='Use dropout at inference time') - group.add_argument('--retain-dropout-modules', default=None, nargs='+', type=str, - help='if set, only retain dropout for the specified modules; ' - 'if not set, then dropout will be retained for all modules') - - # special decoding format for advanced decoding. 
- group.add_argument('--decoding-format', default=None, type=str, choices=['unigram', 'ensemble', 'vote', 'dp', 'bs']) - # fmt: on - return group - - -def add_interactive_args(parser): - group = parser.add_argument_group("Interactive") - # fmt: off - group.add_argument('--buffer-size', default=0, type=int, metavar='N', - help='read this many sentences into a buffer before processing them') - group.add_argument('--input', default='-', type=str, metavar='FILE', - help='file to read from; use - for stdin') - # fmt: on - - -def add_model_args(parser): - group = parser.add_argument_group("Model configuration") - # fmt: off - - # Model definitions can be found under fairseq/models/ - # - # The model architecture can be specified in several ways. - # In increasing order of priority: - # 1) model defaults (lowest priority) - # 2) --arch argument - # 3) --encoder/decoder-* arguments (highest priority) - from fairseq.models import ARCH_MODEL_REGISTRY - group.add_argument('--arch', '-a', metavar='ARCH', - choices=ARCH_MODEL_REGISTRY.keys(), - help='model architecture') - # fmt: on - return group diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/pdb.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/pdb.py deleted file mode 100644 index 1ba6ef0d336b30717cfdde94e1b838cfe2bfeb20..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/pdb.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import multiprocessing -import os -import pdb -import sys - - -__all__ = ["set_trace"] - - -_stdin = [None] -_stdin_lock = multiprocessing.Lock() -try: - _stdin_fd = sys.stdin.fileno() -except Exception: - _stdin_fd = None - - -class MultiprocessingPdb(pdb.Pdb): - """A Pdb wrapper that works in a multiprocessing environment. - - Usage: `from fairseq import pdb; pdb.set_trace()` - """ - - def __init__(self): - pdb.Pdb.__init__(self, nosigint=True) - - def _cmdloop(self): - stdin_bak = sys.stdin - with _stdin_lock: - try: - if _stdin_fd is not None: - if not _stdin[0]: - _stdin[0] = os.fdopen(_stdin_fd) - sys.stdin = _stdin[0] - self.cmdloop() - finally: - sys.stdin = stdin_bak - - -def set_trace(): - pdb = MultiprocessingPdb() - pdb.set_trace(sys._getframe().f_back) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/quantization_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/quantization_utils.py deleted file mode 100644 index 69dd61d785091eadad91d2a77485b13ef086b46a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/quantization_utils.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
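The deleted options.py builds its command line in two passes: a first parse_known_args() discovers which --task/--arch (and registry choices) were requested, the selected classes then add their own flags, and only afterwards is the full argument list parsed. A minimal sketch of that pattern with plain argparse (the registry and flags below are made up for illustration):

import argparse

# Toy "architecture registry": each entry knows how to add its own flags.
ARCH_REGISTRY = {
    "transformer": lambda p: p.add_argument("--encoder-layers", type=int, default=6),
    "lstm":        lambda p: p.add_argument("--hidden-size", type=int, default=512),
}

def parse_two_pass(argv=None):
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument("--arch", choices=ARCH_REGISTRY, default="transformer")

    # Pass 1: only to find out which architecture was requested.
    known, _ = parser.parse_known_args(argv)

    # Pass 2: add the architecture-specific flags, then parse everything.
    ARCH_REGISTRY[known.arch](parser)
    return parser.parse_args(argv)

print(parse_two_pass(["--arch", "lstm", "--hidden-size", "1024"]))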
- -import logging - -from fairseq.modules.quantization import pq, quantization_options, scalar - - -logger = logging.getLogger(__name__) - - -def quantize_model_scalar(model, args): - quant_noise_scalar = getattr(args, "quant_noise_scalar", 0) - if quant_noise_scalar > 0: - # quantize_model edits the model in place - scalar.quantize_model_(model, p=quant_noise_scalar, bits=8, update_step=1000) - return model - - -class Quantizer(object): - def __init__(self, config_path, max_epoch, max_update): - try: - import yaml - except ImportError: - raise ImportError("Please install yaml with: pip install yaml") - - # parse config - if config_path: - with open(config_path) as config_file: - config = quantization_options.parse_config_yaml( - yaml.safe_load(config_file) - ) - else: - config = quantization_options.parse_config_yaml({}) - - self.n_centroids_config = config["n_centroids"] - self.block_sizes_config = config["block_sizes"] - self.layers_to_quantize = config["layers_to_quantize"] - - # We assume that training will run for a fixed number of epochs - # (or updates) and that we should train for equal durations - # between iterations of PQ. - num_iterations = len(self.layers_to_quantize) - if max_epoch > 0: - assert max_epoch % num_iterations == 0, ( - "for iterative PQ, --max-epoch (={}) must be evenly divisible by " - "len(layers_to_quantize) (={})".format(max_epoch, num_iterations) - ) - self.epoch_schedule = max_epoch // num_iterations - else: - self.epoch_schedule = None - if max_update > 0: - assert max_update % num_iterations == 0, ( - "for iterative PQ, --max-update (={}) must be evenly divisible by " - "len(layers_to_quantize) (={})".format(max_update, num_iterations) - ) - self.update_schedule = max_update // num_iterations - else: - self.update_schedule = None - assert (self.epoch_schedule is not None) ^ ( - self.update_schedule is not None - ), "for iterative PQ, cannot specify both --max-update and --max-epoch" - - # 0 is a special value for quantization step, which will force - # the first call to begin_epoch() to call step() - self.quantization_step = 0 - - def set_trainer(self, trainer): - self.trainer = trainer - self.size_tracker = pq.SizeTracker(self.trainer.get_model()) - - def step(self): - """Move to the next stage of quantization.""" - if self.quantization_step >= len(self.layers_to_quantize): - # Maybe we just finished the last training step or we loaded - # a checkpoint for an iterative PQ model which previously - # finished training. Either way, don't quantize again. 
- return - - logger.info( - "quantizing model (step={}; layers_to_quantize[step]={})".format( - self.quantization_step, self.layers_to_quantize[self.quantization_step] - ) - ) - quantized_layers = pq.quantize_model_( - self.trainer.get_model(), - self.size_tracker, - self.layers_to_quantize, - self.block_sizes_config, - self.n_centroids_config, - step=self.quantization_step, - ) - logger.info("quantized layers: {}".format(quantized_layers)) - logger.info(self.size_tracker) - - self.quantization_step += 1 - - # reintialize the Trainer since model parameters have changed - self.trainer.reinitialize() - - def begin_epoch(self, epoch): - """Called at the beginning of each epoch (epochs start at 1).""" - if ( - ( - self.epoch_schedule is not None - and epoch > 0 - and (epoch - 1) % self.epoch_schedule == 0 - ) - # we always step once in the beginning, even if using - # update-based quantization - or self.quantization_step == 0 - ): - self.step() - - def step_update(self, num_updates): - """Called at the end of each step.""" - if ( - self.update_schedule is not None - and num_updates > 0 - and num_updates % self.update_schedule == 0 - ): - self.step() - - def state_dict(self): - return { - "n_centroids_config": self.n_centroids_config, - "block_sizes_config": self.block_sizes_config, - "layers_to_quantize": self.layers_to_quantize, - "epoch_schedule": self.epoch_schedule, - "update_schedule": self.update_schedule, - "quantization_step": self.quantization_step, - } - - def load_state_dict(self, state_dict): - self.n_centroids_config = state_dict["n_centroids_config"] - self.block_sizes_config = state_dict["block_sizes_config"] - self.layers_to_quantize = state_dict["layers_to_quantize"] - self.epoch_schedule = state_dict["epoch_schedule"] - self.update_schedule = state_dict["update_schedule"] - self.quantization_step = state_dict["quantization_step"] diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/registry.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/registry.py deleted file mode 100644 index 382dec22a84e736dba93469aef8f7c11dc885339..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/registry.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
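The Quantizer removed above spreads the entries of layers_to_quantize evenly over training, stepping once at the start and then every max_epoch // len(layers_to_quantize) epochs. A small worked example of that scheduling arithmetic (the layer patterns and epoch count are illustrative):

# Illustrative values: quantize three layer groups over 12 epochs.
layers_to_quantize = ["encoder.*", "decoder.*", "output_projection"]
max_epoch = 12

num_iterations = len(layers_to_quantize)
assert max_epoch % num_iterations == 0        # enforced by the deleted Quantizer
epoch_schedule = max_epoch // num_iterations  # -> 4

# begin_epoch() triggers a quantization step whenever (epoch - 1) is a
# multiple of epoch_schedule, so here at epochs 1, 5 and 9.
step_epochs = [e for e in range(1, max_epoch + 1) if (e - 1) % epoch_schedule == 0]
print(step_epochs)  # [1, 5, 9] -> one third of training between PQ stages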
- -import argparse -from argparse import Namespace -from typing import Union - -from fairseq.dataclass import FairseqDataclass -from omegaconf import DictConfig - - -REGISTRIES = {} - - -def setup_registry(registry_name: str, base_class=None, default=None, required=False): - assert registry_name.startswith("--") - registry_name = registry_name[2:].replace("-", "_") - - REGISTRY = {} - REGISTRY_CLASS_NAMES = set() - DATACLASS_REGISTRY = {} - - # maintain a registry of all registries - if registry_name in REGISTRIES: - return # registry already exists - REGISTRIES[registry_name] = {"registry": REGISTRY, "default": default} - - def build_x(args: Union[DictConfig, Namespace], *extra_args, **extra_kwargs): - if isinstance(args, DictConfig): - if getattr(args, "_name", None) is not None: - choice = args._name - elif hasattr(args, registry_name): - choice = args.registry_name - else: - raise RuntimeError( - f"Neither _name nor {registry_name} in args, args = {args}" - ) - else: - choice = getattr(args, registry_name, None) - - if choice is None: - if required: - raise ValueError("--{} is required!".format(registry_name)) - return None - cls = REGISTRY[choice] - if hasattr(cls, "build_" + registry_name): - builder = getattr(cls, "build_" + registry_name) - else: - builder = cls - if isinstance(args, Namespace): - set_defaults(args, cls) - return builder(args, *extra_args, **extra_kwargs) - - def register_x(name, dataclass=None): - def register_x_cls(cls): - if name in REGISTRY: - raise ValueError( - "Cannot register duplicate {} ({})".format(registry_name, name) - ) - if cls.__name__ in REGISTRY_CLASS_NAMES: - raise ValueError( - "Cannot register {} with duplicate class name ({})".format( - registry_name, cls.__name__ - ) - ) - if base_class is not None and not issubclass(cls, base_class): - raise ValueError( - "{} must extend {}".format(cls.__name__, base_class.__name__) - ) - - if dataclass is not None and not issubclass(dataclass, FairseqDataclass): - raise ValueError( - "Dataclass {} must extend FairseqDataclass".format(dataclass) - ) - - cls.__dataclass = dataclass - REGISTRY[name] = cls - DATACLASS_REGISTRY[name] = cls.__dataclass - REGISTRY_CLASS_NAMES.add(cls.__name__) - return cls - - return register_x_cls - - return build_x, register_x, REGISTRY, DATACLASS_REGISTRY - - -def set_defaults(args: Namespace, cls): - """Helper to set default arguments based on *add_args*.""" - if not hasattr(cls, "add_args"): - return - parser = argparse.ArgumentParser( - argument_default=argparse.SUPPRESS, allow_abbrev=False - ) - cls.add_args(parser) - # copied from argparse.py: - defaults = argparse.Namespace() - for action in parser._actions: - if action.dest is not argparse.SUPPRESS: - if not hasattr(defaults, action.dest): - if action.default is not argparse.SUPPRESS: - setattr(defaults, action.dest, action.default) - for key, default_value in vars(defaults).items(): - if not hasattr(args, key): - setattr(args, key, default_value) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/__init__.py deleted file mode 100644 index 4be0cb51885c54fce73d39cc1296fa36952b0899..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -import importlib -import os -from abc import ABC, abstractmethod - -from fairseq import registry - - -class BaseScorer(ABC): - def __init__(self, args): - self.args = args - self.ref = [] - self.pred = [] - - @staticmethod - def add_args(parser): - pass - - def add_string(self, ref, pred): - self.ref.append(ref) - self.pred.append(pred) - - @abstractmethod - def score(self) -> float: - pass - - @abstractmethod - def result_string(self) -> str: - pass - - -_build_scorer, register_scorer, SCORER_REGISTRY, _ = registry.setup_registry( - "--scoring", default="bleu" -) - - -def build_scorer(args, tgt_dict): - from fairseq import utils - - if args.sacrebleu: - utils.deprecation_warning( - "--sacrebleu is deprecated. Please use --scoring sacrebleu instead." - ) - args.scoring = "sacrebleu" - if args.scoring == "bleu": - from fairseq.scoring import bleu - - return bleu.Scorer(tgt_dict.pad(), tgt_dict.eos(), tgt_dict.unk()) - return _build_scorer(args) - - -# automatically import any Python files in the current directory -for file in os.listdir(os.path.dirname(__file__)): - if file.endswith(".py") and not file.startswith("_"): - module = file[: file.find(".py")] - importlib.import_module("fairseq.scoring." + module) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/bleu.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/bleu.py deleted file mode 100644 index 7f8bd73bf5c5dad8c7a4e48485675b3c2f4a181f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/bleu.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
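New metrics plug into the same registry mechanism shown above: subclass `BaseScorer`, decorate it with `@register_scorer`, and the auto-import loop at the bottom of the package picks the module up if it lives under `fairseq/scoring/`. A hedged sketch; the "accuracy" scorer is purely illustrative and not part of the deleted code:

```python
from fairseq.scoring import BaseScorer, register_scorer

@register_scorer("accuracy")
class AccuracyScorer(BaseScorer):
    """Toy scorer: percentage of predictions that exactly match the reference."""

    def score(self) -> float:
        if not self.ref:
            return 0.0
        matches = sum(1 for r, p in zip(self.ref, self.pred) if r == p)
        return 100.0 * matches / len(self.ref)

    def result_string(self) -> str:
        return f"Accuracy: {self.score():.2f}"
```

`build_scorer` would then return this class when `--scoring accuracy` is selected, assuming the module sits in the scoring package so the directory scan imports it.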
- -import ctypes -import math -import sys - -import torch -from fairseq.scoring import BaseScorer, register_scorer -from fairseq.scoring.tokenizer import EvaluationTokenizer - - -class BleuStat(ctypes.Structure): - _fields_ = [ - ("reflen", ctypes.c_size_t), - ("predlen", ctypes.c_size_t), - ("match1", ctypes.c_size_t), - ("count1", ctypes.c_size_t), - ("match2", ctypes.c_size_t), - ("count2", ctypes.c_size_t), - ("match3", ctypes.c_size_t), - ("count3", ctypes.c_size_t), - ("match4", ctypes.c_size_t), - ("count4", ctypes.c_size_t), - ] - - -@register_scorer("sacrebleu") -class SacrebleuScorer(BaseScorer): - def __init__(self, args): - super(SacrebleuScorer, self).__init__(args) - import sacrebleu - - self.sacrebleu = sacrebleu - self.tokenizer = EvaluationTokenizer( - tokenizer_type=self.args.sacrebleu_tokenizer, - lowercase=self.args.sacrebleu_lowercase, - character_tokenization=self.args.sacrebleu_char_level, - ) - - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--sacrebleu-tokenizer', type=str, default='13a', - choices=EvaluationTokenizer.ALL_TOKENIZER_TYPES, - help='tokenizer') - parser.add_argument('--sacrebleu-lowercase', type=str, default=False, - help='apply lowercasing') - parser.add_argument('--sacrebleu-char-level', action='store_true', - help='evaluate at character level') - # fmt: on - - def add_string(self, ref, pred): - self.ref.append(self.tokenizer.tokenize(ref)) - self.pred.append(self.tokenizer.tokenize(pred)) - - def score(self, order=4): - return self.result_string(order).score - - def result_string(self, order=4): - if order != 4: - raise NotImplementedError - # tokenization and lowercasing are performed by self.tokenizer instead. - return self.sacrebleu.corpus_bleu( - self.pred, [self.ref], tokenize="none" - ).format() - - -@register_scorer("bleu") -class Scorer(object): - def __init__(self, pad, eos, unk): - self.stat = BleuStat() - self.pad = pad - self.eos = eos - self.unk = unk - - try: - from fairseq import libbleu - except ImportError as e: - sys.stderr.write( - "ERROR: missing libbleu.so. 
run `pip install --editable .`\n" - ) - raise e - - self.C = ctypes.cdll.LoadLibrary(libbleu.__file__) - - self.reset() - - def reset(self, one_init=False): - if one_init: - self.C.bleu_one_init(ctypes.byref(self.stat)) - else: - self.C.bleu_zero_init(ctypes.byref(self.stat)) - - def add(self, ref, pred): - if not isinstance(ref, torch.IntTensor): - raise TypeError("ref must be a torch.IntTensor (got {})".format(type(ref))) - if not isinstance(pred, torch.IntTensor): - raise TypeError("pred must be a torch.IntTensor(got {})".format(type(pred))) - - # don't match unknown words - rref = ref.clone() - assert not rref.lt(0).any() - rref[rref.eq(self.unk)] = -999 - - rref = rref.contiguous().view(-1) - pred = pred.contiguous().view(-1) - - self.C.bleu_add( - ctypes.byref(self.stat), - ctypes.c_size_t(rref.size(0)), - ctypes.c_void_p(rref.data_ptr()), - ctypes.c_size_t(pred.size(0)), - ctypes.c_void_p(pred.data_ptr()), - ctypes.c_int(self.pad), - ctypes.c_int(self.eos), - ) - - def score(self, order=4): - psum = sum( - math.log(p) if p > 0 else float("-Inf") for p in self.precision()[:order] - ) - return self.brevity() * math.exp(psum / order) * 100 - - def precision(self): - def ratio(a, b): - return a / b if b > 0 else 0 - - return [ - ratio(self.stat.match1, self.stat.count1), - ratio(self.stat.match2, self.stat.count2), - ratio(self.stat.match3, self.stat.count3), - ratio(self.stat.match4, self.stat.count4), - ] - - def brevity(self): - r = self.stat.reflen / self.stat.predlen - return min(1, math.exp(1 - r)) - - def result_string(self, order=4): - assert order <= 4, "BLEU scores for order > 4 aren't supported" - fmt = "BLEU{} = {:2.2f}, {:2.1f}" - for _ in range(1, order): - fmt += "/{:2.1f}" - fmt += " (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})" - bleup = [p * 100 for p in self.precision()[:order]] - return fmt.format( - order, - self.score(order=order), - *bleup, - self.brevity(), - self.stat.predlen / self.stat.reflen, - self.stat.predlen, - self.stat.reflen - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/chrf.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/chrf.py deleted file mode 100644 index 0d6cb77383a44d9ac739958b79a30764f1fbf7f3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/chrf.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
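The native `Scorer` above accumulates n-gram match/count statistics in C via `libbleu` and then combines them exactly like standard corpus BLEU: a brevity penalty times the geometric mean of the 1- to 4-gram precisions. A small pure-Python restatement of that final combination step, mirroring `score()`, `precision()` and `brevity()` above (the statistics are assumed given; this is only the formula, not the C accumulation):

```python
import math

def combine_bleu(matches, counts, reflen, predlen, order=4):
    """BLEU = BP * exp(mean log n-gram precision) * 100."""
    precisions = [m / c if c > 0 else 0.0 for m, c in zip(matches, counts)]
    log_sum = sum(math.log(p) if p > 0 else float("-inf") for p in precisions[:order])
    brevity = min(1.0, math.exp(1.0 - reflen / predlen))  # brevity penalty
    return brevity * math.exp(log_sum / order) * 100.0

# e.g. combine_bleu([9, 6, 4, 2], [10, 9, 8, 7], reflen=10, predlen=10)
```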
- -from fairseq.scoring import BaseScorer, register_scorer - - -@register_scorer("chrf") -class ChrFScorer(BaseScorer): - def __init__(self, args): - super(ChrFScorer, self).__init__(args) - import sacrebleu - - self.sacrebleu = sacrebleu - - def add_string(self, ref, pred): - self.ref.append(ref) - self.pred.append(pred) - - def score(self, order=4): - return self.result_string(order).score - - def result_string(self, order=4): - if order != 4: - raise NotImplementedError - return self.sacrebleu.corpus_chrf(self.pred, [self.ref]).format() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/tokenizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/tokenizer.py deleted file mode 100644 index dbcc6e4d101d74d6dbe2f6bd4838c1cde9414e46..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/tokenizer.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import unicodedata - - -class EvaluationTokenizer(object): - """A generic evaluation-time tokenizer, which leverages built-in tokenizers - in sacreBLEU (https://github.com/mjpost/sacrebleu). It additionally provides - lowercasing, punctuation removal and character tokenization, which are - applied after sacreBLEU tokenization. - - Args: - tokenizer_type (str): the type of sacreBLEU tokenizer to apply. - lowercase (bool): lowercase the text. - punctuation_removal (bool): remove punctuation (based on unicode - category) from text. - character_tokenization (bool): tokenize the text to characters. - """ - - SPACE = chr(32) - SPACE_ESCAPE = chr(9601) - ALL_TOKENIZER_TYPES = ["none", "13a", "intl", "zh", "ja-mecab"] - - def __init__( - self, - tokenizer_type: str = "13a", - lowercase: bool = False, - punctuation_removal: bool = False, - character_tokenization: bool = False, - ): - from sacrebleu.tokenizers import TOKENIZERS - - assert tokenizer_type in self.ALL_TOKENIZER_TYPES - self.lowercase = lowercase - self.punctuation_removal = punctuation_removal - self.character_tokenization = character_tokenization - self.tokenizer = TOKENIZERS[tokenizer_type] - - @classmethod - def remove_punctuation(cls, sent: str): - """Remove punctuation based on Unicode category.""" - return cls.SPACE.join( - t - for t in sent.split(cls.SPACE) - if not all(unicodedata.category(c)[0] == "P" for c in t) - ) - - def tokenize(self, sent: str): - tokenized = self.tokenizer()(sent) - - if self.punctuation_removal: - tokenized = self.remove_punctuation(tokenized) - - if self.character_tokenization: - tokenized = self.SPACE.join( - list(tokenized.replace(self.SPACE, self.SPACE_ESCAPE)) - ) - - if self.lowercase: - tokenized = tokenized.lower() - - return tokenized diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/wer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/wer.py deleted file mode 100644 index 21efefd9b85fc3529284fb3e491f3d7273c395b8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/scoring/wer.py +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
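`EvaluationTokenizer` above applies its options in a fixed order: sacreBLEU tokenization first, then optional punctuation removal, character tokenization, and lowercasing. A short usage sketch, assuming `sacrebleu` is installed with the tokenizer registry the class imports:

```python
from fairseq.scoring.tokenizer import EvaluationTokenizer

tok = EvaluationTokenizer(
    tokenizer_type="13a",
    lowercase=True,
    punctuation_removal=True,
    character_tokenization=False,
)
# e.g. "hello world" after 13a tokenization, punctuation removal and
# lowercasing (exact output depends on the sacrebleu tokenizer version)
print(tok.tokenize("Hello, World!"))
```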
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.scoring import BaseScorer, register_scorer -from fairseq.scoring.tokenizer import EvaluationTokenizer - - -@register_scorer("wer") -class WerScorer(BaseScorer): - def __init__(self, args): - super().__init__(args) - self.reset() - try: - import editdistance as ed - except ImportError: - raise ImportError("Please install editdistance to use WER scorer") - self.ed = ed - self.tokenizer = EvaluationTokenizer( - tokenizer_type=self.args.wer_tokenizer, - lowercase=self.args.wer_lowercase, - punctuation_removal=self.args.wer_remove_punct, - character_tokenization=self.args.wer_char_level, - ) - - @staticmethod - def add_args(parser): - # fmt: off - parser.add_argument('--wer-tokenizer', type=str, default='none', - choices=EvaluationTokenizer.ALL_TOKENIZER_TYPES, - help='sacreBLEU tokenizer to use for evaluation') - parser.add_argument('--wer-remove-punct', action='store_true', - help='remove punctuation') - parser.add_argument('--wer-char-level', action='store_true', - help='evaluate at character level') - parser.add_argument('--wer-lowercase', action='store_true', - help='lowercasing') - # fmt: on - - def reset(self): - self.distance = 0 - self.ref_length = 0 - - def add_string(self, ref, pred): - ref_items = self.tokenizer.tokenize(ref).split() - pred_items = self.tokenizer.tokenize(pred).split() - self.distance += self.ed.eval(ref_items, pred_items) - self.ref_length += len(ref_items) - - def result_string(self): - return f"WER: {self.score():.2f}" - - def score(self): - return 100.0 * self.distance / self.ref_length if self.ref_length > 0 else 0 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/search.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/search.py deleted file mode 100644 index d5ea68b4ce04409c504c1d22098b7968a9ce596a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/search.py +++ /dev/null @@ -1,814 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from typing import List, Optional - -import torch -import torch.nn as nn -from fairseq.token_generation_constraints import ( - ConstraintState, - OrderedConstraintState, - UnorderedConstraintState, -) -from torch import Tensor - - -class Search(nn.Module): - def __init__(self, tgt_dict): - super().__init__() - self.pad = tgt_dict.pad() - self.unk = tgt_dict.unk() - self.eos = tgt_dict.eos() - self.vocab_size = len(tgt_dict) - self.src_lengths = torch.tensor(-1) - self.supports_constraints = False - self.stop_on_max_len = False - - def step( - self, step, lprobs, scores, prev_output_tokens=None, original_batch_idxs=None - ): - """Take a single search step. 
- - Args: - step: the current search step, starting at 0 - lprobs: (bsz x input_beam_size x vocab_size) - the model's log-probabilities over the vocabulary at the current step - scores: (bsz x input_beam_size x step) - the historical model scores of each hypothesis up to this point - prev_output_tokens: (bsz x step) - the previously generated oputput tokens - original_batch_idxs: (bsz) - the tensor with the batch indices, in the range [0, bsz) - this is useful in case there has been applied a re-ordering - and we need to know the orignal indices - - Return: A tuple of (scores, indices, beams) where: - scores: (bsz x output_beam_size) - the scores of the chosen elements; output_beam_size can be - larger than input_beam_size, e.g., we may return - 2*input_beam_size to account for EOS - indices: (bsz x output_beam_size) - the indices of the chosen elements - beams: (bsz x output_beam_size) - the hypothesis ids of the chosen elements, in the range [0, input_beam_size) - """ - raise NotImplementedError - - @torch.jit.export - def set_src_lengths(self, src_lengths): - self.src_lengths = src_lengths - - @torch.jit.export - def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int): - """Initialize constraint states for constrained decoding (if supported). - - Args: - batch_constraints: (torch.Tensor, optional) - the list of constraints, in packed form - beam_size: (int) - the beam size - Returns: - *encoder_out* rearranged according to *new_order* - """ - pass - - def prune_sentences(self, batch_idxs: Tensor): - """ - Removes constraint states for completed sentences (if supported). - This is called from sequence_generator._generate() when sentences are - deleted from the batch. - - Args: - batch_idxs: Indices of *sentences* whose constraint state should be *kept*. - """ - pass - - def update_constraints(self, active_hypos: Tensor): - """ - Updates the constraint states by selecting the beam items that are retained. - This is called at each time step of sequence_generator._generate() when - the set of 2 * {beam_size} candidate hypotheses are reduced to the beam size. - - Args: - active_hypos: (batch size, beam size) - list of integers denoting, for each sentence, which beam candidate items - should be kept. - """ - pass - - -class BeamSearch(Search): - def __init__(self, tgt_dict): - super().__init__(tgt_dict) - self.constraint_states = None - - @torch.jit.export - def step( - self, - step: int, - lprobs, - scores: Optional[Tensor], - prev_output_tokens: Optional[Tensor] = None, - original_batch_idxs: Optional[Tensor] = None, - ): - bsz, beam_size, vocab_size = lprobs.size() - - if step == 0: - # at the first step all hypotheses are equally likely, so use - # only the first beam - lprobs = lprobs[:, ::beam_size, :].contiguous() - else: - # make probs contain cumulative scores for each hypothesis - assert scores is not None - lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1) - - top_prediction = torch.topk( - lprobs.view(bsz, -1), - k=min( - # Take the best 2 x beam_size predictions. We'll choose the first - # beam_size of these which don't predict eos to continue with. 
- beam_size * 2, - lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad - ), - ) - scores_buf = top_prediction[0] - indices_buf = top_prediction[1] - # Project back into relative indices and beams - beams_buf = indices_buf // vocab_size - indices_buf = indices_buf.fmod(vocab_size) - - # At this point, beams_buf and indices_buf are single-dim and contain relative indices - return scores_buf, indices_buf, beams_buf - - -class PrefixConstrainedBeamSearch(Search): - def __init__(self, tgt_dict, prefix_allowed_tokens_fn): - super().__init__(tgt_dict) - self.prefix_allowed_tokens_fn = prefix_allowed_tokens_fn - self.stop_on_max_len = True - - @torch.jit.export - def apply_mask(self, x, prev_output_tokens, original_batch_idxs): - beam_size = x.shape[0] // original_batch_idxs.shape[0] - original_batch_idxs = ( - original_batch_idxs.unsqueeze(-1).repeat((1, beam_size)).flatten().tolist() - ) - - mask = torch.full_like(x, -math.inf) - for sent_i, (sent, batch_i) in enumerate( - zip(prev_output_tokens, original_batch_idxs) - ): - mask[sent_i, :, self.prefix_allowed_tokens_fn(batch_i, sent)] = 0 - - return mask - - @torch.jit.export - def step( - self, - step: int, - lprobs: Tensor, - scores: Tensor, - prev_output_tokens: Tensor, - original_batch_idxs: Tensor, - ): - bsz, beam_size, vocab_size = lprobs.size() - - lprobs += self.apply_mask( - lprobs.view(bsz * beam_size, 1, vocab_size), - prev_output_tokens, - original_batch_idxs, - ).view(bsz, beam_size, vocab_size) - - if step == 0: - # at the first step all hypotheses are equally likely, so use - # only the first beam - lprobs = lprobs[:, ::beam_size, :].contiguous() - else: - # make probs contain cumulative scores for each hypothesis - assert scores is not None - lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1) - - top_prediction = torch.topk( - lprobs.view(bsz, -1), - k=min( - # Take the best beam_size predictions. We'll choose the first - # beam_size of these which don't predict eos to continue with. - beam_size, - lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad - ), - ) - scores_buf = top_prediction[0] - indices_buf = top_prediction[1] - beams_buf = indices_buf // vocab_size - indices_buf = indices_buf.fmod(vocab_size) - return scores_buf, indices_buf, beams_buf - - -class LexicallyConstrainedBeamSearch(Search): - """Implements lexically constrained beam search as described in - - Fast Lexically Constrained Decoding with Dynamic Beam - Allocation for Neural Machine Translation. Post & Vilar, - NAACL 2018. https://www.aclweb.org/anthology/N18-1119/ - - and - - Improved Lexically Constrained Decoding for Translation and - Monolingual Rewriting. Hu et al, NAACL - 2019. https://www.aclweb.org/anthology/N19-1090/ - - This is accomplished by maintaining, for each beam hypothesis, a - ConstraintState object (see constraints.py) that tracks which - constraints have been generated and using this information to - shape the beam for each input sentence. 
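The core bookkeeping trick in `BeamSearch.step` above is to flatten the `(beam, vocab)` dimensions, take the top `2 * beam_size` scores in one `topk` call, and then recover which hypothesis and which token each flat index refers to with integer division and modulo. A standalone sketch of that index arithmetic:

```python
import torch

bsz, beam_size, vocab_size = 2, 3, 11
lprobs = torch.randn(bsz, beam_size, vocab_size)  # cumulative scores per hypothesis

top = torch.topk(lprobs.view(bsz, -1), k=min(2 * beam_size, beam_size * vocab_size - 1))
scores_buf, flat_idx = top.values, top.indices
beams_buf = flat_idx // vocab_size       # which hypothesis each candidate extends
tokens_buf = flat_idx.fmod(vocab_size)   # which vocabulary item it appends

assert ((beams_buf * vocab_size + tokens_buf) == flat_idx).all()
```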
- """ - - def __init__(self, tgt_dict, representation): - super().__init__(tgt_dict) - self.representation = representation - self.vocab_size = len(tgt_dict) - self.num_cands = 0 - self.supports_constraints = True - - @torch.jit.export - def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int): - self.constraint_states = [] - for constraint_tensor in batch_constraints: - if self.representation == "ordered": - constraint_state = OrderedConstraintState.create(constraint_tensor) - elif self.representation == "unordered": - constraint_state = UnorderedConstraintState.create(constraint_tensor) - - self.constraint_states.append([constraint_state for i in range(beam_size)]) - - @torch.jit.export - def prune_sentences(self, batch_idxs: Tensor): - self.constraint_states = [ - self.constraint_states[i] for i in batch_idxs.tolist() - ] - - @torch.jit.export - def update_constraints(self, active_hypos: Tensor): - if self.constraint_states: - batch_size = active_hypos.size(0) - for sentid in range(batch_size): - self.constraint_states[sentid] = [ - self.constraint_states[sentid][i] for i in active_hypos[sentid] - ] - - @torch.jit.export - def step( - self, - step: int, - lprobs: Tensor, - scores: Optional[Tensor], - prev_output_tokens: Optional[Tensor] = None, - original_batch_idxs: Optional[Tensor] = None, - ): - """ - A constrained step builds a large candidates list from the following: - - the top 2 * {beam_size} items over the whole beam - - for each item in the beam - - the top {each_k} (default 1) - - all next constraints - We then compute the constrained state of each beam item, and assign - stripe codes: 0 to the best in each bank, 1 to the 2nd-best, and so - on. We then sort by (stripe, score), and truncate the list at - 2 * beam size. - - Args: - step: the decoder step - lprobs: (batch size, beam size, target vocab) - the target-vocab distributions for each item in the beam. - Retrun: A tuple of (scores, indices, beams, constraints) where: - scores: (batch, output beam size) - the scores of the chosen elements - indices: (batch, output beam size) - the target vocab indices of the chosen elements - beams: (batch, output beam size) - the 0-indexed hypothesis ids of the chosen elements - constraints: (batch, output beam size) - the new constraint states - """ - each_k = 1 - device = lprobs.device - - batch_size, beam_size, vocab_size = lprobs.size() - - self.num_cands = min( - # Just take the k-best. We'll get another k from the 1-best from each - # row, plus more from the constraints - beam_size * 2, - lprobs.view(batch_size, -1).size(1) - 1, # -1 so we never select pad - ) - - # STEP 0: Preliminary. 
Prevent EOS for unfinished hyps across all batch items - constraint_states = self.constraint_states - if constraint_states and step > 0: - not_finished_indices = [] - for sentno, sent_constraints in enumerate(constraint_states): - for beamno, state in enumerate(sent_constraints): - index = sentno * beam_size + beamno - if not state.finished: - not_finished_indices.append(index) - not_finished_indices = torch.tensor(not_finished_indices) - if not_finished_indices.numel() > 0: - lprobs.view(batch_size * beam_size, -1)[ - not_finished_indices, self.eos - ] = -math.inf - - if step == 0: - # at the first step all hypotheses are equally likely, so use - # only the first beam entry for each batch item - lprobs = lprobs[:, ::beam_size, :].contiguous() - else: - # make probs contain cumulative scores for each hypothesis - assert scores is not None - lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1) - - top_prediction = torch.topk( - lprobs.view(batch_size, -1), - self.num_cands, - ) - scores_buf, indices_buf = top_prediction - # Project back into relative indices and beams - beams_buf = indices_buf // vocab_size - indices_buf = indices_buf.fmod(vocab_size) - - # Short circuit if there are no constraints in this batch - if not constraint_states: - return scores_buf, indices_buf, beams_buf - - # STEP 1: get top-1 from each hypothesis across all sentences in the batch - if step > 0: - top_scores, top_indices = torch.topk( - lprobs.view(batch_size * beam_size, -1), - k=each_k, - dim=1, - ) - top_scores = top_scores.view(batch_size, -1) - top_indices = top_indices.view(batch_size, -1) - scores_buf = torch.cat((scores_buf, top_scores), dim=1) - indices_buf = torch.cat((indices_buf, top_indices), dim=1) - new_beams = torch.arange(0, beam_size, device=device).repeat(batch_size, 1) - beams_buf = torch.cat((beams_buf, new_beams), dim=1) - - # Now, process sentences in the batch one by one. - new_scores_buf = torch.zeros((batch_size, 2 * beam_size), device=device) - new_indices_buf = torch.zeros((batch_size, 2 * beam_size), device=device).long() - new_beams_buf = torch.zeros((batch_size, 2 * beam_size), device=device).long() - for sentno, states in enumerate(constraint_states): - scores, indices, beams, new_states = self.step_sentence( - step, - sentno, - lprobs[sentno], - constraint_states[sentno], - beams_buf[sentno].clone(), - indices_buf[sentno].clone(), - scores_buf[sentno].clone(), - ) - new_scores_buf[sentno] = scores - new_indices_buf[sentno] = indices - new_beams_buf[sentno] = beams - self.constraint_states[sentno] = new_states - - return new_scores_buf, new_indices_buf, new_beams_buf - - @torch.jit.export - def step_sentence( - self, - step: int, - sentno: int, - lprobs: Tensor, - constraint_states: List[List[ConstraintState]], - beams_buf: Tensor, - indices_buf: Tensor, - scores_buf: Tensor, - ): - """Does per-sentence processing. Adds all constraints for each - hypothesis to the list of candidates; then removes duplicates, - sorts, and dynamically stripes across the banks. All tensor inputs - are collapsed to those pertaining to a single input sentence. 
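The `step()` docstring above describes the dynamic beam allocation: candidates are grouped into "banks" by how many constraint tokens they have already produced, the best item of each bank is taken first (highest bank leading), then the second-best of each bank, and so on, before truncating to the beam size. A plain-Python sketch of that round-robin ordering, independent of the tensor implementation below:

```python
def stripe_order(candidates, beam_size):
    """candidates: list of (bank, score); higher bank = more constraints met."""
    by_bank = {}
    for bank, score in candidates:
        by_bank.setdefault(bank, []).append((bank, score))
    for items in by_bank.values():
        items.sort(key=lambda x: x[1], reverse=True)      # best score first
    max_rank = max(len(items) for items in by_bank.values())
    ordered = []
    for rank in range(max_rank):                          # stripe 0, stripe 1, ...
        for bank in sorted(by_bank, reverse=True):        # highest bank first
            if rank < len(by_bank[bank]):
                ordered.append(by_bank[bank][rank])
    return ordered[:beam_size]

# stripe_order([(3, -1.0), (3, -1.2), (2, -0.5), (0, -0.1)], beam_size=3)
# -> [(3, -1.0), (2, -0.5), (0, -0.1)]
```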
- """ - device = lprobs.device - - # STEP 2: Add all constraints for each beam item - for beamno, state in enumerate(constraint_states): - next_tokens = torch.tensor(list(state.next_tokens()), device=device).long() - if next_tokens.numel() != 0: - indices_buf = torch.cat((indices_buf, next_tokens)) - next_beams = ( - torch.tensor(beamno, device=device) - .repeat(next_tokens.size(0)) - .long() - ) - beams_buf = torch.cat((beams_buf, next_beams)) - next_values = lprobs[beamno].take(next_tokens.view(-1)) - scores_buf = torch.cat((scores_buf, next_values)) - - # At the 0th time step, there is just one beam item - if step == 0: - break - - # STEP 3: Compute the "bank" for each candidate. This is the - # number of constraints it's generated. We need this so that - # we can do round-robin allocation of the beam across these - # banks. If C is the number of constraints, we select the best - # item in bank C, then the best in bank C-1, etc, followed by - # the 2nd-best in bank C, the 2nd-best in bank C-1, etc, and so - # on, until the maximum beam size. We accomplish this by - # creating a sort key and striping across the banks. - - # Compute the new states for all candidates - cands_size = indices_buf.size(0) - constraint_states = [ - constraint_states[beams_buf[i]].advance(indices_buf[i]) - for i in range(cands_size) - ] - - banks = torch.tensor([state.bank for state in constraint_states], device=device) - - # STEP 4: Sort - num_constraint_tokens = len(state.tokens) - - # Sort by keys (bank, score) (i.e., sort banks together, and scores - # within banks). AFAIK pytorch doesn't support either stable sort or - # multi-key sorting, so we have to hack this. - MAX_SCORE = -100 - sort_key = (num_constraint_tokens - banks) * MAX_SCORE + scores_buf - sort_values, sort_indices = sort_key.sort(dim=0, descending=True) - scores_buf = scores_buf[sort_indices] - indices_buf = indices_buf[sort_indices] - beams_buf = beams_buf[sort_indices] - banks = banks[sort_indices] - - # Sort the constraints to follow suit - constraint_states = [constraint_states[i] for i in sort_indices] - - # STEP 5: Remove duplicates. The topk calls (overall and - # per-row) plus the per-row generation of constraints will - # produce duplicates. Here we remove them. - - def roll(t): - """Rolls a 1d tensor left by 1. - - [0, 1, 2, 3, 4] becomes [4, 0, 1, 2, 3] - """ - return torch.cat((t[-1].unsqueeze(0), t[0:-1]), dim=0) - - # We map candidates (beam, token_id) to a single dimension. - # This is then shifted by 1. We can then easily identify - # duplicates and create a mask that identifies unique - # extensions. - uniques_mask = beams_buf * (self.vocab_size + 1) + indices_buf - uniques_mask = roll(uniques_mask) != uniques_mask - - # Use the mask to pare down the data structures - scores_buf = torch.masked_select(scores_buf, uniques_mask) - indices_buf = torch.masked_select(indices_buf, uniques_mask) - beams_buf = torch.masked_select(beams_buf, uniques_mask) - banks = torch.masked_select(banks, uniques_mask) - i = 1 - for mask in uniques_mask[1:]: - if not mask: - constraint_states.pop(i) - i += mask - - # STEP 6: Assign IDs round-robin across banks, sort, and - # truncate. Now that the candidates are sorted by (bank, - # score) and uniqed, we dynamically allocate the {beam_size} - # beam by striping across the candidates. These stripes will - # be used as sort keys to do round-robin selection. This is - # accomplished in a single pass with offsets. 
Sorting by - # highest-banks (furthest-along hypotheses) first ensures - # progress through the constraints. - # - # e.g., BANKS: 3 3 3 2 2 2 2 1 1 1 0 0 - # OLD STRIPES: 0 1 2 0 1 2 3 0 1 2 0 1 - # NEW STRIPES: 0 1+4 2+8 0+1 1+5 2+9 3+11 0+2 1+6 2+10 0+3 1+7 - # = 0 5 10 1 6 11 13 2 7 12 3 8 - # - # Sorting by this then gives the following banks: - # - # 3 2 1 0 3 2 1 0 3 2 1 2 - # - # We'll take the top {beam_size} of these. - stripe_offsets = [offset * (len(banks) + 1) for offset in range(len(banks) + 1)] - stripes = torch.zeros_like(banks) - cur_bank_count = -1 - cur_bank = banks[0] - for i, bank in enumerate(banks): - if bank != cur_bank: - cur_bank_count = 0 - cur_bank = bank - else: - cur_bank_count += 1 - stripes[i] = num_constraint_tokens - bank + stripe_offsets[cur_bank_count] - - # STEP 7: Sort by the stripes values - sort_values, sort_indices = stripes.sort(dim=0) - scores_buf = scores_buf[sort_indices] - indices_buf = indices_buf[sort_indices] - beams_buf = beams_buf[sort_indices] - constraint_states = [constraint_states[i] for i in sort_indices] - - # STEP 8: Truncate to the candidates size! - scores_buf = scores_buf[: self.num_cands] - indices_buf = indices_buf[: self.num_cands] - beams_buf = beams_buf[: self.num_cands] - - return scores_buf, indices_buf, beams_buf, constraint_states - - -class LengthConstrainedBeamSearch(Search): - def __init__(self, tgt_dict, min_len_a, min_len_b, max_len_a, max_len_b): - super().__init__(tgt_dict) - self.min_len_a = min_len_a - self.min_len_b = min_len_b - self.max_len_a = max_len_a - self.max_len_b = max_len_b - self.beam = BeamSearch(tgt_dict) - self.needs_src_lengths = True - - def step( - self, - step: int, - lprobs, - scores, - prev_output_tokens: Optional[Tensor] = None, - original_batch_idxs: Optional[Tensor] = None, - ): - min_lens = self.min_len_a * self.src_lengths + self.min_len_b - max_lens = self.max_len_a * self.src_lengths + self.max_len_b - lprobs[step < min_lens, :, self.eos] = -math.inf - lprobs[step >= max_lens, :, self.eos] = 0 - return self.beam.step(step, lprobs, scores) - - -class DiverseBeamSearch(Search): - """Diverse Beam Search. - - See "Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence - Models" for details. - - We only implement the Hamming Diversity penalty here, which performed best - in the original paper. 
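In the `DiverseBeamSearch` implementation that follows, each beam group runs an ordinary beam step, but before group g > 0 takes its step, every token already selected by earlier groups has its log-probability reduced by `diversity_strength` times its selection count (the Hamming diversity penalty). A toy illustration of just the penalty:

```python
import torch

vocab_size, diversity_strength = 6, 0.5
lprobs_group = torch.zeros(1, vocab_size)                 # scores for the current group
tokens_chosen_by_earlier_groups = torch.tensor([[2, 2, 4]])

# diversity_buf counts how often each token was already picked this step
diversity_buf = torch.zeros(1, vocab_size)
diversity_buf.scatter_add_(
    1, tokens_chosen_by_earlier_groups,
    torch.ones_like(tokens_chosen_by_earlier_groups, dtype=torch.float),
)
# penalize repeats: lprobs - strength * count (alpha = -diversity_strength above)
penalized = lprobs_group + (-diversity_strength) * diversity_buf
print(penalized)  # token 2 penalized by 1.0, token 4 by 0.5
```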
- """ - - def __init__(self, tgt_dict, num_groups, diversity_strength): - super().__init__(tgt_dict) - self.num_groups = num_groups - self.diversity_strength = -diversity_strength - self.beam = BeamSearch(tgt_dict) - - @torch.jit.export - def step( - self, - step: int, - lprobs, - scores, - prev_output_tokens: Optional[Tensor] = None, - original_batch_idxs: Optional[Tensor] = None, - ): - bsz, beam_size, vocab_size = lprobs.size() - if beam_size % self.num_groups != 0: - raise ValueError( - "DiverseBeamSearch requires --beam to be divisible by the number of groups" - ) - - # initialize diversity penalty - diversity_buf = torch.zeros(lprobs[:, 0, :].size()).to(lprobs) - - scores_G, indices_G, beams_G = [], [], [] - for g in range(self.num_groups): - lprobs_g = lprobs[:, g :: self.num_groups, :] - scores_g = scores[:, g :: self.num_groups, :] if step > 0 else None - - # apply diversity penalty - if g > 0: - lprobs_g = torch.add( - lprobs_g, - other=diversity_buf.unsqueeze(1), - alpha=self.diversity_strength, - ) - else: - lprobs_g = lprobs_g.contiguous() - - scores_buf, indices_buf, beams_buf = self.beam.step( - step, lprobs_g, scores_g - ) - beams_buf.mul_(self.num_groups).add_(g) - - scores_G.append(scores_buf.clone()) - indices_G.append(indices_buf.clone()) - beams_G.append(beams_buf.clone()) - - # update diversity penalty - diversity_buf.scatter_add_( - 1, indices_buf, torch.ones(indices_buf.size()).to(diversity_buf) - ) - - # interleave results from different groups - scores_buf = torch.stack(scores_G, dim=2).view(bsz, -1) - indices_buf = torch.stack(indices_G, dim=2).view(bsz, -1) - beams_buf = torch.stack(beams_G, dim=2).view(bsz, -1) - return scores_buf, indices_buf, beams_buf - - -class Sampling(Search): - sampling_topk: int - sampling_topp: float - - def __init__(self, tgt_dict, sampling_topk=-1, sampling_topp=-1.0): - super().__init__(tgt_dict) - self.sampling_topk = sampling_topk - self.sampling_topp = sampling_topp - - def _sample_topp(self, lprobs): - """Sample among the smallest set of elements whose cumulative probability mass exceeds p. - - See `"The Curious Case of Neural Text Degeneration" - (Holtzman et al., 2019) `_. - - Args: - lprobs: (bsz x input_beam_size x vocab_size) - the model's log-probabilities over the vocabulary at the current step - - Return: A tuple of (trimed_probs, truncated_indices) where: - trimed_probs: (bsz x input_beam_size x ?) - the model's probabilities over the elements selected to sample from. The - width of the third dimension is determined by top-P. - truncated_indices: (bsz x input_beam_size x ?) - the indices of the chosen elements. - """ - probs = lprobs.exp_() - - # sort the last dimension (vocab dimension) in descending order - sorted_probs, sorted_indices = probs.sort(descending=True) - - # compute a mask to indicate the words to be included in the top-P set. - cumsum_probs = sorted_probs.cumsum(dim=2) - mask = cumsum_probs.lt(self.sampling_topp) - - # note that mask was computed by 'lt'. One more word needs to be included - # so that the cumulative probability mass can exceed p. - cumsum_mask = mask.cumsum(dim=2) - last_included = cumsum_mask[:, :, -1:] - last_included.clamp_(0, mask.size()[2] - 1) - mask = mask.scatter_(2, last_included, 1) - - # truncate unnecessary dims. 
- max_dim = last_included.max() - truncated_mask = mask[:, :, : max_dim + 1] - truncated_probs = sorted_probs[:, :, : max_dim + 1] - truncated_indices = sorted_indices[:, :, : max_dim + 1] - - # trim the words that are not in top-P by setting their probabilities - # to 0, so that they would not be sampled later. - trim_mask = ~truncated_mask - trimed_probs = truncated_probs.masked_fill_(trim_mask, 0) - return trimed_probs, truncated_indices - - @torch.jit.export - def step( - self, - step: int, - lprobs, - scores, - prev_output_tokens: Optional[Tensor] = None, - original_batch_idxs: Optional[Tensor] = None, - ): - bsz, beam_size, vocab_size = lprobs.size() - - if step == 0: - # at the first step all hypotheses are equally likely, so use - # only the first beam - lprobs = lprobs[:, ::beam_size, :].contiguous() - - if self.sampling_topp > 0: - # only sample from the smallest set of words whose cumulative probability mass exceeds p - probs, top_indices = self._sample_topp(lprobs) - elif self.sampling_topk > 0: - # only sample from top-k candidates - lprobs, top_indices = lprobs.topk(self.sampling_topk) - probs = lprobs.exp_() - else: - probs = lprobs.exp_() - - # dummy data to be consistent with true branch for type check - top_indices = torch.empty(0).to(probs) - # sample - if step == 0: - indices_buf = torch.multinomial( - probs.view(bsz, -1), - beam_size, - replacement=True, - ).view(bsz, beam_size) - else: - indices_buf = torch.multinomial( - probs.view(bsz * beam_size, -1), - 1, - replacement=True, - ).view(bsz, beam_size) - - if step == 0: - # expand to beam size - probs = probs.expand(bsz, beam_size, -1) - - # gather scores - scores_buf = torch.gather(probs, dim=2, index=indices_buf.unsqueeze(-1)) - scores_buf = scores_buf.log_().view(bsz, -1) - - # remap indices if using top-k or top-P sampling - if self.sampling_topk > 0 or self.sampling_topp > 0: - indices_buf = torch.gather( - top_indices.expand(bsz, beam_size, -1), - dim=2, - index=indices_buf.unsqueeze(-1), - ).squeeze(2) - - if step == 0: - beams_buf = indices_buf.new_zeros(bsz, beam_size) - else: - beams_buf = torch.arange(0, beam_size).to(indices_buf).repeat(bsz, 1) - # make scores cumulative - scores_buf.add_( - torch.gather(scores[:, :, step - 1], dim=1, index=beams_buf) - ) - - return scores_buf, indices_buf, beams_buf - - -class DiverseSiblingsSearch(Search): - """ - Beam search with diverse siblings. - - See "A Simple, Fast Diverse Decoding Algorithm for Neural Generation" for details. - https://arxiv.org/abs/1611.08562 - - 1/ Calculate hypotheses for each beam - 2/ Intra-sibling ordering - 3/ Rewrite scores - 4/ Choose top K hypotheses - - if diversity_rate == 0 is equivalent to BeamSearch - """ - - def __init__(self, tgt_dict, diversity_rate): - super().__init__(tgt_dict) - self.diversity_rate = diversity_rate - self.beam = BeamSearch(tgt_dict) - - def step( - self, - step: int, - lprobs, - scores, - prev_output_tokens: Optional[Tensor] = None, - original_batch_idxs: Optional[Tensor] = None, - ): - bsz, beam_size, vocab_size = lprobs.size() - k = min( - # Take the best 2 x beam_size predictions. We'll choose the first - # beam_size of these which don't predict eos to continue with. 
- beam_size * 2, - lprobs.view(bsz, -1).size(1) - 1, # -1 so we never select pad - ) - s_list: List[Tensor] - i_list: List[Tensor] - s_list = [torch.empty(0).to(lprobs) for i in range(beam_size)] - i_list = [torch.LongTensor().to(device=lprobs.device) for i in range(beam_size)] - sibling_score = torch.arange(1, k + 1).to(lprobs) * self.diversity_rate - - if step == 0: - return self.beam.step(step, lprobs, scores) - lprobs.add_(scores[:, :, step - 1].unsqueeze(-1)) - - # 1/ Calculate hypotheses for each beam - for i in range(beam_size): - torch.topk(lprobs[:, i, :].view(bsz, -1), k, out=(s_list[i], i_list[i])) - i_list[i].fmod_(vocab_size) - - # 2/ Intra-sibling ordering by default from topk + 3/ Rewrite scores - s_list[i].sub_(sibling_score) - - # 4/ Choose top K hypotheses - indices = torch.stack(i_list, dim=1).view(bsz, -1) - - final_scores = torch.empty(0).to(lprobs) - final_indices = torch.LongTensor().to(device=lprobs.device) - final_beams = torch.LongTensor().to(device=lprobs.device) - (final_scores, final_indices) = torch.topk( - torch.stack(s_list, dim=1).view(bsz, -1), - k, - ) - - final_beams = final_indices // k - - for i in range(bsz): - final_indices[i] = indices[i][final_indices[i]] - - return final_scores, final_indices, final_beams diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/sequence_generator.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/sequence_generator.py deleted file mode 100644 index a3420f1d10f8401585e770e01bcdbd1179b3842e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/sequence_generator.py +++ /dev/null @@ -1,1026 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import math -from typing import Dict, List, Optional - -import torch -import torch.nn as nn -from fairseq import search, utils -from fairseq.data import data_utils -from fairseq.models import FairseqIncrementalDecoder -from fairseq.models.fairseq_encoder import EncoderOut -from torch import Tensor - - -class SequenceGenerator(nn.Module): - def __init__( - self, - models, - tgt_dict, - beam_size=1, - max_len_a=0, - max_len_b=200, - min_len=1, - normalize_scores=True, - len_penalty=1.0, - unk_penalty=0.0, - temperature=1.0, - match_source_len=False, - no_repeat_ngram_size=0, - search_strategy=None, - eos=None, - symbols_to_strip_from_output=None, - lm_model=None, - lm_weight=1.0, - ): - """Generates translations of a given source sentence. 
- - Args: - models (List[~fairseq.models.FairseqModel]): ensemble of models, - currently support fairseq.models.TransformerModel for scripting - beam_size (int, optional): beam width (default: 1) - max_len_a/b (int, optional): generate sequences of maximum length - ax + b, where x is the source length - min_len (int, optional): the minimum length of the generated output - (not including end-of-sentence) - normalize_scores (bool, optional): normalize scores by the length - of the output (default: True) - len_penalty (float, optional): length penalty, where <1.0 favors - shorter, >1.0 favors longer sentences (default: 1.0) - unk_penalty (float, optional): unknown word penalty, where <0 - produces more unks, >0 produces fewer (default: 0.0) - temperature (float, optional): temperature, where values - >1.0 produce more uniform samples and values <1.0 produce - sharper samples (default: 1.0) - match_source_len (bool, optional): outputs should match the source - length (default: False) - """ - super().__init__() - if isinstance(models, EnsembleModel): - self.model = models - else: - self.model = EnsembleModel(models) - self.tgt_dict = tgt_dict - self.pad = tgt_dict.pad() - self.unk = tgt_dict.unk() - self.eos = tgt_dict.eos() if eos is None else eos - self.symbols_to_strip_from_output = ( - symbols_to_strip_from_output.union({self.eos}) - if symbols_to_strip_from_output is not None - else {self.eos} - ) - self.vocab_size = len(tgt_dict) - self.beam_size = beam_size - # the max beam size is the dictionary size - 1, since we never select pad - self.beam_size = min(beam_size, self.vocab_size - 1) - self.max_len_a = max_len_a - self.max_len_b = max_len_b - self.min_len = min_len - - self.normalize_scores = normalize_scores - self.len_penalty = len_penalty - self.unk_penalty = unk_penalty - self.temperature = temperature - self.match_source_len = match_source_len - self.no_repeat_ngram_size = no_repeat_ngram_size - assert temperature > 0, "--temperature must be greater than 0" - - self.search = ( - search.BeamSearch(tgt_dict) if search_strategy is None else search_strategy - ) - # We only need to set src_lengths in LengthConstrainedBeamSearch. - # As a module attribute, setting it would break in multithread - # settings when the model is shared. - self.should_set_src_lengths = ( - hasattr(self.search, "needs_src_lengths") and self.search.needs_src_lengths - ) - - self.model.eval() - - self.lm_model = lm_model - self.lm_weight = lm_weight - if self.lm_model is not None: - self.lm_model.eval() - - def cuda(self): - self.model.cuda() - return self - - @torch.no_grad() - def forward( - self, - sample: Dict[str, Dict[str, Tensor]], - prefix_tokens: Optional[Tensor] = None, - bos_token: Optional[int] = None, - ): - """Generate a batch of translations. - - Args: - sample (dict): batch - prefix_tokens (torch.LongTensor, optional): force decoder to begin - with these tokens - bos_token (int, optional): beginning of sentence token - (default: self.eos) - """ - return self._generate(sample, prefix_tokens, bos_token=bos_token) - - # TODO(myleott): unused, deprecate after pytorch-translate migration - def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None): - """Iterate over a batched dataset and yield individual translations. 
- Args: - cuda (bool, optional): use GPU for generation - timer (StopwatchMeter, optional): time generations - """ - for sample in data_itr: - s = utils.move_to_cuda(sample) if cuda else sample - if "net_input" not in s: - continue - input = s["net_input"] - # model.forward normally channels prev_output_tokens into the decoder - # separately, but SequenceGenerator directly calls model.encoder - encoder_input = { - k: v for k, v in input.items() if k != "prev_output_tokens" - } - if timer is not None: - timer.start() - with torch.no_grad(): - hypos = self.generate(encoder_input) - if timer is not None: - timer.stop(sum(len(h[0]["tokens"]) for h in hypos)) - for i, id in enumerate(s["id"].data): - # remove padding - src = utils.strip_pad(input["src_tokens"].data[i, :], self.pad) - ref = ( - utils.strip_pad(s["target"].data[i, :], self.pad) - if s["target"] is not None - else None - ) - yield id, src, ref, hypos[i] - - @torch.no_grad() - def generate(self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs): - """Generate translations. Match the api of other fairseq generators. - - Args: - models (List[~fairseq.models.FairseqModel]): ensemble of models - sample (dict): batch - prefix_tokens (torch.LongTensor, optional): force decoder to begin - with these tokens - constraints (torch.LongTensor, optional): force decoder to include - the list of constraints - bos_token (int, optional): beginning of sentence token - (default: self.eos) - """ - return self._generate(sample, **kwargs) - - def _generate( - self, - sample: Dict[str, Dict[str, Tensor]], - prefix_tokens: Optional[Tensor] = None, - constraints: Optional[Tensor] = None, - bos_token: Optional[int] = None, - ): - incremental_states = torch.jit.annotate( - List[Dict[str, Dict[str, Optional[Tensor]]]], - [ - torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) - for i in range(self.model.models_size) - ], - ) - net_input = sample["net_input"] - - if "src_tokens" in net_input: - src_tokens = net_input["src_tokens"] - # length of the source text being the character length except EndOfSentence and pad - src_lengths = ( - (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).int().sum(dim=1) - ) - elif "source" in net_input: - src_tokens = net_input["source"] - src_lengths = ( - net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1) - if net_input["padding_mask"] is not None - else torch.tensor(src_tokens.size(-1)).to(src_tokens) - ) - else: - raise Exception("expected src_tokens or source in net input") - - # bsz: total number of sentences in beam - # Note that src_tokens may have more than 2 dimenions (i.e. audio features) - bsz, src_len = src_tokens.size()[:2] - beam_size = self.beam_size - - if constraints is not None and not self.search.supports_constraints: - raise NotImplementedError( - "Target-side constraints were provided, but search method doesn't support them" - ) - - # Initialize constraints, when active - self.search.init_constraints(constraints, beam_size) - - max_len: int = -1 - if self.match_source_len: - max_len = src_lengths.max().item() - else: - max_len = min( - int(self.max_len_a * src_len + self.max_len_b), - # exclude the EOS marker - self.model.max_decoder_positions() - 1, - ) - assert ( - self.min_len <= max_len - ), "min_len cannot be larger than max_len, please adjust these!" 
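The length budget computed above is either tied directly to the source length (`match_source_len`) or an affine function of it, capped so the decoder never exceeds its maximum positions minus the EOS slot. A scalar restatement of that computation, for clarity:

```python
def max_target_len(src_len, max_len_a, max_len_b, max_decoder_positions,
                   match_source_len=False):
    """Mirror of the max_len computation above (scalar version)."""
    if match_source_len:
        return src_len
    # generate at most a*x + b tokens, but leave room for the EOS marker
    return min(int(max_len_a * src_len + max_len_b), max_decoder_positions - 1)

# e.g. max_target_len(25, max_len_a=0, max_len_b=200, max_decoder_positions=1024) -> 200
```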
- # compute the encoder output for each beam - encoder_outs = self.model.forward_encoder(net_input) - - # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores - new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) - new_order = new_order.to(src_tokens.device).int() - encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order, bsz) - # ensure encoder_outs is a List. - assert encoder_outs is not None - - # initialize buffers - scores = ( - torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float() - ) # +1 for eos; pad is never chosen for scoring - tokens = ( - torch.zeros(bsz * beam_size, max_len + 2) - .to(src_tokens) - .int() - .fill_(self.pad) - ) # +2 for eos and pad - tokens[:, 0] = self.eos if bos_token is None else bos_token - attn: Optional[Tensor] = None - - # A list that indicates candidates that should be ignored. - # For example, suppose we're sampling and have already finalized 2/5 - # samples. Then cands_to_ignore would mark 2 positions as being ignored, - # so that we only finalize the remaining 3 samples. - cands_to_ignore = ( - torch.zeros(bsz, beam_size).to(src_tokens).eq(-1) - ) # forward and backward-compatible False mask - - # list of completed sentences - finalized = torch.jit.annotate( - List[List[Dict[str, Tensor]]], - [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)], - ) # contains lists of dictionaries of infomation about the hypothesis being finalized at each step - - finished = [ - False for i in range(bsz) - ] # a boolean array indicating if the sentence at the index is finished or not - num_remaining_sent = bsz # number of sentences remaining - - # number of candidate hypos per step - cand_size = 2 * beam_size # 2 x beam size in case half are EOS - - # offset arrays for converting between different indexing schemes - bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens).cpu() - cand_offsets = torch.arange(0, cand_size).type_as(tokens).cpu() - - reorder_state: Optional[Tensor] = None - batch_idxs: Optional[Tensor] = None - - original_batch_idxs: Optional[Tensor] = None - if "id" in sample and isinstance(sample["id"], Tensor): - original_batch_idxs = sample["id"].cpu() - else: - original_batch_idxs = torch.arange(0, bsz).type_as(tokens).cpu() - - for step in range(max_len + 1): # one extra step for EOS marker - # reorder decoder internal states based on the prev choice of beams - # print(f'step: {step}') - if reorder_state is not None: - if batch_idxs is not None: - # update beam indices to take into account removed sentences - corr = (batch_idxs - torch.arange(batch_idxs.numel()).type_as( - batch_idxs)).npu() - reorder_state.view(-1, beam_size).add_( - corr.unsqueeze(-1) * beam_size - ) - original_batch_idxs = original_batch_idxs[batch_idxs] - self.model.reorder_incremental_state(incremental_states, reorder_state) - encoder_outs = self.model.reorder_encoder_out( - encoder_outs, reorder_state - ) - - lprobs, avg_attn_scores = self.model.forward_decoder( - tokens[:, : step + 1], - encoder_outs, - incremental_states, - self.temperature, - ) - - if self.lm_model is not None: - lm_out = self.lm_model(tokens[:, : step + 1]) - probs = self.lm_model.get_normalized_probs( - lm_out, log_probs=True, sample=None - ) - probs = probs[:, -1, :] * self.lm_weight - lprobs += probs - - # lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs) - - lprobs[:, self.pad] = -65504.0 # never select pad - lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty - - # 
handle max length constraint - if step >= max_len: - lprobs[:, : self.eos] = -65504.0 - lprobs[:, self.eos + 1 :] = -65504.0 - - # handle prefix tokens (possibly with different lengths) - if ( - prefix_tokens is not None - and step < prefix_tokens.size(1) - and step < max_len - ): - lprobs, tokens, scores = self._prefix_tokens( - step, lprobs, scores, tokens, prefix_tokens, beam_size - ) - elif step < self.min_len: - # minimum length constraint (does not apply if using prefix_tokens) - lprobs[:, self.eos] = -65504.0 - - # Record attention scores, only support avg_attn_scores is a Tensor - if avg_attn_scores is not None: - if attn is None: - attn = torch.empty( - bsz * beam_size, avg_attn_scores.size(1), max_len + 2 - ).to(scores) - attn[:, :, step + 1].copy_(avg_attn_scores) - - scores = scores.type_as(lprobs) - eos_bbsz_idx = torch.empty(0).to( - tokens - ) # indices of hypothesis ending with eos (finished sentences) - eos_scores = torch.empty(0).to( - scores - ) # scores of hypothesis ending with eos (finished sentences) - - if self.should_set_src_lengths: - self.search.set_src_lengths(src_lengths) - - if self.no_repeat_ngram_size > 0: - lprobs = self._no_repeat_ngram(tokens, lprobs, bsz, beam_size, step) - - # Shape: (batch, cand_size) - cand_scores, cand_indices, cand_beams = self.search.step( - step, - lprobs.view(bsz, -1, self.vocab_size), - scores.view(bsz, beam_size, -1)[:, :, :step], - tokens[:, : step + 1], - original_batch_idxs, - ) - scores = scores.cpu() - cands_to_ignore = cands_to_ignore.cpu() - attn = attn.cpu() - cand_scores = cand_scores.cpu() - cand_indices = cand_indices.cpu() - cand_beams = cand_beams.cpu() - # cand_bbsz_idx contains beam indices for the top candidate - # hypotheses, with a range of values: [0, bsz*beam_size), - # and dimensions: [bsz, cand_size] - cand_bbsz_idx = cand_beams.add(bbsz_offsets) - - # finalize hypotheses that end in eos - # Shape of eos_mask: (batch size, beam size) - eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-65504.0) - eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask) - - # only consider eos when it's among the top beam_size indices - # Now we know what beam item(s) to finish - # Shape: 1d list of absolute-numbered - eos_bbsz_idx = torch.masked_select( - cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] - ) - - finalized_sents: List[int] = [] - if eos_bbsz_idx.numel() > 0: - eos_scores = torch.masked_select( - cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size] - ) - - finalized_sents = self.finalize_hypos( - step, - eos_bbsz_idx, - eos_scores, - tokens.cpu(), - scores, - finalized, - finished, - beam_size, - attn, - src_lengths.cpu(), - max_len, - ) - num_remaining_sent -= len(finalized_sents) - - assert num_remaining_sent >= 0 - if num_remaining_sent == 0: - break - if self.search.stop_on_max_len and step >= max_len: - break - assert step < max_len - - # Remove finalized sentences (ones for which {beam_size} - # finished hypotheses have been generated) from the batch. 
- if len(finalized_sents) > 0: - new_bsz = bsz - len(finalized_sents) - - # construct batch_idxs which holds indices of batches to keep for the next pass - batch_mask = torch.ones( - bsz, dtype=torch.bool, device=cand_indices.device - ) - batch_mask[finalized_sents] = False - # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it - batch_idxs = torch.arange( - bsz, device=cand_indices.device - ).masked_select(batch_mask).long() - - # Choose the subset of the hypothesized constraints that will continue - self.search.prune_sentences(batch_idxs) - - eos_mask = eos_mask[batch_idxs] - cand_beams = cand_beams[batch_idxs] - bbsz_offsets.resize_(new_bsz, 1) - cand_bbsz_idx = cand_beams.add(bbsz_offsets) - cand_scores = cand_scores[batch_idxs] - cand_indices = cand_indices[batch_idxs] - - if prefix_tokens is not None: - prefix_tokens = prefix_tokens[batch_idxs] - src_lengths = src_lengths[batch_idxs] - cands_to_ignore = cands_to_ignore[batch_idxs] - - scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) - tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) - if attn is not None: - attn = attn.view(bsz, -1)[batch_idxs].view( - new_bsz * beam_size, attn.size(1), -1 - ) - bsz = new_bsz - else: - batch_idxs = None - - # Set active_mask so that values > cand_size indicate eos hypos - # and values < cand_size indicate candidate active hypos. - # After, the min values per row are the top candidate active hypos - - # Rewrite the operator since the element wise or is not supported in torchscript. - - eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size])) - active_mask = torch.add( - eos_mask.type_as(cand_offsets) * cand_size, - cand_offsets[: eos_mask.size(1)], - ) - - # get the top beam_size active hypotheses, which are just - # the hypos with the smallest values in active_mask. - # {active_hypos} indicates which {beam_size} hypotheses - # from the list of {2 * beam_size} candidates were - # selected. Shapes: (batch size, beam size) - new_cands_to_ignore, active_hypos = torch.topk( - active_mask, k=beam_size, dim=1, largest=False - ) - - # update cands_to_ignore to ignore any finalized hypos. - cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] - # Make sure there is at least one active item for each sentence in the batch. - assert (~cands_to_ignore).any(dim=1).all() - - # update cands_to_ignore to ignore any finalized hypos - - # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam - # can be selected more than once). 
- active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) - active_scores = torch.gather(cand_scores, dim=1, index=active_hypos) - - active_bbsz_idx = active_bbsz_idx.view(-1) - active_scores = active_scores.view(-1) - - # copy tokens and scores for active hypotheses - - # Set the tokens for each beam (can select the same row more than once) - tokens[:, : step + 1] = torch.index_select( - tokens[:, : step + 1].cpu(), dim=0, index=active_bbsz_idx - ) - # Select the next token for each of them - tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather( - cand_indices, dim=1, index=active_hypos - ) - if step > 0: - scores[:, :step] = torch.index_select( - scores[:, :step], dim=0, index=active_bbsz_idx - ) - scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather( - cand_scores, dim=1, index=active_hypos - ) - - # Update constraints based on which candidates were selected for the next beam - self.search.update_constraints(active_hypos) - - # copy attention for active hypotheses - if attn is not None: - attn[:, :, : step + 2] = torch.index_select( - attn[:, :, : step + 2], dim=0, index=active_bbsz_idx - ) - - # reorder incremental state in decoder - reorder_state = active_bbsz_idx - reorder_state = reorder_state.npu() - - # sort by score descending - for sent in range(len(finalized)): - scores = torch.tensor( - [float(elem["score"].item()) for elem in finalized[sent]] - ) - _, sorted_scores_indices = torch.sort(scores, descending=True) - finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices] - finalized[sent] = torch.jit.annotate( - List[Dict[str, Tensor]], finalized[sent] - ) - return finalized - - def _prefix_tokens( - self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int - ): - """Handle prefix tokens""" - prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1) - prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1)) - prefix_mask = prefix_toks.ne(self.pad) - lprobs[prefix_mask] = torch.tensor(-65504.0).to(lprobs) - lprobs[prefix_mask] = lprobs[prefix_mask].scatter( - -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs[prefix_mask] - ) - # if prefix includes eos, then we should make sure tokens and - # scores are the same across all beams - eos_mask = prefix_toks.eq(self.eos) - if eos_mask.any(): - # validate that the first beam matches the prefix - first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[ - :, 0, 1 : step + 1 - ] - eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0] - target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step] - assert (first_beam == target_prefix).all() - - # copy tokens, scores and lprobs from the first beam to all beams - tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim, beam_size) - scores = self.replicate_first_beam(scores, eos_mask_batch_dim, beam_size) - lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim, beam_size) - return lprobs, tokens, scores - - def replicate_first_beam(self, tensor, mask, beam_size: int): - tensor = tensor.view(-1, beam_size, tensor.size(-1)) - tensor[mask] = tensor[mask][:, :1, :] - return tensor.view(-1, tensor.size(-1)) - - def finalize_hypos( - self, - step: int, - bbsz_idx, - eos_scores, - tokens, - scores, - finalized: List[List[Dict[str, Tensor]]], - finished: List[bool], - beam_size: int, - attn: Optional[Tensor], - src_lengths, - max_len: int, - ): - """Finalize hypothesis, store finalized information in `finalized`, and change `finished` accordingly. 
- A sentence is finalized when {beam_size} finished items have been collected for it. - - Returns number of sentences (not beam items) being finalized. - These will be removed from the batch and not processed further. - Args: - bbsz_idx (Tensor): - """ - assert bbsz_idx.numel() == eos_scores.numel() - - # clone relevant token and attention tensors. - # tokens is (batch * beam, max_len). So the index_select - # gets the newly EOS rows, then selects cols 1..{step + 2} - tokens_clone = tokens.index_select(0, bbsz_idx)[ - :, 1 : step + 2 - ] # skip the first index, which is EOS - - tokens_clone[:, step] = self.eos - attn_clone = ( - attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2] - if attn is not None - else None - ) - - # compute scores per token position - pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1] - pos_scores[:, step] = eos_scores - # convert from cumulative to per-position scores - pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] - - # normalize sentence-level scores - if self.normalize_scores: - eos_scores /= (step + 1) ** self.len_penalty - - # cum_unfin records which sentences in the batch are finished. - # It helps match indexing between (a) the original sentences - # in the batch and (b) the current, possibly-reduced set of - # sentences. - cum_unfin: List[int] = [] - prev = 0 - for f in finished: - if f: - prev += 1 - else: - cum_unfin.append(prev) - - # set() is not supported in script export - - # The keys here are of the form "{sent}_{unfin_idx}", where - # "unfin_idx" is the index in the current (possibly reduced) - # list of sentences, and "sent" is the index in the original, - # unreduced batch - sents_seen: Dict[str, Optional[Tensor]] = {} - - # For every finished beam item - for i in range(bbsz_idx.size()[0]): - idx = bbsz_idx[i] - score = eos_scores[i] - # sentence index in the current (possibly reduced) batch - unfin_idx = idx // beam_size - # sentence index in the original (unreduced) batch - sent = unfin_idx + cum_unfin[unfin_idx] - # print(f"{step} FINISHED {idx} {score} {sent}={unfin_idx} {cum_unfin}") - # Cannot create dict for key type '(int, int)' in torchscript. 
- # The workaround is to cast int to string - seen = str(sent.item()) + "_" + str(unfin_idx.item()) - if seen not in sents_seen: - sents_seen[seen] = None - - if self.match_source_len and step > src_lengths[unfin_idx]: - score = torch.tensor(-65504.0).to(score) - - # An input sentence (among those in a batch) is finished when - # beam_size hypotheses have been collected for it - if len(finalized[sent]) < beam_size: - if attn_clone is not None: - # remove padding tokens from attn scores - hypo_attn = attn_clone[i] - else: - hypo_attn = torch.empty(0) - - finalized[sent].append( - { - "tokens": tokens_clone[i], - "score": score, - "attention": hypo_attn, # src_len x tgt_len - "alignment": torch.empty(0), - "positional_scores": pos_scores[i], - } - ) - - newly_finished: List[int] = [] - - for seen in sents_seen.keys(): - # check termination conditions for this sentence - sent: int = int(float(seen.split("_")[0])) - unfin_idx: int = int(float(seen.split("_")[1])) - - if not finished[sent] and self.is_finished( - step, unfin_idx, max_len, len(finalized[sent]), beam_size - ): - finished[sent] = True - newly_finished.append(unfin_idx) - - return newly_finished - - def is_finished( - self, - step: int, - unfin_idx: int, - max_len: int, - finalized_sent_len: int, - beam_size: int, - ): - """ - Check whether decoding for a sentence is finished, which - occurs when the list of finalized sentences has reached the - beam size, or when we reach the maximum length. - """ - assert finalized_sent_len <= beam_size - if finalized_sent_len == beam_size or step == max_len: - return True - return False - - def calculate_banned_tokens( - self, - tokens, - step: int, - gen_ngrams: List[Dict[str, List[int]]], - no_repeat_ngram_size: int, - bbsz_idx: int, - ): - tokens_list: List[int] = tokens[ - bbsz_idx, step + 2 - no_repeat_ngram_size : step + 1 - ].tolist() - # before decoding the next token, prevent decoding of ngrams that have already appeared - ngram_index = ",".join([str(x) for x in tokens_list]) - return gen_ngrams[bbsz_idx].get(ngram_index, torch.jit.annotate(List[int], [])) - - def transpose_list(self, l: List[List[int]]): - # GeneratorExp aren't supported in TS so ignoring the lint - min_len = min([len(x) for x in l]) # noqa - l2 = [[row[i] for row in l] for i in range(min_len)] - return l2 - - def _no_repeat_ngram(self, tokens, lprobs, bsz: int, beam_size: int, step: int): - # for each beam and batch sentence, generate a list of previous ngrams - gen_ngrams: List[Dict[str, List[int]]] = [ - torch.jit.annotate(Dict[str, List[int]], {}) - for bbsz_idx in range(bsz * beam_size) - ] - cpu_tokens = tokens.cpu() - for bbsz_idx in range(bsz * beam_size): - gen_tokens: List[int] = cpu_tokens[bbsz_idx].tolist() - for ngram in self.transpose_list( - [gen_tokens[i:] for i in range(self.no_repeat_ngram_size)] - ): - key = ",".join([str(x) for x in ngram[:-1]]) - gen_ngrams[bbsz_idx][key] = gen_ngrams[bbsz_idx].get( - key, torch.jit.annotate(List[int], []) - ) + [ngram[-1]] - - if step + 2 - self.no_repeat_ngram_size >= 0: - # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - banned_tokens = [ - self.calculate_banned_tokens( - tokens, step, gen_ngrams, self.no_repeat_ngram_size, bbsz_idx - ) - for bbsz_idx in range(bsz * beam_size) - ] - else: - banned_tokens = [ - torch.jit.annotate(List[int], []) for bbsz_idx in range(bsz * beam_size) - ] - for bbsz_idx in range(bsz * beam_size): - lprobs[bbsz_idx][ - torch.tensor(banned_tokens[bbsz_idx]).int() - ] = torch.tensor(-65504.0).to(lprobs) - 
return lprobs - - -class EnsembleModel(nn.Module): - """A wrapper around an ensemble of models.""" - - def __init__(self, models): - super().__init__() - self.models_size = len(models) - # method '__len__' is not supported in ModuleList for torch script - self.single_model = models[0] - self.models = nn.ModuleList(models) - - self.has_incremental: bool = False - if all( - hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder) - for m in models - ): - self.has_incremental = True - - def forward(self): - pass - - def has_encoder(self): - return hasattr(self.single_model, "encoder") - - def has_incremental_states(self): - return self.has_incremental - - def max_decoder_positions(self): - return min([m.max_decoder_positions() for m in self.models]) - - @torch.jit.export - def forward_encoder(self, net_input: Dict[str, Tensor]): - if not self.has_encoder(): - return None - return [model.encoder.forward_torchscript(net_input) for model in self.models] - - @torch.jit.export - def forward_decoder( - self, - tokens, - encoder_outs: List[EncoderOut], - incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], - temperature: float = 1.0, - ): - log_probs = [] - avg_attn: Optional[Tensor] = None - encoder_out: Optional[EncoderOut] = None - for i, model in enumerate(self.models): - if self.has_encoder(): - encoder_out = encoder_outs[i] - # decode each model - if self.has_incremental_states(): - decoder_out = model.decoder.forward( - tokens, - encoder_out=encoder_out, - incremental_state=incremental_states[i], - ) - else: - decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out) - - attn: Optional[Tensor] = None - decoder_len = len(decoder_out) - if decoder_len > 1 and decoder_out[1] is not None: - if isinstance(decoder_out[1], Tensor): - attn = decoder_out[1] - else: - attn_holder = decoder_out[1]["attn"] - if isinstance(attn_holder, Tensor): - attn = attn_holder - elif attn_holder is not None: - attn = attn_holder[0] - if attn is not None: - attn = attn[:, -1, :] - - decoder_out_tuple = ( - decoder_out[0].div_(temperature), - None if decoder_len <= 1 else decoder_out[1], - ) - - probs = model.get_normalized_probs( - decoder_out_tuple, log_probs=True, sample=None - ) - - if self.models_size == 1: - return probs, attn - - log_probs.append(probs) - if attn is not None: - if avg_attn is None: - avg_attn = attn - else: - avg_attn.add_(attn) - - avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log( - self.models_size - ) - - if avg_attn is not None: - avg_attn.div_(self.models_size) - return avg_probs, avg_attn - - @torch.jit.export - def reorder_encoder_out(self, encoder_outs: Optional[List[EncoderOut]], new_order, bsz=None): - """ - Reorder encoder output according to *new_order*. 
- - Args: - encoder_out: output from the ``forward()`` method - new_order (LongTensor): desired order - - Returns: - *encoder_out* rearranged according to *new_order* - """ - def reshape_encoder_outs(encoder_outs): - if bsz is not None: - _, ori_len = encoder_outs.encoder_out.size() - new_encoder_out = encoder_outs.encoder_out.reshape(bsz, -1, ori_len) - else: - new_encoder_out = encoder_outs.encoder_out - return EncoderOut(encoder_out=new_encoder_out, - encoder_padding_mask=encoder_outs.encoder_padding_mask, - encoder_embedding=encoder_outs.encoder_embedding, - encoder_states=encoder_outs.encoder_states, - src_tokens=encoder_outs.src_tokens, - src_lengths=encoder_outs.src_lengths) - new_outs: List[EncoderOut] = [] - if not self.has_encoder(): - return new_outs - for i, model in enumerate(self.models): - assert encoder_outs is not None - reshape_encoder_out = reshape_encoder_outs(encoder_outs[i]) - new_outs.append( - model.encoder.reorder_encoder_out(reshape_encoder_out, new_order) - ) - return new_outs - - @torch.jit.export - def reorder_incremental_state( - self, - incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], - new_order, - ): - if not self.has_incremental_states(): - return - for i, model in enumerate(self.models): - model.decoder.reorder_incremental_state_scripting( - incremental_states[i], new_order - ) - - -class SequenceGeneratorWithAlignment(SequenceGenerator): - def __init__(self, models, tgt_dict, left_pad_target=False, **kwargs): - """Generates translations of a given source sentence. - - Produces alignments following "Jointly Learning to Align and - Translate with Transformer Models" (Garg et al., EMNLP 2019). - - Args: - left_pad_target (bool, optional): Whether or not the - hypothesis should be left padded or not when they are - teacher forced for generating alignments. - """ - super().__init__(EnsembleModelWithAlignment(models), tgt_dict, **kwargs) - self.left_pad_target = left_pad_target - - @torch.no_grad() - def generate(self, models, sample, **kwargs): - finalized = super()._generate(sample, **kwargs) - - src_tokens = sample["net_input"]["src_tokens"] - bsz = src_tokens.shape[0] - beam_size = self.beam_size - ( - src_tokens, - src_lengths, - prev_output_tokens, - tgt_tokens, - ) = self._prepare_batch_for_alignment(sample, finalized) - if any(getattr(m, "full_context_alignment", False) for m in self.model.models): - attn = self.model.forward_align(src_tokens, src_lengths, prev_output_tokens) - else: - attn = [ - finalized[i // beam_size][i % beam_size]["attention"].transpose(1, 0) - for i in range(bsz * beam_size) - ] - - if src_tokens.device != "cpu": - src_tokens = src_tokens.to("cpu") - tgt_tokens = tgt_tokens.to("cpu") - attn = [i.to("cpu") for i in attn] - - # Process the attn matrix to extract hard alignments. 
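The loop that follows hands each hypothesis's attention matrix to `utils.extract_hard_alignment`, whose implementation is not reproduced in this diff. The sketch below is therefore only an illustrative stand-in for the idea: align each ordinary target position to the source position with the largest attention weight, skipping pad and EOS. The helper name and the toy token ids are made up.

```python
import torch

def hard_alignment_sketch(attn, src_tokens, tgt_tokens, pad, eos):
    """Illustrative only: (src_len x tgt_len) attention -> list of (src, tgt) pairs."""
    src_valid = (src_tokens != pad) & (src_tokens != eos)
    tgt_valid = (tgt_tokens != pad) & (tgt_tokens != eos)
    alignment = []
    for j in range(attn.size(1)):
        if not tgt_valid[j]:
            continue                      # never align pad/eos target positions
        col = attn[:, j].clone()
        col[~src_valid] = float("-inf")   # never align to pad/eos source tokens
        alignment.append((int(col.argmax()), j))
    return alignment

# Toy example with pad = 1 and eos = 2; attention is src_len x tgt_len.
attn = torch.tensor([[0.7, 0.1, 0.2],
                     [0.2, 0.8, 0.1],
                     [0.1, 0.1, 0.7]])
src = torch.tensor([5, 6, 2])
tgt = torch.tensor([7, 8, 2])
print(hard_alignment_sketch(attn, src, tgt, pad=1, eos=2))  # [(0, 0), (1, 1)]
```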
- for i in range(bsz * beam_size): - alignment = utils.extract_hard_alignment( - attn[i], src_tokens[i], tgt_tokens[i], self.pad, self.eos - ) - finalized[i // beam_size][i % beam_size]["alignment"] = alignment - return finalized - - def _prepare_batch_for_alignment(self, sample, hypothesis): - src_tokens = sample["net_input"]["src_tokens"] - bsz = src_tokens.shape[0] - src_tokens = ( - src_tokens[:, None, :] - .expand(-1, self.beam_size, -1) - .contiguous() - .view(bsz * self.beam_size, -1) - ) - src_lengths = sample["net_input"]["src_lengths"] - src_lengths = ( - src_lengths[:, None] - .expand(-1, self.beam_size) - .contiguous() - .view(bsz * self.beam_size) - ) - prev_output_tokens = data_utils.collate_tokens( - [beam["tokens"] for example in hypothesis for beam in example], - self.pad, - self.eos, - self.left_pad_target, - move_eos_to_beginning=True, - ) - tgt_tokens = data_utils.collate_tokens( - [beam["tokens"] for example in hypothesis for beam in example], - self.pad, - self.eos, - self.left_pad_target, - move_eos_to_beginning=False, - ) - return src_tokens, src_lengths, prev_output_tokens, tgt_tokens - - -class EnsembleModelWithAlignment(EnsembleModel): - """A wrapper around an ensemble of models.""" - - def __init__(self, models): - super().__init__(models) - - def forward_align(self, src_tokens, src_lengths, prev_output_tokens): - avg_attn = None - for model in self.models: - decoder_out = model(src_tokens, src_lengths, prev_output_tokens) - attn = decoder_out[1]["attn"][0] - if avg_attn is None: - avg_attn = attn - else: - avg_attn.add_(attn) - if len(self.models) > 1: - avg_attn.div_(len(self.models)) - return avg_attn diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/sequence_scorer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/sequence_scorer.py deleted file mode 100644 index 411d4df4445ef8dd3f1907ad56f9de6943d1fed8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/sequence_scorer.py +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import sys - -import torch -from fairseq import utils - - -class SequenceScorer(object): - """Scores the target for a given source sentence.""" - - def __init__( - self, - tgt_dict, - softmax_batch=None, - compute_alignment=False, - eos=None, - symbols_to_strip_from_output=None, - ): - self.pad = tgt_dict.pad() - self.eos = tgt_dict.eos() if eos is None else eos - self.softmax_batch = softmax_batch or sys.maxsize - assert self.softmax_batch > 0 - self.compute_alignment = compute_alignment - self.symbols_to_strip_from_output = ( - symbols_to_strip_from_output.union({self.eos}) - if symbols_to_strip_from_output is not None - else {self.eos} - ) - - @torch.no_grad() - def generate(self, models, sample, **kwargs): - """Score a batch of translations.""" - net_input = sample["net_input"] - - def batch_for_softmax(dec_out, target): - # assumes decoder_out[0] is the only thing needed (may not be correct for future models!) 
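The `batch_for_softmax` generator that continues below exists only to bound softmax memory by scoring the flattened positions in chunks of `softmax_batch`; the scoring itself reduces to gathering the log-probability of each gold token. A minimal, self-contained sketch of that core step on toy tensors (not the scorer class itself):

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
# Toy decoder output: 2 sentences, 4 target positions, vocabulary of 10.
logits = torch.randn(2, 4, 10)
target = torch.tensor([[3, 1, 4, 1],
                       [5, 9, 2, 6]])

lprobs = F.log_softmax(logits, dim=-1)
# Same idea as gather_target_probs: pick the log-prob of the gold token at
# every position, giving a (bsz, tsz) matrix of positional scores.
pos_scores = lprobs.gather(dim=2, index=target.unsqueeze(-1)).squeeze(-1)
# Length-normalised sentence score, analogous to the "score" field of a hypo.
sent_scores = pos_scores.sum(dim=1) / target.size(1)
print(pos_scores.shape, sent_scores.shape)  # torch.Size([2, 4]) torch.Size([2])
```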
- first, rest = dec_out[0], dec_out[1:] - bsz, tsz, dim = first.shape - if bsz * tsz < self.softmax_batch: - yield dec_out, target, True - else: - flat = first.contiguous().view(1, -1, dim) - flat_tgt = target.contiguous().view(flat.shape[:-1]) - s = 0 - while s < flat.size(1): - e = s + self.softmax_batch - yield (flat[:, s:e],) + rest, flat_tgt[:, s:e], False - s = e - - def gather_target_probs(probs, target): - probs = probs.gather( - dim=2, - index=target.unsqueeze(-1), - ) - return probs - - orig_target = sample["target"] - - # compute scores for each model in the ensemble - avg_probs = None - avg_attn = None - for model in models: - model.eval() - decoder_out = model(**net_input) - attn = decoder_out[1] if len(decoder_out) > 1 else None - if type(attn) is dict: - attn = attn.get("attn", None) - - batched = batch_for_softmax(decoder_out, orig_target) - probs, idx = None, 0 - for bd, tgt, is_single in batched: - sample["target"] = tgt - curr_prob = model.get_normalized_probs( - bd, log_probs=len(models) == 1, sample=sample - ).data - if is_single: - probs = gather_target_probs(curr_prob, orig_target) - else: - if probs is None: - probs = curr_prob.new(orig_target.numel()) - step = curr_prob.size(0) * curr_prob.size(1) - end = step + idx - tgt_probs = gather_target_probs( - curr_prob.view(tgt.shape + (curr_prob.size(-1),)), tgt - ) - probs[idx:end] = tgt_probs.view(-1) - idx = end - sample["target"] = orig_target - - probs = probs.view(sample["target"].shape) - - if avg_probs is None: - avg_probs = probs - else: - avg_probs.add_(probs) - if attn is not None: - if torch.is_tensor(attn): - attn = attn.data - else: - attn = attn[0] - if avg_attn is None: - avg_attn = attn - else: - avg_attn.add_(attn) - if len(models) > 1: - avg_probs.div_(len(models)) - avg_probs.log_() - if avg_attn is not None: - avg_attn.div_(len(models)) - - bsz = avg_probs.size(0) - hypos = [] - start_idxs = sample["start_indices"] if "start_indices" in sample else [0] * bsz - for i in range(bsz): - # remove padding from ref - ref = ( - utils.strip_pad(sample["target"][i, start_idxs[i] :], self.pad) - if sample["target"] is not None - else None - ) - tgt_len = ref.numel() - avg_probs_i = avg_probs[i][start_idxs[i] : start_idxs[i] + tgt_len] - score_i = avg_probs_i.sum() / tgt_len - if avg_attn is not None: - avg_attn_i = avg_attn[i] - if self.compute_alignment: - alignment = utils.extract_hard_alignment( - avg_attn_i, - sample["net_input"]["src_tokens"][i], - sample["target"][i], - self.pad, - self.eos, - ) - else: - alignment = None - else: - avg_attn_i = alignment = None - hypos.append( - [ - { - "tokens": ref, - "score": score_i, - "attention": avg_attn_i, - "alignment": alignment, - "positional_scores": avg_probs_i, - } - ] - ) - return hypos diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/__init__.py deleted file mode 100644 index e0abce253c9311142c9864dc13dfc5043eca6c06..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/__init__.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
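The file opened above, `fairseq/tasks/__init__.py`, wires tasks together through a decorator-based registry: `register_task` stores each class under a name, and `setup_task` later dispatches on that name. The following is a deliberately simplified sketch of the pattern with toy names; the real module below also tracks dataclasses and guards against duplicate class names.

```python
from typing import Dict, Type

TASK_REGISTRY: Dict[str, Type] = {}

def register_task(name: str):
    def register_task_cls(cls):
        if name in TASK_REGISTRY:
            raise ValueError("Cannot register duplicate task ({})".format(name))
        TASK_REGISTRY[name] = cls
        return cls
    return register_task_cls

@register_task("toy_translation")
class ToyTranslationTask:
    @classmethod
    def setup_task(cls, args=None, **kwargs):
        # Real tasks load dictionaries and datasets here before construction.
        return cls()

def setup_task(task_name: str, args=None, **kwargs):
    # Look the class up by its registered name, then delegate construction to it.
    return TASK_REGISTRY[task_name].setup_task(args, **kwargs)

print(type(setup_task("toy_translation")).__name__)  # ToyTranslationTask
```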
-"""isort:skip_file""" - -import argparse -import importlib -import os -from argparse import Namespace -from typing import Union - -from fairseq.dataclass import FairseqDataclass -from omegaconf import DictConfig - -from .fairseq_task import FairseqTask, LegacyFairseqTask # noqa - - -# register dataclass -TASK_DATACLASS_REGISTRY = {} -TASK_REGISTRY = {} -TASK_CLASS_NAMES = set() - - -def setup_task(task_cfg: Union[DictConfig, Namespace], **kwargs): - if isinstance(task_cfg, DictConfig): - return TASK_REGISTRY[task_cfg._name].setup_task(task_cfg, **kwargs) - return TASK_REGISTRY[task_cfg.task].setup_task(task_cfg, **kwargs) - - -def register_task(name, dataclass=None): - """ - New tasks can be added to fairseq with the - :func:`~fairseq.tasks.register_task` function decorator. - - For example:: - - @register_task('classification') - class ClassificationTask(FairseqTask): - (...) - - .. note:: - - All Tasks must implement the :class:`~fairseq.tasks.FairseqTask` - interface. - - Args: - name (str): the name of the task - """ - - def register_task_cls(cls): - if name in TASK_REGISTRY: - raise ValueError("Cannot register duplicate task ({})".format(name)) - if not issubclass(cls, FairseqTask): - raise ValueError( - "Task ({}: {}) must extend FairseqTask".format(name, cls.__name__) - ) - if cls.__name__ in TASK_CLASS_NAMES: - raise ValueError( - "Cannot register task with duplicate class name ({})".format( - cls.__name__ - ) - ) - TASK_REGISTRY[name] = cls - TASK_CLASS_NAMES.add(cls.__name__) - - if dataclass is not None and not issubclass(dataclass, FairseqDataclass): - raise ValueError( - "Dataclass {} must extend FairseqDataclass".format(dataclass) - ) - - cls.__dataclass = dataclass - TASK_DATACLASS_REGISTRY[name] = dataclass - - return cls - - return register_task_cls - - -def get_task(name): - return TASK_REGISTRY[name] - - -# automatically import any Python files in the tasks/ directory -tasks_dir = os.path.dirname(__file__) -for file in os.listdir(tasks_dir): - path = os.path.join(tasks_dir, file) - if ( - not file.startswith("_") - and not file.startswith(".") - and (file.endswith(".py") or os.path.isdir(path)) - ): - task_name = file[: file.find(".py")] if file.endswith(".py") else file - module = importlib.import_module("fairseq.tasks." + task_name) - - # expose `task_parser` for sphinx - if task_name in TASK_REGISTRY: - parser = argparse.ArgumentParser(add_help=False) - group_task = parser.add_argument_group("Task name") - # fmt: off - group_task.add_argument('--task', metavar=task_name, - help='Enable this task with: ``--task=' + task_name + '``') - # fmt: on - group_args = parser.add_argument_group("Additional command-line arguments") - TASK_REGISTRY[task_name].add_args(group_args) - globals()[task_name + "_parser"] = parser diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/audio_pretraining.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/audio_pretraining.py deleted file mode 100644 index ff2342afa92b644cc2de97e79776844bda9b39e4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/audio_pretraining.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2017-present, Facebook, Inc. -# All rights reserved. -# -# This source code is licensed under the license found in the LICENSE file in -# the root directory of this source tree. 
An additional grant of patent rights -# can be found in the PATENTS file in the same directory. - -import os -import sys - -from fairseq.data import AddTargetDataset, Dictionary, FileAudioDataset - -from . import LegacyFairseqTask, register_task - - -class LabelEncoder(object): - def __init__(self, dictionary): - self.dictionary = dictionary - - def __call__(self, label): - return self.dictionary.encode_line( - label, append_eos=False, add_if_not_exist=False - ) - - -@register_task("audio_pretraining") -class AudioPretrainingTask(LegacyFairseqTask): - """""" - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("data", help="path to data directory") - parser.add_argument( - "--sample-rate", - default=16000, - type=int, - help="target sample rate. audio files will be up/down sampled to this rate", - ) - parser.add_argument( - "--normalize", - action="store_true", - help="if set, normalizes input to have 0 mean and unit variance", - ) - parser.add_argument( - "--max-sample-size", - default=None, - type=int, - help="max sample size to crop to for batching. default = min sample length", - ) - parser.add_argument( - "--min-sample-size", - default=None, - type=int, - help="min sample size to crop to for batching. default = same as --max-sample-size", - ) - - parser.add_argument( - "--enable-padding", - action="store_true", - help="pad shorter samples instead of cropping", - ) - - parser.add_argument( - "--labels", - type=str, - default=None, - help="extension of the label file to load, if any", - ) - - def __init__(self, args, source_dictionary=None): - super().__init__(args) - self._target_dictionary = None - self._source_dictionary = source_dictionary - self.is_ctc = args.criterion == "ctc" - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task (e.g., load dictionaries). - - Args: - args (argparse.Namespace): parsed command-line arguments - """ - return cls(args) - - def load_dataset(self, split, **kwargs): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - manifest = os.path.join(self.args.data, "{}.tsv".format(split)) - self.datasets[split] = FileAudioDataset( - manifest, - sample_rate=self.args.sample_rate, - max_sample_size=self.args.max_sample_size, - min_sample_size=self.args.max_sample_size, - min_length=self.args.min_sample_size, - pad=self.args.labels is not None or self.args.enable_padding, - normalize=self.args.normalize, - ) - - if self.args.labels: - dict_path = os.path.join(self.args.data, f"dict.{self.args.labels}.txt") - self._target_dictionary = Dictionary.load(dict_path) - label_path = os.path.join(self.args.data, f"{split}.{self.args.labels}") - labels = [] - with open(label_path, "r") as f: - for line in f: - labels.append(line) - - process_label = LabelEncoder(self.target_dictionary) - - self.datasets[split] = AddTargetDataset( - self.datasets[split], - labels, - pad=self.target_dictionary.pad(), - eos=self.target_dictionary.eos(), - batch_targets=True, - process_label=process_label, - add_to_input=not self.is_ctc, - ) - - @property - def source_dictionary(self): - return self._source_dictionary - - @property - def target_dictionary(self): - """Return the :class:`~fairseq.data.Dictionary` for the language - model.""" - return self._target_dictionary - - def max_positions(self): - """Maximum input length supported by the encoder.""" - return (sys.maxsize, sys.maxsize) - - def filter_indices_by_size( - self, - indices, - dataset, - max_positions=None, - ignore_invalid_inputs=False, - ): - # we do not need to filter by size in this task as dataloaders take care of this - return indices diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/cross_lingual_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/cross_lingual_lm.py deleted file mode 100644 index 8f8fe7e2de181e41bd0e6a2bf96948ee78de5ae8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/cross_lingual_lm.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import itertools -import logging -import os -from collections import OrderedDict - -import numpy as np -from fairseq import tokenizer, utils -from fairseq.data import ConcatDataset, Dictionary, TokenBlockDataset, data_utils -from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset -from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary -from fairseq.data.multi_corpus_sampled_dataset import MultiCorpusSampledDataset -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("cross_lingual_lm") -class CrossLingualLMTask(LegacyFairseqTask): - """ - Task for training cross-lingual language models. 
- - For more details look at: https://arxiv.org/pdf/1901.07291.pdf - - Args: - dictionary (Dictionary): the dictionary for the input of the task - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument( - "data", - help="colon separated path to data directories list, \ - will be iterated upon during epochs in round-robin manner", - ) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments" " per sample", - ) - parser.add_argument( - "--monolingual-langs", - default="en", - type=str, - help="comma separated list of languages for which we" - " want to train XLM on", - ) - parser.add_argument( - "--shuffle", - action="store_true", - help="shuffle each monolingual dataset while" " training", - ) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed - self.distributed_world_size = args.distributed_world_size - self.langs2id = self._lang_to_id(args.monolingual_langs) - - def _lang_to_id(self, languages: str): - """ - Build a map from languages to ids. These ids are used as segment labels - for cross-lingual LM training. - """ - lang2id = {} - langs = [l.strip() for l in languages.split(",")] - for id, lang in enumerate(langs): - lang2id[lang] = id - return lang2id - - @classmethod - def load_dictionary(cls, filename): - return MaskedLMDictionary.load(filename) - - @classmethod - def build_dictionary( - cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8 - ): - d = MaskedLMDictionary() - for filename in filenames: - Dictionary.add_file_to_dictionary( - filename, d, tokenizer.tokenize_line, workers - ) - d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) - return d - - @property - def target_dictionary(self): - return self.dictionary - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task.""" - dictionary = MaskedLMDictionary.load(os.path.join(args.data, "dict.txt")) - logger.info("dictionary: {} types".format(len(dictionary))) - return cls(args, dictionary) - - def _load_single_lang_dataset(self, split, epoch): - loaded_datasets = [] - - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - - for k in itertools.count(): - split_k = split + (str(k) if k > 0 else "") - path = os.path.join(data_path, split_k) - - ds = data_utils.load_indexed_dataset( - path, self.dictionary, self.args.dataset_impl - ) - if ds is None: - if k > 0: - break - else: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, data_path) - ) - - # Since we append each block with the classification_token, - # we need to effectively create blocks of length - # tokens_per_sample-1 - loaded_datasets.append( - TokenBlockDataset( - ds, - ds.sizes, - self.args.tokens_per_sample - 1, - pad=self.dictionary.pad(), - eos=self.dictionary.eos(), - ) - ) - - logger.info( - "{} {} {} examples".format(data_path, split_k, len(loaded_datasets[-1])) - ) - - if len(loaded_datasets) == 1: - dataset = loaded_datasets[0] - sizes = dataset.sizes - else: - dataset = ConcatDataset(loaded_datasets) - sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) - - return dataset, sizes - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - dataset_map = OrderedDict() - - for lang in self.langs2id.keys(): - # Datasets are expected to be in "split.lang" format (Eg: train.en) - language_split = "{}.{}".format(split, lang) - - block_dataset, sizes = self._load_single_lang_dataset( - split=language_split, epoch=epoch - ) - - dataset_map[lang] = MaskedLMDataset( - dataset=block_dataset, - sizes=sizes, - vocab=self.dictionary, - pad_idx=self.dictionary.pad(), - mask_idx=self.dictionary.mask(), - classif_token_idx=self.dictionary.eos(), - sep_token_idx=self.dictionary.eos(), - shuffle=getattr(self.args, "shuffle", False), - has_pairs=False, - segment_id=self.langs2id[lang], - seed=self.seed, - ) - - self.datasets[split] = MultiCorpusSampledDataset(dataset_map) - logger.info( - "{} {} {} examples".format( - utils.split_paths(self.args.data)[epoch - 1], - split, - len(self.datasets[split]), - ) - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/denoising.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/denoising.py deleted file mode 100644 index 3e88bf0ed0468ab2821304280f99e94123c41cac..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/denoising.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import os - -from fairseq import utils -from fairseq.data import ( - AppendTokenDataset, - DenoisingDataset, - Dictionary, - PrependTokenDataset, - StripTokenDataset, - TokenBlockDataset, - data_utils, -) -from fairseq.data.encoders.utils import get_whole_word_mask -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("denoising") -class DenoisingTask(LegacyFairseqTask): - """ - Denoising task for applying sequence to sequence denoising. (ie. 
BART) - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("data", help="path to data directory") - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments" - " per sample for dataset", - ) - parser.add_argument( - "--sample-break-mode", - default="complete_doc", - type=str, - help="mode for breaking sentence", - ) - parser.add_argument( - "--mask", - default=0.0, - type=float, - help="fraction of words/subwords that will be masked", - ) - parser.add_argument( - "--mask-random", - default=0.0, - type=float, - help="instead of using [MASK], use random token this often", - ) - parser.add_argument( - "--insert", - default=0.0, - type=float, - help="insert this percentage of additional random tokens", - ) - parser.add_argument( - "--permute", - default=0.0, - type=float, - help="take this proportion of subwords and permute them", - ) - parser.add_argument( - "--rotate", - default=0.5, - type=float, - help="rotate this proportion of inputs", - ) - parser.add_argument( - "--poisson-lambda", - default=3.0, - type=float, - help="randomly shuffle sentences for this proportion of inputs", - ) - parser.add_argument( - "--permute-sentences", - default=0.0, - type=float, - help="shuffle this proportion of sentences in all inputs", - ) - parser.add_argument( - "--mask-length", - default="subword", - type=str, - choices=["subword", "word", "span-poisson"], - help="mask length to choose", - ) - parser.add_argument( - "--replace-length", - default=-1, - type=int, - help="when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)", - ) - parser.add_argument( - "--max-source-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the source sequence", - ) - parser.add_argument( - "--max-target-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the target sequence", - ) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed - - # add mask token - self.mask_idx = self.dictionary.add_symbol("") - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task.""" - dictionary = Dictionary.load(os.path.join(args.data, "dict.txt")) - logger.info("dictionary: {} types".format(len(dictionary))) - if not hasattr(args, "shuffle_instance"): - args.shuffle_instance = False - return cls(args, dictionary) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. - - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - split_path = os.path.join(data_path, split) - - dataset = data_utils.load_indexed_dataset( - split_path, - self.dictionary, - self.args.dataset_impl, - combine=combine, - ) - if dataset is None: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, split_path) - ) - - dataset = StripTokenDataset(dataset, self.dictionary.eos()) - - # create continuous blocks of tokens - dataset = TokenBlockDataset( - dataset, - dataset.sizes, - self.args.tokens_per_sample - 2, # one less for and one for - pad=self.dictionary.pad(), - eos=self.dictionary.eos(), - break_mode=self.args.sample_break_mode, - document_sep_len=0, - ) - - # prepend beginning-of-sentence token (, equiv. 
to [CLS] in BERT) - dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) - dataset = AppendTokenDataset(dataset, self.source_dictionary.eos()) - - mask_whole_words = ( - get_whole_word_mask(self.args, self.source_dictionary) - if self.args.mask_length != "subword" - else None - ) - - self.datasets[split] = DenoisingDataset( - dataset, - dataset.sizes, - self.dictionary, - self.mask_idx, - mask_whole_words, - shuffle=self.args.shuffle_instance, - seed=self.seed, - args=self.args, - ) - logger.info( - "Split: {0}, Loaded {1} samples of denoising_dataset".format( - split, - len(self.datasets[split]), - ) - ) - - def max_positions(self): - """Return the max sentence length allowed by the task.""" - return (self.args.max_source_positions, self.args.max_target_positions) - - @property - def source_dictionary(self): - """Return the source :class:`~fairseq.data.Dictionary`.""" - return self.dictionary - - @property - def target_dictionary(self): - """Return the target :class:`~fairseq.data.Dictionary`.""" - return self.dictionary diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/fairseq_task.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/fairseq_task.py deleted file mode 100644 index 732195da51ad4abd37659bec7436b034272d7e3d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/fairseq_task.py +++ /dev/null @@ -1,564 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import os -import warnings -from argparse import Namespace - -import torch -from fairseq import metrics, search, tokenizer, utils -from fairseq.data import Dictionary, FairseqDataset, data_utils, encoders, iterators -from fairseq.dataclass.utils import gen_parser_from_dataclass - - -logger = logging.getLogger(__name__) - - -class FairseqTask(object): - """ - Tasks store dictionaries and provide helpers for loading/iterating over - Datasets, initializing the Model/Criterion and calculating the loss. - """ - - @classmethod - def add_args(cls, parser): - """Add task-specific arguments to the parser.""" - dc = getattr(cls, "__dataclass", None) - if dc is not None: - gen_parser_from_dataclass(parser, dc()) - - @staticmethod - def logging_outputs_can_be_summed(criterion) -> bool: - """ - Whether the logging outputs returned by `train_step` and `valid_step` can - be summed across workers prior to calling `aggregate_logging_outputs`. - Setting this to True will improves distributed training speed. 
- """ - return criterion.logging_outputs_can_be_summed() - - def __init__(self, args): - self.args = args - self.datasets = {} - self.dataset_to_epoch_iter = {} - - @classmethod - def load_dictionary(cls, filename): - """Load the dictionary from the filename - - Args: - filename (str): the filename - """ - return Dictionary.load(filename) - - @classmethod - def build_dictionary( - cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8 - ): - """Build the dictionary - - Args: - filenames (list): list of filenames - workers (int): number of concurrent workers - threshold (int): defines the minimum word count - nwords (int): defines the total number of words in the final dictionary, - including special symbols - padding_factor (int): can be used to pad the dictionary size to be a - multiple of 8, which is important on some hardware (e.g., Nvidia - Tensor Cores). - """ - d = Dictionary() - for filename in filenames: - Dictionary.add_file_to_dictionary( - filename, d, tokenizer.tokenize_line, workers - ) - d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) - return d - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task (e.g., load dictionaries). - - Args: - args (argparse.Namespace): parsed command-line arguments - """ - return cls(args, **kwargs) - - def has_sharded_data(self, split): - return os.pathsep in getattr(self.args, "data", "") - - def load_dataset(self, split, combine=False, **kwargs): - """Load a given dataset split. - - Args: - split (str): name of the split (e.g., train, valid, test) - """ - raise NotImplementedError - - def dataset(self, split): - """ - Return a loaded dataset split. - - Args: - split (str): name of the split (e.g., train, valid, test) - - Returns: - a :class:`~fairseq.data.FairseqDataset` corresponding to *split* - """ - from fairseq.data import FairseqDataset - - if split not in self.datasets: - raise KeyError("Dataset not loaded: " + split) - if not isinstance(self.datasets[split], FairseqDataset): - raise TypeError("Datasets are expected to be of type FairseqDataset") - return self.datasets[split] - - def filter_indices_by_size( - self, indices, dataset, max_positions=None, ignore_invalid_inputs=False - ): - """ - Filter examples that are too large - - Args: - indices (np.array): original array of sample indices - dataset (~fairseq.data.FairseqDataset): dataset to batch - max_positions (optional): max sentence length supported by the - model (default: None). - ignore_invalid_inputs (bool, optional): don't raise Exception for - sentences that are too long (default: False). - Returns: - np.array: array of filtered sample indices - """ - indices, ignored = dataset.filter_indices_by_size(indices, max_positions) - if len(ignored) > 0: - if not ignore_invalid_inputs: - raise Exception( - ( - "Size of sample #{} is invalid (={}) since max_positions={}, " - "skip this example with --skip-invalid-size-inputs-valid-test" - ).format(ignored[0], dataset.size(ignored[0]), max_positions) - ) - logger.warning( - ( - "{} samples have invalid sizes and will be skipped, " - "max_positions={}, first few sample ids={}" - ).format(len(ignored), max_positions, ignored[:10]) - ) - return indices - - def can_reuse_epoch_itr(self, dataset): - # We can reuse the epoch iterator across epochs as long as the dataset - # hasn't disabled it. We default to ``False`` here, although in practice - # this will be ``True`` for most datasets that inherit from - # ``FairseqDataset`` due to the base implementation there. 
- return getattr(dataset, "can_reuse_epoch_itr_across_epochs", False) - - def get_batch_iterator( - self, - dataset, - max_tokens=None, - max_sentences=None, - max_positions=None, - ignore_invalid_inputs=False, - required_batch_size_multiple=1, - seed=1, - num_shards=1, - shard_id=0, - num_workers=0, - epoch=1, - data_buffer_size=0, - disable_iterator_cache=False, - ): - """ - Get an iterator that yields batches of data from the given dataset. - - Args: - dataset (~fairseq.data.FairseqDataset): dataset to batch - max_tokens (int, optional): max number of tokens in each batch - (default: None). - max_sentences (int, optional): max number of sentences in each - batch (default: None). - max_positions (optional): max sentence length supported by the - model (default: None). - ignore_invalid_inputs (bool, optional): don't raise Exception for - sentences that are too long (default: False). - required_batch_size_multiple (int, optional): require batch size to - be a multiple of N (default: 1). - seed (int, optional): seed for random number generator for - reproducibility (default: 1). - num_shards (int, optional): shard the data iterator into N - shards (default: 1). - shard_id (int, optional): which shard of the data iterator to - return (default: 0). - num_workers (int, optional): how many subprocesses to use for data - loading. 0 means the data will be loaded in the main process - (default: 0). - epoch (int, optional): the epoch to start the iterator from - (default: 1). - data_buffer_size (int, optional): number of batches to - preload (default: 0). - disable_iterator_cache (bool, optional): don't cache the - EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`) - (default: False). - Returns: - ~fairseq.iterators.EpochBatchIterator: a batched iterator over the - given dataset split - """ - can_reuse_epoch_itr = not disable_iterator_cache and self.can_reuse_epoch_itr( - dataset - ) - if can_reuse_epoch_itr and dataset in self.dataset_to_epoch_iter: - logger.debug("reusing EpochBatchIterator for epoch {}".format(epoch)) - return self.dataset_to_epoch_iter[dataset] - - assert isinstance(dataset, FairseqDataset) - - # initialize the dataset with the correct starting epoch - dataset.set_epoch(epoch) - - # get indices ordered by example size - with data_utils.numpy_seed(seed): - indices = dataset.ordered_indices() - - # filter examples that are too large - if max_positions is not None: - indices = self.filter_indices_by_size( - indices, dataset, max_positions, ignore_invalid_inputs - ) - - # create mini-batches with given size constraints - batch_sampler = dataset.batch_by_size( - indices, - max_tokens=max_tokens, - max_sentences=max_sentences, - required_batch_size_multiple=required_batch_size_multiple, - ) - - # return a reusable, sharded iterator - epoch_iter = iterators.EpochBatchIterator( - dataset=dataset, - collate_fn=dataset.collater, - batch_sampler=batch_sampler, - seed=seed, - num_shards=num_shards, - shard_id=shard_id, - num_workers=num_workers, - epoch=epoch, - buffer_size=data_buffer_size, - ) - - if can_reuse_epoch_itr: - self.dataset_to_epoch_iter[dataset] = epoch_iter - - return epoch_iter - - def build_model(self, args): - """ - Build the :class:`~fairseq.models.BaseFairseqModel` instance for this - task. 
- - Args: - args (argparse.Namespace): parsed command-line arguments - - Returns: - a :class:`~fairseq.models.BaseFairseqModel` instance - """ - from fairseq import models, quantization_utils - - model = models.build_model(args, self) - if getattr(args, "tpu", False): - model.prepare_for_tpu_() - model = quantization_utils.quantize_model_scalar(model, args) - return model - - def build_criterion(self, args): - """ - Build the :class:`~fairseq.criterions.FairseqCriterion` instance for - this task. - - Args: - args (argparse.Namespace): parsed command-line arguments - - Returns: - a :class:`~fairseq.criterions.FairseqCriterion` instance - """ - from fairseq import criterions - - return criterions.build_criterion(args, self) - - def build_generator( - self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None - ): - if getattr(args, "score_reference", False): - from fairseq.sequence_scorer import SequenceScorer - - return SequenceScorer( - self.target_dictionary, - compute_alignment=getattr(args, "print_alignment", False), - ) - - from fairseq.sequence_generator import ( - SequenceGenerator, - SequenceGeneratorWithAlignment, - ) - - # Choose search strategy. Defaults to Beam Search. - sampling = getattr(args, "sampling", False) - sampling_topk = getattr(args, "sampling_topk", -1) - sampling_topp = getattr(args, "sampling_topp", -1.0) - diverse_beam_groups = getattr(args, "diverse_beam_groups", -1) - diverse_beam_strength = getattr(args, "diverse_beam_strength", 0.5) - match_source_len = getattr(args, "match_source_len", False) - diversity_rate = getattr(args, "diversity_rate", -1) - constrained = getattr(args, "constraints", False) - prefix_allowed_tokens_fn = getattr(args, "prefix_allowed_tokens_fn", None) - if ( - sum( - int(cond) - for cond in [ - sampling, - diverse_beam_groups > 0, - match_source_len, - diversity_rate > 0, - ] - ) - > 1 - ): - raise ValueError("Provided Search parameters are mutually exclusive.") - assert sampling_topk < 0 or sampling, "--sampling-topk requires --sampling" - assert sampling_topp < 0 or sampling, "--sampling-topp requires --sampling" - - if sampling: - search_strategy = search.Sampling( - self.target_dictionary, sampling_topk, sampling_topp - ) - elif diverse_beam_groups > 0: - search_strategy = search.DiverseBeamSearch( - self.target_dictionary, diverse_beam_groups, diverse_beam_strength - ) - elif match_source_len: - # this is useful for tagging applications where the output - # length should match the input length, so we hardcode the - # length constraints for simplicity - search_strategy = search.LengthConstrainedBeamSearch( - self.target_dictionary, - min_len_a=1, - min_len_b=0, - max_len_a=1, - max_len_b=0, - ) - elif diversity_rate > -1: - search_strategy = search.DiverseSiblingsSearch( - self.target_dictionary, diversity_rate - ) - elif constrained: - search_strategy = search.LexicallyConstrainedBeamSearch( - self.target_dictionary, args.constraints - ) - elif prefix_allowed_tokens_fn: - search_strategy = search.PrefixConstrainedBeamSearch( - self.target_dictionary, prefix_allowed_tokens_fn - ) - else: - search_strategy = search.BeamSearch(self.target_dictionary) - - if seq_gen_cls is None: - if getattr(args, "print_alignment", False): - seq_gen_cls = SequenceGeneratorWithAlignment - else: - seq_gen_cls = SequenceGenerator - extra_gen_cls_kwargs = extra_gen_cls_kwargs or {} - return seq_gen_cls( - models, - self.target_dictionary, - beam_size=getattr(args, "beam", 5), - max_len_a=getattr(args, "max_len_a", 0), - max_len_b=getattr(args, 
"max_len_b", 200), - min_len=getattr(args, "min_len", 1), - normalize_scores=(not getattr(args, "unnormalized", False)), - len_penalty=getattr(args, "lenpen", 1), - unk_penalty=getattr(args, "unkpen", 0), - temperature=getattr(args, "temperature", 1.0), - match_source_len=getattr(args, "match_source_len", False), - no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0), - search_strategy=search_strategy, - **extra_gen_cls_kwargs, - ) - - def train_step( - self, sample, model, criterion, optimizer, update_num, ignore_grad=False - ): - """ - Do forward and backward, and return the loss as computed by *criterion* - for the given *model* and *sample*. - - Args: - sample (dict): the mini-batch. The format is defined by the - :class:`~fairseq.data.FairseqDataset`. - model (~fairseq.models.BaseFairseqModel): the model - criterion (~fairseq.criterions.FairseqCriterion): the criterion - optimizer (~fairseq.optim.FairseqOptimizer): the optimizer - update_num (int): the current update - ignore_grad (bool): multiply loss by 0 if this is set to True - - Returns: - tuple: - - the loss - - the sample size, which is used as the denominator for the - gradient - - logging outputs to display while training - """ - model.train() - model.set_num_updates(update_num) - loss, sample_size, logging_output = criterion(model, sample) - if ignore_grad: - loss *= 0 - optimizer.backward(loss) - return loss, sample_size, logging_output - - def valid_step(self, sample, model, criterion): - model.eval() - with torch.no_grad(): - loss, sample_size, logging_output = criterion(model, sample) - return loss, sample_size, logging_output - - def inference_step( - self, generator, models, sample, prefix_tokens=None, constraints=None - ): - with torch.no_grad(): - return generator.generate( - models, sample, prefix_tokens=prefix_tokens, constraints=constraints - ) - - def begin_epoch(self, epoch, model): - """Hook function called before the start of each epoch.""" - pass - - def begin_valid_epoch(self, epoch, model): - """Hook function called before the start of each validation epoch.""" - pass - - def aggregate_logging_outputs(self, logging_outputs, criterion): - """[deprecated] Aggregate logging outputs from data parallel training.""" - utils.deprecation_warning( - "The aggregate_logging_outputs API is deprecated. " - "Please use the reduce_metrics API instead." - ) - with metrics.aggregate() as agg: - self.reduce_metrics(logging_outputs, criterion) - return agg.get_smoothed_values() - - def reduce_metrics(self, logging_outputs, criterion): - """Aggregate logging outputs from data parallel training.""" - # backward compatibility for tasks that override aggregate_logging_outputs - base_func = FairseqTask.aggregate_logging_outputs - self_func = getattr(self, "aggregate_logging_outputs").__func__ - if self_func is not base_func: - utils.deprecation_warning( - "Tasks should implement the reduce_metrics API. " - "Falling back to deprecated aggregate_logging_outputs API." 
- ) - agg_logging_outputs = self.aggregate_logging_outputs( - logging_outputs, criterion - ) - for k, v in agg_logging_outputs.items(): - metrics.log_scalar(k, v) - return - - if not any("ntokens" in log for log in logging_outputs): - warnings.warn( - "ntokens not found in Criterion logging outputs, cannot log wpb or wps" - ) - else: - ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) - metrics.log_scalar("wpb", ntokens, priority=180, round=1) - metrics.log_speed("wps", ntokens, priority=90, round=1) - - if not any("nsentences" in log for log in logging_outputs): - warnings.warn( - "nsentences not found in Criterion logging outputs, cannot log bsz" - ) - else: - nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) - metrics.log_scalar("bsz", nsentences, priority=190, round=1) - - criterion.__class__.reduce_metrics(logging_outputs) - - def max_positions(self): - """Return the max input length allowed by the task.""" - return None - - @property - def source_dictionary(self): - """Return the source :class:`~fairseq.data.Dictionary` (if applicable - for this task).""" - raise NotImplementedError - - @property - def target_dictionary(self): - """Return the target :class:`~fairseq.data.Dictionary` (if applicable - for this task).""" - raise NotImplementedError - - def build_tokenizer(self, args): - """Build the pre-tokenizer for this task.""" - return encoders.build_tokenizer(args) - - def build_bpe(self, args): - """Build the tokenizer for this task.""" - return encoders.build_bpe(args) - - -class LegacyFairseqTask(FairseqTask): - def __init__(self, args: Namespace): - self.args = args - self.datasets = {} - self.dataset_to_epoch_iter = {} - - @classmethod - def setup_task(cls, args: Namespace, **kwargs): - """Setup the task (e.g., load dictionaries). - - Args: - args (argparse.Namespace): parsed command-line arguments - """ - return cls(args, **kwargs) - - def has_sharded_data(self, split): - return os.pathsep in getattr(self.args, "data", "") - - def build_model(self, args: Namespace): - """ - Build the :class:`~fairseq.models.BaseFairseqModel` instance for this - task. - - Args: - args (argparse.Namespace): parsed command-line arguments - - Returns: - a :class:`~fairseq.models.BaseFairseqModel` instance - """ - from fairseq import models, quantization_utils - - model = models.build_model(args, self) - if getattr(args, "tpu", False): - model.prepare_for_tpu_() - model = quantization_utils.quantize_model_scalar(model, args) - return model - - def build_criterion(self, args: Namespace): - """ - Build the :class:`~fairseq.criterions.FairseqCriterion` instance for - this task. - - Args: - args (argparse.Namespace): parsed command-line arguments - - Returns: - a :class:`~fairseq.criterions.FairseqCriterion` instance - """ - from fairseq import criterions - - return criterions.build_criterion(args, self) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/language_modeling.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/language_modeling.py deleted file mode 100644 index 8792c6481ce190a152e78bbf36f3d3d0165a5920..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/language_modeling.py +++ /dev/null @@ -1,327 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
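`reduce_metrics` in `fairseq_task.py` above aggregates the per-worker logging outputs by summing token and sentence counts before the criterion reduces its own statistics; `wpb` and `bsz` are just those sums logged as scalars. A toy illustration of that reduction with plain dictionaries (no `metrics` module):

```python
# Toy logging outputs collected from three data-parallel workers.
logging_outputs = [
    {"loss": 2.1, "ntokens": 4096, "nsentences": 128},
    {"loss": 2.3, "ntokens": 3968, "nsentences": 120},
    {"loss": 2.0, "ntokens": 4160, "nsentences": 132},
]

# Same reduction as reduce_metrics: sum counts across workers, then log them
# (wpb = tokens per batch, bsz = sentences per batch).
ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
print({"wpb": ntokens, "bsz": nsentences})  # {'wpb': 12224, 'bsz': 380}
```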
- -import logging -import os -from dataclasses import dataclass, field -from typing import Optional - -import numpy as np -import torch -from fairseq import utils -from fairseq.data import ( - AppendTokenDataset, - Dictionary, - IdDataset, - MonolingualDataset, - NestedDictionaryDataset, - NumelDataset, - PadDataset, - PrependTokenDataset, - StripTokenDataset, - TokenBlockDataset, - TruncatedDictionary, - data_utils, -) -from fairseq.data.indexed_dataset import get_available_dataset_impl -from fairseq.data.shorten_dataset import maybe_shorten_dataset -from fairseq.dataclass import ChoiceEnum, FairseqDataclass -from fairseq.tasks import FairseqTask, register_task -from omegaconf import II - - -SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"]) -SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) -logger = logging.getLogger(__name__) - - -@dataclass -class LanguageModelingConfig(FairseqDataclass): - # TODO common var add to parent - data: Optional[str] = field( - default=None, metadata={"help": "path to data directory"} - ) - sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( - default="none", - metadata={ - "help": 'If omitted or "none", fills each sample with tokens-per-sample ' - 'tokens. If set to "complete", splits samples only at the end ' - "of sentence, but may include multiple sentences per sample. " - '"complete_doc" is similar but respects doc boundaries. ' - 'If set to "eos", includes only one sentence per sample.' - }, - ) - tokens_per_sample: int = field( - default=1024, - metadata={"help": "max number of tokens per sample for LM dataset"}, - ) - output_dictionary_size: int = field( - default=-1, metadata={"help": "limit the size of output dictionary"} - ) - self_target: bool = field(default=False, metadata={"help": "include self target"}) - future_target: bool = field( - default=False, metadata={"help": "include future target"} - ) - past_target: bool = field(default=False, metadata={"help": "include past target"}) - add_bos_token: bool = field( - default=False, metadata={"help": "prepend beginning of sentence token ()"} - ) - max_target_positions: Optional[int] = field( - default=None, metadata={"help": "max number of tokens in the target sequence"} - ) - shorten_method: SHORTEN_METHOD_CHOICES = field( - default="none", - metadata={ - "help": "if not none, shorten sequences that exceed --tokens-per-sample" - }, - ) - shorten_data_split_list: str = field( - default="", - metadata={ - "help": "comma-separated list of dataset splits to apply shortening to, " - 'e.g., "train,valid" (default: all dataset splits)' - }, - ) - # TODO common vars below add to parent - seed: int = II("params.common.seed") - dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II( - "params.dataset.dataset_impl" - ) - data_buffer_size: int = II("params.dataset.data_buffer_size") - tpu: bool = II("params.common.tpu") - - -@register_task("language_modeling", dataclass=LanguageModelingConfig) -class LanguageModelingTask(FairseqTask): - """ - Train a language model. - - Args: - dictionary (~fairseq.data.Dictionary): the dictionary for the input of - the language model - output_dictionary (~fairseq.data.Dictionary): the dictionary for the - output of the language model. In most cases it will be the same as - *dictionary*, but could possibly be a more limited version of the - dictionary (if ``--output-dictionary-size`` is used). - targets (List[str]): list of the target types that the language model - should predict. 
Can be one of "self", "future", and "past". - Defaults to "future". - - .. note:: - - The language modeling task is compatible with :mod:`fairseq-train`, - :mod:`fairseq-generate`, :mod:`fairseq-interactive` and - :mod:`fairseq-eval-lm`. - - The language modeling task provides the following additional command-line - arguments: - - .. argparse:: - :ref: fairseq.tasks.language_modeling_parser - :prog: - """ - - def __init__(self, args, dictionary, output_dictionary=None, targets=None): - super().__init__(args) - self.dictionary = dictionary - self.output_dictionary = output_dictionary or dictionary - - if targets is None: - targets = ["future"] - self.targets = targets - - @classmethod - def setup_dictionary(cls, args, **kwargs): - dictionary = None - output_dictionary = None - if args.data: - paths = utils.split_paths(args.data) - assert len(paths) > 0 - dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) - logger.info("dictionary: {} types".format(len(dictionary))) - output_dictionary = dictionary - if args.output_dictionary_size >= 0: - output_dictionary = TruncatedDictionary( - dictionary, args.output_dictionary_size - ) - return (dictionary, output_dictionary) - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task (e.g., load dictionaries). - - Args: - args (argparse.Namespace): parsed command-line arguments - """ - dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs) - - # upgrade old checkpoints - if hasattr(args, "exclude_self_target"): - args.self_target = not args.exclude_self_target - - targets = [] - if getattr(args, "self_target", False): - targets.append("self") - if getattr(args, "future_target", False): - targets.append("future") - if getattr(args, "past_target", False): - targets.append("past") - if len(targets) == 0: - # standard language modeling - targets = ["future"] - - return cls(args, dictionary, output_dictionary, targets=targets) - - def build_model(self, args): - model = super().build_model(args) - - for target in self.targets: - if target not in model.supported_targets: - raise ValueError( - "Unsupported language modeling target: {}".format(target) - ) - - return model - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - - data_path = paths[(epoch - 1) % len(paths)] - split_path = os.path.join(data_path, split) - - dataset = data_utils.load_indexed_dataset( - split_path, self.dictionary, self.args.dataset_impl, combine=combine - ) - if dataset is None: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, split_path) - ) - - dataset = maybe_shorten_dataset( - dataset, - split, - self.args.shorten_data_split_list, - self.args.shorten_method, - self.args.tokens_per_sample, - self.args.seed, - ) - - dataset = TokenBlockDataset( - dataset, - dataset.sizes, - self.args.tokens_per_sample, - pad=self.dictionary.pad(), - eos=self.dictionary.eos(), - break_mode=self.args.sample_break_mode, - include_targets=True, - ) - - add_eos_for_other_targets = ( - self.args.sample_break_mode is not None - and self.args.sample_break_mode != "none" - ) - - self.datasets[split] = self._initialize_dataset( - dataset=dataset, - sizes=dataset.sizes, - src_vocab=self.dictionary, - tgt_vocab=self.output_dictionary, - add_eos_for_other_targets=add_eos_for_other_targets, - shuffle=True, - targets=self.targets, - add_bos_token=self.args.add_bos_token, - ) - - def _initialize_dataset(self, **kwargs): - return MonolingualDataset(**kwargs) - - def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): - """ - Generate batches for inference. We prepend an eos token to src_tokens - (or bos if `--add-bos-token` is set) and we append a to target. - This is convenient both for generation with a prefix and LM scoring. - """ - dataset = StripTokenDataset( - TokenBlockDataset( - src_tokens, - src_lengths, - block_size=None, # ignored for "eos" break mode - pad=self.source_dictionary.pad(), - eos=self.source_dictionary.eos(), - break_mode="eos", - ), - # remove eos from (end of) target sequence - self.source_dictionary.eos(), - ) - src_dataset = PrependTokenDataset( - dataset, - token=( - self.source_dictionary.bos() - if getattr(self.args, "add_bos_token", False) - else self.source_dictionary.eos() - ), - ) - tgt_dataset = AppendTokenDataset(dataset, token=self.source_dictionary.pad()) - return NestedDictionaryDataset( - { - "id": IdDataset(), - "net_input": { - "src_tokens": PadDataset( - src_dataset, - pad_idx=self.source_dictionary.pad(), - left_pad=False, - ), - "src_lengths": NumelDataset(src_dataset, reduce=False), - }, - "target": PadDataset( - tgt_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False - ), - }, - sizes=[np.array(src_lengths)], - ) - - def inference_step( - self, generator, models, sample, prefix_tokens=None, constraints=None - ): - with torch.no_grad(): - # Generation will always be conditioned on bos_token - if getattr(self.args, "add_bos_token", False): - bos_token = self.source_dictionary.bos() - else: - bos_token = self.source_dictionary.eos() - - if constraints is not None: - raise NotImplementedError( - "Constrained decoding with the language_modeling task is not supported" - ) - - # SequenceGenerator doesn't use src_tokens directly, we need to - # pass the `prefix_tokens` argument instead - if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement(): - prefix_tokens = sample["net_input"]["src_tokens"] - if prefix_tokens[:, 0].eq(bos_token).all(): - prefix_tokens = prefix_tokens[:, 1:] - - return generator.generate( - models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token - ) - - @property - def 
source_dictionary(self): - """Return the :class:`~fairseq.data.Dictionary` for the language - model.""" - return self.dictionary - - @property - def target_dictionary(self): - """Return the :class:`~fairseq.data.Dictionary` for the language - model.""" - return self.output_dictionary diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/legacy_masked_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/legacy_masked_lm.py deleted file mode 100644 index 975497654926b64fff6c4960f54c4e6932e7fce1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/legacy_masked_lm.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import itertools -import logging -import os - -import numpy as np -from fairseq import tokenizer, utils -from fairseq.data import ConcatDataset, Dictionary, data_utils, indexed_dataset -from fairseq.data.legacy.block_pair_dataset import BlockPairDataset -from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset -from fairseq.data.legacy.masked_lm_dictionary import BertDictionary -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("legacy_masked_lm") -class LegacyMaskedLMTask(LegacyFairseqTask): - """ - Task for training Masked LM (BERT) model. - Args: - dictionary (Dictionary): the dictionary for the input of the task - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument( - "data", - help="colon separated path to data directories list, \ - will be iterated upon during epochs in round-robin manner", - ) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments" - " per sample for BERT dataset", - ) - parser.add_argument( - "--break-mode", default="doc", type=str, help="mode for breaking sentence" - ) - parser.add_argument("--shuffle-dataset", action="store_true", default=False) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed - - @classmethod - def load_dictionary(cls, filename): - return BertDictionary.load(filename) - - @classmethod - def build_dictionary( - cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8 - ): - d = BertDictionary() - for filename in filenames: - Dictionary.add_file_to_dictionary( - filename, d, tokenizer.tokenize_line, workers - ) - d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) - return d - - @property - def target_dictionary(self): - return self.dictionary - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task.""" - paths = utils.split_paths(args.data) - assert len(paths) > 0 - dictionary = BertDictionary.load(os.path.join(paths[0], "dict.txt")) - logger.info("dictionary: {} types".format(len(dictionary))) - - return cls(args, dictionary) - - def load_dataset(self, split, epoch=1, combine=False): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - loaded_datasets = [] - - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - logger.info("data_path", data_path) - - for k in itertools.count(): - split_k = split + (str(k) if k > 0 else "") - path = os.path.join(data_path, split_k) - ds = indexed_dataset.make_dataset( - path, - impl=self.args.dataset_impl, - fix_lua_indexing=True, - dictionary=self.dictionary, - ) - - if ds is None: - if k > 0: - break - else: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, data_path) - ) - - with data_utils.numpy_seed(self.seed + k): - loaded_datasets.append( - BlockPairDataset( - ds, - self.dictionary, - ds.sizes, - self.args.tokens_per_sample, - break_mode=self.args.break_mode, - doc_break_size=1, - ) - ) - - logger.info( - "{} {} {} examples".format(data_path, split_k, len(loaded_datasets[-1])) - ) - - if not combine: - break - - if len(loaded_datasets) == 1: - dataset = loaded_datasets[0] - sizes = dataset.sizes - else: - dataset = ConcatDataset(loaded_datasets) - sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) - - self.datasets[split] = MaskedLMDataset( - dataset=dataset, - sizes=sizes, - vocab=self.dictionary, - pad_idx=self.dictionary.pad(), - mask_idx=self.dictionary.mask(), - classif_token_idx=self.dictionary.cls(), - sep_token_idx=self.dictionary.sep(), - shuffle=self.args.shuffle_dataset, - seed=self.seed, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/masked_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/masked_lm.py deleted file mode 100644 index 56086f5e819dd890a180e9b97c24aeab6a873db8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/masked_lm.py +++ /dev/null @@ -1,247 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import os - -import numpy as np -from fairseq import utils -from fairseq.data import ( - Dictionary, - IdDataset, - MaskTokensDataset, - NestedDictionaryDataset, - NumelDataset, - NumSamplesDataset, - PrependTokenDataset, - RightPadDataset, - SortDataset, - TokenBlockDataset, - data_utils, -) -from fairseq.data.encoders.utils import get_whole_word_mask -from fairseq.data.shorten_dataset import maybe_shorten_dataset -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("masked_lm") -class MaskedLMTask(LegacyFairseqTask): - """Task for training masked language models (e.g., BERT, RoBERTa).""" - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument( - "data", - help="colon separated path to data directories list, \ - will be iterated upon during epochs in round-robin manner", - ) - parser.add_argument( - "--sample-break-mode", - default="complete", - choices=["none", "complete", "complete_doc", "eos"], - help='If omitted or "none", fills each sample with tokens-per-sample ' - 'tokens. If set to "complete", splits samples only at the end ' - "of sentence, but may include multiple sentences per sample. " - '"complete_doc" is similar but respects doc boundaries. 
' - 'If set to "eos", includes only one sentence per sample.', - ) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments " - "per sample for BERT dataset", - ) - parser.add_argument( - "--mask-prob", - default=0.15, - type=float, - help="probability of replacing a token with mask", - ) - parser.add_argument( - "--leave-unmasked-prob", - default=0.1, - type=float, - help="probability that a masked token is unmasked", - ) - parser.add_argument( - "--random-token-prob", - default=0.1, - type=float, - help="probability of replacing a token with a random token", - ) - parser.add_argument( - "--freq-weighted-replacement", - default=False, - action="store_true", - help="sample random replacement words based on word frequencies", - ) - parser.add_argument( - "--mask-whole-words", - default=False, - action="store_true", - help="mask whole words; you may also want to set --bpe", - ) - parser.add_argument( - "--shorten-method", - default="none", - choices=["none", "truncate", "random_crop"], - help="if not none, shorten sequences that exceed --tokens-per-sample", - ) - parser.add_argument( - "--shorten-data-split-list", - default="", - help="comma-separated list of dataset splits to apply shortening to, " - 'e.g., "train,valid" (default: all dataset splits)', - ) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed - - # add mask token - self.mask_idx = dictionary.add_symbol("") - - @classmethod - def setup_task(cls, args, **kwargs): - paths = utils.split_paths(args.data) - assert len(paths) > 0 - dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) - logger.info("dictionary: {} types".format(len(dictionary))) - return cls(args, dictionary) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. - - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - split_path = os.path.join(data_path, split) - - dataset = data_utils.load_indexed_dataset( - split_path, - self.source_dictionary, - self.args.dataset_impl, - combine=combine, - ) - if dataset is None: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, split_path) - ) - - dataset = maybe_shorten_dataset( - dataset, - split, - self.args.shorten_data_split_list, - self.args.shorten_method, - self.args.tokens_per_sample, - self.args.seed, - ) - - # create continuous blocks of tokens - dataset = TokenBlockDataset( - dataset, - dataset.sizes, - self.args.tokens_per_sample - 1, # one less for - pad=self.source_dictionary.pad(), - eos=self.source_dictionary.eos(), - break_mode=self.args.sample_break_mode, - ) - logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) - - # prepend beginning-of-sentence token (, equiv. 
to [CLS] in BERT) - dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) - - # create masked input and targets - mask_whole_words = ( - get_whole_word_mask(self.args, self.source_dictionary) - if self.args.mask_whole_words - else None - ) - - src_dataset, tgt_dataset = MaskTokensDataset.apply_mask( - dataset, - self.source_dictionary, - pad_idx=self.source_dictionary.pad(), - mask_idx=self.mask_idx, - seed=self.args.seed, - mask_prob=self.args.mask_prob, - leave_unmasked_prob=self.args.leave_unmasked_prob, - random_token_prob=self.args.random_token_prob, - freq_weighted_replacement=self.args.freq_weighted_replacement, - mask_whole_words=mask_whole_words, - ) - - with data_utils.numpy_seed(self.args.seed + epoch): - shuffle = np.random.permutation(len(src_dataset)) - - self.datasets[split] = SortDataset( - NestedDictionaryDataset( - { - "id": IdDataset(), - "net_input": { - "src_tokens": RightPadDataset( - src_dataset, - pad_idx=self.source_dictionary.pad(), - ), - "src_lengths": NumelDataset(src_dataset, reduce=False), - }, - "target": RightPadDataset( - tgt_dataset, - pad_idx=self.source_dictionary.pad(), - ), - "nsentences": NumSamplesDataset(), - "ntokens": NumelDataset(src_dataset, reduce=True), - }, - sizes=[src_dataset.sizes], - ), - sort_order=[ - shuffle, - src_dataset.sizes, - ], - ) - - def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): - src_dataset = RightPadDataset( - TokenBlockDataset( - src_tokens, - src_lengths, - self.args.tokens_per_sample - 1, # one less for - pad=self.source_dictionary.pad(), - eos=self.source_dictionary.eos(), - break_mode="eos", - ), - pad_idx=self.source_dictionary.pad(), - ) - src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos()) - src_dataset = NestedDictionaryDataset( - { - "id": IdDataset(), - "net_input": { - "src_tokens": src_dataset, - "src_lengths": NumelDataset(src_dataset, reduce=False), - }, - }, - sizes=src_lengths, - ) - if sort: - src_dataset = SortDataset(src_dataset, sort_order=[src_lengths]) - return src_dataset - - @property - def source_dictionary(self): - return self.dictionary - - @property - def target_dictionary(self): - return self.dictionary diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_denoising.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_denoising.py deleted file mode 100644 index d1c914917feb5165aad7482cd1377f5f65b21635..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_denoising.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
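A self-contained sketch of the masking recipe described by --mask-prob, --leave-unmasked-prob, and --random-token-prob above, written with NumPy only; it stands in for MaskTokensDataset.apply_mask rather than reproducing it, and toy_apply_mask plus the example sequence are invented for illustration.

import numpy as np


def toy_apply_mask(tokens, vocab_size, mask_idx, mask_prob=0.15,
                   leave_unmasked_prob=0.1, random_token_prob=0.1, seed=0):
    """Select ~mask_prob of the positions; of those, most become the mask
    symbol, a small share stays unchanged, and a small share is replaced by
    a random vocabulary token (defaults mirror the options above)."""
    rng = np.random.default_rng(seed)
    tokens = np.asarray(tokens)
    target = np.full_like(tokens, -1)  # -1 marks positions the loss ignores

    selected = rng.random(len(tokens)) < mask_prob
    target[selected] = tokens[selected]

    u = rng.random(len(tokens))
    keep_original = selected & (u < leave_unmasked_prob)
    randomize = selected & (u >= leave_unmasked_prob) & (
        u < leave_unmasked_prob + random_token_prob)
    mask = selected & ~keep_original & ~randomize

    out = tokens.copy()
    out[mask] = mask_idx
    out[randomize] = rng.integers(0, vocab_size, size=int(randomize.sum()))
    return out, target


# Example call on a toy sequence over a 50-symbol vocabulary whose mask id is 4.
masked_input, target = toy_apply_mask(list(range(10, 20)), vocab_size=50, mask_idx=4)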
- -import logging -import os - -import numpy as np -from fairseq.data import ( - AppendTokenDataset, - ConcatDataset, - DenoisingDataset, - Dictionary, - PrependTokenDataset, - ResamplingDataset, - SortDataset, - TokenBlockDataset, - data_utils, -) -from fairseq.data.encoders.utils import get_whole_word_mask -from fairseq.tasks import register_task - -from .denoising import DenoisingTask - - -logger = logging.getLogger(__name__) - - -@register_task("multilingual_denoising") -class MultilingualDenoisingTask(DenoisingTask): - @staticmethod - def add_args(parser): - DenoisingTask.add_args(parser) - parser.add_argument( - "--multilang-sampling-alpha", - type=float, - default=1.0, - help="smoothing alpha for sample ratios across multiple datasets", - ) - parser.add_argument("--add-lang-token", default=False, action="store_true") - parser.add_argument( - "--langs", type=str, help="language ids we are considering", default=None - ) - parser.add_argument( - "--no-whole-word-mask-langs", - type=str, - default="", - metavar="N", - help="languages without spacing between words dont support whole word masking", - ) - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task.""" - paths = args.data.split(":") - assert len(paths) > 0 - dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) - - data_path = paths[0] - if args.langs is None: - languages = sorted( - [ - name - for name in os.listdir(data_path) - if os.path.isdir(os.path.join(data_path, name)) - ] - ) - else: - languages = args.langs.split(",") - - if args.add_lang_token: - for lang in languages: - dictionary.add_symbol("[{}]".format(lang)) - - logger.info("dictionary: {} types".format(len(dictionary))) - if not hasattr(args, "shuffle_instance"): - args.shuffle_instance = False - return cls(args, dictionary) - - def __init__(self, args, dictionary): - super().__init__(args, dictionary) - self.dictionary = dictionary - self.seed = args.seed - - # add mask token - self.mask_idx = self.dictionary.add_symbol("") - self.langs = args.langs - self.args = args - - def _get_sample_prob(self, dataset_lens): - """ - Get smoothed sampling porbability by languages. This helps low resource - languages by upsampling them. - """ - prob = dataset_lens / dataset_lens.sum() - smoothed_prob = prob ** self.args.multilang_sampling_alpha - smoothed_prob = smoothed_prob / smoothed_prob.sum() - return smoothed_prob - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = self.args.data.split(":") - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - split_path = os.path.join(data_path, split) - - if self.langs is None: - languages = sorted( - [ - name - for name in os.listdir(data_path) - if os.path.isdir(os.path.join(data_path, name)) - ] - ) - else: - languages = self.langs.split(",") - for name in languages: - p = os.path.join(data_path, name) - assert os.path.exists(p), "data not found: {}".format(p) - - logger.info("Training on {0} languages: {1}".format(len(languages), languages)) - logger.info( - "Language to id mapping: ", {lang: id for id, lang in enumerate(languages)} - ) - - mask_whole_words = get_whole_word_mask(self.args, self.dictionary) - language_without_segmentations = self.args.no_whole_word_mask_langs.split(",") - lang_datasets = [] - for language in languages: - split_path = os.path.join(data_path, language, split) - - dataset = data_utils.load_indexed_dataset( - split_path, - self.source_dictionary, - self.args.dataset_impl, - combine=combine, - ) - if dataset is None: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, split_path) - ) - - end_token = ( - self.source_dictionary.index("[{}]".format(language)) - if self.args.add_lang_token - else self.source_dictionary.eos() - ) - - # create continuous blocks of tokens - dataset = TokenBlockDataset( - dataset, - dataset.sizes, - self.args.tokens_per_sample - 2, # one less for - pad=self.source_dictionary.pad(), - eos=end_token, - break_mode=self.args.sample_break_mode, - ) - logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) - - # prepend beginning-of-sentence token (, equiv. to [CLS] in BERT) - dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) - dataset = AppendTokenDataset(dataset, end_token) - - lang_mask_whole_words = ( - mask_whole_words - if language not in language_without_segmentations - else None - ) - lang_dataset = DenoisingDataset( - dataset, - dataset.sizes, - self.dictionary, - self.mask_idx, - lang_mask_whole_words, - shuffle=self.args.shuffle_instance, - seed=self.seed, - args=self.args, - eos=None - if not self.args.add_lang_token - else self.source_dictionary.index("[{}]".format(language)), - ) - lang_datasets.append(lang_dataset) - - dataset_lengths = np.array( - [len(d) for d in lang_datasets], - dtype=float, - ) - logger.info( - "loaded total {} blocks for all languages".format( - int(dataset_lengths.sum()), - ) - ) - if split == self.args.train_subset: - # For train subset, additionally up or down sample languages. 
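# A worked example of the size-ratio computation below, with numbers invented
# for illustration: with multilang_sampling_alpha = 0.5 and two languages
# holding 900 and 100 blocks, prob = [0.9, 0.1]; the smoothed probabilities
# are proportional to [0.9 ** 0.5, 0.1 ** 0.5] and normalize to roughly
# [0.75, 0.25]. size_ratio then equals sample_probs * 1000 / dataset_lengths,
# about [0.83, 2.5], so ResamplingDataset upsamples the low-resource language
# ~2.5x and slightly downsamples the high-resource one.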
- sample_probs = self._get_sample_prob(dataset_lengths) - logger.info( - "Sample probability by language: {}".format( - { - lang: "{0:.4f}".format(sample_probs[id]) - for id, lang in enumerate(languages) - } - ) - ) - size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths - logger.info( - "Up/Down Sampling ratio by language: {}".format( - { - lang: "{0:.2f}".format(size_ratio[id]) - for id, lang in enumerate(languages) - } - ) - ) - - resampled_lang_datasets = [ - ResamplingDataset( - lang_datasets[i], - size_ratio=size_ratio[i], - seed=self.args.seed, - epoch=epoch, - replace=size_ratio[i] >= 1.0, - ) - for i, d in enumerate(lang_datasets) - ] - dataset = ConcatDataset( - resampled_lang_datasets, - ) - else: - dataset = ConcatDataset(lang_datasets) - lang_splits = [split] - for lang_id, lang_dataset in enumerate(lang_datasets): - split_name = split + "_" + languages[lang_id] - lang_splits.append(split_name) - self.datasets[split_name] = lang_dataset - - if split in self.args.valid_subset: - self.args.valid_subset = self.args.valid_subset.replace( - split, ",".join(lang_splits) - ) - - with data_utils.numpy_seed(self.args.seed + epoch): - shuffle = np.random.permutation(len(dataset)) - - self.datasets[split] = SortDataset( - dataset, - sort_order=[ - shuffle, - dataset.sizes, - ], - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_masked_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_masked_lm.py deleted file mode 100644 index 9e6ce4b8a2f77ed889a6e1451321a8e3ac21dc67..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_masked_lm.py +++ /dev/null @@ -1,338 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import os - -import numpy as np -import torch -from fairseq import utils -from fairseq.data import ( - ConcatDataset, - Dictionary, - IdDataset, - MaskTokensDataset, - NestedDictionaryDataset, - NumelDataset, - NumSamplesDataset, - PadDataset, - PrependTokenDataset, - RawLabelDataset, - ResamplingDataset, - SortDataset, - TokenBlockDataset, - data_utils, - encoders, -) -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("multilingual_masked_lm") -class MultiLingualMaskedLMTask(LegacyFairseqTask): - """Task for training masked language models (e.g., BERT, RoBERTa).""" - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument( - "data", - help="colon separated path to data directories list, \ - will be iterated upon during epochs in round-robin manner", - ) - parser.add_argument( - "--sample-break-mode", - default="complete", - choices=["none", "complete", "complete_doc", "eos"], - help='If omitted or "none", fills each sample with tokens-per-sample ' - 'tokens. If set to "complete", splits samples only at the end ' - "of sentence, but may include multiple sentences per sample. " - '"complete_doc" is similar but respects doc boundaries. 
' - 'If set to "eos", includes only one sentence per sample.', - ) - parser.add_argument( - "--tokens-per-sample", - default=512, - type=int, - help="max number of total tokens over all segments " - "per sample for BERT dataset", - ) - parser.add_argument( - "--mask-prob", - default=0.15, - type=float, - help="probability of replacing a token with mask", - ) - parser.add_argument( - "--leave-unmasked-prob", - default=0.1, - type=float, - help="probability that a masked token is unmasked", - ) - parser.add_argument( - "--random-token-prob", - default=0.1, - type=float, - help="probability of replacing a token with a random token", - ) - parser.add_argument( - "--freq-weighted-replacement", - action="store_true", - help="sample random replacement words based on word frequencies", - ) - parser.add_argument( - "--mask-whole-words", - default=False, - action="store_true", - help="mask whole words; you may also want to set --bpe", - ) - parser.add_argument( - "--multilang-sampling-alpha", - type=float, - default=1.0, - help="smoothing alpha for sample rations across multiple datasets", - ) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - self.seed = args.seed - - # add mask token - self.mask_idx = dictionary.add_symbol("") - - @classmethod - def setup_task(cls, args, **kwargs): - paths = utils.split_paths(args.data) - assert len(paths) > 0 - dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) - logger.info("dictionary: {} types".format(len(dictionary))) - return cls(args, dictionary) - - def _get_whole_word_mask(self): - # create masked input and targets - if self.args.mask_whole_words: - bpe = encoders.build_bpe(self.args) - if bpe is not None: - - def is_beginning_of_word(i): - if i < self.source_dictionary.nspecial: - # special elements are always considered beginnings - return True - tok = self.source_dictionary[i] - if tok.startswith("madeupword"): - return True - try: - return bpe.is_beginning_of_word(tok) - except ValueError: - return True - - mask_whole_words = torch.ByteTensor( - list(map(is_beginning_of_word, range(len(self.source_dictionary)))) - ) - else: - mask_whole_words = None - return mask_whole_words - - def _get_sample_prob(self, dataset_lens): - """ - Get smoothed sampling porbability by languages. This helps low resource - languages by upsampling them. - """ - prob = dataset_lens / dataset_lens.sum() - smoothed_prob = prob ** self.args.multilang_sampling_alpha - smoothed_prob = smoothed_prob / smoothed_prob.sum() - return smoothed_prob - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - - languages = sorted( - name - for name in os.listdir(data_path) - if os.path.isdir(os.path.join(data_path, name)) - ) - - logger.info("Training on {0} languages: {1}".format(len(languages), languages)) - logger.info( - "Language to id mapping: ", {lang: id for id, lang in enumerate(languages)} - ) - - mask_whole_words = self._get_whole_word_mask() - lang_datasets = [] - for lang_id, language in enumerate(languages): - split_path = os.path.join(data_path, language, split) - - dataset = data_utils.load_indexed_dataset( - split_path, - self.source_dictionary, - self.args.dataset_impl, - combine=combine, - ) - if dataset is None: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, split_path) - ) - - # create continuous blocks of tokens - dataset = TokenBlockDataset( - dataset, - dataset.sizes, - self.args.tokens_per_sample - 1, # one less for - pad=self.source_dictionary.pad(), - eos=self.source_dictionary.eos(), - break_mode=self.args.sample_break_mode, - ) - logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) - - # prepend beginning-of-sentence token (, equiv. to [CLS] in BERT) - dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) - - src_dataset, tgt_dataset = MaskTokensDataset.apply_mask( - dataset, - self.source_dictionary, - pad_idx=self.source_dictionary.pad(), - mask_idx=self.mask_idx, - seed=self.args.seed, - mask_prob=self.args.mask_prob, - leave_unmasked_prob=self.args.leave_unmasked_prob, - random_token_prob=self.args.random_token_prob, - freq_weighted_replacement=self.args.freq_weighted_replacement, - mask_whole_words=mask_whole_words, - ) - - lang_dataset = NestedDictionaryDataset( - { - "net_input": { - "src_tokens": PadDataset( - src_dataset, - pad_idx=self.source_dictionary.pad(), - left_pad=False, - ), - "src_lengths": NumelDataset(src_dataset, reduce=False), - }, - "target": PadDataset( - tgt_dataset, - pad_idx=self.source_dictionary.pad(), - left_pad=False, - ), - "nsentences": NumSamplesDataset(), - "ntokens": NumelDataset(src_dataset, reduce=True), - "lang_id": RawLabelDataset([lang_id] * src_dataset.sizes.shape[0]), - }, - sizes=[src_dataset.sizes], - ) - lang_datasets.append(lang_dataset) - - dataset_lengths = np.array( - [len(d) for d in lang_datasets], - dtype=float, - ) - logger.info( - "loaded total {} blocks for all languages".format( - dataset_lengths.sum(), - ) - ) - if split == self.args.train_subset: - # For train subset, additionally up or down sample languages. 
- sample_probs = self._get_sample_prob(dataset_lengths) - logger.info( - "Sample probability by language: ", - { - lang: "{0:.4f}".format(sample_probs[id]) - for id, lang in enumerate(languages) - }, - ) - size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths - logger.info( - "Up/Down Sampling ratio by language: ", - { - lang: "{0:.2f}".format(size_ratio[id]) - for id, lang in enumerate(languages) - }, - ) - - resampled_lang_datasets = [ - ResamplingDataset( - lang_datasets[i], - size_ratio=size_ratio[i], - seed=self.args.seed, - epoch=epoch, - replace=size_ratio[i] >= 1.0, - ) - for i, d in enumerate(lang_datasets) - ] - dataset = ConcatDataset(resampled_lang_datasets) - else: - dataset = ConcatDataset(lang_datasets) - lang_splits = [split] - for lang_id, lang_dataset in enumerate(lang_datasets): - split_name = split + "_" + languages[lang_id] - lang_splits.append(split_name) - self.datasets[split_name] = lang_dataset - - # [TODO]: This is hacky for now to print validation ppl for each - # language individually. Maybe need task API changes to allow it - # in more generic ways. - if split in self.args.valid_subset: - self.args.valid_subset = self.args.valid_subset.replace( - split, ",".join(lang_splits) - ) - - with data_utils.numpy_seed(self.args.seed + epoch): - shuffle = np.random.permutation(len(dataset)) - - self.datasets[split] = SortDataset( - dataset, - sort_order=[ - shuffle, - dataset.sizes, - ], - ) - - def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): - src_dataset = PadDataset( - TokenBlockDataset( - src_tokens, - src_lengths, - self.args.tokens_per_sample - 1, # one less for - pad=self.source_dictionary.pad(), - eos=self.source_dictionary.eos(), - break_mode="eos", - ), - pad_idx=self.source_dictionary.pad(), - left_pad=False, - ) - src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos()) - src_dataset = NestedDictionaryDataset( - { - "id": IdDataset(), - "net_input": { - "src_tokens": src_dataset, - "src_lengths": NumelDataset(src_dataset, reduce=False), - }, - }, - sizes=src_lengths, - ) - if sort: - src_dataset = SortDataset(src_dataset, sort_order=[src_lengths]) - return src_dataset - - @property - def source_dictionary(self): - return self.dictionary - - @property - def target_dictionary(self): - return self.dictionary diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_translation.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_translation.py deleted file mode 100644 index f6cb17f12a1144299169a2cef2bde6f5b5cd4756..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/multilingual_translation.py +++ /dev/null @@ -1,449 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import contextlib -import logging -import os -from collections import OrderedDict - -import torch -from fairseq import metrics, options, utils -from fairseq.data import ( - Dictionary, - LanguagePairDataset, - RoundRobinZipDatasets, - TransformEosLangPairDataset, -) -from fairseq.models import FairseqMultiModel -from fairseq.tasks.translation import load_langpair_dataset - -from . 
import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -def _lang_token(lang: str): - return "__{}__".format(lang) - - -def _lang_token_index(dic: Dictionary, lang: str): - """Return language token index.""" - idx = dic.index(_lang_token(lang)) - assert idx != dic.unk_index, "cannot find language token for lang {}".format(lang) - return idx - - -@register_task("multilingual_translation") -class MultilingualTranslationTask(LegacyFairseqTask): - """A task for training multiple translation models simultaneously. - - We iterate round-robin over batches from multiple language pairs, ordered - according to the `--lang-pairs` argument. - - The training loop is roughly: - - for i in range(len(epoch)): - for lang_pair in args.lang_pairs: - batch = next_batch_for_lang_pair(lang_pair) - loss = criterion(model_for_lang_pair(lang_pair), batch) - loss.backward() - optimizer.step() - - In practice, `next_batch_for_lang_pair` is abstracted in a FairseqDataset - (e.g., `RoundRobinZipDatasets`) and `model_for_lang_pair` is a model that - implements the `FairseqMultiModel` interface. - - During inference it is required to specify a single `--source-lang` and - `--target-lang`, which indicates the inference langauge direction. - `--lang-pairs`, `--encoder-langtok`, `--decoder-langtok` have to be set to - the same value as training. - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - parser.add_argument('data', metavar='DIR', help='path to data directory') - parser.add_argument('--lang-pairs', default=None, metavar='PAIRS', - help='comma-separated list of language pairs (in training order): en-de,en-fr,de-fr') - parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', - help='source language (only needed for inference)') - parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', - help='target language (only needed for inference)') - parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL', - help='pad the source on the left (default: True)') - parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL', - help='pad the target on the left (default: False)') - parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N', - help='max number of tokens in the source sequence') - parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N', - help='max number of tokens in the target sequence') - parser.add_argument('--upsample-primary', default=1, type=int, - help='amount to upsample primary dataset') - parser.add_argument('--encoder-langtok', default=None, type=str, choices=['src', 'tgt'], - metavar='SRCTGT', - help='replace beginning-of-sentence in source sentence with source or target ' - 'language token. (src/tgt)') - parser.add_argument('--decoder-langtok', action='store_true', - help='replace beginning-of-sentence in target sentence with target language token') - # fmt: on - - def __init__(self, args, dicts, training): - super().__init__(args) - self.dicts = dicts - self.training = training - if training: - self.lang_pairs = args.lang_pairs - else: - self.lang_pairs = ["{}-{}".format(args.source_lang, args.target_lang)] - # eval_lang_pairs for multilingual translation is usually all of the - # lang_pairs. However for other multitask settings or when we want to - # optimize for certain languages we want to use a different subset. 
Thus - # the eval_lang_pairs class variable is provided for classes that extend - # this class. - self.eval_lang_pairs = self.lang_pairs - # model_lang_pairs will be used to build encoder-decoder model pairs in - # models.build_model(). This allows multitask type of sub-class can - # build models other than the input lang_pairs - self.model_lang_pairs = self.lang_pairs - self.langs = list(dicts.keys()) - - @classmethod - def setup_task(cls, args, **kwargs): - dicts, training = cls.prepare(args, **kwargs) - return cls(args, dicts, training) - - @classmethod - def prepare(cls, args, **kargs): - args.left_pad_source = utils.eval_bool(args.left_pad_source) - args.left_pad_target = utils.eval_bool(args.left_pad_target) - - if args.lang_pairs is None: - raise ValueError( - "--lang-pairs is required. List all the language pairs in the training objective." - ) - if isinstance(args.lang_pairs, str): - args.lang_pairs = args.lang_pairs.split(",") - sorted_langs = sorted( - list({x for lang_pair in args.lang_pairs for x in lang_pair.split("-")}) - ) - if args.source_lang is not None or args.target_lang is not None: - training = False - else: - training = True - - # load dictionaries - dicts = OrderedDict() - for lang in sorted_langs: - paths = utils.split_paths(args.data) - assert len(paths) > 0 - dicts[lang] = cls.load_dictionary( - os.path.join(paths[0], "dict.{}.txt".format(lang)) - ) - if len(dicts) > 0: - assert dicts[lang].pad() == dicts[sorted_langs[0]].pad() - assert dicts[lang].eos() == dicts[sorted_langs[0]].eos() - assert dicts[lang].unk() == dicts[sorted_langs[0]].unk() - if args.encoder_langtok is not None or args.decoder_langtok: - for lang_to_add in sorted_langs: - dicts[lang].add_symbol(_lang_token(lang_to_add)) - logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang]))) - return dicts, training - - def get_encoder_langtok(self, src_lang, tgt_lang): - if self.args.encoder_langtok is None: - return self.dicts[src_lang].eos() - if self.args.encoder_langtok == "src": - return _lang_token_index(self.dicts[src_lang], src_lang) - else: - return _lang_token_index(self.dicts[src_lang], tgt_lang) - - def get_decoder_langtok(self, tgt_lang): - if not self.args.decoder_langtok: - return self.dicts[tgt_lang].eos() - return _lang_token_index(self.dicts[tgt_lang], tgt_lang) - - def alter_dataset_langtok( - self, - lang_pair_dataset, - src_eos=None, - src_lang=None, - tgt_eos=None, - tgt_lang=None, - ): - if self.args.encoder_langtok is None and not self.args.decoder_langtok: - return lang_pair_dataset - - new_src_eos = None - if ( - self.args.encoder_langtok is not None - and src_eos is not None - and src_lang is not None - and tgt_lang is not None - ): - new_src_eos = self.get_encoder_langtok(src_lang, tgt_lang) - else: - src_eos = None - - new_tgt_bos = None - if self.args.decoder_langtok and tgt_eos is not None and tgt_lang is not None: - new_tgt_bos = self.get_decoder_langtok(tgt_lang) - else: - tgt_eos = None - - return TransformEosLangPairDataset( - lang_pair_dataset, - src_eos=src_eos, - new_src_eos=new_src_eos, - tgt_bos=tgt_eos, - new_tgt_bos=new_tgt_bos, - ) - - def load_dataset(self, split, epoch=1, **kwargs): - """Load a dataset split.""" - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - - def language_pair_dataset(lang_pair): - src, tgt = lang_pair.split("-") - langpair_dataset = load_langpair_dataset( - data_path, - split, - src, - self.dicts[src], - tgt, - self.dicts[tgt], - combine=True, - 
dataset_impl=self.args.dataset_impl, - upsample_primary=self.args.upsample_primary, - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - max_source_positions=self.args.max_source_positions, - max_target_positions=self.args.max_target_positions, - ) - return self.alter_dataset_langtok( - langpair_dataset, - src_eos=self.dicts[src].eos(), - src_lang=src, - tgt_eos=self.dicts[tgt].eos(), - tgt_lang=tgt, - ) - - self.datasets[split] = RoundRobinZipDatasets( - OrderedDict( - [ - (lang_pair, language_pair_dataset(lang_pair)) - for lang_pair in self.lang_pairs - ] - ), - eval_key=None - if self.training - else "%s-%s" % (self.args.source_lang, self.args.target_lang), - ) - - def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None): - if constraints is not None: - raise NotImplementedError( - "Constrained decoding with the multilingual_translation task is not supported" - ) - - lang_pair = "%s-%s" % (self.args.source_lang, self.args.target_lang) - return RoundRobinZipDatasets( - OrderedDict( - [ - ( - lang_pair, - self.alter_dataset_langtok( - LanguagePairDataset( - src_tokens, src_lengths, self.source_dictionary - ), - src_eos=self.source_dictionary.eos(), - src_lang=self.args.source_lang, - tgt_eos=self.target_dictionary.eos(), - tgt_lang=self.args.target_lang, - ), - ) - ] - ), - eval_key=lang_pair, - ) - - def build_model(self, args): - def check_args(): - messages = [] - if ( - len(set(self.args.lang_pairs).symmetric_difference(args.lang_pairs)) - != 0 - ): - messages.append( - "--lang-pairs should include all the language pairs {}.".format( - args.lang_pairs - ) - ) - if self.args.encoder_langtok != args.encoder_langtok: - messages.append( - "--encoder-langtok should be {}.".format(args.encoder_langtok) - ) - if self.args.decoder_langtok != args.decoder_langtok: - messages.append( - "--decoder-langtok should {} be set.".format( - "" if args.decoder_langtok else "not" - ) - ) - - if len(messages) > 0: - raise ValueError(" ".join(messages)) - - # Check if task args are consistant with model args - check_args() - - from fairseq import models - - model = models.build_model(args, self) - if not isinstance(model, FairseqMultiModel): - raise ValueError( - "MultilingualTranslationTask requires a FairseqMultiModel architecture" - ) - return model - - def _per_lang_pair_train_loss( - self, lang_pair, model, update_num, criterion, sample, optimizer, ignore_grad - ): - loss, sample_size, logging_output = criterion( - model.models[lang_pair], sample[lang_pair] - ) - if ignore_grad: - loss *= 0 - optimizer.backward(loss) - return loss, sample_size, logging_output - - def train_step( - self, sample, model, criterion, optimizer, update_num, ignore_grad=False - ): - model.train() - from collections import defaultdict - - agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, defaultdict(float) - curr_lang_pairs = [ - lang_pair - for lang_pair in self.model_lang_pairs - if sample[lang_pair] is not None and len(sample[lang_pair]) != 0 - ] - - for idx, lang_pair in enumerate(curr_lang_pairs): - - def maybe_no_sync(): - if ( - self.args.distributed_world_size > 1 - and hasattr(model, "no_sync") - and idx < len(curr_lang_pairs) - 1 - ): - return model.no_sync() - else: - return contextlib.ExitStack() # dummy contextmanager - - with maybe_no_sync(): - loss, sample_size, logging_output = self._per_lang_pair_train_loss( - lang_pair, - model, - update_num, - criterion, - sample, - optimizer, - ignore_grad, - ) - agg_loss += loss.detach().item() - # 
TODO make summing of the sample sizes configurable - agg_sample_size += sample_size - for k in logging_output: - agg_logging_output[k] += logging_output[k] - agg_logging_output[f"{lang_pair}:{k}"] += logging_output[k] - return agg_loss, agg_sample_size, agg_logging_output - - def _per_lang_pair_valid_loss(self, lang_pair, model, criterion, sample): - return criterion(model.models[lang_pair], sample[lang_pair]) - - def valid_step(self, sample, model, criterion): - model.eval() - with torch.no_grad(): - from collections import defaultdict - - agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, defaultdict(float) - for lang_pair in self.eval_lang_pairs: - if ( - lang_pair not in sample - or sample[lang_pair] is None - or len(sample[lang_pair]) == 0 - ): - continue - loss, sample_size, logging_output = self._per_lang_pair_valid_loss( - lang_pair, model, criterion, sample - ) - agg_loss += loss.data.item() - # TODO make summing of the sample sizes configurable - agg_sample_size += sample_size - for k in logging_output: - agg_logging_output[k] += logging_output[k] - agg_logging_output[f"{lang_pair}:{k}"] += logging_output[k] - return agg_loss, agg_sample_size, agg_logging_output - - def inference_step( - self, generator, models, sample, prefix_tokens=None, constraints=None - ): - with torch.no_grad(): - if self.args.decoder_langtok: - bos_token = _lang_token_index( - self.target_dictionary, self.args.target_lang - ) - else: - bos_token = self.target_dictionary.eos() - return generator.generate( - models, - sample, - prefix_tokens=prefix_tokens, - constraints=constraints, - bos_token=bos_token, - ) - - def reduce_metrics(self, logging_outputs, criterion): - with metrics.aggregate(): - # pass 'sample_size', 'nsentences', 'ntokens' stats to fairseq_task - super().reduce_metrics(logging_outputs, criterion) - for k in ["sample_size", "nsentences", "ntokens"]: - metrics.log_scalar(k, sum(l[k] for l in logging_outputs)) - - @property - def source_dictionary(self): - if self.training: - return next(iter(self.dicts.values())) - else: - return self.dicts[self.args.source_lang] - - @property - def target_dictionary(self): - if self.training: - return next(iter(self.dicts.values())) - else: - return self.dicts[self.args.target_lang] - - def max_positions(self): - """Return the max sentence length allowed by the task.""" - if len(self.datasets.values()) == 0: - return { - "%s-%s" - % (self.args.source_lang, self.args.target_lang): ( - self.args.max_source_positions, - self.args.max_target_positions, - ) - } - return OrderedDict( - [ - (key, (self.args.max_source_positions, self.args.max_target_positions)) - for split in self.datasets.keys() - for key in self.datasets[split].datasets.keys() - ] - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/semisupervised_translation.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/semisupervised_translation.py deleted file mode 100644 index b2f9bf9a733d94e50b588e4316b4a02e1c8bcf51..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/semisupervised_translation.py +++ /dev/null @@ -1,485 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
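A small sketch of how the __lang__ tokens used by MultilingualTranslationTask above reach the vocabulary, assuming only the Dictionary methods already exercised in that file (add_symbol, index, unk_index) and that Dictionary can be built with its default special symbols; the en-de pair is invented for illustration.

from fairseq.data import Dictionary


def _lang_token(lang: str) -> str:
    # Same formatting as the helper in multilingual_translation.py above.
    return "__{}__".format(lang)


d = Dictionary()  # assumes the default pad/eos/unk/bos symbols
for lang in ("en", "de"):
    d.add_symbol(_lang_token(lang))

# With --encoder-langtok tgt on an en-de pair, the source-side end-of-sentence
# token is swapped for the target-language token, whose id is now resolvable:
de_token_index = d.index(_lang_token("de"))
assert de_token_index != d.unk_index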
- -import logging -import os -from collections import OrderedDict - -from fairseq import utils -from fairseq.data import ( - BacktranslationDataset, - IndexedCachedDataset, - IndexedDataset, - IndexedRawTextDataset, - LanguagePairDataset, - NoisingDataset, - RoundRobinZipDatasets, - data_utils, - indexed_dataset, -) -from fairseq.models import FairseqMultiModel -from fairseq.sequence_generator import SequenceGenerator - -from . import register_task -from .multilingual_translation import MultilingualTranslationTask - - -logger = logging.getLogger(__name__) - - -def _get_bt_dataset_key(lang_pair): - return "bt:" + lang_pair - - -def _get_denoising_dataset_key(lang_pair): - return "denoising:" + lang_pair - - -# ported from UnsupervisedMT -def parse_lambda_config(x): - """ - Parse the configuration of lambda coefficient (for scheduling). - x = "3" # lambda will be a constant equal to x - x = "0:1,1000:0" # lambda will start from 1 and linearly decrease - # to 0 during the first 1000 iterations - x = "0:0,1000:0,2000:1" # lambda will be equal to 0 for the first 1000 - # iterations, then will linearly increase to 1 until iteration 2000 - """ - split = x.split(",") - if len(split) == 1: - return float(x), None - else: - split = [s.split(os.pathsep) for s in split] - assert all(len(s) == 2 for s in split) - assert all(k.isdigit() for k, _ in split) - assert all( - int(split[i][0]) < int(split[i + 1][0]) for i in range(len(split) - 1) - ) - return float(split[0][1]), [(int(k), float(v)) for k, v in split] - - -@register_task("semisupervised_translation") -class SemisupervisedTranslationTask(MultilingualTranslationTask): - """A task for training multiple translation models simultaneously. - - We iterate round-robin over batches from multiple language pairs, ordered - according to the `--lang-pairs` argument. - - The training loop is roughly: - - for i in range(len(epoch)): - for lang_pair in args.lang_pairs: - batch = next_batch_for_lang_pair(lang_pair) - loss = criterion(model_for_lang_pair(lang_pair), batch) - loss.backward() - optimizer.step() - - In practice, `next_batch_for_lang_pair` is abstracted in a FairseqDataset - (e.g., `RoundRobinZipDatasets`) and `model_for_lang_pair` is a model that - implements the `FairseqMultiModel` interface. - - During inference it is required to specify a single `--source-lang` and - `--target-lang`, instead of `--lang-pairs`. - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - MultilingualTranslationTask.add_args(parser) - parser.add_argument('--lambda-parallel-config', default="1.0", type=str, metavar='CONFIG', - help='cross-entropy reconstruction coefficient (parallel data). ' - 'use fixed weight during training if set to floating point number. ' - 'use piecewise linear function over number of updates to schedule the ' - 'weight with the format: w0:step0,w1:step1,...') - parser.add_argument('--lambda-denoising-config', default="0.0", type=str, metavar='CONFIG', - help='Cross-entropy reconstruction coefficient (denoising autoencoding)' - 'use fixed weight during training if set to floating point number. ' - 'use piecewise linear function over number of updates to schedule the ' - 'weight with the format: w0:step0,w1:step1,...') - parser.add_argument('--lambda-otf-bt-config', default="0.0", type=str, metavar='CONFIG', - help='cross-entropy reconstruction coefficient (on-the-fly back-translation parallel data)' - 'use fixed weight during training if set to floating point number. 
' - 'use piecewise linear function over number of updates to schedule the ' - 'weight with the format: w0:step0,w1:step1,...') - parser.add_argument('--bt-max-len-a', default=1.1, type=float, metavar='N', - help='generate back-translated sequences of maximum length ax + b, where x is the ' - 'source length') - parser.add_argument('--bt-max-len-b', default=10.0, type=float, metavar='N', - help='generate back-translated sequences of maximum length ax + b, where x is the ' - 'source length') - parser.add_argument('--bt-beam-size', default=1, type=int, metavar='N', - help='beam size used in beam search of online back-translation') - parser.add_argument('--max-word-shuffle-distance', default=3.0, type=float, metavar='N', - help='maximum word shuffle distance for denoising autoencoding data generation') - parser.add_argument('--word-dropout-prob', default=0.1, type=float, metavar='N', - help='word dropout probability for denoising autoencoding data generation') - parser.add_argument('--word-blanking-prob', default=0.2, type=float, metavar='N', - help='word blanking probability for denoising autoencoding data generation') - # fmt: on - - def __init__(self, args, dicts, training): - super().__init__(args, dicts, training) - self.lambda_parallel, self.lambda_parallel_steps = parse_lambda_config( - args.lambda_parallel_config - ) - self.lambda_otf_bt, self.lambda_otf_bt_steps = parse_lambda_config( - args.lambda_otf_bt_config - ) - self.lambda_denoising, self.lambda_denoising_steps = parse_lambda_config( - args.lambda_denoising_config - ) - if self.lambda_denoising > 0.0 or self.lambda_denoising_steps is not None: - denoising_lang_pairs = [ - "%s-%s" % (tgt, tgt) - for tgt in {lang_pair.split("-")[1] for lang_pair in args.lang_pairs} - ] - self.model_lang_pairs = self.model_lang_pairs + denoising_lang_pairs - self.backtranslate_datasets = {} - self.backtranslators = {} - - @classmethod - def setup_task(cls, args, **kwargs): - dicts, training = MultilingualTranslationTask.prepare(args, **kwargs) - return cls(args, dicts, training) - - def load_dataset(self, split, epoch=1, **kwargs): - """Load a dataset split.""" - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - - def split_exists(split, src, tgt, lang): - if src is not None: - filename = os.path.join( - data_path, "{}.{}-{}.{}".format(split, src, tgt, lang) - ) - else: - filename = os.path.join( - data_path, "{}.{}-None.{}".format(split, src, tgt) - ) - return indexed_dataset.dataset_exists(filename, impl=self.args.dataset_impl) - - def load_indexed_dataset(path, dictionary): - return data_utils.load_indexed_dataset( - path, dictionary, self.args.dataset_impl - ) - - # load parallel datasets - src_datasets, tgt_datasets = {}, {} - if ( - self.lambda_parallel > 0.0 - or self.lambda_parallel_steps is not None - or not split.startswith("train") - ): - for lang_pair in self.lang_pairs: - src, tgt = lang_pair.split("-") - if split_exists(split, src, tgt, src): - prefix = os.path.join( - data_path, "{}.{}-{}.".format(split, src, tgt) - ) - elif split_exists(split, tgt, src, src): - prefix = os.path.join( - data_path, "{}.{}-{}.".format(split, tgt, src) - ) - else: - continue - src_datasets[lang_pair] = load_indexed_dataset( - prefix + src, self.dicts[src] - ) - tgt_datasets[lang_pair] = load_indexed_dataset( - prefix + tgt, self.dicts[tgt] - ) - logger.info( - "parallel-{} {} {} examples".format( - data_path, split, len(src_datasets[lang_pair]) - ) - ) - if len(src_datasets) == 0: - raise 
FileNotFoundError( - "Dataset not found: {} ({})".format(split, data_path) - ) - - # back translation datasets - backtranslate_datasets = {} - if ( - self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None - ) and split.startswith("train"): - for lang_pair in self.lang_pairs: - src, tgt = lang_pair.split("-") - if not split_exists(split, tgt, None, tgt): - raise FileNotFoundError( - "Dataset not found: backtranslation {} ({})".format( - split, data_path - ) - ) - filename = os.path.join( - data_path, "{}.{}-None.{}".format(split, tgt, tgt) - ) - dataset = load_indexed_dataset(filename, self.dicts[tgt]) - lang_pair_dataset_tgt = LanguagePairDataset( - dataset, - dataset.sizes, - self.dicts[tgt], - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - ) - lang_pair_dataset = LanguagePairDataset( - dataset, - dataset.sizes, - src_dict=self.dicts[src], - tgt=dataset, - tgt_sizes=dataset.sizes, - tgt_dict=self.dicts[tgt], - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - ) - backtranslate_datasets[lang_pair] = BacktranslationDataset( - tgt_dataset=self.alter_dataset_langtok( - lang_pair_dataset_tgt, - src_eos=self.dicts[tgt].eos(), - src_lang=tgt, - tgt_lang=src, - ), - backtranslation_fn=self.backtranslators[lang_pair], - src_dict=self.dicts[src], - tgt_dict=self.dicts[tgt], - output_collater=self.alter_dataset_langtok( - lang_pair_dataset=lang_pair_dataset, - src_eos=self.dicts[src].eos(), - src_lang=src, - tgt_eos=self.dicts[tgt].eos(), - tgt_lang=tgt, - ).collater, - ) - logger.info( - "backtranslate-{}: {} {} {} examples".format( - tgt, - data_path, - split, - len(backtranslate_datasets[lang_pair]), - ) - ) - self.backtranslate_datasets[lang_pair] = backtranslate_datasets[ - lang_pair - ] - - # denoising autoencoder - noising_datasets = {} - if ( - self.lambda_denoising > 0.0 or self.lambda_denoising_steps is not None - ) and split.startswith("train"): - for lang_pair in self.lang_pairs: - _, tgt = lang_pair.split("-") - if not split_exists(split, tgt, None, tgt): - continue - filename = os.path.join( - data_path, "{}.{}-None.{}".format(split, tgt, tgt) - ) - tgt_dataset1 = load_indexed_dataset(filename, self.dicts[tgt]) - tgt_dataset2 = load_indexed_dataset(filename, self.dicts[tgt]) - noising_dataset = NoisingDataset( - tgt_dataset1, - self.dicts[tgt], - seed=1, - max_word_shuffle_distance=self.args.max_word_shuffle_distance, - word_dropout_prob=self.args.word_dropout_prob, - word_blanking_prob=self.args.word_blanking_prob, - ) - noising_datasets[lang_pair] = self.alter_dataset_langtok( - LanguagePairDataset( - noising_dataset, - tgt_dataset1.sizes, - self.dicts[tgt], - tgt_dataset2, - tgt_dataset2.sizes, - self.dicts[tgt], - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - ), - src_eos=self.dicts[tgt].eos(), - src_lang=tgt, - tgt_eos=self.dicts[tgt].eos(), - tgt_lang=tgt, - ) - logger.info( - "denoising-{}: {} {} {} examples".format( - tgt, - data_path, - split, - len(noising_datasets[lang_pair]), - ) - ) - - def language_pair_dataset(lang_pair): - src, tgt = lang_pair.split("-") - src_dataset, tgt_dataset = src_datasets[lang_pair], tgt_datasets[lang_pair] - return self.alter_dataset_langtok( - LanguagePairDataset( - src_dataset, - src_dataset.sizes, - self.dicts[src], - tgt_dataset, - tgt_dataset.sizes, - self.dicts[tgt], - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - ), - self.dicts[src].eos(), - src, - 
self.dicts[tgt].eos(), - tgt, - ) - - self.datasets[split] = RoundRobinZipDatasets( - OrderedDict( - [ - (lang_pair, language_pair_dataset(lang_pair)) - for lang_pair in src_datasets.keys() - ] - + [ - (_get_bt_dataset_key(lang_pair), dataset) - for lang_pair, dataset in backtranslate_datasets.items() - ] - + [ - (_get_denoising_dataset_key(lang_pair), dataset) - for lang_pair, dataset in noising_datasets.items() - ] - ), - eval_key=None - if self.training - else "%s-%s" % (self.args.source_lang, self.args.target_lang), - ) - - def build_model(self, args): - from fairseq import models - - model = models.build_model(args, self) - if not isinstance(model, FairseqMultiModel): - raise ValueError( - "SemisupervisedTranslationTask requires a FairseqMultiModel architecture" - ) - - # create SequenceGenerator for each model that has backtranslation dependency on it - self.sequence_generators = {} - if ( - self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None - ) and self.training: - for lang_pair in self.lang_pairs: - src, tgt = lang_pair.split("-") - key = "{}-{}".format(tgt, src) - self.sequence_generators[key] = SequenceGenerator( - [model.models[key]], - tgt_dict=self.dicts[src], - beam_size=args.bt_beam_size, - max_len_a=args.bt_max_len_a, - max_len_b=args.bt_max_len_b, - ) - decoder_lang_tok_idx = self.get_decoder_langtok(src) - - def backtranslate_fn( - sample, - model=model.models[key], - bos_token=decoder_lang_tok_idx, - sequence_generator=self.sequence_generators[key], - ): - return sequence_generator.generate( - [model], - sample, - bos_token=bos_token, - ) - - self.backtranslators[lang_pair] = backtranslate_fn - - return model - - def train_step( - self, sample, model, criterion, optimizer, update_num, ignore_grad=False - ): - model.train() - - if update_num > 0: - self.update_step(update_num) - - agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {} - - def forward_backward(model, samples, logging_output_key, weight): - nonlocal agg_loss, agg_sample_size, agg_logging_output - if samples is None or len(samples) == 0: - return - loss, sample_size, logging_output = criterion(model, samples) - if ignore_grad: - loss *= 0 - else: - loss *= weight - optimizer.backward(loss) - agg_loss += loss.detach().item() - # TODO make summing of the sample sizes configurable - agg_sample_size += sample_size - for k in logging_output: - agg_logging_output[k] += logging_output[k] - agg_logging_output[logging_output_key] += logging_output[k] - - if self.lambda_parallel > 0.0: - for lang_pair in self.lang_pairs: - forward_backward( - model.models[lang_pair], - sample[lang_pair], - lang_pair, - self.lambda_parallel, - ) - - if self.lambda_otf_bt > 0.0: - for lang_pair in self.lang_pairs: - sample_key = _get_bt_dataset_key(lang_pair) - forward_backward( - model.models[lang_pair], - sample[sample_key], - sample_key, - self.lambda_otf_bt, - ) - - if self.lambda_denoising > 0.0: - for lang_pair in self.lang_pairs: - _, tgt = lang_pair.split("-") - sample_key = _get_denoising_dataset_key(lang_pair) - forward_backward( - model.models["{0}-{0}".format(tgt)], - sample[sample_key], - sample_key, - self.lambda_denoising, - ) - - return agg_loss, agg_sample_size, agg_logging_output - - def update_step(self, num_updates): - def lambda_step_func(config, n_iter): - """ - Update a lambda value according to its schedule configuration. 
- """ - ranges = [ - i - for i in range(len(config) - 1) - if config[i][0] <= n_iter < config[i + 1][0] - ] - if len(ranges) == 0: - assert n_iter >= config[-1][0] - return config[-1][1] - assert len(ranges) == 1 - i = ranges[0] - x_a, y_a = config[i] - x_b, y_b = config[i + 1] - return y_a + (n_iter - x_a) * float(y_b - y_a) / float(x_b - x_a) - - if self.lambda_parallel_steps is not None: - self.lambda_parallel = lambda_step_func( - self.lambda_parallel_steps, num_updates - ) - if self.lambda_denoising_steps is not None: - self.lambda_denoising = lambda_step_func( - self.lambda_denoising_steps, num_updates - ) - if self.lambda_otf_bt_steps is not None: - self.lambda_otf_bt = lambda_step_func(self.lambda_otf_bt_steps, num_updates) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/sentence_prediction.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/sentence_prediction.py deleted file mode 100644 index 69dc996e6afd81eba5c2fbad1084db99bb832c19..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/sentence_prediction.py +++ /dev/null @@ -1,280 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import os - -import numpy as np -from fairseq import utils -from fairseq.data import ( - ConcatSentencesDataset, - Dictionary, - IdDataset, - NestedDictionaryDataset, - NumelDataset, - NumSamplesDataset, - OffsetTokensDataset, - PrependTokenDataset, - RawLabelDataset, - RightPadDataset, - RollDataset, - SortDataset, - StripTokenDataset, - data_utils, -) -from fairseq.data.shorten_dataset import maybe_shorten_dataset -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("sentence_prediction") -class SentencePredictionTask(LegacyFairseqTask): - """ - Sentence (or sentence pair) prediction (classification or regression) task. 
- - Args: - dictionary (Dictionary): the dictionary for the input of the task - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("data", metavar="FILE", help="file prefix for data") - parser.add_argument( - "--num-classes", - type=int, - default=-1, - help="number of classes or regression targets", - ) - parser.add_argument( - "--init-token", - type=int, - default=None, - help="add token at the beginning of each batch item", - ) - parser.add_argument( - "--separator-token", - type=int, - default=None, - help="add separator token between inputs", - ) - parser.add_argument("--regression-target", action="store_true", default=False) - parser.add_argument("--no-shuffle", action="store_true", default=False) - parser.add_argument( - "--shorten-method", - default="none", - choices=["none", "truncate", "random_crop"], - help="if not none, shorten sequences that exceed --tokens-per-sample", - ) - parser.add_argument( - "--shorten-data-split-list", - default="", - help="comma-separated list of dataset splits to apply shortening to, " - 'e.g., "train,valid" (default: all dataset splits)', - ) - parser.add_argument( - "--add-prev-output-tokens", - action="store_true", - default=False, - help="add prev_output_tokens to sample, used for encoder-decoder arch", - ) - - def __init__(self, args, data_dictionary, label_dictionary): - super().__init__(args) - self.dictionary = data_dictionary - self._label_dictionary = label_dictionary - if not hasattr(args, "max_positions"): - self._max_positions = ( - args.max_source_positions, - args.max_target_positions, - ) - else: - self._max_positions = args.max_positions - args.tokens_per_sample = self._max_positions - - @classmethod - def load_dictionary(cls, args, filename, source=True): - """Load the dictionary from the filename - - Args: - filename (str): the filename - """ - dictionary = Dictionary.load(filename) - dictionary.add_symbol("") - return dictionary - - @classmethod - def setup_task(cls, args, **kwargs): - assert args.num_classes > 0, "Must set --num-classes" - - # load data dictionary - data_dict = cls.load_dictionary( - args, - os.path.join(args.data, "input0", "dict.txt"), - source=True, - ) - logger.info("[input] dictionary: {} types".format(len(data_dict))) - - label_dict = None - if not args.regression_target: - # load label dictionary - label_dict = cls.load_dictionary( - args, - os.path.join(args.data, "label", "dict.txt"), - source=False, - ) - logger.info("[label] dictionary: {} types".format(len(label_dict))) - else: - label_dict = data_dict - return cls(args, data_dict, label_dict) - - def load_dataset(self, split, combine=False, **kwargs): - """Load a given dataset split (e.g., train, valid, test).""" - - def get_path(type, split): - return os.path.join(self.args.data, type, split) - - def make_dataset(type, dictionary): - split_path = get_path(type, split) - - dataset = data_utils.load_indexed_dataset( - split_path, - dictionary, - self.args.dataset_impl, - combine=combine, - ) - return dataset - - input0 = make_dataset("input0", self.source_dictionary) - assert input0 is not None, "could not find dataset: {}".format( - get_path(type, split) - ) - input1 = make_dataset("input1", self.source_dictionary) - - if self.args.init_token is not None: - input0 = PrependTokenDataset(input0, self.args.init_token) - - if input1 is None: - src_tokens = input0 - else: - if self.args.separator_token is not None: - input1 = PrependTokenDataset(input1, self.args.separator_token) - - 
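# For sentence-pair inputs, the second segment (optionally prefixed with the
# separator token above) is concatenated onto input0, giving the model a single
# token stream per example.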
src_tokens = ConcatSentencesDataset(input0, input1) - - with data_utils.numpy_seed(self.args.seed): - shuffle = np.random.permutation(len(src_tokens)) - - src_tokens = maybe_shorten_dataset( - src_tokens, - split, - self.args.shorten_data_split_list, - self.args.shorten_method, - self.args.max_positions, - self.args.seed, - ) - - dataset = { - "id": IdDataset(), - "net_input": { - "src_tokens": RightPadDataset( - src_tokens, - pad_idx=self.source_dictionary.pad(), - ), - "src_lengths": NumelDataset(src_tokens, reduce=False), - }, - "nsentences": NumSamplesDataset(), - "ntokens": NumelDataset(src_tokens, reduce=True), - } - - if self.args.add_prev_output_tokens: - prev_tokens_dataset = RightPadDataset( - RollDataset(src_tokens, 1), - pad_idx=self.dictionary.pad(), - ) - dataset["net_input"].update( - prev_output_tokens=prev_tokens_dataset, - ) - - if not self.args.regression_target: - label_dataset = make_dataset("label", self.label_dictionary) - if label_dataset is not None: - dataset.update( - target=OffsetTokensDataset( - StripTokenDataset( - label_dataset, - id_to_strip=self.label_dictionary.eos(), - ), - offset=-self.label_dictionary.nspecial, - ) - ) - else: - label_path = "{0}.label".format(get_path("label", split)) - if os.path.exists(label_path): - - def parse_regression_target(i, line): - values = line.split() - assert ( - len(values) == self.args.num_classes - ), f'expected num_classes={self.args.num_classes} regression target values on line {i}, found: "{line}"' - return [float(x) for x in values] - - with open(label_path) as h: - dataset.update( - target=RawLabelDataset( - [ - parse_regression_target(i, line.strip()) - for i, line in enumerate(h.readlines()) - ] - ) - ) - - nested_dataset = NestedDictionaryDataset( - dataset, - sizes=[src_tokens.sizes], - ) - - if self.args.no_shuffle: - dataset = nested_dataset - else: - dataset = SortDataset( - nested_dataset, - # shuffle - sort_order=[shuffle], - ) - - logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset))) - - self.datasets[split] = dataset - return self.datasets[split] - - def build_model(self, args): - from fairseq import models - - model = models.build_model(args, self) - - model.register_classification_head( - getattr(args, "classification_head_name", "sentence_classification_head"), - num_classes=self.args.num_classes, - ) - - return model - - def max_positions(self): - return self._max_positions - - @property - def source_dictionary(self): - return self.dictionary - - @property - def target_dictionary(self): - return self.dictionary - - @property - def label_dictionary(self): - return self._label_dictionary diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/sentence_ranking.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/sentence_ranking.py deleted file mode 100644 index bed44f34e5f8e506b6ae7ba30ddaa661bf4a7522..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/sentence_ranking.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
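Before the next file, a quick standalone illustration of what the SentencePredictionTask batch layout above amounts to: src_tokens are right-padded to the batch maximum (RightPadDataset), src_lengths come from NumelDataset(reduce=False), and id/ntokens/nsentences are bookkeeping. This is only a sketch with invented toy tensors and an assumed pad index of 1, not fairseq code:

import torch

def right_pad_collate(samples, pad_idx=1):
    # Right-pad variable-length 1-D token tensors, as RightPadDataset does.
    max_len = max(s.size(0) for s in samples)
    src_tokens = torch.full((len(samples), max_len), pad_idx, dtype=torch.long)
    for i, s in enumerate(samples):
        src_tokens[i, : s.size(0)] = s
    src_lengths = torch.tensor([s.size(0) for s in samples])  # per-example length
    return {
        "id": torch.arange(len(samples)),
        "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths},
        "ntokens": int(src_lengths.sum()),
        "nsentences": len(samples),
    }

batch = right_pad_collate([torch.tensor([4, 5, 6]), torch.tensor([4, 7])])
# batch["net_input"]["src_tokens"] -> tensor([[4, 5, 6], [4, 7, 1]])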
- -import logging -import os - -import numpy as np -from fairseq import utils -from fairseq.data import ( - ConcatSentencesDataset, - Dictionary, - IdDataset, - NestedDictionaryDataset, - NumelDataset, - NumSamplesDataset, - PrependTokenDataset, - RawLabelDataset, - RightPadDataset, - SortDataset, - TruncateDataset, - data_utils, -) -from fairseq.data.shorten_dataset import maybe_shorten_dataset -from fairseq.tasks import LegacyFairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("sentence_ranking") -class SentenceRankingTask(LegacyFairseqTask): - """ - Ranking task on multiple sentences. - - Args: - dictionary (Dictionary): the dictionary for the input of the task - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - parser.add_argument("data", metavar="FILE", help="file prefix for data") - parser.add_argument( - "--num-classes", type=int, help="number of sentences to be ranked" - ) - parser.add_argument( - "--init-token", - type=int, - help="add token at the beginning of each batch item", - ) - parser.add_argument( - "--separator-token", type=int, help="add separator token between inputs" - ) - parser.add_argument("--no-shuffle", action="store_true") - parser.add_argument( - "--shorten-method", - default="none", - choices=["none", "truncate", "random_crop"], - help="if not none, shorten sequences that exceed --tokens-per-sample", - ) - parser.add_argument( - "--shorten-data-split-list", - default="", - help="comma-separated list of dataset splits to apply shortening to, " - 'e.g., "train,valid" (default: all dataset splits)', - ) - parser.add_argument( - "--max-option-length", type=int, help="max length for each option" - ) - - def __init__(self, args, dictionary): - super().__init__(args) - self.dictionary = dictionary - - @classmethod - def load_dictionary(cls, args, filename, source=True): - """Load the dictionary from the filename - - Args: - filename (str): the filename - """ - dictionary = Dictionary.load(filename) - dictionary.add_symbol("") - return dictionary - - @classmethod - def setup_task(cls, args, **kwargs): - assert ( - args.criterion == "sentence_ranking" - ), "Must set --criterion=sentence_ranking" - - # load data dictionary - data_dict = cls.load_dictionary( - args, - os.path.join(args.data, "input0", "dict.txt"), - source=True, - ) - logger.info("[input] dictionary: {} types".format(len(data_dict))) - return SentenceRankingTask(args, data_dict) - - def load_dataset(self, split, combine=False, **kwargs): - """Load a given dataset split (e.g., train, valid, test).""" - - def get_path(type, split): - return os.path.join(self.args.data, type, split) - - def make_dataset(type, dictionary): - split_path = get_path(type, split) - - dataset = data_utils.load_indexed_dataset( - split_path, - self.source_dictionary, - self.args.dataset_impl, - combine=combine, - ) - return dataset - - input0 = make_dataset("input0", self.source_dictionary) - input_options = [ - make_dataset("input{idx}".format(idx=idx + 1), self.source_dictionary) - for idx in range(self.args.num_classes) - ] - - if self.args.separator_token is not None: - input0 = PrependTokenDataset(input0, self.args.separator_token) - - src_tokens = [] - for input_option in input_options: - if self.args.init_token is not None: - input_option = PrependTokenDataset(input_option, self.args.init_token) - if self.args.max_option_length is not None: - input_option = TruncateDataset( - input_option, self.args.max_option_length - ) - src_token = 
ConcatSentencesDataset(input_option, input0) - src_token = maybe_shorten_dataset( - src_token, - split, - self.args.shorten_data_split_list, - self.args.shorten_method, - self.args.max_positions, - self.args.seed, - ) - src_tokens.append(src_token) - - with data_utils.numpy_seed(self.args.seed): - shuffle = np.random.permutation(len(src_tokens[0])) - - dataset = { - "id": IdDataset(), - "nsentences": NumSamplesDataset(), - "ntokens": NumelDataset(src_tokens[0], reduce=True), - } - - for src_token_idx in range(len(src_tokens)): - dataset.update( - { - "net_input{idx}".format(idx=src_token_idx + 1): { - "src_tokens": RightPadDataset( - src_tokens[src_token_idx], - pad_idx=self.source_dictionary.pad(), - ), - "src_lengths": NumelDataset( - src_tokens[src_token_idx], reduce=False - ), - } - } - ) - - label_path = "{}.label".format(get_path("label", split)) - if os.path.exists(label_path): - with open(label_path) as h: - dataset.update( - target=RawLabelDataset([int(x.strip()) for x in h.readlines()]) - ) - - nested_dataset = NestedDictionaryDataset( - dataset, - sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])], - ) - - if self.args.no_shuffle: - dataset = nested_dataset - else: - dataset = SortDataset( - nested_dataset, - # shuffle - sort_order=[shuffle], - ) - - logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset))) - - self.datasets[split] = dataset - return self.datasets[split] - - def build_model(self, args): - from fairseq import models - - model = models.build_model(args, self) - - model.register_classification_head( - getattr(args, "ranking_head_name", "sentence_classification_head"), - num_classes=1, - ) - - return model - - def max_positions(self): - return self.args.max_positions - - @property - def source_dictionary(self): - return self.dictionary - - @property - def target_dictionary(self): - return self.dictionary diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/speech_to_text.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/speech_to_text.py deleted file mode 100644 index c628030883a20bca1d589b033a4c25d4e9022959..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/speech_to_text.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
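SentenceRankingTask above packs one net_input{idx} per answer candidate and registers a classification head with num_classes=1, so every candidate receives a single score and the highest-scoring option is the prediction. Below is a rough standalone sketch of that scoring pattern; the toy encoder and mean pooling are stand-ins invented for illustration, not the fairseq model API:

import torch
import torch.nn as nn

class ToyRanker(nn.Module):
    """Stand-in for an encoder plus a one-output ranking head."""

    def __init__(self, vocab_size=100, dim=16):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
        self.head = nn.Linear(dim, 1)  # num_classes=1, as in the deleted task

    def forward(self, src_tokens):
        pooled = self.embed(src_tokens).mean(dim=1)  # crude mean pooling
        return self.head(pooled)  # one scalar score per example

model = ToyRanker()
# Each candidate option has already been concatenated with the shared context.
candidates = [torch.randint(4, 100, (2, 7)) for _ in range(3)]  # 3 options, batch of 2
scores = torch.cat([model(c) for c in candidates], dim=1)       # shape (2, 3)
prediction = scores.argmax(dim=1)                               # chosen option per example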
- -import logging -import os.path as op -from argparse import Namespace - -from fairseq.data import Dictionary, encoders -from fairseq.data.audio.speech_to_text_dataset import ( - S2TDataConfig, - SpeechToTextDataset, - SpeechToTextDatasetCreator, -) -from fairseq.tasks import FairseqTask, register_task - - -logger = logging.getLogger(__name__) - - -@register_task("speech_to_text") -class SpeechToTextTask(FairseqTask): - @staticmethod - def add_args(parser): - parser.add_argument("data", help="manifest root path") - parser.add_argument( - "--config-yaml", - type=str, - default="config.yaml", - help="Configuration YAML filename (under manifest root)", - ) - parser.add_argument( - "--max-source-positions", - default=6000, - type=int, - metavar="N", - help="max number of tokens in the source sequence", - ) - parser.add_argument( - "--max-target-positions", - default=1024, - type=int, - metavar="N", - help="max number of tokens in the target sequence", - ) - - def __init__(self, args, tgt_dict): - super().__init__(args) - self.tgt_dict = tgt_dict - self.data_cfg = S2TDataConfig(op.join(args.data, args.config_yaml)) - - @classmethod - def setup_task(cls, args, **kwargs): - data_cfg = S2TDataConfig(op.join(args.data, args.config_yaml)) - dict_path = op.join(args.data, data_cfg.vocab_filename) - if not op.isfile(dict_path): - raise FileNotFoundError(f"Dict not found: {dict_path}") - tgt_dict = Dictionary.load(dict_path) - logger.info( - f"dictionary size ({data_cfg.vocab_filename}): " f"{len(tgt_dict):,}" - ) - - if getattr(args, "train_subset", None) is not None: - if not all(s.startswith("train") for s in args.train_subset.split(",")): - raise ValueError('Train splits should be named like "train*".') - return cls(args, tgt_dict) - - def build_criterion(self, args): - from fairseq import criterions - - if self.data_cfg.prepend_tgt_lang_tag and args.ignore_prefix_size != 1: - raise ValueError( - 'Please set "--ignore-prefix-size 1" since ' - "target language ID token is prepended as BOS." - ) - return criterions.build_criterion(args, self) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - is_train_split = split.startswith("train") - pre_tokenizer = self.build_tokenizer(self.args) - bpe_tokenizer = self.build_bpe(self.args) - self.datasets[split] = SpeechToTextDatasetCreator.from_tsv( - self.args.data, - self.data_cfg, - split, - self.tgt_dict, - pre_tokenizer, - bpe_tokenizer, - is_train_split=is_train_split, - epoch=epoch, - seed=self.args.seed, - ) - - @property - def target_dictionary(self): - return self.tgt_dict - - @property - def source_dictionary(self): - return None - - def max_positions(self): - return self.args.max_source_positions, self.args.max_target_positions - - def build_model(self, args): - args.input_feat_per_channel = self.data_cfg.input_feat_per_channel - args.input_channels = self.data_cfg.input_channels - return super(SpeechToTextTask, self).build_model(args) - - def build_generator( - self, - models, - args, - seq_gen_cls=None, - extra_gen_cls_kwargs=None, - ): - if self.data_cfg.prepend_tgt_lang_tag and args.prefix_size != 1: - raise ValueError( - 'Please set "--prefix-size 1" since ' - "target language ID token is prepended as BOS." 
- ) - lang_token_ids = { - i - for s, i in self.tgt_dict.indices.items() - if SpeechToTextDataset.is_lang_tag(s) - } - extra_gen_cls_kwargs = {"symbols_to_strip_from_output": lang_token_ids} - return super().build_generator( - models, args, seq_gen_cls=None, extra_gen_cls_kwargs=extra_gen_cls_kwargs - ) - - def build_tokenizer(self, args): - logger.info(f"pre-tokenizer: {self.data_cfg.pre_tokenizer}") - return encoders.build_tokenizer(Namespace(**self.data_cfg.pre_tokenizer)) - - def build_bpe(self, args): - logger.info(f"tokenizer: {self.data_cfg.bpe_tokenizer}") - return encoders.build_bpe(Namespace(**self.data_cfg.bpe_tokenizer)) - - @classmethod - def build_dataset_for_inference(cls, audio_paths, n_frames): - return SpeechToTextDataset("interactive", False, {}, audio_paths, n_frames) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation.py deleted file mode 100644 index 79007a6d9fdbe416ec0c8aa48e4291efd2ef1356..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation.py +++ /dev/null @@ -1,448 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import itertools -import json -import logging -import os -from argparse import Namespace - -import numpy as np -from fairseq import metrics, options, utils -from fairseq.data import ( - AppendTokenDataset, - ConcatDataset, - LanguagePairDataset, - PrependTokenDataset, - StripTokenDataset, - TruncateDataset, - data_utils, - encoders, - indexed_dataset, -) -from fairseq.tasks import LegacyFairseqTask, register_task - - -EVAL_BLEU_ORDER = 4 - - -logger = logging.getLogger(__name__) - - -def load_langpair_dataset( - data_path, - split, - src, - src_dict, - tgt, - tgt_dict, - combine, - dataset_impl, - upsample_primary, - left_pad_source, - left_pad_target, - max_source_positions, - max_target_positions, - prepend_bos=False, - load_alignments=False, - truncate_source=False, - append_source_id=False, - num_buckets=0, - shuffle=True, - pad_to_multiple=1, -): - def split_exists(split, src, tgt, lang, data_path): - filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) - return indexed_dataset.dataset_exists(filename, impl=dataset_impl) - - src_datasets = [] - tgt_datasets = [] - - for k in itertools.count(): - split_k = split + (str(k) if k > 0 else "") - - # infer langcode - if split_exists(split_k, src, tgt, src, data_path): - prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) - elif split_exists(split_k, tgt, src, src, data_path): - prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) - else: - if k > 0: - break - else: - raise FileNotFoundError( - "Dataset not found: {} ({})".format(split, data_path) - ) - - src_dataset = data_utils.load_indexed_dataset( - prefix + src, src_dict, dataset_impl - ) - if truncate_source: - src_dataset = AppendTokenDataset( - TruncateDataset( - StripTokenDataset(src_dataset, src_dict.eos()), - max_source_positions - 1, - ), - src_dict.eos(), - ) - src_datasets.append(src_dataset) - - tgt_dataset = data_utils.load_indexed_dataset( - prefix + tgt, tgt_dict, dataset_impl - ) - if tgt_dataset is not None: - tgt_datasets.append(tgt_dataset) - - logger.info( - "{} {} {}-{} {} 
examples".format( - data_path, split_k, src, tgt, len(src_datasets[-1]) - ) - ) - - if not combine: - break - - assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 - - if len(src_datasets) == 1: - src_dataset = src_datasets[0] - tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None - else: - sample_ratios = [1] * len(src_datasets) - sample_ratios[0] = upsample_primary - src_dataset = ConcatDataset(src_datasets, sample_ratios) - if len(tgt_datasets) > 0: - tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) - else: - tgt_dataset = None - - if prepend_bos: - assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") - src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) - if tgt_dataset is not None: - tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) - - eos = None - if append_source_id: - src_dataset = AppendTokenDataset( - src_dataset, src_dict.index("[{}]".format(src)) - ) - if tgt_dataset is not None: - tgt_dataset = AppendTokenDataset( - tgt_dataset, tgt_dict.index("[{}]".format(tgt)) - ) - eos = tgt_dict.index("[{}]".format(tgt)) - - align_dataset = None - if load_alignments: - align_path = os.path.join(data_path, "{}.align.{}-{}".format(split, src, tgt)) - if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): - align_dataset = data_utils.load_indexed_dataset( - align_path, None, dataset_impl - ) - - tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None - return LanguagePairDataset( - src_dataset, - src_dataset.sizes, - src_dict, - tgt_dataset, - tgt_dataset_sizes, - tgt_dict, - left_pad_source=left_pad_source, - left_pad_target=left_pad_target, - align_dataset=align_dataset, - eos=eos, - num_buckets=num_buckets, - shuffle=shuffle, - pad_to_multiple=pad_to_multiple, - ) - - -@register_task("translation") -class TranslationTask(LegacyFairseqTask): - """ - Translate from one (source) language to another (target) language. - - Args: - src_dict (~fairseq.data.Dictionary): dictionary for the source language - tgt_dict (~fairseq.data.Dictionary): dictionary for the target language - - .. note:: - - The translation task is compatible with :mod:`fairseq-train`, - :mod:`fairseq-generate` and :mod:`fairseq-interactive`. - - The translation task provides the following additional command-line - arguments: - - .. 
argparse:: - :ref: fairseq.tasks.translation_parser - :prog: - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - parser.add_argument('data', help='colon separated path to data directories list, \ - will be iterated upon during epochs in round-robin manner; \ - however, valid and test data are always in the first directory to \ - avoid the need for repeating them in all directories') - parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', - help='source language') - parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', - help='target language') - parser.add_argument('--load-alignments', action='store_true', - help='load the binarized alignments') - parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL', - help='pad the source on the left') - parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL', - help='pad the target on the left') - parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N', - help='max number of tokens in the source sequence') - parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N', - help='max number of tokens in the target sequence') - parser.add_argument('--upsample-primary', default=1, type=int, - help='amount to upsample primary dataset') - parser.add_argument('--truncate-source', action='store_true', default=False, - help='truncate source to max-source-positions') - parser.add_argument('--num-batch-buckets', default=0, type=int, metavar='N', - help='if >0, then bucket source and target lengths into N ' - 'buckets and pad accordingly; this is useful on TPUs ' - 'to minimize the number of compilations') - - # options for reporting BLEU during validation - parser.add_argument('--eval-bleu', action='store_true', - help='evaluation with BLEU scores') - parser.add_argument('--eval-bleu-detok', type=str, default="space", - help='detokenize before computing BLEU (e.g., "moses"); ' - 'required if using --eval-bleu; use "space" to ' - 'disable detokenization; see fairseq.data.encoders ' - 'for other options') - parser.add_argument('--eval-bleu-detok-args', type=str, metavar='JSON', - help='args for building the tokenizer, if needed') - parser.add_argument('--eval-tokenized-bleu', action='store_true', default=False, - help='compute tokenized BLEU instead of sacrebleu') - parser.add_argument('--eval-bleu-remove-bpe', nargs='?', const='@@ ', default=None, - help='remove BPE before computing BLEU') - parser.add_argument('--eval-bleu-args', type=str, metavar='JSON', - help='generation args for BLUE scoring, ' - 'e.g., \'{"beam": 4, "lenpen": 0.6}\'') - parser.add_argument('--eval-bleu-print-samples', action='store_true', - help='print sample generations during validation') - # fmt: on - - def __init__(self, args, src_dict, tgt_dict): - super().__init__(args) - self.src_dict = src_dict - self.tgt_dict = tgt_dict - - @classmethod - def setup_task(cls, args, **kwargs): - """Setup the task (e.g., load dictionaries). 
- - Args: - args (argparse.Namespace): parsed command-line arguments - """ - args.left_pad_source = utils.eval_bool(args.left_pad_source) - args.left_pad_target = utils.eval_bool(args.left_pad_target) - - paths = utils.split_paths(args.data) - assert len(paths) > 0 - # find language pair automatically - if args.source_lang is None or args.target_lang is None: - args.source_lang, args.target_lang = data_utils.infer_language_pair( - paths[0] - ) - if args.source_lang is None or args.target_lang is None: - raise Exception( - "Could not infer language pair, please provide it explicitly" - ) - - # load dictionaries - src_dict = cls.load_dictionary( - os.path.join(paths[0], "dict.{}.txt".format(args.source_lang)) - ) - tgt_dict = cls.load_dictionary( - os.path.join(paths[0], "dict.{}.txt".format(args.target_lang)) - ) - assert src_dict.pad() == tgt_dict.pad() - assert src_dict.eos() == tgt_dict.eos() - assert src_dict.unk() == tgt_dict.unk() - logger.info("[{}] dictionary: {} types".format(args.source_lang, len(src_dict))) - logger.info("[{}] dictionary: {} types".format(args.target_lang, len(tgt_dict))) - - return cls(args, src_dict, tgt_dict) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. - - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - if split != getattr(self.args, "train_subset", None): - # if not training data set, use the first shard for valid and test - paths = paths[:1] - data_path = paths[(epoch - 1) % len(paths)] - - # infer langcode - src, tgt = self.args.source_lang, self.args.target_lang - - self.datasets[split] = load_langpair_dataset( - data_path, - split, - src, - self.src_dict, - tgt, - self.tgt_dict, - combine=combine, - dataset_impl=self.args.dataset_impl, - upsample_primary=self.args.upsample_primary, - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - max_source_positions=self.args.max_source_positions, - max_target_positions=self.args.max_target_positions, - load_alignments=self.args.load_alignments, - truncate_source=self.args.truncate_source, - num_buckets=self.args.num_batch_buckets, - shuffle=(split != "test"), - pad_to_multiple=self.args.required_seq_len_multiple, - ) - - def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None): - return LanguagePairDataset( - src_tokens, - src_lengths, - self.source_dictionary, - tgt_dict=self.target_dictionary, - constraints=constraints, - ) - - def build_model(self, args): - model = super().build_model(args) - if getattr(args, "eval_bleu", False): - assert getattr(args, "eval_bleu_detok", None) is not None, ( - "--eval-bleu-detok is required if using --eval-bleu; " - "try --eval-bleu-detok=moses (or --eval-bleu-detok=space " - "to disable detokenization, e.g., when using sentencepiece)" - ) - detok_args = json.loads(getattr(args, "eval_bleu_detok_args", "{}") or "{}") - self.tokenizer = encoders.build_tokenizer( - Namespace( - tokenizer=getattr(args, "eval_bleu_detok", None), **detok_args - ) - ) - - gen_args = json.loads(getattr(args, "eval_bleu_args", "{}") or "{}") - self.sequence_generator = self.build_generator( - [model], Namespace(**gen_args) - ) - return model - - def valid_step(self, sample, model, criterion): - loss, sample_size, logging_output = super().valid_step(sample, model, criterion) - if self.args.eval_bleu: - bleu = self._inference_with_bleu(self.sequence_generator, sample, model) - 
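# The BLEU statistics are logged as plain scalars below; reduce_metrics later sums
# them across workers and recomputes corpus-level BLEU from the aggregated n-gram
# counts, so the reported validation BLEU is a true corpus score rather than an
# average of per-batch scores.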
logging_output["_bleu_sys_len"] = bleu.sys_len - logging_output["_bleu_ref_len"] = bleu.ref_len - # we split counts into separate entries so that they can be - # summed efficiently across workers using fast-stat-sync - assert len(bleu.counts) == EVAL_BLEU_ORDER - for i in range(EVAL_BLEU_ORDER): - logging_output["_bleu_counts_" + str(i)] = bleu.counts[i] - logging_output["_bleu_totals_" + str(i)] = bleu.totals[i] - return loss, sample_size, logging_output - - def reduce_metrics(self, logging_outputs, criterion): - super().reduce_metrics(logging_outputs, criterion) - if self.args.eval_bleu: - - def sum_logs(key): - return sum(log.get(key, 0) for log in logging_outputs) - - counts, totals = [], [] - for i in range(EVAL_BLEU_ORDER): - counts.append(sum_logs("_bleu_counts_" + str(i))) - totals.append(sum_logs("_bleu_totals_" + str(i))) - - if max(totals) > 0: - # log counts as numpy arrays -- log_scalar will sum them correctly - metrics.log_scalar("_bleu_counts", np.array(counts)) - metrics.log_scalar("_bleu_totals", np.array(totals)) - metrics.log_scalar("_bleu_sys_len", sum_logs("_bleu_sys_len")) - metrics.log_scalar("_bleu_ref_len", sum_logs("_bleu_ref_len")) - - def compute_bleu(meters): - import inspect - import sacrebleu - - fn_sig = inspect.getfullargspec(sacrebleu.compute_bleu)[0] - if "smooth_method" in fn_sig: - smooth = {"smooth_method": "exp"} - else: - smooth = {"smooth": "exp"} - bleu = sacrebleu.compute_bleu( - correct=meters["_bleu_counts"].sum, - total=meters["_bleu_totals"].sum, - sys_len=meters["_bleu_sys_len"].sum, - ref_len=meters["_bleu_ref_len"].sum, - **smooth - ) - return round(bleu.score, 2) - - metrics.log_derived("bleu", compute_bleu) - - def max_positions(self): - """Return the max sentence length allowed by the task.""" - return (self.args.max_source_positions, self.args.max_target_positions) - - @property - def source_dictionary(self): - """Return the source :class:`~fairseq.data.Dictionary`.""" - return self.src_dict - - @property - def target_dictionary(self): - """Return the target :class:`~fairseq.data.Dictionary`.""" - return self.tgt_dict - - def _inference_with_bleu(self, generator, sample, model): - import sacrebleu - - def decode(toks, escape_unk=False): - s = self.tgt_dict.string( - toks.int().cpu(), - self.args.eval_bleu_remove_bpe, - # The default unknown string in fairseq is ``, but - # this is tokenized by sacrebleu as `< unk >`, inflating - # BLEU scores. Instead, we use a somewhat more verbose - # alternative that is unlikely to appear in the real - # reference, but doesn't get split into multiple tokens. 
- unk_string=("UNKNOWNTOKENINREF" if escape_unk else "UNKNOWNTOKENINHYP"), - ) - if self.tokenizer: - s = self.tokenizer.decode(s) - return s - - gen_out = self.inference_step(generator, [model], sample, prefix_tokens=None) - hyps, refs = [], [] - for i in range(len(gen_out)): - hyps.append(decode(gen_out[i][0]["tokens"])) - refs.append( - decode( - utils.strip_pad(sample["target"][i], self.tgt_dict.pad()), - escape_unk=True, # don't count as matches to the hypo - ) - ) - if self.args.eval_bleu_print_samples: - logger.info("example hypothesis: " + hyps[0]) - logger.info("example reference: " + refs[0]) - if self.args.eval_tokenized_bleu: - return sacrebleu.corpus_bleu(hyps, [refs], tokenize="none") - else: - return sacrebleu.corpus_bleu(hyps, [refs]) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_from_pretrained_bart.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_from_pretrained_bart.py deleted file mode 100644 index 8710b7fe7d3509c56769183270f205bb93dae873..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_from_pretrained_bart.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import torch -from fairseq import utils -from fairseq.data import LanguagePairDataset - -from . import register_task -from .translation import TranslationTask, load_langpair_dataset - - -@register_task("translation_from_pretrained_bart") -class TranslationFromPretrainedBARTTask(TranslationTask): - """ - Translate from source language to target language with a model initialized with a multilingual pretrain. - - Args: - src_dict (~fairseq.data.Dictionary): dictionary for the source language - tgt_dict (~fairseq.data.Dictionary): dictionary for the target language - - .. note:: - - The translation task is compatible with :mod:`fairseq-train`, - :mod:`fairseq-generate` and :mod:`fairseq-interactive`. - - The translation task provides the following additional command-line - arguments: - - .. argparse:: - :ref: fairseq.tasks.translation_parser - :prog: - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - TranslationTask.add_args(parser) - parser.add_argument('--langs', required=True, metavar='LANG', - help='comma-separated list of monolingual language, ' - 'for example, "en,de,fr". These should match the ' - 'langs from pretraining (and be in the same order). ' - 'You should always add all pretraining language idx ' - 'during finetuning.') - parser.add_argument('--prepend-bos', action='store_true', - help='prepend bos token to each sentence, which matches ' - 'mBART pretraining') - # fmt: on - - def __init__(self, args, src_dict, tgt_dict): - super().__init__(args, src_dict, tgt_dict) - self.langs = args.langs.split(",") - for d in [src_dict, tgt_dict]: - for l in self.langs: - d.add_symbol("[{}]".format(l)) - d.add_symbol("") - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - - # infer langcode - src, tgt = self.args.source_lang, self.args.target_lang - - self.datasets[split] = load_langpair_dataset( - data_path, - split, - src, - self.src_dict, - tgt, - self.tgt_dict, - combine=combine, - dataset_impl=self.args.dataset_impl, - upsample_primary=self.args.upsample_primary, - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - max_source_positions=getattr(self.args, "max_source_positions", 1024), - max_target_positions=getattr(self.args, "max_target_positions", 1024), - load_alignments=self.args.load_alignments, - prepend_bos=getattr(self.args, "prepend_bos", False), - append_source_id=True, - ) - - def build_generator(self, models, args, **unused): - if getattr(args, "score_reference", False): - from fairseq.sequence_scorer import SequenceScorer - - return SequenceScorer( - self.target_dictionary, - eos=self.tgt_dict.index("[{}]".format(self.args.target_lang)), - ) - else: - from fairseq.sequence_generator import SequenceGenerator - - return SequenceGenerator( - models, - self.target_dictionary, - beam_size=getattr(args, "beam", 5), - max_len_a=getattr(args, "max_len_a", 0), - max_len_b=getattr(args, "max_len_b", 200), - min_len=getattr(args, "min_len", 1), - normalize_scores=(not getattr(args, "unnormalized", False)), - len_penalty=getattr(args, "lenpen", 1), - unk_penalty=getattr(args, "unkpen", 0), - temperature=getattr(args, "temperature", 1.0), - match_source_len=getattr(args, "match_source_len", False), - no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0), - eos=self.tgt_dict.index("[{}]".format(self.args.target_lang)), - ) - - def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None): - src_lang_id = self.source_dictionary.index("[{}]".format(self.args.source_lang)) - source_tokens = [] - for s_t in src_tokens: - s_t = torch.cat([s_t, s_t.new(1).fill_(src_lang_id)]) - source_tokens.append(s_t) - dataset = LanguagePairDataset( - source_tokens, - src_lengths, - self.source_dictionary, - tgt_dict=self.target_dictionary, - constraints=constraints, - ) - return dataset diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_from_pretrained_xlm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_from_pretrained_xlm.py deleted file mode 100644 index 347a6eccb7657e6d20d1f1304b76fe31bc731393..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_from_pretrained_xlm.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -from fairseq.data.legacy.masked_lm_dictionary import MaskedLMDictionary -from fairseq.tasks.translation import TranslationTask - -from . import register_task - - -@register_task("translation_from_pretrained_xlm") -class TranslationFromPretrainedXLMTask(TranslationTask): - """ - Same as TranslationTask except use the MaskedLMDictionary class so that - we can load data that was binarized with the MaskedLMDictionary class. 
- - This task should be used for the entire training pipeline when we want to - train an NMT model from a pretrained XLM checkpoint: binarizing NMT data, - training NMT with the pretrained XLM checkpoint, and subsequent evaluation - of that trained model. - """ - - @classmethod - def load_dictionary(cls, filename): - """Load the masked LM dictionary from the filename - - Args: - filename (str): the filename - """ - return MaskedLMDictionary.load(filename) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_lev.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_lev.py deleted file mode 100644 index 4678774922ffc1de69180c972c62f40e1b25961d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_lev.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os - -import torch -from fairseq import utils -from fairseq.data import LanguagePairDataset -from fairseq.tasks import register_task -from fairseq.tasks.translation import TranslationTask, load_langpair_dataset -from fairseq.utils import new_arange - - -@register_task("translation_lev") -class TranslationLevenshteinTask(TranslationTask): - """ - Translation (Sequence Generation) task for Levenshtein Transformer - See `"Levenshtein Transformer" `_. - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - TranslationTask.add_args(parser) - parser.add_argument( - '--noise', - default='random_delete', - choices=['random_delete', 'random_mask', 'no_noise', 'full_mask']) - # fmt: on - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. 
- - Args: - split (str): name of the split (e.g., train, valid, test) - """ - paths = utils.split_paths(self.args.data) - assert len(paths) > 0 - data_path = paths[(epoch - 1) % len(paths)] - - # infer langcode - src, tgt = self.args.source_lang, self.args.target_lang - - self.datasets[split] = load_langpair_dataset( - data_path, - split, - src, - self.src_dict, - tgt, - self.tgt_dict, - combine=combine, - dataset_impl=self.args.dataset_impl, - upsample_primary=self.args.upsample_primary, - left_pad_source=self.args.left_pad_source, - left_pad_target=self.args.left_pad_target, - max_source_positions=self.args.max_source_positions, - max_target_positions=self.args.max_target_positions, - prepend_bos=True, - ) - - def inject_noise(self, target_tokens): - def _random_delete(target_tokens): - pad = self.tgt_dict.pad() - bos = self.tgt_dict.bos() - eos = self.tgt_dict.eos() - - max_len = target_tokens.size(1) - target_mask = target_tokens.eq(pad) - target_score = target_tokens.clone().float().uniform_() - target_score.masked_fill_( - target_tokens.eq(bos) | target_tokens.eq(eos), 0.0 - ) - target_score.masked_fill_(target_mask, 1) - target_score, target_rank = target_score.sort(1) - target_length = target_mask.size(1) - target_mask.float().sum( - 1, keepdim=True - ) - - # do not delete and (we assign 0 score for them) - target_cutoff = ( - 2 - + ( - (target_length - 2) - * target_score.new_zeros(target_score.size(0), 1).uniform_() - ).long() - ) - target_cutoff = target_score.sort(1)[1] >= target_cutoff - - prev_target_tokens = ( - target_tokens.gather(1, target_rank) - .masked_fill_(target_cutoff, pad) - .gather(1, target_rank.masked_fill_(target_cutoff, max_len).sort(1)[1]) - ) - prev_target_tokens = prev_target_tokens[ - :, : prev_target_tokens.ne(pad).sum(1).max() - ] - - return prev_target_tokens - - def _random_mask(target_tokens): - pad = self.tgt_dict.pad() - bos = self.tgt_dict.bos() - eos = self.tgt_dict.eos() - unk = self.tgt_dict.unk() - - target_masks = ( - target_tokens.ne(pad) & target_tokens.ne(bos) & target_tokens.ne(eos) - ) - target_score = target_tokens.clone().float().uniform_() - target_score.masked_fill_(~target_masks, 2.0) - target_length = target_masks.sum(1).float() - target_length = target_length * target_length.clone().uniform_() - target_length = target_length + 1 # make sure to mask at least one token. 
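# Sorting the uniform scores below gives a random ranking of the maskable positions;
# every position whose rank falls under the per-sentence target_length is replaced by
# the dictionary's unk index, so roughly a uniform-random fraction of each target
# sentence ends up masked.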
- - _, target_rank = target_score.sort(1) - target_cutoff = new_arange(target_rank) < target_length[:, None].long() - prev_target_tokens = target_tokens.masked_fill( - target_cutoff.scatter(1, target_rank, target_cutoff), unk - ) - return prev_target_tokens - - def _full_mask(target_tokens): - pad = self.tgt_dict.pad() - bos = self.tgt_dict.bos() - eos = self.tgt_dict.eos() - unk = self.tgt_dict.unk() - - target_mask = ( - target_tokens.eq(bos) | target_tokens.eq(eos) | target_tokens.eq(pad) - ) - return target_tokens.masked_fill(~target_mask, unk) - - if self.args.noise == "random_delete": - return _random_delete(target_tokens) - elif self.args.noise == "random_mask": - return _random_mask(target_tokens) - elif self.args.noise == "full_mask": - return _full_mask(target_tokens) - elif self.args.noise == "no_noise": - return target_tokens - else: - raise NotImplementedError - - def build_generator(self, models, args, **unused): - # add models input to match the API for SequenceGenerator - from fairseq.iterative_refinement_generator import IterativeRefinementGenerator - - return IterativeRefinementGenerator( - self.target_dictionary, - eos_penalty=getattr(args, "iter_decode_eos_penalty", 0.0), - max_iter=getattr(args, "iter_decode_max_iter", 10), - beam_size=getattr(args, "iter_decode_with_beam", 1), - reranking=getattr(args, "iter_decode_with_external_reranker", False), - decoding_format=getattr(args, "decoding_format", None), - adaptive=not getattr(args, "iter_decode_force_max_iter", False), - retain_history=getattr(args, "retain_iter_history", False), - ) - - def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None): - if constraints is not None: - # Though see Susanto et al. (ACL 2020): https://www.aclweb.org/anthology/2020.acl-main.325/ - raise NotImplementedError( - "Constrained decoding with the translation_lev task is not supported" - ) - - return LanguagePairDataset( - src_tokens, src_lengths, self.source_dictionary, append_bos=True - ) - - def train_step( - self, sample, model, criterion, optimizer, update_num, ignore_grad=False - ): - model.train() - sample["prev_target"] = self.inject_noise(sample["target"]) - loss, sample_size, logging_output = criterion(model, sample) - if ignore_grad: - loss *= 0 - optimizer.backward(loss) - return loss, sample_size, logging_output - - def valid_step(self, sample, model, criterion): - model.eval() - with torch.no_grad(): - sample["prev_target"] = self.inject_noise(sample["target"]) - loss, sample_size, logging_output = criterion(model, sample) - return loss, sample_size, logging_output diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_multi_simple_epoch.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_multi_simple_epoch.py deleted file mode 100644 index 95a2d162c04169a5be916fd3ad0d28a704a29431..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tasks/translation_multi_simple_epoch.py +++ /dev/null @@ -1,414 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
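All three noising variants in the TranslationLevenshteinTask deleted just above derive the model's prev_target from the reference target. As a quick standalone illustration of the simplest one, full_mask, here is a toy run; the special-token indices are invented for the example rather than taken from a real fairseq Dictionary:

import torch

PAD, BOS, EOS, UNK = 1, 0, 2, 3  # assumed toy indices

def full_mask(target_tokens):
    # Keep bos/eos/pad positions and replace every other token with unk,
    # mirroring _full_mask in the deleted task.
    keep = target_tokens.eq(BOS) | target_tokens.eq(EOS) | target_tokens.eq(PAD)
    return target_tokens.masked_fill(~keep, UNK)

target = torch.tensor([[0, 10, 11, 12, 2],
                       [0, 20, 21, 2, 1]])
print(full_mask(target))
# tensor([[0, 3, 3, 3, 2],
#         [0, 3, 3, 2, 1]])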
- -import datetime -import logging -import time - -import torch -from fairseq.data import ( - FairseqDataset, - LanguagePairDataset, - ListDataset, - data_utils, - iterators, -) -from fairseq.data.multilingual.multilingual_data_manager import ( - MultilingualDatasetManager, -) -from fairseq.data.multilingual.sampling_method import SamplingMethod -from fairseq.tasks import LegacyFairseqTask, register_task -from fairseq.utils import FileContentsAction - - -### -def get_time_gap(s, e): - return ( - datetime.datetime.fromtimestamp(e) - datetime.datetime.fromtimestamp(s) - ).__str__() - - -### - - -logger = logging.getLogger(__name__) - - -@register_task("translation_multi_simple_epoch") -class TranslationMultiSimpleEpochTask(LegacyFairseqTask): - """ - Translate from one (source) language to another (target) language. - - Args: - langs (List[str]): a list of languages that are being supported - dicts (Dict[str, fairseq.data.Dictionary]): mapping from supported languages to their dictionaries - training (bool): whether the task should be configured for training or not - - .. note:: - - The translation task is compatible with :mod:`fairseq-train`, - :mod:`fairseq-generate` and :mod:`fairseq-interactive`. - - The translation task provides the following additional command-line - arguments: - - .. argparse:: - :ref: fairseq.tasks.translation_parser - :prog: - """ - - @staticmethod - def add_args(parser): - """Add task-specific arguments to the parser.""" - # fmt: off - parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', - help='inference source language') - parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', - help='inference target language') - parser.add_argument('--lang-pairs', default=None, metavar='PAIRS', - help='comma-separated list of language pairs (in training order): en-de,en-fr,de-fr', - action=FileContentsAction) - parser.add_argument('--keep-inference-langtok', action='store_true', - help='keep language tokens in inference output (e.g. for analysis or debugging)') - - SamplingMethod.add_arguments(parser) - MultilingualDatasetManager.add_args(parser) - # fmt: on - - def __init__(self, args, langs, dicts, training): - super().__init__(args) - self.langs = langs - self.dicts = dicts - self.training = training - if training: - self.lang_pairs = args.lang_pairs - else: - self.lang_pairs = ["{}-{}".format(args.source_lang, args.target_lang)] - # eval_lang_pairs for multilingual translation is usually all of the - # lang_pairs. However for other multitask settings or when we want to - # optimize for certain languages we want to use a different subset. Thus - # the eval_lang_pairs class variable is provided for classes that extend - # this class. - self.eval_lang_pairs = self.lang_pairs - # model_lang_pairs will be used to build encoder-decoder model pairs in - # models.build_model(). 
This allows multitask type of sub-class can - # build models other than the input lang_pairs - self.model_lang_pairs = self.lang_pairs - self.sampling_method = SamplingMethod.build_sampler(args, self) - self.data_manager = MultilingualDatasetManager.setup_data_manager( - args, self.lang_pairs, langs, dicts, self.sampling_method - ) - - @classmethod - def setup_task(cls, args, **kwargs): - langs, dicts, training = MultilingualDatasetManager.prepare( - cls.load_dictionary, args, **kwargs - ) - dict0 = None - for _, lang_dict in dicts.items(): - if dict0 is None: - dict0 = lang_dict - else: - assert ( - dict0 == lang_dict - ), "Diffrent dictionary are specified for different languages; " - "TranslationMultiSimpleEpochTask only supports one shared dictionary across all languages" - return cls(args, langs, dicts, training) - - def has_sharded_data(self, split): - return self.data_manager.has_sharded_data(split) - - def load_dataset(self, split, epoch=1, combine=False, **kwargs): - """Load a given dataset split. - - Args: - split (str): name of the split (e.g., train, valid, test) - """ - if split in self.datasets: - dataset = self.datasets[split] - if self.has_sharded_data(split) and dataset.load_next_shard: - shard_epoch = dataset.shard_epoch - else: - # no need to load next shard so skip loading - # also this avoid always loading from beginning of the data - return - else: - # estimate the shard epoch from virtual data size and virtual epoch size - shard_epoch = self.data_manager.estimate_global_pass_epoch(epoch) - logger.info(f"loading data for {split} epoch={epoch}/{shard_epoch}") - logger.info(f"mem usage: {data_utils.get_mem_usage()}") - if split in self.datasets: - del self.datasets[split] - logger.info("old dataset deleted manually") - logger.info(f"mem usage: {data_utils.get_mem_usage()}") - self.datasets[split] = self.data_manager.load_sampled_multi_epoch_dataset( - split, - self.training, - epoch=epoch, - combine=combine, - shard_epoch=shard_epoch, - **kwargs, - ) - - def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None): - if constraints is not None: - raise NotImplementedError( - "Constrained decoding with the multilingual_translation task is not supported" - ) - - src_data = ListDataset(src_tokens, src_lengths) - dataset = LanguagePairDataset(src_data, src_lengths, self.source_dictionary) - src_langtok_spec, tgt_langtok_spec = self.args.langtoks["main"] - if self.args.lang_tok_replacing_bos_eos: - dataset = self.data_manager.alter_dataset_langtok( - dataset, - src_eos=self.source_dictionary.eos(), - src_lang=self.args.source_lang, - tgt_eos=self.target_dictionary.eos(), - tgt_lang=self.args.target_lang, - src_langtok_spec=src_langtok_spec, - tgt_langtok_spec=tgt_langtok_spec, - ) - else: - dataset.src = self.data_manager.src_dataset_tranform_func( - self.args.source_lang, - self.args.target_lang, - dataset=dataset.src, - spec=src_langtok_spec, - ) - return dataset - - def build_generator( - self, - models, - args, - seq_gen_cls=None, - extra_gen_cls_kwargs=None, - ): - if not getattr(args, "keep_inference_langtok", False): - _, tgt_langtok_spec = self.args.langtoks["main"] - if tgt_langtok_spec: - tgt_lang_tok = self.data_manager.get_decoder_langtok( - self.args.target_lang, tgt_langtok_spec - ) - extra_gen_cls_kwargs = extra_gen_cls_kwargs or {} - extra_gen_cls_kwargs["symbols_to_strip_from_output"] = {tgt_lang_tok} - - return super().build_generator( - models, args, seq_gen_cls=None, extra_gen_cls_kwargs=extra_gen_cls_kwargs - ) - - def 
build_model(self, args): - return super().build_model(args) - - def valid_step(self, sample, model, criterion): - loss, sample_size, logging_output = super().valid_step(sample, model, criterion) - return loss, sample_size, logging_output - - def inference_step( - self, generator, models, sample, prefix_tokens=None, constraints=None - ): - with torch.no_grad(): - _, tgt_langtok_spec = self.args.langtoks["main"] - if not self.args.lang_tok_replacing_bos_eos: - if prefix_tokens is None and tgt_langtok_spec: - tgt_lang_tok = self.data_manager.get_decoder_langtok( - self.args.target_lang, tgt_langtok_spec - ) - src_tokens = sample["net_input"]["src_tokens"] - bsz = src_tokens.size(0) - prefix_tokens = ( - torch.LongTensor([[tgt_lang_tok]]).expand(bsz, 1).to(src_tokens) - ) - return generator.generate( - models, - sample, - prefix_tokens=prefix_tokens, - constraints=constraints, - ) - else: - return generator.generate( - models, - sample, - prefix_tokens=prefix_tokens, - bos_token=self.data_manager.get_decoder_langtok( - self.args.target_lang, tgt_langtok_spec - ) - if tgt_langtok_spec - else self.target_dictionary.eos(), - ) - - def reduce_metrics(self, logging_outputs, criterion): - super().reduce_metrics(logging_outputs, criterion) - - def max_positions(self): - """Return the max sentence length allowed by the task.""" - return (self.args.max_source_positions, self.args.max_target_positions) - - @property - def source_dictionary(self): - return next(iter(self.dicts.values())) - - @property - def target_dictionary(self): - return next(iter(self.dicts.values())) - - def create_batch_sampler_func( - self, - max_positions, - ignore_invalid_inputs, - max_tokens, - max_sentences, - required_batch_size_multiple=1, - seed=1, - ): - def construct_batch_sampler(dataset, epoch): - splits = [ - s for s, _ in self.datasets.items() if self.datasets[s] == dataset - ] - split = splits[0] if len(splits) > 0 else None - # NEW implementation - if epoch is not None: - # initialize the dataset with the correct starting epoch - dataset.set_epoch(epoch) - - # get indices ordered by example size - start_time = time.time() - logger.info(f"start batch sampler: mem usage: {data_utils.get_mem_usage()}") - - with data_utils.numpy_seed(seed): - indices = dataset.ordered_indices() - logger.info( - f"[{split}] @batch_sampler order indices time: {get_time_gap(start_time, time.time())}" - ) - logger.info(f"mem usage: {data_utils.get_mem_usage()}") - - # filter examples that are too large - if max_positions is not None: - my_time = time.time() - indices = self.filter_indices_by_size( - indices, dataset, max_positions, ignore_invalid_inputs - ) - logger.info( - f"[{split}] @batch_sampler filter_by_size time: {get_time_gap(my_time, time.time())}" - ) - logger.info(f"mem usage: {data_utils.get_mem_usage()}") - - # create mini-batches with given size constraints - my_time = time.time() - batch_sampler = dataset.batch_by_size( - indices, - max_tokens=max_tokens, - max_sentences=max_sentences, - required_batch_size_multiple=required_batch_size_multiple, - ) - - logger.info( - f"[{split}] @batch_sampler batch_by_size time: {get_time_gap(my_time, time.time())}" - ) - logger.info( - f"[{split}] per epoch batch_sampler set-up time: {get_time_gap(start_time, time.time())}" - ) - logger.info(f"mem usage: {data_utils.get_mem_usage()}") - - return batch_sampler - - return construct_batch_sampler - - # we need to override get_batch_iterator because we want to reset the epoch iterator each time - def get_batch_iterator( - self, - dataset, - 
max_tokens=None, - max_sentences=None, - max_positions=None, - ignore_invalid_inputs=False, - required_batch_size_multiple=1, - seed=1, - num_shards=1, - shard_id=0, - num_workers=0, - epoch=1, - data_buffer_size=0, - disable_iterator_cache=False, - ): - """ - Get an iterator that yields batches of data from the given dataset. - - Args: - dataset (~fairseq.data.FairseqDataset): dataset to batch - max_tokens (int, optional): max number of tokens in each batch - (default: None). - max_sentences (int, optional): max number of sentences in each - batch (default: None). - max_positions (optional): max sentence length supported by the - model (default: None). - ignore_invalid_inputs (bool, optional): don't raise Exception for - sentences that are too long (default: False). - required_batch_size_multiple (int, optional): require batch size to - be a multiple of N (default: 1). - seed (int, optional): seed for random number generator for - reproducibility (default: 1). - num_shards (int, optional): shard the data iterator into N - shards (default: 1). - shard_id (int, optional): which shard of the data iterator to - return (default: 0). - num_workers (int, optional): how many subprocesses to use for data - loading. 0 means the data will be loaded in the main process - (default: 0). - epoch (int, optional): the epoch to start the iterator from - (default: 0). - data_buffer_size (int, optional): number of batches to - preload (default: 0). - disable_iterator_cache (bool, optional): don't cache the - EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`) - (default: False). - Returns: - ~fairseq.iterators.EpochBatchIterator: a batched iterator over the - given dataset split - """ - # initialize the dataset with the correct starting epoch - assert isinstance(dataset, FairseqDataset) - if dataset in self.dataset_to_epoch_iter: - return self.dataset_to_epoch_iter[dataset] - if self.args.sampling_method == "RoundRobin": - batch_iter = super().get_batch_iterator( - dataset, - max_tokens=max_tokens, - max_sentences=max_sentences, - max_positions=max_positions, - ignore_invalid_inputs=ignore_invalid_inputs, - required_batch_size_multiple=required_batch_size_multiple, - seed=seed, - num_shards=num_shards, - shard_id=shard_id, - num_workers=num_workers, - epoch=epoch, - data_buffer_size=data_buffer_size, - disable_iterator_cache=disable_iterator_cache, - ) - self.dataset_to_epoch_iter[dataset] = batch_iter - return batch_iter - - construct_batch_sampler = self.create_batch_sampler_func( - max_positions, - ignore_invalid_inputs, - max_tokens, - max_sentences, - required_batch_size_multiple=required_batch_size_multiple, - seed=seed, - ) - - epoch_iter = iterators.EpochBatchIterator( - dataset=dataset, - collate_fn=dataset.collater, - batch_sampler=construct_batch_sampler, - seed=seed, - num_shards=num_shards, - shard_id=shard_id, - num_workers=num_workers, - epoch=epoch, - ) - return epoch_iter diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/token_generation_constraints.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/token_generation_constraints.py deleted file mode 100644 index e708dc51bcb0ffb7b411496239c74d5e6f3c2448..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/token_generation_constraints.py +++ /dev/null @@ -1,506 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -"""Implements tracking of constraints for a beam item. - -A list of constraints is given as a list of one or more token -sequences, each of length at least one token. For example, for an input sentence - -> Die maschinelle Übersetzung ist schwer zu kontrollieren. - -We could have the constraints: -* to influence -* hard - -There are two implementations: -* OrderedConstraintState: Tracks progress through an ordered list of multitoken constraints. -* UnorderedConstraintState: Tracks progress through an unordered list of multitoken constraints. - -The difference is that in the first, the constraints are assumed to be -in order; the algorithm will permit zero or more tokens between them. -In the second, the constraints are not ordered, so many orderings will -be explored. - -The same sequence can be present any number of times, and will appear -that many times in the output. -""" - -from collections import Counter -from typing import List, Optional, Set, Tuple - -import torch - - -class ConstraintState: - def __init__(self): - pass - - -def pack_constraints(batch_constraints: List[List[torch.Tensor]]) -> torch.Tensor: - """Takes a list of list of constraints in tensor form (a list of - tensor constraints for each sentence) and transforms it into a - packed Tensor. For example, here is a batch of size 3 with 3, 0, - and 1 constraints: - - [ [ [3 1 2], [3], [4 5 6 7], ] - [], - [ [1 8 9 10 1 4 11 12], ] - ] - - Its corresponding packed structure is: - - [ [ 3 3 1 2 0 3 0 4 5 6 7 0], - [ 0 0 0 0 0 0 0 0 0 0 0 0], - [ 1 1 8 9 10 1 4 11 12 0 0 0] ] - - The packed tensor has shape (batch size, maxlen), where - maxlen is defined below. Each row contains concatenated - constraint tokens for that sentence, with 0 appended after - each constraint. The first item in each row is the number - of constraints for that sentence. So maxlen is the maximum - of - - (number of constraints) + (sum length of constraints) + 1. - - across all sentences in the batch. - """ - # The maximum word length of concatenated constraints for any sentence - max_constraints_len = 1 - for sentence_constraints in batch_constraints: - if len(sentence_constraints): - # number of constraints, plus sum of constrain lens, plus a zero after each - constraints_len = ( - 1 - + sum([c.size(0) for c in sentence_constraints]) - + len(sentence_constraints) - ) - max_constraints_len = max(max_constraints_len, constraints_len) - - batch_size = len(batch_constraints) - constraints_tensor = torch.zeros((batch_size, max_constraints_len)).long() - for i, sentence_constraints in enumerate(batch_constraints): - constraints_tensor[i, 0] = len(sentence_constraints) - offset = 1 - for j, constraint in enumerate(sentence_constraints): - this_len = constraint.size(0) - constraints_tensor[i, offset : offset + this_len] = constraint - offset += this_len + 1 - - return constraints_tensor.long() - - -def unpack_constraints(constraint_tensor: torch.Tensor) -> List[torch.Tensor]: - """ - Transforms *one row* of a packed constraint tensor (e.g., for one - sentence in the batch) into a list of constraint tensors. 
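    For example, unpacking the first packed row from the pack_constraints
    example above, [ 3 3 1 2 0 3 0 4 5 6 7 0], should give back the three
    constraint tensors [3 1 2], [3], and [4 5 6 7] (a worked illustration of
    the packing format, using only the values shown above).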
- """ - constraint_list = [] - num_constraints = constraint_tensor[0] - constraints = constraint_tensor.tolist() - offset = 1 - for i in range(num_constraints): - where = constraints.index(0, offset) - constraint_list.append(constraint_tensor[offset:where]) - offset = where + 1 - - return constraint_list - - -class ConstraintNode: - """ - Represents a node in a trie managing unordered constraints. - """ - - def __init__(self, token: int = None, parent=None): - # The token associate with this node (None for the root) - self.token = int(token) if token is not None else None - # The parent (None at the root) - self.parent = parent - # Whether this node is a completed constraint - self.terminal = 0 - # List of child nodes - self.children = {} - - # The cumulative number of constraints from this point in the - # trie forward - self.num_constraints = 0 - - @property - def id(self): - return self.token - - def __str__(self): - term = self.terminal != 0 - return f"[{self.token}].{term}#{self.num_constraints}" - - def __getitem__(self, key: int): - return self.children.get(key, None) - - def next_tokens(self) -> Set[int]: - """The set of child labels.""" - return set(self.children.keys()) - - @staticmethod - def create(constraints: List[List[int]]): - root = ConstraintNode() - for sequence in constraints: - root.add_sequence(sequence) - - return root - - @staticmethod - def print_graph(node: "ConstraintNode"): - if len(node.children) == 0: - return str(node) - else: - s = f"({node}" - for child in node.children.values(): - s += " " + ConstraintNode.print_graph(child) - s += ")" - return s - - def token_counts(self) -> Counter: - """Returns a counter of the number of times each token is used - in a constraint. - """ - token_counts = Counter() - kids = list(self.children.values()) - while len(kids) > 0: - kid = kids.pop() - token_counts[kid.id] += kid.num_constraints - kids += list(kid.children.values()) - - return token_counts - - def tokens(self) -> Set[int]: - """Returns the set of tokens in constraints.""" - return set(self.token_counts().keys()) - - def add_sequence(self, sequence: List[int]): - """Adds a constraint, represented as a list of integers, to - the trie.""" - assert len(sequence) > 0 - - token = int(sequence[0]) - if token not in self.children: - self.children[token] = ConstraintNode(token, parent=self) - - node = self.children[token] - if len(sequence) == 1: - node.terminal += 1 - node.num_constraints += 1 - parent = node.parent - while parent is not None: - parent.num_constraints += 1 - parent = parent.parent - else: - node.add_sequence(sequence[1:]) - - -class UnorderedConstraintState(ConstraintState): - """ - Records progress through the set of constraints for each item in the beam - using a trie. - """ - - def __init__(self, node: ConstraintNode, copy_from: "ConstraintState" = None): - self.node = node - - if copy_from is None: - # The root node - self.root = node - # The set of states in the graph that have been completed - self.completed = Counter() - # The... 
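            # Counts how many times each trie node has been entered while
            # generating the current hypothesis: advance() below only
            # traverses a child while this count is smaller than the child's
            # num_constraints, and rewind() decrements it when falling back
            # to the root.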
- self.generated = Counter() - # The list of tokens we need to generate - self.needed_tokens = self.root.tokens() - else: - self.completed = Counter(copy_from.completed) - self.generated = Counter(copy_from.generated) - self.root = copy_from.root - - # Mark the node as generated - if self.node != self.root: - self.generated[node] += 1 - - @staticmethod - def create(constraint_tensor: torch.Tensor): - constraint_list = unpack_constraints(constraint_tensor) - constraint_trie_root = ConstraintNode.create(constraint_list) - return UnorderedConstraintState(constraint_trie_root) - - def __str__(self): - gen_str = ",".join([str(node) for node in self.generated]) - return f"{self.name}/{self.bank}({gen_str})x{self.num_completed}" - - def __copy__(self): - copied_state = UnorderedConstraintState(self.node, copy_from=self) - return copied_state - - def copy(self): - return self.__copy__() - - @property - def name(self): - if self.node.id is None: - return "ROOT" - else: - return str(self.node.id) - - @property - def is_root(self): - return self.node == self.root - - @property - def bank(self): - return sum(self.generated.values()) - - @property - def num_completed(self): - """The number of constraints (not constraint tokens) that are completed. - In addition to the already-completed states, we need to account for the - current state, which might get marked as completed when another token - is generated. - """ - in_final = self.node.terminal and self.completed[self.node] < self.node.terminal - return sum(self.completed.values()) + in_final - - @property - def finished(self): - return self.root.num_constraints - self.num_completed == 0 - - @property - def token_counts(self): - return self.root.token_counts() - - @property - def tokens(self): - return self.root.tokens() - - @property - def num_constraint_tokens(self): - return sum(self.token_counts.values()) - - def next_tokens(self) -> Set[int]: - """Returns the list of tokens that could come next. - These are (a) all tokens extending the root state and, for - non-root states, additionally all tokens extending the current - state.""" - - if self.node != self.root: - return self.root.next_tokens().union(self.node.next_tokens()) - else: - return self.root.next_tokens() - - def advance(self, token: int): - """Reads in a token and advances the state. Here's how it works. - - We can advance to the next state if: - - there is a matching child - - its path isn't blocked - - A path is blocked when all constraints that are descendants of - that node have already been generated, in the current state. - - If we are not able to advance from the current state, we "fall - off the graph" and return to the root state. There, we again - try to advance, checking the same criteria. - - In any case, when falling off the graph, we need to do some - bookkeeping. We: - - check whether any constraints were met (all prefixes of - current state) - - if one is found, mark it as completed - - adjust visited nodes accordingly - """ - token = int(token) - - next_state = None - child = self.node[token] - if child is not None and self.generated[child] < child.num_constraints: - next_state = UnorderedConstraintState(child, copy_from=self) - - def rewind(): - """If we're mid-trie and an "illegal" token is chosen next, we need - to reset our state to the root state. However, along the way, we need - to check whether a prefix of the current trie state represents a state - we could mark as completed. 
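            For example, with the unordered constraints (a b) and (a b c),
            generating a, then b, then an unrelated token should credit
            (a b) as a completed constraint on the way back to the root
            (an illustration inferred from the logic below, with a and b
            standing for arbitrary token ids).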
- """ - node = self.node - while node != self.root: - if node.terminal and self.completed[node] < node.terminal: - next_state.completed[node] += 1 - return - - next_state.generated[node] -= 1 - node = node.parent - - # Fall off the graph, check the root - if next_state is None and token in self.root.next_tokens(): - child = self.root[token] - # We can only traverse this edge if it's not saturated - if self.generated[child] < child.num_constraints: - next_state = UnorderedConstraintState(child, copy_from=self) - else: - next_state = UnorderedConstraintState(self.root, copy_from=self) - - # Rewind - rewind() - - elif next_state is None: - next_state = UnorderedConstraintState(self.root, copy_from=self) - # Rewind - rewind() - - return next_state - - -class ConstraintSequence: - def __init__(self, sequences: List[List[int]]): - """Represents a set of possibly multitoken constraints by - concatenating them and internally recording the end points. - """ - self.sequences = [] - self.endpoints = [] - self.num_tokens = 0 - self.tokens = set() - for sequence in sequences: - for token in sequence: - self.tokens.add(token) - self.num_tokens += len(sequence) - self.endpoints += [False for x in range(len(sequence) - 1)] + [True] - self.sequences += sequence - - def __getitem__(self, key: int): - return self.sequences[key] - - def __len__(self): - return len(self.sequences) - - def __str__(self): - return str(self.sequences) - - -class OrderedConstraintState(ConstraintState): - """ - Records progress through the set of linear nonbranching constraints with gaps. - """ - - def __init__(self, sequence: ConstraintSequence, state: int = -1): - self.sequence = sequence - self.state = state - - @staticmethod - def create(constraint_tensor: torch.Tensor): - constraint_list = unpack_constraints(constraint_tensor) - return OrderedConstraintState(ConstraintSequence(constraint_list), -1) - - def __str__(self): - return f"{self.state}/{self.bank}x{self.num_completed}" - - def __copy__(self): - return OrderedConstraintState(self.sequence, self.state) - - def copy(self): - return self.__copy__() - - @property - def num_completed(self): - if self.state == -1: - return 0 - count = len( - list(filter(lambda x: x, self.sequence.endpoints[0 : self.state + 1])) - ) - return count - - @property - def is_root(self): - return self.state == -1 - - @property - def name(self): - if self.state == -1: - return "ROOT" - else: - return str(self.sequence[self.state]) - - @property - def bank(self) -> int: - return self.state + 1 - - @property - def finished(self): - return self.state + 1 == len(self.sequence) - - @property - def token_counts(self): - return self.sequence.token_counts() - - @property - def tokens(self): - return self.sequence.tokens - - @property - def num_constraint_tokens(self): - return sum(self.token_counts.values()) - - def next_tokens(self) -> Set[int]: - """Returns the list of tokens that could come next. - These are (a) all tokens extending the root state and, for - non-root states, additionally all tokens extending the current - state.""" - - tokens = set() - if self.state > 0: - tokens.add(self.sequence[0]) - if not self.finished: - tokens.add(self.sequence[self.state + 1]) - return tokens - - def advance(self, token: int): - """Reads in a token and advances the state. Here's how it works. 
- - We can advance to the next state if: - - there is a matching child - - its path isn't blocked - - A path is blocked when all constraints that are descendants of - that node have already been generated, in the current state. - - If we are not able to advance from the current state, we "fall - off the graph" and return to the root state. There, we again - try to advance, checking the same criteria. - - In any case, when falling off the graph, we need to do some - bookkeeping. We: - - check whether any constraints were met (all prefixes of - current state) - - if one is found, mark it as completed - - adjust visited nodes accordingly - """ - token = int(token) - # print(f"{self} ADVANCE({token}) {self.sequence} -> ", end="") - - if self.finished: - # Accept anything - next_state = self.copy() - - elif self.sequence[self.state + 1] == token: - # Advance to the next token - next_state = OrderedConstraintState(self.sequence, self.state + 1) - - elif self.sequence.endpoints[self.state]: - # Accept anything between constraints (*) - next_state = self.copy() - - elif token == self.sequence[0]: - # Start over having generated the first token - next_state = OrderedConstraintState(self.sequence, 0) - else: - # Start over from the root - next_state = OrderedConstraintState(self.sequence, -1) - - return next_state diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tokenizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tokenizer.py deleted file mode 100644 index 42131f7b1d334020c3b48a6e44d4139f7c62ad28..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/tokenizer.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import re - - -SPACE_NORMALIZER = re.compile(r"\s+") - - -def tokenize_line(line): - line = SPACE_NORMALIZER.sub(" ", line) - line = line.strip() - return line.split() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/trainer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/trainer.py deleted file mode 100644 index 25332f9b38e63ec6d02752759a207596755d3fce..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/trainer.py +++ /dev/null @@ -1,1171 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Train a network across multiple GPUs. 
-""" - -import contextlib -import logging -import sys -import time -from itertools import chain -from typing import Any, Dict, List - -import torch -import torch.distributed as dist -from fairseq import checkpoint_utils, distributed_utils, models, optim, utils -from fairseq.file_io import PathManager -from fairseq.logging import meters, metrics -from fairseq.nan_detector import NanDetector -from fairseq.optim import lr_scheduler -from fairseq.modules.fairseq_dropout import NpuFairseqDropout, get_dropout_class - - -logger = logging.getLogger(__name__) - - -class PreFetcher: - def __init__(self, loader, device): - self.stream = torch.npu.Stream() - self.iterable = iter(loader) - self.len = len(loader) - self.device = device - self.preload() - - def __iter__(self): - return self - - def __next__(self): - torch.npu.current_stream().wait_stream(self.stream) - data = self.next_data - if data == -1: - raise StopIteration - if data is not None: - self.preload() - return data - - def preload(self): - try: - self.next_data = next(self.iterable) - except StopIteration: - self.next_data = -1 - return - with torch.npu.stream(self.stream): - self.next_data = utils.move_to_cuda(self.next_data, self.device) - - -class Trainer(object): - """Main class for data parallel training. - - This class supports synchronous distributed data parallel training, - where multiple workers each have a full model replica and gradients - are accumulated across workers before each update. We use - :class:`~torch.nn.parallel.DistributedDataParallel` to handle - communication of the gradients across workers. - """ - - def __init__(self, args, task, model, criterion, quantizer=None): - self.args = args - self.task = task - self.reduce_stream = torch.npu.Stream() - self.first_grad = None - # catalog shared parameters - shared_params = _catalog_shared_params(model) - - self.tpu = getattr(args, "tpu", False) - self.npu = torch.npu.is_available() and not args.cpu and not self.tpu - if self.npu: - self.device = torch.device("npu") - elif self.tpu: - self.device = utils.get_tpu_device(args) - else: - self.device = torch.device("cpu") - - # copy model and criterion to current device/dtype - self._criterion = criterion - self._model = model - if self.tpu: - import torch_xla.core.xla_model as xm - - self._model = xm.send_cpu_data_to_device(self._model, self.device) - if args.fp16: - self._criterion = self._criterion.half() - self._model = self._model.half() - elif args.bf16: - self._criterion = self._criterion.to(dtype=torch.bfloat16) - self._model = self._model.to(dtype=torch.bfloat16) - if not args.pipeline_model_parallel: - self._criterion = self._criterion.to(device=self.device) - self._model = self._model.to(device=self.device) - - self._model.encoder.embed_positions.weight.data = self._model.encoder.embed_positions.weight.data.npu_format_cast(29) - self._model.decoder.embed_positions.weight.data = self._model.decoder.embed_positions.weight.data.npu_format_cast(29) - self.pipeline_model_parallel = args.pipeline_model_parallel - self.last_device = None - if self.npu and self.pipeline_model_parallel: - self.last_device = torch.device(args.pipeline_devices[-1]) - - # check that shared parameters are preserved after device transfer - for shared_param in shared_params: - ref = _get_module_by_path(self._model, shared_param[0]) - for path in shared_param[1:]: - logger.info( - "detected shared parameter: {} <- {}".format(shared_param[0], path) - ) - _set_module_by_path(self._model, path, ref) - - self._dummy_batch = None # indicates we 
don't have a dummy batch at first - self._lr_scheduler = None - self._num_updates = 0 - self._num_xla_compiles = 0 # for TPUs - self._optim_history = None - self._optimizer = None - self._warn_once = set() - self._wrapped_criterion = None - self._wrapped_model = None - - # TODO(myleott): support tpu - if self.npu and self.data_parallel_world_size > 1: - self._grad_norm_buf = torch.npu.FloatTensor(self.data_parallel_world_size) - else: - self._grad_norm_buf = None - - self.quantizer = quantizer - if self.quantizer is not None: - self.quantizer.set_trainer(self) - - # get detailed npu environment - if not self.npu: - self.cuda_env = utils.CudaEnvironment() - if self.data_parallel_world_size > 1: - self.cuda_env_arr = distributed_utils.all_gather_list(self.cuda_env) - else: - self.cuda_env_arr = [self.cuda_env] - if self.data_parallel_rank == 0: - utils.CudaEnvironment.pretty_print_cuda_env_list(self.cuda_env_arr) - else: - self.cuda_env = None - self.cuda_env_arr = None - - metrics.log_start_time("wall", priority=790, round=4) - - self._start_time = time.time() - self._previous_training_time = 0 - self._cumulative_training_time = None - if get_dropout_class() is NpuFairseqDropout: - NpuFairseqDropout.enable_dropout_ensemble(self.model) - - def reinitialize(self): - """Reinitialize the Trainer, typically after model params change.""" - self._lr_scheduler = None - self._optimizer = None - self._wrapped_criterion = None - self._wrapped_model = None - - @property - def data_parallel_world_size(self): - return self.args.distributed_world_size - - @property - def data_parallel_process_group(self): - if self.tpu: - return ("tpu", None) - else: - return None - - @property - def data_parallel_rank(self): - return self.args.distributed_rank - - @property - def is_data_parallel_master(self): - return distributed_utils.is_master(self.args) - - @property - def criterion(self): - if self._wrapped_criterion is None: - if ( - utils.has_parameters(self._criterion) - and self.data_parallel_world_size > 1 - and not self.args.use_bmuf - and not self.tpu - ): - self._wrapped_criterion = models.DistributedFairseqModel( - self.args, - self._criterion, - process_group=self.data_parallel_process_group, - ) - else: - self._wrapped_criterion = self._criterion - return self._wrapped_criterion - - @property - def model(self): - if self._wrapped_model is None: - if ( - self.data_parallel_world_size > 1 - and not self.args.use_bmuf - and not self.tpu - ): - self._wrapped_model = models.DistributedFairseqModel( - self.args, - self._model, - process_group=self.data_parallel_process_group, - ) - else: - self._wrapped_model = self._model - return self._wrapped_model - - @property - def optimizer(self): - if self._optimizer is None: - self._build_optimizer() - return self._optimizer - - @property - def lr_scheduler(self): - if self._lr_scheduler is None: - self._build_optimizer() # this will initialize self._lr_scheduler - return self._lr_scheduler - - def _build_optimizer(self): - params = list( - filter( - lambda p: p.requires_grad, - chain(self.model.parameters(), self.criterion.parameters()), - ) - ) - - if self.args.fp16 or self.args.bf16: - if self.args.memory_efficient_fp16 or self.args.memory_efficient_bf16: - self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer( - self.args, params - ) - else: - self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params) - else: - self._optimizer = optim.build_optimizer(self.args, params) - - if self.args.use_bmuf: - self._optimizer = 
optim.FairseqBMUF(self.args, self._optimizer) - - if self.args.zero_sharding == "os": - if ( - self.args.fp16 - and not self.args.memory_efficient_fp16 - and not self.args.memory_efficient_bf16 - ) and not self.args.fp16_no_flatten_grads: - raise ValueError( - "ZeRO is incomptabile with fp16 and flattened grads. " - "Please use --fp16-no-flatten-grads" - ) - else: - optim.shard_( - self.args, self._optimizer, self.data_parallel_process_group - ) - - # We should initialize the learning rate scheduler immediately after - # building the optimizer, so that the initial learning rate is set. - self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer) - self._lr_scheduler.step_update(0) - - def consolidate_optimizer(self): - """For OSS, we need to consolidate the state dict.""" - if hasattr(self.optimizer.optimizer, "consolidate_state_dict"): - self.optimizer.optimizer.consolidate_state_dict() - - def save_checkpoint(self, filename, extra_state): - """Save all training state in a checkpoint file.""" - if self.is_data_parallel_master: # only save one checkpoint - extra_state["metrics"] = metrics.state_dict() - extra_state["previous_training_time"] = self.cumulative_training_time() - checkpoint_utils.save_state( - filename, - self.args, - self.get_model().state_dict(), - self.get_criterion(), - self.optimizer, - self.lr_scheduler, - self.get_num_updates(), - self._optim_history, - extra_state, - ) - - def load_checkpoint( - self, - filename, - reset_optimizer=False, - reset_lr_scheduler=False, - optimizer_overrides=None, - reset_meters=False, - ): - """Load all training state from a checkpoint file.""" - extra_state, self._optim_history, last_optim_state = None, [], None - - bexists = PathManager.isfile(filename) - if bexists: - state = checkpoint_utils.load_checkpoint_to_cpu(filename) - - # load model parameters - try: - self.get_model().load_state_dict( - state["model"], strict=True, args=self.args - ) - if utils.has_parameters(self.get_criterion()): - self.get_criterion().load_state_dict( - state["criterion"], strict=True - ) - except Exception: - raise Exception( - "Cannot load model parameters from checkpoint {}; " - "please ensure that the architectures match.".format(filename) - ) - - extra_state = state["extra_state"] - self._optim_history = state["optimizer_history"] - last_optim_state = state.get("last_optimizer_state", None) - - if last_optim_state is not None and not reset_optimizer: - # rebuild optimizer after loading model, since params may have changed - self._build_optimizer() - - # only reload optimizer and lr_scheduler if they match - last_optim = self._optim_history[-1] - assert ( - last_optim["criterion_name"] == self.get_criterion().__class__.__name__ - ), "Criterion does not match; please reset the optimizer (--reset-optimizer)." - assert ( - last_optim["optimizer_name"] == self.optimizer.__class__.__name__ - ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)." 
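            # Only once both class names match the checkpoint is it safe to
            # restore the saved scheduler and optimizer state below; with a
            # different optimizer class the stored state would not correspond
            # to the freshly built optimizer (hence the --reset-optimizer
            # hint in the messages above).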
- - if not reset_lr_scheduler: - self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"]) - self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) - - self.set_num_updates(last_optim["num_updates"]) - - if extra_state is not None: - epoch = extra_state["train_iterator"]["epoch"] - logger.info( - "loaded checkpoint {} (epoch {} @ {} updates)".format( - filename, epoch, self.get_num_updates() - ) - ) - - if "previous_training_time" in extra_state: - self._previous_training_time = extra_state["previous_training_time"] - self._start_time = time.time() - - self.lr_step(epoch) - - if "metrics" in extra_state and not reset_meters: - metrics.load_state_dict(extra_state["metrics"]) - - # reset TimeMeters, since their start times don't make sense anymore - for meter in metrics.get_meters("default"): - if isinstance(meter, meters.TimeMeter): - meter.reset() - else: - logger.info("no existing checkpoint found {}".format(filename)) - - return extra_state - - def get_train_iterator( - self, - epoch, - combine=True, - load_dataset=True, - data_selector=None, - shard_batch_itr=True, - disable_iterator_cache=False, - ): - """Return an EpochBatchIterator over the training set for a given epoch.""" - if load_dataset: - logger.info("loading train data for epoch {}".format(epoch)) - self.task.load_dataset( - self.args.train_subset, - epoch=epoch, - combine=combine, - data_selector=data_selector, - ) - batch_iterator = self.task.get_batch_iterator( - dataset=self.task.dataset(self.args.train_subset), - max_tokens=self.args.max_tokens, - max_sentences=self.args.batch_size, - max_positions=utils.resolve_max_positions( - self.task.max_positions(), - self.model.max_positions(), - self.args.max_tokens, - ), - ignore_invalid_inputs=True, - required_batch_size_multiple=self.args.required_batch_size_multiple, - seed=self.args.seed, - num_shards=self.data_parallel_world_size if shard_batch_itr else 1, - shard_id=self.data_parallel_rank if shard_batch_itr else 0, - num_workers=self.args.num_workers, - epoch=epoch, - data_buffer_size=self.args.data_buffer_size, - disable_iterator_cache=disable_iterator_cache, - ) - self.reset_dummy_batch(batch_iterator.first_batch) - return batch_iterator - - def get_valid_iterator( - self, - subset, - disable_iterator_cache=False, - ): - """Return an EpochBatchIterator over given validation subset for a given epoch.""" - batch_iterator = self.task.get_batch_iterator( - dataset=self.task.dataset(subset), - max_tokens=self.args.max_tokens_valid, - max_sentences=self.args.batch_size_valid, - max_positions=utils.resolve_max_positions( - self.task.max_positions(), - self.model.max_positions(), - ), - ignore_invalid_inputs=self.args.skip_invalid_size_inputs_valid_test, - required_batch_size_multiple=self.args.required_batch_size_multiple, - seed=self.args.seed, - num_shards=self.data_parallel_world_size, - shard_id=self.data_parallel_rank, - num_workers=self.args.num_workers, - data_buffer_size=self.args.data_buffer_size, - disable_iterator_cache=disable_iterator_cache, - ) - self.reset_dummy_batch(batch_iterator.first_batch) - return batch_iterator - - def begin_epoch(self, epoch): - """Called at the beginning of each epoch.""" - logger.info("begin training epoch {}".format(epoch)) - - self.lr_step_begin_epoch(epoch) - - if self.quantizer is not None: - self.quantizer.begin_epoch(epoch) - - # task specific setup per epoch - self.task.begin_epoch(epoch, self.get_model()) - - if self.tpu: - import torch_xla.core.xla_model as xm - - xm.rendezvous("begin_epoch") # wait 
for all workers - xm.mark_step() - - def begin_valid_epoch(self, epoch): - """Called at the beginning of each validation epoch.""" - - # task specific setup per validation epoch - self.task.begin_valid_epoch(epoch, self.get_model()) - - def reset_dummy_batch(self, batch): - self._dummy_batch = batch - - @metrics.aggregate("train") - def train_step(self, samples, raise_oom=False): - """Do forward, backward and parameter update.""" - self._set_seed() - self.model.train() - self.criterion.train() - self.zero_grad() - - prefetch_samples = PreFetcher(samples, torch.npu.current_device()) - metrics.log_start_time("train_wall", priority=800, round=4) - - # forward and backward pass - logging_outputs, sample_size, ooms = [], 0, 0 - for i, sample in enumerate(prefetch_samples): - if sample == {}: - # when sample is None, run forward/backward on a dummy batch - # and ignore the resulting gradients - sample = self._prepare_sample(self._dummy_batch) - is_dummy_batch = True - else: - if self._dummy_batch == "DUMMY": - self._dummy_batch = sample - is_dummy_batch = False - - def maybe_no_sync(): - """ - Whenever *samples* contains more than one mini-batch, we - want to accumulate gradients locally and only call - all-reduce in the last backwards pass. - """ - if ( - self.data_parallel_world_size > 1 - and hasattr(self.model, "no_sync") - and i < len(samples) - 1 - ): - return self.model.no_sync() - else: - return contextlib.ExitStack() # dummy contextmanager - - try: - with maybe_no_sync(): - # forward and backward - loss, sample_size_i, logging_output = self.task.train_step( - sample=sample, - model=self.model, - criterion=self.criterion, - optimizer=self.optimizer, - update_num=self.get_num_updates(), - ignore_grad=is_dummy_batch, - ) - del loss - - logging_outputs.append(logging_output) - sample_size += sample_size_i - - # emptying the CUDA cache after the first step can - # reduce the chance of OOM - if self.npu and self.get_num_updates() == 0: - torch.npu.empty_cache() - except RuntimeError as e: - if "out of memory" in str(e): - self._log_oom(e) - if raise_oom: - raise e - logger.warning( - "attempting to recover from OOM in forward/backward pass" - ) - ooms += 1 - self.zero_grad() - if self.npu: - torch.npu.empty_cache() - if self.args.distributed_world_size == 1: - return None - else: - raise e - - if self.tpu and i < len(samples) - 1: - # tpu-comment: every XLA operation before marking step is - # appended to the IR graph, and processing too many batches - # before marking step can lead to OOM errors. 
- # To handle gradient accumulation use case, we explicitly - # mark step here for every forward pass without a backward pass - import torch_xla.core.xla_model as xm - - xm.mark_step() - - if is_dummy_batch: - if torch.is_tensor(sample_size): - sample_size.zero_() - else: - sample_size *= 0.0 - - if torch.is_tensor(sample_size): - sample_size = sample_size.float() - else: - sample_size = float(sample_size) - - # gather logging outputs from all replicas - if self._sync_stats(): - train_time = self._local_cumulative_training_time() - logging_outputs, ( - sample_size, - ooms, - total_train_time, - ) = self._aggregate_logging_outputs( - logging_outputs, - sample_size, - ooms, - train_time, - ignore=is_dummy_batch, - ) - self._cumulative_training_time = ( - total_train_time / self.data_parallel_world_size - ) - - if hasattr(self.model, "all_reduce"): - torch.npu.current_stream().wait_stream(self.reduce_stream) - if self.first_grad is not None: - self.first_grad.div_(8) - dist.all_reduce(self.first_grad) - - overflow = False - try: - if self.tpu and self.data_parallel_world_size > 1: - import torch_xla.core.xla_model as xm - - gradients = xm._fetch_gradients(self.optimizer.optimizer) - xm.all_reduce( - "sum", gradients, scale=1.0 / self.data_parallel_world_size - ) - - # multiply gradients by (# GPUs / sample_size) since DDP - # already normalizes by the number of GPUs. Thus we get - # (sum_of_gradients / sample_size). - if not self.args.use_bmuf: - self.optimizer.multiply_grads( - self.data_parallel_world_size / sample_size - ) - elif sample_size > 0: # BMUF needs to check sample size - num = self.data_parallel_world_size if self._sync_stats() else 1 - self.optimizer.multiply_grads(num / sample_size) - - # clip grads - grad_norm = self.clip_grad_norm(self.args.clip_norm) - - # check that grad norms are consistent across workers - # on tpu check tensor is slow - if not self.tpu: - if ( - not self.args.use_bmuf - and self.args.distributed_wrapper != "SlowMo" - ): - self._check_grad_norms(grad_norm) - if not torch.isfinite(grad_norm).all(): - # check local gradnorm single GPU case, trigger NanDetector - raise FloatingPointError("gradients are Nan/Inf") - - # take an optimization step - self.optimizer.step() - - except FloatingPointError: - # re-run the forward and backward pass with hooks attached to print - # out where it fails - with NanDetector(self.get_model()): - self.task.train_step( - sample, - self.model, - self.criterion, - self.optimizer, - self.get_num_updates(), - ignore_grad=False, - ) - raise - except OverflowError as e: - overflow = True - logger.info("NOTE: overflow detected, " + str(e)) - grad_norm = torch.tensor(0.0).npu() - self.zero_grad() - except RuntimeError as e: - if "out of memory" in str(e): - self._log_oom(e) - logger.error("OOM during optimization, irrecoverable") - raise e - - # Some distributed wrappers (e.g., SlowMo) need access to the optimizer after the step - if hasattr(self.model, "perform_additional_optimizer_actions"): - if hasattr(self.optimizer, "fp32_params"): - self.model.perform_additional_optimizer_actions( - self.optimizer.optimizer, self.optimizer.fp32_params - ) - else: - self.model.perform_additional_optimizer_actions( - self.optimizer.optimizer - ) - - if not overflow or self.args.distributed_wrapper == "SlowMo": - self.set_num_updates(self.get_num_updates() + 1) - - if self.tpu: - # mark step on TPUs - import torch_xla.core.xla_model as xm - - xm.mark_step() - - # only log stats every log_interval steps - # this causes wps to be misreported when 
log_interval > 1 - logging_output = {} - if self.get_num_updates() % self.args.log_interval == 0: - # log memory usage - mem_info = xm.get_memory_info(self.device) - gb_free = mem_info["kb_free"] / 1024 / 1024 - gb_total = mem_info["kb_total"] / 1024 / 1024 - metrics.log_scalar( - "gb_free", - gb_free, - priority=1500, - round=1, - weight=0, - ) - metrics.log_scalar( - "gb_total", - gb_total, - priority=1600, - round=1, - weight=0, - ) - - logging_output = self._reduce_and_log_stats( - logging_outputs, - sample_size, - grad_norm, - ) - - # log whenever there's an XLA compilation, since these - # slow down training and may indicate opportunities for - # optimization - self._check_xla_compilation() - else: - # log stats - logging_output = self._reduce_and_log_stats( - logging_outputs, - sample_size, - grad_norm, - ) - - # clear npu cache to reduce memory fragmentation - if ( - self.npu - and self.args.empty_cache_freq > 0 - and ( - (self.get_num_updates() + self.args.empty_cache_freq - 1) - % self.args.empty_cache_freq - ) - == 0 - ): - torch.npu.empty_cache() - - if self.args.fp16: - metrics.log_scalar( - "loss_scale", - self.optimizer.scaler.loss_scale, - priority=700, - round=4, - weight=0, - ) - - metrics.log_stop_time("train_wall") - return logging_output - - @metrics.aggregate("valid") - def valid_step(self, sample, raise_oom=False): - """Do forward pass in evaluation mode.""" - if self.tpu: - import torch_xla.core.xla_model as xm - - xm.rendezvous("valid_step") # wait for all workers - xm.mark_step() - - with torch.no_grad(): - self.model.eval() - self.criterion.eval() - - sample = self._prepare_sample(sample) - if sample is None: - sample = self._prepare_sample(self._dummy_batch) - is_dummy_batch = True - else: - if self._dummy_batch == "DUMMY": - self._dummy_batch = sample - is_dummy_batch = False - - try: - _loss, sample_size, logging_output = self.task.valid_step( - sample, self.model, self.criterion - ) - except RuntimeError as e: - if "out of memory" in str(e): - self._log_oom(e) - if not raise_oom: - logger.warning( - "ran out of memory in validation step, retrying batch" - ) - for p in self.model.parameters(): - if p.grad is not None: - p.grad = None # free some memory - if self.npu: - torch.npu.empty_cache() - return self.valid_step(sample, raise_oom=True) - raise e - - logging_outputs = [logging_output] - if is_dummy_batch: - if torch.is_tensor(sample_size): - sample_size.zero_() - else: - sample_size *= 0.0 - - # gather logging outputs from all replicas - if self.data_parallel_world_size > 1: - logging_outputs, (sample_size,) = self._aggregate_logging_outputs( - logging_outputs, - sample_size, - ignore=is_dummy_batch, - ) - - # log validation stats - logging_output = self._reduce_and_log_stats(logging_outputs, sample_size) - - return logging_output - - def zero_grad(self): - self.optimizer.zero_grad() - - def lr_step_begin_epoch(self, epoch): - """Adjust the learning rate at the beginning of the epoch.""" - self.lr_scheduler.step_begin_epoch(epoch) - # prefer updating the LR based on the number of steps - return self.lr_step_update() - - def lr_step(self, epoch, val_loss=None): - """Adjust the learning rate at the end of the epoch.""" - self.lr_scheduler.step(epoch, val_loss) - # prefer updating the LR based on the number of steps - return self.lr_step_update() - - def lr_step_update(self): - """Update the learning rate after each update.""" - new_lr = self.lr_scheduler.step_update(self.get_num_updates()) - metrics.log_scalar("lr", new_lr, weight=0, priority=300) - return 
new_lr - - def get_lr(self): - """Get the current learning rate.""" - return self.optimizer.get_lr() - - def get_model(self): - """Get the (non-wrapped) model instance.""" - return self._model - - def get_criterion(self): - """Get the (non-wrapped) criterion instance.""" - return self._criterion - - def get_meter(self, name): - """[deprecated] Get a specific meter by name.""" - from fairseq import meters - - if "get_meter" not in self._warn_once: - self._warn_once.add("get_meter") - utils.deprecation_warning( - "Trainer.get_meter is deprecated. Please use fairseq.metrics instead." - ) - - train_meters = metrics.get_meters("train") - if train_meters is None: - train_meters = {} - - if name == "train_loss" and "loss" in train_meters: - return train_meters["loss"] - elif name == "train_nll_loss": - # support for legacy train.py, which assumed this meter is - # always initialized - m = train_meters.get("nll_loss", None) - return m or meters.AverageMeter() - elif name == "wall": - # support for legacy train.py, which assumed this meter is - # always initialized - m = metrics.get_meter("default", "wall") - return m or meters.TimeMeter() - elif name == "wps": - m = metrics.get_meter("train", "wps") - return m or meters.TimeMeter() - elif name in {"valid_loss", "valid_nll_loss"}: - # support for legacy train.py, which assumed these meters - # are always initialized - k = name[len("valid_") :] - m = metrics.get_meter("valid", k) - return m or meters.AverageMeter() - elif name == "oom": - return meters.AverageMeter() - elif name in train_meters: - return train_meters[name] - return None - - def get_num_updates(self): - """Get the number of parameters updates.""" - return self._num_updates - - def set_num_updates(self, num_updates): - """Set the number of parameters updates.""" - self._num_updates = num_updates - self.lr_step_update() - if self.quantizer: - self.quantizer.step_update(self._num_updates) - metrics.log_scalar("num_updates", self._num_updates, weight=0, priority=200) - - def clip_grad_norm(self, clip_norm): - return self.optimizer.clip_grad_norm(clip_norm, aggregate_norm_fn=None) - - def cumulative_training_time(self): - if self._cumulative_training_time is None: - # single GPU - return self._local_cumulative_training_time() - else: - return self._cumulative_training_time - - def _local_cumulative_training_time(self): - """Aggregate training time in seconds.""" - return time.time() - self._start_time + self._previous_training_time - - def _prepare_sample(self, sample): - if sample == "DUMMY": - raise Exception( - "Trying to use an uninitialized 'dummy' batch. This usually indicates " - "that the total number of batches is smaller than the number of " - "participating GPUs. Try reducing the batch size or using fewer GPUs." 
- ) - - if sample is None or len(sample) == 0: - return None - - if self.npu: - if self.pipeline_model_parallel: - if "target" in sample: - sample["target"] = utils.move_to_cuda( - sample["target"], device=self.last_device - ) - else: - sample = utils.move_to_cuda(sample, self.device) - - def apply_half(t): - if t.dtype is torch.float32: - return t.half() - return t - - def apply_bfloat16(t): - if t.dtype is torch.float32: - return t.to(dtype=torch.bfloat16) - return t - - if self.args.fp16: - sample = utils.apply_to_sample(apply_half, sample) - - if self.args.bf16: - sample = utils.apply_to_sample(apply_bfloat16, sample) - - return sample - - def _set_seed(self): - # Set seed based on args.seed and the update number so that we get - # reproducible results when resuming from checkpoints - seed = self.args.seed + self.get_num_updates() - utils.set_torch_seed(seed) - - def _sync_stats(self): - # Return True if it's using multiple GPUs and DDP or multiple GPUs with - # BMUF and it's a bmuf sync with warmup iterations completed before. - if self.data_parallel_world_size == 1: - return False - elif self.args.use_bmuf: - return (self.get_num_updates() + 1) % self.args.global_sync_iter == 0 and ( - self.get_num_updates() + 1 - ) > self.args.warmup_iterations - else: - return True - - def _log_oom(self, exc): - msg = "OOM: Ran out of memory with exception: {}".format(exc) - logger.warning(msg) - if torch.npu.is_available() and hasattr(torch.npu, "memory_summary"): - for device_idx in range(torch.npu.device_count()): - logger.warning(torch.npu.memory_summary(device=device_idx)) - sys.stderr.flush() - - def _aggregate_logging_outputs( - self, - logging_outputs: List[Dict[str, Any]], - *extra_stats_to_sum, - ignore=False, - ): - if self.task.__class__.logging_outputs_can_be_summed(self.get_criterion()): - return self._fast_stat_sync_sum( - logging_outputs, *extra_stats_to_sum, ignore=ignore - ) - else: - return self._all_gather_list_sync( - logging_outputs, *extra_stats_to_sum, ignore=ignore - ) - - def _all_gather_list_sync( - self, - logging_outputs: List[Dict[str, Any]], - *extra_stats_to_sum, - ignore=False, - ): - """ - Sync logging outputs across workers. all_gather_list_sync is - suitable when logging outputs are complex types. - """ - if self.tpu: - raise NotImplementedError - if ignore: - logging_outputs = [] - results = list( - zip( - *distributed_utils.all_gather_list( - [logging_outputs] + list(extra_stats_to_sum), - max_size=getattr(self.args, "all_gather_list_size", 16384), - group=self.data_parallel_process_group, - ) - ) - ) - logging_outputs, extra_stats_to_sum = results[0], results[1:] - logging_outputs = list(chain.from_iterable(logging_outputs)) - extra_stats_to_sum = [sum(s) for s in extra_stats_to_sum] - return logging_outputs, extra_stats_to_sum - - def _fast_stat_sync_sum( - self, - logging_outputs: List[Dict[str, Any]], - *extra_stats_to_sum, - ignore=False, - ): - """ - Sync logging outputs across workers. fast_stat_sync_sum is - faster than all_gather_list_sync, but is only suitable when - logging outputs are scalars and can be summed. Note that - *logging_outputs* cannot contain any nested dicts/lists. 
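        For example, outputs such as {'loss': 4.2, 'ntokens': 3000} can be
        summed and all-reduced directly, whereas something like
        {'stats': {'loss': 4.2}} would have to go through the slower
        _all_gather_list_sync path instead (illustrative values).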
- """ - data = {} - for i, stat in enumerate(extra_stats_to_sum): - data["extra_stats_" + str(i)] = stat - if len(logging_outputs) > 0: - log_keys = list(logging_outputs[0].keys()) - for k in log_keys: - if not ignore: - v = sum(log[k] for log in logging_outputs if k in log) - else: - v = logging_outputs[0][k] - v = torch.zeros_like(v) if torch.is_tensor(v) else 0 - data["logging_outputs_" + k] = v - else: - log_keys = None - - data = distributed_utils.all_reduce_dict( - data, device=self.device, group=self.data_parallel_process_group - ) - - extra_stats_to_sum = [ - data["extra_stats_" + str(i)] for i in range(len(extra_stats_to_sum)) - ] - if log_keys is not None: - logging_outputs = [{k: data["logging_outputs_" + k] for k in log_keys}] - else: - logging_outputs = [] - return logging_outputs, extra_stats_to_sum - - def _check_grad_norms(self, grad_norm): - """Check that grad norms are consistent across workers.""" - if self._grad_norm_buf is not None: - self._grad_norm_buf.zero_() - self._grad_norm_buf[self.data_parallel_rank] = grad_norm - distributed_utils.all_reduce( - self._grad_norm_buf, group=self.data_parallel_process_group - ) - - def is_consistent(tensor): - max_abs_diff = torch.max(torch.abs(tensor - tensor[0])) - return ( - torch.isfinite(tensor).all() - or (max_abs_diff / (tensor[0] + 1e-6) < 1e-6).all() - ) - - if not is_consistent(self._grad_norm_buf): - pretty_detail = "\n".join( - "rank {:3d} = {:.8f}".format(r, n) - for r, n in enumerate(self._grad_norm_buf.tolist()) - ) - error_detail = "grad_norm across the workers:\n{}\n".format( - pretty_detail - ) - # use FloatingPointError to trigger NanDetector - raise FloatingPointError( - "Fatal error: gradients are inconsistent between workers. " - "Try --ddp-backend=no_c10d. " - "Or are you mixing up different generation of GPUs in training?" 
- + "\n" - + "-" * 80 - + "\n{}\n".format(error_detail) - + "-" * 80 - ) - - def _reduce_and_log_stats(self, logging_outputs, sample_size, grad_norm=None): - if grad_norm is not None: - metrics.log_speed("ups", 1.0, priority=100, round=2) - metrics.log_scalar("gnorm", grad_norm, priority=400, round=3) - if self.args.clip_norm > 0: - metrics.log_scalar( - "clip", - torch.where( - grad_norm > self.args.clip_norm, - grad_norm.new_tensor(100), - grad_norm.new_tensor(0), - ), - priority=500, - round=1, - ) - - with metrics.aggregate() as agg: - if logging_outputs is not None: - self.task.reduce_metrics(logging_outputs, self.get_criterion()) - del logging_outputs - - # extra warning for criterions that don't properly log a loss value - if "loss" not in agg: - if "loss" not in self._warn_once: - self._warn_once.add("loss") - logger.warning( - "Criterion.reduce_metrics did not log a 'loss' value, " - "which may break some functionality" - ) - metrics.log_scalar("loss", -1) - - # support legacy interface - if self.tpu: - logging_output = {} - else: - logging_output = agg.get_smoothed_values() - logging_output["sample_size"] = sample_size - for key_to_delete in ["ppl", "wps", "wpb", "bsz"]: - if key_to_delete in logging_output: - del logging_output[key_to_delete] - return logging_output - - def _check_xla_compilation(self): - import torch_xla.debug.metrics as met - - compile_stats = met.metric_data("CompileTime") - if compile_stats is None: - return - num_xla_compiles = compile_stats[0] - if num_xla_compiles > self._num_xla_compiles: - logger.warning( - "XLA compilation detected on device #{}; too many of these can lead " - "to slow training, but we expect a few in the beginning".format( - self.args.distributed_rank - ) - ) - self._num_xla_compiles = num_xla_compiles - - -def _catalog_shared_params(module, memo=None, prefix=""): - if memo is None: - first_call = True - memo = {} - else: - first_call = False - for name, param in module._parameters.items(): - param_prefix = prefix + ("." if prefix else "") + name - if param not in memo: - memo[param] = [] - memo[param].append(param_prefix) - for name, m in module._modules.items(): - if m is None: - continue - submodule_prefix = prefix + ("." if prefix else "") + name - _catalog_shared_params(m, memo, submodule_prefix) - if first_call: - return [x for x in memo.values() if len(x) > 1] - - -def _get_module_by_path(module, path): - path = path.split(".") - for name in path: - module = getattr(module, name) - return module - - -def _set_module_by_path(module, path, value): - path = path.split(".") - for name in path[:-1]: - module = getattr(module, name) - setattr(module, path[-1], value) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/utils.py deleted file mode 100644 index 9011292d3541dae23125eccae2f401bc0aacb819..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq/utils.py +++ /dev/null @@ -1,700 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import contextlib -import copy -import importlib -import logging -import os -import sys -import tempfile -import warnings -from itertools import accumulate -from typing import Callable, Dict, List, Optional - -import torch -import torch.nn.functional as F -from fairseq.data import iterators -from fairseq.file_io import PathManager -from fairseq.logging.meters import safe_round -from fairseq.modules import gelu, gelu_accurate -from fairseq.modules.multihead_attention import MultiheadAttention -from torch import Tensor - - -try: - from amp_C import multi_tensor_l2norm - - multi_tensor_l2norm_available = True -except ImportError: - multi_tensor_l2norm_available = False - - -logger = logging.getLogger(__name__) - - -MANIFOLD_PATH_SEP = "|" - - -class FileContentsAction(argparse.Action): - def __init__(self, option_strings, dest, nargs=None, **kwargs): - if nargs is not None: - raise ValueError("nargs not allowed") - super(FileContentsAction, self).__init__(option_strings, dest, **kwargs) - - def __call__(self, parser, namespace, values, option_string=None): - if PathManager.isfile(values): - with PathManager.open(values) as f: - argument = f.read().strip() - else: - argument = values - setattr(namespace, self.dest, argument) - - -def split_paths(paths: str) -> List[str]: - return ( - paths.split(os.pathsep) - if "://" not in paths - else paths.split(MANIFOLD_PATH_SEP) - ) - - -def load_ensemble_for_inference(filenames, task, model_arg_overrides=None): - from fairseq import checkpoint_utils - - deprecation_warning( - "utils.load_ensemble_for_inference is deprecated. " - "Please use checkpoint_utils.load_model_ensemble instead." - ) - return checkpoint_utils.load_model_ensemble( - filenames, arg_overrides=model_arg_overrides, task=task - ) - - -def apply_to_sample(f, sample): - if hasattr(sample, "__len__") and len(sample) == 0: - return {} - - def _apply(x): - if torch.is_tensor(x): - return f(x) - elif isinstance(x, dict): - return {key: _apply(value) for key, value in x.items()} - elif isinstance(x, list): - return [_apply(x) for x in x] - elif isinstance(x, tuple): - return tuple(_apply(x) for x in x) - elif isinstance(x, set): - return {_apply(x) for x in x} - else: - return x - - return _apply(sample) - - -def move_to_cuda(sample, device=None): - - def _move_to_cuda(tensor): - # non_blocking is ignored if tensor is not pinned, so we can always set - # to True (see github.com/PyTorchLightning/pytorch-lightning/issues/620) - return tensor.to(device) - - return apply_to_sample(_move_to_cuda, sample) - - -def move_to_cpu(sample): - def _move_to_cpu(tensor): - # PyTorch has poor support for half tensors (float16) on CPU. - # Move any such tensors to float32. 
- if tensor.dtype in {torch.bfloat16, torch.float16}: - tensor = tensor.to(dtype=torch.float32) - return tensor.cpu() - - return apply_to_sample(_move_to_cpu, sample) - - -def get_incremental_state( - module: MultiheadAttention, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], - key: str, -) -> Optional[Dict[str, Optional[Tensor]]]: - """Helper for getting incremental state for an nn.Module.""" - return module.get_incremental_state(incremental_state, key) - - -def set_incremental_state( - module: MultiheadAttention, - incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], - key: str, - value: Dict[str, Optional[Tensor]], -) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]: - """Helper for setting incremental state for an nn.Module.""" - if incremental_state is not None: - result = module.set_incremental_state(incremental_state, key, value) - if result is not None: - incremental_state = result - return incremental_state - - -def load_align_dict(replace_unk): - if replace_unk is None: - align_dict = None - elif isinstance(replace_unk, str) and len(replace_unk) > 0: - # Load alignment dictionary for unknown word replacement if it was passed as an argument. - align_dict = {} - with open(replace_unk, "r") as f: - for line in f: - cols = line.split() - align_dict[cols[0]] = cols[1] - else: - # No alignment dictionary provided but we still want to perform unknown word replacement by copying the - # original source word. - align_dict = {} - return align_dict - - -def print_embed_overlap(embed_dict, vocab_dict): - embed_keys = set(embed_dict.keys()) - vocab_keys = set(vocab_dict.symbols) - overlap = len(embed_keys & vocab_keys) - logger.info("found {}/{} types in embedding file".format(overlap, len(vocab_dict))) - - -def parse_embedding(embed_path): - """Parse embedding text file into a dictionary of word and embedding tensors. - - The first line can have vocabulary size and dimension. The following lines - should contain word and embedding separated by spaces. - - Example: - 2 5 - the -0.0230 -0.0264 0.0287 0.0171 0.1403 - at -0.0395 -0.1286 0.0275 0.0254 -0.0932 - """ - embed_dict = {} - with open(embed_path) as f_embed: - next(f_embed) # skip header - for line in f_embed: - pieces = line.rstrip().split(" ") - embed_dict[pieces[0]] = torch.Tensor( - [float(weight) for weight in pieces[1:]] - ) - return embed_dict - - -def load_embedding(embed_dict, vocab, embedding): - for idx in range(len(vocab)): - token = vocab[idx] - if token in embed_dict: - embedding.weight.data[idx] = embed_dict[token] - return embedding - - -def replace_unk(hypo_str, src_str, alignment, align_dict, unk): - from fairseq import tokenizer - - # Tokens are strings here - hypo_tokens = tokenizer.tokenize_line(hypo_str) - # TODO: Very rare cases where the replacement is '' should be handled gracefully - src_tokens = tokenizer.tokenize_line(src_str) + [""] - for i, ht in enumerate(hypo_tokens): - if ht == unk: - src_token = src_tokens[alignment[i]] - # Either take the corresponding value in the aligned dictionary or just copy the original value. 
- hypo_tokens[i] = align_dict.get(src_token, src_token) - return " ".join(hypo_tokens) - - -def post_process_prediction( - hypo_tokens, - src_str, - alignment, - align_dict, - tgt_dict, - remove_bpe=None, - extra_symbols_to_ignore=None, -): - hypo_str = tgt_dict.string( - hypo_tokens, remove_bpe, extra_symbols_to_ignore=extra_symbols_to_ignore - ) - if align_dict is not None: - hypo_str = replace_unk( - hypo_str, src_str, alignment, align_dict, tgt_dict.unk_string() - ) - if align_dict is not None or remove_bpe is not None: - # Convert back to tokens for evaluating with unk replacement or without BPE - # Note that the dictionary can be modified inside the method. - hypo_tokens = tgt_dict.encode_line(hypo_str, add_if_not_exist=True) - return hypo_tokens, hypo_str, alignment - - -def make_positions(tensor, padding_idx: int, onnx_trace: bool = False): - """Replace non-padding symbols with their position numbers. - - Position numbers begin at padding_idx+1. Padding symbols are ignored. - """ - # The series of casts and type-conversions here are carefully - # balanced to both work with ONNX export and XLA. In particular XLA - # prefers ints, cumsum defaults to output longs, and ONNX doesn't know - # how to handle the dtype kwarg in cumsum. - mask = tensor.ne(padding_idx).int() - return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx - - -def strip_pad(tensor, pad): - return tensor[tensor.ne(pad)] - - -def buffered_arange(max): - if not hasattr(buffered_arange, "buf"): - buffered_arange.buf = torch.LongTensor() - if max > buffered_arange.buf.numel(): - buffered_arange.buf.resize_(max) - torch.arange(max, out=buffered_arange.buf) - return buffered_arange.buf[:max] - - -def convert_padding_direction( - src_tokens, padding_idx, right_to_left: bool = False, left_to_right: bool = False -): - assert right_to_left ^ left_to_right - pad_mask = src_tokens.eq(padding_idx) - if not pad_mask.any(): - # no padding, return early - return src_tokens - if left_to_right and not pad_mask[:, 0].any(): - # already right padded - return src_tokens - if right_to_left and not pad_mask[:, -1].any(): - # already left padded - return src_tokens - max_len = src_tokens.size(1) - buffered = torch.empty(0).long() - if max_len > 0: - torch.arange(max_len, out=buffered) - range = buffered.type_as(src_tokens).expand_as(src_tokens) - num_pads = pad_mask.long().sum(dim=1, keepdim=True) - if right_to_left: - index = torch.remainder(range - num_pads, max_len) - else: - index = torch.remainder(range + num_pads, max_len) - return src_tokens.gather(1, index) - - -def item(tensor): - if hasattr(tensor, "item"): - return tensor.item() - if hasattr(tensor, "__getitem__"): - return tensor[0] - return tensor - - -def multi_tensor_total_norm(grads, chunk_size=2048 * 32) -> torch.Tensor: - per_device_grads = {} - norms = [] - for grad in grads: - device = grad.device - cur_device_grads = per_device_grads.get(device) - if cur_device_grads is None: - cur_device_grads = [] - per_device_grads[device] = cur_device_grads - cur_device_grads.append(grad) - for device in per_device_grads.keys(): - cur_device_grads = per_device_grads[device] - if device.type == "cuda": - # TODO(msb) return has_inf - has_inf = torch.zeros((1, 1), dtype=torch.int, device=device) - with torch.cuda.device(device): - norm = multi_tensor_l2norm( - chunk_size, has_inf, [cur_device_grads], False - ) - norms.append(norm[0].to(torch.cuda.current_device())) - else: - norms += [torch.norm(g, p=2, dtype=torch.float32) for g in cur_device_grads] - total_norm = 
torch.norm(torch.stack(norms)) - return total_norm - - -@torch.no_grad() -def clip_grad_norm_(params, max_norm, aggregate_norm_fn=None) -> torch.Tensor: - if isinstance(params, torch.Tensor): - params = [params] - params = list(params) - grads = [p.grad.detach() for p in filter(lambda p: p.grad is not None, params)] - if len(grads) == 0: - if len(params) > 0: - return params[0].new_tensor(0.0) - else: - return torch.tensor(0.0) - - if len(grads) == 1: - total_norm = torch.norm(grads[0], p=2, dtype=torch.float32) - else: - if multi_tensor_l2norm_available: - total_norm = multi_tensor_total_norm(grads) - else: - if torch.cuda.is_available(): - warnings.warn( - "amp_C fused kernels unavailable, disabling multi_tensor_l2norm; " - "you may get better performance by installing NVIDIA's apex library" - ) - device = torch.cuda.current_device() - elif grads[0].device.type == "xla": - device = grads[0].device - else: - device = torch.device("cpu") - total_norm = torch.norm( - torch.stack( - [torch.norm(g, p=2, dtype=torch.float32).to(device) for g in grads] - ) - ) - - if aggregate_norm_fn is not None: - total_norm = aggregate_norm_fn(total_norm) - - if max_norm > 0: - max_norm = float(max_norm) - clip_coef = (max_norm / (total_norm + 1e-6)).clamp_(max=1) - for g in grads: - g.mul_(clip_coef) - return total_norm - - -def fill_with_neg_inf(t): - """FP16-compatible function that fills a tensor with -inf.""" - return t.float().fill_(float("-inf")).type_as(t) - - -def _match_types(arg1, arg2): - """Convert the numerical argument to the same type as the other argument""" - - def upgrade(arg_number, arg_structure): - if isinstance(arg_structure, tuple): - return tuple([arg_number] * len(arg_structure)) - elif isinstance(arg_structure, dict): - arg = copy.deepcopy(arg_structure) - for k in arg: - arg[k] = upgrade(arg_number, arg_structure[k]) - return arg - else: - return arg_number - - if isinstance(arg1, float) or isinstance(arg1, int): - return upgrade(arg1, arg2), arg2 - elif isinstance(arg2, float) or isinstance(arg2, int): - return arg1, upgrade(arg2, arg1) - - return arg1, arg2 - - -def resolve_max_positions(*args): - """Resolve max position constraints from multiple sources.""" - - def map_value_update(d1, d2): - updated_value = copy.deepcopy(d1) - for key in d2: - if key not in updated_value: - updated_value[key] = d2[key] - else: - updated_value[key] = min(d1[key], d2[key]) - return updated_value - - def nullsafe_min(l): - minim = None - for item in l: - if minim is None: - minim = item - elif item is not None and item < minim: - minim = item - return minim - - max_positions = None - for arg in args: - if max_positions is None: - max_positions = arg - elif arg is not None: - max_positions, arg = _match_types(max_positions, arg) - if isinstance(arg, float) or isinstance(arg, int): - max_positions = min(max_positions, arg) - elif isinstance(arg, dict): - max_positions = map_value_update(max_positions, arg) - else: - max_positions = tuple(map(nullsafe_min, zip(max_positions, arg))) - - return max_positions - - -def import_user_module(args): - module_path = getattr(args, "user_dir", None) - if module_path is not None: - module_path = os.path.abspath(args.user_dir) - if not os.path.exists(module_path): - fairseq_rel_path = os.path.join(os.path.dirname(__file__), args.user_dir) - if os.path.exists(fairseq_rel_path): - module_path = fairseq_rel_path - else: - fairseq_rel_path = os.path.join( - os.path.dirname(__file__), "..", args.user_dir - ) - if os.path.exists(fairseq_rel_path): - module_path = 
fairseq_rel_path - else: - raise FileNotFoundError(module_path) - - # ensure that user modules are only imported once - import_user_module.memo = getattr(import_user_module, "memo", set()) - if module_path not in import_user_module.memo: - import_user_module.memo.add(module_path) - - module_parent, module_name = os.path.split(module_path) - if module_name not in sys.modules: - sys.path.insert(0, module_parent) - importlib.import_module(module_name) - else: - raise ImportError( - "Failed to import --user-dir={} because the corresponding module name " - "({}) is not globally unique. Please rename the directory to " - "something unique and try again.".format(module_path, module_name) - ) - - -def softmax(x, dim: int, onnx_trace: bool = False): - if onnx_trace: - return F.softmax(x.float(), dim=dim) - else: - return F.softmax(x, dim=dim, dtype=torch.float32) - - -def log_softmax(x, dim: int, onnx_trace: bool = False): - if onnx_trace: - return F.log_softmax(x.float(), dim=dim) - else: - return F.log_softmax(x, dim=dim, dtype=torch.float32) - - -def get_perplexity(loss, round=2, base=2): - if loss is None: - return 0.0 - try: - return safe_round(base ** loss, round) - except OverflowError: - return float("inf") - - -def deprecation_warning(message, stacklevel=3): - # don't use DeprecationWarning, since it's ignored by default - warnings.warn(message, stacklevel=stacklevel) - - -def get_activation_fn(activation: str) -> Callable: - """ Returns the activation function corresponding to `activation` """ - if activation == "relu": - return F.relu - elif activation == "gelu": - return gelu - elif activation == "gelu_fast": - deprecation_warning( - "--activation-fn=gelu_fast has been renamed to gelu_accurate" - ) - return gelu_accurate - elif activation == "gelu_accurate": - return gelu_accurate - elif activation == "tanh": - return torch.tanh - elif activation == "linear": - return lambda x: x - else: - raise RuntimeError("--activation-fn {} not supported".format(activation)) - - -def get_available_activation_fns() -> List: - return [ - "relu", - "gelu", - "gelu_fast", # deprecated - "gelu_accurate", - "tanh", - "linear", - ] - - -@contextlib.contextmanager -def model_eval(model): - is_training = model.training - model.eval() - yield - model.train(is_training) - - -def has_parameters(module): - try: - next(module.parameters()) - return True - except StopIteration: - return False - - -def set_torch_seed(seed): - # Set seed based on args.seed and the update number so that we get - # reproducible results when resuming from checkpoints - assert isinstance(seed, int) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - - -@contextlib.contextmanager -def with_torch_seed(seed): - assert isinstance(seed, int) - rng_state = torch.get_rng_state() - cuda_rng_state = torch.cuda.get_rng_state() - set_torch_seed(seed) - yield - torch.set_rng_state(rng_state) - torch.cuda.set_rng_state(cuda_rng_state) - - -def parse_alignment(line): - """ - Parses a single line from the alingment file. - - Args: - line (str): String containing the alignment of the format: - - - .. - -. All indices are 0 indexed. - - Returns: - torch.IntTensor: packed alignments of shape (2 * m). 
- """ - alignments = line.strip().split() - parsed_alignment = torch.IntTensor(2 * len(alignments)) - for idx, alignment in enumerate(alignments): - src_idx, tgt_idx = alignment.split("-") - parsed_alignment[2 * idx] = int(src_idx) - parsed_alignment[2 * idx + 1] = int(tgt_idx) - return parsed_alignment - - -def get_token_to_word_mapping(tokens, exclude_list): - n = len(tokens) - word_start = [int(token not in exclude_list) for token in tokens] - word_idx = list(accumulate(word_start)) - token_to_word = {i: word_idx[i] for i in range(n)} - return token_to_word - - -def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos): - tgt_valid = ( - ((tgt_sent != pad) & (tgt_sent != eos)).nonzero(as_tuple=False).squeeze(dim=-1) - ) - src_invalid = ( - ((src_sent == pad) | (src_sent == eos)).nonzero(as_tuple=False).squeeze(dim=-1) - ) - src_token_to_word = get_token_to_word_mapping(src_sent, [eos, pad]) - tgt_token_to_word = get_token_to_word_mapping(tgt_sent, [eos, pad]) - alignment = [] - if len(tgt_valid) != 0 and len(src_invalid) < len(src_sent): - attn_valid = attn[tgt_valid] - attn_valid[:, src_invalid] = float("-inf") - _, src_indices = attn_valid.max(dim=1) - for tgt_idx, src_idx in zip(tgt_valid, src_indices): - alignment.append( - ( - src_token_to_word[src_idx.item()] - 1, - tgt_token_to_word[tgt_idx.item()] - 1, - ) - ) - return alignment - - -def new_arange(x, *size): - """ - Return a Tensor of `size` filled with a range function on the device of x. - If size is empty, using the size of the variable x. - """ - if len(size) == 0: - size = x.size() - return torch.arange(size[-1], device=x.device).expand(*size).contiguous() - - -def get_tpu_device(args): - import torch_xla.core.xla_model as xm - - return xm.xla_device() - - -def tpu_data_loader(itr): - import torch_xla.core.xla_model as xm - import torch_xla.distributed.parallel_loader as pl - - xm.rendezvous("tpu_data_loader") # wait for all workers - xm.mark_step() - device = xm.xla_device() - return iterators.CountingIterator( - pl.ParallelLoader(itr, [device]).per_device_loader(device), - start=getattr(itr, "n", 0), - total=len(itr), - ) - - -class CudaEnvironment(object): - def __init__(self): - cur_device = torch.cuda.current_device() - prop = torch.cuda.get_device_properties("cuda:{}".format(cur_device)) - self.name = prop.name - self.major = prop.major - self.minor = prop.minor - self.total_memory_in_GB = prop.total_memory / 1024 / 1024 / 1024 - - @staticmethod - def pretty_print_cuda_env_list(cuda_env_list): - """ - Given a list of CudaEnviorments, pretty print them - """ - num_workers = len(cuda_env_list) - center = "CUDA enviroments for all {} workers".format(num_workers) - banner_len = 40 - len(center) // 2 - first_line = "*" * banner_len + center + "*" * banner_len - logger.info(first_line) - for r, env in enumerate(cuda_env_list): - logger.info( - "rank {:3d}: ".format(r) - + "capabilities = {:2d}.{:<2d} ; ".format(env.major, env.minor) - + "total memory = {:.3f} GB ; ".format(env.total_memory_in_GB) - + "name = {:40s}".format(env.name) - ) - logger.info(first_line) - - -def csv_str_list(x): - return x.split(",") - - -def eval_str_list(x, type=float): - if x is None: - return None - if isinstance(x, str): - x = eval(x) - try: - return list(map(type, x)) - except TypeError: - return [type(x)] - - -def eval_str_dict(x, type=dict): - if x is None: - return None - if isinstance(x, str): - x = eval(x) - return x - - -def eval_bool(x, default=False): - if x is None: - return default - try: - return bool(eval(x)) - except 
TypeError: - return default diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/eval_lm.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/eval_lm.py deleted file mode 100644 index 9a4ff8ee392f3a15555b74bb690888476c1f36bf..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/eval_lm.py +++ /dev/null @@ -1,279 +0,0 @@ -#!/usr/bin/env python3 -u -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Evaluate the perplexity of a trained language model. -""" - -import logging -import math -import os - -import torch -from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils -from fairseq.data import LMContextWindowDataset -from fairseq.logging import progress_bar -from fairseq.logging.meters import StopwatchMeter, TimeMeter -from fairseq.sequence_scorer import SequenceScorer - - -logging.basicConfig( - format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=os.environ.get("LOGLEVEL", "INFO").upper(), -) -logger = logging.getLogger("fairseq_cli.eval_lm") - - -class WordStat(object): - def __init__(self, word, is_bpe): - self.word = word - self.is_bpe = is_bpe - self.log_prob = 0 - self.next_word_prob = 0 - self.count = 0 - self.missing_next_words = 0 - - def add(self, log_prob, next_word_prob): - """increments counters for the sum of log probs of current word and next - word (given context ending at current word). Since the next word might be at the end of the example, - or it might be not counted because it is not an ending subword unit, - also keeps track of how many of those we have seen""" - if next_word_prob is not None: - self.next_word_prob += next_word_prob - else: - self.missing_next_words += 1 - self.log_prob += log_prob - self.count += 1 - - def __str__(self): - return "{}\t{}\t{}\t{}\t{}\t{}".format( - self.word, - self.count, - self.log_prob, - self.is_bpe, - self.next_word_prob, - self.count - self.missing_next_words, - ) - - -def main(parsed_args, **unused_kwargs): - assert parsed_args.path is not None, "--path required for evaluation!" 
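For orientation, a standalone sketch of the WordStat accumulator defined just above, assuming the class is in scope; the word and log-probabilities here are made up, whereas eval_lm feeds it per-token positional scores:

ws = WordStat("house", is_bpe=False)
ws.add(-1.5, -0.5)  # log-prob of this word, log-prob of the following word
ws.add(-0.5, None)  # next-word probability unavailable (e.g. end of example)
print(ws)           # -> house  2  -2.0  False  -0.5  1  (tab-separated fields)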
- - if torch.cuda.is_available() and not parsed_args.cpu: - torch.cuda.set_device(parsed_args.device_id) - - utils.import_user_module(parsed_args) - - logger.info(parsed_args) - - use_cuda = torch.cuda.is_available() and not parsed_args.cpu - - task = tasks.setup_task(parsed_args) - - # Load ensemble - logger.info("loading model(s) from {}".format(parsed_args.path)) - models, args = checkpoint_utils.load_model_ensemble( - parsed_args.path.split(os.pathsep), - arg_overrides=eval(parsed_args.model_overrides), - task=task, - suffix=getattr(parsed_args, "checkpoint_suffix", ""), - strict=(parsed_args.checkpoint_shard_count == 1), - num_shards=parsed_args.checkpoint_shard_count, - ) - - for arg in vars(parsed_args).keys(): - if arg not in { - "self_target", - "future_target", - "past_target", - "tokens_per_sample", - "output_size_dictionary", - "add_bos_token", - }: - setattr(args, arg, getattr(parsed_args, arg)) - - # reduce tokens per sample by the required context window size - args.tokens_per_sample -= args.context_window - task = tasks.setup_task(args) - - # Load dataset splits - task.load_dataset(args.gen_subset) - dataset = task.dataset(args.gen_subset) - if args.context_window > 0: - dataset = LMContextWindowDataset( - dataset=dataset, - tokens_per_sample=args.tokens_per_sample, - context_window=args.context_window, - pad_idx=task.source_dictionary.pad(), - ) - logger.info("{} {} {} examples".format(args.data, args.gen_subset, len(dataset))) - - # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer) - for model in models: - if args.fp16: - model.half() - if use_cuda and not args.pipeline_model_parallel: - model.cuda() - model.prepare_for_inference_(args) - - assert len(models) > 0 - - logger.info( - "num. 
model params: {}".format(sum(p.numel() for p in models[0].parameters())) - ) - - itr = task.get_batch_iterator( - dataset=dataset, - max_tokens=args.max_tokens or 36000, - max_sentences=args.batch_size, - max_positions=utils.resolve_max_positions( - *[model.max_positions() for model in models] - ), - ignore_invalid_inputs=True, - num_shards=args.num_shards, - shard_id=args.shard_id, - num_workers=args.num_workers, - data_buffer_size=args.data_buffer_size, - ).next_epoch_itr(shuffle=False) - progress = progress_bar.progress_bar( - itr, - log_format=args.log_format, - log_interval=args.log_interval, - default_log_format=("tqdm" if not args.no_progress_bar else "none"), - ) - - gen_timer = StopwatchMeter() - scorer = SequenceScorer(task.target_dictionary, args.softmax_batch) - - score_sum = 0.0 - count = 0 - - if args.remove_bpe is not None: - if args.remove_bpe == "sentencepiece": - raise NotImplementedError - else: - bpe_cont = args.remove_bpe.rstrip() - bpe_toks = { - i - for i in range(len(task.source_dictionary)) - if task.source_dictionary[i].endswith(bpe_cont) - } - bpe_len = len(bpe_cont) - else: - bpe_toks = None - bpe_len = 0 - - word_stats = dict() - - wps_meter = TimeMeter() - - for sample in progress: - if "net_input" not in sample: - continue - - sample = utils.move_to_cuda(sample) if use_cuda else sample - - gen_timer.start() - hypos = scorer.generate(models, sample) - gen_timer.stop(sample["ntokens"]) - - for i, hypos_i in enumerate(hypos): - hypo = hypos_i[0] - sample_id = sample["id"][i] - - tokens = hypo["tokens"] - tgt_len = tokens.numel() - pos_scores = hypo["positional_scores"].float() - - if getattr(args, "add_bos_token", False): - assert hypo["tokens"][0].item() == task.target_dictionary.bos() - tokens = tokens[1:] - pos_scores = pos_scores[1:] - - skipped_toks = 0 - if bpe_toks is not None: - for i in range(tgt_len - 1): - if tokens[i].item() in bpe_toks: - skipped_toks += 1 - pos_scores[i + 1] += pos_scores[i] - pos_scores[i] = 0 - - inf_scores = pos_scores.eq(float("inf")) | pos_scores.eq(float("-inf")) - if inf_scores.any(): - logger.info( - "skipping tokens with inf scores:", - task.target_dictionary.string(tokens[inf_scores.nonzero()]), - ) - pos_scores = pos_scores[(~inf_scores).nonzero()] - score_sum += pos_scores.sum().cpu() - count += pos_scores.numel() - skipped_toks - - if args.output_word_probs or args.output_word_stats: - w = "" - word_prob = [] - is_bpe = False - for i in range(len(tokens)): - w_ind = tokens[i].item() - w += task.source_dictionary[w_ind] - if bpe_toks is not None and w_ind in bpe_toks: - w = w[:-bpe_len] - is_bpe = True - else: - word_prob.append((w, pos_scores[i].item())) - - next_prob = None - ind = i + 1 - while ind < len(tokens): - if pos_scores[ind].item() != 0: - next_prob = pos_scores[ind] - break - ind += 1 - - word_stats.setdefault(w, WordStat(w, is_bpe)).add( - pos_scores[i].item(), next_prob - ) - is_bpe = False - w = "" - if args.output_word_probs: - logger.info( - str(int(sample_id)) - + " " - + ( - "\t".join( - "{} [{:2f}]".format(x[0], x[1]) for x in word_prob - ) - ) - ) - - wps_meter.update(sample["ntokens"]) - progress.log({"wps": round(wps_meter.avg)}) - - avg_nll_loss = -score_sum / count / math.log(2) # convert to base 2 - logger.info( - "Evaluated {} tokens in {:.1f}s ({:.2f} tokens/s)".format( - gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg - ) - ) - logger.info( - "Loss (base 2): {:.4f}, Perplexity: {:.2f}".format( - avg_nll_loss, 2 ** avg_nll_loss - ) - ) - - if args.output_word_stats: - for ws in 
sorted(word_stats.values(), key=lambda x: x.count, reverse=True): - logger.info(ws) - - -def cli_main(): - parser = options.get_eval_lm_parser() - args = options.parse_args_and_arch(parser) - distributed_utils.call_main(args, main) - - -if __name__ == "__main__": - cli_main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/generate.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/generate.py deleted file mode 100644 index 277c5d2de438c322256ff49e66dfb056480aaf70..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/generate.py +++ /dev/null @@ -1,387 +0,0 @@ -#!/usr/bin/env python3 -u -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Translate pre-processed data with a trained model. -""" - -import ast -import logging -import math -import os -import sys -from itertools import chain - -import numpy as np -import torch -from fairseq import checkpoint_utils, options, scoring, tasks, utils -from fairseq.logging import progress_bar -from fairseq.logging.meters import StopwatchMeter, TimeMeter - - -def main(args): - assert args.path is not None, "--path required for generation!" - assert ( - not args.sampling or args.nbest == args.beam - ), "--sampling requires --nbest to be equal to --beam" - assert ( - args.replace_unk is None or args.dataset_impl == "raw" - ), "--replace-unk requires a raw text dataset (--dataset-impl=raw)" - - if args.results_path is not None: - os.makedirs(args.results_path, exist_ok=True) - output_path = os.path.join( - args.results_path, "generate-{}.txt".format(args.gen_subset) - ) - with open(output_path, "w", buffering=1, encoding="utf-8") as h: - return _main(args, h) - else: - return _main(args, sys.stdout) - - -def get_symbols_to_strip_from_output(generator): - if hasattr(generator, "symbols_to_strip_from_output"): - return generator.symbols_to_strip_from_output - else: - return {generator.eos} - - -def _main(args, output_file): - logging.basicConfig( - format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=os.environ.get("LOGLEVEL", "INFO").upper(), - stream=output_file, - ) - logger = logging.getLogger("fairseq_cli.generate") - torch.npu.global_step_inc() - option={} - option['ACL_OP_SELECT_IMPL_MODE'] = 'high_performance' - option['ACL_OPTYPELIST_FOR_IMPLMODE'] = 'LayerNorm' - torch.npu.set_option(option) - utils.import_user_module(args) - - if args.max_tokens is None and args.batch_size is None: - args.max_tokens = 12000 - logger.info(args) - - # Fix seed for stochastic decoding - if args.seed is not None and not args.no_seed_provided: - np.random.seed(args.seed) - utils.set_torch_seed(args.seed) - - use_npu = torch.npu.is_available() and not args.cpu - - # Load dataset splits - task = tasks.setup_task(args) - task.load_dataset(args.gen_subset) - - # Set dictionaries - try: - src_dict = getattr(task, "source_dictionary", None) - except NotImplementedError: - src_dict = None - tgt_dict = task.target_dictionary - - overrides = ast.literal_eval(args.model_overrides) - - # Load ensemble - logger.info("loading model(s) from {}".format(args.path)) - models, _model_args = checkpoint_utils.load_model_ensemble( - utils.split_paths(args.path), - arg_overrides=overrides, - task=task, - suffix=getattr(args, "checkpoint_suffix", ""), 
- strict=(args.checkpoint_shard_count == 1), - num_shards=args.checkpoint_shard_count, - ) - - if args.lm_path is not None: - overrides["data"] = args.data - - try: - lms, _ = checkpoint_utils.load_model_ensemble( - [args.lm_path], - arg_overrides=overrides, - task=None, - ) - except: - logger.warning( - f"Failed to load language model! Please make sure that the language model dict is the same " - f"as target dict and is located in the data dir ({args.data})" - ) - raise - - assert len(lms) == 1 - else: - lms = [None] - - # Optimize ensemble for generation - for model in chain(models, lms): - if model is None: - continue - if args.fp16: - model.half() - if use_npu and not args.pipeline_model_parallel: - model.npu() - model.prepare_for_inference_(args) - - # Load alignment dictionary for unknown word replacement - # (None if no unknown word replacement, empty if no path to align dictionary) - align_dict = utils.load_align_dict(args.replace_unk) - - # Load dataset (possibly sharded) - itr = task.get_batch_iterator( - dataset=task.dataset(args.gen_subset), - max_tokens=args.max_tokens, - max_sentences=args.batch_size, - max_positions=utils.resolve_max_positions( - task.max_positions(), *[model.max_positions() for model in models] - ), - ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, - required_batch_size_multiple=args.required_batch_size_multiple, - num_shards=args.num_shards, - shard_id=args.shard_id, - num_workers=args.num_workers, - data_buffer_size=args.data_buffer_size, - ).next_epoch_itr(shuffle=False) - progress = progress_bar.progress_bar( - itr, - log_format=args.log_format, - log_interval=args.log_interval, - default_log_format=("tqdm" if not args.no_progress_bar else "none"), - ) - - # Initialize generator - gen_timer = StopwatchMeter() - - extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": args.lm_weight} - generator = task.build_generator( - models, args, extra_gen_cls_kwargs=extra_gen_cls_kwargs - ) - - # Handle tokenization and BPE - tokenizer = task.build_tokenizer(args) - bpe = task.build_bpe(args) - - def decode_fn(x): - if bpe is not None: - x = bpe.decode(x) - if tokenizer is not None: - x = tokenizer.decode(x) - return x - - scorer = scoring.build_scorer(args, tgt_dict) - - num_sentences = 0 - has_target = True - wps_meter = TimeMeter() - for sample in progress: - sample = utils.move_to_cuda(sample, 'npu:{}'.format(args.npu_id)) if use_npu else sample - if "net_input" not in sample: - continue - - prefix_tokens = None - if args.prefix_size > 0: - prefix_tokens = sample["target"][:, : args.prefix_size] - - constraints = None - if "constraints" in sample: - constraints = sample["constraints"] - - gen_timer.start() - hypos = task.inference_step( - generator, - models, - sample, - prefix_tokens=prefix_tokens, - constraints=constraints, - ) - num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) - gen_timer.stop(num_generated_tokens) - - for i, sample_id in enumerate(sample["id"].tolist()): - has_target = sample["target"] is not None - - # Remove padding - if "src_tokens" in sample["net_input"]: - src_tokens = utils.strip_pad( - sample["net_input"]["src_tokens"][i, :], tgt_dict.pad() - ) - else: - src_tokens = None - - target_tokens = None - if has_target: - target_tokens = ( - utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu() - ) - - # Either retrieve the original sentences or regenerate them from tokens. 
- if align_dict is not None: - src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id) - target_str = task.dataset(args.gen_subset).tgt.get_original_text( - sample_id - ) - else: - if src_dict is not None: - src_str = src_dict.string(src_tokens, args.remove_bpe) - else: - src_str = "" - if has_target: - target_str = tgt_dict.string( - target_tokens, - args.remove_bpe, - escape_unk=True, - extra_symbols_to_ignore=get_symbols_to_strip_from_output( - generator - ), - ) - - src_str = decode_fn(src_str) - if has_target: - target_str = decode_fn(target_str) - - if not args.quiet: - if src_dict is not None: - print("S-{}\t{}".format(sample_id, src_str), file=output_file) - if has_target: - print("T-{}\t{}".format(sample_id, target_str), file=output_file) - - # Process top predictions - for j, hypo in enumerate(hypos[i][: args.nbest]): - hypo_tokens, hypo_str, alignment = utils.post_process_prediction( - hypo_tokens=hypo["tokens"].int().cpu(), - src_str=src_str, - alignment=hypo["alignment"], - align_dict=align_dict, - tgt_dict=tgt_dict, - remove_bpe=args.remove_bpe, - extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator), - ) - detok_hypo_str = decode_fn(hypo_str) - if not args.quiet: - score = hypo["score"] / math.log(2) # convert to base 2 - # original hypothesis (after tokenization and BPE) - print( - "H-{}\t{}\t{}".format(sample_id, score, hypo_str), - file=output_file, - ) - # detokenized hypothesis - print( - "D-{}\t{}\t{}".format(sample_id, score, detok_hypo_str), - file=output_file, - ) - print( - "P-{}\t{}".format( - sample_id, - " ".join( - map( - lambda x: "{:.4f}".format(x), - # convert from base e to base 2 - hypo["positional_scores"] - .div_(math.log(2)) - .tolist(), - ) - ), - ), - file=output_file, - ) - - if args.print_alignment: - print( - "A-{}\t{}".format( - sample_id, - " ".join( - [ - "{}-{}".format(src_idx, tgt_idx) - for src_idx, tgt_idx in alignment - ] - ), - ), - file=output_file, - ) - - if args.print_step: - print( - "I-{}\t{}".format(sample_id, hypo["steps"]), - file=output_file, - ) - - if getattr(args, "retain_iter_history", False): - for step, h in enumerate(hypo["history"]): - _, h_str, _ = utils.post_process_prediction( - hypo_tokens=h["tokens"].int().cpu(), - src_str=src_str, - alignment=None, - align_dict=None, - tgt_dict=tgt_dict, - remove_bpe=None, - ) - print( - "E-{}_{}\t{}".format(sample_id, step, h_str), - file=output_file, - ) - - # Score only the top hypothesis - if has_target and j == 0: - if align_dict is not None or args.remove_bpe is not None: - # Convert back to tokens for evaluation with unk replacement and/or without BPE - target_tokens = tgt_dict.encode_line( - target_str, add_if_not_exist=True - ) - hypo_tokens = tgt_dict.encode_line( - detok_hypo_str, add_if_not_exist=True - ) - if hasattr(scorer, "add_string"): - scorer.add_string(target_str, detok_hypo_str) - else: - scorer.add(target_tokens, hypo_tokens) - - wps_meter.update(num_generated_tokens) - progress.log({"wps": round(wps_meter.avg)}) - num_sentences += ( - sample["nsentences"] if "nsentences" in sample else sample["id"].numel() - ) - - logger.info("NOTE: hypothesis and token scores are output in base 2") - logger.info( - "Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format( - num_sentences, - gen_timer.n, - gen_timer.sum, - num_sentences / gen_timer.sum, - 1.0 / gen_timer.avg, - ) - ) - if has_target: - if args.bpe and not args.sacrebleu: - if args.remove_bpe: - logger.warning( - "BLEU score is being computed 
by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization" - ) - else: - logger.warning( - "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. Use --sacrebleu for standard 13a BLEU tokenization" - ) - # use print to be consistent with other main outputs: S-, H-, T-, D- and so on - print( - "Generate {} with beam={}: {}".format( - args.gen_subset, args.beam, scorer.result_string() - ), - file=output_file, - ) - - return scorer - - -def cli_main(): - parser = options.get_generation_parser() - args = options.parse_args_and_arch(parser) - main(args) - - -if __name__ == "__main__": - cli_main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/interactive.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/interactive.py deleted file mode 100644 index de3893a385cef9520f6670907408ae54e3986001..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/interactive.py +++ /dev/null @@ -1,311 +0,0 @@ -#!/usr/bin/env python3 -u -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Translate raw text with a trained model. Batches data on-the-fly. -""" - -import fileinput -import logging -import math -import os -import sys -import time -from collections import namedtuple - -import numpy as np -import torch -from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils -from fairseq.data import encoders -from fairseq.token_generation_constraints import pack_constraints, unpack_constraints -from fairseq_cli.generate import get_symbols_to_strip_from_output - - -logging.basicConfig( - format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=os.environ.get("LOGLEVEL", "INFO").upper(), - stream=sys.stdout, -) -logger = logging.getLogger("fairseq_cli.interactive") - - -Batch = namedtuple("Batch", "ids src_tokens src_lengths constraints") -Translation = namedtuple("Translation", "src_str hypos pos_scores alignments") - - -def buffered_read(input, buffer_size): - buffer = [] - with fileinput.input(files=[input], openhook=fileinput.hook_encoded("utf-8")) as h: - for src_str in h: - buffer.append(src_str.strip()) - if len(buffer) >= buffer_size: - yield buffer - buffer = [] - - if len(buffer) > 0: - yield buffer - - -def make_batches(lines, args, task, max_positions, encode_fn): - def encode_fn_target(x): - return encode_fn(x) - - if args.constraints: - # Strip (tab-delimited) contraints, if present, from input lines, - # store them in batch_constraints - batch_constraints = [list() for _ in lines] - for i, line in enumerate(lines): - if "\t" in line: - lines[i], *batch_constraints[i] = line.split("\t") - - # Convert each List[str] to List[Tensor] - for i, constraint_list in enumerate(batch_constraints): - batch_constraints[i] = [ - task.target_dictionary.encode_line( - encode_fn_target(constraint), - append_eos=False, - add_if_not_exist=False, - ) - for constraint in constraint_list - ] - - tokens = [ - task.source_dictionary.encode_line( - encode_fn(src_str), add_if_not_exist=False - ).long() - for src_str in lines - ] - - if args.constraints: - constraints_tensor = pack_constraints(batch_constraints) - else: - constraints_tensor = None - - 
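The tab-delimited constraint stripping earlier in make_batches relies on starred unpacking; a tiny standalone illustration with a hypothetical input line (no fairseq imports required):

line = "a house .\tein Haus\tHaus"
constraints = []
if "\t" in line:
    # first field is the source sentence, remaining fields are constraints
    line, *constraints = line.split("\t")
print(line)         # a house .
print(constraints)  # ['ein Haus', 'Haus']

Lines without a tab keep an empty constraint list, which is why the split is guarded.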
lengths = [t.numel() for t in tokens] - itr = task.get_batch_iterator( - dataset=task.build_dataset_for_inference( - tokens, lengths, constraints=constraints_tensor - ), - max_tokens=args.max_tokens, - max_sentences=args.batch_size, - max_positions=max_positions, - ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, - ).next_epoch_itr(shuffle=False) - for batch in itr: - ids = batch["id"] - src_tokens = batch["net_input"]["src_tokens"] - src_lengths = batch["net_input"]["src_lengths"] - constraints = batch.get("constraints", None) - - yield Batch( - ids=ids, - src_tokens=src_tokens, - src_lengths=src_lengths, - constraints=constraints, - ) - - -def main(args): - start_time = time.time() - total_translate_time = 0 - - utils.import_user_module(args) - - if args.buffer_size < 1: - args.buffer_size = 1 - if args.max_tokens is None and args.batch_size is None: - args.batch_size = 1 - - assert ( - not args.sampling or args.nbest == args.beam - ), "--sampling requires --nbest to be equal to --beam" - assert ( - not args.batch_size or args.batch_size <= args.buffer_size - ), "--batch-size cannot be larger than --buffer-size" - - logger.info(args) - - # Fix seed for stochastic decoding - if args.seed is not None and not args.no_seed_provided: - np.random.seed(args.seed) - utils.set_torch_seed(args.seed) - - use_cuda = torch.cuda.is_available() and not args.cpu - - # Setup task, e.g., translation - task = tasks.setup_task(args) - - # Load ensemble - logger.info("loading model(s) from {}".format(args.path)) - models, _model_args = checkpoint_utils.load_model_ensemble( - args.path.split(os.pathsep), - arg_overrides=eval(args.model_overrides), - task=task, - suffix=getattr(args, "checkpoint_suffix", ""), - strict=(args.checkpoint_shard_count == 1), - num_shards=args.checkpoint_shard_count, - ) - - # Set dictionaries - src_dict = task.source_dictionary - tgt_dict = task.target_dictionary - - # Optimize ensemble for generation - for model in models: - if args.fp16: - model.half() - if use_cuda and not args.pipeline_model_parallel: - model.cuda() - model.prepare_for_inference_(args) - - # Initialize generator - generator = task.build_generator(models, args) - - # Handle tokenization and BPE - tokenizer = encoders.build_tokenizer(args) - bpe = encoders.build_bpe(args) - - def encode_fn(x): - if tokenizer is not None: - x = tokenizer.encode(x) - if bpe is not None: - x = bpe.encode(x) - return x - - def decode_fn(x): - if bpe is not None: - x = bpe.decode(x) - if tokenizer is not None: - x = tokenizer.decode(x) - return x - - # Load alignment dictionary for unknown word replacement - # (None if no unknown word replacement, empty if no path to align dictionary) - align_dict = utils.load_align_dict(args.replace_unk) - - max_positions = utils.resolve_max_positions( - task.max_positions(), *[model.max_positions() for model in models] - ) - - if args.constraints: - logger.warning( - "NOTE: Constrained decoding currently assumes a shared subword vocabulary." 
- ) - - if args.buffer_size > 1: - logger.info("Sentence buffer size: %s", args.buffer_size) - logger.info("NOTE: hypothesis and token scores are output in base 2") - logger.info("Type the input sentence and press return:") - start_id = 0 - for inputs in buffered_read(args.input, args.buffer_size): - results = [] - for batch in make_batches(inputs, args, task, max_positions, encode_fn): - bsz = batch.src_tokens.size(0) - src_tokens = batch.src_tokens - src_lengths = batch.src_lengths - constraints = batch.constraints - if use_cuda: - src_tokens = src_tokens.cuda() - src_lengths = src_lengths.cuda() - if constraints is not None: - constraints = constraints.cuda() - - sample = { - "net_input": { - "src_tokens": src_tokens, - "src_lengths": src_lengths, - }, - } - translate_start_time = time.time() - translations = task.inference_step( - generator, models, sample, constraints=constraints - ) - translate_time = time.time() - translate_start_time - total_translate_time += translate_time - list_constraints = [[] for _ in range(bsz)] - if args.constraints: - list_constraints = [unpack_constraints(c) for c in constraints] - for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)): - src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad()) - constraints = list_constraints[i] - results.append( - ( - start_id + id, - src_tokens_i, - hypos, - { - "constraints": constraints, - "time": translate_time / len(translations), - }, - ) - ) - - # sort output to match input order - for id_, src_tokens, hypos, info in sorted(results, key=lambda x: x[0]): - if src_dict is not None: - src_str = src_dict.string(src_tokens, args.remove_bpe) - print("S-{}\t{}".format(id_, src_str)) - print("W-{}\t{:.3f}\tseconds".format(id_, info["time"])) - for constraint in info["constraints"]: - print( - "C-{}\t{}".format( - id_, tgt_dict.string(constraint, args.remove_bpe) - ) - ) - - # Process top predictions - for hypo in hypos[: min(len(hypos), args.nbest)]: - hypo_tokens, hypo_str, alignment = utils.post_process_prediction( - hypo_tokens=hypo["tokens"].int().cpu(), - src_str=src_str, - alignment=hypo["alignment"], - align_dict=align_dict, - tgt_dict=tgt_dict, - remove_bpe=args.remove_bpe, - extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator), - ) - detok_hypo_str = decode_fn(hypo_str) - score = hypo["score"] / math.log(2) # convert to base 2 - # original hypothesis (after tokenization and BPE) - print("H-{}\t{}\t{}".format(id_, score, hypo_str)) - # detokenized hypothesis - print("D-{}\t{}\t{}".format(id_, score, detok_hypo_str)) - print( - "P-{}\t{}".format( - id_, - " ".join( - map( - lambda x: "{:.4f}".format(x), - # convert from base e to base 2 - hypo["positional_scores"].div_(math.log(2)).tolist(), - ) - ), - ) - ) - if args.print_alignment: - alignment_str = " ".join( - ["{}-{}".format(src, tgt) for src, tgt in alignment] - ) - print("A-{}\t{}".format(id_, alignment_str)) - - # update running id_ counter - start_id += len(inputs) - - logger.info( - "Total time: {:.3f} seconds; translation time: {:.3f}".format( - time.time() - start_time, total_translate_time - ) - ) - - -def cli_main(): - parser = options.get_interactive_generation_parser() - args = options.parse_args_and_arch(parser) - distributed_utils.call_main(args, main) - - -if __name__ == "__main__": - cli_main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/preprocess.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/preprocess.py deleted 
file mode 100644 index fa77da8dba74e3e07cadfc66abf8fb5fe7bddd6c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/preprocess.py +++ /dev/null @@ -1,398 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Data pre-processing: build vocabularies and binarize training data. -""" - -import logging -import os -import shutil -import sys -from collections import Counter -from itertools import zip_longest -from multiprocessing import Pool - -from fairseq import options, tasks, utils -from fairseq.binarizer import Binarizer -from fairseq.data import indexed_dataset - - -logging.basicConfig( - format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=os.environ.get("LOGLEVEL", "INFO").upper(), - stream=sys.stdout, -) -logger = logging.getLogger("fairseq_cli.preprocess") - - -def main(args): - utils.import_user_module(args) - - os.makedirs(args.destdir, exist_ok=True) - - logger.addHandler( - logging.FileHandler( - filename=os.path.join(args.destdir, "preprocess.log"), - ) - ) - logger.info(args) - - task = tasks.get_task(args.task) - - def train_path(lang): - return "{}{}".format(args.trainpref, ("." + lang) if lang else "") - - def file_name(prefix, lang): - fname = prefix - if lang is not None: - fname += ".{lang}".format(lang=lang) - return fname - - def dest_path(prefix, lang): - return os.path.join(args.destdir, file_name(prefix, lang)) - - def dict_path(lang): - return dest_path("dict", lang) + ".txt" - - def build_dictionary(filenames, src=False, tgt=False): - assert src ^ tgt - return task.build_dictionary( - filenames, - workers=args.workers, - threshold=args.thresholdsrc if src else args.thresholdtgt, - nwords=args.nwordssrc if src else args.nwordstgt, - padding_factor=args.padding_factor, - ) - - target = not args.only_source - - if not args.srcdict and os.path.exists(dict_path(args.source_lang)): - raise FileExistsError(dict_path(args.source_lang)) - if target and not args.tgtdict and os.path.exists(dict_path(args.target_lang)): - raise FileExistsError(dict_path(args.target_lang)) - - if args.joined_dictionary: - assert ( - not args.srcdict or not args.tgtdict - ), "cannot use both --srcdict and --tgtdict with --joined-dictionary" - - if args.srcdict: - src_dict = task.load_dictionary(args.srcdict) - elif args.tgtdict: - src_dict = task.load_dictionary(args.tgtdict) - else: - assert ( - args.trainpref - ), "--trainpref must be set if --srcdict is not specified" - src_dict = build_dictionary( - {train_path(lang) for lang in [args.source_lang, args.target_lang]}, - src=True, - ) - tgt_dict = src_dict - else: - if args.srcdict: - src_dict = task.load_dictionary(args.srcdict) - else: - assert ( - args.trainpref - ), "--trainpref must be set if --srcdict is not specified" - src_dict = build_dictionary([train_path(args.source_lang)], src=True) - - if target: - if args.tgtdict: - tgt_dict = task.load_dictionary(args.tgtdict) - else: - assert ( - args.trainpref - ), "--trainpref must be set if --tgtdict is not specified" - tgt_dict = build_dictionary([train_path(args.target_lang)], tgt=True) - else: - tgt_dict = None - - src_dict.save(dict_path(args.source_lang)) - if target and tgt_dict is not None: - tgt_dict.save(dict_path(args.target_lang)) - - def make_binary_dataset(vocab, input_prefix, output_prefix, 
lang, num_workers): - logger.info("[{}] Dictionary: {} types".format(lang, len(vocab))) - n_seq_tok = [0, 0] - replaced = Counter() - - def merge_result(worker_result): - replaced.update(worker_result["replaced"]) - n_seq_tok[0] += worker_result["nseq"] - n_seq_tok[1] += worker_result["ntok"] - - input_file = "{}{}".format( - input_prefix, ("." + lang) if lang is not None else "" - ) - offsets = Binarizer.find_offsets(input_file, num_workers) - pool = None - if num_workers > 1: - pool = Pool(processes=num_workers - 1) - for worker_id in range(1, num_workers): - prefix = "{}{}".format(output_prefix, worker_id) - pool.apply_async( - binarize, - ( - args, - input_file, - vocab, - prefix, - lang, - offsets[worker_id], - offsets[worker_id + 1], - ), - callback=merge_result, - ) - pool.close() - - ds = indexed_dataset.make_builder( - dataset_dest_file(args, output_prefix, lang, "bin"), - impl=args.dataset_impl, - vocab_size=len(vocab), - ) - merge_result( - Binarizer.binarize( - input_file, vocab, lambda t: ds.add_item(t), offset=0, end=offsets[1] - ) - ) - if num_workers > 1: - pool.join() - for worker_id in range(1, num_workers): - prefix = "{}{}".format(output_prefix, worker_id) - temp_file_path = dataset_dest_prefix(args, prefix, lang) - ds.merge_file_(temp_file_path) - os.remove(indexed_dataset.data_file_path(temp_file_path)) - os.remove(indexed_dataset.index_file_path(temp_file_path)) - - ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx")) - - logger.info( - "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format( - lang, - input_file, - n_seq_tok[0], - n_seq_tok[1], - 100 * sum(replaced.values()) / n_seq_tok[1], - vocab.unk_word, - ) - ) - - def make_binary_alignment_dataset(input_prefix, output_prefix, num_workers): - nseq = [0] - - def merge_result(worker_result): - nseq[0] += worker_result["nseq"] - - input_file = input_prefix - offsets = Binarizer.find_offsets(input_file, num_workers) - pool = None - if num_workers > 1: - pool = Pool(processes=num_workers - 1) - for worker_id in range(1, num_workers): - prefix = "{}{}".format(output_prefix, worker_id) - pool.apply_async( - binarize_alignments, - ( - args, - input_file, - utils.parse_alignment, - prefix, - offsets[worker_id], - offsets[worker_id + 1], - ), - callback=merge_result, - ) - pool.close() - - ds = indexed_dataset.make_builder( - dataset_dest_file(args, output_prefix, None, "bin"), impl=args.dataset_impl - ) - - merge_result( - Binarizer.binarize_alignments( - input_file, - utils.parse_alignment, - lambda t: ds.add_item(t), - offset=0, - end=offsets[1], - ) - ) - if num_workers > 1: - pool.join() - for worker_id in range(1, num_workers): - prefix = "{}{}".format(output_prefix, worker_id) - temp_file_path = dataset_dest_prefix(args, prefix, None) - ds.merge_file_(temp_file_path) - os.remove(indexed_dataset.data_file_path(temp_file_path)) - os.remove(indexed_dataset.index_file_path(temp_file_path)) - - ds.finalize(dataset_dest_file(args, output_prefix, None, "idx")) - - logger.info("[alignments] {}: parsed {} alignments".format(input_file, nseq[0])) - - def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1): - if args.dataset_impl == "raw": - # Copy original text file to destination folder - output_text_file = dest_path( - output_prefix + ".{}-{}".format(args.source_lang, args.target_lang), - lang, - ) - shutil.copyfile(file_name(input_prefix, lang), output_text_file) - else: - make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers) - - def make_all(lang, vocab): - if 
args.trainpref: - make_dataset(vocab, args.trainpref, "train", lang, num_workers=args.workers) - if args.validpref: - for k, validpref in enumerate(args.validpref.split(",")): - outprefix = "valid{}".format(k) if k > 0 else "valid" - make_dataset( - vocab, validpref, outprefix, lang, num_workers=args.workers - ) - if args.testpref: - for k, testpref in enumerate(args.testpref.split(",")): - outprefix = "test{}".format(k) if k > 0 else "test" - make_dataset(vocab, testpref, outprefix, lang, num_workers=args.workers) - - def make_all_alignments(): - if args.trainpref and os.path.exists(args.trainpref + "." + args.align_suffix): - make_binary_alignment_dataset( - args.trainpref + "." + args.align_suffix, - "train.align", - num_workers=args.workers, - ) - if args.validpref and os.path.exists(args.validpref + "." + args.align_suffix): - make_binary_alignment_dataset( - args.validpref + "." + args.align_suffix, - "valid.align", - num_workers=args.workers, - ) - if args.testpref and os.path.exists(args.testpref + "." + args.align_suffix): - make_binary_alignment_dataset( - args.testpref + "." + args.align_suffix, - "test.align", - num_workers=args.workers, - ) - - make_all(args.source_lang, src_dict) - if target: - make_all(args.target_lang, tgt_dict) - if args.align_suffix: - make_all_alignments() - - logger.info("Wrote preprocessed data to {}".format(args.destdir)) - - if args.alignfile: - assert args.trainpref, "--trainpref must be set if --alignfile is specified" - src_file_name = train_path(args.source_lang) - tgt_file_name = train_path(args.target_lang) - freq_map = {} - with open(args.alignfile, "r", encoding="utf-8") as align_file: - with open(src_file_name, "r", encoding="utf-8") as src_file: - with open(tgt_file_name, "r", encoding="utf-8") as tgt_file: - for a, s, t in zip_longest(align_file, src_file, tgt_file): - si = src_dict.encode_line(s, add_if_not_exist=False) - ti = tgt_dict.encode_line(t, add_if_not_exist=False) - ai = list(map(lambda x: tuple(x.split("-")), a.split())) - for sai, tai in ai: - srcidx = si[int(sai)] - tgtidx = ti[int(tai)] - if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk(): - assert srcidx != src_dict.pad() - assert srcidx != src_dict.eos() - assert tgtidx != tgt_dict.pad() - assert tgtidx != tgt_dict.eos() - - if srcidx not in freq_map: - freq_map[srcidx] = {} - if tgtidx not in freq_map[srcidx]: - freq_map[srcidx][tgtidx] = 1 - else: - freq_map[srcidx][tgtidx] += 1 - - align_dict = {} - for srcidx in freq_map.keys(): - align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get) - - with open( - os.path.join( - args.destdir, - "alignment.{}-{}.txt".format(args.source_lang, args.target_lang), - ), - "w", - encoding="utf-8", - ) as f: - for k, v in align_dict.items(): - print("{} {}".format(src_dict[k], tgt_dict[v]), file=f) - - -def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True): - ds = indexed_dataset.make_builder( - dataset_dest_file(args, output_prefix, lang, "bin"), - impl=args.dataset_impl, - vocab_size=len(vocab), - ) - - def consumer(tensor): - ds.add_item(tensor) - - res = Binarizer.binarize( - filename, vocab, consumer, append_eos=append_eos, offset=offset, end=end - ) - ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx")) - return res - - -def binarize_alignments(args, filename, parse_alignment, output_prefix, offset, end): - ds = indexed_dataset.make_builder( - dataset_dest_file(args, output_prefix, None, "bin"), - impl=args.dataset_impl, - vocab_size=None, - ) - - def consumer(tensor): 
- ds.add_item(tensor) - - res = Binarizer.binarize_alignments( - filename, parse_alignment, consumer, offset=offset, end=end - ) - ds.finalize(dataset_dest_file(args, output_prefix, None, "idx")) - return res - - -def dataset_dest_prefix(args, output_prefix, lang): - base = "{}/{}".format(args.destdir, output_prefix) - if lang is not None: - lang_part = ".{}-{}.{}".format(args.source_lang, args.target_lang, lang) - elif args.only_source: - lang_part = "" - else: - lang_part = ".{}-{}".format(args.source_lang, args.target_lang) - - return "{}{}".format(base, lang_part) - - -def dataset_dest_file(args, output_prefix, lang, extension): - base = dataset_dest_prefix(args, output_prefix, lang) - return "{}.{}".format(base, extension) - - -def get_offsets(input_file, num_workers): - return Binarizer.find_offsets(input_file, num_workers) - - -def cli_main(): - parser = options.get_preprocessing_parser() - args = parser.parse_args() - main(args) - - -if __name__ == "__main__": - cli_main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/score.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/score.py deleted file mode 100644 index b8354eb95a8b786c0e21b8dfe777f36af6f261a3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/score.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -BLEU scoring of generated translations against reference translations. -""" - -import argparse -import os -import sys - -from fairseq.data import dictionary -from fairseq.scoring import bleu - - -def get_parser(): - parser = argparse.ArgumentParser( - description="Command-line script for BLEU scoring." 
- ) - # fmt: off - parser.add_argument('-s', '--sys', default='-', help='system output') - parser.add_argument('-r', '--ref', required=True, help='references') - parser.add_argument('-o', '--order', default=4, metavar='N', - type=int, help='consider ngrams up to this order') - parser.add_argument('--ignore-case', action='store_true', - help='case-insensitive scoring') - parser.add_argument('--sacrebleu', action='store_true', - help='score with sacrebleu') - parser.add_argument('--sentence-bleu', action='store_true', - help='report sentence-level BLEUs (i.e., with +1 smoothing)') - # fmt: on - return parser - - -def cli_main(): - parser = get_parser() - args = parser.parse_args() - print(args) - - assert args.sys == "-" or os.path.exists( - args.sys - ), "System output file {} does not exist".format(args.sys) - assert os.path.exists(args.ref), "Reference file {} does not exist".format(args.ref) - - dict = dictionary.Dictionary() - - def readlines(fd): - for line in fd.readlines(): - if args.ignore_case: - yield line.lower() - else: - yield line - - if args.sacrebleu: - import sacrebleu - - def score(fdsys): - with open(args.ref) as fdref: - print(sacrebleu.corpus_bleu(fdsys, [fdref])) - - elif args.sentence_bleu: - - def score(fdsys): - with open(args.ref) as fdref: - scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk()) - for i, (sys_tok, ref_tok) in enumerate( - zip(readlines(fdsys), readlines(fdref)) - ): - scorer.reset(one_init=True) - sys_tok = dict.encode_line(sys_tok) - ref_tok = dict.encode_line(ref_tok) - scorer.add(ref_tok, sys_tok) - print(i, scorer.result_string(args.order)) - - else: - - def score(fdsys): - with open(args.ref) as fdref: - scorer = bleu.Scorer(dict.pad(), dict.eos(), dict.unk()) - for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)): - sys_tok = dict.encode_line(sys_tok) - ref_tok = dict.encode_line(ref_tok) - scorer.add(ref_tok, sys_tok) - print(scorer.result_string(args.order)) - - if args.sys == "-": - score(sys.stdin) - else: - with open(args.sys, "r") as f: - score(f) - - -if __name__ == "__main__": - cli_main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/train.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/train.py deleted file mode 100644 index eb4e45c3a14b26a24b1cef6b355eebc25b56924e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/train.py +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env python3 -u -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Train a new model on one or across multiple GPUs. 
-""" - -import argparse -import logging -import math -import os -import random -import sys - -import numpy as np -import torch -from fairseq import ( - checkpoint_utils, - distributed_utils, - options, - quantization_utils, - tasks, - utils, -) -from fairseq.data import iterators -from fairseq.logging import meters, metrics, progress_bar -from fairseq.model_parallel.megatron_trainer import MegatronTrainer -from fairseq.trainer import Trainer -from fairseq.modules.multihead_attention import MHAConfig - - -logging.basicConfig( - format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=os.environ.get("LOGLEVEL", "INFO").upper(), - stream=sys.stdout, -) -logger = logging.getLogger("fairseq_cli.train") - - -def main(args): - utils.import_user_module(args) - option = {} - option['ACL_OP_SELECT_IMPL_MODE'] = 'high_performance' - option['ACL_OPTYPELIST_FOR_IMPLMODE'] = 'LayerNorm' - torch.npu.set_option(option) - if args.distributed_world_size == 1: - torch.npu.set_device('npu:{}'.format(args.npu_id)) - assert ( - args.max_tokens is not None or args.batch_size is not None - ), "Must specify batch size either with --max-tokens or --batch-size" - - metrics.reset() - - np.random.seed(args.seed) - utils.set_torch_seed(args.seed) - - if distributed_utils.is_master(args): - checkpoint_utils.verify_checkpoint_directory(args.save_dir) - - # Print args - logger.info(args) - - # Setup task, e.g., translation, language modeling, etc. - task = tasks.setup_task(args) - - # Load valid dataset (we load training data below, based on the latest checkpoint) - for valid_sub_split in args.valid_subset.split(","): - task.load_dataset(valid_sub_split, combine=False, epoch=1) - - # Build model and criterion - model = task.build_model(args) - criterion = task.build_criterion(args) - logger.info(model) - logger.info("task: {} ({})".format(args.task, task.__class__.__name__)) - logger.info("model: {} ({})".format(args.arch, model.__class__.__name__)) - logger.info( - "criterion: {} ({})".format(args.criterion, criterion.__class__.__name__) - ) - logger.info( - "num. model params: {} (num. 
trained: {})".format( - sum(p.numel() for p in model.parameters()), - sum(p.numel() for p in model.parameters() if p.requires_grad), - ) - ) - - # (optionally) Configure quantization - if args.quantization_config_path is not None: - quantizer = quantization_utils.Quantizer( - config_path=args.quantization_config_path, - max_epoch=args.max_epoch, - max_update=args.max_update, - ) - else: - quantizer = None - - # Build trainer - if args.model_parallel_size == 1: - trainer = Trainer(args, task, model, criterion, quantizer) - else: - trainer = MegatronTrainer(args, task, model, criterion) - - logger.info( - "training on {} devices (GPUs/TPUs)".format(args.distributed_world_size) - ) - logger.info( - "max tokens per GPU = {} and max sentences per GPU = {}".format( - args.max_tokens, args.batch_size - ) - ) - - # Load the latest checkpoint if one is available and restore the - # corresponding train iterator - extra_state, epoch_itr = checkpoint_utils.load_checkpoint( - args, - trainer, - # don't cache epoch iterators for sharded datasets - disable_iterator_cache=task.has_sharded_data("train"), - ) - - # Train until the learning rate gets too small - max_epoch = args.max_epoch or math.inf - lr = trainer.get_lr() - train_meter = meters.StopwatchMeter() - train_meter.start() - - while lr > args.min_lr and epoch_itr.next_epoch_idx <= max_epoch: - # train for one epoch - valid_losses, should_stop = train(args, trainer, task, epoch_itr) - if should_stop: - break - - # only use first validation loss to update the learning rate - lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) - - epoch_itr = trainer.get_train_iterator( - epoch_itr.next_epoch_idx, - # sharded data: get train iterator for next epoch - load_dataset=task.has_sharded_data("train"), - # don't cache epoch iterators for sharded datasets - disable_iterator_cache=task.has_sharded_data("train"), - ) - train_meter.stop() - logger.info("done training in {:.1f} seconds".format(train_meter.sum)) - - -def should_stop_early(args, valid_loss): - # skip check if no validation was done in the current epoch - if valid_loss is None: - return False - if args.patience <= 0: - return False - - def is_better(a, b): - return a > b if args.maximize_best_checkpoint_metric else a < b - - prev_best = getattr(should_stop_early, "best", None) - if prev_best is None or is_better(valid_loss, prev_best): - should_stop_early.best = valid_loss - should_stop_early.num_runs = 0 - return False - else: - should_stop_early.num_runs += 1 - if should_stop_early.num_runs >= args.patience: - logger.info( - "early stop since valid performance hasn't improved for last {} runs".format( - args.patience - ) - ) - return True - else: - return False - -def wrapper_model_all_reduce(model, fp_16_grads, reduce_stream): - from change_data_ptr import change_data_ptr - import torch.distributed as dist - total_param_size = 0 - for name, para in model.named_parameters(): - if name == "module.encoder.embed_tokens.weight": - continue - total_param_size += para.storage().size() - - target_para_size_list = [] - tmp_size = 0 - name_dict = dict() - name_order = 0 - for name, para in model.named_parameters(): - if name == "module.necoder.embed_tokens.weight": - target_para_size_list.append(para.storage().size()) - name_dict[name] =name_order - name_order += 1 - continue - tmp_size += para.storage().size() - name_dict[name] = name_order - if tmp_size > total_param_size // 8: - target_para_size_list.append(tmp_size) - tmp_size = 0 - name_order += 1 - target_para_size_list.append(tmp_size) - 
partial_combined_grad_list = [] - idx = 0 - for ss in target_para_size_list: - tmp_tensor = torch.zeros(ss).half().npu() - for device in fp_16_grads: - change_data_ptr(tmp_tensor, fp_16_grads[device], idx) - partial_combined_grad_list.append(tmp_tensor) - idx += ss - - target_para_size_list = [pp *2 for pp in target_para_size_list] - current_para_size_list = [0] *(len(target_para_size_list)) - ready_reduce_index = [] - - def hook_func(name, target_para_size_list, current_para_size_list, name_dict, reduce_stream, partial_combined_grad_list, ready_reduce_index): - def hook_function(grad): - if ready_reduce_index: - index = ready_reduce_index.pop() - current_para_size_list[index] = 0 - with torch.npu.stream(reduce_stream): - partial_combined_grad_list[index].div_(8) - dist.all_reduce(partial_combined_grad_list[index]) - - current_para_size_list[name_dict[name]] += grad.storage().size() - for i in range(len(current_para_size_list)): - if current_para_size_list[i] == target_para_size_list[i] and i != 0: - ready_reduce_index.append(i) - return - return hook_function - - for name, para in model.named_parameters(): - para.register_hook(hook_func(name, target_para_size_list, current_para_size_list, name_dict, reduce_stream, partial_combined_grad_list, ready_reduce_index)) - - return partial_combined_grad_list[0] - - -@metrics.aggregate("train") -def train(args, trainer, task, epoch_itr): - """Train the model for one epoch and return validation losses.""" - # Initialize data iterator - itr = epoch_itr.next_epoch_itr( - fix_batches_to_gpus=args.fix_batches_to_gpus, - shuffle=(epoch_itr.next_epoch_idx > args.curriculum), - ) - update_freq = ( - args.update_freq[epoch_itr.epoch - 1] - if epoch_itr.epoch <= len(args.update_freq) - else args.update_freq[-1] - ) - itr = iterators.GroupedIterator(itr, update_freq) - if getattr(args, "tpu", False): - itr = utils.tpu_data_loader(itr) - progress = progress_bar.progress_bar( - itr, - log_format=args.log_format, - log_interval=args.log_interval, - epoch=epoch_itr.epoch, - tensorboard_logdir=( - args.tensorboard_logdir if distributed_utils.is_master(args) else None - ), - default_log_format=("tqdm" if not args.no_progress_bar else "simple"), - ) - - trainer.begin_epoch(epoch_itr.epoch) - - valid_losses = [None] - valid_subsets = args.valid_subset.split(",") - should_stop = False - num_updates = trainer.get_num_updates() - visited = False - MHAConfig.set_fussion() - for i, samples in enumerate(progress): - with metrics.aggregate("train_inner"), torch.autograd.profiler.record_function( - "train_step-%d" % i - ): - log_output = trainer.train_step(samples) - if hasattr(trainer.model, "all_reduce") and (trainer.optimizer.fp16_tmp_grads is not None) and (not visited) and (epoch_itr.epoch <= 1): - trainer.first_grad = wrapper_model_all_reduce(trainer.model, trainer.optimizer.fp16_tmp_grads, trainer.reduce_stream) - visited = True - - if log_output is not None: # not OOM, overflow, ... 
- # log mid-epoch stats - num_updates = trainer.get_num_updates() - if num_updates % args.log_interval == 0: - stats = get_training_stats(metrics.get_smoothed_values("train_inner")) - progress.log(stats, tag="train_inner", step=num_updates) - - # reset mid-epoch stats after each log interval - # the end-of-epoch stats will still be preserved - metrics.reset_meters("train_inner") - - end_of_epoch = not itr.has_next() - valid_losses, should_stop = validate_and_save( - args, trainer, task, epoch_itr, valid_subsets, end_of_epoch - ) - - if should_stop: - break - - # log end-of-epoch stats - logger.info("end of epoch {} (average epoch stats below)".format(epoch_itr.epoch)) - stats = get_training_stats(metrics.get_smoothed_values("train")) - progress.print(stats, tag="train", step=num_updates) - - # reset epoch-level meters - metrics.reset_meters("train") - return valid_losses, should_stop - - -def validate_and_save(args, trainer, task, epoch_itr, valid_subsets, end_of_epoch): - num_updates = trainer.get_num_updates() - max_update = args.max_update or math.inf - do_save = ( - (end_of_epoch and epoch_itr.epoch % args.save_interval == 0) - or num_updates >= max_update - or ( - args.save_interval_updates > 0 - and num_updates > 0 - and num_updates % args.save_interval_updates == 0 - and num_updates >= args.validate_after_updates - ) - ) - do_validate = ( - (not end_of_epoch and do_save) # validate during mid-epoch saves - or (end_of_epoch and epoch_itr.epoch % args.validate_interval == 0) - or num_updates >= max_update - or ( - args.validate_interval_updates > 0 - and num_updates > 0 - and num_updates % args.validate_interval_updates == 0 - ) - ) and not args.disable_validation - - # Validate - valid_losses = [None] - if do_validate: - valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets) - - # Stopping conditions - should_stop = ( - should_stop_early(args, valid_losses[0]) - or num_updates >= max_update - or ( - args.stop_time_hours > 0 - and trainer.cumulative_training_time() / (60 * 60) > args.stop_time_hours - ) - ) - - # Save checkpoint - if do_save or should_stop: - logger.info("begin save checkpoint") - checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0]) - - return valid_losses, should_stop - - -def get_training_stats(stats): - stats["wall"] = round(metrics.get_meter("default", "wall").elapsed_time, 0) - return stats - - -def validate(args, trainer, task, epoch_itr, subsets): - """Evaluate the model on the validation set(s) and return the losses.""" - - if args.fixed_validation_seed is not None: - # set fixed seed for every validation - utils.set_torch_seed(args.fixed_validation_seed) - - trainer.begin_valid_epoch(epoch_itr.epoch) - valid_losses = [] - for subset in subsets: - logger.info('begin validation on "{}" subset'.format(subset)) - - # Initialize data iterator - itr = trainer.get_valid_iterator(subset).next_epoch_itr(shuffle=False) - if getattr(args, "tpu", False): - itr = utils.tpu_data_loader(itr) - progress = progress_bar.progress_bar( - itr, - log_format=args.log_format, - log_interval=args.log_interval, - epoch=epoch_itr.epoch, - prefix=f"valid on '{subset}' subset", - tensorboard_logdir=( - args.tensorboard_logdir if distributed_utils.is_master(args) else None - ), - default_log_format=("tqdm" if not args.no_progress_bar else "simple"), - ) - - # create a new root metrics aggregator so validation metrics - # don't pollute other aggregators (e.g., train meters) - with metrics.aggregate(new_root=True) as agg: - for sample in progress: - 
trainer.valid_step(sample) - - # log validation stats - stats = get_valid_stats(args, trainer, agg.get_smoothed_values()) - progress.print(stats, tag=subset, step=trainer.get_num_updates()) - - valid_losses.append(stats[args.best_checkpoint_metric]) - return valid_losses - - -def get_valid_stats(args, trainer, stats): - stats["num_updates"] = trainer.get_num_updates() - if hasattr(checkpoint_utils.save_checkpoint, "best"): - key = "best_{0}".format(args.best_checkpoint_metric) - best_function = max if args.maximize_best_checkpoint_metric else min - stats[key] = best_function( - checkpoint_utils.save_checkpoint.best, stats[args.best_checkpoint_metric] - ) - return stats - - -def cli_main(modify_parser=None): - parser = options.get_training_parser() - args = options.parse_args_and_arch(parser, modify_parser=modify_parser) - if args.profile: - with torch.cuda.profiler.profile(): - with torch.autograd.profiler.emit_nvtx(): - distributed_utils.call_main(args, main) - else: - distributed_utils.call_main(args, main) - - -if __name__ == "__main__": - cli_main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/validate.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/validate.py deleted file mode 100644 index df857550d13f3322f03851c70ed268384116abd3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/fairseq_cli/validate.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python3 -u -#!/usr/bin/env python3 -u -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import logging -import os -import sys -from itertools import chain - -import torch -from fairseq import checkpoint_utils, distributed_utils, options, utils -from fairseq.logging import metrics, progress_bar - - -logging.basicConfig( - format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - level=os.environ.get("LOGLEVEL", "INFO").upper(), - stream=sys.stdout, -) -logger = logging.getLogger("fairseq_cli.validate") - - -def main(args, override_args=None): - utils.import_user_module(args) - - assert ( - args.max_tokens is not None or args.batch_size is not None - ), "Must specify batch size either with --max-tokens or --batch-size" - - use_fp16 = args.fp16 - use_cuda = torch.cuda.is_available() and not args.cpu - - if use_cuda: - torch.cuda.set_device(args.device_id) - - if override_args is not None: - overrides = vars(override_args) - overrides.update(eval(getattr(override_args, "model_overrides", "{}"))) - else: - overrides = None - - # Load ensemble - logger.info("loading model(s) from {}".format(args.path)) - models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( - [args.path], - arg_overrides=overrides, - suffix=getattr(args, "checkpoint_suffix", ""), - ) - model = models[0] - - # Move models to GPU - for model in models: - if use_fp16: - model.half() - if use_cuda: - model.cuda() - - # Print args - logger.info(model_args) - - # Build criterion - criterion = task.build_criterion(model_args) - criterion.eval() - - for subset in args.valid_subset.split(","): - try: - task.load_dataset(subset, combine=False, epoch=1) - dataset = task.dataset(subset) - except KeyError: - raise Exception("Cannot find dataset: " + subset) - - # Initialize data iterator - itr = task.get_batch_iterator( - dataset=dataset, - 
max_tokens=args.max_tokens, - max_sentences=args.batch_size, - max_positions=utils.resolve_max_positions( - task.max_positions(), - *[m.max_positions() for m in models], - ), - ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, - required_batch_size_multiple=args.required_batch_size_multiple, - seed=args.seed, - num_shards=args.distributed_world_size, - shard_id=args.distributed_rank, - num_workers=args.num_workers, - data_buffer_size=args.data_buffer_size, - ).next_epoch_itr(shuffle=False) - progress = progress_bar.progress_bar( - itr, - log_format=args.log_format, - log_interval=args.log_interval, - prefix=f"valid on '{subset}' subset", - default_log_format=("tqdm" if not args.no_progress_bar else "simple"), - ) - - log_outputs = [] - for i, sample in enumerate(progress): - sample = utils.move_to_cuda(sample) if use_cuda else sample - _loss, _sample_size, log_output = task.valid_step(sample, model, criterion) - progress.log(log_output, step=i) - log_outputs.append(log_output) - - if args.distributed_world_size > 1: - log_outputs = distributed_utils.all_gather_list( - log_outputs, - max_size=getattr(args, "all_gather_list_size", 16384), - ) - log_outputs = list(chain.from_iterable(log_outputs)) - - with metrics.aggregate() as agg: - task.reduce_metrics(log_outputs, criterion) - log_output = agg.get_smoothed_values() - - progress.print(log_output, tag=subset, step=i) - - -def cli_main(): - parser = options.get_validation_parser() - args = options.parse_args_and_arch(parser) - - # only override args that are explicitly given on the command line - override_parser = options.get_validation_parser() - override_args = options.parse_args_and_arch(override_parser, suppress_defaults=True) - - distributed_utils.call_main(args, main, override_args=override_args) - - -if __name__ == "__main__": - cli_main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/generate_on_en_ro.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/generate_on_en_ro.sh deleted file mode 100644 index 66ea0d5ece72d47c0037fca9de111823868a8232..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/generate_on_en_ro.sh +++ /dev/null @@ -1,41 +0,0 @@ -source env.sh -DATA_PATH=path_of_data # fix it to your own train data path -BPE_PATH=/path/sentence.bpe.model # fix it to your own sentence.bpe.model path -SCRIPTS=mosesdecoder/scripts # fix it to your own mosesdecoder path -WMT16_SCRIPTS=wmt16-scripts # fix it to your own wmt16-scripts path - -langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN -model_dir=$1 -REPLACE_UNICODE_PUNCT=$SCRIPTS/tokenizer/replace-unicode-punctuation.perl -TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl -NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl -REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl -NORMALIZE_ROMANIAN=$WMT16_SCRIPTS/preprocess/normalise-romanian.py -REMOVE_DIACRITICS=$WMT16_SCRIPTS/preprocess/remove-diacritics.py -HYP=hyp -REF=ref - -fairseq-generate $DATA_PATH \ - --fp16 --path $model_dir --max-tokens 4096 \ - --task translation_from_pretrained_bart \ - --gen-subset test \ - -t ro_RO -s en_XX \ - --bpe 'sentencepiece' --sentencepiece-model $BPE_PATH \ - --scoring sacrebleu --remove-bpe 'sentencepiece' \ - --batch-size 32 --langs $langs > en_ro -sed -i '$d' en_ro -cat en_ro | grep -P "^H" |sort -V |cut -f 3- | sed 's/\[ro_RO\]//g' 
|$TOKENIZER ro > $HYP".txt" -cat en_ro | grep -P "^T" |sort -V |cut -f 2- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > $REF".txt" - -for f in $HYP $REF - do - rm -rf "en_ro."$f - cat $f".txt" | \ - perl $REPLACE_UNICODE_PUNCT | \ - perl $NORM_PUNC -l ro | \ - perl $REM_NON_PRINT_CHAR | \ - python3 $NORMALIZE_ROMANIAN | \ - python3 $REMOVE_DIACRITICS | \ - perl $TOKENIZER -no-escape -threads 16 -a -l ro >"en_ro."$f - done -sacrebleu -tok 'none' -s 'none' en_ro.ref < en_ro.hyp \ No newline at end of file diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/hubconf.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/hubconf.py deleted file mode 100644 index ce7d76cfe1e18f55fdc97a26885f2257236a2d45..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/hubconf.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import functools -import importlib - -from fairseq.hub_utils import ( # noqa; noqa - BPEHubInterface as bpe, - TokenizerHubInterface as tokenizer, -) -from fairseq.models import MODEL_REGISTRY # noqa - - -dependencies = [ - "dataclasses", - "hydra", - "numpy", - "regex", - "requests", - "torch", -] - - -# Check for required dependencies and raise a RuntimeError if any are missing. -missing_deps = [] -for dep in dependencies: - try: - importlib.import_module(dep) - except ImportError: - # Hack: the hydra package is provided under the "hydra-core" name in - # pypi. We don't want the user mistakenly calling `pip install hydra` - # since that will install an unrelated package. - if dep == "hydra": - dep = "hydra-core" - missing_deps.append(dep) -if len(missing_deps) > 0: - raise RuntimeError("Missing dependencies: {}".format(", ".join(missing_deps))) - - -# torch.hub doesn't build Cython components, so if they are not found then try -# to build them here -try: - import fairseq.data.token_block_utils_fast # noqa -except ImportError: - try: - import cython # noqa - import os - from setuptools import sandbox - - sandbox.run_setup( - os.path.join(os.path.dirname(__file__), "setup.py"), - ["build_ext", "--inplace"], - ) - except ImportError: - print( - "Unable to build Cython components. Please make sure Cython is " - "installed if the torch.hub model you are loading depends on it." 
- ) - - -# automatically expose models defined in FairseqModel::hub_models -for _model_type, _cls in MODEL_REGISTRY.items(): - for model_name in _cls.hub_models().keys(): - globals()[model_name] = functools.partial( - _cls.from_pretrained, - model_name, - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/modelzoo_level.txt b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/modelzoo_level.txt deleted file mode 100644 index 31529da2e68f25b61e2a3e698a07537281443c03..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/modelzoo_level.txt +++ /dev/null @@ -1,3 +0,0 @@ -FuncStatus:OK -PerfStatus:OK -PrecisionStatus:OK \ No newline at end of file diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/pyproject.toml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/pyproject.toml deleted file mode 100644 index 6d1b4c5b6fb56a63069147e3a1de922ce71a45d8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/pyproject.toml +++ /dev/null @@ -1,3 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel", "cython"] -build-backend = "setuptools.build_meta" diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/requirements.txt b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/requirements.txt deleted file mode 100644 index 6df9e088d7f10ef6fb7f4bd3b12cbeb545be3bfc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ -# progress bars in model download and training scripts -tqdm -# Accessing files from S3 directly. -boto3 -# Used for downloading models over HTTP -requests -six -ipdb -#Data processing -h5py -html2text -nltk -progressbar -#Others -onnxruntime diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/run_8p.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/run_8p.sh deleted file mode 100644 index e8761ac32a290e06f4ab149146685058add6ed4a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/run_8p.sh +++ /dev/null @@ -1,51 +0,0 @@ -source env.sh -PRETRAIN=/path/model.pt # fix it to your own model path -DATA_PATH=path_of_data # fix it to your own data path -langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN - - -export RANK_SIZE=8 -for((RANK_ID=0;RANK_ID $SYS -grep ^T $GEN | cut -f2- | perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' > $REF -fairseq-score --sys $SYS --ref $REF diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/constraints/extract.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/constraints/extract.py deleted file mode 100644 index f6155d0a0538aadb46bf612256b6b949728de69e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/constraints/extract.py +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -"""Extracts random constraints from reference files.""" - -import argparse -import random -import sys - -from sacrebleu import extract_ngrams - - -def get_phrase(words, index, length): - assert index < len(words) - length + 1 - phr = " ".join(words[index : index + length]) - for i in range(index, index + length): - words.pop(index) - return phr - - -def main(args): - - if args.seed: - random.seed(args.seed) - - for line in sys.stdin: - constraints = [] - - def add_constraint(constraint): - constraints.append(constraint) - - source = line.rstrip() - if "\t" in line: - source, target = line.split("\t") - if args.add_sos: - target = f" {target}" - if args.add_eos: - target = f"{target} " - - if len(target.split()) >= args.len: - words = [target] - - num = args.number - - choices = {} - for i in range(num): - if len(words) == 0: - break - segmentno = random.choice(range(len(words))) - segment = words.pop(segmentno) - tokens = segment.split() - phrase_index = random.choice(range(len(tokens))) - choice = " ".join( - tokens[phrase_index : min(len(tokens), phrase_index + args.len)] - ) - for j in range( - phrase_index, min(len(tokens), phrase_index + args.len) - ): - tokens.pop(phrase_index) - if phrase_index > 0: - words.append(" ".join(tokens[0:phrase_index])) - if phrase_index + 1 < len(tokens): - words.append(" ".join(tokens[phrase_index:])) - choices[target.find(choice)] = choice - - # mask out with spaces - target = target.replace(choice, " " * len(choice), 1) - - for key in sorted(choices.keys()): - add_constraint(choices[key]) - - print(source, *constraints, sep="\t") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--number", "-n", type=int, default=1, help="number of phrases") - parser.add_argument("--len", "-l", type=int, default=1, help="phrase length") - parser.add_argument( - "--add-sos", default=False, action="store_true", help="add token" - ) - parser.add_argument( - "--add-eos", default=False, action="store_true", help="add token" - ) - parser.add_argument("--seed", "-s", default=0, type=int) - args = parser.parse_args() - - main(args) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/constraints/validate.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/constraints/validate.py deleted file mode 100644 index d531ad9f39b1df42c98fe8f26ad61fe53a9ac0c5..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/constraints/validate.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import sys - - -"""Reads in a fairseq output file, and verifies that the constraints -(C- lines) are present in the output (the first H- line). Assumes that -constraints are listed prior to the first hypothesis. 
-""" - -constraints = [] -found = 0 -total = 0 -for line in sys.stdin: - if line.startswith("C-"): - constraints.append(line.rstrip().split("\t")[1]) - elif line.startswith("H-"): - text = line.split("\t")[2] - - for constraint in constraints: - total += 1 - if constraint in text: - found += 1 - else: - print(f"No {constraint} in {text}", file=sys.stderr) - - constraints = [] - -print(f"Found {found} / {total} = {100 * found / total:.1f}%") diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/convert_dictionary.lua b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/convert_dictionary.lua deleted file mode 100644 index 14ee8c997f642c8ff196617c2dcd0584037a60c4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/convert_dictionary.lua +++ /dev/null @@ -1,34 +0,0 @@ --- Copyright (c) Facebook, Inc. and its affiliates. --- --- This source code is licensed under the MIT license found in the --- LICENSE file in the root directory of this source tree. --- --- Usage: convert_dictionary.lua -require 'fairseq' -require 'torch' -require 'paths' - -if #arg < 1 then - print('usage: convert_dictionary.lua ') - os.exit(1) -end -if not paths.filep(arg[1]) then - print('error: file does not exit: ' .. arg[1]) - os.exit(1) -end - -dict = torch.load(arg[1]) -dst = paths.basename(arg[1]):gsub('.th7', '.txt') -assert(dst:match('.txt$')) - -f = io.open(dst, 'w') -for idx, symbol in ipairs(dict.index_to_symbol) do - if idx > dict.cutoff then - break - end - f:write(symbol) - f:write(' ') - f:write(dict.index_to_freq[idx]) - f:write('\n') -end -f:close() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/convert_model.lua b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/convert_model.lua deleted file mode 100644 index 61b92139294fb90a25989ebd2ee52a765fb278a2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/convert_model.lua +++ /dev/null @@ -1,108 +0,0 @@ --- Copyright (c) Facebook, Inc. and its affiliates. --- --- This source code is licensed under the MIT license found in the --- LICENSE file in the root directory of this source tree. --- --- Usage: convert_model.lua -require 'torch' -local fairseq = require 'fairseq' - -model = torch.load(arg[1]) - -function find_weight_norm(container, module) - for _, wn in ipairs(container:listModules()) do - if torch.type(wn) == 'nn.WeightNorm' and wn.modules[1] == module then - return wn - end - end -end - -function push_state(dict, key, module) - if torch.type(module) == 'nn.Linear' then - local wn = find_weight_norm(model.module, module) - assert(wn) - dict[key .. '.weight_v'] = wn.v:float() - dict[key .. '.weight_g'] = wn.g:float() - elseif torch.type(module) == 'nn.TemporalConvolutionTBC' then - local wn = find_weight_norm(model.module, module) - assert(wn) - local v = wn.v:float():view(wn.viewOut):transpose(2, 3) - dict[key .. '.weight_v'] = v - dict[key .. '.weight_g'] = wn.g:float():view(module.weight:size(3), 1, 1) - else - dict[key .. '.weight'] = module.weight:float() - end - if module.bias then - dict[key .. 
'.bias'] = module.bias:float() - end -end - -encoder_dict = {} -decoder_dict = {} -combined_dict = {} - -function encoder_state(encoder) - luts = encoder:findModules('nn.LookupTable') - push_state(encoder_dict, 'embed_tokens', luts[1]) - push_state(encoder_dict, 'embed_positions', luts[2]) - - fcs = encoder:findModules('nn.Linear') - assert(#fcs >= 2) - local nInputPlane = fcs[1].weight:size(1) - push_state(encoder_dict, 'fc1', table.remove(fcs, 1)) - push_state(encoder_dict, 'fc2', table.remove(fcs, #fcs)) - - for i, module in ipairs(encoder:findModules('nn.TemporalConvolutionTBC')) do - push_state(encoder_dict, 'convolutions.' .. tostring(i - 1), module) - if nInputPlane ~= module.weight:size(3) / 2 then - push_state(encoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1)) - end - nInputPlane = module.weight:size(3) / 2 - end - assert(#fcs == 0) -end - -function decoder_state(decoder) - luts = decoder:findModules('nn.LookupTable') - push_state(decoder_dict, 'embed_tokens', luts[1]) - push_state(decoder_dict, 'embed_positions', luts[2]) - - fcs = decoder:findModules('nn.Linear') - local nInputPlane = fcs[1].weight:size(1) - push_state(decoder_dict, 'fc1', table.remove(fcs, 1)) - push_state(decoder_dict, 'fc2', fcs[#fcs - 1]) - push_state(decoder_dict, 'fc3', fcs[#fcs]) - - table.remove(fcs, #fcs) - table.remove(fcs, #fcs) - - for i, module in ipairs(decoder:findModules('nn.TemporalConvolutionTBC')) do - if nInputPlane ~= module.weight:size(3) / 2 then - push_state(decoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1)) - end - nInputPlane = module.weight:size(3) / 2 - - local prefix = 'attention.' .. tostring(i - 1) - push_state(decoder_dict, prefix .. '.in_projection', table.remove(fcs, 1)) - push_state(decoder_dict, prefix .. '.out_projection', table.remove(fcs, 1)) - push_state(decoder_dict, 'convolutions.' .. tostring(i - 1), module) - end - assert(#fcs == 0) -end - - -_encoder = model.module.modules[2] -_decoder = model.module.modules[3] - -encoder_state(_encoder) -decoder_state(_decoder) - -for k, v in pairs(encoder_dict) do - combined_dict['encoder.' .. k] = v -end -for k, v in pairs(decoder_dict) do - combined_dict['decoder.' .. k] = v -end - - -torch.save('state_dict.t7', combined_dict) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/count_docs.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/count_docs.py deleted file mode 100644 index 58d85af85e91377a34dbd01f7674436152fd08e8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/count_docs.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Count the number of documents and average number of lines and tokens per -document in a large file. Documents should be separated by a single empty line. 
-""" - -import argparse -import gzip -import sys - -import numpy as np - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("input") - parser.add_argument("--gzip", action="store_true") - args = parser.parse_args() - - def gopen(): - if args.gzip: - return gzip.open(args.input, "r") - else: - return open(args.input, "r", encoding="utf-8") - - num_lines = [] - num_toks = [] - with gopen() as h: - num_docs = 1 - num_lines_in_doc = 0 - num_toks_in_doc = 0 - for i, line in enumerate(h): - if len(line.strip()) == 0: # empty line indicates new document - num_docs += 1 - num_lines.append(num_lines_in_doc) - num_toks.append(num_toks_in_doc) - num_lines_in_doc = 0 - num_toks_in_doc = 0 - else: - num_lines_in_doc += 1 - num_toks_in_doc += len(line.rstrip().split()) - if i % 1000000 == 0: - print(i, file=sys.stderr, end="", flush=True) - elif i % 100000 == 0: - print(".", file=sys.stderr, end="", flush=True) - print(file=sys.stderr, flush=True) - - print("found {} docs".format(num_docs)) - print("average num lines per doc: {}".format(np.mean(num_lines))) - print("average num toks per doc: {}".format(np.mean(num_toks))) - - -if __name__ == "__main__": - main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/read_binarized.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/read_binarized.py deleted file mode 100644 index a414095d03fb022a6753e816fc8bfd80e11db24d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/read_binarized.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse - -from fairseq.data import Dictionary, data_utils, indexed_dataset - - -def get_parser(): - parser = argparse.ArgumentParser( - description="writes text from binarized file to stdout" - ) - # fmt: off - parser.add_argument('--dataset-impl', help='dataset implementation', - choices=indexed_dataset.get_available_dataset_impl()) - parser.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None) - parser.add_argument('--input', metavar='FP', required=True, help='binarized file to read') - # fmt: on - - return parser - - -def main(): - parser = get_parser() - args = parser.parse_args() - - dictionary = Dictionary.load(args.dict) if args.dict is not None else None - dataset = data_utils.load_indexed_dataset( - args.input, - dictionary, - dataset_impl=args.dataset_impl, - default="lazy", - ) - - for tensor_line in dataset: - if dictionary is None: - line = " ".join([str(int(x)) for x in tensor_line]) - else: - line = dictionary.string(tensor_line) - - print(line) - - -if __name__ == "__main__": - main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/rm_pt.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/rm_pt.py deleted file mode 100644 index 6cd063d21f0610fa7c42c2cfb2ee8af7c9c78677..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/rm_pt.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import os -import re -import shutil -import sys - - -pt_regexp = re.compile(r"checkpoint(\d+|_\d+_\d+|_[a-z]+)\.pt") -pt_regexp_epoch_based = re.compile(r"checkpoint(\d+)\.pt") -pt_regexp_update_based = re.compile(r"checkpoint_\d+_(\d+)\.pt") - - -def parse_checkpoints(files): - entries = [] - for f in files: - m = pt_regexp_epoch_based.fullmatch(f) - if m is not None: - entries.append((int(m.group(1)), m.group(0))) - else: - m = pt_regexp_update_based.fullmatch(f) - if m is not None: - entries.append((int(m.group(1)), m.group(0))) - return entries - - -def last_n_checkpoints(files, n): - entries = parse_checkpoints(files) - return [x[1] for x in sorted(entries, reverse=True)[:n]] - - -def every_n_checkpoints(files, n): - entries = parse_checkpoints(files) - return [x[1] for x in sorted(sorted(entries)[::-n])] - - -def main(): - parser = argparse.ArgumentParser( - description=( - "Recursively delete checkpoint files from `root_dir`, " - "but preserve checkpoint_best.pt and checkpoint_last.pt" - ) - ) - parser.add_argument("root_dirs", nargs="*") - parser.add_argument( - "--save-last", type=int, default=0, help="number of last checkpoints to save" - ) - parser.add_argument( - "--save-every", type=int, default=0, help="interval of checkpoints to save" - ) - parser.add_argument( - "--preserve-test", - action="store_true", - help="preserve checkpoints in dirs that start with test_ prefix (default: delete them)", - ) - parser.add_argument( - "--delete-best", action="store_true", help="delete checkpoint_best.pt" - ) - parser.add_argument( - "--delete-last", action="store_true", help="delete checkpoint_last.pt" - ) - parser.add_argument( - "--no-dereference", action="store_true", help="don't dereference symlinks" - ) - args = parser.parse_args() - - files_to_desymlink = [] - files_to_preserve = [] - files_to_delete = [] - for root_dir in args.root_dirs: - for root, _subdirs, files in os.walk(root_dir): - if args.save_last > 0: - to_save = last_n_checkpoints(files, args.save_last) - else: - to_save = [] - if args.save_every > 0: - to_save += every_n_checkpoints(files, args.save_every) - for file in files: - if not pt_regexp.fullmatch(file): - continue - full_path = os.path.join(root, file) - if ( - not os.path.basename(root).startswith("test_") or args.preserve_test - ) and ( - (file == "checkpoint_last.pt" and not args.delete_last) - or (file == "checkpoint_best.pt" and not args.delete_best) - or file in to_save - ): - if os.path.islink(full_path) and not args.no_dereference: - files_to_desymlink.append(full_path) - else: - files_to_preserve.append(full_path) - else: - files_to_delete.append(full_path) - - if len(files_to_desymlink) == 0 and len(files_to_delete) == 0: - print("Nothing to do.") - sys.exit(0) - - files_to_desymlink = sorted(files_to_desymlink) - files_to_preserve = sorted(files_to_preserve) - files_to_delete = sorted(files_to_delete) - - print("Operations to perform (in order):") - if len(files_to_desymlink) > 0: - for file in files_to_desymlink: - print(" - preserve (and dereference symlink): " + file) - if len(files_to_preserve) > 0: - for file in files_to_preserve: - print(" - preserve: " + file) - if len(files_to_delete) > 0: - for file in files_to_delete: - print(" - delete: " + file) - while True: - resp = input("Continue? 
(Y/N): ") - if resp.strip().lower() == "y": - break - elif resp.strip().lower() == "n": - sys.exit(0) - - print("Executing...") - if len(files_to_desymlink) > 0: - for file in files_to_desymlink: - realpath = os.path.realpath(file) - print("rm " + file) - os.remove(file) - print("cp {} {}".format(realpath, file)) - shutil.copyfile(realpath, file) - if len(files_to_delete) > 0: - for file in files_to_delete: - print("rm " + file) - os.remove(file) - - -if __name__ == "__main__": - main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/sacrebleu.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/sacrebleu.sh deleted file mode 100644 index c10bf2b76ea032deabab6f5c9d8a3e1e884f1642..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/sacrebleu.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -if [ $# -ne 4 ]; then - echo "usage: $0 TESTSET SRCLANG TGTLANG GEN" - exit 1 -fi - -TESTSET=$1 -SRCLANG=$2 -TGTLANG=$3 - -GEN=$4 - -if ! command -v sacremoses &> /dev/null -then - echo "sacremoses could not be found, please install with: pip install sacremoses" - exit -fi - -grep ^H $GEN \ -| sed 's/^H\-//' \ -| sort -n -k 1 \ -| cut -f 3 \ -| sacremoses detokenize \ -> $GEN.sorted.detok - -sacrebleu --test-set $TESTSET --language-pair "${SRCLANG}-${TGTLANG}" < $GEN.sorted.detok diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/shard_docs.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/shard_docs.py deleted file mode 100644 index 97232c3c845ee01dc5ab627388934cc0f9588280..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/shard_docs.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Split a large file into shards while respecting document boundaries. Documents -should be separated by a single empty line. 
-""" - -import argparse -import contextlib - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("input") - parser.add_argument("--num-shards", type=int) - args = parser.parse_args() - - assert args.num_shards is not None and args.num_shards > 1 - - with open(args.input, "r", encoding="utf-8") as h: - with contextlib.ExitStack() as stack: - outputs = [ - stack.enter_context( - open(args.input + ".shard" + str(i), "w", encoding="utf-8") - ) - for i in range(args.num_shards) - ] - - doc = [] - first_doc = [True] * args.num_shards - - def output_doc(i): - if not first_doc[i]: - outputs[i].write("\n") - first_doc[i] = False - for line in doc: - outputs[i].write(line) - doc.clear() - - num_docs = 0 - for line in h: - if line.strip() == "": # empty line indicates new document - output_doc(num_docs % args.num_shards) - num_docs += 1 - else: - doc.append(line) - output_doc(num_docs % args.num_shards) - - -if __name__ == "__main__": - main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/split_train_valid_docs.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/split_train_valid_docs.py deleted file mode 100644 index ff159785284a13b44626b207d84430c592acaf8f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/split_train_valid_docs.py +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Split a large file into a train and valid set while respecting document -boundaries. Documents should be separated by a single empty line. 
-""" - -import argparse -import random -import sys - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("input") - parser.add_argument("sample_output", help="train output file") - parser.add_argument("remainder_output", help="valid output file") - parser.add_argument("-k", type=int, help="remainder size") - parser.add_argument( - "--lines", action="store_true", help="split lines instead of docs" - ) - args = parser.parse_args() - - assert args.k is not None - - sample = [] - remainder = [] - num_docs = [0] - - def update_sample(doc): - if len(sample) < args.k: - sample.append(doc.copy()) - else: - i = num_docs[0] - j = random.randrange(i + 1) - if j < args.k: - remainder.append(sample[j]) - sample[j] = doc.copy() - else: - remainder.append(doc.copy()) - num_docs[0] += 1 - doc.clear() - - with open(args.input, "r", encoding="utf-8") as h: - doc = [] - for i, line in enumerate(h): - if line.strip() == "": # empty line indicates new document - update_sample(doc) - else: - doc.append(line) - if args.lines: - update_sample(doc) - if i % 1000000 == 0: - print(i, file=sys.stderr, end="", flush=True) - elif i % 100000 == 0: - print(".", file=sys.stderr, end="", flush=True) - if len(doc) > 0: - update_sample(doc) - print(file=sys.stderr, flush=True) - - assert len(sample) == args.k - - with open(args.sample_output, "w", encoding="utf-8") as out: - first = True - for doc in sample: - if not first and not args.lines: - out.write("\n") - first = False - for line in doc: - out.write(line) - - with open(args.remainder_output, "w", encoding="utf-8") as out: - first = True - for doc in remainder: - if not first and not args.lines: - out.write("\n") - first = False - for line in doc: - out.write(line) - - -if __name__ == "__main__": - main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_decode.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_decode.py deleted file mode 100644 index 1c18b1d2a7d7628b7aeb6fdb6c4ab5a096e9edf8..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_decode.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", required=True, help="sentencepiece model to use for decoding" - ) - parser.add_argument("--input", required=True, help="input file to decode") - parser.add_argument("--input_format", choices=["piece", "id"], default="piece") - args = parser.parse_args() - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.input_format == "piece": - - def decode(l): - return "".join(sp.DecodePieces(l)) - - elif args.input_format == "id": - - def decode(l): - return "".join(sp.DecodeIds(l)) - - else: - raise NotImplementedError - - def tok2int(tok): - # remap reference-side (represented as <>) to 0 - return int(tok) if tok != "<>" else 0 - - with open(args.input, "r", encoding="utf-8") as h: - for line in h: - if args.input_format == "id": - print(decode(list(map(tok2int, line.rstrip().split())))) - elif args.input_format == "piece": - print(decode(line.rstrip().split())) - - -if __name__ == "__main__": - main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_encode.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_encode.py deleted file mode 100644 index 83facfb3b184aff8b9cc3f0c82dd53668c63e57b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_encode.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -from __future__ import absolute_import, division, print_function, unicode_literals - -import argparse -import contextlib -import sys - -import sentencepiece as spm - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--model", required=True, help="sentencepiece model to use for encoding" - ) - parser.add_argument( - "--inputs", nargs="+", default=["-"], help="input files to filter/encode" - ) - parser.add_argument( - "--outputs", nargs="+", default=["-"], help="path to save encoded outputs" - ) - parser.add_argument("--output_format", choices=["piece", "id"], default="piece") - parser.add_argument( - "--min-len", - type=int, - metavar="N", - help="filter sentence pairs with fewer than N tokens", - ) - parser.add_argument( - "--max-len", - type=int, - metavar="N", - help="filter sentence pairs with more than N tokens", - ) - args = parser.parse_args() - - assert len(args.inputs) == len( - args.outputs - ), "number of input and output paths should match" - - sp = spm.SentencePieceProcessor() - sp.Load(args.model) - - if args.output_format == "piece": - - def encode(l): - return sp.EncodeAsPieces(l) - - elif args.output_format == "id": - - def encode(l): - return list(map(str, sp.EncodeAsIds(l))) - - else: - raise NotImplementedError - - if args.min_len is not None or args.max_len is not None: - - def valid(line): - return (args.min_len is None or len(line) >= args.min_len) and ( - args.max_len is None or len(line) <= args.max_len - ) - - else: - - def valid(lines): - return True - - with contextlib.ExitStack() as stack: - inputs = [ - stack.enter_context(open(input, "r", encoding="utf-8")) - if input != "-" - else sys.stdin - for input in args.inputs - ] - outputs = [ - stack.enter_context(open(output, "w", encoding="utf-8")) - if output != "-" - else sys.stdout - for output in args.outputs - ] - - stats = { - "num_empty": 0, - "num_filtered": 0, - } - - def encode_line(line): - line = line.strip() - if len(line) > 0: - line = encode(line) - if valid(line): - return line - else: - stats["num_filtered"] += 1 - else: - stats["num_empty"] += 1 - return None - - for i, lines in enumerate(zip(*inputs), start=1): - enc_lines = list(map(encode_line, lines)) - if not any(enc_line is None for enc_line in enc_lines): - for enc_line, output_h in zip(enc_lines, outputs): - print(" ".join(enc_line), file=output_h) - if i % 10000 == 0: - print("processed {} lines".format(i), file=sys.stderr) - - print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) - print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) - - -if __name__ == "__main__": - main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_train.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_train.py deleted file mode 100644 index 9db668fd4166a860198784990de68ea26157995d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/scripts/spm_train.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -from __future__ import absolute_import, division, print_function, unicode_literals - -import sys - -import sentencepiece as spm - - -if __name__ == "__main__": - spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/setup.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/setup.py deleted file mode 100644 index 02ee1f11bbf27b681e07804e1361de871d0ef833..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/setup.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import os -import sys - -from setuptools import Extension, find_packages, setup - - -if sys.version_info < (3, 6): - sys.exit("Sorry, Python >= 3.6 is required for fairseq.") - - -with open("README.md") as f: - readme = f.read() - - -if sys.platform == "darwin": - extra_compile_args = ["-stdlib=libc++", "-O3"] -else: - extra_compile_args = ["-std=c++11", "-O3"] - - -class NumpyExtension(Extension): - """Source: https://stackoverflow.com/a/54128391""" - - def __init__(self, *args, **kwargs): - self.__include_dirs = [] - super().__init__(*args, **kwargs) - - @property - def include_dirs(self): - import numpy - - return self.__include_dirs + [numpy.get_include()] - - @include_dirs.setter - def include_dirs(self, dirs): - self.__include_dirs = dirs - - -extensions = [ - Extension( - "fairseq.libbleu", - sources=[ - "fairseq/clib/libbleu/libbleu.cpp", - "fairseq/clib/libbleu/module.cpp", - ], - extra_compile_args=extra_compile_args, - ), - NumpyExtension( - "fairseq.data.data_utils_fast", - sources=["fairseq/data/data_utils_fast.pyx"], - language="c++", - extra_compile_args=extra_compile_args, - ), - NumpyExtension( - "fairseq.data.token_block_utils_fast", - sources=["fairseq/data/token_block_utils_fast.pyx"], - language="c++", - extra_compile_args=extra_compile_args, - ), -] - - -cmdclass = {} - - -try: - # torch is not available when generating docs - from torch.utils import cpp_extension - - extensions.extend( - [ - cpp_extension.CppExtension( - "fairseq.libnat", - sources=[ - "fairseq/clib/libnat/edit_dist.cpp", - ], - ) - ] - ) - - if "CUDA_HOME" in os.environ: - extensions.extend( - [ - cpp_extension.CppExtension( - "fairseq.libnat_cuda", - sources=[ - "fairseq/clib/libnat_cuda/edit_dist.cu", - "fairseq/clib/libnat_cuda/binding.cpp", - ], - ) - ] - ) - cmdclass["build_ext"] = cpp_extension.BuildExtension - -except ImportError: - pass - - -if "READTHEDOCS" in os.environ: - # don't build extensions when generating docs - extensions = [] - if "build_ext" in cmdclass: - del cmdclass["build_ext"] - - # use CPU build of PyTorch - dependency_links = [ - "https://download.pytorch.org/whl/cpu/torch-1.7.0%2Bcpu-cp36-cp36m-linux_x86_64.whl" - ] -else: - dependency_links = [] - - -if "clean" in sys.argv[1:]: - # Source: https://bit.ly/2NLVsgE - print("deleting Cython files...") - import subprocess - - subprocess.run( - ["rm -f fairseq/*.so fairseq/**/*.so fairseq/*.pyd fairseq/**/*.pyd"], - shell=True, - ) - - -extra_packages = [] -if os.path.exists(os.path.join("fairseq", "model_parallel", "megatron", "mpu")): - extra_packages.append("fairseq.model_parallel.megatron.mpu") - - -def do_setup(package_data): - setup( - name="fairseq", - version="0.10.2", - description="Facebook AI 
Research Sequence-to-Sequence Toolkit", - url="https://github.com/pytorch/fairseq", - classifiers=[ - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.6", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], - long_description=readme, - long_description_content_type="text/markdown", - setup_requires=[ - "cython", - "numpy", - "setuptools>=18.0", - ], - install_requires=[ - "cffi", - "cython", - "dataclasses", - "hydra-core", - "numpy", - "regex", - "sacrebleu>=1.4.12", - "torch", - "tqdm", - ], - dependency_links=dependency_links, - packages=find_packages( - exclude=[ - "examples", - "examples.*", - "scripts", - "scripts.*", - "tests", - "tests.*", - ] - ) + extra_packages, - package_data=package_data, - ext_modules=extensions, - test_suite="tests", - entry_points={ - "console_scripts": [ - "fairseq-eval-lm = fairseq_cli.eval_lm:cli_main", - "fairseq-generate = fairseq_cli.generate:cli_main", - "fairseq-interactive = fairseq_cli.interactive:cli_main", - "fairseq-preprocess = fairseq_cli.preprocess:cli_main", - "fairseq-score = fairseq_cli.score:cli_main", - "fairseq-train = fairseq_cli.train:cli_main", - "fairseq-validate = fairseq_cli.validate:cli_main", - ], - }, - cmdclass=cmdclass, - zip_safe=False, - ) - - -def get_files(path, relative_to="fairseq"): - all_files = [] - for root, _dirs, files in os.walk(path, followlinks=True): - root = os.path.relpath(root, relative_to) - for file in files: - if file.endswith(".pyc"): - continue - all_files.append(os.path.join(root, file)) - return all_files - - -try: - # symlink examples into fairseq package so package_data accepts them - fairseq_examples = os.path.join("fairseq", "examples") - if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples): - os.symlink(os.path.join("..", "examples"), fairseq_examples) - package_data = { - "fairseq": get_files("fairseq/config") + get_files("fairseq/examples"), - } - do_setup(package_data) -finally: - if "build_ext" not in sys.argv[1:] and os.path.exists(fairseq_examples): - os.unlink(fairseq_examples) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/env.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/env.sh deleted file mode 100644 index 5e6632b494eea9d09a8166d6829a71dbc0a64cf6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/env.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -export install_path=/usr/local/Ascend - -if [ -d ${install_path}/toolkit ]; then - export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} - export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH - export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH - export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH - export ASCEND_OPP_PATH=${install_path}/opp -else - if [ -d ${install_path}/nnae/latest ];then - export 
LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/nnae/latest - else - export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest - fi -fi - -export SCALAR_TO_HOST_MEM=1 -export BMMV2_ENABLE=1 -#将Host日志输出到串口,0-关闭/1-开启 -export ASCEND_SLOG_PRINT_TO_STDOUT=0 -#设置默认日志级别,0-debug/1-info/2-warning/3-error -export ASCEND_GLOBAL_LOG_LEVEL=3 -#设置Host侧Event日志开启标志,0-关闭/1-开启 -export ASCEND_GLOBAL_EVENT_ENABLE=0 -#设置是否开启taskque,0-关闭/1-开启 -export TASK_QUEUE_ENABLE=1 -#设置是否开启PTCopy,0-关闭/1-开启 -export PTCOPY_ENABLE=1 -#设置是否开启combined标志,0-关闭/1-开启 -export COMBINED_ENABLE=1 -#设置特殊场景是否需要重新编译,不需要修改 -export DYNAMIC_OP="ADD#MUL" -#HCCL白名单开关,1-关闭/0-开启 -export HCCL_WHITELIST_DISABLE=1 -#设置Device侧日志等级为error -${install_path}/driver/tools/msnpureport -d 0 -g error -${install_path}/driver/tools/msnpureport -d 1 -g error -${install_path}/driver/tools/msnpureport -d 2 -g error -${install_path}/driver/tools/msnpureport -d 3 -g error -${install_path}/driver/tools/msnpureport -d 4 -g error -${install_path}/driver/tools/msnpureport -d 5 -g error -${install_path}/driver/tools/msnpureport -d 6 -g error -${install_path}/driver/tools/msnpureport -d 7 -g error -#关闭Device侧Event日志 -${install_path}/driver/tools/msnpureport -e disable - - -path_lib=$(python3.7 -c """ -import sys -import re -result='' -for index in range(len(sys.path)): - match_sit = re.search('-packages', sys.path[index]) - if match_sit is not None: 
- match_lib = re.search('lib', sys.path[index]) - - if match_lib is not None: - end=match_lib.span()[1] - result += sys.path[index][0:end] + ':' - - result+=sys.path[index] + '/torch/lib:' -print(result)""" -) - -echo ${path_lib} - -export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/set_conda.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/set_conda.sh deleted file mode 100644 index 55087d8622f46e055bb105a2acd1fe7006d4af07..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/set_conda.sh +++ /dev/null @@ -1,2 +0,0 @@ -export PATH=/home/anaconda3/bin:$PATH -export LD_LIBRARY_PATH=/home/anaconda3/lib:$LD_LIBRARY_PATH \ No newline at end of file diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_full_8p.sh deleted file mode 100644 index 27e798534c1c141d1a342e6f13ea71311f4f28de..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_full_8p.sh +++ /dev/null @@ -1,170 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` -source env.sh - -#集合通信参数,不需要修改 - -export RANK_SIZE=8 - -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="mBART_for_PyTorch" -#训练batch_size -token_size=1024 - -#训练开始时间,不需要修改 -start_time=$(date +%s) - -#进入训练脚本目录,需要模型审视修改 -cd $cur_path/../ - -# 将对应的数据以及模型等放到对应路径 或 修改以下路径以适应本地训练 -DATA_PATH=train_data/en_ro -PRETRAIN=mbart.cc25/model.pt -BPE_PATH=mbart.cc25/sentence.bpe.model -model_dir=checkpoints/checkpoint_best.pt -SCRIPTS=mosesdecoder/scripts -WMT16_SCRIPTS=wmt16-scripts - -REPLACE_UNICODE_PUNCT=$SCRIPTS/tokenizer/replace-unicode-punctuation.perl -TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl -NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl -REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl -NORMALIZE_ROMANIAN=$WMT16_SCRIPTS/preprocess/normalise-romanian.py -REMOVE_DIACRITICS=$WMT16_SCRIPTS/preprocess/remove-diacritics.py -HYP=hyp -REF=ref -langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN -#创建DeviceID输出目录,不需要修改 - - -export RANK_SIZE=8 -for((RANK_ID=0;RANK_ID ${cur_path}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 & - else - fairseq-train $DATA_PATH --fp16 --distributed-world-size 8 --npu \ - --device-id $RANK_ID --distributed-rank $RANK_ID --distributed-no-spawn --max-update 40000 \ - --encoder-normalize-before --decoder-normalize-before \ - --arch mbart_large --layernorm-embedding \ - --task translation_from_pretrained_bart \ - --source-lang en_XX --target-lang ro_RO \ - --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ - --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler polynomial_decay --lr 3e-05 --min-lr -1 --warmup-updates 2500 --total-num-update 40000 \ - --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ - --max-tokens 1024 --update-freq 2 \ - --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ - --seed 222 --log-format simple --log-interval 2 \ - --restore-file $PRETRAIN \ - --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \ - --langs $langs \ - --ddp-backend no_c10d > 
${cur_path}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 & - fi -done -wait -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - - -fairseq-generate $DATA_PATH \ - --fp16 --path $model_dir --max-tokens 4096 \ - --task translation_from_pretrained_bart \ - --gen-subset test \ - -t ro_RO -s en_XX \ - --bpe 'sentencepiece' --sentencepiece-model $BPE_PATH \ - --scoring sacrebleu --remove-bpe 'sentencepiece' \ - --batch-size 32 --langs $langs > en_ro -sed -i '$d' en_ro -cat en_ro | grep -P "^H" |sort -V |cut -f 3- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > $HYP".txt" -cat en_ro | grep -P "^T" |sort -V |cut -f 2- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > $REF".txt" - -for f in $HYP $REF - do - rm "en_ro."$f - cat $f".txt" | \ - perl $REPLACE_UNICODE_PUNCT | \ - perl $NORM_PUNC -l ro | \ - perl $REM_NON_PRINT_CHAR | \ - python3 $NORMALIZE_ROMANIAN | \ - python3 $REMOVE_DIACRITICS | \ - perl $TOKENIZER -no-escape -threads 16 -a -l ro >"en_ro."$f - done -sacrebleu -tok 'none' -s 'none' en_ro.ref < en_ro.hyp > res.log -wait -ASCEND_DEVICE_ID=0 - - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -WPS=`grep 'train_inner ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "wps=" '{print $NF}'|awk -F "wps" '{print $1}'|awk -F "," '{print $1}'|awk 'END {print}'` -train_wall=`grep 'train_inner ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk 'NR==1{min=$1;next}{min=min<$1?min:$1}END{print min}'` -#打印,不需要修改 -echo "Final Performance images/sec : $WPS" - -#输出训练精度,需要模型审视修改 -train_accuracy=`grep 'version.1.5.1 = ' res.log |awk '{print $3}'` -#打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -TokenSize=${token_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${TokenSize}_${RANK_SIZE}'p'_'acc' - -##获取性能数据,不需要修改 -#吞吐量 -ActualWPS=${WPS} -#单迭代训练时长 -TrainingTime=${train_wall} - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "step_loss :" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $19}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TokenSize = ${TokenSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualWPS = ${ActualWPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainAccuracy = ${train_accuracy}">> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log \ No newline at end of file diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_performance_1p.sh deleted file mode 
100644 index 1c329db30fa6158e3ccc768126b1c5d08076a094..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_performance_1p.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash - -cur_path=`pwd`/../ -#失败用例打屏 -export ASCEND_SLOG_PRINT_TO_STDOUT=0 -export SCALAR_TO_HOST_MEM=1 - -export MKL_SERVICE_FORCE_INTEL=1 -export BMMV2_ENABLE=1 -langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN -#基础参数,需要模型审视修改 -#Batch Size -batch_size=1024 -#网络名称,同目录名称 -Network="mBART_ID2372_for_PyTorch" -#Device数量,单卡默认为1 -RankSize=1 -#训练epoch,可选 -train_epochs=1 -#训练step -train_steps= -#学习率 -learning_rate=3e-05 - -#参数配置 -data_path="" - -if [[ $1 == --help || $1 == --h ]];then - echo "usage:./train_performance_1p.sh " - exit 1 -fi - -for para in $* -do - if [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --conda_name* ]];then - conda_name=`echo ${para#*=}` - source set_conda.sh --conda_name=$conda_name - #export PATH=/usr/local/python3.7.5/bin:/home/anaconda3/bin:$PATH - #source activate py8 - source activate $conda_name - - fi -done - -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be config" - exit 1 - -fi -sed -i "s|checkpoint_utils.save_checkpoint(|#checkpoint_utils.save_checkpoint(|g" $cur_path/fairseq_cli/train.py -##############执行训练########## -cd $cur_path -if [ -d $cur_path/test/output ];then - rm -rf $cur_path/test/output/* - mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID -else - mkdir -p $cur_path/test/output/$ASCEND_DEVICE_ID -fi -wait - - -pip3 install --editable ./ -start=$(date +%s) -python3 train.py $data_path/en_ro/ \ - --distributed-world-size 1 --npu --npu-id $ASCEND_DEVICE_ID --fp16 --encoder-normalize-before --decoder-normalize-before \ - --arch mbart_large --layernorm-embedding \ - --task translation_from_pretrained_bart \ - --source-lang en_XX --target-lang ro_RO \ - --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ - --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler polynomial_decay --lr 3e-05 --min-lr -1 --warmup-updates 2500 --total-num-update 40000 \ - --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ - --max-tokens 1024 --update-freq 2 \ - --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ - --seed 222 --log-format simple --log-interval 2 \ - --restore-file $data_path/mbart.cc25/model.pt \ - --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \ - --langs $langs \ - --max-epoch $train_epochs \ - --max-update 200 \ - --ddp-backend no_c10d > $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log 2>&1 & -wait -end=$(date +%s) -e2etime=$(( $end - $start )) - -sed -i "s|#checkpoint_utils.save_checkpoint(|checkpoint_utils.save_checkpoint(|g" $cur_path/fairseq_cli/train.py -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能FPS,需要模型审视修改 -TrainingTime=0 -FPS=`grep -rn train_inner $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "wps=" '{print$2}' | awk -F "," '{print$1}' | tail -n+6 | awk '{sum+=$1} END {print"",sum/NR}' | sed s/[[:space:]]//g` -#打印,不需要修改 -echo "Final Performance images/sec : $FPS" -TrainingTime=`awk 'BEGIN{printf "%.2f\n",'${batch_size}'*1000/'${FPS}'}'` -#输出训练精度,需要模型审视修改 -#打印,不需要修改 -#echo "Final Train Accuracy : ${train_accuracy}" -echo "E2E Training Duration sec : 
$e2e_time" -#性能看护结果汇总 -#训练用例信息,不需要修改 -BatchSize=${batch_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualFPS=${FPS} - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep train_inner $cur_path/test/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss=" '{print$2}' | awk -F "," '{print$1}' > $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/test/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${BatchSize}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualFPS}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2etime}" >> $cur_path/test/output/$ASCEND_DEVICE_ID/${CaseName}.log - diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_performance_8p.sh deleted file mode 100644 index ab1b36a9e03ccc957274b9c973c27affae8ff8ec..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/test/train_performance_8p.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/bin/bash - -#当前路径,不需要修改 -cur_path=`pwd` -export ASCEND_SLOG_PRINT_TO_STDOUT=0 -export SCALAR_TO_HOST_MEM=1 - -export MKL_SERVICE_FORCE_INTEL=1 -export BMMV2_ENABLE=1 -langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN - -#集合通信参数,不需要修改 -export RANK_SIZE=8 -train_epochs=1 -#基础参数,需要模型审视修改 -#网络名称,同目录名称 -Network="mBART_ID2372_for_PyTorch" -#训练batch_size -token_size=1024 - -#训练开始时间,不需要修改 -start_time=$(date +%s) -learning_rate=3e-05 -#参数配置 -data_path="" - -if [[ $1 == --help || $1 == --h ]];then - echo "usage:./train_performance_1p.sh " - exit 1 -fi - -for para in $* -do - if [[ $para == --data_path* ]];then - data_path=`echo ${para#*=}` - elif [[ $para == --conda_name* ]];then - conda_name=`echo ${para#*=}` - source set_conda.sh --conda_name=$conda_name - #export PATH=/usr/local/python3.7.5/bin:/home/anaconda3/bin:$PATH - #source activate py8 - source activate $conda_name - - fi -done - -if [[ $data_path == "" ]];then - echo "[Error] para \"data_path\" must be config" - exit 1 - -fi - -sed -i "s|checkpoint_utils.save_checkpoint(|#checkpoint_utils.save_checkpoint(|g" $cur_path/../fairseq_cli/train.py - -##############执行训练########## -cd $cur_path/../ -pip3 install --editable ./ -#进入训练脚本目录,需要模型审视修改 - -export RANK_SIZE=8 -for((RANK_ID=0;RANK_ID ${cur_path}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 & - else - python3 train.py $data_path/en_ro/ --fp16 --distributed-world-size 8 --npu \ - --device-id $RANK_ID --distributed-rank $RANK_ID --distributed-no-spawn --max-update 50 \ - 
--encoder-normalize-before --decoder-normalize-before \ - --arch mbart_large --layernorm-embedding \ - --task translation_from_pretrained_bart \ - --source-lang en_XX --target-lang ro_RO \ - --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ - --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ - --lr-scheduler polynomial_decay --lr 3e-05 --min-lr -1 --warmup-updates 2500 --total-num-update 40000 \ - --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ - --max-tokens 1024 --update-freq 2 \ - --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ - --seed 222 --log-format simple --log-interval 2 \ - --restore-file $data_path/mbart.cc25/model.pt \ - --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \ - --langs $langs \ - --max-epoch $train_epochs \ - --ddp-backend no_c10d > ${cur_path}/output/${RANK_ID}/train_${RANK_ID}.log 2>&1 & - fi -done -wait - -ASCEND_DEVICE_ID=0 -#训练结束时间,不需要修改 -end_time=$(date +%s) -e2e_time=$(( $end_time - $start_time )) - - -#结果打印,不需要修改 -echo "------------------ Final result ------------------" -#输出性能WPS,需要模型审视修改 -WPS=`grep 'train_inner ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "wps=" '{print $NF}'|awk -F "wps" '{print $1}'|awk -F "," '{print $1}'|awk 'END {print}'` -train_wall=`grep 'train_inner ' $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk 'NR==1{min=$1;next}{min=min<$1?min:$1}END{print min}'|awk -F "," '{print$1}'` -#打印,不需要修改 -echo "Final Performance images/sec : $WPS" -echo "E2E Training Duration sec : $e2e_time" - -#性能看护结果汇总 -#训练用例信息,不需要修改 -TokenSize=${token_size} -DeviceType=`uname -m` -CaseName=${Network}_bs${TokenSize}_${RANK_SIZE}'p'_'perf' - -##获取性能数据,不需要修改 -#吞吐量 -ActualWPS=${WPS} -#单迭代训练时长 -TrainingTime=${train_wall} - -#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 -grep -r "loss=" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk -F "loss=" '{print $2}' |awk -F "," '{print $1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt - -#最后一个迭代loss值,不需要修改 -ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` - -#关键信息打印到${CaseName}.log中,不需要修改 -echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "BatchSize = ${TokenSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualFPS = ${ActualWPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log -echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log - -sed -i "s|#checkpoint_utils.save_checkpoint(|checkpoint_utils.save_checkpoint(|g" $cur_path/../fairseq_cli/train.py \ No newline at end of file diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/gpu/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/gpu/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/gpu/test_binaries_gpu.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/gpu/test_binaries_gpu.py deleted file mode 100644 index 2ac60a09341746f3155ed4e876f6a93bfb8337cf..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/gpu/test_binaries_gpu.py +++ /dev/null @@ -1,305 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import contextlib -import logging -import os -import tempfile -import unittest -from io import StringIO - -import torch -from fairseq import options -from fairseq_cli import train -from tests.utils import ( - create_dummy_data, - generate_main, - preprocess_lm_data, - preprocess_translation_data, - train_translation_model, -) - - -class TestTranslationGPU(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_fp16(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_fp16") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model(data_dir, "fconv_iwslt_de_en", ["--fp16"]) - generate_main(data_dir) - - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_memory_efficient_fp16(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_memory_efficient_fp16") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, "fconv_iwslt_de_en", ["--memory-efficient-fp16"] - ) - generate_main(data_dir) - - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_transformer_fp16(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_transformer") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "transformer_iwslt_de_en", - [ - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "64", - "--decoder-embed-dim", - "64", - "--fp16", - ], - run_validation=True, - ) - generate_main(data_dir) - - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_levenshtein_transformer(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_levenshtein_transformer" - ) as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir, ["--joined-dictionary"]) - train_translation_model( - data_dir, - "levenshtein_transformer", - [ - "--apply-bert-init", - "--early-exit", - "6,6,6", - "--criterion", - "nat_loss", - ], - task="translation_lev", - ) - generate_main( - data_dir, - [ - "--task", - "translation_lev", - "--iter-decode-max-iter", - "9", - "--iter-decode-eos-penalty", - "0", - "--print-step", - ], - ) - - -def 
_quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False): - train_parser = options.get_training_parser() - train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - "language_modeling", - data_dir, - "--arch", - arch, - "--optimizer", - "adam", - "--lr", - "0.0001", - "--criterion", - "adaptive_loss", - "--adaptive-softmax-cutoff", - "5,10,15", - "--max-tokens", - "500", - "--tokens-per-sample", - "500", - "--save-dir", - data_dir, - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--ddp-backend", - "no_c10d", - "--num-workers", - "0", - ] - + (extra_flags or []), - ) - train.main(train_args) - - # try scalar quantization - scalar_quant_train_parser = options.get_training_parser() - scalar_quant_train_args = options.parse_args_and_arch( - scalar_quant_train_parser, - [ - "--task", - "language_modeling", - data_dir, - "--arch", - arch, - "--optimizer", - "adam", - "--lr", - "0.0001", - "--criterion", - "adaptive_loss", - "--adaptive-softmax-cutoff", - "5,10,15", - "--max-tokens", - "500", - "--tokens-per-sample", - "500", - "--save-dir", - data_dir, - "--max-update", - "3", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--ddp-backend", - "no_c10d", - "--num-workers", - "0", - "--quant-noise-scalar", - "0.5", - ] - + (extra_flags or []), - ) - train.main(scalar_quant_train_args) - - # try iterative PQ quantization - quantize_parser = options.get_training_parser() - quantize_args = options.parse_args_and_arch( - quantize_parser, - [ - "--task", - "language_modeling", - data_dir, - "--arch", - arch, - "--optimizer", - "adam", - "--lr", - "0.0001", - "--criterion", - "adaptive_loss", - "--adaptive-softmax-cutoff", - "5,10,15", - "--max-tokens", - "50", - "--tokens-per-sample", - "50", - "--max-update", - "6", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--ddp-backend", - "no_c10d", - "--num-workers", - "0", - "--restore-file", - os.path.join(data_dir, "checkpoint_last.pt"), - "--reset-optimizer", - "--quantization-config-path", - os.path.join( - os.path.dirname(__file__), "transformer_quantization_config.yaml" - ), - ] - + (extra_flags or []), - ) - train.main(quantize_args) - - -class TestQuantization(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_quantization(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_quantization") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - # tests both scalar and iterative PQ quantization - _quantize_language_model(data_dir, "transformer_lm") - - -class TestOptimizersGPU(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_flat_grads(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_flat_grads") as data_dir: - # Use just a bit of data and tiny model to keep this test runtime reasonable - create_dummy_data(data_dir, num_examples=10, maxlen=5) - preprocess_translation_data(data_dir) - with self.assertRaises(RuntimeError): - # adafactor isn't compatible with flat grads, which - # are used by default with --fp16 - train_translation_model( - data_dir, - "lstm", - [ - "--required-batch-size-multiple", - "1", - 
"--encoder-layers", - "1", - "--encoder-hidden-size", - "32", - "--decoder-layers", - "1", - "--optimizer", - "adafactor", - "--fp16", - ], - ) - # but it should pass once we set --fp16-no-flatten-grads - train_translation_model( - data_dir, - "lstm", - [ - "--required-batch-size-multiple", - "1", - "--encoder-layers", - "1", - "--encoder-hidden-size", - "32", - "--decoder-layers", - "1", - "--optimizer", - "adafactor", - "--fp16", - "--fp16-no-flatten-grads", - ], - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/gpu/transformer_quantization_config.yaml b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/gpu/transformer_quantization_config.yaml deleted file mode 100644 index de31d8116ced675b81eb74119642217d768e7736..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/gpu/transformer_quantization_config.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -# This file defines example configuration arguments for quantizing -# a transformer model with product quantization - -n_centroids: - Linear: - key: in_features - value: {"*": 8} - Embedding: - key: embedding_dim - value: {"*": 8} - -block_sizes: - Linear: - key: fuzzy_name - value: {fc: 8, attn: 4, emb: 4} - Embedding: - key: fuzzy_name - value: {emb: 8} - -layers_to_quantize: - - decoder\\.layers\\.\d+\\.fc[12] - - decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01] - - decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/__init__.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/asr_test_base.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/asr_test_base.py deleted file mode 100644 index 2383ae7e93e54cd35e83ec1b8130a51534b36cb6..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/asr_test_base.py +++ /dev/null @@ -1,589 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ============================================================================ -#!/usr/bin/env python3 - -import argparse -import os -import unittest -from inspect import currentframe, getframeinfo - -import numpy as np -import torch -from examples.speech_recognition.data.data_utils import lengths_to_encoder_padding_mask -from fairseq.data import data_utils as fairseq_data_utils -from fairseq.data.dictionary import Dictionary -from fairseq.models import ( - BaseFairseqModel, - FairseqDecoder, - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqEncoderModel, - FairseqModel, -) -from fairseq.tasks.fairseq_task import LegacyFairseqTask - - -DEFAULT_TEST_VOCAB_SIZE = 100 - - -# /////////////////////////////////////////////////////////////////////////// -# utility function to setup dummy dict/task/input -# /////////////////////////////////////////////////////////////////////////// - - -def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE): - dummy_dict = Dictionary() - # add dummy symbol to satisfy vocab size - for id, _ in enumerate(range(vocab_size)): - dummy_dict.add_symbol("{}".format(id), 1000) - return dummy_dict - - -class DummyTask(LegacyFairseqTask): - def __init__(self, args): - super().__init__(args) - self.dictionary = get_dummy_dictionary() - if getattr(self.args, "ctc", False): - self.dictionary.add_symbol("") - self.tgt_dict = self.dictionary - - @property - def target_dictionary(self): - return self.dictionary - - -def get_dummy_task_and_parser(): - """ - to build a fariseq model, we need some dummy parse and task. This function - is used to create dummy task and parser to faciliate model/criterion test - - Note: we use FbSpeechRecognitionTask as the dummy task. 
You may want - to use other task by providing another function - """ - parser = argparse.ArgumentParser( - description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS - ) - DummyTask.add_args(parser) - args = parser.parse_args([]) - task = DummyTask.setup_task(args) - return task, parser - - -def get_dummy_input(T=100, D=80, B=5, K=100): - forward_input = {} - # T max sequence length - # D feature vector dimension - # B batch size - # K target dimension size - feature = torch.randn(B, T, D) - # this (B, T, D) layout is just a convention, you can override it by - # write your own _prepare_forward_input function - src_lengths = torch.from_numpy( - np.random.randint(low=1, high=T, size=B, dtype=np.int64) - ) - src_lengths[0] = T # make sure the maximum length matches - prev_output_tokens = [] - for b in range(B): - token_length = np.random.randint(low=1, high=src_lengths[b].item() + 1) - tokens = np.random.randint(low=0, high=K, size=token_length, dtype=np.int64) - prev_output_tokens.append(torch.from_numpy(tokens)) - - prev_output_tokens = fairseq_data_utils.collate_tokens( - prev_output_tokens, - pad_idx=1, - eos_idx=2, - left_pad=False, - move_eos_to_beginning=False, - ) - src_lengths, sorted_order = src_lengths.sort(descending=True) - forward_input["src_tokens"] = feature.index_select(0, sorted_order) - forward_input["src_lengths"] = src_lengths - forward_input["prev_output_tokens"] = prev_output_tokens - - return forward_input - - -def get_dummy_encoder_output(encoder_out_shape=(100, 80, 5)): - """ - This only provides an example to generate dummy encoder output - """ - (T, B, D) = encoder_out_shape - encoder_out = {} - - encoder_out["encoder_out"] = torch.from_numpy( - np.random.randn(*encoder_out_shape).astype(np.float32) - ) - seq_lengths = torch.from_numpy(np.random.randint(low=1, high=T, size=B)) - # some dummy mask - encoder_out["encoder_padding_mask"] = torch.arange(T).view(1, T).expand( - B, -1 - ) >= seq_lengths.view(B, 1).expand(-1, T) - encoder_out["encoder_padding_mask"].t_() - - # encoer_padding_mask is (T, B) tensor, with (t, b)-th element indicate - # whether encoder_out[t, b] is valid (=0) or not (=1) - return encoder_out - - -def _current_postion_info(): - cf = currentframe() - frameinfo = " (at {}:{})".format( - os.path.basename(getframeinfo(cf).filename), cf.f_back.f_lineno - ) - return frameinfo - - -def check_encoder_output(encoder_output, batch_size=None): - """we expect encoder_output to be a dict with the following - key/value pairs: - - encoder_out: a Torch.Tensor - - encoder_padding_mask: a binary Torch.Tensor - """ - if not isinstance(encoder_output, dict): - msg = ( - "FairseqEncoderModel.forward(...) must be a dict" + _current_postion_info() - ) - return False, msg - - if "encoder_out" not in encoder_output: - msg = ( - "FairseqEncoderModel.forward(...) must contain encoder_out" - + _current_postion_info() - ) - return False, msg - - if "encoder_padding_mask" not in encoder_output: - msg = ( - "FairseqEncoderModel.forward(...) 
must contain encoder_padding_mask" - + _current_postion_info() - ) - return False, msg - - if not isinstance(encoder_output["encoder_out"], torch.Tensor): - msg = "encoder_out must be a torch.Tensor" + _current_postion_info() - return False, msg - - if encoder_output["encoder_out"].dtype != torch.float32: - msg = "encoder_out must have float32 dtype" + _current_postion_info() - return False, msg - - mask = encoder_output["encoder_padding_mask"] - if mask is not None: - if not isinstance(mask, torch.Tensor): - msg = ( - "encoder_padding_mask must be a torch.Tensor" + _current_postion_info() - ) - return False, msg - if mask.dtype != torch.uint8 and ( - not hasattr(torch, "bool") or mask.dtype != torch.bool - ): - msg = ( - "encoder_padding_mask must have dtype of uint8" - + _current_postion_info() - ) - return False, msg - - if mask.dim() != 2: - msg = ( - "we expect encoder_padding_mask to be a 2-d tensor, in shape (T, B)" - + _current_postion_info() - ) - return False, msg - - if batch_size is not None and mask.size(1) != batch_size: - msg = ( - "we expect encoder_padding_mask to be a 2-d tensor, with size(1)" - + " being the batch size" - + _current_postion_info() - ) - return False, msg - return True, None - - -def check_decoder_output(decoder_output): - """we expect output from a decoder is a tuple with the following constraint: - - the first element is a torch.Tensor - - the second element can be anything (reserved for future use) - """ - if not isinstance(decoder_output, tuple): - msg = "FariseqDecoder output must be a tuple" + _current_postion_info() - return False, msg - - if len(decoder_output) != 2: - msg = "FairseqDecoder output must be 2-elem tuple" + _current_postion_info() - return False, msg - - if not isinstance(decoder_output[0], torch.Tensor): - msg = ( - "FariseqDecoder output[0] must be a torch.Tensor" + _current_postion_info() - ) - return False, msg - - return True, None - - -# /////////////////////////////////////////////////////////////////////////// -# Base Test class -# /////////////////////////////////////////////////////////////////////////// - - -class TestBaseFairseqModelBase(unittest.TestCase): - """ - This class is used to facilitate writing unittest for any class derived from - `BaseFairseqModel`. 
- """ - - @classmethod - def setUpClass(cls): - if cls is TestBaseFairseqModelBase: - raise unittest.SkipTest("Skipping test case in base") - super().setUpClass() - - def setUpModel(self, model): - self.assertTrue(isinstance(model, BaseFairseqModel)) - self.model = model - - def setupInput(self): - pass - - def setUp(self): - self.model = None - self.forward_input = None - pass - - -class TestFairseqEncoderDecoderModelBase(TestBaseFairseqModelBase): - """ - base code to test FairseqEncoderDecoderModel (formally known as - `FairseqModel`) must be derived from this base class - """ - - @classmethod - def setUpClass(cls): - if cls is TestFairseqEncoderDecoderModelBase: - raise unittest.SkipTest("Skipping test case in base") - super().setUpClass() - - def setUpModel(self, model_cls, extra_args_setters=None): - self.assertTrue( - issubclass(model_cls, (FairseqEncoderDecoderModel, FairseqModel)), - msg="This class only tests for FairseqModel subclasses", - ) - - task, parser = get_dummy_task_and_parser() - model_cls.add_args(parser) - - args = parser.parse_args([]) - if extra_args_setters is not None: - for args_setter in extra_args_setters: - args_setter(args) - model = model_cls.build_model(args, task) - self.model = model - - def setUpInput(self, input=None): - self.forward_input = get_dummy_input() if input is None else input - - def setUp(self): - super().setUp() - - def test_forward(self): - if self.model and self.forward_input: - forward_output = self.model.forward(**self.forward_input) - # for FairseqEncoderDecoderModel, forward returns a tuple of two - # elements, the first one is a Torch.Tensor - succ, msg = check_decoder_output(forward_output) - if not succ: - self.assertTrue(succ, msg=msg) - self.forward_output = forward_output - - def test_get_normalized_probs(self): - if self.model and self.forward_input: - forward_output = self.model.forward(**self.forward_input) - logprob = self.model.get_normalized_probs(forward_output, log_probs=True) - prob = self.model.get_normalized_probs(forward_output, log_probs=False) - - # in order for different models/criterion to play with each other - # we need to know whether the logprob or prob output is batch_first - # or not. We assume an additional attribute will be attached to logprob - # or prob. 
If you find your code failed here, simply override - # FairseqModel.get_normalized_probs, see example at - # https://fburl.com/batch_first_example - self.assertTrue(hasattr(logprob, "batch_first")) - self.assertTrue(hasattr(prob, "batch_first")) - - self.assertTrue(torch.is_tensor(logprob)) - self.assertTrue(torch.is_tensor(prob)) - - -class TestFairseqEncoderModelBase(TestBaseFairseqModelBase): - """ - base class to test FairseqEncoderModel - """ - - @classmethod - def setUpClass(cls): - if cls is TestFairseqEncoderModelBase: - raise unittest.SkipTest("Skipping test case in base") - super().setUpClass() - - def setUpModel(self, model_cls, extra_args_setters=None): - self.assertTrue( - issubclass(model_cls, FairseqEncoderModel), - msg="This class is only used for testing FairseqEncoderModel", - ) - task, parser = get_dummy_task_and_parser() - model_cls.add_args(parser) - args = parser.parse_args([]) - if extra_args_setters is not None: - for args_setter in extra_args_setters: - args_setter(args) - - model = model_cls.build_model(args, task) - self.model = model - - def setUpInput(self, input=None): - self.forward_input = get_dummy_input() if input is None else input - # get_dummy_input() is originally for s2s, here we delete extra dict - # items, so it can be used for EncoderModel / Encoder as well - self.forward_input.pop("prev_output_tokens", None) - - def setUp(self): - super().setUp() - - def test_forward(self): - if self.forward_input and self.model: - bsz = self.forward_input["src_tokens"].size(0) - forward_output = self.model.forward(**self.forward_input) - - # we expect forward_output to be a dict with the following - # key/value pairs: - # - encoder_out: a Torch.Tensor - # - encoder_padding_mask: a binary Torch.Tensor - succ, msg = check_encoder_output(forward_output, batch_size=bsz) - if not succ: - self.assertTrue(succ, msg=msg) - self.forward_output = forward_output - - def test_get_normalized_probs(self): - if self.model and self.forward_input: - forward_output = self.model.forward(**self.forward_input) - logprob = self.model.get_normalized_probs(forward_output, log_probs=True) - prob = self.model.get_normalized_probs(forward_output, log_probs=False) - - # in order for different models/criterion to play with each other - # we need to know whether the logprob or prob output is batch_first - # or not. We assume an additional attribute will be attached to logprob - # or prob. 
If you find your code failed here, simply override - # FairseqModel.get_normalized_probs, see example at - # https://fburl.com/batch_first_example - self.assertTrue(hasattr(logprob, "batch_first")) - self.assertTrue(hasattr(prob, "batch_first")) - - self.assertTrue(torch.is_tensor(logprob)) - self.assertTrue(torch.is_tensor(prob)) - - -class TestFairseqEncoderBase(unittest.TestCase): - """ - base class to test FairseqEncoder - """ - - @classmethod - def setUpClass(cls): - if cls is TestFairseqEncoderBase: - raise unittest.SkipTest("Skipping test case in base") - super().setUpClass() - - def setUpEncoder(self, encoder): - self.assertTrue( - isinstance(encoder, FairseqEncoder), - msg="This class is only used for test FairseqEncoder", - ) - self.encoder = encoder - - def setUpInput(self, input=None): - self.forward_input = get_dummy_input() if input is None else input - # get_dummy_input() is originally for s2s, here we delete extra dict - # items, so it can be used for EncoderModel / Encoder as well - self.forward_input.pop("prev_output_tokens", None) - - def setUp(self): - self.encoder = None - self.forward_input = None - - def test_forward(self): - if self.encoder and self.forward_input: - bsz = self.forward_input["src_tokens"].size(0) - - forward_output = self.encoder.forward(**self.forward_input) - succ, msg = check_encoder_output(forward_output, batch_size=bsz) - if not succ: - self.assertTrue(succ, msg=msg) - self.forward_output = forward_output - - -class TestFairseqDecoderBase(unittest.TestCase): - """ - base class to test FairseqDecoder - """ - - @classmethod - def setUpClass(cls): - if cls is TestFairseqDecoderBase: - raise unittest.SkipTest("Skipping test case in base") - super().setUpClass() - - def setUpDecoder(self, decoder): - self.assertTrue( - isinstance(decoder, FairseqDecoder), - msg="This class is only used for test FairseqDecoder", - ) - self.decoder = decoder - - def setUpInput(self, input=None): - self.forward_input = get_dummy_encoder_output() if input is None else input - - def setUpPrevOutputTokens(self, tokens=None): - if tokens is None: - self.encoder_input = get_dummy_input() - self.prev_output_tokens = self.encoder_input["prev_output_tokens"] - else: - self.prev_output_tokens = tokens - - def setUp(self): - self.decoder = None - self.forward_input = None - self.prev_output_tokens = None - - def test_forward(self): - if ( - self.decoder is not None - and self.forward_input is not None - and self.prev_output_tokens is not None - ): - forward_output = self.decoder.forward( - prev_output_tokens=self.prev_output_tokens, - encoder_out=self.forward_input, - ) - succ, msg = check_decoder_output(forward_output) - if not succ: - self.assertTrue(succ, msg=msg) - self.forward_input = forward_output - - -class DummyEncoderModel(FairseqEncoderModel): - def __init__(self, encoder): - super().__init__(encoder) - - @classmethod - def build_model(cls, args, task): - return cls(DummyEncoder()) - - def get_logits(self, net_output): - # Inverse of sigmoid to use with BinaryCrossEntropyWithLogitsCriterion as - # F.binary_cross_entropy_with_logits combines sigmoid and CE - return torch.log( - torch.div(net_output["encoder_out"], 1 - net_output["encoder_out"]) - ) - - def get_normalized_probs(self, net_output, log_probs, sample=None): - lprobs = super().get_normalized_probs(net_output, log_probs, sample=sample) - lprobs.batch_first = True - return lprobs - - -class DummyEncoder(FairseqEncoder): - def __init__(self): - super().__init__(None) - - def forward(self, src_tokens, 
src_lengths): - mask, max_len = lengths_to_encoder_padding_mask(src_lengths) - return {"encoder_out": src_tokens, "encoder_padding_mask": mask} - - -class CrossEntropyCriterionTestBase(unittest.TestCase): - @classmethod - def setUpClass(cls): - if cls is CrossEntropyCriterionTestBase: - raise unittest.SkipTest("Skipping base class test case") - super().setUpClass() - - def setUpArgs(self): - args = argparse.Namespace() - args.sentence_avg = False - args.threshold = 0.1 # to use with BinaryCrossEntropyWithLogitsCriterion - return args - - def setUp(self): - args = self.setUpArgs() - self.model = DummyEncoderModel(encoder=DummyEncoder()) - self.criterion = self.criterion_cls.build_criterion( - args=args, task=DummyTask(args) - ) - - def get_src_tokens(self, correct_prediction, aggregate): - """ - correct_prediction: True if the net_output (src_tokens) should - predict the correct target - aggregate: True if the criterion expects net_output (src_tokens) - aggregated across time axis - """ - predicted_idx = 0 if correct_prediction else 1 - if aggregate: - src_tokens = torch.zeros((2, 2), dtype=torch.float) - for b in range(2): - src_tokens[b][predicted_idx] = 1.0 - else: - src_tokens = torch.zeros((2, 10, 2), dtype=torch.float) - for b in range(2): - for t in range(10): - src_tokens[b][t][predicted_idx] = 1.0 - return src_tokens - - def get_target(self, soft_target): - if soft_target: - target = torch.zeros((2, 2), dtype=torch.float) - for b in range(2): - target[b][0] = 1.0 - else: - target = torch.zeros((2, 10), dtype=torch.long) - return target - - def get_test_sample(self, correct, soft_target, aggregate): - src_tokens = self.get_src_tokens(correct, aggregate) - target = self.get_target(soft_target) - L = src_tokens.size(1) - return { - "net_input": {"src_tokens": src_tokens, "src_lengths": torch.tensor([L])}, - "target": target, - "ntokens": src_tokens.size(0) * src_tokens.size(1), - } diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_collaters.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_collaters.py deleted file mode 100644 index 6a5029a48faea2426d7a0277655a2c7c08c1d16c..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_collaters.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -import numpy as np -import torch -from examples.speech_recognition.data.collaters import Seq2SeqCollater - - -class TestSeq2SeqCollator(unittest.TestCase): - def test_collate(self): - - eos_idx = 1 - pad_idx = 0 - collater = Seq2SeqCollater( - feature_index=0, label_index=1, pad_index=pad_idx, eos_index=eos_idx - ) - - # 2 frames in the first sample and 3 frames in the second one - frames1 = np.array([[7, 8], [9, 10]]) - frames2 = np.array([[1, 2], [3, 4], [5, 6]]) - target1 = np.array([4, 2, 3, eos_idx]) - target2 = np.array([3, 2, eos_idx]) - sample1 = {"id": 0, "data": [frames1, target1]} - sample2 = {"id": 1, "data": [frames2, target2]} - batch = collater.collate([sample1, sample2]) - - # collate sort inputs by frame's length before creating the batch - self.assertTensorEqual(batch["id"], torch.tensor([1, 0])) - self.assertEqual(batch["ntokens"], 7) - self.assertTensorEqual( - batch["net_input"]["src_tokens"], - torch.tensor( - [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [pad_idx, pad_idx]]] - ), - ) - self.assertTensorEqual( - batch["net_input"]["prev_output_tokens"], - torch.tensor([[eos_idx, 3, 2, pad_idx], [eos_idx, 4, 2, 3]]), - ) - self.assertTensorEqual(batch["net_input"]["src_lengths"], torch.tensor([3, 2])) - self.assertTensorEqual( - batch["target"], - torch.tensor([[3, 2, eos_idx, pad_idx], [4, 2, 3, eos_idx]]), - ) - self.assertEqual(batch["nsentences"], 2) - - def assertTensorEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertEqual(t1.ne(t2).long().sum(), 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_cross_entropy.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_cross_entropy.py deleted file mode 100644 index b05400ed95e22762c3e3e5e8fd3ebfa6caf1e325..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_cross_entropy.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
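The Seq2SeqCollater test above encodes two conventions: samples are sorted by descending frame count before padding, and prev_output_tokens is the target sequence with EOS rotated to the front (the standard teacher-forcing decoder input). A tiny sketch of that rotation in plain PyTorch — not the Seq2SeqCollater API, just the shift the test expects it to produce:

import torch

eos_idx = 1
target = torch.tensor([4, 2, 3, eos_idx])  # same toy target as sample1 in the test
# Teacher forcing: the decoder input starts with EOS and drops the target's final EOS.
prev_output_tokens = torch.cat([target.new_tensor([eos_idx]), target[:-1]])
print(prev_output_tokens.tolist())  # [1, 4, 2, 3]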
- -from examples.speech_recognition.criterions.cross_entropy_acc import ( - CrossEntropyWithAccCriterion, -) - -from .asr_test_base import CrossEntropyCriterionTestBase - - -class CrossEntropyWithAccCriterionTest(CrossEntropyCriterionTestBase): - def setUp(self): - self.criterion_cls = CrossEntropyWithAccCriterion - super().setUp() - - def test_cross_entropy_all_correct(self): - sample = self.get_test_sample(correct=True, soft_target=False, aggregate=False) - loss, sample_size, logging_output = self.criterion( - self.model, sample, "sum", log_probs=True - ) - assert logging_output["correct"] == 20 - assert logging_output["total"] == 20 - assert logging_output["sample_size"] == 20 - assert logging_output["ntokens"] == 20 - - def test_cross_entropy_all_wrong(self): - sample = self.get_test_sample(correct=False, soft_target=False, aggregate=False) - loss, sample_size, logging_output = self.criterion( - self.model, sample, "sum", log_probs=True - ) - assert logging_output["correct"] == 0 - assert logging_output["total"] == 20 - assert logging_output["sample_size"] == 20 - assert logging_output["ntokens"] == 20 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_data_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_data_utils.py deleted file mode 100644 index a72e0b66948da1349d87eafdef4c4004dd535c96..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_data_utils.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -import unittest - -import torch -from examples.speech_recognition.data import data_utils - - -class DataUtilsTest(unittest.TestCase): - def test_normalization(self): - sample_len1 = torch.tensor( - [ - [ - -0.7661, - -1.3889, - -2.0972, - -0.9134, - -0.7071, - -0.9765, - -0.8700, - -0.8283, - 0.7512, - 1.3211, - 2.1532, - 2.1174, - 1.2800, - 1.2633, - 1.6147, - 1.6322, - 2.0723, - 3.1522, - 3.2852, - 2.2309, - 2.5569, - 2.2183, - 2.2862, - 1.5886, - 0.8773, - 0.8725, - 1.2662, - 0.9899, - 1.1069, - 1.3926, - 1.2795, - 1.1199, - 1.1477, - 1.2687, - 1.3843, - 1.1903, - 0.8355, - 1.1367, - 1.2639, - 1.4707, - ] - ] - ) - out = data_utils.apply_mv_norm(sample_len1) - assert not torch.isnan(out).any() - assert (out == sample_len1).all() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_vggtransformer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_vggtransformer.py deleted file mode 100644 index e5b09ed617cac1cbb073cf79c161ed2a5286f265..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/speech_recognition/test_vggtransformer.py +++ /dev/null @@ -1,166 +0,0 @@ -# BSD 3-Clause License -# -# Copyright (c) 2017 xxxx -# All rights reserved. -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
-# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# ============================================================================ -#!/usr/bin/env python3 - -# import models/encoder/decoder to be tested -from examples.speech_recognition.models.vggtransformer import ( - TransformerDecoder, - VGGTransformerEncoder, - VGGTransformerModel, - vggtransformer_1, - vggtransformer_2, - vggtransformer_base, -) - -# import base test class -from .asr_test_base import ( - DEFAULT_TEST_VOCAB_SIZE, - TestFairseqDecoderBase, - TestFairseqEncoderBase, - TestFairseqEncoderDecoderModelBase, - get_dummy_dictionary, - get_dummy_encoder_output, - get_dummy_input, -) - - -class VGGTransformerModelTest_mid(TestFairseqEncoderDecoderModelBase): - def setUp(self): - def override_config(args): - """ - vggtrasformer_1 use 14 layers of transformer, - for testing purpose, it is too expensive. For fast turn-around - test, reduce the number of layers to 3. - """ - args.transformer_enc_config = ( - "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3" - ) - - super().setUp() - extra_args_setter = [vggtransformer_1, override_config] - - self.setUpModel(VGGTransformerModel, extra_args_setter) - self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) - - -class VGGTransformerModelTest_big(TestFairseqEncoderDecoderModelBase): - def setUp(self): - def override_config(args): - """ - vggtrasformer_2 use 16 layers of transformer, - for testing purpose, it is too expensive. For fast turn-around - test, reduce the number of layers to 3. - """ - args.transformer_enc_config = ( - "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3" - ) - - super().setUp() - extra_args_setter = [vggtransformer_2, override_config] - - self.setUpModel(VGGTransformerModel, extra_args_setter) - self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) - - -class VGGTransformerModelTest_base(TestFairseqEncoderDecoderModelBase): - def setUp(self): - def override_config(args): - """ - vggtrasformer_base use 12 layers of transformer, - for testing purpose, it is too expensive. For fast turn-around - test, reduce the number of layers to 3. 
- """ - args.transformer_enc_config = ( - "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 3" - ) - - super().setUp() - extra_args_setter = [vggtransformer_base, override_config] - - self.setUpModel(VGGTransformerModel, extra_args_setter) - self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) - - -class VGGTransformerEncoderTest(TestFairseqEncoderBase): - def setUp(self): - super().setUp() - - self.setUpInput(get_dummy_input(T=50, D=80, B=5)) - - def test_forward(self): - print("1. test standard vggtransformer") - self.setUpEncoder(VGGTransformerEncoder(input_feat_per_channel=80)) - super().test_forward() - print("2. test vggtransformer with limited right context") - self.setUpEncoder( - VGGTransformerEncoder( - input_feat_per_channel=80, transformer_context=(-1, 5) - ) - ) - super().test_forward() - print("3. test vggtransformer with limited left context") - self.setUpEncoder( - VGGTransformerEncoder( - input_feat_per_channel=80, transformer_context=(5, -1) - ) - ) - super().test_forward() - print("4. test vggtransformer with limited right context and sampling") - self.setUpEncoder( - VGGTransformerEncoder( - input_feat_per_channel=80, - transformer_context=(-1, 12), - transformer_sampling=(2, 2), - ) - ) - super().test_forward() - print("5. test vggtransformer with windowed context and sampling") - self.setUpEncoder( - VGGTransformerEncoder( - input_feat_per_channel=80, - transformer_context=(12, 12), - transformer_sampling=(2, 2), - ) - ) - - -class TransformerDecoderTest(TestFairseqDecoderBase): - def setUp(self): - super().setUp() - - dict = get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE) - decoder = TransformerDecoder(dict) - dummy_encoder_output = get_dummy_encoder_output(encoder_out_shape=(50, 5, 256)) - - self.setUpDecoder(decoder) - self.setUpInput(dummy_encoder_output) - self.setUpPrevOutputTokens() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_average_checkpoints.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_average_checkpoints.py deleted file mode 100644 index f348b56b869372d8434fe03f13324d78e9093fa2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_average_checkpoints.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
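The file removed next exercises scripts.average_checkpoints, whose expected behaviour (per the assertions below) is an element-wise mean of each parameter across checkpoints, with integer tensors reduced by integer division. A minimal sketch of that averaging rule — average_state_dicts is a made-up name for illustration, not the fairseq implementation:

import collections

import torch


def average_state_dicts(state_dicts):
    """Element-wise mean of matching keys; integer tensors use integer division,
    mirroring the truncation the test below expects."""
    n = len(state_dicts)
    averaged = collections.OrderedDict()
    for key in state_dicts[0]:
        total = state_dicts[0][key].clone()
        for sd in state_dicts[1:]:
            total += sd[key]
        if torch.is_floating_point(total):
            averaged[key] = total / n
        else:
            averaged[key] = total // n  # integer division for integer tensors
    return averaged


p0 = {"c": torch.tensor([7, 8, 9], dtype=torch.int32)}
p1 = {"c": torch.tensor([2, 2, 2], dtype=torch.int32)}
print(average_state_dicts([p0, p1])["c"].tolist())  # [4, 5, 5], as the test asserts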
- -import collections -import os -import shutil -import tempfile -import unittest - -import numpy as np -import torch -from scripts.average_checkpoints import average_checkpoints -from torch import nn - - -class ModelWithSharedParameter(nn.Module): - def __init__(self): - super(ModelWithSharedParameter, self).__init__() - self.embedding = nn.Embedding(1000, 200) - self.FC1 = nn.Linear(200, 200) - self.FC2 = nn.Linear(200, 200) - # tie weight in FC2 to FC1 - self.FC2.weight = nn.Parameter(self.FC1.weight) - self.FC2.bias = nn.Parameter(self.FC1.bias) - - self.relu = nn.ReLU() - - def forward(self, input): - return self.FC2(self.ReLU(self.FC1(input))) + self.FC1(input) - - -class TestAverageCheckpoints(unittest.TestCase): - def test_average_checkpoints(self): - params_0 = collections.OrderedDict( - [ - ("a", torch.DoubleTensor([100.0])), - ("b", torch.FloatTensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])), - ("c", torch.IntTensor([7, 8, 9])), - ] - ) - params_1 = collections.OrderedDict( - [ - ("a", torch.DoubleTensor([1.0])), - ("b", torch.FloatTensor([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])), - ("c", torch.IntTensor([2, 2, 2])), - ] - ) - params_avg = collections.OrderedDict( - [ - ("a", torch.DoubleTensor([50.5])), - ("b", torch.FloatTensor([[1.0, 1.5, 2.0], [2.5, 3.0, 3.5]])), - # We expect truncation for integer division - ("c", torch.IntTensor([4, 5, 5])), - ] - ) - - fd_0, path_0 = tempfile.mkstemp() - fd_1, path_1 = tempfile.mkstemp() - torch.save(collections.OrderedDict([("model", params_0)]), path_0) - torch.save(collections.OrderedDict([("model", params_1)]), path_1) - - output = average_checkpoints([path_0, path_1])["model"] - - os.close(fd_0) - os.remove(path_0) - os.close(fd_1) - os.remove(path_1) - - for (k_expected, v_expected), (k_out, v_out) in zip( - params_avg.items(), output.items() - ): - self.assertEqual( - k_expected, - k_out, - "Key mismatch - expected {} but found {}. 
" - "(Expected list of keys: {} vs actual list of keys: {})".format( - k_expected, k_out, params_avg.keys(), output.keys() - ), - ) - np.testing.assert_allclose( - v_expected.numpy(), - v_out.numpy(), - err_msg="Tensor value mismatch for key {}".format(k_expected), - ) - - def test_average_checkpoints_with_shared_parameters(self): - def _construct_model_with_shared_parameters(path, value): - m = ModelWithSharedParameter() - nn.init.constant_(m.FC1.weight, value) - torch.save({"model": m.state_dict()}, path) - return m - - tmpdir = tempfile.mkdtemp() - paths = [] - path = os.path.join(tmpdir, "m1.pt") - m1 = _construct_model_with_shared_parameters(path, 1.0) - paths.append(path) - - path = os.path.join(tmpdir, "m2.pt") - m2 = _construct_model_with_shared_parameters(path, 2.0) - paths.append(path) - - path = os.path.join(tmpdir, "m3.pt") - m3 = _construct_model_with_shared_parameters(path, 3.0) - paths.append(path) - - new_model = average_checkpoints(paths) - self.assertTrue( - torch.equal( - new_model["model"]["embedding.weight"], - (m1.embedding.weight + m2.embedding.weight + m3.embedding.weight) / 3.0, - ) - ) - - self.assertTrue( - torch.equal( - new_model["model"]["FC1.weight"], - (m1.FC1.weight + m2.FC1.weight + m3.FC1.weight) / 3.0, - ) - ) - - self.assertTrue( - torch.equal( - new_model["model"]["FC2.weight"], - (m1.FC2.weight + m2.FC2.weight + m3.FC2.weight) / 3.0, - ) - ) - shutil.rmtree(tmpdir) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_backtranslation_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_backtranslation_dataset.py deleted file mode 100644 index dffc3b49387dfdc046ea23d7db179377040b7cbc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_backtranslation_dataset.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -import tests.utils as test_utils -import torch -from fairseq.data import ( - BacktranslationDataset, - LanguagePairDataset, - TransformEosDataset, -) -from fairseq.sequence_generator import SequenceGenerator - - -class TestBacktranslationDataset(unittest.TestCase): - def setUp(self): - ( - self.tgt_dict, - self.w1, - self.w2, - self.src_tokens, - self.src_lengths, - self.model, - ) = test_utils.sequence_generator_setup() - - dummy_src_samples = self.src_tokens - - self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples) - self.cuda = torch.cuda.is_available() - - def _backtranslation_dataset_helper( - self, - remove_eos_from_input_src, - remove_eos_from_output_src, - ): - tgt_dataset = LanguagePairDataset( - src=self.tgt_dataset, - src_sizes=self.tgt_dataset.sizes, - src_dict=self.tgt_dict, - tgt=None, - tgt_sizes=None, - tgt_dict=None, - ) - - generator = SequenceGenerator( - [self.model], - tgt_dict=self.tgt_dict, - max_len_a=0, - max_len_b=200, - beam_size=2, - unk_penalty=0, - ) - - backtranslation_dataset = BacktranslationDataset( - tgt_dataset=TransformEosDataset( - dataset=tgt_dataset, - eos=self.tgt_dict.eos(), - # remove eos from the input src - remove_eos_from_src=remove_eos_from_input_src, - ), - src_dict=self.tgt_dict, - backtranslation_fn=( - lambda sample: generator.generate([self.model], sample) - ), - output_collater=TransformEosDataset( - dataset=tgt_dataset, - eos=self.tgt_dict.eos(), - # if we remove eos from the input src, then we need to add it - # back to the output tgt - append_eos_to_tgt=remove_eos_from_input_src, - remove_eos_from_src=remove_eos_from_output_src, - ).collater, - cuda=self.cuda, - ) - dataloader = torch.utils.data.DataLoader( - backtranslation_dataset, - batch_size=2, - collate_fn=backtranslation_dataset.collater, - ) - backtranslation_batch_result = next(iter(dataloader)) - - eos, pad, w1, w2 = self.tgt_dict.eos(), self.tgt_dict.pad(), self.w1, self.w2 - - # Note that we sort by src_lengths and add left padding, so actually - # ids will look like: [1, 0] - expected_src = torch.LongTensor([[w1, w2, w1, eos], [pad, pad, w1, eos]]) - if remove_eos_from_output_src: - expected_src = expected_src[:, :-1] - expected_tgt = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]]) - generated_src = backtranslation_batch_result["net_input"]["src_tokens"] - tgt_tokens = backtranslation_batch_result["target"] - - self.assertTensorEqual(expected_src, generated_src) - self.assertTensorEqual(expected_tgt, tgt_tokens) - - def test_backtranslation_dataset_no_eos_in_output_src(self): - self._backtranslation_dataset_helper( - remove_eos_from_input_src=False, - remove_eos_from_output_src=True, - ) - - def test_backtranslation_dataset_with_eos_in_output_src(self): - self._backtranslation_dataset_helper( - remove_eos_from_input_src=False, - remove_eos_from_output_src=False, - ) - - def test_backtranslation_dataset_no_eos_in_input_src(self): - self._backtranslation_dataset_helper( - remove_eos_from_input_src=True, - remove_eos_from_output_src=False, - ) - - def assertTensorEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertEqual(t1.ne(t2).long().sum(), 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_binaries.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_binaries.py deleted file mode 100644 index 
ca18adea04fe6345d3b622f507b1a282fc461f49..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_binaries.py +++ /dev/null @@ -1,1571 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import contextlib -import logging -import os -import random -import sys -import tempfile -import unittest -from io import StringIO - -import torch -from fairseq import options -from fairseq_cli import eval_lm, train, validate -from tests.utils import ( - create_dummy_data, - generate_main, - preprocess_lm_data, - preprocess_summarization_data, - preprocess_translation_data, - train_translation_model, -) - - -class TestTranslation(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - def test_fconv(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_fconv") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model(data_dir, "fconv_iwslt_de_en") - generate_main(data_dir) - - def test_raw(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_fconv_raw") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir, ["--dataset-impl", "raw"]) - train_translation_model( - data_dir, "fconv_iwslt_de_en", ["--dataset-impl", "raw"] - ) - generate_main(data_dir, ["--dataset-impl", "raw"]) - - def test_update_freq(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_update_freq") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, "fconv_iwslt_de_en", ["--update-freq", "3"] - ) - generate_main(data_dir) - - def test_max_positions(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_max_positions") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - with self.assertRaises(Exception) as context: - train_translation_model( - data_dir, - "fconv_iwslt_de_en", - ["--max-target-positions", "5"], - ) - self.assertTrue( - "skip this example with --skip-invalid-size-inputs-valid-test" - in str(context.exception) - ) - train_translation_model( - data_dir, - "fconv_iwslt_de_en", - [ - "--max-target-positions", - "5", - "--skip-invalid-size-inputs-valid-test", - ], - ) - with self.assertRaises(Exception) as context: - generate_main(data_dir) - generate_main(data_dir, ["--skip-invalid-size-inputs-valid-test"]) - - def test_generation(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_sampling") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model(data_dir, "fconv_iwslt_de_en") - generate_main( - data_dir, - [ - "--sampling", - "--temperature", - "2", - "--beam", - "2", - "--nbest", - "2", - ], - ) - generate_main( - data_dir, - [ - "--sampling", - "--sampling-topk", - "3", - "--beam", - "2", - "--nbest", - "2", - ], - ) - generate_main( - data_dir, - [ - "--sampling", - "--sampling-topp", - "0.2", - "--beam", - "2", - "--nbest", - "2", - ], - ) - generate_main( - data_dir, - [ - "--diversity-rate", - "0.5", - "--beam", - "6", - ], - ) - with self.assertRaises(ValueError): - generate_main( - data_dir, - [ - 
"--diverse-beam-groups", - "4", - "--match-source-len", - ], - ) - generate_main(data_dir, ["--prefix-size", "2"]) - generate_main(data_dir, ["--retain-dropout"]) - - def test_eval_bleu(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_eval_bleu") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "fconv_iwslt_de_en", - [ - "--eval-bleu", - "--eval-bleu-print-samples", - "--eval-bleu-remove-bpe", - "--eval-bleu-detok", - "space", - "--eval-bleu-args", - '{"beam": 4, "min_len": 10}', - ], - ) - - def test_lstm(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_lstm") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "lstm_wiseman_iwslt_de_en", - [ - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--decoder-out-embed-dim", - "8", - ], - ) - generate_main(data_dir) - - def test_lstm_bidirectional(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_lstm_bidirectional") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "lstm", - [ - "--encoder-layers", - "2", - "--encoder-bidirectional", - "--encoder-hidden-size", - "16", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--decoder-out-embed-dim", - "8", - "--decoder-layers", - "2", - ], - ) - generate_main(data_dir) - - def test_transformer(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_transformer") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "transformer_iwslt_de_en", - [ - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - ], - run_validation=True, - ) - generate_main(data_dir) - - def test_multilingual_transformer(self): - # test with all combinations of encoder/decoder lang tokens - encoder_langtok_flags = [ - [], - ["--encoder-langtok", "src"], - ["--encoder-langtok", "tgt"], - ] - decoder_langtok_flags = [[], ["--decoder-langtok"]] - with contextlib.redirect_stdout(StringIO()): - for i in range(len(encoder_langtok_flags)): - for j in range(len(decoder_langtok_flags)): - enc_ltok_flag = encoder_langtok_flags[i] - dec_ltok_flag = decoder_langtok_flags[j] - with tempfile.TemporaryDirectory( - f"test_multilingual_transformer_{i}_{j}" - ) as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - arch="multilingual_transformer", - task="multilingual_translation", - extra_flags=[ - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - ] - + enc_ltok_flag - + dec_ltok_flag, - lang_flags=["--lang-pairs", "in-out,out-in"], - run_validation=True, - extra_valid_flags=enc_ltok_flag + dec_ltok_flag, - ) - generate_main( - data_dir, - extra_flags=[ - "--task", - "multilingual_translation", - "--lang-pairs", - "in-out,out-in", - "--source-lang", - "in", - "--target-lang", - "out", - ] - + enc_ltok_flag - + dec_ltok_flag, - ) - - @unittest.skipIf(sys.platform.lower() == "darwin", "skip latent depth test on MacOS") - def test_multilingual_translation_latent_depth(self): - # test with latent 
depth in encoder, decoder, or both - encoder_latent_layer = [[], ["--encoder-latent-layer"]] - decoder_latent_layer = [[], ["--decoder-latent-layer"]] - with contextlib.redirect_stdout(StringIO()): - for i in range(len(encoder_latent_layer)): - for j in range(len(decoder_latent_layer)): - if i == 0 and j == 0: - continue - enc_ll_flag = encoder_latent_layer[i] - dec_ll_flag = decoder_latent_layer[j] - with tempfile.TemporaryDirectory( - f"test_multilingual_translation_latent_depth_{i}_{j}" - ) as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data( - data_dir, extra_flags=["--joined-dictionary"] - ) - train_translation_model( - data_dir, - arch="latent_multilingual_transformer", - task="multilingual_translation_latent_depth", - extra_flags=[ - "--user-dir", - "examples/latent_depth/latent_depth_src", - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--share-encoders", - "--share-decoders", - "--sparsity-weight", - "0.1", - ] - + enc_ll_flag - + dec_ll_flag, - lang_flags=["--lang-pairs", "in-out,out-in"], - run_validation=True, - extra_valid_flags=[ - "--user-dir", - "examples/latent_depth/latent_depth_src", - ] - + enc_ll_flag - + dec_ll_flag, - ) - generate_main( - data_dir, - extra_flags=[ - "--user-dir", - "examples/latent_depth/latent_depth_src", - "--task", - "multilingual_translation_latent_depth", - "--lang-pairs", - "in-out,out-in", - "--source-lang", - "in", - "--target-lang", - "out", - ] - + enc_ll_flag - + dec_ll_flag, - ) - - def test_translation_multi_simple_epoch(self): - # test with all combinations of encoder/decoder lang tokens - encoder_langtok_flags = [ - [], - ["--encoder-langtok", "src"], - ["--encoder-langtok", "tgt"], - ] - decoder_langtok_flags = [[], ["--decoder-langtok"]] - with contextlib.redirect_stdout(StringIO()): - for i in range(len(encoder_langtok_flags)): - for j in range(len(decoder_langtok_flags)): - enc_ltok_flag = encoder_langtok_flags[i] - dec_ltok_flag = decoder_langtok_flags[j] - with tempfile.TemporaryDirectory( - f"test_translation_multi_simple_epoch_{i}_{j}" - ) as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data( - data_dir, extra_flags=["--joined-dictionary"] - ) - train_translation_model( - data_dir, - arch="transformer", - task="translation_multi_simple_epoch", - extra_flags=[ - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--sampling-method", - "temperature", - "--sampling-temperature", - "1.5", - "--virtual-epoch-size", - "1000", - ] - + enc_ltok_flag - + dec_ltok_flag, - lang_flags=["--lang-pairs", "in-out,out-in"], - run_validation=True, - extra_valid_flags=enc_ltok_flag + dec_ltok_flag, - ) - generate_main( - data_dir, - extra_flags=[ - "--task", - "translation_multi_simple_epoch", - "--lang-pairs", - "in-out,out-in", - "--source-lang", - "in", - "--target-lang", - "out", - ] - + enc_ltok_flag - + dec_ltok_flag, - ) - - def test_transformer_cross_self_attention(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_transformer_cross_self_attention" - ) as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "transformer_iwslt_de_en", - [ - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--no-cross-attention", - 
"--cross-self-attention", - ], - run_validation=True, - ) - generate_main(data_dir, extra_flags=[]) - - def test_transformer_pointer_generator(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_transformer_pointer_generator" - ) as data_dir: - create_dummy_data(data_dir) - preprocess_summarization_data(data_dir) - train_translation_model( - data_dir, - "transformer_pointer_generator", - extra_flags=[ - "--user-dir", - "examples/pointer_generator/pointer_generator_src", - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--alignment-layer", - "-1", - "--alignment-heads", - "1", - "--source-position-markers", - "0", - ], - run_validation=True, - extra_valid_flags=["--user-dir", "examples/pointer_generator/pointer_generator_src"], - ) - generate_main( - data_dir, - extra_flags=["--user-dir", "examples/pointer_generator/pointer_generator_src"], - ) - - def test_lightconv(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_lightconv") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "lightconv_iwslt_de_en", - [ - "--encoder-conv-type", - "lightweight", - "--decoder-conv-type", - "lightweight", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - ], - ) - generate_main(data_dir) - - def test_dynamicconv(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_dynamicconv") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "lightconv_iwslt_de_en", - [ - "--encoder-conv-type", - "dynamic", - "--decoder-conv-type", - "dynamic", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - ], - ) - generate_main(data_dir) - - def test_cmlm_transformer(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_cmlm_transformer") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir, ["--joined-dictionary"]) - train_translation_model( - data_dir, - "cmlm_transformer", - [ - "--apply-bert-init", - "--criterion", - "nat_loss", - "--noise", - "full_mask", - "--pred-length-offset", - "--length-loss-factor", - "0.1", - ], - task="translation_lev", - ) - generate_main( - data_dir, - [ - "--task", - "translation_lev", - "--iter-decode-max-iter", - "9", - "--iter-decode-eos-penalty", - "0", - "--print-step", - ], - ) - - def test_nonautoregressive_transformer(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_nonautoregressive_transformer" - ) as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir, ["--joined-dictionary"]) - train_translation_model( - data_dir, - "nonautoregressive_transformer", - [ - "--apply-bert-init", - "--src-embedding-copy", - "--criterion", - "nat_loss", - "--noise", - "full_mask", - "--pred-length-offset", - "--length-loss-factor", - "0.1", - ], - task="translation_lev", - ) - generate_main( - data_dir, - [ - "--task", - "translation_lev", - "--iter-decode-max-iter", - "0", - "--iter-decode-eos-penalty", - "0", - "--print-step", - ], - ) - - # def test_nat_crf_transformer(self): - # with contextlib.redirect_stdout(StringIO()): - # with tempfile.TemporaryDirectory('test_nat_crf_transformer') as data_dir: - # create_dummy_data(data_dir) - # preprocess_translation_data(data_dir, 
['--joined-dictionary']) - # train_translation_model(data_dir, 'nacrf_transformer', [ - # '--apply-bert-init', '--criterion', - # 'nat_loss', '--noise', 'full_mask', '--pred-length-offset', - # '--length-loss-factor', '0.1', - # '--word-ins-loss-factor', '0.5', - # '--crf-lowrank-approx', '1', - # '--crf-beam-approx', '1' - # ], task='translation_lev') - # generate_main(data_dir, [ - # '--task', 'translation_lev', - # '--iter-decode-max-iter', '0', - # '--iter-decode-eos-penalty', '0', - # '--print-step', - # ]) - - def test_iterative_nonautoregressive_transformer(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_iterative_nonautoregressive_transformer" - ) as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir, ["--joined-dictionary"]) - train_translation_model( - data_dir, - "iterative_nonautoregressive_transformer", - [ - "--apply-bert-init", - "--src-embedding-copy", - "--criterion", - "nat_loss", - "--noise", - "full_mask", - "--stochastic-approx", - "--dae-ratio", - "0.5", - "--train-step", - "3", - ], - task="translation_lev", - ) - generate_main( - data_dir, - [ - "--task", - "translation_lev", - "--iter-decode-max-iter", - "9", - "--iter-decode-eos-penalty", - "0", - "--print-step", - ], - ) - - def test_insertion_transformer(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_insertion_transformer") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir, ["--joined-dictionary"]) - train_translation_model( - data_dir, - "insertion_transformer", - [ - "--apply-bert-init", - "--criterion", - "nat_loss", - "--noise", - "random_mask", - ], - task="translation_lev", - ) - generate_main( - data_dir, - [ - "--task", - "translation_lev", - "--iter-decode-max-iter", - "9", - "--iter-decode-eos-penalty", - "0", - "--print-step", - ], - ) - - def test_mixture_of_experts(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_moe") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - train_translation_model( - data_dir, - "transformer_iwslt_de_en", - [ - "--task", - "translation_moe", - "--user-dir", - "examples/translation_moe/translation_moe_src", - "--method", - "hMoElp", - "--mean-pool-gating-network", - "--num-experts", - "3", - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - ], - ) - generate_main( - data_dir, - [ - "--task", - "translation_moe", - "--user-dir", - "examples/translation_moe/translation_moe_src", - "--method", - "hMoElp", - "--mean-pool-gating-network", - "--num-experts", - "3", - "--gen-expert", - "0", - ], - ) - - def test_alignment(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_alignment") as data_dir: - create_dummy_data(data_dir, alignment=True) - preprocess_translation_data(data_dir, ["--align-suffix", "align"]) - train_translation_model( - data_dir, - "transformer_align", - [ - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--load-alignments", - "--alignment-layer", - "1", - "--criterion", - "label_smoothed_cross_entropy_with_alignment", - ], - run_validation=True, - ) - generate_main(data_dir) - - def test_alignment_full_context(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_alignment") as data_dir: - 
create_dummy_data(data_dir, alignment=True) - preprocess_translation_data(data_dir, ["--align-suffix", "align"]) - train_translation_model( - data_dir, - "transformer_align", - [ - "--encoder-layers", - "2", - "--decoder-layers", - "2", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--load-alignments", - "--alignment-layer", - "1", - "--criterion", - "label_smoothed_cross_entropy_with_alignment", - "--full-context-alignment", - ], - run_validation=True, - ) - generate_main(data_dir) - - -class TestStories(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - def test_fconv_self_att_wp(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_fconv_self_att_wp") as data_dir: - create_dummy_data(data_dir) - preprocess_translation_data(data_dir) - config = [ - "--encoder-layers", - "[(128, 3)] * 2", - "--decoder-layers", - "[(128, 3)] * 2", - "--decoder-attention", - "True", - "--encoder-attention", - "False", - "--gated-attention", - "True", - "--self-attention", - "True", - "--project-input", - "True", - "--encoder-embed-dim", - "8", - "--decoder-embed-dim", - "8", - "--decoder-out-embed-dim", - "8", - "--multihead-self-attention-nheads", - "2", - ] - train_translation_model(data_dir, "fconv_self_att_wp", config) - generate_main(data_dir) - - # fusion model - os.rename( - os.path.join(data_dir, "checkpoint_last.pt"), - os.path.join(data_dir, "pretrained.pt"), - ) - config.extend( - [ - "--pretrained", - "True", - "--pretrained-checkpoint", - os.path.join(data_dir, "pretrained.pt"), - "--save-dir", - os.path.join(data_dir, "fusion_model"), - ] - ) - train_translation_model(data_dir, "fconv_self_att_wp", config) - - -class TestLanguageModeling(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - def test_fconv_lm(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_fconv_lm") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_language_model( - data_dir, - "fconv_lm", - [ - "--decoder-layers", - "[(850, 3)] * 2 + [(1024,4)]", - "--decoder-embed-dim", - "280", - "--optimizer", - "nag", - "--lr", - "0.1", - ], - ) - eval_lm_main(data_dir) - generate_main( - data_dir, - [ - "--task", - "language_modeling", - "--sample-break-mode", - "eos", - "--tokens-per-sample", - "500", - ], - ) - - def test_transformer_lm(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_language_model( - data_dir, - "transformer_lm", - ["--add-bos-token"], - run_validation=True, - ) - eval_lm_main(data_dir) - generate_main( - data_dir, - [ - "--task", - "language_modeling", - "--sample-break-mode", - "eos", - "--tokens-per-sample", - "500", - ], - ) - - def test_lightconv_lm(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_lightconv_lm") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_language_model( - data_dir, - "lightconv_lm", - ["--add-bos-token"], - run_validation=True, - ) - eval_lm_main(data_dir) - generate_main( - data_dir, - [ - "--task", - "language_modeling", - "--sample-break-mode", - "eos", - "--tokens-per-sample", - "500", - ], - ) - - def test_lstm_lm(self): - with 
contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_lstm_lm") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_language_model( - data_dir, - "lstm_lm", - ["--add-bos-token"], - run_validation=True, - ) - eval_lm_main(data_dir) - generate_main( - data_dir, - [ - "--task", - "language_modeling", - "--sample-break-mode", - "eos", - "--tokens-per-sample", - "500", - ], - ) - - def test_lstm_lm_residuals(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_lstm_lm_residuals") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_language_model( - data_dir, - "lstm_lm", - ["--add-bos-token", "--residuals"], - run_validation=True, - ) - eval_lm_main(data_dir) - generate_main( - data_dir, - [ - "--task", - "language_modeling", - "--sample-break-mode", - "eos", - "--tokens-per-sample", - "500", - ], - ) - - -class TestMaskedLanguageModel(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - def test_legacy_masked_lm(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_legacy_mlm") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_legacy_masked_language_model(data_dir, "masked_lm") - - def test_roberta_masked_lm(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_roberta_mlm") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_masked_lm( - data_dir, "roberta_base", extra_flags=["--encoder-layers", "2"] - ) - - def test_roberta_sentence_prediction(self): - num_classes = 3 - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_roberta_head") as data_dir: - create_dummy_roberta_head_data(data_dir, num_classes=num_classes) - preprocess_lm_data(os.path.join(data_dir, "input0")) - preprocess_lm_data(os.path.join(data_dir, "label")) - train_roberta_head(data_dir, "roberta_base", num_classes=num_classes) - - def test_roberta_regression_single(self): - num_classes = 1 - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_roberta_regression_single" - ) as data_dir: - create_dummy_roberta_head_data( - data_dir, num_classes=num_classes, regression=True - ) - preprocess_lm_data(os.path.join(data_dir, "input0")) - train_roberta_head( - data_dir, - "roberta_base", - num_classes=num_classes, - extra_flags=["--regression-target"], - ) - - def test_roberta_regression_multiple(self): - num_classes = 3 - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_roberta_regression_multiple" - ) as data_dir: - create_dummy_roberta_head_data( - data_dir, num_classes=num_classes, regression=True - ) - preprocess_lm_data(os.path.join(data_dir, "input0")) - train_roberta_head( - data_dir, - "roberta_base", - num_classes=num_classes, - extra_flags=["--regression-target"], - ) - - def test_linformer_roberta_masked_lm(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_linformer_roberta_mlm") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_masked_lm( - data_dir, - "linformer_roberta_base", - extra_flags=[ - "--user-dir", - "examples/linformer/linformer_src", - "--encoder-layers", - "2", - ], - ) - - def test_linformer_roberta_sentence_prediction(self): - num_classes = 3 - with 
contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_linformer_roberta_head") as data_dir: - create_dummy_roberta_head_data(data_dir, num_classes=num_classes) - preprocess_lm_data(os.path.join(data_dir, "input0")) - preprocess_lm_data(os.path.join(data_dir, "label")) - train_roberta_head( - data_dir, - "linformer_roberta_base", - num_classes=num_classes, - extra_flags=["--user-dir", "examples/linformer/linformer_src"], - ) - - def test_linformer_roberta_regression_single(self): - num_classes = 1 - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_linformer_roberta_regression_single" - ) as data_dir: - create_dummy_roberta_head_data( - data_dir, num_classes=num_classes, regression=True - ) - preprocess_lm_data(os.path.join(data_dir, "input0")) - train_roberta_head( - data_dir, - "linformer_roberta_base", - num_classes=num_classes, - extra_flags=[ - "--regression-target", - "--user-dir", - "examples/linformer/linformer_src", - ], - ) - - def test_linformer_roberta_regression_multiple(self): - num_classes = 3 - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory( - "test_linformer_roberta_regression_multiple" - ) as data_dir: - create_dummy_roberta_head_data( - data_dir, num_classes=num_classes, regression=True - ) - preprocess_lm_data(os.path.join(data_dir, "input0")) - train_roberta_head( - data_dir, - "linformer_roberta_base", - num_classes=num_classes, - extra_flags=[ - "--regression-target", - "--user-dir", - "examples/linformer/linformer_src", - ], - ) - - def _test_pretrained_masked_lm_for_translation(self, learned_pos_emb, encoder_only): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_mlm") as data_dir: - create_dummy_data(data_dir) - preprocess_lm_data(data_dir) - train_legacy_masked_language_model( - data_dir, - arch="masked_lm", - extra_args=("--encoder-learned-pos",) if learned_pos_emb else (), - ) - with tempfile.TemporaryDirectory( - "test_mlm_translation" - ) as translation_dir: - create_dummy_data(translation_dir) - preprocess_translation_data( - translation_dir, extra_flags=["--joined-dictionary"] - ) - # Train transformer with data_dir/checkpoint_last.pt - train_translation_model( - translation_dir, - arch="transformer_from_pretrained_xlm", - extra_flags=[ - "--decoder-layers", - "1", - "--decoder-embed-dim", - "32", - "--decoder-attention-heads", - "1", - "--decoder-ffn-embed-dim", - "32", - "--encoder-layers", - "1", - "--encoder-embed-dim", - "32", - "--encoder-attention-heads", - "1", - "--encoder-ffn-embed-dim", - "32", - "--pretrained-xlm-checkpoint", - "{}/checkpoint_last.pt".format(data_dir), - "--activation-fn", - "gelu", - "--max-source-positions", - "500", - "--max-target-positions", - "500", - ] - + ( - ["--encoder-learned-pos", "--decoder-learned-pos"] - if learned_pos_emb - else [] - ) - + (["--init-encoder-only"] if encoder_only else []), - task="translation_from_pretrained_xlm", - ) - - def test_pretrained_masked_lm_for_translation_learned_pos_emb(self): - self._test_pretrained_masked_lm_for_translation(True, False) - - def test_pretrained_masked_lm_for_translation_sinusoidal_pos_emb(self): - self._test_pretrained_masked_lm_for_translation(False, False) - - def test_pretrained_masked_lm_for_translation_encoder_only(self): - self._test_pretrained_masked_lm_for_translation(True, True) - - def test_r4f_roberta(self): - num_classes = 3 - with contextlib.redirect_stdout(StringIO()): - with 
tempfile.TemporaryDirectory("test_r4f_roberta_head") as data_dir: - create_dummy_roberta_head_data(data_dir, num_classes=num_classes) - preprocess_lm_data(os.path.join(data_dir, "input0")) - preprocess_lm_data(os.path.join(data_dir, "label")) - train_roberta_head( - data_dir, - "roberta_base", - num_classes=num_classes, - extra_flags=[ - "--user-dir", - "examples/rxf/rxf_src", - "--criterion", - "sentence_prediction_r3f", - "--spectral-norm-classification-head", - ], - ) - - -def train_legacy_masked_language_model(data_dir, arch, extra_args=()): - train_parser = options.get_training_parser() - # TODO: langs should be in and out right? - train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - "cross_lingual_lm", - data_dir, - "--arch", - arch, - # Optimizer args - "--optimizer", - "adam", - "--lr-scheduler", - "reduce_lr_on_plateau", - "--lr-shrink", - "0.5", - "--lr", - "0.0001", - "--min-lr", - "1e-09", - # dropout, attention args - "--dropout", - "0.1", - "--attention-dropout", - "0.1", - # MLM args - "--criterion", - "legacy_masked_lm_loss", - "--masked-lm-only", - "--monolingual-langs", - "in,out", - "--num-segment", - "5", - # Transformer args: use a small transformer model for fast training - "--encoder-layers", - "1", - "--encoder-embed-dim", - "32", - "--encoder-attention-heads", - "1", - "--encoder-ffn-embed-dim", - "32", - # Other training args - "--max-tokens", - "500", - "--tokens-per-sample", - "500", - "--save-dir", - data_dir, - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--dataset-impl", - "raw", - "--num-workers", - "0", - ] - + list(extra_args), - ) - train.main(train_args) - - -class TestOptimizers(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - def test_optimizers(self): - with contextlib.redirect_stdout(StringIO()): - with tempfile.TemporaryDirectory("test_optimizers") as data_dir: - # Use just a bit of data and tiny model to keep this test runtime reasonable - create_dummy_data(data_dir, num_examples=10, maxlen=5) - preprocess_translation_data(data_dir) - optimizers = ["adafactor", "adam", "nag", "adagrad", "sgd", "adadelta"] - last_checkpoint = os.path.join(data_dir, "checkpoint_last.pt") - for optimizer in optimizers: - if os.path.exists(last_checkpoint): - os.remove(last_checkpoint) - train_translation_model( - data_dir, - "lstm", - [ - "--required-batch-size-multiple", - "1", - "--encoder-layers", - "1", - "--encoder-hidden-size", - "32", - "--decoder-layers", - "1", - "--optimizer", - optimizer, - ], - ) - generate_main(data_dir) - - -def create_dummy_roberta_head_data( - data_dir, num_examples=100, maxlen=10, num_classes=2, regression=False -): - input_dir = "input0" - - def _create_dummy_data(filename): - random_data = torch.rand(num_examples * maxlen) - input_data = 97 + torch.floor(26 * random_data).int() - if regression: - output_data = torch.rand((num_examples, num_classes)) - else: - output_data = 1 + torch.floor(num_classes * torch.rand(num_examples)).int() - with open(os.path.join(data_dir, input_dir, filename + ".out"), "w") as f_in: - label_filename = filename + ".label" if regression else filename + ".out" - with open(os.path.join(data_dir, "label", label_filename), "w") as f_out: - offset = 0 - for i in range(num_examples): - # write example input - ex_len = random.randint(1, maxlen) - ex_str = " ".join(map(chr, input_data[offset : offset + ex_len])) - print(ex_str, file=f_in) - # write example label - 
if regression: - class_str = " ".join(map(str, output_data[i].numpy())) - print(class_str, file=f_out) - else: - class_str = "class{}".format(output_data[i]) - print(class_str, file=f_out) - offset += ex_len - - os.mkdir(os.path.join(data_dir, input_dir)) - os.mkdir(os.path.join(data_dir, "label")) - _create_dummy_data("train") - _create_dummy_data("valid") - _create_dummy_data("test") - - -def train_masked_lm(data_dir, arch, extra_flags=None): - train_parser = options.get_training_parser() - train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - "masked_lm", - data_dir, - "--arch", - arch, - "--optimizer", - "adam", - "--lr", - "0.0001", - "--criterion", - "masked_lm", - "--batch-size", - "500", - "--save-dir", - data_dir, - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--ddp-backend", - "no_c10d", - "--num-workers", - "0", - ] - + (extra_flags or []), - ) - train.main(train_args) - - -def train_roberta_head(data_dir, arch, num_classes=2, extra_flags=None): - train_parser = options.get_training_parser() - train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - "sentence_prediction", - data_dir, - "--arch", - arch, - "--encoder-layers", - "2", - "--num-classes", - str(num_classes), - "--optimizer", - "adam", - "--lr", - "0.0001", - "--criterion", - "sentence_prediction", - "--max-tokens", - "500", - "--max-positions", - "500", - "--batch-size", - "500", - "--save-dir", - data_dir, - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--ddp-backend", - "no_c10d", - "--num-workers", - "0", - ] - + (extra_flags or []), - ) - train.main(train_args) - - -def train_language_model(data_dir, arch, extra_flags=None, run_validation=False): - train_parser = options.get_training_parser() - train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - "language_modeling", - data_dir, - "--arch", - arch, - "--optimizer", - "adam", - "--lr", - "0.0001", - "--criterion", - "adaptive_loss", - "--adaptive-softmax-cutoff", - "5,10,15", - "--max-tokens", - "500", - "--tokens-per-sample", - "500", - "--save-dir", - data_dir, - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--ddp-backend", - "no_c10d", - "--num-workers", - "0", - ] - + (extra_flags or []), - ) - train.main(train_args) - - if run_validation: - # test validation - validate_parser = options.get_validation_parser() - validate_args = options.parse_args_and_arch( - validate_parser, - [ - "--task", - "language_modeling", - data_dir, - "--path", - os.path.join(data_dir, "checkpoint_last.pt"), - "--valid-subset", - "valid", - "--max-tokens", - "500", - "--no-progress-bar", - "--num-workers", - "0", - ], - ) - validate.main(validate_args) - - -def eval_lm_main(data_dir): - eval_lm_parser = options.get_eval_lm_parser() - eval_lm_args = options.parse_args_and_arch( - eval_lm_parser, - [ - data_dir, - "--path", - os.path.join(data_dir, "checkpoint_last.pt"), - "--no-progress-bar", - "--num-workers", - "0", - ], - ) - eval_lm.main(eval_lm_args) - - -def train_masked_language_model(data_dir, arch, extra_args=()): - train_parser = options.get_training_parser() - # TODO: langs should be in and out right? 
- train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - "cross_lingual_lm", - data_dir, - "--arch", - arch, - # Optimizer args - "--optimizer", - "adam", - "--lr-scheduler", - "reduce_lr_on_plateau", - "--lr-shrink", - "0.5", - "--lr", - "0.0001", - "--min-lr", - "1e-09", - # dropout, attention args - "--dropout", - "0.1", - "--attention-dropout", - "0.1", - # MLM args - "--criterion", - "masked_lm_loss", - "--masked-lm-only", - "--monolingual-langs", - "in,out", - "--num-segment", - "5", - # Transformer args: use a small transformer model for fast training - "--encoder-layers", - "1", - "--encoder-embed-dim", - "32", - "--encoder-attention-heads", - "1", - "--encoder-ffn-embed-dim", - "32", - # Other training args - "--max-tokens", - "500", - "--tokens-per-sample", - "500", - "--save-dir", - data_dir, - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--dataset-impl", - "raw", - "--num-workers", - "0", - ] - + list(extra_args), - ) - train.main(train_args) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_bmuf.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_bmuf.py deleted file mode 100644 index 0165b2955bdf64903717c2bd27580c83d3cf8e21..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_bmuf.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import random -import unittest -from multiprocessing import Manager - -import torch -import torch.nn as nn -from fairseq import distributed_utils, optim - - -class Model(nn.Module): - def __init__(self, input_size, output_size): - super(Model, self).__init__() - self.fc = nn.Linear(input_size, output_size) - - def forward(self, input): - output = self.fc(input) - return output - - -def setup_model_loss_criterion(args, rank, is_cuda): - """ - setup model, criterion and optimizer based on input args - """ - args.distributed_rank = rank - if args.distributed_world_size > 1: - distributed_utils.distributed_init(args) - torch.manual_seed(1) - model = Model(args.input_size, args.nb_classes) - loss_fn = nn.CrossEntropyLoss() - if is_cuda: - model = model.cuda() - loss_fn = loss_fn.cuda() - - optimizer = optim.sgd.SGD(args, model.parameters()) - optimizer = optim.FairseqBMUF(args, optimizer) - - return model, loss_fn, optimizer - - -def train_step(input, target, model, loss_fn, optimizer, **unused): - """Do forward, backward and parameter update.""" - model.train() - output = model(input) - loss = loss_fn(output, target) - optimizer.backward(loss) - optimizer.step() - - -def single_gpu_training(args, rank, iterations, shared_results): - - is_cuda = torch.cuda.is_available() - if is_cuda: - torch.cuda.set_device(rank) - - model, loss_fn, optimizer = setup_model_loss_criterion(args, rank, is_cuda) - - for _ in range(iterations): - input = torch.randn(1, args.input_size) - target = torch.empty(args.batch_size, dtype=torch.long).random_(args.nb_classes) - - if is_cuda: - input = input.cuda() - target = target.cuda() - train_step(input, target, model, loss_fn, optimizer) - - results = [] - for param in model.parameters(): - if len(results) == 0: - results = param.flatten().cpu().data - else: - results = 
torch.cat((results, param.flatten().cpu().data), 0) - - shared_results[rank] = results - - -def setup_args(): - args = argparse.Namespace() - args.global_sync_iter = 20 - args.block_momentum = 0.875 - args.block_lr = 0.5 - args.input_size = 5 - args.nb_classes = 2 - args.batch_size = 1 - args.lr = [1e-3] - args.momentum = 0 - args.weight_decay = 0 - args.warmup_iterations = 0 - args.use_nbm = True - args.average_sync = True - args.global_sync_iter = 1 - args.model_parallel_size = 1 - args.distributed_backend = "gloo" - - args.distributed_world_size = 2 - port = random.randint(10000, 20000) - args.distributed_init_method = "tcp://localhost:{port}".format(port=port) - args.distributed_init_host = "localhost" - args.distributed_port = port + 1 - args.local_world_size = args.distributed_world_size - return args - - -@unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs") -class TestBMUF(unittest.TestCase): - def bmuf_process(self, args, iterations): - processes = [] - results = Manager().dict() - ctx = torch.multiprocessing.get_context("spawn") - for rank in range(args.distributed_world_size): - p = ctx.Process( - target=single_gpu_training, args=(args, rank, iterations, results) - ) - p.start() - processes.append(p) - - for p in processes: - p.join() - return results - - def test_bmuf_sync(self): - # Train model for 1 iteration and do bmuf sync without doing warmup - args = setup_args() - iterations = 1 - results = self.bmuf_process(args, iterations) - # Make sure params in both machines are same - assert len(results) == 2 - self.assertAlmostEqual(results[0], results[1]) - - def test_warmup_sync(self): - # Train model for 20 iteration and do warmup sync without doing bmuf sync - args = setup_args() - args.warmup_iterations = 20 - iterations = 20 - results = self.bmuf_process(args, iterations) - # Make sure params in both machines are same - assert len(results) == 2 - self.assertAlmostEqual(results[0], results[1]) - - def test_warmup_sync_bmuf_sync(self): - # Train model for 25 iteration and do warmup sync after 20 iteration - # and bmuf sync after 25 iteration - args = setup_args() - args.warmup_iterations = 20 - args.global_sync_iter = 5 - iterations = 25 - results = self.bmuf_process(args, iterations) - # Make sure params in both machines are same - assert len(results) == 2 - self.assertAlmostEqual(results[0], results[1]) - - def test_single_gpu_bmuf(self): - # Train model for 5 iterations and use GPU 1 - args = setup_args() - args.distributed_world_size = 1 - args.warmup_iterations = 5 - iterations = 20 - results = self.bmuf_process(args, iterations) - assert len(results) == 1 - - def assertAlmostEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertLess((t1 - t2).abs().max(), 1e-4) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_character_token_embedder.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_character_token_embedder.py deleted file mode 100644 index 24940ebd21a0e4465ca6052409353a3179e9cf6d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_character_token_embedder.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -import torch -from fairseq.data import Dictionary -from fairseq.modules import CharacterTokenEmbedder - - -class TestCharacterTokenEmbedder(unittest.TestCase): - def test_character_token_embedder(self): - vocab = Dictionary() - vocab.add_symbol("hello") - vocab.add_symbol("there") - - embedder = CharacterTokenEmbedder( - vocab, [(2, 16), (4, 32), (8, 64), (16, 2)], 64, 5, 2 - ) - - test_sents = [["hello", "unk", "there"], ["there"], ["hello", "there"]] - max_len = max(len(s) for s in test_sents) - input = torch.LongTensor(len(test_sents), max_len + 2).fill_(vocab.pad()) - for i in range(len(test_sents)): - input[i][0] = vocab.eos() - for j in range(len(test_sents[i])): - input[i][j + 1] = vocab.index(test_sents[i][j]) - input[i][j + 2] = vocab.eos() - embs = embedder(input) - - assert embs.size() == (len(test_sents), max_len + 2, 5) - self.assertAlmostEqual(embs[0][0], embs[1][0]) - self.assertAlmostEqual(embs[0][0], embs[0][-1]) - self.assertAlmostEqual(embs[0][1], embs[2][1]) - self.assertAlmostEqual(embs[0][3], embs[1][1]) - - embs.sum().backward() - assert embedder.char_embeddings.weight.grad is not None - - def assertAlmostEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertLess((t1 - t2).abs().max(), 1e-6) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_concat_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_concat_dataset.py deleted file mode 100644 index d94aeffd481a2e107eb5747e41d76435b3f3dc8a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_concat_dataset.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -import torch -from fairseq.data import LanguagePairDataset, TokenBlockDataset -from fairseq.data.concat_dataset import ConcatDataset -from tests.test_train import mock_dict - - -class TestConcatDataset(unittest.TestCase): - def setUp(self): - d = mock_dict() - tokens_1 = torch.LongTensor([1]).view(1, -1) - tokens_ds1 = TokenBlockDataset( - tokens_1, - sizes=[tokens_1.size(-1)], - block_size=1, - pad=0, - eos=1, - include_targets=False, - ) - self.dataset_1 = LanguagePairDataset( - tokens_ds1, tokens_ds1.sizes, d, shuffle=False - ) - tokens_2 = torch.LongTensor([2]).view(1, -1) - tokens_ds2 = TokenBlockDataset( - tokens_2, - sizes=[tokens_2.size(-1)], - block_size=1, - pad=0, - eos=1, - include_targets=False, - ) - self.dataset_2 = LanguagePairDataset( - tokens_ds2, tokens_ds2.sizes, d, shuffle=False - ) - - def test_concat_dataset_basics(self): - d = ConcatDataset([self.dataset_1, self.dataset_2]) - assert len(d) == 2 - assert d[0]["source"][0] == 1 - assert d[1]["source"][0] == 2 - - d = ConcatDataset([self.dataset_1, self.dataset_2], sample_ratios=[1, 2]) - assert len(d) == 3 - assert d[0]["source"][0] == 1 - assert d[1]["source"][0] == 2 - assert d[2]["source"][0] == 2 - - d = ConcatDataset([self.dataset_1, self.dataset_2], sample_ratios=[2, 1]) - assert len(d) == 3 - assert d[0]["source"][0] == 1 - assert d[1]["source"][0] == 1 - assert d[2]["source"][0] == 2 diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_constraints.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_constraints.py deleted file mode 100644 index 1c37f7e1fb26d8ea5349fedd3a60f566d09cf598..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_constraints.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import sys -import unittest - -import torch -from fairseq.token_generation_constraints import * - - -def tensorize(constraints: List[List[int]]) -> torch.Tensor: - return [torch.tensor(x) for x in constraints] - - -class TestHelperRoutines(unittest.TestCase): - def setUp(self): - self.examples = [ - ([[]], torch.tensor([[0]])), - ([[], []], torch.tensor([[0], [0]])), - ([[torch.tensor([1, 2])], []], torch.tensor([[1, 1, 2, 0], [0, 0, 0, 0]])), - ( - [ - [ - torch.tensor([3, 1, 2]), - torch.tensor([3]), - torch.tensor([4, 5, 6, 7]), - ], - [], - [torch.tensor([1, 8, 9, 10, 1, 4, 11, 12])], - ], - torch.tensor( - [ - [3, 3, 1, 2, 0, 3, 0, 4, 5, 6, 7, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [1, 1, 8, 9, 10, 1, 4, 11, 12, 0, 0, 0], - ] - ), - ), - ] - - def test_packing(self): - """Ensures the list of lists of tensors gets packed correctly.""" - for batch_constraints, expected_tensor in self.examples: - packed = pack_constraints(batch_constraints) - assert torch.equal(packed, expected_tensor) - - -class TestUnorderedConstraintState(unittest.TestCase): - def setUp(self): - # Tuples of (contraint set, expected printed graph, token counts per node) - self.examples = [ - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - "([None].False#6 ([1].True#4 ([2].False#1 [3].True#1) [3].True#1 [4].True#1) ([4].False#2 ([5].True#2 ([6].False#1 [7].True#1))))", - {1: 4, 2: 1, 3: 2, 4: 3, 5: 2, 6: 1, 7: 1}, - ), - ([], "[None].False#0", {}), - (tensorize([[0]]), "([None].False#1 [0].True#1)", {0: 1}), - ( - tensorize([[100000, 1, 2, 3, 4, 5]]), - "([None].False#1 ([100000].False#1 ([1].False#1 ([2].False#1 ([3].False#1 ([4].False#1 [5].True#1))))))", - {100000: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1}, - ), - ( - tensorize([[1, 2], [1, 2]]), - "([None].False#2 ([1].False#2 [2].True#2))", - {1: 2, 2: 2}, - ), - ( - tensorize([[1, 2], [3, 4]]), - "([None].False#2 ([1].False#1 [2].True#1) ([3].False#1 [4].True#1))", - {1: 1, 2: 1, 3: 1, 4: 1}, - ), - ] - - self.sequences = [ - ( - self.examples[0][0], - [], - {"bank": 0, "num_completed": 0, "finished": False, "is_root": True}, - ), - ( - self.examples[0][0], - [1, 2], - {"bank": 2, "num_completed": 0, "finished": False, "is_root": False}, - ), - ( - self.examples[0][0], - [1, 2, 94], - {"bank": 1, "num_completed": 1, "finished": False, "is_root": True}, - ), - ( - self.examples[0][0], - [1, 3, 999, 1, 4], - {"bank": 4, "num_completed": 2, "finished": False, "is_root": False}, - ), - ( - self.examples[0][0], - [1, 3, 999, 1, 4, 999], - {"bank": 4, "num_completed": 2, "finished": False, "is_root": True}, - ), - ( - self.examples[0][0], - [4, 5, 6, 8], - {"bank": 2, "num_completed": 1, "finished": False, "is_root": True}, - ), - ( - self.examples[0][0], - # Tricky, because in last three, goes down [1->4] branch, could miss [1] and [4->5] - # [[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]], - [1, 2, 3, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5], - {"bank": 14, "num_completed": 6, "finished": True, "is_root": False}, - ), - ( - self.examples[0][0], - [1, 2, 3, 999, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5, 117], - {"bank": 14, "num_completed": 6, "finished": True, "is_root": True}, - ), - ( - tensorize([[1], [2, 3]]), - # Should not be able to get credit for entering 1 a second time - [1, 1], - {"bank": 1, "num_completed": 1, "finished": False, "is_root": True}, - ), - ( - self.examples[4][0], - [1, 2, 1, 2], - {"bank": 4, "num_completed": 2, "finished": True, "is_root": False}, - ), - ( - self.examples[4][0], - [1, 2, 1, 2, 1], - {"bank": 4, 
"num_completed": 2, "finished": True, "is_root": True}, - ), - ( - self.examples[5][0], - [1, 2, 3, 4, 5], - {"bank": 4, "num_completed": 2, "finished": True, "is_root": True}, - ), - ] - - def test_graphs(self): - """ - Test whether unordered graph systems are created correctly. - """ - for example in self.examples: - constraints, expected, gold_counts = example - c = ConstraintNode.create(constraints) - assert ( - ConstraintNode.print_graph(c) == expected - ), f"got {ConstraintNode.print_graph(c)}, expected {expected}" - assert ( - c.token_counts() == gold_counts - ), f"{c} got {c.token_counts()} wanted {gold_counts}" - - def test_next_tokens(self): - """ - Tests that the set of next tokens is correct. - """ - for example in self.examples: - constraints, expected, gold_counts = example - root = ConstraintNode.create(constraints) - - root_tokens = set(root.children.keys()) - for sequence in constraints: - state = UnorderedConstraintState(root) - for token in sequence: - all_tokens = root_tokens.union(state.node.children.keys()) - assert ( - all_tokens == state.next_tokens() - ), f"ALL {all_tokens} NEXT {state.next_tokens()}" - state = state.advance(token) - - def test_sequences(self): - for constraints, tokens, expected in self.sequences: - state = UnorderedConstraintState.create(pack_constraints([constraints])[0]) - for token in tokens: - state = state.advance(token) - result = {} - for attr in expected.keys(): - result[attr] = getattr(state, attr) - - assert ( - result == expected - ), f"TEST({tokens}) GOT: {result} WANTED: {expected}" - - -class TestOrderedConstraintState(unittest.TestCase): - def setUp(self): - self.sequences = [ - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - [], - {"bank": 0, "num_completed": 0, "finished": False, "is_root": True}, - ), - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - [1, 2], - {"bank": 2, "num_completed": 0, "finished": False, "is_root": False}, - ), - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - [1, 2, 94], - {"bank": 0, "num_completed": 0, "finished": False, "is_root": True}, - ), - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - [1, 3, 999, 1, 4], - {"bank": 0, "num_completed": 0, "finished": False, "is_root": True}, - ), - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - [1, 2, 3, 999, 999], - {"bank": 3, "num_completed": 1, "finished": False, "is_root": False}, - ), - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - [1, 2, 3, 77, 1, 3, 1], - {"bank": 6, "num_completed": 2, "finished": False, "is_root": False}, - ), - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - [1, 2, 3, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5], - {"bank": 14, "num_completed": 6, "finished": True, "is_root": False}, - ), - ( - tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]]), - [1, 2, 999, 1, 2, 3, 999, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5, 117], - {"bank": 14, "num_completed": 6, "finished": True, "is_root": False}, - ), - ( - tensorize([[1], [2, 3]]), - [1, 1], - {"bank": 1, "num_completed": 1, "finished": False, "is_root": False}, - ), - ( - tensorize([[1, 2], [1, 2]]), - [1, 2, 1, 2], - {"bank": 4, "num_completed": 2, "finished": True, "is_root": False}, - ), - ( - tensorize([[1, 2], [1, 2]]), - [1, 2, 1, 2, 1], - {"bank": 4, "num_completed": 2, "finished": True, "is_root": False}, - ), - ( - tensorize([[1, 2], [3, 4]]), - [1, 2, 3, 4, 5], - {"bank": 4, "num_completed": 2, "finished": True, 
"is_root": False}, - ), - ] - - def test_sequences(self): - for i, (constraints, tokens, expected) in enumerate(self.sequences): - state = OrderedConstraintState.create(pack_constraints([constraints])[0]) - for token in tokens: - state = state.advance(token) - result = {} - for attr in expected.keys(): - result[attr] = getattr(state, attr) - assert ( - result == expected - ), f"TEST({tokens}) GOT: {result} WANTED: {expected}" - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_convtbc.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_convtbc.py deleted file mode 100644 index 3a3c9b91e70f597ab77b9b01459cc429db5d7956..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_convtbc.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import unittest - -import torch -import torch.nn as nn -from fairseq.modules import ConvTBC - - -class TestConvTBC(unittest.TestCase): - def test_convtbc(self): - # ksz, in_channels, out_channels - conv_tbc = ConvTBC(4, 5, kernel_size=3, padding=1) - # out_channels, in_channels, ksz - conv1d = nn.Conv1d(4, 5, kernel_size=3, padding=1) - - conv_tbc.weight.data.copy_(conv1d.weight.data.transpose(0, 2)) - conv_tbc.bias.data.copy_(conv1d.bias.data) - - input_tbc = torch.randn(7, 2, 4, requires_grad=True) - input1d = input_tbc.data.transpose(0, 1).transpose(1, 2) - input1d.requires_grad = True - - output_tbc = conv_tbc(input_tbc) - output1d = conv1d(input1d) - - self.assertAlmostEqual( - output_tbc.data.transpose(0, 1).transpose(1, 2), output1d.data - ) - - grad_tbc = torch.randn(output_tbc.size()) - grad1d = grad_tbc.transpose(0, 1).transpose(1, 2).contiguous() - - output_tbc.backward(grad_tbc) - output1d.backward(grad1d) - - self.assertAlmostEqual( - conv_tbc.weight.grad.data.transpose(0, 2), conv1d.weight.grad.data - ) - self.assertAlmostEqual(conv_tbc.bias.grad.data, conv1d.bias.grad.data) - self.assertAlmostEqual( - input_tbc.grad.data.transpose(0, 1).transpose(1, 2), input1d.grad.data - ) - - def assertAlmostEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertLess((t1 - t2).abs().max(), 1e-4) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_dictionary.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_dictionary.py deleted file mode 100644 index 81ce102f4f555822e36298034cdeb3d1c0650255..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_dictionary.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import io -import tempfile -import unittest - -import torch -from fairseq.data import Dictionary - - -class TestDictionary(unittest.TestCase): - def test_finalize(self): - txt = [ - "A B C D", - "B C D", - "C D", - "D", - ] - ref_ids1 = list( - map( - torch.IntTensor, - [ - [4, 5, 6, 7, 2], - [5, 6, 7, 2], - [6, 7, 2], - [7, 2], - ], - ) - ) - ref_ids2 = list( - map( - torch.IntTensor, - [ - [7, 6, 5, 4, 2], - [6, 5, 4, 2], - [5, 4, 2], - [4, 2], - ], - ) - ) - - # build dictionary - d = Dictionary() - for line in txt: - d.encode_line(line, add_if_not_exist=True) - - def get_ids(dictionary): - ids = [] - for line in txt: - ids.append(dictionary.encode_line(line, add_if_not_exist=False)) - return ids - - def assertMatch(ids, ref_ids): - for toks, ref_toks in zip(ids, ref_ids): - self.assertEqual(toks.size(), ref_toks.size()) - self.assertEqual(0, (toks != ref_toks).sum().item()) - - ids = get_ids(d) - assertMatch(ids, ref_ids1) - - # check finalized dictionary - d.finalize() - finalized_ids = get_ids(d) - assertMatch(finalized_ids, ref_ids2) - - # write to disk and reload - with tempfile.NamedTemporaryFile(mode="w") as tmp_dict: - d.save(tmp_dict.name) - d = Dictionary.load(tmp_dict.name) - reload_ids = get_ids(d) - assertMatch(reload_ids, ref_ids2) - assertMatch(finalized_ids, reload_ids) - - def test_overwrite(self): - # for example, Camembert overwrites <unk>, <s> and </s> - dict_file = io.StringIO( - "<unk> 999 #fairseq:overwrite\n" - "<s> 999 #fairseq:overwrite\n" - "</s> 999 #fairseq:overwrite\n" - ", 999\n" - "▁de 999\n" - ) - d = Dictionary() - d.add_from_file(dict_file) - self.assertEqual(d.index("<pad>"), 1) - self.assertEqual(d.index("foo"), 3) - self.assertEqual(d.index("<unk>"), 4) - self.assertEqual(d.index("<s>"), 5) - self.assertEqual(d.index("</s>"), 6) - self.assertEqual(d.index(","), 7) - self.assertEqual(d.index("▁de"), 8) - - def test_no_overwrite(self): - # for example, Camembert overwrites <unk>, <s> and </s> - dict_file = io.StringIO( - "<unk> 999\n" "<s> 999\n" "</s> 999\n" ", 999\n" "▁de 999\n" - ) - d = Dictionary() - with self.assertRaisesRegex(RuntimeError, "Duplicate"): - d.add_from_file(dict_file) - - def test_space(self): - # for example, character models treat space as a symbol - dict_file = io.StringIO(" 999\n" "a 999\n" "b 999\n") - d = Dictionary() - d.add_from_file(dict_file) - self.assertEqual(d.index(" "), 4) - self.assertEqual(d.index("a"), 5) - self.assertEqual(d.index("b"), 6) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_export.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_export.py deleted file mode 100644 index 87e52bd7c18306b65e60cb9a9d8566169a120dcc..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_export.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree.
- -import argparse -import tempfile -import unittest - -import torch -from fairseq.data.dictionary import Dictionary -from fairseq.models.transformer import TransformerModel -from fairseq.modules import multihead_attention, sinusoidal_positional_embedding -from fairseq.tasks.fairseq_task import LegacyFairseqTask - - -DEFAULT_TEST_VOCAB_SIZE = 100 - - -class DummyTask(LegacyFairseqTask): - def __init__(self, args): - super().__init__(args) - self.dictionary = get_dummy_dictionary() - if getattr(self.args, "ctc", False): - self.dictionary.add_symbol("<ctc_blank>") - self.src_dict = self.dictionary - self.tgt_dict = self.dictionary - - @property - def source_dictionary(self): - return self.src_dict - - @property - def target_dictionary(self): - return self.dictionary - - -def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE): - dummy_dict = Dictionary() - # add dummy symbol to satisfy vocab size - for id, _ in enumerate(range(vocab_size)): - dummy_dict.add_symbol("{}".format(id), 1000) - return dummy_dict - - -def get_dummy_task_and_parser(): - """ - Return a dummy task and argument parser, which can be used to - create a model/criterion. - """ - parser = argparse.ArgumentParser( - description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS - ) - DummyTask.add_args(parser) - args = parser.parse_args([]) - task = DummyTask.setup_task(args) - return task, parser - - -def _test_save_and_load(scripted_module): - with tempfile.NamedTemporaryFile() as f: - scripted_module.save(f.name) - torch.jit.load(f.name) - - -class TestExportModels(unittest.TestCase): - def test_export_multihead_attention(self): - module = multihead_attention.MultiheadAttention(embed_dim=8, num_heads=2) - scripted = torch.jit.script(module) - _test_save_and_load(scripted) - - def test_incremental_state_multihead_attention(self): - module1 = multihead_attention.MultiheadAttention(embed_dim=8, num_heads=2) - module1 = torch.jit.script(module1) - module2 = multihead_attention.MultiheadAttention(embed_dim=8, num_heads=2) - module2 = torch.jit.script(module2) - - state = {} - state = module1.set_incremental_state(state, "key", {"a": torch.tensor([1])}) - state = module2.set_incremental_state(state, "key", {"a": torch.tensor([2])}) - v1 = module1.get_incremental_state(state, "key")["a"] - v2 = module2.get_incremental_state(state, "key")["a"] - - self.assertEqual(v1, 1) - self.assertEqual(v2, 2) - - def test_positional_embedding(self): - module = sinusoidal_positional_embedding.SinusoidalPositionalEmbedding( - embedding_dim=8, padding_idx=1 - ) - scripted = torch.jit.script(module) - _test_save_and_load(scripted) - - @unittest.skipIf( - torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release" - ) - def test_export_transformer(self): - task, parser = get_dummy_task_and_parser() - TransformerModel.add_args(parser) - args = parser.parse_args([]) - model = TransformerModel.build_model(args, task) - scripted = torch.jit.script(model) - _test_save_and_load(scripted) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_file_io.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_file_io.py deleted file mode 100644 index aef5b80d185591007315e56f02b553ff1f230ed0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_file_io.py +++ /dev/null @@ -1,47 +0,0 @@ -# This source code is licensed under the MIT license found
in the -# LICENSE file in the root directory of this source tree. - -import os -import shutil -import sys -import tempfile -import unittest -from typing import Optional -from unittest.mock import MagicMock - - -class TestFileIO(unittest.TestCase): - - _tmpdir: Optional[str] = None - _tmpfile: Optional[str] = None - _tmpfile_contents = "Hello, World" - - @classmethod - def setUpClass(cls) -> None: - cls._tmpdir = tempfile.mkdtemp() - with open(os.path.join(cls._tmpdir, "test.txt"), "w") as f: - cls._tmpfile = f.name - f.write(cls._tmpfile_contents) - f.flush() - - @classmethod - def tearDownClass(cls) -> None: - # Cleanup temp working dir. - if cls._tmpdir is not None: - shutil.rmtree(cls._tmpdir) # type: ignore - - def test_file_io(self): - from fairseq.file_io import PathManager - - with PathManager.open(os.path.join(self._tmpdir, "test.txt"), "r") as f: - s = f.read() - self.assertEqual(s, self._tmpfile_contents) - - def test_file_io_oss(self): - # Mock fvcore to simulate oss environment. - sys.modules["fvcore"] = MagicMock() - from fairseq.file_io import PathManager - - with PathManager.open(os.path.join(self._tmpdir, "test.txt"), "r") as f: - s = f.read() - self.assertEqual(s, self._tmpfile_contents) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_fp16_optimizer.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_fp16_optimizer.py deleted file mode 100644 index c4195273e32abdaf2146606150d50436ba70108e..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_fp16_optimizer.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import copy -import unittest - -import torch -from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer - - -@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") -class TestGradientScaling(unittest.TestCase): - def setUp(self): - self.x = torch.tensor([2.0]).cuda().half() - weight = 3.0 - bias = 5.0 - self.error = 1.0 - self.target = torch.tensor([self.x * weight + bias + self.error]).cuda().half() - self.loss_fn = torch.nn.L1Loss() - - self.model = torch.nn.Linear(1, 1) - self.model.weight.data = torch.tensor([[weight]]) - self.model.bias.data = torch.tensor([bias]) - self.model.cuda().half() - self.params = list(self.model.parameters()) - - self.namespace_dls = argparse.Namespace( - optimizer="adam", - lr=[0.1], - adam_betas="(0.9, 0.999)", - adam_eps=1e-8, - weight_decay=0.0, - fp16_init_scale=1, - fp16_scale_window=1, - fp16_scale_tolerance=1, - threshold_loss_scale=1, - min_loss_scale=1e-4, - ) - - def run_iter(self, model, params, optimizer): - optimizer.zero_grad() - y = model(self.x) - loss = self.loss_fn(y, self.target) - optimizer.backward(loss) - self.assertEqual(loss, torch.tensor(1.0, device="cuda:0", dtype=torch.float16)) - - grad_norm = optimizer.clip_grad_norm(0) - self.assertAlmostEqual(grad_norm.item(), 2.2361, 4) - - optimizer.step() - self.assertEqual( - model.weight, - torch.tensor( - [[3.0996]], device="cuda:0", dtype=torch.float16, requires_grad=True - ), - ) - self.assertEqual( - model.bias, - torch.tensor( - [5.1016], device="cuda:0", dtype=torch.float16, requires_grad=True - ), - ) - self.assertEqual(optimizer.scaler.loss_scale, 2.0) - - def test_mixed_precision(self): - model = copy.deepcopy(self.model) - params = list(model.parameters()) - optimizer = FP16Optimizer.build_optimizer(self.namespace_dls, params) - - self.run_iter(model, params, optimizer) - self.assertTrue( - all( - torch.all( - fp32_params.eq( - torch.tensor( - [3.1000, 5.1000], device="cuda:0", requires_grad=True - ) - ) - ) - for fp32_params in optimizer.fp32_params.values() - ) - ) - - def test_memory_efficient(self): - model = copy.deepcopy(self.model) - params = list(model.parameters()) - optimizer = MemoryEfficientFP16Optimizer.build_optimizer( - self.namespace_dls, params - ) - - self.run_iter(model, params, optimizer) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_inference_dropout.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_inference_dropout.py deleted file mode 100644 index fd5edd43d6a6f1fe06f8439cb9cb9a565e8a1074..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_inference_dropout.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import logging -import unittest - -from fairseq.models.transformer import TransformerModel -from tests.test_sequence_generator import get_dummy_task_and_parser - - -class TestInferenceDropout(unittest.TestCase): - def setUp(self): - self.task, self.parser = get_dummy_task_and_parser() - TransformerModel.add_args(self.parser) - self.args = self.parser.parse_args([]) - self.args.encoder_layers = 2 - self.args.decoder_layers = 1 - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - def test_sets_inference_dropout_to_true(self): - self.args.retain_dropout = True - self.transformer_model = TransformerModel.build_model(self.args, self.task) - self.transformer_model.prepare_for_inference_(self.args) - assert self.transformer_model.encoder.dropout_module.apply_during_inference - assert self.transformer_model.decoder.dropout_module.apply_during_inference - for layer in self.transformer_model.encoder.layers: - assert layer.dropout_module.apply_during_inference - - def test_inference_dropout_false_by_default(self): - self.transformer_model = TransformerModel.build_model(self.args, self.task) - self.transformer_model.prepare_for_inference_(self.args) - assert not self.transformer_model.encoder.dropout_module.apply_during_inference - assert not self.transformer_model.decoder.dropout_module.apply_during_inference - for layer in self.transformer_model.encoder.layers: - assert not layer.dropout_module.apply_during_inference - for layer in self.transformer_model.decoder.layers: - assert not layer.dropout_module.apply_during_inference - - def test_applies_training_mode(self): - self.transformer_model = TransformerModel.build_model(self.args, self.task) - assert self.transformer_model.encoder.dropout_module.training - for layer in self.transformer_model.encoder.layers: - assert layer.dropout_module.training - - self.transformer_model.eval() - assert not self.transformer_model.decoder.dropout_module.training - for layer in self.transformer_model.encoder.layers: - assert not layer.dropout_module.training - - def test_retain_modules(self): - self.args.retain_dropout = True - self.args.retain_dropout_modules = [ - "TransformerEncoder", - "TransformerEncoderLayer", - ] - self.transformer_model = TransformerModel.build_model(self.args, self.task) - self.transformer_model.prepare_for_inference_(self.args) - assert self.transformer_model.encoder.dropout_module.apply_during_inference - assert not self.transformer_model.decoder.dropout_module.apply_during_inference - for layer in self.transformer_model.decoder.layers: - assert not layer.dropout_module.apply_during_inference diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_iterators.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_iterators.py deleted file mode 100644 index 3d2c4d6251f6a09b148e615e15825704882f610a..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_iterators.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -from fairseq.data import iterators - - -class TestIterators(unittest.TestCase): - def test_counting_iterator(self, ref=None, itr=None): - if ref is None: - assert itr is None - ref = list(range(10)) - itr = iterators.CountingIterator(ref) - else: - assert len(ref) == 10 - assert itr is not None - self.assertTrue(itr.has_next()) - self.assertEqual(itr.n, 0) - self.assertEqual(next(itr), ref[0]) - self.assertEqual(itr.n, 1) - self.assertEqual(next(itr), ref[1]) - self.assertEqual(itr.n, 2) - itr.skip(3) - self.assertEqual(itr.n, 5) - self.assertEqual(next(itr), ref[5]) - itr.skip(3) - self.assertEqual(itr.n, 9) - self.assertEqual(next(itr), ref[9]) - self.assertFalse(itr.has_next()) - - def test_grouped_iterator(self): - # test correctness - x = list(range(10)) - itr = iterators.GroupedIterator(x, 1) - self.assertEqual(list(itr), [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]) - itr = iterators.GroupedIterator(x, 4) - self.assertEqual(list(itr), [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]) - itr = iterators.GroupedIterator(x, 5) - self.assertEqual(list(itr), [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) - - # test CountingIterator functionality - x = list(range(30)) - ref = list(iterators.GroupedIterator(x, 3)) - itr = iterators.GroupedIterator(x, 3) - self.test_counting_iterator(ref, itr) - - def test_sharded_iterator(self): - # test correctness - x = list(range(10)) - itr = iterators.ShardedIterator(x, num_shards=1, shard_id=0) - self.assertEqual(list(itr), x) - itr = iterators.ShardedIterator(x, num_shards=2, shard_id=0) - self.assertEqual(list(itr), [0, 2, 4, 6, 8]) - itr = iterators.ShardedIterator(x, num_shards=2, shard_id=1) - self.assertEqual(list(itr), [1, 3, 5, 7, 9]) - itr = iterators.ShardedIterator(x, num_shards=3, shard_id=0) - self.assertEqual(list(itr), [0, 3, 6, 9]) - itr = iterators.ShardedIterator(x, num_shards=3, shard_id=1) - self.assertEqual(list(itr), [1, 4, 7, None]) - itr = iterators.ShardedIterator(x, num_shards=3, shard_id=2) - self.assertEqual(list(itr), [2, 5, 8, None]) - - # test CountingIterator functionality - x = list(range(30)) - ref = list(iterators.ShardedIterator(x, num_shards=3, shard_id=0)) - itr = iterators.ShardedIterator(x, num_shards=3, shard_id=0) - self.test_counting_iterator(ref, itr) - - def test_counting_iterator_take(self): - ref = list(range(10)) - itr = iterators.CountingIterator(ref) - itr.take(5) - self.assertEqual(len(itr), len(list(iter(itr)))) - self.assertEqual(len(itr), 5) - - itr = iterators.CountingIterator(ref) - itr.take(5) - self.assertEqual(next(itr), ref[0]) - self.assertEqual(next(itr), ref[1]) - itr.skip(2) - self.assertEqual(next(itr), ref[4]) - self.assertFalse(itr.has_next()) - - def test_counting_iterator_buffered_iterator_take(self): - ref = list(range(10)) - buffered_itr = iterators.BufferedIterator(2, ref) - itr = iterators.CountingIterator(buffered_itr) - itr.take(5) - self.assertEqual(len(itr), len(list(iter(itr)))) - self.assertEqual(len(itr), 5) - - buffered_itr = iterators.BufferedIterator(2, ref) - itr = iterators.CountingIterator(buffered_itr) - itr.take(5) - self.assertEqual(len(buffered_itr), 5) - self.assertEqual(len(list(iter(buffered_itr))), 5) - - buffered_itr = iterators.BufferedIterator(2, ref) - itr = iterators.CountingIterator(buffered_itr) - itr.take(5) - self.assertEqual(next(itr), ref[0]) - self.assertEqual(next(itr), ref[1]) - itr.skip(2) - self.assertEqual(next(itr), ref[4]) - self.assertFalse(itr.has_next()) - self.assertRaises(StopIteration, next, buffered_itr) - - ref = list(range(4, 
10)) - buffered_itr = iterators.BufferedIterator(2, ref) - itr = iterators.CountingIterator(buffered_itr, start=4) - itr.take(5) - self.assertEqual(len(itr), 5) - self.assertEqual(len(buffered_itr), 1) - self.assertEqual(next(itr), ref[0]) - self.assertFalse(itr.has_next()) - self.assertRaises(StopIteration, next, buffered_itr) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_label_smoothing.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_label_smoothing.py deleted file mode 100644 index 04c0f974ac80f7606327f868e948712c3c18f1d0..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_label_smoothing.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import copy -import unittest - -import tests.utils as test_utils -import torch -from fairseq.criterions.cross_entropy import CrossEntropyCriterion -from fairseq.criterions.label_smoothed_cross_entropy import ( - LabelSmoothedCrossEntropyCriterion, -) - - -class TestLabelSmoothing(unittest.TestCase): - def setUp(self): - # build dictionary - self.d = test_utils.dummy_dictionary(3) - vocab = len(self.d) - self.assertEqual(vocab, 4 + 3) # 4 special + 3 tokens - self.assertEqual(self.d.pad(), 1) - self.assertEqual(self.d.eos(), 2) - self.assertEqual(self.d.unk(), 3) - pad, eos, unk, w1, w2, w3 = 1, 2, 3, 4, 5, 6 # noqa: F841 - - # build dataset - self.data = [ - # the first batch item has padding - { - "source": torch.LongTensor([w1, eos]), - "target": torch.LongTensor([w1, eos]), - }, - { - "source": torch.LongTensor([w1, eos]), - "target": torch.LongTensor([w1, w1, eos]), - }, - ] - self.sample = next(test_utils.dummy_dataloader(self.data)) - - # build model - self.args = argparse.Namespace() - self.args.sentence_avg = False - self.args.report_accuracy = False - self.args.probs = ( - torch.FloatTensor( - [ - # pad eos unk w1 w2 w3 - [0.05, 0.05, 0.1, 0.05, 0.3, 0.4, 0.05], - [0.05, 0.10, 0.2, 0.05, 0.2, 0.3, 0.10], - [0.05, 0.15, 0.3, 0.05, 0.1, 0.2, 0.15], - ] - ) - .unsqueeze(0) - .expand(2, 3, 7) - ) # add batch dimension - self.task = test_utils.TestTranslationTask.setup_task(self.args, self.d, self.d) - self.model = self.task.build_model(self.args) - - def test_nll_loss(self): - self.args.label_smoothing = 0.1 - nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task) - smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion( - self.args, self.task - ) - nll_loss, nll_sample_size, nll_logging_output = nll_crit( - self.model, self.sample - ) - smooth_loss, smooth_sample_size, smooth_logging_output = smooth_crit( - self.model, self.sample - ) - self.assertLess(abs(nll_loss - nll_logging_output["loss"]), 1e-6) - self.assertLess(abs(nll_loss - smooth_logging_output["nll_loss"]), 1e-6) - - def test_padding(self): - self.args.label_smoothing = 0.1 - crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task) - loss, _, logging_output = crit(self.model, self.sample) - - def get_one_no_padding(idx): - # create a new sample with just a single batch item so that there's - # no padding - sample1 = next(test_utils.dummy_dataloader([self.data[idx]])) - args1 = copy.copy(self.args) - args1.probs = args1.probs[idx, :, 
:].unsqueeze(0) - model1 = self.task.build_model(args1) - loss1, _, _ = crit(model1, sample1) - return loss1 - - loss1 = get_one_no_padding(0) - loss2 = get_one_no_padding(1) - self.assertAlmostEqual(loss, loss1 + loss2) - - def test_reduction(self): - self.args.label_smoothing = 0.1 - crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task) - loss, _, logging_output = crit(self.model, self.sample, reduce=True) - unreduced_loss, _, _ = crit(self.model, self.sample, reduce=False) - self.assertAlmostEqual(loss, unreduced_loss.sum()) - - def test_zero_eps(self): - self.args.label_smoothing = 0.0 - nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task) - smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion( - self.args, self.task - ) - nll_loss, nll_sample_size, nll_logging_output = nll_crit( - self.model, self.sample - ) - smooth_loss, smooth_sample_size, smooth_logging_output = smooth_crit( - self.model, self.sample - ) - self.assertAlmostEqual(nll_loss, smooth_loss) - - def assertAlmostEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertLess((t1 - t2).abs().max(), 1e-6) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_lstm_jitable.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_lstm_jitable.py deleted file mode 100644 index 38f79d17931c32447e96c0fbae2630ac397e1804..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_lstm_jitable.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import tempfile -import unittest - -import torch -from fairseq.data.dictionary import Dictionary -from fairseq.models.lstm import LSTMModel -from fairseq.tasks.fairseq_task import LegacyFairseqTask - - -DEFAULT_TEST_VOCAB_SIZE = 100 - - -class DummyTask(LegacyFairseqTask): - def __init__(self, args): - super().__init__(args) - self.dictionary = get_dummy_dictionary() - if getattr(self.args, "ctc", False): - self.dictionary.add_symbol("<ctc_blank>") - self.src_dict = self.dictionary - self.tgt_dict = self.dictionary - - @property - def source_dictionary(self): - return self.src_dict - - @property - def target_dictionary(self): - return self.dictionary - - -def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE): - dummy_dict = Dictionary() - # add dummy symbol to satisfy vocab size - for id, _ in enumerate(range(vocab_size)): - dummy_dict.add_symbol("{}".format(id), 1000) - return dummy_dict - - -def get_dummy_task_and_parser(): - """ - to build a fariseq model, we need some dummy parse and task. This function - is used to create dummy task and parser to faciliate model/criterion test - - Note: we use FbSpeechRecognitionTask as the dummy task.
You may want - to use other task by providing another function - """ - parser = argparse.ArgumentParser( - description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS - ) - DummyTask.add_args(parser) - args = parser.parse_args([]) - task = DummyTask.setup_task(args) - return task, parser - - -class TestJitLSTMModel(unittest.TestCase): - def _test_save_and_load(self, scripted_module): - with tempfile.NamedTemporaryFile() as f: - scripted_module.save(f.name) - torch.jit.load(f.name) - - def assertTensorEqual(self, t1, t2): - t1 = t1[~torch.isnan(t1)] # can cause size mismatch errors if there are NaNs - t2 = t2[~torch.isnan(t2)] - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertEqual(t1.ne(t2).long().sum(), 0) - - def test_jit_and_export_lstm(self): - task, parser = get_dummy_task_and_parser() - LSTMModel.add_args(parser) - args = parser.parse_args([]) - args.criterion = "" - model = LSTMModel.build_model(args, task) - scripted_model = torch.jit.script(model) - self._test_save_and_load(scripted_model) - - def test_assert_jit_vs_nonjit_(self): - task, parser = get_dummy_task_and_parser() - LSTMModel.add_args(parser) - args = parser.parse_args([]) - args.criterion = "" - model = LSTMModel.build_model(args, task) - model.eval() - scripted_model = torch.jit.script(model) - scripted_model.eval() - idx = len(task.source_dictionary) - iter = 100 - # Inject random input and check output - seq_len_tensor = torch.randint(1, 10, (iter,)) - num_samples_tensor = torch.randint(1, 10, (iter,)) - for i in range(iter): - seq_len = seq_len_tensor[i] - num_samples = num_samples_tensor[i] - src_token = (torch.randint(0, idx, (num_samples, seq_len)),) - src_lengths = torch.randint(1, seq_len + 1, (num_samples,)) - src_lengths, _ = torch.sort(src_lengths, descending=True) - # Force the first sample to have seq_len - src_lengths[0] = seq_len - prev_output_token = (torch.randint(0, idx, (num_samples, 1)),) - result = model(src_token[0], src_lengths, prev_output_token[0], None) - scripted_result = scripted_model( - src_token[0], src_lengths, prev_output_token[0], None - ) - self.assertTensorEqual(result[0], scripted_result[0]) - self.assertTensorEqual(result[1], scripted_result[1]) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_memory_efficient_fp16.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_memory_efficient_fp16.py deleted file mode 100644 index e10636d96af8653330fa732484550cc276cbce1b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_memory_efficient_fp16.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import logging -import unittest - -import torch -from fairseq.optim.adam import FairseqAdam -from fairseq.optim.fp16_optimizer import MemoryEfficientFP16Optimizer - - -@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") -class TestMemoryEfficientFP16(unittest.TestCase): - def setUp(self): - logging.disable(logging.CRITICAL) - - def tearDown(self): - logging.disable(logging.NOTSET) - - def test_load_state_dict(self): - # define simple FP16 model - model = torch.nn.Linear(5, 5).cuda().half() - params = list(model.parameters()) - - # initialize memory efficient FP16 optimizer - optimizer = FairseqAdam( - argparse.Namespace( - lr=[0.00001], - adam_betas="(0.9, 0.999)", - adam_eps=1e-8, - weight_decay=0.0, - ), - params, - ) - me_optimizer = MemoryEfficientFP16Optimizer( - argparse.Namespace( - fp16_init_scale=1, - fp16_scale_window=1, - fp16_scale_tolerance=1, - threshold_loss_scale=1, - min_loss_scale=1e-4, - ), - params, - optimizer, - ) - - # optimizer state is created in the first step - loss = model(torch.rand(5).cuda().half()).sum() - me_optimizer.backward(loss) - me_optimizer.step() - - # reload state - state = me_optimizer.state_dict() - me_optimizer.load_state_dict(state) - for k, v in me_optimizer.optimizer.state.items(): - self.assertTrue(k.dtype == torch.float16) - for v_i in v.values(): - if torch.is_tensor(v_i): - self.assertTrue(v_i.dtype == torch.float32) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_metrics.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_metrics.py deleted file mode 100644 index 2de6969cf4445bc6cda44dacf6de765ea30d5f5b..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_metrics.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest -import uuid - -from fairseq import metrics - - -class TestMetrics(unittest.TestCase): - def test_nesting(self): - with metrics.aggregate() as a: - metrics.log_scalar("loss", 1) - with metrics.aggregate() as b: - metrics.log_scalar("loss", 2) - - self.assertEqual(a.get_smoothed_values()["loss"], 1.5) - self.assertEqual(b.get_smoothed_values()["loss"], 2) - - def test_new_root(self): - with metrics.aggregate() as a: - metrics.log_scalar("loss", 1) - with metrics.aggregate(new_root=True) as b: - metrics.log_scalar("loss", 2) - - self.assertEqual(a.get_smoothed_values()["loss"], 1) - self.assertEqual(b.get_smoothed_values()["loss"], 2) - - def test_nested_new_root(self): - with metrics.aggregate() as layer1: - metrics.log_scalar("loss", 1) - with metrics.aggregate(new_root=True) as layer2: - metrics.log_scalar("loss", 2) - with metrics.aggregate() as layer3: - metrics.log_scalar("loss", 3) - with metrics.aggregate(new_root=True) as layer4: - metrics.log_scalar("loss", 4) - metrics.log_scalar("loss", 1.5) - - self.assertEqual(layer4.get_smoothed_values()["loss"], 4) - self.assertEqual(layer3.get_smoothed_values()["loss"], 3) - self.assertEqual(layer2.get_smoothed_values()["loss"], 2.5) - self.assertEqual(layer1.get_smoothed_values()["loss"], 1.25) - - def test_named(self): - name = str(uuid.uuid4()) - metrics.reset_meters(name) - - with metrics.aggregate(name): - metrics.log_scalar("loss", 1) - - metrics.log_scalar("loss", 3) - - with metrics.aggregate(name): - metrics.log_scalar("loss", 2) - - self.assertEqual(metrics.get_smoothed_values(name)["loss"], 1.5) - - def test_nested_duplicate_names(self): - name = str(uuid.uuid4()) - metrics.reset_meters(name) - - with metrics.aggregate(name): - metrics.log_scalar("loss", 1) - with metrics.aggregate() as other: - with metrics.aggregate(name): - metrics.log_scalar("loss", 2) - metrics.log_scalar("loss", 6) - - self.assertEqual(metrics.get_smoothed_values(name)["loss"], 3) - self.assertEqual(other.get_smoothed_values()["loss"], 2) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_multi_corpus_sampled_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_multi_corpus_sampled_dataset.py deleted file mode 100644 index 05b20328c5605178767d138cc75e070824679842..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_multi_corpus_sampled_dataset.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest -from collections import OrderedDict - -import numpy as np -import torch -from fairseq.data import LanguagePairDataset, TokenBlockDataset -from fairseq.data.multi_corpus_sampled_dataset import MultiCorpusSampledDataset -from tests.test_train import mock_dict - - -class TestMultiCorpusSampledDataset(unittest.TestCase): - def setUp(self): - d = mock_dict() - tokens_1 = torch.LongTensor([1]).view(1, -1) - tokens_ds1 = TokenBlockDataset( - tokens_1, - sizes=[tokens_1.size(-1)], - block_size=1, - pad=0, - eos=1, - include_targets=False, - ) - self.dataset_1 = LanguagePairDataset( - tokens_ds1, tokens_ds1.sizes, d, shuffle=False - ) - tokens_2 = torch.LongTensor([2]).view(1, -1) - tokens_ds2 = TokenBlockDataset( - tokens_2, - sizes=[tokens_2.size(-1)], - block_size=1, - pad=0, - eos=1, - include_targets=False, - ) - self.dataset_2 = LanguagePairDataset( - tokens_ds2, tokens_ds2.sizes, d, shuffle=False - ) - - def _test_sample_helper( - self, - expected_sample_from_first_ds_percentage, - num_samples=1000, - sampling_func=None, - ): - # To make sure test is not flaky - np.random.seed(0) - if sampling_func is None: - m = MultiCorpusSampledDataset( - OrderedDict({0: self.dataset_1, 1: self.dataset_2}), - ) - else: - m = MultiCorpusSampledDataset( - OrderedDict({0: self.dataset_1, 1: self.dataset_2}), - sampling_func=sampling_func, - ) - m.ordered_indices() - count_sample_from_first_dataset = 0 - for _ in range(num_samples): - if m.collater([m[0], m[1]])["net_input"]["src_tokens"][0] == 1: - count_sample_from_first_dataset += 1 - sample_from_first_ds_percentage = ( - 1.0 * count_sample_from_first_dataset / num_samples - ) - self.assertLess( - abs( - sample_from_first_ds_percentage - - expected_sample_from_first_ds_percentage - ), - 0.01, - ) - - def test_multi_corpus_sampled_dataset_uniform_sample(self): - self._test_sample_helper(expected_sample_from_first_ds_percentage=0.5) - - def test_multi_corpus_sampled_dataset_weighted_sample(self): - def naive_weighted_sample(weights): - def f(l): - v = np.random.random() - agg = 0 - for i, weight in enumerate(weights): - agg += weight - if agg > v: - return i - - return f - - self._test_sample_helper( - expected_sample_from_first_ds_percentage=0.9, - sampling_func=naive_weighted_sample(weights=[0.9, 0.1]), - ) diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_multihead_attention.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_multihead_attention.py deleted file mode 100644 index 9aa9cb2f87f45e9674332429ae1ddec2228214b2..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_multihead_attention.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import unittest - -import torch -from fairseq.modules.multihead_attention import MultiheadAttention - - -class TestMultiheadAttention(unittest.TestCase): - def test_append_prev_key_padding_mask(self): - bsz = 1 - src_len = 4 - - cases = [ - # no padding mask - (None, None, None), - # current padding mask only - ( - torch.tensor([[1]]).bool(), - None, - torch.tensor([[0, 0, 0, 1]]).bool(), - ), - # previous padding mask only - ( - None, - torch.tensor([[0, 1, 0]]).bool(), - torch.tensor([[0, 1, 0, 0]]).bool(), - ), - # both padding masks - ( - torch.tensor([[1]]).bool(), - torch.tensor([[0, 1, 0]]).bool(), - torch.tensor([[0, 1, 0, 1]]).bool(), - ), - ] - for c in cases: - key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( - c[0], - c[1], - batch_size=bsz, - src_len=src_len, - static_kv=False, - ) - - if key_padding_mask is not None: - self.assertTrue( - torch.all(torch.eq(key_padding_mask, c[2])), - f"Unexpected resultant key padding mask: {key_padding_mask}" - f" given current: {c[0]} and previous: {c[1]}", - ) - self.assertEqual(key_padding_mask.size(0), bsz) - self.assertEqual(key_padding_mask.size(1), src_len) - else: - self.assertIsNone(c[2]) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_noising.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_noising.py deleted file mode 100644 index b3d0d123c42eaca6f79371aa268049e668fcfcce..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_noising.py +++ /dev/null @@ -1,530 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import unittest -from typing import Dict, List - -import tests.utils as test_utils -import torch -from fairseq import utils -from fairseq.data import ( - Dictionary, - LanguagePairDataset, - TransformEosDataset, - data_utils, - noising, -) - - -class TestDataNoising(unittest.TestCase): - def _get_test_data_with_bpe_cont_marker(self, append_eos=True): - """ - Args: - append_eos: if True, each input sentence in the source tokens tensor - will have an EOS appended to the end. - - Returns: - vocabs: BPE vocab with continuation markers as suffixes to denote - non-end of word tokens. This is the standard BPE format used in - fairseq's preprocessing. - x: input tensor containing numberized source tokens, with EOS at the - end if append_eos is true - src_lengths: and source lengths. - """ - vocab = Dictionary() - vocab.add_symbol("he@@") - vocab.add_symbol("llo") - vocab.add_symbol("how") - vocab.add_symbol("are") - vocab.add_symbol("y@@") - vocab.add_symbol("ou") - vocab.add_symbol("n@@") - vocab.add_symbol("ew") - vocab.add_symbol("or@@") - vocab.add_symbol("k") - - src_tokens = [ - ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"], - ["how", "are", "y@@", "ou"], - ] - x, src_lengths = x, src_lengths = self._convert_src_tokens_to_tensor( - vocab=vocab, src_tokens=src_tokens, append_eos=append_eos - ) - return vocab, x, src_lengths - - def _get_test_data_with_bpe_end_marker(self, append_eos=True): - """ - Args: - append_eos: if True, each input sentence in the source tokens tensor - will have an EOS appended to the end. - - Returns: - vocabs: BPE vocab with end-of-word markers as suffixes to denote - tokens at the end of a word. 
This is an alternative to fairseq's - standard preprocessing framework and is not generally supported - within fairseq. - x: input tensor containing numberized source tokens, with EOS at the - end if append_eos is true - src_lengths: and source lengths. - """ - vocab = Dictionary() - vocab.add_symbol("he") - vocab.add_symbol("llo_EOW") - vocab.add_symbol("how_EOW") - vocab.add_symbol("are_EOW") - vocab.add_symbol("y") - vocab.add_symbol("ou_EOW") - vocab.add_symbol("n") - vocab.add_symbol("ew_EOW") - vocab.add_symbol("or") - vocab.add_symbol("k_EOW") - - src_tokens = [ - ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"], - ["how_EOW", "are_EOW", "y", "ou_EOW"], - ] - x, src_lengths = x, src_lengths = self._convert_src_tokens_to_tensor( - vocab=vocab, src_tokens=src_tokens, append_eos=append_eos - ) - return vocab, x, src_lengths - - def _get_test_data_with_word_vocab(self, append_eos=True): - """ - Args: - append_eos: if True, each input sentence in the source tokens tensor - will have an EOS appended to the end. - - Returns: - vocabs: word vocab - x: input tensor containing numberized source tokens, with EOS at the - end if append_eos is true - src_lengths: and source lengths. - """ - vocab = Dictionary() - - vocab.add_symbol("hello") - vocab.add_symbol("how") - vocab.add_symbol("are") - vocab.add_symbol("you") - vocab.add_symbol("new") - vocab.add_symbol("york") - src_tokens = [ - ["hello", "new", "york", "you"], - ["how", "are", "you", "new", "york"], - ] - x, src_lengths = self._convert_src_tokens_to_tensor( - vocab=vocab, src_tokens=src_tokens, append_eos=append_eos - ) - return vocab, x, src_lengths - - def _convert_src_tokens_to_tensor( - self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool - ): - src_len = [len(x) for x in src_tokens] - # If we have to append EOS, we include EOS in counting src length - if append_eos: - src_len = [length + 1 for length in src_len] - - x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad()) - for i in range(len(src_tokens)): - for j in range(len(src_tokens[i])): - x[i][j] = vocab.index(src_tokens[i][j]) - if append_eos: - x[i][j + 1] = vocab.eos() - - x = x.transpose(1, 0) - return x, torch.LongTensor(src_len) - - def assert_eos_at_end(self, x, x_len, eos): - """Asserts last token of every sentence in x is EOS """ - for i in range(len(x_len)): - self.assertEqual( - x[x_len[i] - 1][i], - eos, - ( - "Expected eos (token id {eos}) at the end of sentence {i} " - "but got {other} instead" - ).format(i=i, eos=eos, other=x[i][-1]), - ) - - def assert_word_dropout_correct(self, x, x_noised, x_len, l_noised): - # Expect only the first word (2 bpe tokens) of the first example - # was dropped out - self.assertEqual(x_len[0] - 2, l_noised[0]) - for i in range(l_noised[0]): - self.assertEqual(x_noised[i][0], x[i + 2][0]) - - def test_word_dropout_with_eos(self): - vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) - - with data_utils.numpy_seed(1234): - noising_gen = noising.WordDropout(vocab) - x_noised, l_noised = noising_gen.noising(x, x_len, 0.2) - self.assert_word_dropout_correct( - x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised - ) - self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) - - def assert_word_blanking_correct(self, x, x_noised, x_len, l_noised, unk): - # Expect only the first word (2 bpe tokens) of the first example - # was blanked out - self.assertEqual(x_len[0], l_noised[0]) - for i in range(l_noised[0]): - if i < 2: - self.assertEqual(x_noised[i][0], unk) - 
else: - self.assertEqual(x_noised[i][0], x[i][0]) - - def test_word_blank_with_eos(self): - vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) - - with data_utils.numpy_seed(1234): - noising_gen = noising.WordDropout(vocab) - x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk()) - self.assert_word_blanking_correct( - x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk() - ) - self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) - - def generate_unchanged_shuffle_map(self, length): - return {i: i for i in range(length)} - - def assert_word_shuffle_matches_expected( - self, - x, - x_len, - max_shuffle_distance: int, - vocab: Dictionary, - expected_shufle_maps: List[Dict[int, int]], - expect_eos_at_end: bool, - bpe_end_marker=None, - ): - """ - This verifies that with a given x, x_len, max_shuffle_distance, and - vocab, we get the expected shuffle result. - - Args: - x: Tensor of shape (T x B) = (sequence_length, batch_size) - x_len: Tensor of length B = batch_size - max_shuffle_distance: arg to pass to noising - expected_shuffle_maps: List[mapping] where mapping is a - Dict[old_index, new_index], mapping x's elements from their - old positions in x to their new positions in x. - expect_eos_at_end: if True, check the output to make sure there is - an EOS at the end. - bpe_end_marker: str denoting the BPE end token. If this is not None, we - set the BPE cont token to None in the noising classes. - """ - bpe_cont_marker = None - if bpe_end_marker is None: - bpe_cont_marker = "@@" - - with data_utils.numpy_seed(1234): - word_shuffle = noising.WordShuffle( - vocab, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker - ) - x_noised, l_noised = word_shuffle.noising( - x, x_len, max_shuffle_distance=max_shuffle_distance - ) - - # For every example, we have a different expected shuffle map. We check - # that each example is shuffled as expected according to each - # corresponding shuffle map. 
- for i in range(len(expected_shufle_maps)): - shuffle_map = expected_shufle_maps[i] - for k, v in shuffle_map.items(): - self.assertEqual(x[k][i], x_noised[v][i]) - - # Shuffling should not affect the length of each example - for pre_shuffle_length, post_shuffle_length in zip(x_len, l_noised): - self.assertEqual(pre_shuffle_length, post_shuffle_length) - if expect_eos_at_end: - self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) - - def test_word_shuffle_with_eos(self): - vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) - - # Assert word shuffle with max shuffle distance 0 causes input to be - # unchanged - self.assert_word_shuffle_matches_expected( - x=x, - x_len=x_len, - max_shuffle_distance=0, - vocab=vocab, - expected_shufle_maps=[ - self.generate_unchanged_shuffle_map(example_len) - for example_len in x_len - ], - expect_eos_at_end=True, - ) - - # Assert word shuffle with max shuffle distance 3 matches our expected - # shuffle order - self.assert_word_shuffle_matches_expected( - x=x, - x_len=x_len, - vocab=vocab, - max_shuffle_distance=3, - expected_shufle_maps=[ - self.generate_unchanged_shuffle_map(x_len[0]), - {0: 0, 1: 3, 2: 1, 3: 2}, - ], - expect_eos_at_end=True, - ) - - def test_word_shuffle_with_eos_nonbpe(self): - """The purpose of this is to test shuffling logic with word vocabs""" - vocab, x, x_len = self._get_test_data_with_word_vocab(append_eos=True) - - # Assert word shuffle with max shuffle distance 0 causes input to be - # unchanged - self.assert_word_shuffle_matches_expected( - x=x, - x_len=x_len, - max_shuffle_distance=0, - vocab=vocab, - expected_shufle_maps=[ - self.generate_unchanged_shuffle_map(example_len) - for example_len in x_len - ], - expect_eos_at_end=True, - ) - - # Assert word shuffle with max shuffle distance 3 matches our expected - # shuffle order - self.assert_word_shuffle_matches_expected( - x=x, - x_len=x_len, - vocab=vocab, - max_shuffle_distance=3, - expected_shufle_maps=[ - {0: 0, 1: 1, 2: 3, 3: 2}, - {0: 0, 1: 2, 2: 1, 3: 3, 4: 4}, - ], - expect_eos_at_end=True, - ) - - def test_word_shuffle_without_eos(self): - """Same result as word shuffle with eos except no EOS at end""" - vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) - - # Assert word shuffle with max shuffle distance 0 causes input to be - # unchanged - self.assert_word_shuffle_matches_expected( - x=x, - x_len=x_len, - max_shuffle_distance=0, - vocab=vocab, - expected_shufle_maps=[ - self.generate_unchanged_shuffle_map(example_len) - for example_len in x_len - ], - expect_eos_at_end=False, - ) - - # Assert word shuffle with max shuffle distance 3 matches our expected - # shuffle order - self.assert_word_shuffle_matches_expected( - x=x, - x_len=x_len, - vocab=vocab, - max_shuffle_distance=3, - expected_shufle_maps=[ - self.generate_unchanged_shuffle_map(x_len[0]), - {0: 0, 1: 3, 2: 1, 3: 2}, - ], - expect_eos_at_end=False, - ) - - def test_word_shuffle_without_eos_with_bpe_end_marker(self): - """Same result as word shuffle without eos except using BPE end token""" - vocab, x, x_len = self._get_test_data_with_bpe_end_marker(append_eos=False) - - # Assert word shuffle with max shuffle distance 0 causes input to be - # unchanged - self.assert_word_shuffle_matches_expected( - x=x, - x_len=x_len, - max_shuffle_distance=0, - vocab=vocab, - expected_shufle_maps=[ - self.generate_unchanged_shuffle_map(example_len) - for example_len in x_len - ], - expect_eos_at_end=False, - bpe_end_marker="_EOW", - ) - - # Assert word 
shuffle with max shuffle distance 3 matches our expected - # shuffle order - self.assert_word_shuffle_matches_expected( - x=x, - x_len=x_len, - vocab=vocab, - max_shuffle_distance=3, - expected_shufle_maps=[ - self.generate_unchanged_shuffle_map(x_len[0]), - {0: 0, 1: 3, 2: 1, 3: 2}, - ], - expect_eos_at_end=False, - bpe_end_marker="_EOW", - ) - - def assert_no_eos_at_end(self, x, x_len, eos): - """Asserts that the last token of each sentence in x is not EOS """ - for i in range(len(x_len)): - self.assertNotEqual( - x[x_len[i] - 1][i], - eos, - "Expected no eos (token id {eos}) at the end of sentence {i}.".format( - eos=eos, i=i - ), - ) - - def test_word_dropout_without_eos(self): - """Same result as word dropout with eos except no EOS at end""" - vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) - - with data_utils.numpy_seed(1234): - noising_gen = noising.WordDropout(vocab) - x_noised, l_noised = noising_gen.noising(x, x_len, 0.2) - self.assert_word_dropout_correct( - x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised - ) - self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) - - def test_word_blank_without_eos(self): - """Same result as word blank with eos except no EOS at end""" - vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) - - with data_utils.numpy_seed(1234): - noising_gen = noising.WordDropout(vocab) - x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk()) - self.assert_word_blanking_correct( - x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk() - ) - self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) - - def _get_noising_dataset_batch( - self, - src_tokens_no_pad, - src_dict, - append_eos_to_tgt=False, - ): - """ - Constructs a NoisingDataset and the corresponding - ``LanguagePairDataset(NoisingDataset(src), src)``. If - *append_eos_to_tgt* is True, wrap the source dataset in - :class:`TransformEosDataset` to append EOS to the clean source when - using it as the target. 
- """ - src_dataset = test_utils.TestDataset(data=src_tokens_no_pad) - - noising_dataset = noising.NoisingDataset( - src_dataset=src_dataset, - src_dict=src_dict, - seed=1234, - max_word_shuffle_distance=3, - word_dropout_prob=0.2, - word_blanking_prob=0.2, - noising_class=noising.UnsupervisedMTNoising, - ) - tgt = src_dataset - language_pair_dataset = LanguagePairDataset( - src=noising_dataset, tgt=tgt, src_sizes=None, src_dict=src_dict - ) - language_pair_dataset = TransformEosDataset( - language_pair_dataset, - src_dict.eos(), - append_eos_to_tgt=append_eos_to_tgt, - ) - - dataloader = torch.utils.data.DataLoader( - dataset=language_pair_dataset, - batch_size=2, - collate_fn=language_pair_dataset.collater, - ) - denoising_batch_result = next(iter(dataloader)) - return denoising_batch_result - - def test_noising_dataset_with_eos(self): - src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker( - append_eos=True - ) - - # Format data for src_dataset - src_tokens = torch.t(src_tokens) - src_tokens_no_pad = [] - for src_sentence in src_tokens: - src_tokens_no_pad.append( - utils.strip_pad(tensor=src_sentence, pad=src_dict.pad()) - ) - denoising_batch_result = self._get_noising_dataset_batch( - src_tokens_no_pad=src_tokens_no_pad, src_dict=src_dict - ) - - eos, pad = src_dict.eos(), src_dict.pad() - - # Generated noisy source as source - expected_src = torch.LongTensor( - [[4, 5, 10, 11, 8, 12, 13, eos], [pad, pad, pad, 6, 8, 9, 7, eos]] - ) - # Original clean source as target (right-padded) - expected_tgt = torch.LongTensor( - [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]] - ) - generated_src = denoising_batch_result["net_input"]["src_tokens"] - tgt_tokens = denoising_batch_result["target"] - - self.assertTensorEqual(expected_src, generated_src) - self.assertTensorEqual(expected_tgt, tgt_tokens) - - def test_noising_dataset_without_eos(self): - """ - Similar to test noising dataset with eos except that we have to set - *append_eos_to_tgt* to ``True``. 
- """ - - src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker( - append_eos=False - ) - - # Format data for src_dataset - src_tokens = torch.t(src_tokens) - src_tokens_no_pad = [] - for src_sentence in src_tokens: - src_tokens_no_pad.append( - utils.strip_pad(tensor=src_sentence, pad=src_dict.pad()) - ) - denoising_batch_result = self._get_noising_dataset_batch( - src_tokens_no_pad=src_tokens_no_pad, - src_dict=src_dict, - append_eos_to_tgt=True, - ) - - eos, pad = src_dict.eos(), src_dict.pad() - - # Generated noisy source as source - expected_src = torch.LongTensor( - [[4, 5, 10, 11, 8, 12, 13], [pad, pad, pad, 6, 8, 9, 7]] - ) - # Original clean source as target (right-padded) - expected_tgt = torch.LongTensor( - [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]] - ) - - generated_src = denoising_batch_result["net_input"]["src_tokens"] - tgt_tokens = denoising_batch_result["target"] - - self.assertTensorEqual(expected_src, generated_src) - self.assertTensorEqual(expected_tgt, tgt_tokens) - - def assertTensorEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertEqual(t1.ne(t2).long().sum(), 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_reproducibility.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_reproducibility.py deleted file mode 100644 index 517e23c39ef375371081b8721b873779e6067c22..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_reproducibility.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import contextlib -import json -import os -import tempfile -import unittest -from io import StringIO - -import torch - -from . 
import test_binaries - - -class TestReproducibility(unittest.TestCase): - def _test_reproducibility( - self, - name, - extra_flags=None, - delta=0.0001, - resume_checkpoint="checkpoint1.pt", - max_epoch=3, - ): - def get_last_log_stats_containing_string(log_records, search_string): - for log_record in logs.records[::-1]: - if search_string in log_record.msg: - return json.loads(log_record.msg) - - if extra_flags is None: - extra_flags = [] - - with tempfile.TemporaryDirectory(name) as data_dir: - with self.assertLogs() as logs: - test_binaries.create_dummy_data(data_dir) - test_binaries.preprocess_translation_data(data_dir) - - # train epochs 1 and 2 together - with self.assertLogs() as logs: - test_binaries.train_translation_model( - data_dir, - "fconv_iwslt_de_en", - [ - "--dropout", - "0.0", - "--log-format", - "json", - "--log-interval", - "1", - "--max-epoch", - str(max_epoch), - ] - + extra_flags, - ) - train_log = get_last_log_stats_containing_string(logs.records, "train_loss") - valid_log = get_last_log_stats_containing_string(logs.records, "valid_loss") - - # train epoch 2, resuming from previous checkpoint 1 - os.rename( - os.path.join(data_dir, resume_checkpoint), - os.path.join(data_dir, "checkpoint_last.pt"), - ) - with self.assertLogs() as logs: - test_binaries.train_translation_model( - data_dir, - "fconv_iwslt_de_en", - [ - "--dropout", - "0.0", - "--log-format", - "json", - "--log-interval", - "1", - "--max-epoch", - str(max_epoch), - ] - + extra_flags, - ) - train_res_log = get_last_log_stats_containing_string( - logs.records, "train_loss" - ) - valid_res_log = get_last_log_stats_containing_string( - logs.records, "valid_loss" - ) - - for k in ["train_loss", "train_ppl", "train_num_updates", "train_gnorm"]: - self.assertAlmostEqual( - float(train_log[k]), float(train_res_log[k]), delta=delta - ) - for k in [ - "valid_loss", - "valid_ppl", - "valid_num_updates", - "valid_best_loss", - ]: - self.assertAlmostEqual( - float(valid_log[k]), float(valid_res_log[k]), delta=delta - ) - - def test_reproducibility(self): - self._test_reproducibility("test_reproducibility") - - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_reproducibility_fp16(self): - self._test_reproducibility( - "test_reproducibility_fp16", - [ - "--fp16", - "--fp16-init-scale", - "4096", - ], - delta=0.011, - ) - - @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") - def test_reproducibility_memory_efficient_fp16(self): - self._test_reproducibility( - "test_reproducibility_memory_efficient_fp16", - [ - "--memory-efficient-fp16", - "--fp16-init-scale", - "4096", - ], - ) - - def test_mid_epoch_reproducibility(self): - self._test_reproducibility( - "test_mid_epoch_reproducibility", - ["--save-interval-updates", "3"], - resume_checkpoint="checkpoint_1_3.pt", - max_epoch=1, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_resampling_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_resampling_dataset.py deleted file mode 100644 index ccb53a253ce6ca0d8e972adfa708144b4299b3cb..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_resampling_dataset.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import collections -import unittest - -import numpy as np -from fairseq.data import ListDataset, ResamplingDataset - - -class TestResamplingDataset(unittest.TestCase): - def setUp(self): - self.strings = ["ab", "c", "def", "ghij"] - self.weights = [4.0, 2.0, 7.0, 1.5] - self.size_ratio = 2 - self.dataset = ListDataset( - self.strings, np.array([len(s) for s in self.strings]) - ) - - def _test_common(self, resampling_dataset, iters): - assert len(self.dataset) == len(self.strings) == len(self.weights) - assert len(resampling_dataset) == self.size_ratio * len(self.strings) - - results = {"ordered_by_size": True, "max_distribution_diff": 0.0} - - totalfreqs = 0 - freqs = collections.defaultdict(int) - - for epoch_num in range(iters): - resampling_dataset.set_epoch(epoch_num) - - indices = resampling_dataset.ordered_indices() - assert len(indices) == len(resampling_dataset) - - prev_size = -1 - - for i in indices: - cur_size = resampling_dataset.size(i) - # Make sure indices map to same sequences within an epoch - assert resampling_dataset[i] == resampling_dataset[i] - - # Make sure length of sequence is correct - assert cur_size == len(resampling_dataset[i]) - - freqs[resampling_dataset[i]] += 1 - totalfreqs += 1 - - if prev_size > cur_size: - results["ordered_by_size"] = False - - prev_size = cur_size - - assert set(freqs.keys()) == set(self.strings) - for s, weight in zip(self.strings, self.weights): - freq = freqs[s] / totalfreqs - expected_freq = weight / sum(self.weights) - results["max_distribution_diff"] = max( - results["max_distribution_diff"], abs(expected_freq - freq) - ) - - return results - - def test_resampling_dataset_batch_by_size_false(self): - resampling_dataset = ResamplingDataset( - self.dataset, - self.weights, - size_ratio=self.size_ratio, - batch_by_size=False, - seed=0, - ) - - results = self._test_common(resampling_dataset, iters=1000) - - # For batch_by_size = False, the batches should be returned in - # arbitrary order of size. - assert not results["ordered_by_size"] - - # Allow tolerance in distribution error of 2%. - assert results["max_distribution_diff"] < 0.02 - - def test_resampling_dataset_batch_by_size_true(self): - resampling_dataset = ResamplingDataset( - self.dataset, - self.weights, - size_ratio=self.size_ratio, - batch_by_size=True, - seed=0, - ) - - results = self._test_common(resampling_dataset, iters=1000) - - # For batch_by_size = True, the batches should be returned in - # increasing order of size. - assert results["ordered_by_size"] - - # Allow tolerance in distribution error of 2%. - assert results["max_distribution_diff"] < 0.02 - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sequence_generator.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sequence_generator.py deleted file mode 100644 index c890b655ff7f488118217dd534fe7db6c22e9e22..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sequence_generator.py +++ /dev/null @@ -1,647 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import tempfile -import unittest - -import tests.utils as test_utils -import torch -from fairseq import search -from fairseq.data.dictionary import Dictionary -from fairseq.models.transformer import TransformerModel -from fairseq.sequence_generator import EnsembleModel, SequenceGenerator -from fairseq.tasks.fairseq_task import LegacyFairseqTask - - -DEFAULT_TEST_VOCAB_SIZE = 100 - - -class DummyTask(LegacyFairseqTask): - def __init__(self, args): - super().__init__(args) - self.dictionary = get_dummy_dictionary() - if getattr(self.args, "ctc", False): - self.dictionary.add_symbol("") - self.src_dict = self.dictionary - self.tgt_dict = self.dictionary - - @property - def source_dictionary(self): - return self.src_dict - - @property - def target_dictionary(self): - return self.dictionary - - -def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE): - dummy_dict = Dictionary() - # add dummy symbol to satisfy vocab size - for id, _ in enumerate(range(vocab_size)): - dummy_dict.add_symbol("{}".format(id), 1000) - return dummy_dict - - -def get_dummy_task_and_parser(): - """ - to build a fariseq model, we need some dummy parse and task. This function - is used to create dummy task and parser to faciliate model/criterion test - - Note: we use FbSpeechRecognitionTask as the dummy task. You may want - to use other task by providing another function - """ - parser = argparse.ArgumentParser( - description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS - ) - DummyTask.add_args(parser) - args = parser.parse_args([]) - task = DummyTask.setup_task(args) - return task, parser - - -class TestJitSequenceGeneratorBase(unittest.TestCase): - def setUp(self): - self.task, self.parser = get_dummy_task_and_parser() - eos = self.task.tgt_dict.eos() - src_tokens = torch.randint(3, 50, (2, 10)).long() - src_tokens = torch.cat((src_tokens, torch.LongTensor([[eos], [eos]])), -1) - src_lengths = torch.LongTensor([2, 10]) - self.sample = { - "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths} - } - TransformerModel.add_args(self.parser) - args = self.parser.parse_args([]) - args.encoder_layers = 2 - args.decoder_layers = 1 - self.transformer_model = TransformerModel.build_model(args, self.task) - - def assertOutputEqual(self, hypo, pos_probs): - pos_scores = torch.FloatTensor(pos_probs).log() - self.assertTensorSizeEqual(hypo["positional_scores"], pos_scores) - self.assertTensorSizeEqual(pos_scores.numel(), hypo["tokens"].numel()) - - def assertTensorSizeEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - - def assertAlmostEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertLess((t1 - t2).abs().max(), 1e-4) - - def assertTensorEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertEqual(t1.ne(t2).long().sum(), 0) - - def assertHypoEqual(self, h1, h2): - "Check two hypos are equal" - self.assertTensorEqual(h1["tokens"], h2["tokens"]) - self.assertAlmostEqual(h1["positional_scores"], h2["positional_scores"]) - self.assertLess(abs(h1["score"] - h2["score"]), 1e-6) - self.assertAlmostEqual(h1["attention"], h2["attention"]) - - def _test_save_and_load(self, scripted_module): - with tempfile.NamedTemporaryFile() as f: - scripted_module.save(f.name) - torch.jit.load(f.name) - - -class TestJitSequeneceGenerator(TestJitSequenceGeneratorBase): - @unittest.skipIf( - torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release" - ) - def 
test_export_transformer(self): - model = self.transformer_model - torch.jit.script(model) - - @unittest.skipIf( - torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release" - ) - def test_ensemble_sequence_generator(self): - model = self.transformer_model - generator = SequenceGenerator( - [model], self.task.tgt_dict, beam_size=2, no_repeat_ngram_size=2 - ) - scripted_model = torch.jit.script(generator) - self._test_save_and_load(scripted_model) - - -class TestJitEnsemble(TestJitSequenceGeneratorBase): - @unittest.skipIf( - torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release" - ) - def test_export_ensemble_model(self): - model = self.transformer_model - ensemble_models = EnsembleModel([model]) - torch.jit.script(ensemble_models) - - -class TestExportSearch(unittest.TestCase): - def setUp(self): - task, _ = get_dummy_task_and_parser() - self.tgt_dict = task.tgt_dict - self.min_top1_prob = 0.4 - - def test_export_diverse_bs(self): - search_strategy = search.DiverseBeamSearch( - self.tgt_dict, num_groups=2, diversity_strength=0.0 - ) - torch.jit.script(search_strategy) - - def test_export_sampling(self): - low_sampling_topp = self.min_top1_prob / 2.0 - search_strategy = search.Sampling( - self.tgt_dict, sampling_topp=low_sampling_topp - ) - torch.jit.script(search_strategy) - - def test_export_diverse_siblings_search(self): - search_strategy = search.DiverseSiblingsSearch( - self.tgt_dict, diversity_rate=0.5 - ) - torch.jit.script(search_strategy) - - -class TestSequenceGeneratorBase(unittest.TestCase): - def assertHypoTokens(self, hypo, tokens): - self.assertTensorEqual(hypo["tokens"], torch.LongTensor(tokens)) - - def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): - pos_scores = torch.FloatTensor(pos_probs).log() - self.assertAlmostEqual(hypo["positional_scores"], pos_scores) - self.assertEqual(pos_scores.numel(), hypo["tokens"].numel()) - score = pos_scores.sum() - if normalized: - score /= pos_scores.numel() ** lenpen - self.assertLess(abs(score - hypo["score"]), 1e-6) - - def assertAlmostEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertLess((t1 - t2).abs().max(), 1e-4) - - def assertTensorEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertEqual(t1.ne(t2).long().sum(), 0) - - -class TestSequeneceGenerator(TestSequenceGeneratorBase): - def setUp(self): - ( - self.tgt_dict, - self.w1, - self.w2, - src_tokens, - src_lengths, - self.model, - ) = test_utils.sequence_generator_setup() - self.sample = { - "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths} - } - - def test_with_normalization(self): - generator = SequenceGenerator([self.model], self.tgt_dict, beam_size=2) - hypos = generator.forward(self.sample) - eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, eos]) - self.assertHypoScore(hypos[0][0], [0.9, 1.0]) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos]) - self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0]) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos]) - self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0]) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w1, w2, eos]) - self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6]) - - def test_without_normalization(self): - # Sentence 1: unchanged from the normalized case - # Sentence 2: beams swap order - generator = SequenceGenerator( 
- [self.model], self.tgt_dict, beam_size=2, normalize_scores=False - ) - hypos = generator.forward(self.sample) - eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, eos]) - self.assertHypoScore(hypos[0][0], [0.9, 1.0], normalized=False) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos]) - self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], normalized=False) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) - self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], normalized=False) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos]) - self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], normalized=False) - - def test_with_lenpen_favoring_short_hypos(self): - lenpen = 0.6 - generator = SequenceGenerator( - [self.model], self.tgt_dict, beam_size=2, len_penalty=lenpen - ) - hypos = generator.forward(self.sample) - eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, eos]) - self.assertHypoScore(hypos[0][0], [0.9, 1.0], lenpen=lenpen) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos]) - self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) - self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], lenpen=lenpen) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos]) - self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen) - - def test_with_lenpen_favoring_long_hypos(self): - lenpen = 5.0 - generator = SequenceGenerator( - [self.model], self.tgt_dict, beam_size=2, len_penalty=lenpen - ) - hypos = generator.forward(self.sample) - eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w2, w1, w2, eos]) - self.assertHypoScore(hypos[0][0], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w1, eos]) - self.assertHypoScore(hypos[0][1], [0.9, 1.0], lenpen=lenpen) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos]) - self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w1, w2, eos]) - self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6], lenpen=lenpen) - - def test_maxlen(self): - generator = SequenceGenerator( - [self.model], self.tgt_dict, beam_size=2, max_len_b=2 - ) - hypos = generator.forward(self.sample) - eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, eos]) - self.assertHypoScore(hypos[0][0], [0.9, 1.0]) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w2, w2, eos]) - self.assertHypoScore(hypos[0][1], [0.1, 0.1, 0.6]) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) - self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6]) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w2, w2, eos]) - self.assertHypoScore(hypos[1][1], [0.3, 0.9, 0.01]) - - def test_encoder_with_different_output_len(self): - args = self.model.encoder.args - task = test_utils.TestTranslationTask.setup_task( - args, self.tgt_dict, self.tgt_dict - ) - reshaping_model = test_utils.TestReshapingModel.build_model(args, task) - generator = SequenceGenerator( - [reshaping_model], self.tgt_dict, beam_size=2, max_len_b=2 - ) - hypos = 
generator.forward(self.sample) - for sent in [0, 1]: - for beam in [0, 1]: - assert hypos[sent][beam]["attention"] is not None - - def test_generation_with_additional_input(self): - args = self.model.encoder.args - task = test_utils.TestTranslationTask.setup_task( - args, self.tgt_dict, self.tgt_dict - ) - add_input_model = test_utils.TestAdditionalInputModel.build_model(args, task) - generator = SequenceGenerator([add_input_model], self.tgt_dict, beam_size=2) - sample = self.sample.copy() - sample["net_input"]["fancy_other_input"] = sample["net_input"]["src_tokens"] - hypos = generator.forward(self.sample) - eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, eos]) - self.assertHypoScore(hypos[0][0], [0.9, 1.0]) - - -class TestDiverseBeamSearch(TestSequenceGeneratorBase): - def setUp(self): - # construct dummy dictionary - d = test_utils.dummy_dictionary(vocab_size=2) - self.assertEqual(d.pad(), 1) - self.assertEqual(d.eos(), 2) - self.assertEqual(d.unk(), 3) - self.eos = d.eos() - self.w1 = 4 - self.w2 = 5 - - # construct source data - self.src_tokens = torch.LongTensor( - [ - [self.w1, self.w2, self.eos], - [self.w1, self.w2, self.eos], - ] - ) - self.src_lengths = torch.LongTensor([2, 2]) - - args = argparse.Namespace() - unk = 0.0 - args.beam_probs = [ - # step 0: - torch.FloatTensor( - [ - # eos w1 w2 - # sentence 1: - [0.0, unk, 0.9, 0.1], # beam 1 - [0.0, unk, 0.9, 0.1], # beam 2 - # sentence 2: - [0.0, unk, 0.7, 0.3], - [0.0, unk, 0.7, 0.3], - ] - ), - # step 1: - torch.FloatTensor( - [ - # eos w1 w2 - # sentence 1: - [0.0, unk, 0.6, 0.4], - [0.0, unk, 0.6, 0.4], - # sentence 2: - [0.25, unk, 0.35, 0.4], - [0.25, unk, 0.35, 0.4], - ] - ), - # step 2: - torch.FloatTensor( - [ - # eos w1 w2 - # sentence 1: - [1.0, unk, 0.0, 0.0], - [1.0, unk, 0.0, 0.0], - # sentence 2: - [0.9, unk, 0.1, 0.0], - [0.9, unk, 0.1, 0.0], - ] - ), - ] - - task = test_utils.TestTranslationTask.setup_task(args, d, d) - self.model = task.build_model(args) - self.tgt_dict = task.target_dictionary - - def test_diverse_beam_search(self): - search_strategy = search.DiverseBeamSearch( - self.tgt_dict, num_groups=2, diversity_strength=0.0 - ) - generator = SequenceGenerator( - [self.model], - self.tgt_dict, - beam_size=2, - search_strategy=search_strategy, - ) - sample = { - "net_input": { - "src_tokens": self.src_tokens, - "src_lengths": self.src_lengths, - } - } - hypos = generator.forward(sample) - eos, w1, w2 = self.eos, self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, w1, eos]) - self.assertHypoScore(hypos[0][0], [0.9, 0.6, 1.0]) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w1, w1, eos]) - self.assertHypoScore(hypos[0][1], [0.9, 0.6, 1.0]) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) - self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.9]) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w1, w2, eos]) - self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.9]) - - -class TestDiverseSiblingsSearch(TestDiverseBeamSearch): - def assertHypoScore( - self, hypo, pos_probs, sibling_rank, diversity_rate, normalized=True, lenpen=1.0 - ): - pos_scores = torch.FloatTensor(pos_probs).log() - pos_scores.sub_(torch.Tensor(sibling_rank) * diversity_rate) - self.assertAlmostEqual(hypo["positional_scores"], pos_scores) - self.assertEqual(pos_scores.numel(), hypo["tokens"].numel()) - score = pos_scores.sum() - if normalized: - score /= pos_scores.numel() ** lenpen - 
self.assertLess(abs(score - hypo["score"]), 1e-6) - - def test_diverse_beam_search(self): - search_strategy = search.DiverseSiblingsSearch( - self.tgt_dict, diversity_rate=0.5 - ) - generator = SequenceGenerator( - [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy - ) - sample = { - "net_input": { - "src_tokens": self.src_tokens, - "src_lengths": self.src_lengths, - } - } - hypos = generator.forward(sample) - eos, w1, w2 = self.eos, self.w1, self.w2 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, w1, eos]) - self.assertHypoScore(hypos[0][0], [0.9, 0.6, 1.0], [0, 1, 1], 0.5) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w1, w2, eos]) - self.assertHypoScore(hypos[0][1], [0.9, 0.4, 1.0], [0, 2, 1], 0.5) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) - self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.9], [0, 1, 1], 0.5) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w1, w1, eos]) - self.assertHypoScore(hypos[1][1], [0.7, 0.35, 0.9], [0, 2, 1], 0.5) - - -class TestTopPSamplingSearch(TestSequenceGeneratorBase): - def setUp(self): - # construct dummy dictionary - d = test_utils.dummy_dictionary(vocab_size=2) - self.assertEqual(d.pad(), 1) - self.assertEqual(d.eos(), 2) - self.assertEqual(d.unk(), 3) - self.eos = d.eos() - self.w1 = 4 - self.w2 = 5 - - # construct source data - self.src_tokens = torch.LongTensor( - [ - [self.w1, self.w2, self.eos], - [self.w1, self.w2, self.eos], - ] - ) - self.src_lengths = torch.LongTensor([2, 2]) - - args = argparse.Namespace() - unk = 0.0 - # The minimal probability of top 2 tokens. - self.min_top2_prob = 0.75 - # The minimal probability of the top 1 token. - self.min_top1_prob = 0.4 - - w1_prob = self.min_top1_prob - w2_prob = self.min_top2_prob - self.min_top1_prob - eos_prob = 1 - self.min_top2_prob - - args.beam_probs = [ - # step 0: - torch.FloatTensor( - [ - # eos w1 w2 - [0.0, unk, 1.0, 0.0], - [0.0, unk, 1.0, 0.0], - [0.0, unk, 1.0, 0.0], - [0.0, unk, 1.0, 0.0], - ] - ), - # step 1: - torch.FloatTensor( - [ - # eos w1 w2 - [eos_prob, unk, w1_prob, w2_prob], - [eos_prob, unk, w1_prob, w2_prob], - [eos_prob, unk, w1_prob, w2_prob], - [eos_prob, unk, w1_prob, w2_prob], - ] - ), - # step 2: - torch.FloatTensor( - [ - # eos w1 w2 - [1.0, unk, 0.0, 0.0], - [1.0, unk, 0.0, 0.0], - [1.0, unk, 0.0, 0.0], - [1.0, unk, 0.0, 0.0], - ] - ), - ] - - task = test_utils.TestTranslationTask.setup_task(args, d, d) - self.model = task.build_model(args) - self.tgt_dict = task.target_dictionary - - def test_topp_sampling_search_low_prob(self): - # Given a prob low enough to top-P sampling, we expect only the top - # 1 token to be sampled, which always results in the same output. 
- low_sampling_topp = self.min_top1_prob / 2.0 - search_strategy = search.Sampling( - self.tgt_dict, sampling_topp=low_sampling_topp - ) - generator = SequenceGenerator( - [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy - ) - sample = { - "net_input": { - "src_tokens": self.src_tokens, - "src_lengths": self.src_lengths, - } - } - hypos = generator.forward(sample) - eos, w1 = self.eos, self.w1 - # sentence 1, beam 1 - self.assertHypoTokens(hypos[0][0], [w1, w1, eos]) - self.assertHypoScore(hypos[0][0], [1.0, 0.4, 1.0]) - # sentence 1, beam 2 - self.assertHypoTokens(hypos[0][1], [w1, w1, eos]) - self.assertHypoScore(hypos[0][1], [1.0, 0.4, 1.0]) - # sentence 2, beam 1 - self.assertHypoTokens(hypos[1][0], [w1, w1, eos]) - self.assertHypoScore(hypos[1][0], [1.0, 0.4, 1.0]) - # sentence 2, beam 2 - self.assertHypoTokens(hypos[1][1], [w1, w1, eos]) - self.assertHypoScore(hypos[1][1], [1.0, 0.4, 1.0]) - - def test_topp_sampling_search_high_prob(self): - # Given a prob high enough to top-P sampling, any of the top 2 - # tokens could be sampled. This can cause different outputs. - high_sampling_topp = (self.min_top1_prob + self.min_top2_prob) / 2.0 - search_strategy = search.Sampling( - self.tgt_dict, sampling_topp=high_sampling_topp - ) - generator = SequenceGenerator( - [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy - ) - sample = { - "net_input": { - "src_tokens": self.src_tokens, - "src_lengths": self.src_lengths, - } - } - hypos = generator.forward(sample) - eos, w1, w2 = self.eos, self.w1, self.w2 - # sentence 1, beam 1 - self.assertTrue( - self.hypoTokens(hypos[0][0], [w1, w1, eos]) - or self.hypoTokens(hypos[0][0], [w1, w2, eos]) - ) - self.assertTrue( - self.hypoScore(hypos[0][0], [1.0, 0.4, 1.0]) - or self.hypoScore(hypos[0][0], [1.0, 0.35, 1.0]) - ) - - # sentence 1, beam 2 - self.assertTrue( - self.hypoTokens(hypos[0][1], [w1, w1, eos]) - or self.hypoTokens(hypos[0][1], [w1, w2, eos]) - ) - self.assertTrue( - self.hypoScore(hypos[0][1], [1.0, 0.4, 1.0]) - or self.hypoScore(hypos[0][1], [1.0, 0.35, 1.0]) - ) - - # sentence 2, beam 1 - self.assertTrue( - self.hypoTokens(hypos[1][0], [w1, w1, eos]) - or self.hypoTokens(hypos[1][0], [w1, w2, eos]) - ) - self.assertTrue( - self.hypoScore(hypos[1][0], [1.0, 0.4, 1.0]) - or self.hypoScore(hypos[1][0], [1.0, 0.35, 1.0]) - ) - - # sentence 2, beam 2 - self.assertTrue( - self.hypoTokens(hypos[1][1], [w1, w1, eos]) - or self.hypoTokens(hypos[1][1], [w1, w2, eos]) - ) - self.assertTrue( - self.hypoScore(hypos[1][1], [1.0, 0.4, 1.0]) - or self.hypoScore(hypos[1][1], [1.0, 0.35, 1.0]) - ) - - def hypoTokens(self, hypo, tokens): - return self.tensorEqual(hypo["tokens"], torch.LongTensor(tokens)) - - def hypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): - pos_scores = torch.FloatTensor(pos_probs).log() - if not self.almostEqual(hypo["positional_scores"], pos_scores): - return False - if pos_scores.numel() != hypo["tokens"].numel(): - return False - score = pos_scores.sum() - if normalized: - score /= pos_scores.numel() ** lenpen - return abs(score - hypo["score"]) < 1e-6 - - def almostEqual(self, t1, t2): - return t1.size() == t2.size() and (t1 - t2).abs().max() < 1e-4 - - def tensorEqual(self, t1, t2): - return t1.size() == t2.size() and t1.ne(t2).long().sum() == 0 - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sequence_scorer.py 
b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sequence_scorer.py deleted file mode 100644 index 42f9447b599bcd7a9913aec37d94ea5078ff43a3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sequence_scorer.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import unittest - -import tests.utils as test_utils -import torch -from fairseq.sequence_scorer import SequenceScorer - - -class TestSequenceScorer(unittest.TestCase): - def test_sequence_scorer(self): - # construct dummy dictionary - d = test_utils.dummy_dictionary(vocab_size=2) - self.assertEqual(d.pad(), 1) - self.assertEqual(d.eos(), 2) - self.assertEqual(d.unk(), 3) - eos = d.eos() - w1 = 4 - w2 = 5 - - # construct dataloader - data = [ - { - "source": torch.LongTensor([w1, w2, eos]), - "target": torch.LongTensor([w1, w2, w1, eos]), - }, - { - "source": torch.LongTensor([w2, eos]), - "target": torch.LongTensor([w2, w1, eos]), - }, - { - "source": torch.LongTensor([w2, eos]), - "target": torch.LongTensor([w2, eos]), - }, - ] - data_itr = test_utils.dummy_dataloader(data) - - # specify expected output probabilities - args = argparse.Namespace() - unk = 0.0 - args.beam_probs = [ - # step 0: - torch.FloatTensor( - [ - # eos w1 w2 - [0.0, unk, 0.6, 0.4], # sentence 1 - [0.0, unk, 0.4, 0.6], # sentence 2 - [0.0, unk, 0.7, 0.3], # sentence 3 - ] - ), - # step 1: - torch.FloatTensor( - [ - # eos w1 w2 - [0.0, unk, 0.2, 0.7], # sentence 1 - [0.0, unk, 0.8, 0.2], # sentence 2 - [0.7, unk, 0.1, 0.2], # sentence 3 - ] - ), - # step 2: - torch.FloatTensor( - [ - # eos w1 w2 - [0.10, unk, 0.50, 0.4], # sentence 1 - [0.15, unk, 0.15, 0.7], # sentence 2 - [0.00, unk, 0.00, 0.0], # sentence 3 - ] - ), - # step 3: - torch.FloatTensor( - [ - # eos w1 w2 - [0.9, unk, 0.05, 0.05], # sentence 1 - [0.0, unk, 0.00, 0.0], # sentence 2 - [0.0, unk, 0.00, 0.0], # sentence 3 - ] - ), - ] - expected_scores = [ - [0.6, 0.7, 0.5, 0.9], # sentence 1 - [0.6, 0.8, 0.15], # sentence 2 - [0.3, 0.7], # sentence 3 - ] - - task = test_utils.TestTranslationTask.setup_task(args, d, d) - model = task.build_model(args) - scorer = SequenceScorer(task.target_dictionary) - for sample in data_itr: - hypos = task.inference_step(scorer, [model], sample) - for id, hypos_id in zip(sample["id"].tolist(), hypos): - self.assertHypoTokens(hypos_id[0], data[id]["target"]) - self.assertHypoScore(hypos_id[0], expected_scores[id]) - - def assertHypoTokens(self, hypo, tokens): - self.assertTensorEqual(hypo["tokens"], torch.LongTensor(tokens)) - - def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): - pos_scores = torch.FloatTensor(pos_probs).log() - self.assertAlmostEqual(hypo["positional_scores"], pos_scores) - self.assertEqual(pos_scores.numel(), hypo["tokens"].numel()) - score = pos_scores.sum() - if normalized: - score /= pos_scores.numel() ** lenpen - self.assertLess(abs(score - hypo["score"]), 1e-6) - - def assertAlmostEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertLess((t1 - t2).abs().max(), 1e-4) - - def assertTensorEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertEqual(t1.ne(t2).long().sum(), 0) - - -if __name__ == "__main__": - unittest.main() diff --git 
a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sparse_multihead_attention.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sparse_multihead_attention.py deleted file mode 100644 index 3e32b25a7fb1e12295b84d0c65064f8e42b7bdd3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_sparse_multihead_attention.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import unittest - -import torch -from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention - - -class TestSparseMultiheadAttention(unittest.TestCase): - def test_sparse_multihead_attention(self): - attn_weights = torch.randn(1, 8, 8) - bidirectional_sparse_mask = torch.tensor( - [ - [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0], - [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0], - [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0], - [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0], - [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], - [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], - [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], - [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], - ] - ) - - bidirectional_attention = SparseMultiheadAttention( - 16, 1, stride=4, expressivity=1, is_bidirectional=True - ) - bidirectional_attention_sparse_mask = ( - bidirectional_attention.buffered_sparse_mask(attn_weights, 8, 8) - ) - torch.all( - torch.eq(bidirectional_attention_sparse_mask, bidirectional_sparse_mask) - ) - - sparse_mask = torch.tensor( - [ - [ - 0, - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - ], - [ - 0, - 0, - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - ], - [ - 0, - 0, - 0, - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - ], - [ - 0, - 0, - 0, - 0, - float("-inf"), - float("-inf"), - float("-inf"), - float("-inf"), - ], - [0, 0, 0, 0, 0, float("-inf"), float("-inf"), float("-inf")], - [ - float("-inf"), - float("-inf"), - float("-inf"), - 0, - 0, - 0, - float("-inf"), - float("-inf"), - ], - [ - float("-inf"), - float("-inf"), - float("-inf"), - 0, - 0, - 0, - 0, - float("-inf"), - ], - [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], - ] - ) - - attention = SparseMultiheadAttention( - 16, 1, stride=4, expressivity=1, is_bidirectional=False - ) - attention_sparse_mask = attention.buffered_sparse_mask(attn_weights, 8, 8) - - torch.all(torch.eq(attention_sparse_mask, sparse_mask)) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_token_block_dataset.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_token_block_dataset.py deleted file mode 100644 index ea315b4e67a6176feb3e35c468ca1179b4e0e3c4..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_token_block_dataset.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import unittest - -import tests.utils as test_utils -import torch -from fairseq.data import TokenBlockDataset - - -class TestTokenBlockDataset(unittest.TestCase): - def _build_dataset(self, data, **kwargs): - sizes = [len(x) for x in data] - underlying_ds = test_utils.TestDataset(data) - return TokenBlockDataset(underlying_ds, sizes, **kwargs) - - def test_eos_break_mode(self): - data = [ - torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), - torch.tensor([1], dtype=torch.long), - torch.tensor([8, 7, 6, 1], dtype=torch.long), - ] - ds = self._build_dataset(data, block_size=None, pad=0, eos=1, break_mode="eos") - self.assertEqual(ds[0].tolist(), [5, 4, 3, 2, 1]) - self.assertEqual(ds[1].tolist(), [1]) - self.assertEqual(ds[2].tolist(), [8, 7, 6, 1]) - - data = [ - torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), - torch.tensor([8, 7, 6, 1], dtype=torch.long), - torch.tensor([1], dtype=torch.long), - ] - ds = self._build_dataset(data, block_size=None, pad=0, eos=1, break_mode="eos") - self.assertEqual(ds[0].tolist(), [5, 4, 3, 2, 1]) - self.assertEqual(ds[1].tolist(), [8, 7, 6, 1]) - self.assertEqual(ds[2].tolist(), [1]) - - def test_block_break_mode(self): - data = [ - torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), - torch.tensor([8, 7, 6, 1], dtype=torch.long), - torch.tensor([9, 1], dtype=torch.long), - ] - ds = self._build_dataset(data, block_size=3, pad=0, eos=1, break_mode="none") - self.assertEqual(ds[0].tolist(), [5, 4, 3]) - self.assertEqual(ds[1].tolist(), [2, 1, 8]) - self.assertEqual(ds[2].tolist(), [7, 6, 1]) - self.assertEqual(ds[3].tolist(), [9, 1]) - - def test_complete_break_mode(self): - data = [ - torch.tensor([5, 4, 3, 2, 1], dtype=torch.long), - torch.tensor([8, 7, 6, 1], dtype=torch.long), - torch.tensor([9, 1], dtype=torch.long), - ] - ds = self._build_dataset( - data, block_size=6, pad=0, eos=1, break_mode="complete" - ) - self.assertEqual(ds[0].tolist(), [5, 4, 3, 2, 1]) - self.assertEqual(ds[1].tolist(), [8, 7, 6, 1, 9, 1]) - - data = [ - torch.tensor([4, 3, 2, 1], dtype=torch.long), - torch.tensor([5, 1], dtype=torch.long), - torch.tensor([1], dtype=torch.long), - torch.tensor([6, 1], dtype=torch.long), - ] - ds = self._build_dataset( - data, block_size=3, pad=0, eos=1, break_mode="complete" - ) - self.assertEqual(ds[0].tolist(), [4, 3, 2, 1]) - self.assertEqual(ds[1].tolist(), [5, 1, 1]) - self.assertEqual(ds[2].tolist(), [6, 1]) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_train.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_train.py deleted file mode 100644 index 1b7e027c0cd0cb1dd92bce6375588019c1e4daa3..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_train.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import contextlib -import logging -import unittest -from io import StringIO -from unittest.mock import MagicMock, patch - -import torch -from fairseq import checkpoint_utils, data - - -def mock_trainer(epoch, num_updates, iterations_in_epoch): - trainer = MagicMock() - trainer.load_checkpoint.return_value = { - "train_iterator": { - "epoch": epoch, - "iterations_in_epoch": iterations_in_epoch, - "shuffle": False, - }, - } - trainer.get_num_updates.return_value = num_updates - return trainer - - -def mock_dict(): - d = MagicMock() - d.pad.return_value = 1 - d.eos.return_value = 2 - d.unk.return_value = 3 - return d - - -def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch): - tokens = torch.LongTensor(list(range(epoch_size))).view(1, -1) - tokens_ds = data.TokenBlockDataset( - tokens, - sizes=[tokens.size(-1)], - block_size=1, - pad=0, - eos=1, - include_targets=False, - ) - trainer = mock_trainer(epoch, num_updates, iterations_in_epoch) - dataset = data.LanguagePairDataset( - tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False - ) - epoch_itr = data.EpochBatchIterator( - dataset=dataset, - collate_fn=dataset.collater, - batch_sampler=[[i] for i in range(epoch_size)], - ) - return trainer, epoch_itr - - -def get_mock_args(finetune_from_model=None): - args_mock = MagicMock() - args_mock.optimizer_overrides = "{}" - args_mock.reset_dataloader = False - args_mock.reset_meters = False - args_mock.reset_optimizer = False - args_mock.reset_lr_scheduler = False - args_mock.finetune_from_model = finetune_from_model - args_mock.model_parallel_size = 1 - return args_mock - - -class TestLoadCheckpoint(unittest.TestCase): - def setUp(self): - self.args_mock = get_mock_args() - self.patches = { - "os.makedirs": MagicMock(), - "os.path.join": MagicMock(), - "os.path.isfile": MagicMock(return_value=True), - "os.path.isabs": MagicMock(return_value=False), - "fairseq.file_io.PathManager.exists": MagicMock(return_value=False), - } - self.applied_patches = [patch(p, d) for p, d in self.patches.items()] - [p.start() for p in self.applied_patches] - logging.disable(logging.CRITICAL) - - def tearDown(self): - patch.stopall() - logging.disable(logging.NOTSET) - - def test_load_partial_checkpoint(self): - with contextlib.redirect_stdout(StringIO()): - trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 200, 50) - trainer.get_train_iterator = MagicMock(return_value=epoch_itr) - - _, epoch_itr = checkpoint_utils.load_checkpoint(self.args_mock, trainer) - - self.assertEqual(epoch_itr.epoch, 2) - self.assertEqual(epoch_itr.iterations_in_epoch, 50) - - itr = epoch_itr.next_epoch_itr(shuffle=False) - self.assertEqual(epoch_itr.epoch, 2) - self.assertEqual(epoch_itr.iterations_in_epoch, 50) - - self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 50) - self.assertEqual(epoch_itr.iterations_in_epoch, 51) - - for _ in range(150 - 52): - next(itr) - self.assertEqual(epoch_itr.iterations_in_epoch, 149) - self.assertTrue(itr.has_next()) - next(itr) - self.assertFalse(itr.has_next()) - - itr = epoch_itr.next_epoch_itr(shuffle=False) - self.assertTrue(itr.has_next()) - self.assertEqual(epoch_itr.epoch, 3) - self.assertEqual(epoch_itr.iterations_in_epoch, 0) - - def test_load_full_checkpoint(self): - with contextlib.redirect_stdout(StringIO()): - trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 300, 150) - trainer.get_train_iterator = MagicMock(return_value=epoch_itr) - - _, epoch_itr = checkpoint_utils.load_checkpoint(self.args_mock, trainer) - itr = 
epoch_itr.next_epoch_itr(shuffle=False) - - self.assertEqual(epoch_itr.epoch, 3) - self.assertEqual(epoch_itr.iterations_in_epoch, 0) - self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 0) - - def test_load_no_checkpoint(self): - with contextlib.redirect_stdout(StringIO()): - trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0) - trainer.get_train_iterator = MagicMock(return_value=epoch_itr) - self.patches["os.path.isfile"].return_value = False - - _, epoch_itr = checkpoint_utils.load_checkpoint(self.args_mock, trainer) - itr = epoch_itr.next_epoch_itr(shuffle=False) - - self.assertEqual(epoch_itr.epoch, 1) - self.assertEqual(epoch_itr.iterations_in_epoch, 0) - self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 0) - - def test_finetune_from_model_args_conflict(self): - with contextlib.redirect_stdout(StringIO()): - trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0) - trainer.get_train_iterator = MagicMock(return_value=epoch_itr) - - for arg in [ - "reset_optimizer", - "reset_lr_scheduler", - "reset_meters", - "reset_dataloader", - ]: - with self.subTest(arg=arg): - args_mock = get_mock_args("/temp/checkpoint_pretrained.pt") - setattr(args_mock, arg, True) - with self.assertRaises(Exception) as context: - _, _ = checkpoint_utils.load_checkpoint(args_mock, trainer) - - self.assertTrue( - "--finetune-from-model can not be set together with either --reset-optimizer" - " or reset_lr_scheduler or reset_meters or reset_dataloader" - in str(context.exception) - ) - - def test_finetune_from_model(self): - with contextlib.redirect_stdout(StringIO()): - trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0) - trainer.get_train_iterator = MagicMock(return_value=epoch_itr) - from_model_path = "/temp/checkpoint_pretrained.pt" - args_mock = get_mock_args(from_model_path) - args_mock.restore_file = "checkpoint_last.pt" - - def mock_finetune_exist(path): - if path == from_model_path: - return True - else: - return False - - self.patches[ - "fairseq.file_io.PathManager.exists" - ].side_effect = mock_finetune_exist - _, _ = checkpoint_utils.load_checkpoint(args_mock, trainer) - ( - checkpoint_path, - reset_optimizer, - reset_lr_scheduler, - optimizer_overrides, - ) = trainer.load_checkpoint.call_args[0] - reset_meters = trainer.load_checkpoint.call_args[1]["reset_meters"] - self.assertTrue(reset_optimizer) - self.assertTrue(reset_lr_scheduler) - self.assertTrue(reset_meters) - - def test_finetune_from_model_resume(self): - with contextlib.redirect_stdout(StringIO()): - trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0) - trainer.get_train_iterator = MagicMock(return_value=epoch_itr) - from_model_path = "/temp/checkpoint_pretrained.pt" - args_mock = get_mock_args(from_model_path) - args_mock.restore_file = "checkpoint_last.pt" - - # launch second time - # both restore_file=checkpoint_last.pt and finetune_from_model are set - def mock_finetune_exist(path): - if path == from_model_path or path.endsWith("checkpoint_last.pt"): - return True - else: - return False - - self.patches[ - "fairseq.file_io.PathManager.exists" - ].side_effect = mock_finetune_exist - _, _ = checkpoint_utils.load_checkpoint(args_mock, trainer) - ( - checkpoint_path, - reset_optimizer, - reset_lr_scheduler, - optimizer_overrides, - ) = trainer.load_checkpoint.call_args[0] - reset_meters = trainer.load_checkpoint.call_args[1]["reset_meters"] - self.assertFalse(reset_optimizer) - self.assertFalse(reset_lr_scheduler) - self.assertFalse(reset_meters) - - -if __name__ == "__main__": 
- unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_utils.py deleted file mode 100644 index 79195903e0f34372a24fa50312a6e00170c14471..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/test_utils.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import unittest - -import torch -from fairseq import utils - - -class TestUtils(unittest.TestCase): - def test_convert_padding_direction(self): - pad = 1 - left_pad = torch.LongTensor( - [ - [2, 3, 4, 5, 6], - [1, 7, 8, 9, 10], - [1, 1, 1, 11, 12], - ] - ) - right_pad = torch.LongTensor( - [ - [2, 3, 4, 5, 6], - [7, 8, 9, 10, 1], - [11, 12, 1, 1, 1], - ] - ) - - self.assertAlmostEqual( - right_pad, - utils.convert_padding_direction( - left_pad, - pad, - left_to_right=True, - ), - ) - self.assertAlmostEqual( - left_pad, - utils.convert_padding_direction( - right_pad, - pad, - right_to_left=True, - ), - ) - - def test_make_positions(self): - pad = 1 - left_pad_input = torch.LongTensor( - [ - [9, 9, 9, 9, 9], - [1, 9, 9, 9, 9], - [1, 1, 1, 9, 9], - ] - ) - left_pad_output = torch.LongTensor( - [ - [2, 3, 4, 5, 6], - [1, 2, 3, 4, 5], - [1, 1, 1, 2, 3], - ] - ) - right_pad_input = torch.LongTensor( - [ - [9, 9, 9, 9, 9], - [9, 9, 9, 9, 1], - [9, 9, 1, 1, 1], - ] - ) - right_pad_output = torch.LongTensor( - [ - [2, 3, 4, 5, 6], - [2, 3, 4, 5, 1], - [2, 3, 1, 1, 1], - ] - ) - - self.assertAlmostEqual( - left_pad_output, - utils.make_positions(left_pad_input, pad), - ) - self.assertAlmostEqual( - right_pad_output, - utils.make_positions(right_pad_input, pad), - ) - - def test_clip_grad_norm_(self): - params = torch.nn.Parameter(torch.zeros(5)).requires_grad_(False) - grad_norm = utils.clip_grad_norm_(params, 1.0) - self.assertTrue(torch.is_tensor(grad_norm)) - self.assertEqual(grad_norm, 0.0) - - params = [torch.nn.Parameter(torch.zeros(5)) for i in range(3)] - for p in params: - p.grad = torch.full((5,), fill_value=2.0) - grad_norm = utils.clip_grad_norm_(params, 1.0) - exp_grad_norm = torch.full((15,), fill_value=2.0).norm() - self.assertTrue(torch.is_tensor(grad_norm)) - self.assertEqual(grad_norm, exp_grad_norm) - - grad_norm = utils.clip_grad_norm_(params, 1.0) - self.assertAlmostEqual(grad_norm, torch.tensor(1.0)) - - def test_resolve_max_positions_with_tuple(self): - resolved = utils.resolve_max_positions(None, (2000, 100, 2000), 12000) - self.assertEqual(resolved, (2000, 100, 2000)) - - def assertAlmostEqual(self, t1, t2): - self.assertEqual(t1.size(), t2.size(), "size mismatch") - self.assertLess(utils.item((t1 - t2).abs().max()), 1e-4) - - -if __name__ == "__main__": - unittest.main() diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/utils.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/utils.py deleted file mode 100644 index 91feca6b2acae86bfeb327709c8746792356da9f..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/tests/utils.py +++ /dev/null @@ -1,608 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import os -import random -import sys -from io import StringIO - -import torch -import torch.nn.functional as F -from fairseq import options, utils -from fairseq.data import Dictionary -from fairseq.data.language_pair_dataset import collate -from fairseq.models import ( - FairseqEncoder, - FairseqEncoderDecoderModel, - FairseqIncrementalDecoder, -) -from fairseq.models.fairseq_encoder import EncoderOut -from fairseq.tasks import FairseqTask, LegacyFairseqTask -from fairseq_cli import generate, interactive, preprocess, train, validate - - -def dummy_dictionary(vocab_size, prefix="token_"): - d = Dictionary() - for i in range(vocab_size): - token = prefix + str(i) - d.add_symbol(token) - d.finalize(padding_factor=1) # don't add extra padding symbols - return d - - -def dummy_dataloader( - samples, - padding_idx=1, - eos_idx=2, - batch_size=None, -): - if batch_size is None: - batch_size = len(samples) - - # add any missing data to samples - for i, sample in enumerate(samples): - if "id" not in sample: - sample["id"] = i - - # create dataloader - dataset = TestDataset(samples) - dataloader = torch.utils.data.DataLoader( - dataset, - batch_size=batch_size, - collate_fn=(lambda samples: collate(samples, padding_idx, eos_idx)), - ) - return iter(dataloader) - - -def sequence_generator_setup(): - # construct dummy dictionary - d = dummy_dictionary(vocab_size=2) - - eos = d.eos() - w1 = 4 - w2 = 5 - - # construct source data - src_tokens = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]]) - src_lengths = torch.LongTensor([2, 2]) - - args = argparse.Namespace() - unk = 0.0 - args.beam_probs = [ - # step 0: - torch.FloatTensor( - [ - # eos w1 w2 - # sentence 1: - [0.0, unk, 0.9, 0.1], # beam 1 - [0.0, unk, 0.9, 0.1], # beam 2 - # sentence 2: - [0.0, unk, 0.7, 0.3], - [0.0, unk, 0.7, 0.3], - ] - ), - # step 1: - torch.FloatTensor( - [ - # eos w1 w2 prefix - # sentence 1: - [1.0, unk, 0.0, 0.0], # w1: 0.9 (emit: w1 : 0.9*1.0) - [0.0, unk, 0.9, 0.1], # w2: 0.1 - # sentence 2: - [0.25, unk, 0.35, 0.4], # w1: 0.7 (don't emit: w1 : 0.7*0.25) - [0.00, unk, 0.10, 0.9], # w2: 0.3 - ] - ), - # step 2: - torch.FloatTensor( - [ - # eos w1 w2 prefix - # sentence 1: - [0.0, unk, 0.1, 0.9], # w2 w1: 0.1*0.9 - [ - 0.6, - unk, - 0.2, - 0.2, - ], # w2 w2: 0.1*0.1 (emit: w2 w2 : 0.1*0.1*0.6) - # sentence 2: - [ - 0.60, - unk, - 0.4, - 0.00, - ], # w1 w2: 0.7*0.4 (emit: w1 w2 : 0.7*0.4*0.6) - [0.01, unk, 0.0, 0.99], # w2 w2: 0.3*0.9 - ] - ), - # step 3: - torch.FloatTensor( - [ - # eos w1 w2 prefix - # sentence 1: - [ - 1.0, - unk, - 0.0, - 0.0, - ], # w2 w1 w2: 0.1*0.9*0.9 (emit: w2 w1 w2 : 0.1*0.9*0.9*1.0) - [ - 1.0, - unk, - 0.0, - 0.0, - ], # w2 w1 w1: 0.1*0.9*0.1 (emit: w2 w1 w1 : 0.1*0.9*0.1*1.0) - # sentence 2: - [ - 0.1, - unk, - 0.5, - 0.4, - ], # w2 w2 w2: 0.3*0.9*0.99 (emit: w2 w2 w2 : 0.3*0.9*0.99*0.1) - [ - 1.0, - unk, - 0.0, - 0.0, - ], # w1 w2 w1: 0.7*0.4*0.4 (emit: w1 w2 w1 : 0.7*0.4*0.4*1.0) - ] - ), - ] - - task = TestTranslationTask.setup_task(args, d, d) - model = task.build_model(args) - tgt_dict = task.target_dictionary - - return tgt_dict, w1, w2, src_tokens, src_lengths, model - - -def create_dummy_data(data_dir, num_examples=100, maxlen=20, alignment=False): - def _create_dummy_data(filename): - data = torch.rand(num_examples * maxlen) - data = 97 + torch.floor(26 * data).int() - with open(os.path.join(data_dir, filename), "w") as h: - offset = 0 - 
for _ in range(num_examples): - ex_len = random.randint(1, maxlen) - ex_str = " ".join(map(chr, data[offset : offset + ex_len])) - print(ex_str, file=h) - offset += ex_len - - def _create_dummy_alignment_data(filename_src, filename_tgt, filename): - with open(os.path.join(data_dir, filename_src), "r") as src_f, open( - os.path.join(data_dir, filename_tgt), "r" - ) as tgt_f, open(os.path.join(data_dir, filename), "w") as h: - for src, tgt in zip(src_f, tgt_f): - src_len = len(src.split()) - tgt_len = len(tgt.split()) - avg_len = (src_len + tgt_len) // 2 - num_alignments = random.randint(avg_len // 2, 2 * avg_len) - src_indices = torch.floor(torch.rand(num_alignments) * src_len).int() - tgt_indices = torch.floor(torch.rand(num_alignments) * tgt_len).int() - ex_str = " ".join( - [ - "{}-{}".format(src, tgt) - for src, tgt in zip(src_indices, tgt_indices) - ] - ) - print(ex_str, file=h) - - _create_dummy_data("train.in") - _create_dummy_data("train.out") - _create_dummy_data("valid.in") - _create_dummy_data("valid.out") - _create_dummy_data("test.in") - _create_dummy_data("test.out") - - if alignment: - _create_dummy_alignment_data("train.in", "train.out", "train.align") - _create_dummy_alignment_data("valid.in", "valid.out", "valid.align") - _create_dummy_alignment_data("test.in", "test.out", "test.align") - - -def preprocess_lm_data(data_dir): - preprocess_parser = options.get_preprocessing_parser() - preprocess_args = preprocess_parser.parse_args( - [ - "--only-source", - "--trainpref", - os.path.join(data_dir, "train.out"), - "--validpref", - os.path.join(data_dir, "valid.out"), - "--testpref", - os.path.join(data_dir, "test.out"), - "--destdir", - data_dir, - ] - ) - preprocess.main(preprocess_args) - - -def preprocess_translation_data(data_dir, extra_flags=None): - preprocess_parser = options.get_preprocessing_parser() - preprocess_args = preprocess_parser.parse_args( - [ - "--source-lang", - "in", - "--target-lang", - "out", - "--trainpref", - os.path.join(data_dir, "train"), - "--validpref", - os.path.join(data_dir, "valid"), - "--testpref", - os.path.join(data_dir, "test"), - "--thresholdtgt", - "0", - "--thresholdsrc", - "0", - "--destdir", - data_dir, - ] - + (extra_flags or []), - ) - preprocess.main(preprocess_args) - - -def preprocess_summarization_data(data_dir, extra_flags=None): - preprocess_parser = options.get_preprocessing_parser() - preprocess_args = preprocess_parser.parse_args( - [ - "--source-lang", - "in", - "--target-lang", - "out", - "--trainpref", - os.path.join(data_dir, "train"), - "--validpref", - os.path.join(data_dir, "valid"), - "--testpref", - os.path.join(data_dir, "test"), - "--thresholdtgt", - "0", - "--thresholdsrc", - "0", - "--joined-dictionary", - "--destdir", - data_dir, - ] - + (extra_flags or []), - ) - preprocess.main(preprocess_args) - - -def train_translation_model( - data_dir, - arch, - extra_flags=None, - task="translation", - run_validation=False, - lang_flags=None, - extra_valid_flags=None, -): - if lang_flags is None: - lang_flags = [ - "--source-lang", - "in", - "--target-lang", - "out", - ] - train_parser = options.get_training_parser() - train_args = options.parse_args_and_arch( - train_parser, - [ - "--task", - task, - data_dir, - "--save-dir", - data_dir, - "--arch", - arch, - "--optimizer", - "nag", - "--lr", - "0.05", - "--max-tokens", - "500", - "--max-epoch", - "1", - "--no-progress-bar", - "--distributed-world-size", - "1", - "--num-workers", - "0", - ] - + lang_flags - + (extra_flags or []), - ) - train.main(train_args) - - if 
run_validation: - # test validation - validate_parser = options.get_validation_parser() - validate_args = options.parse_args_and_arch( - validate_parser, - [ - "--task", - task, - data_dir, - "--path", - os.path.join(data_dir, "checkpoint_last.pt"), - "--valid-subset", - "valid", - "--max-tokens", - "500", - "--no-progress-bar", - "--num-workers", - "0", - ] - + lang_flags - + (extra_valid_flags or []), - ) - validate.main(validate_args) - - -def generate_main(data_dir, extra_flags=None): - if extra_flags is None: - extra_flags = [ - "--print-alignment", - ] - generate_parser = options.get_generation_parser() - generate_args = options.parse_args_and_arch( - generate_parser, - [ - data_dir, - "--path", - os.path.join(data_dir, "checkpoint_last.pt"), - "--beam", - "3", - "--batch-size", - "64", - "--max-len-b", - "5", - "--gen-subset", - "valid", - "--no-progress-bar", - "--num-workers", - "0", - ] - + (extra_flags or []), - ) - - # evaluate model in batch mode - generate.main(generate_args) - - # evaluate model interactively - generate_args.buffer_size = 0 - generate_args.input = "-" - generate_args.batch_size = None - orig_stdin = sys.stdin - sys.stdin = StringIO("h e l l o\n") - interactive.main(generate_args) - sys.stdin = orig_stdin - - -class TestDataset(torch.utils.data.Dataset): - def __init__(self, data): - super().__init__() - self.data = data - self.sizes = None - - def __getitem__(self, index): - return self.data[index] - - def __len__(self): - return len(self.data) - - -class TestTranslationTask(LegacyFairseqTask): - def __init__(self, args, src_dict, tgt_dict, model): - super().__init__(args) - self.src_dict = src_dict - self.tgt_dict = tgt_dict - self.model = model - - @classmethod - def setup_task(cls, args, src_dict=None, tgt_dict=None, model=None): - return cls(args, src_dict, tgt_dict, model) - - def build_model(self, args): - return TestModel.build_model(args, self) - - @property - def source_dictionary(self): - return self.src_dict - - @property - def target_dictionary(self): - return self.tgt_dict - - -class TestModel(FairseqEncoderDecoderModel): - def __init__(self, encoder, decoder): - super().__init__(encoder, decoder) - - @classmethod - def build_model(cls, args, task): - encoder = TestEncoder(args, task.source_dictionary) - decoder = TestIncrementalDecoder(args, task.target_dictionary) - return cls(encoder, decoder) - - -class TestEncoder(FairseqEncoder): - def __init__(self, args, dictionary): - super().__init__(dictionary) - self.args = args - - def forward(self, src_tokens, src_lengths=None, **kwargs): - return EncoderOut( - encoder_out=src_tokens, - encoder_padding_mask=None, - encoder_embedding=None, - encoder_states=None, - src_tokens=None, - src_lengths=None, - ) - - def reorder_encoder_out(self, encoder_out, new_order): - return EncoderOut( - encoder_out=encoder_out.encoder_out.index_select(0, new_order), - encoder_padding_mask=None, - encoder_embedding=None, - encoder_states=None, - src_tokens=None, - src_lengths=None, - ) - - -class TestIncrementalDecoder(FairseqIncrementalDecoder): - def __init__(self, args, dictionary): - super().__init__(dictionary) - assert hasattr(args, "beam_probs") or hasattr(args, "probs") - args.max_decoder_positions = getattr(args, "max_decoder_positions", 100) - self.args = args - - def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): - if incremental_state is not None: - prev_output_tokens = prev_output_tokens[:, -1:] - bbsz = prev_output_tokens.size(0) - vocab = len(self.dictionary) - src_len = 
encoder_out.encoder_out.size(1) - tgt_len = prev_output_tokens.size(1) - - # determine number of steps - if incremental_state is not None: - # cache step number - step = utils.get_incremental_state(self, incremental_state, "step") - if step is None: - step = 0 - utils.set_incremental_state(self, incremental_state, "step", step + 1) - steps = [step] - else: - steps = list(range(tgt_len)) - - # define output in terms of raw probs - if hasattr(self.args, "probs"): - assert ( - self.args.probs.dim() == 3 - ), "expected probs to have size bsz*steps*vocab" - probs = self.args.probs.index_select(1, torch.LongTensor(steps)) - else: - probs = torch.FloatTensor(bbsz, len(steps), vocab).zero_() - for i, step in enumerate(steps): - # args.beam_probs gives the probability for every vocab element, - # starting with eos, then unknown, and then the rest of the vocab - if step < len(self.args.beam_probs): - probs[:, i, self.dictionary.eos() :] = self.args.beam_probs[step] - else: - probs[:, i, self.dictionary.eos()] = 1.0 - - # random attention - attn = torch.rand(bbsz, tgt_len, src_len) - - dev = prev_output_tokens.device - return probs.to(dev), {"attn": [attn.to(dev)]} - - def get_normalized_probs(self, net_output, log_probs, _): - # the decoder returns probabilities directly - probs = net_output[0] - if log_probs: - return probs.log() - else: - return probs - - def max_positions(self): - return self.args.max_decoder_positions - - -class TestReshapingEncoder(FairseqEncoder): - def __init__(self, args, dictionary): - super().__init__(dictionary) - self.args = args - - def forward(self, src_tokens, src_lengths=None, **kwargs): - b_sz, t_sz = src_tokens.shape - padding_needed = t_sz % 2 - x = src_tokens - if padding_needed > 0: - padding_needed = 2 - padding_needed - x = F.pad(x, (0, padding_needed)) - - return EncoderOut( - encoder_out=x.view(b_sz, -1, 2), - encoder_padding_mask=None, - encoder_embedding=None, - encoder_states=None, - src_tokens=None, - src_lengths=None, - ) - - def reorder_encoder_out(self, encoder_out, new_order): - return EncoderOut( - encoder_out=encoder_out.encoder_out.index_select(0, new_order), - encoder_padding_mask=None, - encoder_embedding=None, - encoder_states=None, - src_tokens=None, - src_lengths=None, - ) - - -class TestReshapingModel(FairseqEncoderDecoderModel): - def __init__(self, encoder, decoder): - super().__init__(encoder, decoder) - - @classmethod - def build_model(cls, args, task): - encoder = TestReshapingEncoder(args, task.source_dictionary) - decoder = TestIncrementalDecoder(args, task.target_dictionary) - return cls(encoder, decoder) - - -class TestAdditionalInputEncoder(FairseqEncoder): - def __init__(self, args, dictionary): - super().__init__(dictionary) - self.args = args - - def forward(self, src_tokens, src_lengths=None, **kwargs): - assert "fancy_other_input" in kwargs - assert kwargs["fancy_other_input"] is not None - return EncoderOut( - encoder_out=src_tokens, - encoder_padding_mask=None, - encoder_embedding=None, - encoder_states=None, - src_tokens=None, - src_lengths=None, - ) - - def reorder_encoder_out(self, encoder_out, new_order): - return EncoderOut( - encoder_out=encoder_out.encoder_out.index_select(0, new_order), - encoder_padding_mask=None, - encoder_embedding=None, - encoder_states=None, - src_tokens=None, - src_lengths=None, - ) - - -class TestAdditionalInputModel(FairseqEncoderDecoderModel): - def __init__(self, encoder, decoder): - super().__init__(encoder, decoder) - - @classmethod - def build_model(cls, args, task): - encoder = 
TestAdditionalInputEncoder(args, task.source_dictionary) - decoder = TestIncrementalDecoder(args, task.target_dictionary) - return cls(encoder, decoder) - - def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): - encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) - decoder_out = self.decoder( - prev_output_tokens, encoder_out=encoder_out, **kwargs - ) - return decoder_out diff --git a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/train.py b/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/train.py deleted file mode 100644 index 321de3d9b53f8194b58c26f5cb2c03281afc2bb1..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/nlp/mBART_ID2372_for_PyTorch/mBART_ID2372_for_PyTorch/train.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 -u -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -""" -Legacy entry point. Use fairseq_cli/train.py or fairseq-train instead. -""" - -from fairseq_cli.train import cli_main - - -if __name__ == "__main__": - cli_main()