From 4c028eec983ef1422d36c63a84bd91c67fa010fc Mon Sep 17 00:00:00 2001
From: dingli
Date: Mon, 21 Mar 2022 13:06:53 +0000
Subject: [PATCH] update TDNN files for Ascend710

---
 .../built-in/audio/TDNN_for_Pytorch/ReadMe.md | 90 +-
 .../audio/TDNN_for_Pytorch/acl_net.py | 521 +++++-----
 .../built-in/audio/TDNN_for_Pytorch/atc.sh | 18 +-
 .../audio/TDNN_for_Pytorch/hyperparams.yaml | 65 ++
 .../audio/TDNN_for_Pytorch/interfaces.py | 958 ------------------
 .../built-in/audio/TDNN_for_Pytorch/mo.py | 32 +
 .../audio/TDNN_for_Pytorch/modify.patch | 270 +++++
 .../audio/TDNN_for_Pytorch/modify_onnx.py | 51 +
 .../audio/TDNN_for_Pytorch/om_infer.sh | 6 +-
 .../audio/TDNN_for_Pytorch/requirements.txt | 8 +
 .../TDNN_for_Pytorch/tdnn_postprocess.py | 5 +-
 .../{pth2onnx.py => tdnn_pth2onnx.py} | 7 +-
 .../{pyacl_infer.py => tdnn_pyacl_infer.py} | 53 +-
 13 files changed, 827 insertions(+), 1257 deletions(-)
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/hyperparams.yaml
 delete mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/interfaces.py
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/mo.py
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify.patch
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify_onnx.py
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/requirements.txt
 rename ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/{pth2onnx.py => tdnn_pth2onnx.py} (88%)
 rename ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/{pyacl_infer.py => tdnn_pyacl_infer.py} (76%)

diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/ReadMe.md b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/ReadMe.md
index 27b8c44c13..4ea7fca8b3 100644
--- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/ReadMe.md
+++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/ReadMe.md
@@ -1,31 +1,59 @@
-File descriptions:
-
-1. atc.sh: model conversion script; generates the dynamic-gear (dynamic dims) model
-
-2. pth2onnx.py: converts the ckpt checkpoint file to an ONNX file
-
-3. acl_net.py: module that pyACL inference depends on
-
-4. interfaces.py: replaces the file of the same name under speechbrain/pretrained
-
-5. om_infer.sh: pyACL inference launch script
-
-6. pyacl_infer.py: pyACL inference code
-
-7. tdnn_postprocess.py: post-processing script
-
-8. tdnn_preprocess.py: pre-processing script
-
-
-
-End-to-end inference steps:
-
-(1) Clone the SpeechBrain source code, change line 349 of speechbrain/nnet/CNN.py to padding_mode='constant', obtain the trained weight folder best model from Ascend ModelZoo, then go into templates/speaker_id and run the pth2onnx.py script to generate the tdnn.onnx model
-
-(2) Prepare the dataset: comment out line 174 of speechbrain/templates/speaker_id/mini_librispeech_prepare.py, then run the pre-processing script, python3 tdnn_preprocess.py, to convert the dataset into binary files
-
-(3) Run the ATC script, bash atc.sh tdnn.onnx tdnn, to generate tdnn.om
-
-(4) Run the inference script, bash om_infer.sh; the inference results are written to the result directory
-
- (5) Run the post-processing script, python3 tdnn_postprocess.py, to obtain the model accuracy
\ No newline at end of file
+# TDNN PyTorch Offline Inference Guide
+
+## 1 Environment Preparation
+
+1. Obtain, modify, and install the open-source model code
+
+```shell
+git clone https://github.com/speechbrain/speechbrain.git
+cd speechbrain
+git checkout develop
+git reset --hard 51a2becdcf3a337578a9307a0b2fc3906bf20391
+export PYTHONPATH=`pwd`:$PYTHONPATH
+cd ..
+git clone https://gitee.com/Ronnie_zheng/MagicONNX.git
+cd MagicONNX && git checkout 8d62ae9dde478f35bece4b3d04eef573448411c9
+pip install .
+```
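+
+Optionally, as a quick sanity check (not a required step), you can confirm that both freshly installed packages resolve from the current environment; the import names below match the code used later in this package (speechbrain, magiconnx.OnnxGraph):
+
+```shell
+# Both commands should print without ImportError
+python3 -c "import speechbrain; print(speechbrain.__file__)"
+python3 -c "from magiconnx import OnnxGraph; print('MagicONNX OK')"
+```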
+Put the files from this source package into speechbrain/templates/speaker_id, then apply the patch:
+```shell
+cd speechbrain
+git apply --reject --whitespace=fix templates/speaker_id/modify.patch
+```
+
+2. Obtain the weight files
+
+https://www.hiascend.com/zh/software/modelzoo/detail/1/f4f4103245624c1a8637f8a5eadd950c
+Place the model weight folder best_model under speechbrain/templates/speaker_id, and put the hyperparams.yaml file into best_model.
+
+3. Obtain the dataset
+
+The dataset is downloaded automatically during preprocessing:
+```shell
+python3 tdnn_preprocess.py
+```
+
+## 2 Model Conversion
+```shell
+# Generate tdnn_bs64.onnx
+python3 tdnn_pth2onnx.py 64
+# Optimize the ONNX model
+python3 -m onnxsim tdnn_bs64.onnx tdnn_bs64s.onnx
+python3 modify_onnx.py tdnn_bs64s.onnx
+# Generate the OM model
+bash atc.sh tdnn_bs64s.onnx
+```
+
+## 3 Offline Inference
+
+```shell
+bash om_infer.sh 64
+python3 tdnn_postprocess.py
+```
+**Evaluation results:**
+
+Because TensorRT does not support the original model, only the performance of the modified model can be compared with the baseline.
+| Model | pth accuracy | 710 offline inference accuracy | Baseline performance | 710 performance |
+| :------: | :------: | :------: | :------: | :------: |
+| TDNN bs64 | 99.93% | 99.93% | - | 2467 fps |
+| TDNN (modified) bs64 | - | - | 2345.179 fps | 3815.886 fps |
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/acl_net.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/acl_net.py
index 8de5653478..d7f893b3f3 100644
--- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/acl_net.py
+++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/acl_net.py
@@ -1,245 +1,276 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import numpy as np -import acl -import functools - -# error code -ACL_ERROR_NONE = 0 - -# memory malloc code -ACL_MEM_MALLOC_HUGE_FIRST = 0 -ACL_MEM_MALLOC_HUGE_ONLY = 1 -ACL_MEM_MALLOC_NORMAL_ONLY = 2 - -# memory copy code -ACL_MEMCPY_HOST_TO_HOST = 0 -ACL_MEMCPY_HOST_TO_DEVICE = 1 -ACL_MEMCPY_DEVICE_TO_HOST = 2 -ACL_MEMCPY_DEVICE_TO_DEVICE = 3 - -ACL_DTYPE = { - 0: 'float32', - 1: 'float16', - 2: 'int8', - 3: 'int32', - 4: 'uint8', - 6: 'int16', - 7: 'uint16', - 8: 'uint32', - 9: 'int64', - 10: 'uint64', - 11: 'float64', - 12: 'bool', -} - -buffer_method = { - "in": acl.mdl.get_input_size_by_index, - "out": acl.mdl.get_output_size_by_index, - "outhost": acl.mdl.get_output_size_by_index -} - -def check_ret(message, ret): - if ret != ACL_ERROR_NONE: - raise Exception("{} failed ret = {}".format(message, ret)) - - -class Net(object): - def __init__(self, context, model_path, device_id=0, first=True, config_path=None): - self.device_id = device_id - self.model_path = model_path - self.model_id = None - self.context = context - - self.input_data = [] - self.output_data = [] - self.output_data_host = [] - self.model_desc = None - self.load_input_dataset = None - self.load_output_dataset = None - - self._init_resource(first, config_path) - - - def __call__(self, ori_data): - return self.forward(ori_data) - - - def __del__(self): - ret = acl.mdl.unload(self.model_id) - check_ret("acl.mdl.unload", ret) - if self.model_desc: - acl.mdl.destroy_desc(self.model_desc) - self.model_desc = None - - while self.input_data: - item = self.input_data.pop() - ret = acl.rt.free(item["buffer"]) - check_ret("acl.rt.free", ret) - - while self.output_data: - item = self.output_data.pop() - ret = acl.rt.free(item["buffer"]) - check_ret("acl.rt.free", ret) - - - def _init_resource(self, first=False, config_path=None): - # load_model - self.model_id, ret = acl.mdl.load_from_file(self.model_path) - check_ret("acl.mdl.load_from_file", ret) - - self.model_desc = acl.mdl.create_desc() - self._get_model_info() - - - def _get_model_info(self,): - ret = acl.mdl.get_desc(self.model_desc, self.model_id) - check_ret("acl.mdl.get_desc", ret) - input_size = acl.mdl.get_num_inputs(self.model_desc) - output_size = acl.mdl.get_num_outputs(self.model_desc) - self._gen_data_buffer(input_size, des="in") - self._gen_data_buffer(output_size, des="out") - self._gen_dataset_output_host(output_size, des="outhost") - - - def _gen_data_buffer(self, size, des): - func = buffer_method[des] - for i in range(size): - temp_buffer_size = func(self.model_desc, i) - temp_buffer, ret = acl.rt.malloc(temp_buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) - check_ret("acl.rt.malloc", ret) - - if des == "in": - self.input_data.append({"buffer": temp_buffer, - "size": temp_buffer_size}) - elif des == "out": - self.output_data.append({"buffer": temp_buffer, - "size": temp_buffer_size}) - - - def _gen_dataset_output_host(self, size, des): - func = buffer_method[des] - for i in range(size): - temp_buffer_size = func(self.model_desc, i) - temp_buffer, ret = acl.rt.malloc_host(temp_buffer_size) - check_ret("acl.rt.malloc_host", ret) - - self.output_data_host.append({"buffer": temp_buffer, - "size": temp_buffer_size}) - - - def _data_interaction(self, dataset, policy=ACL_MEMCPY_HOST_TO_DEVICE): - temp_data_buffer = self.input_data \ - if policy == ACL_MEMCPY_HOST_TO_DEVICE \ - else self.output_data - output_malloc_cost = 0 - idx = 0 - - if len(dataset) == 0 and policy == ACL_MEMCPY_DEVICE_TO_HOST: - dataset = self.output_data_host - - for i, item in 
enumerate(temp_data_buffer): - if policy == ACL_MEMCPY_HOST_TO_DEVICE: - ptr = acl.util.numpy_to_ptr(dataset[i]) - ret = acl.rt.memcpy(item["buffer"], item["size"], ptr, item["size"], policy) - check_ret("acl.rt.memcpy", ret) - - else: - ptr = dataset[i]["buffer"] - ret = acl.rt.memcpy(ptr, item["size"], item["buffer"], item["size"], policy) - check_ret("acl.rt.memcpy", ret) - - - def _gen_dataset(self, type_str="input"): - dataset = acl.mdl.create_dataset() - - temp_dataset = None - if type_str == "in": - self.load_input_dataset = dataset - temp_dataset = self.input_data - else: - self.load_output_dataset = dataset - temp_dataset = self.output_data - - for item in temp_dataset: - data = acl.create_data_buffer(item["buffer"], item["size"]) - if data is None: - ret = acl.destroy_data_buffer(dataset) - check_ret("acl.destroy_data_buffer", ret) - - _, ret = acl.mdl.add_dataset_buffer(dataset, data) - if ret != ACL_ERROR_NONE: - ret = acl.destroy_data_buffer(dataset) - check_ret("acl.destroy_data_buffer", ret) - - - def _data_from_host_to_device(self, images): - self._data_interaction(images, ACL_MEMCPY_HOST_TO_DEVICE) - self._gen_dataset("in") - self._gen_dataset("out") - - - def _data_from_device_to_host(self): - res = [] - self._data_interaction(res, ACL_MEMCPY_DEVICE_TO_HOST) - output = self.get_result(self.output_data_host) - return output - - - def _destroy_databuffer(self): - for dataset in [self.load_input_dataset, self.load_output_dataset]: - if not dataset: - continue - - num = acl.mdl.get_dataset_num_buffers(dataset) - for i in range(num): - data_buf = acl.mdl.get_dataset_buffer(dataset, i) - if data_buf: - ret = acl.destroy_data_buffer(data_buf) - check_ret("acl.destroy_data_buffer", ret) - ret = acl.mdl.destroy_dataset(dataset) - check_ret("acl.mdl.destroy_dataset", ret) - - def forward(self, input_data): - if not isinstance(input_data, (list, tuple)): - input_data = [input_data] - - self._data_from_host_to_device(input_data) - ret = acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset) - check_ret("acl.mdl.execute", ret) - - self._destroy_databuffer() - result = self._data_from_device_to_host() - return result - - - def get_result(self, output_data): - dataset = [] - for i in range(len(output_data)): - dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, i) - check_ret("acl.mdl.get_cur_output_dims", ret) - - data_shape = dims.get("dims") - data_type = acl.mdl.get_output_data_type(self.model_desc, i) - data_len = functools.reduce(lambda x, y: x * y, data_shape) - ftype = np.dtype(ACL_DTYPE.get(data_type)) - - size = output_data[i]["size"] - ptr = output_data[i]["buffer"] - data = acl.util.ptr_to_numpy(ptr, (size,), 1) - np_array = np.frombuffer(bytearray(data[:data_len * ftype.itemsize]), dtype=ftype, count=data_len) - np_array = np_array.reshape(data_shape) - dataset.append(np_array) - return dataset \ No newline at end of file +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
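+
+# Overview of the module below: AclModel wraps a pyACL inference session.
+# __init__ calls acl.init, binds the target device, creates a context and a
+# stream, loads the OM model from file, and pre-allocates device buffers for
+# every model input and output. forward() packs the host numpy inputs,
+# model_exe_with_dynamic_dims() selects the matching dynamic gear through the
+# 'ascend_mbatch_shape_data' input, and the outputs are copied back to host
+# and reshaped using the dtype/shape reported by the model description.
+# MeasureTime records execute latency in nanoseconds into `measurements`.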
+ +import acl +import functools +import numpy as np +import torch +import time + +# error code +ACL_ERROR_NONE = 0 + +# rule for memory copy +ACL_MEMCPY_HOST_TO_HOST = 0 +ACL_MEMCPY_HOST_TO_DEVICE = 1 +ACL_MEMCPY_DEVICE_TO_HOST = 2 +ACL_MEMCPY_DEVICE_TO_DEVICE = 3 + +# dtype +ACL_DTYPE = { + 0: 'float32', + 1: 'float16', + 2: 'int8', + 3: 'int32', + 4: 'uint8', + 6: 'int16', + 7: 'uint16', + 8: 'uint32', + 9: 'int64', + 10: 'uint64', + 11: 'float64', + 12: 'bool', +} + + +def check_ret(message, ret): + if ret != ACL_ERROR_NONE: + raise Exception(f"{message} failed ret={ret}") + + +class MeasureTime(): + def __init__(self, measurements, key, cpu_run=True): + self.measurements = measurements + self.key = key + self.cpu_run = cpu_run + + def __enter__(self): + if not self.cpu_run: + torch.cuda.synchronize() + self.t0 = time.perf_counter_ns() + + def __exit__(self, exc_type, exc_value, exc_traceback): + if not self.cpu_run: + torch.cuda.synchronize() + self.measurements[self.key] = time.perf_counter_ns() - self.t0 + + +class AclModel(object): + def __init__(self, device_id, model_path, sync_infer, measurements, key, cpu_run): + self.device_id = device_id + self.sync_infer = sync_infer + self.out_bufs_ptr = [] + self.output_sizes = [] + self.input_sizes = [] + self.input_bufs_ptr = [] + + self.measurements = measurements + self.key = key + self.cpu_run = cpu_run + + ret = acl.init() + check_ret("acl.init", ret) + ret = acl.rt.set_device(self.device_id) + check_ret("acl.rt.set_device", ret) + self.context, ret = acl.rt.create_context(self.device_id) + check_ret("acl.rt.create_context", ret) + self.model_id, ret = acl.mdl.load_from_file(model_path) + check_ret("acl.mdl.load_from_file", ret) + + self.model_desc = acl.mdl.create_desc() + assert self.model_desc is not None + acl.mdl.get_desc(self.model_desc, self.model_id) + self.dataset_in = acl.mdl.create_dataset() + assert self.dataset_in is not None + self.dataset_out = acl.mdl.create_dataset() + assert self.dataset_out is not None + self.in_size, self.out_size = 0, 0 + self.stm, ret = acl.rt.create_stream() + assert ret == 0 + + self.desc_init() + self.dataset_init() + + def __call__(self, ori_data, dim): + return self.forward(ori_data, dim) + + def __del__(self): + # unload model + if self.model_id: + ret = acl.mdl.unload(self.model_id) + assert ret == 0 + + # destroy model desc + ret = acl.mdl.destroy_desc(self.model_desc) + assert ret == 0 + + self.destroy_data_set(self.dataset_in) + self.destroy_data_set(self.dataset_out) + + # destroy input/output tensor + for i in range(len(self.input_bufs_ptr)): + acl.rt.free(self.input_bufs_ptr[i]["buffer"]) + self.input_bufs_ptr[i] = None + + for i in range(len(self.out_bufs_ptr)): + acl.rt.free(self.out_bufs_ptr[i]["buffer"]) + self.out_bufs_ptr[i] = None + + ret = acl.rt.destroy_stream(self.stm) + assert ret == 0 + + def desc_init(self): + tensor_size = acl.mdl.get_num_inputs(self.model_desc) + if not tensor_size: + raise Exception("get_num_inputs failed") + self.in_size = tensor_size + + for i in range(tensor_size): + size = acl.mdl.get_input_size_by_index(self.model_desc, i) + data, ret = acl.rt.malloc(size, 0) + assert ret == 0 + + self.input_bufs_ptr.append({'size': size, 'buffer': data}) + self.input_sizes.append(size) + + tensor_size = acl.mdl.get_num_outputs(self.model_desc) + self.out_size = tensor_size + for i in range(tensor_size): + dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, i) + assert ret == 0 + size = acl.mdl.get_output_size_by_index(self.model_desc, i) + + data, ret = 
acl.rt.malloc(size, 0) + assert ret == 0 + + self.output_sizes.append(size) + self.out_bufs_ptr.append({'size': size, 'buffer': data}) + + def dataset_init(self): + self.create_data_set(self.dataset_in, self.input_bufs_ptr, self.input_sizes) + self.create_data_set(self.dataset_out, self.out_bufs_ptr, self.output_sizes) + + def create_data_set(self, dataset, bufs_ptr_list, size_list): + # create dataset buffer then add to dataset + for i in range(len(size_list)): + buffer = acl.create_data_buffer(bufs_ptr_list[i]["buffer"], size_list[i]) + if not buffer: + self.destroy_data_set(dataset) + raise Exception("create_data_buffer failed") + + # add to dataset + _, ret = acl.mdl.add_dataset_buffer(dataset, buffer) + if ret != 0: + self.destroy_data_set(dataset) + raise Exception("add_dataset_buffer failed, ret = {}".format(ret)) + + return dataset + + def destroy_data_set(self, dataset): + data_buf_num = acl.mdl.get_dataset_num_buffers(dataset) + for i in range(data_buf_num): + # get data buffer by index + data_buf = acl.mdl.get_dataset_buffer(dataset, i) + if data_buf is not None: + acl.destroy_data_buffer(data_buf) + + acl.mdl.destroy_dataset(dataset) + + def copy_data_to_device(self, data): + for i in range(len(data)): + ptr, np = acl.util.numpy_contiguous_to_ptr(data[i]["buffer"]) + acl.rt.memcpy(self.input_bufs_ptr[i]["buffer"], data[i]["size"], ptr, + data[i]["size"], ACL_MEMCPY_HOST_TO_DEVICE) + + def copy_output_to_host(self): + output_data = [] + for i in range(len(self.out_bufs_ptr)): + temp = dict() + temp["size"] = self.out_bufs_ptr[i]["size"] + temp["buffer"], ret = acl.rt.malloc_host(temp["size"]) + output_data.append(temp) + acl.rt.memcpy(temp["buffer"], temp["size"], self.out_bufs_ptr[i]["buffer"], + temp["size"], ACL_MEMCPY_DEVICE_TO_HOST) + + return output_data + + def model_exe(self): + with MeasureTime(self.measurements, self.key, self.cpu_run): + ret = acl.mdl.execute(self.model_id, self.dataset_in, self.dataset_out) + assert ret == 0 + output_data = self.copy_output_to_host() + dataset = [] + for i in range(len(output_data)): + dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, i) + data_shape = dims.get("dims") + data_type = acl.mdl.get_output_data_type(self.model_desc, i) + data_len = functools.reduce(lambda x, y: x * y, data_shape) + ftype = np.dtype(ACL_DTYPE.get(data_type)) + + size = output_data[i]["size"] + ptr = output_data[i]["buffer"] + data = acl.util.ptr_to_numpy(ptr, (size,), 1) + np_arr = np.frombuffer(bytearray(data[:data_len * ftype.itemsize]), dtype=ftype, count=data_len) + np_arr = np_arr.reshape(data_shape) + dataset.append(np_arr) + return dataset + + def model_exe_async(self): + with MeasureTime(self.measurements, self.key, self.cpu_run): + ret = acl.mdl.execute_async(self.model_id, self.dataset_in, self.dataset_out, self.stm) + assert ret == 0 + ret = acl.rt.synchronize_stream(self.stm) + assert ret == 0 + output_data = self.copy_output_to_host() + + dataset = [] + for i in range(len(output_data)): + dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, i) + # check_ret("acl.mdl.get_cur_output_dims", ret) + data_shape = dims.get("dims") + + data_type = acl.mdl.get_output_data_type(self.model_desc, i) + data_len = functools.reduce(lambda x, y: x * y, data_shape) + ftype = np.dtype(ACL_DTYPE.get(data_type)) + + size = output_data[i]["size"] + ptr = output_data[i]["buffer"] + data = acl.util.ptr_to_numpy(ptr, (size,), 1) + np_arr = np.frombuffer(bytearray(data[:data_len * ftype.itemsize]), dtype=ftype, count=data_len) + np_arr = 
np_arr.reshape(data_shape) + dataset.append(np_arr) + return dataset + + def model_exe_with_dynamic_dims(self, input_data, dims): + index, ret = acl.mdl.get_input_index_by_name(self.model_desc, 'ascend_mbatch_shape_data') + ret = acl.mdl.set_input_dynamic_dims(self.model_id, self.dataset_in, index, dims) + gear_count, ret = acl.mdl.get_input_dynamic_gear_count(self.model_desc, -1) + dims_out, ret = acl.mdl.get_input_dynamic_dims(self.model_desc, -1, gear_count) + self.copy_data_to_device(input_data) + if self.sync_infer is True: + res = self.model_exe() + else: + res = self.model_exe_async() + + return res + + def forward(self, input_data, dims): + input_data_dic = [] + for i in range(len(input_data)): + temp = {} + temp["size"] = input_data[i].size * input_data[i].itemsize + temp["buffer"] = input_data[i] + input_data_dic.append(temp) + result = self.model_exe_with_dynamic_dims(input_data_dic, dims) + return result diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/atc.sh b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/atc.sh index a2e9250855..de48fc986b 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/atc.sh +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/atc.sh @@ -1,9 +1,13 @@ #!/bin/bash -export install_path=/usr/local/Ascend/ascend-toolkit/latest -export PATH=/usr/local/python3.7.5/bin:${install_path}/atc/ccec_compiler/bin:${install_path}/atc/bin:$PATH -export PYTHONPATH=${install_path}/atc/python/site-packages:$PYTHONPATH -export LD_LIBRARY_PATH=${install_path}/atc/lib64:${install_path}/acllib/lib64:$LD_LIBRARY_PATH -export ASCEND_OPP_PATH=${install_path}/opp -#export DUMP_GE_GRAPH=2 +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +model=$1 +bs=`echo ${model} | tr -cd "[0-9]" ` + +if [ `echo $model | grep "mod"` ] +then + atc --model=$model --framework=5 --input_format=ND --input_shape="feats:${bs},-1,23;random:${bs},1500" --dynamic_dims='200;300;400;500;600;700;800;900;1000;1100;1200;1300;1400;1500;1600;1700;1800' --output=./tdnn_bs${bs}_mods --soc_version=Ascend710 --log=error +else + atc --model=$model --framework=5 --input_format=ND --input_shape="feats:${bs},-1,23" --dynamic_dims='200;300;400;500;600;700;800;900;1000;1100;1200;1300;1400;1500;1600;1700;1800' --output=./tdnn_bs${bs}s --soc_version=Ascend710 --log=error +fi -atc --model=$1 --framework=5 --input_format=ND --input_shape="feats:1,-1,23" --dynamic_dims='200;300;400;500;600;700;800;900;1000;1100;1200;1300;1400;1500;1600;1700;1800' --output=$2 --soc_version=Ascend310 --log=info \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/hyperparams.yaml b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/hyperparams.yaml new file mode 100644 index 0000000000..fc899ca068 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/hyperparams.yaml @@ -0,0 +1,65 @@ +# ################################# +# Basic inference parameters for speaker-id. We have first a network that +# computes some embeddings. 
On the top of that, we employ a classifier.
+#
+# Author:
+#  * Mirco Ravanelli 2021
+# #################################
+
+# pretrain folders:
+pretrained_path: best_model
+
+
+# Model parameters
+n_mels: 23
+sample_rate: 16000
+n_classes: 28 # In this case, we have 28 speakers
+emb_dim: 512 # dimensionality of the embeddings
+
+# Feature extraction
+compute_features: !new:speechbrain.lobes.features.Fbank
+    n_mels: !ref <n_mels>
+
+# Mean and std normalization of the input features
+mean_var_norm: !new:speechbrain.processing.features.InputNormalization
+    norm_type: sentence
+    std_norm: False
+
+# To design a custom model, either just edit the simple CustomModel
+# class that's listed here, or replace this `!new` call with a line
+# pointing to a different file you've defined.
+embedding_model: !new:custom_model.Xvector
+    in_channels: !ref <n_mels>
+    activation: !name:torch.nn.LeakyReLU
+    tdnn_blocks: 5
+    tdnn_channels: [512, 512, 512, 512, 1500]
+    tdnn_kernel_sizes: [5, 3, 3, 1, 1]
+    tdnn_dilations: [1, 2, 3, 1, 1]
+    lin_neurons: !ref <emb_dim>
+
+classifier: !new:custom_model.Classifier
+    input_shape: [null, null, !ref <emb_dim>]
+    activation: !name:torch.nn.LeakyReLU
+    lin_blocks: 1
+    lin_neurons: !ref <emb_dim>
+    out_neurons: !ref <n_classes>
+
+label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+# Objects in "modules" dict will have their parameters moved to the correct
+# device, as well as having train()/eval() called on them by the Brain class.
+modules:
+    compute_features: !ref <compute_features>
+    embedding_model: !ref <embedding_model>
+    classifier: !ref <classifier>
+    mean_var_norm: !ref <mean_var_norm>
+
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        embedding_model: !ref <embedding_model>
+        classifier: !ref <classifier>
+        label_encoder: !ref <label_encoder>
+    paths:
+        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        classifier: !ref <pretrained_path>/classifier.ckpt
+        label_encoder: !ref <pretrained_path>/label_encoder.txt
diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/interfaces.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/interfaces.py
deleted file mode 100644
index ead6a0634d..0000000000
--- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/interfaces.py
+++ /dev/null
@@ -1,958 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines interfaces for simple inference with pretrained models
-
-Authors:
- * Aku Rouhe 2021
- * Peter Plantinga 2021
- * Loren Lugosch 2020
- * Mirco Ravanelli 2020
- * Titouan Parcollet 2021
-"""
-import torch
-import torchaudio
-from types import SimpleNamespace
-from torch.nn import SyncBatchNorm
-from torch.nn import DataParallel as DP
-from hyperpyyaml import load_hyperpyyaml
-from speechbrain.pretrained.fetching import fetch
-from speechbrain.dataio.preprocess import AudioNormalizer
-import torch.nn.functional as F
-from torch.nn.parallel import DistributedDataParallel as DDP
-from speechbrain.utils.data_utils import split_path
-from speechbrain.utils.distributed import run_on_main
-
-
-class Pretrained:
-    """Takes a trained model and makes predictions on new data.
-
-    This is a base class which handles some common boilerplate.
- It intentionally has an interface similar to ``Brain`` - these base - classes handle similar things. - - Subclasses of Pretrained should implement the actual logic of how - the pretrained system runs, and add methods with descriptive names - (e.g. transcribe_file() for ASR). - - Arguments - --------- - modules : dict of str:torch.nn.Module pairs - The Torch modules that make up the learned system. These can be treated - in special ways (put on the right device, frozen, etc.) - hparams : dict - Each key:value pair should consist of a string key and a hyperparameter - that is used within the overridden methods. These will - be accessible via an ``hparams`` attribute, using "dot" notation: - e.g., self.hparams.model(x). - run_opts : dict - Options parsed from command line. See ``speechbrain.parse_arguments()``. - List that are supported here: - * device - * data_parallel_count - * data_parallel_backend - * distributed_launch - * distributed_backend - * jit_module_keys - freeze_params : bool - To freeze (requires_grad=False) parameters or not. Normally in inference - you want to freeze the params. Also calls .eval() on all modules. - """ - - HPARAMS_NEEDED = [] - MODULES_NEEDED = [] - - def __init__( - self, modules=None, hparams=None, run_opts=None, freeze_params=True - ): - - # Arguments passed via the run opts dictionary. Set a limited - # number of these, since some don't apply to inference. - run_opt_defaults = { - "device": "cpu", - "data_parallel_count": -1, - "data_parallel_backend": False, - "distributed_launch": False, - "distributed_backend": "nccl", - "jit_module_keys": None, - } - for arg, default in run_opt_defaults.items(): - if run_opts is not None and arg in run_opts: - setattr(self, arg, run_opts[arg]) - else: - # If any arg from run_opt_defaults exist in hparams and - # not in command line args "run_opts" - if hparams is not None and arg in hparams: - setattr(self, arg, hparams[arg]) - else: - setattr(self, arg, default) - - # Put modules on the right device, accessible with dot notation - self.modules = torch.nn.ModuleDict(modules) - for mod in self.modules: - self.modules[mod].to(self.device) - - for mod in self.MODULES_NEEDED: - if mod not in modules: - raise ValueError(f"Need modules['{mod}']") - - # Check MODULES_NEEDED and HPARAMS_NEEDED and - # make hyperparams available with dot notation - if self.HPARAMS_NEEDED and hparams is None: - raise ValueError("Need to provide hparams dict.") - if hparams is not None: - # Also first check that all required params are found: - for hp in self.HPARAMS_NEEDED: - if hp not in hparams: - raise ValueError(f"Need hparams['{hp}']") - self.hparams = SimpleNamespace(**hparams) - - # Prepare modules for computation, e.g. jit - self._prepare_modules(freeze_params) - - # Audio normalization - self.audio_normalizer = hparams.get( - "audio_normalizer", AudioNormalizer() - ) - - def _prepare_modules(self, freeze_params): - """Prepare modules for computation, e.g. jit. - - Arguments - --------- - freeze_params : bool - Whether to freeze the parameters and call ``eval()``. - """ - - # Make jit-able - self._compile_jit() - self._wrap_distributed() - - # If we don't want to backprop, freeze the pretrained parameters - if freeze_params: - self.modules.eval() - for p in self.modules.parameters(): - p.requires_grad = False - - def load_audio(self, path, savedir="."): - """Load an audio file with this model"s input spec - - When using a speech model, it is important to use the same type of data, - as was used to train the model. 
This means for example using the same - sampling rate and number of channels. It is, however, possible to - convert a file from a higher sampling rate to a lower one (downsampling). - Similarly, it is simple to downmix a stereo file to mono. - The path can be a local path, a web url, or a link to a huggingface repo. - """ - source, fl = split_path(path) - path = fetch(fl, source=source, savedir=savedir) - signal, sr = torchaudio.load(path, channels_first=False) - return self.audio_normalizer(signal, sr) - - def _compile_jit(self): - """Compile requested modules with ``torch.jit.script``.""" - if self.jit_module_keys is None: - return - - for name in self.jit_module_keys: - if name not in self.modules: - raise ValueError( - "module " + name + " cannot be jit compiled because " - "it is not defined in your hparams file." - ) - module = torch.jit.script(self.modules[name]) - self.modules[name] = module.to(self.device) - - def _wrap_distributed(self): - """Wrap modules with distributed wrapper when requested.""" - if not self.distributed_launch and not self.data_parallel_backend: - return - elif self.distributed_launch: - for name, module in self.modules.items(): - if any(p.requires_grad for p in module.parameters()): - # for ddp, all module must run on same GPU - module = SyncBatchNorm.convert_sync_batchnorm(module) - module = DDP(module, device_ids=[self.device]) - self.modules[name] = module - else: - # data_parallel_backend - for name, module in self.modules.items(): - if any(p.requires_grad for p in module.parameters()): - # if distributed_count = -1 then use all gpus - # otherwise, specify the set of gpu to use - if self.data_parallel_count == -1: - module = DP(module) - else: - module = DP( - module, - [i for i in range(self.data_parallel_count)], - ) - self.modules[name] = module - - @classmethod - def from_hparams( - cls, - source, - hparams_file="hyperparams.yaml", - overrides={}, - savedir=None, - use_auth_token=False, - **kwargs, - ): - """Fetch and load based from outside source based on HyperPyYAML file - - The source can be a location on the filesystem or online/huggingface - - The hyperparams file should contain a "modules" key, which is a - dictionary of torch modules used for computation. - - The hyperparams file should contain a "pretrainer" key, which is a - speechbrain.utils.parameter_transfer.Pretrainer - - Arguments - --------- - source : str - The location to use for finding the model. See - ``speechbrain.pretrained.fetching.fetch`` for details. - hparams_file : str - The name of the hyperparameters file to use for constructing - the modules necessary for inference. Must contain two keys: - "modules" and "pretrainer", as described. - overrides : dict - Any changes to make to the hparams file when it is loaded. - savedir : str or Path - Where to put the pretraining material. If not given, will use - ./pretrained_models/-hash(source). - use_auth_token : bool (default: False) - If true Hugginface's auth_token will be used to load private models from the HuggingFace Hub, - default is False because majority of models are public. 
- """ - if savedir is None: - clsname = cls.__name__ - savedir = f"./pretrained_models/{clsname}-{hash(source)}" - hparams_local_path = fetch( - hparams_file, source, savedir, use_auth_token - ) - - # Load the modules: - with open(hparams_local_path) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Pretraining: - pretrainer = hparams["pretrainer"] - pretrainer.set_collect_in(savedir) - # For distributed setups, have this here: - run_on_main(pretrainer.collect_files, kwargs={"default_source": source}) - # Load on the CPU. Later the params can be moved elsewhere by specifying - # run_opts={"device": ...} - pretrainer.load_collected(device="cpu") - - # Now return the system - return cls(hparams["modules"], hparams, **kwargs) - - -class EndToEndSLU(Pretrained): - """A end-to-end SLU model. - - The class can be used either to run only the encoder (encode()) to extract - features or to run the entire model (decode()) to map the speech to its semantics. - - Example - ------- - >>> from speechbrain.pretrained import EndToEndSLU - >>> tmpdir = getfixture("tmpdir") - >>> slu_model = EndToEndSLU.from_hparams( - ... source="speechbrain/slu-timers-and-such-direct-librispeech-asr", - ... savedir=tmpdir, - ... ) - >>> slu_model.decode_file("samples/audio_samples/example6.wav") - "{'intent': 'SimpleMath', 'slots': {'number1': 37.67, 'number2': 75.7, 'op': ' minus '}}" - """ - - HPARAMS_NEEDED = ["tokenizer", "asr_model_source"] - MODULES_NEEDED = [ - "slu_enc", - "beam_searcher", - ] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.tokenizer = self.hparams.tokenizer - self.asr_model = EncoderDecoderASR.from_hparams( - source=self.hparams.asr_model_source, - run_opts={"device": self.device}, - ) - - def decode_file(self, path): - """Maps the given audio file to a string representing the - semantic dictionary for the utterance. - - Arguments - --------- - path : str - Path to audio file to decode. - - Returns - ------- - str - The predicted semantics. - """ - waveform = self.load_audio(path) - waveform = waveform.to(self.device) - # Fake a batch: - batch = waveform.unsqueeze(0) - rel_length = torch.tensor([1.0]) - predicted_words, predicted_tokens = self.decode_batch(batch, rel_length) - return predicted_words[0] - - def encode_batch(self, wavs, wav_lens): - """Encodes the input audio into a sequence of hidden states - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. - - Returns - ------- - torch.tensor - The encoded batch - """ - wavs = wavs.float() - wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) - with torch.no_grad(): - ASR_encoder_out = self.asr_model.encode_batch( - wavs.detach(), wav_lens - ) - encoder_out = self.modules.slu_enc(ASR_encoder_out) - return encoder_out - - def decode_batch(self, wavs, wav_lens): - """Maps the input audio to its semantics - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. 
- - Returns - ------- - list - Each waveform in the batch decoded. - tensor - Each predicted token id. - """ - with torch.no_grad(): - wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) - encoder_out = self.encode_batch(wavs, wav_lens) - predicted_tokens, scores = self.modules.beam_searcher( - encoder_out, wav_lens - ) - predicted_words = [ - self.tokenizer.decode_ids(token_seq) - for token_seq in predicted_tokens - ] - return predicted_words, predicted_tokens - - -class EncoderDecoderASR(Pretrained): - """A ready-to-use Encoder-Decoder ASR model - - The class can be used either to run only the encoder (encode()) to extract - features or to run the entire encoder-decoder model - (transcribe()) to transcribe speech. The given YAML must contains the fields - specified in the *_NEEDED[] lists. - - Example - ------- - >>> from speechbrain.pretrained import EncoderDecoderASR - >>> tmpdir = getfixture("tmpdir") - >>> asr_model = EncoderDecoderASR.from_hparams( - ... source="speechbrain/asr-crdnn-rnnlm-librispeech", - ... savedir=tmpdir, - ... ) - >>> asr_model.transcribe_file("samples/audio_samples/example2.flac") - "MY FATHER HAS REVEALED THE CULPRIT'S NAME" - """ - - HPARAMS_NEEDED = ["tokenizer"] - MODULES_NEEDED = [ - "encoder", - "decoder", - ] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.tokenizer = self.hparams.tokenizer - - def transcribe_file(self, path): - """Transcribes the given audiofile into a sequence of words. - - Arguments - --------- - path : str - Path to audio file which to transcribe. - - Returns - ------- - str - The audiofile transcription produced by this ASR system. - """ - waveform = self.load_audio(path) - # Fake a batch: - batch = waveform.unsqueeze(0) - rel_length = torch.tensor([1.0]) - predicted_words, predicted_tokens = self.transcribe_batch( - batch, rel_length - ) - return predicted_words[0] - - def encode_batch(self, wavs, wav_lens): - """Encodes the input audio into a sequence of hidden states - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. - - Returns - ------- - torch.tensor - The encoded batch - """ - wavs = wavs.float() - wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) - encoder_out = self.modules.encoder(wavs, wav_lens) - return encoder_out - - def transcribe_batch(self, wavs, wav_lens): - """Transcribes the input audio into a sequence of words - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. 
- - Returns - ------- - list - Each waveform in the batch transcribed. - tensor - Each predicted token id. - """ - with torch.no_grad(): - wav_lens = wav_lens.to(self.device) - encoder_out = self.encode_batch(wavs, wav_lens) - predicted_tokens, scores = self.modules.decoder( - encoder_out, wav_lens - ) - predicted_words = [ - self.tokenizer.decode_ids(token_seq) - for token_seq in predicted_tokens - ] - return predicted_words, predicted_tokens - - -class EncoderClassifier(Pretrained): - """A ready-to-use class for utterance-level classification (e.g, speaker-id, - language-id, emotion recognition, keyword spotting, etc). - - The class assumes that an encoder called "embedding_model" and a model - called "classifier" are defined in the yaml file. If you want to - convert the predicted index into a corresponding text label, please - provide the path of the label_encoder in a variable called 'lab_encoder_file' - within the yaml. - - The class can be used either to run only the encoder (encode_batch()) to - extract embeddings or to run a classification step (classify_batch()). - ``` - - Example - ------- - >>> import torchaudio - >>> from speechbrain.pretrained import EncoderClassifier - >>> # Model is downloaded from the speechbrain HuggingFace repo - >>> tmpdir = getfixture("tmpdir") - >>> classifier = EncoderClassifier.from_hparams( - ... source="speechbrain/spkrec-ecapa-voxceleb", - ... savedir=tmpdir, - ... ) - - >>> # Compute embeddings - >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav") - >>> embeddings = classifier.encode_batch(signal) - - >>> # Classification - >>> prediction = classifier .classify_batch(signal) - """ - - MODULES_NEEDED = [ - "compute_features", - "mean_var_norm", - "embedding_model", - "classifier", - ] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def extract_feats(self, wavs, wav_lens=None): - # wav to feats - wavs = wavs.to('cpu').float() - if wav_lens is None: - wav_lens = torch.ones(wavs.shape[0], device='cpu') - - feats = self.modules.compute_features(wavs) - feats = self.modules.mean_var_norm(feats, wav_lens) - - return feats - - def feats_classify(self, feats, wav_lens=None): - emb = self.modules.embedding_model(feats, wav_lens) - out_prob = self.modules.classifier(emb).squeeze(1) - - return out_prob - - def encode_batch(self, wavs, wav_lens=None, normalize=False): - """Encodes the input audio into a single vector embedding. - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = .normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. Make sure the sample rate is fs=16000 Hz. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. - normalize : bool - If True, it normalizes the embeddings with the statistics - contained in mean_var_norm_emb. 
- - Returns - ------- - torch.tensor - The encoded batch - """ - # Manage single waveforms in input - if len(wavs.shape) == 1: - wavs = wavs.unsqueeze(0) - - # Assign full length if wav_lens is not assigned - if wav_lens is None: - wav_lens = torch.ones(wavs.shape[0], device=self.device) - - # Storing waveform in the specified device - wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) - wavs = wavs.float() - - # Computing features and embeddings - feats = self.modules.compute_features(wavs) - feats = self.modules.mean_var_norm(feats, wav_lens) - embeddings = self.modules.embedding_model(feats, wav_lens) - if normalize: - embeddings = self.hparams.mean_var_norm_emb( - embeddings, torch.ones(embeddings.shape[0], device=self.device) - ) - return embeddings - - def classify_batch(self, wavs, wav_lens=None): - """Performs classification on the top of the encoded features. - - It returns the posterior probabilities, the index and, if the label - encoder is specified it also the text label. - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. Make sure the sample rate is fs=16000 Hz. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. - - Returns - ------- - out_prob - The log posterior probabilities of each class ([batch, N_class]) - score: - It is the value of the log-posterior for the best class ([batch,]) - index - The indexes of the best class ([batch,]) - text_lab: - List with the text labels corresponding to the indexes. - (label encoder should be provided). - """ - emb = self.encode_batch(wavs, wav_lens) - out_prob = self.modules.classifier(emb).squeeze(1) - score, index = torch.max(out_prob, dim=-1) - text_lab = self.hparams.label_encoder.decode_torch(index) - return out_prob, score, index, text_lab - - def classify_file(self, path): - """Classifies the given audiofile into the given set of labels. - - Arguments - --------- - path : str - Path to audio file to classify. - - Returns - ------- - out_prob - The log posterior probabilities of each class ([batch, N_class]) - score: - It is the value of the log-posterior for the best class ([batch,]) - index - The indexes of the best class ([batch,]) - text_lab: - List with the text labels corresponding to the indexes. - (label encoder should be provided). - """ - waveform = self.load_audio(path) - # Fake a batch: - batch = waveform.unsqueeze(0) - rel_length = torch.tensor([1.0]) - emb = self.encode_batch(batch, rel_length) - out_prob = self.modules.classifier(emb).squeeze(1) - score, index = torch.max(out_prob, dim=-1) - text_lab = self.hparams.label_encoder.decode_torch(index) - return out_prob, score, index, text_lab - - -class SpeakerRecognition(EncoderClassifier): - """A ready-to-use model for speaker recognition. It can be used to - perform speaker verification with verify_batch(). - - ``` - Example - ------- - >>> import torchaudio - >>> from speechbrain.pretrained import SpeakerRecognition - >>> # Model is downloaded from the speechbrain HuggingFace repo - >>> tmpdir = getfixture("tmpdir") - >>> verification = SpeakerRecognition.from_hparams( - ... source="speechbrain/spkrec-ecapa-voxceleb", - ... savedir=tmpdir, - ... 
) - - >>> # Perform verification - >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav") - >>> signal2, fs = torchaudio.load("samples/audio_samples/example2.flac") - >>> score, prediction = verification.verify_batch(signal, signal2) - """ - - MODULES_NEEDED = [ - "compute_features", - "mean_var_norm", - "embedding_model", - "mean_var_norm_emb", - ] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.similarity = torch.nn.CosineSimilarity(dim=-1, eps=1e-6) - - def verify_batch( - self, wavs1, wavs2, wav1_lens=None, wav2_lens=None, threshold=0.25 - ): - """Performs speaker verification with cosine distance. - - It returns the score and the decision (0 different speakers, - 1 same speakers). - - Arguments - --------- - wavs1 : Torch.Tensor - Tensor containing the speech waveform1 (batch, time). - Make sure the sample rate is fs=16000 Hz. - wavs2 : Torch.Tensor - Tensor containing the speech waveform2 (batch, time). - Make sure the sample rate is fs=16000 Hz. - wav1_lens: Torch.Tensor - Tensor containing the relative length for each sentence - in the length (e.g., [0.8 0.6 1.0]) - wav2_lens: Torch.Tensor - Tensor containing the relative length for each sentence - in the length (e.g., [0.8 0.6 1.0]) - threshold: Float - Threshold applied to the cosine distance to decide if the - speaker is different (0) or the same (1). - - Returns - ------- - score - The score associated to the binary verification output - (cosine distance). - prediction - The prediction is 1 if the two signals in input are from the same - speaker and 0 otherwise. - """ - emb1 = self.encode_batch(wavs1, wav1_lens, normalize=True) - emb2 = self.encode_batch(wavs2, wav2_lens, normalize=True) - score = self.similarity(emb1, emb2) - return score, score > threshold - - def verify_files(self, path_x, path_y): - """Speaker verification with cosine distance - - Returns the score and the decision (0 different speakers, - 1 same speakers). - - Returns - ------- - score - The score associated to the binary verification output - (cosine distance). - prediction - The prediction is 1 if the two signals in input are from the same - speaker and 0 otherwise. - """ - waveform_x = self.load_audio(path_x) - waveform_y = self.load_audio(path_y) - # Fake batches: - batch_x = waveform_x.unsqueeze(0) - batch_y = waveform_y.unsqueeze(0) - # Verify: - score, decision = self.verify_batch(batch_x, batch_y) - # Squeeze: - return score[0], decision[0] - - -class SepformerSeparation(Pretrained): - """A "ready-to-use" speech separation model. - - Uses Sepformer architecture. - - Example - ------- - >>> tmpdir = getfixture("tmpdir") - >>> model = SepformerSeparation.from_hparams( - ... source="speechbrain/sepformer-wsj02mix", - ... savedir=tmpdir) - >>> mix = torch.randn(1, 400) - >>> est_sources = model.separate_batch(mix) - >>> print(est_sources.shape) - torch.Size([1, 400, 2]) - """ - - MODULES_NEEDED = ["encoder", "masknet", "decoder"] - - def separate_batch(self, mix): - """Run source separation on batch of audio. - - Arguments - --------- - mix : torch.tensor - The mixture of sources. 
- - Returns - ------- - tensor - Separated sources - """ - - # Separation - mix = mix.to(self.device) - mix_w = self.modules.encoder(mix) - est_mask = self.modules.masknet(mix_w) - mix_w = torch.stack([mix_w] * self.hparams.num_spks) - sep_h = mix_w * est_mask - - # Decoding - est_source = torch.cat( - [ - self.modules.decoder(sep_h[i]).unsqueeze(-1) - for i in range(self.hparams.num_spks) - ], - dim=-1, - ) - - # T changed after conv1d in encoder, fix it here - T_origin = mix.size(1) - T_est = est_source.size(1) - if T_origin > T_est: - est_source = F.pad(est_source, (0, 0, 0, T_origin - T_est)) - else: - est_source = est_source[:, :T_origin, :] - return est_source - - def separate_file(self, path, savedir="."): - """Separate sources from file. - - Arguments - --------- - path : str - Path to file which has a mixture of sources. It can be a local - path, a web url, or a huggingface repo. - savedir : path - Path where to store the wav signals (when downloaded from the web). - Returns - ------- - tensor - Separated sources - """ - source, fl = split_path(path) - path = fetch(fl, source=source, savedir=savedir) - - batch, fs_file = torchaudio.load(path) - batch = batch.to(self.device) - fs_model = self.hparams.sample_rate - - # resample the data if needed - if fs_file != fs_model: - print( - "Resampling the audio from {} Hz to {} Hz".format( - fs_file, fs_model - ) - ) - tf = torchaudio.transforms.Resample( - orig_freq=fs_file, new_freq=fs_model - ) - batch = batch.mean(dim=0, keepdim=True) - batch = tf(batch) - - est_sources = self.separate_batch(batch) - est_sources = est_sources / est_sources.max(dim=1, keepdim=True)[0] - return est_sources - - -class SpectralMaskEnhancement(Pretrained): - """A ready-to-use model for speech enhancement. - - Arguments - --------- - See ``Pretrained``. - - Example - ------- - >>> import torchaudio - >>> from speechbrain.pretrained import SpectralMaskEnhancement - >>> # Model is downloaded from the speechbrain HuggingFace repo - >>> tmpdir = getfixture("tmpdir") - >>> enhancer = SpectralMaskEnhancement.from_hparams( - ... source="speechbrain/mtl-mimic-voicebank", - ... savedir=tmpdir, - ... ) - >>> noisy, fs = torchaudio.load("samples/audio_samples/example_noisy.wav") - >>> # Channel dimension is interpreted as batch dimension here - >>> enhanced = enhancer.enhance_batch(noisy) - """ - - HPARAMS_NEEDED = ["compute_stft", "spectral_magnitude", "resynth"] - MODULES_NEEDED = ["enhance_model"] - - def compute_features(self, wavs): - """Compute the log spectral magnitude features for masking. - - Arguments - --------- - wavs : torch.tensor - A batch of waveforms to convert to log spectral mags. - """ - feats = self.hparams.compute_stft(wavs) - feats = self.hparams.spectral_magnitude(feats) - return torch.log1p(feats) - - def enhance_batch(self, noisy, lengths=None): - """Enhance a batch of noisy waveforms. - - Arguments - --------- - noisy : torch.tensor - A batch of waveforms to perform enhancement on. - lengths : torch.tensor - The lengths of the waveforms if the enhancement model handles them. - - Returns - ------- - torch.tensor - A batch of enhanced waveforms of the same shape as input. - """ - noisy = noisy.to(self.device) - noisy_features = self.compute_features(noisy) - - # Perform masking-based enhancement, multiplying output with input. 
- if lengths is not None: - mask = self.modules.enhance_model(noisy_features, lengths=lengths) - else: - mask = self.modules.enhance_model(noisy_features) - enhanced = torch.mul(mask, noisy_features) - - # Return resynthesized waveforms - return self.hparams.resynth(torch.expm1(enhanced), noisy) - - def enhance_file(self, filename, output_filename=None): - """Enhance a wav file. - - Arguments - --------- - filename : str - Location on disk to load file for enhancement. - output_filename : str - If provided, writes enhanced data to this file. - """ - noisy = self.load_audio(filename) - noisy = noisy.to(self.device) - - # Fake a batch: - batch = noisy.unsqueeze(0) - enhanced = self.enhance_batch(batch) - - if output_filename is not None: - torchaudio.save(output_filename, enhanced, channels_first=False) - - return enhanced.squeeze(0) diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/mo.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/mo.py new file mode 100644 index 0000000000..cad5851dcb --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/mo.py @@ -0,0 +1,32 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +from magiconnx import OnnxGraph + +bs = sys.argv[1] +model_name = 'tdnn_bs%s'%bs +graph = OnnxGraph(model_name+'.onnx') +ph = graph.add_placeholder('random','float32',[64,1500]) + +rm = graph.get_nodes("ReduceMin")[0] +rm.inputs = ['random'] +sub = graph.get_nodes("Sub")[-1] +sub.inputs = ['random', rm.outputs[0]] + +rn = graph.get_nodes("RandomNormalLike")[0] +graph.del_node(rn.name, auto_connection=False) + +graph.save('%s_mod.onnx'%model_name) \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify.patch b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify.patch new file mode 100644 index 0000000000..d246c60425 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify.patch @@ -0,0 +1,270 @@ +diff --git a/speechbrain/nnet/CNN.py b/speechbrain/nnet/CNN.py +index 1745846..8750680 100644 +--- a/speechbrain/nnet/CNN.py ++++ b/speechbrain/nnet/CNN.py +@@ -346,7 +346,7 @@ class Conv1d(nn.Module): + padding="same", + groups=1, + bias=True, +- padding_mode="reflect", ++ padding_mode="constant", + skip_transpose=False, + ): + super().__init__() +diff --git a/speechbrain/pretrained/interfaces.py b/speechbrain/pretrained/interfaces.py +index e2521ec..ead6a06 100644 +--- a/speechbrain/pretrained/interfaces.py ++++ b/speechbrain/pretrained/interfaces.py +@@ -1,3 +1,17 @@ ++# Copyright 2021 Huawei Technologies Co., Ltd ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + """Defines interfaces for simple inference with pretrained models + + Authors: +@@ -85,7 +99,10 @@ class Pretrained: + setattr(self, arg, default) + + # Put modules on the right device, accessible with dot notation +- self.modules = torch.nn.ModuleDict(modules).to(self.device) ++ self.modules = torch.nn.ModuleDict(modules) ++ for mod in self.modules: ++ self.modules[mod].to(self.device) ++ + for mod in self.MODULES_NEEDED: + if mod not in modules: + raise ValueError(f"Need modules['{mod}']") +@@ -93,7 +110,7 @@ class Pretrained: + # Check MODULES_NEEDED and HPARAMS_NEEDED and + # make hyperparams available with dot notation + if self.HPARAMS_NEEDED and hparams is None: +- raise ValueError(f"Need to provide hparams dict.") ++ raise ValueError("Need to provide hparams dict.") + if hparams is not None: + # Also first check that all required params are found: + for hp in self.HPARAMS_NEEDED: +@@ -190,6 +207,7 @@ class Pretrained: + hparams_file="hyperparams.yaml", + overrides={}, + savedir=None, ++ use_auth_token=False, + **kwargs, + ): + """Fetch and load based from outside source based on HyperPyYAML file +@@ -215,12 +233,17 @@ class Pretrained: + Any changes to make to the hparams file when it is loaded. + savedir : str or Path + Where to put the pretraining material. If not given, will use +- ./pretrained_checkpoints/-hash(source). ++ ./pretrained_models/-hash(source). ++ use_auth_token : bool (default: False) ++ If true Hugginface's auth_token will be used to load private models from the HuggingFace Hub, ++ default is False because majority of models are public. + """ + if savedir is None: + clsname = cls.__name__ +- savedir = f"./pretrained_checkpoints/{clsname}-{hash(source)}" +- hparams_local_path = fetch(hparams_file, source, savedir) ++ savedir = f"./pretrained_models/{clsname}-{hash(source)}" ++ hparams_local_path = fetch( ++ hparams_file, source, savedir, use_auth_token ++ ) + + # Load the modules: + with open(hparams_local_path) as fin: +@@ -257,7 +280,7 @@ class EndToEndSLU(Pretrained): + "{'intent': 'SimpleMath', 'slots': {'number1': 37.67, 'number2': 75.7, 'op': ' minus '}}" + """ + +- HPARAMS_NEEDED = ["tokenizer", "asr_model"] ++ HPARAMS_NEEDED = ["tokenizer", "asr_model_source"] + MODULES_NEEDED = [ + "slu_enc", + "beam_searcher", +@@ -266,6 +289,10 @@ class EndToEndSLU(Pretrained): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tokenizer = self.hparams.tokenizer ++ self.asr_model = EncoderDecoderASR.from_hparams( ++ source=self.hparams.asr_model_source, ++ run_opts={"device": self.device}, ++ ) + + def decode_file(self, path): + """Maps the given audio file to a string representing the +@@ -282,6 +309,7 @@ class EndToEndSLU(Pretrained): + The predicted semantics. 
+ """ + waveform = self.load_audio(path) ++ waveform = waveform.to(self.device) + # Fake a batch: + batch = waveform.unsqueeze(0) + rel_length = torch.tensor([1.0]) +@@ -310,10 +338,10 @@ class EndToEndSLU(Pretrained): + wavs = wavs.float() + wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) + with torch.no_grad(): +- ASR_encoder_out = self.hparams.asr_model.encode_batch( ++ ASR_encoder_out = self.asr_model.encode_batch( + wavs.detach(), wav_lens + ) +- encoder_out = self.hparams.slu_enc(ASR_encoder_out) ++ encoder_out = self.modules.slu_enc(ASR_encoder_out) + return encoder_out + + def decode_batch(self, wavs, wav_lens): +@@ -338,7 +366,7 @@ class EndToEndSLU(Pretrained): + Each predicted token id. + """ + with torch.no_grad(): +- wav_lens = wav_lens.to(self.device) ++ wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) + encoder_out = self.encode_batch(wavs, wav_lens) + predicted_tokens, scores = self.modules.beam_searcher( + encoder_out, wav_lens +@@ -512,6 +540,23 @@ class EncoderClassifier(Pretrained): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) ++ ++ def extract_feats(self, wavs, wav_lens=None): ++ # wav to feats ++ wavs = wavs.to('cpu').float() ++ if wav_lens is None: ++ wav_lens = torch.ones(wavs.shape[0], device='cpu') ++ ++ feats = self.modules.compute_features(wavs) ++ feats = self.modules.mean_var_norm(feats, wav_lens) ++ ++ return feats ++ ++ def feats_classify(self, feats, wav_lens=None): ++ emb = self.modules.embedding_model(feats, wav_lens) ++ out_prob = self.modules.classifier(emb).squeeze(1) ++ ++ return out_prob + + def encode_batch(self, wavs, wav_lens=None, normalize=False): + """Encodes the input audio into a single vector embedding. +@@ -595,7 +640,36 @@ class EncoderClassifier(Pretrained): + out_prob = self.modules.classifier(emb).squeeze(1) + score, index = torch.max(out_prob, dim=-1) + text_lab = self.hparams.label_encoder.decode_torch(index) ++ return out_prob, score, index, text_lab + ++ def classify_file(self, path): ++ """Classifies the given audiofile into the given set of labels. ++ ++ Arguments ++ --------- ++ path : str ++ Path to audio file to classify. ++ ++ Returns ++ ------- ++ out_prob ++ The log posterior probabilities of each class ([batch, N_class]) ++ score: ++ It is the value of the log-posterior for the best class ([batch,]) ++ index ++ The indexes of the best class ([batch,]) ++ text_lab: ++ List with the text labels corresponding to the indexes. ++ (label encoder should be provided). 
++ """ ++ waveform = self.load_audio(path) ++ # Fake a batch: ++ batch = waveform.unsqueeze(0) ++ rel_length = torch.tensor([1.0]) ++ emb = self.encode_batch(batch, rel_length) ++ out_prob = self.modules.classifier(emb).squeeze(1) ++ score, index = torch.max(out_prob, dim=-1) ++ text_lab = self.hparams.label_encoder.decode_torch(index) + return out_prob, score, index, text_lab + + +@@ -732,6 +806,7 @@ class SepformerSeparation(Pretrained): + """ + + # Separation ++ mix = mix.to(self.device) + mix_w = self.modules.encoder(mix) + est_mask = self.modules.masknet(mix_w) + mix_w = torch.stack([mix_w] * self.hparams.num_spks) +@@ -774,6 +849,7 @@ class SepformerSeparation(Pretrained): + path = fetch(fl, source=source, savedir=savedir) + + batch, fs_file = torchaudio.load(path) ++ batch = batch.to(self.device) + fs_model = self.hparams.sample_rate + + # resample the data if needed +@@ -846,6 +922,7 @@ class SpectralMaskEnhancement(Pretrained): + torch.tensor + A batch of enhanced waveforms of the same shape as input. + """ ++ noisy = noisy.to(self.device) + noisy_features = self.compute_features(noisy) + + # Perform masking-based enhancement, multiplying output with input. +@@ -869,6 +946,7 @@ class SpectralMaskEnhancement(Pretrained): + If provided, writes enhanced data to this file. + """ + noisy = self.load_audio(filename) ++ noisy = noisy.to(self.device) + + # Fake a batch: + batch = noisy.unsqueeze(0) +diff --git a/templates/speaker_id/custom_model.py b/templates/speaker_id/custom_model.py +index 9a78a37..3a67eae 100644 +--- a/templates/speaker_id/custom_model.py ++++ b/templates/speaker_id/custom_model.py +@@ -76,9 +76,10 @@ class Xvector(torch.nn.Module): + out_channels=out_channels, + kernel_size=tdnn_kernel_sizes[block_index], + dilation=tdnn_dilations[block_index], ++ skip_transpose=True, + ), + activation(), +- BatchNorm1d(input_size=out_channels), ++ BatchNorm1d(input_size=out_channels,skip_transpose=True), + ] + ) + in_channels = tdnn_channels[block_index] +@@ -105,8 +106,12 @@ class Xvector(torch.nn.Module): + --------- + x : torch.Tensor + """ ++ x = x.transpose(1, -1) + + for layer in self.blocks: ++ if type(layer) == type(StatisticsPooling()): ++ x = x.transpose(1, -1) ++ + try: + x = layer(x, lengths=lens) + except TypeError: +diff --git a/templates/speaker_id/mini_librispeech_prepare.py b/templates/speaker_id/mini_librispeech_prepare.py +index c22add8..7a777df 100644 +--- a/templates/speaker_id/mini_librispeech_prepare.py ++++ b/templates/speaker_id/mini_librispeech_prepare.py +@@ -171,7 +171,7 @@ def split_sets(wav_list, split_ratio): + dictionary containing train, valid, and test splits. + """ + # Random shuffle of the list +- random.shuffle(wav_list) ++ # random.shuffle(wav_list) + tot_split = sum(split_ratio) + tot_snts = len(wav_list) + data_split = {} diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify_onnx.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify_onnx.py new file mode 100644 index 0000000000..ab8273817c --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify_onnx.py @@ -0,0 +1,51 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import numpy as np +import onnx + +from magiconnx import OnnxGraph + +model_name = sys.argv[1] +graph = OnnxGraph(model_name) + +axes = onnx.helper.make_attribute("axes", [0,1]) +rd_min = graph.get_nodes("ReduceMin")[0] +rd_min._node.attribute.append(axes) +rd_max = graph.get_nodes("ReduceMax")[0] +rd_max._node.attribute.append(axes) + +us = graph.add_node('Unsq_1', 'Unsqueeze', {'axes': [2]}) +graph.insert_node(graph.get_nodes("Conv")[0].name, us, mode='before') +sq = graph.add_node('Sq_291', 'Squeeze', {'axes': [2]}) +graph.insert_node(graph.get_nodes('BatchNormalization')[4].name, sq, mode='after') + +convs = graph.get_nodes("Conv") +for conv in convs: + print(conv.name) + dil = conv['dilations'][0] + ks = conv['kernel_shape'][0] + pds = conv['pads'][0] + stri = conv['strides'][0] + conv['dilations'] = [1, dil] + conv['kernel_shape'] = [1, ks] + conv['pads'] = [0, pds, 0, pds] + conv['strides'] = [1, stri] + conv_w = graph[conv.inputs[1]].value + conv_w = np.expand_dims(conv_w, axis=-2) + graph[conv.inputs[1]].value = conv_w + +graph.save(model_name) \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/om_infer.sh b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/om_infer.sh index 1271031bb4..55d9c9adf2 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/om_infer.sh +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/om_infer.sh @@ -1,6 +1,8 @@ -install_path=/usr/local/Ascend/ascend-toolkit/latest +install_path=/home/dl/ascend-toolkit/latest export PYTHONUNBUFFERD=1 export PYTHONPATH=${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH export LD_LIBRARY_PATH=${install_path}/acllib/lib64/:$LD_LIBRARY_PATH -python3.7 pyacl_infer.py --model_path=./tdnn_dynamic.om --device_id=4 --cpu_run=True --sync_infer=True --workspace=10 --input_info_file_path=mini_librispeech_test.input_info_file_path --input_dtypes=float32 --infer_res_save_path=./result --res_save_type=bin \ No newline at end of file +bs=$1 + +python3.7 tdnn_pyacl_infer.py --model_path=tdnn_bs${bs}s.om --batch_size=${bs} --device_id=0 --cpu_run=True --sync_infer=True --workspace=10 --input_info_file_path=mini_librispeech_test.info --input_dtypes=float32 --infer_res_save_path=./result --res_save_type=bin \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/requirements.txt new file mode 100644 index 0000000000..561b93c4e7 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/requirements.txt @@ -0,0 +1,8 @@ +onnx==1.10.2 +torch==1.10.0 +torchaudio==0.10.2 +tqdm==4.63.0 +HyperPyYAML==1.0.0 +huggingface-hub==0.4.0 + + diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_postprocess.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_postprocess.py index 057c419d0b..c046198734 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_postprocess.py +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_postprocess.py @@ -15,7 +15,7 @@ import os import re import argparse -import numpy as numpy +import numpy as np label = {0:'163', 1:'7367', 2:'332', 3:'1970', 
4:'4640', 5:'8629', 6:'6848', 7:'1088', 8:'460', 9:'6272', 10:'7312', 11:'2136', 12:'1867', 13:'669', 14:'3526', 15:'3664', 16:'3242', 17:'19', 18:'32', 19:'5789', 20:'118', 21:'226', 22:'7859', 23:'3947', 24:'1898', 25:'2416', 26:'1737', 27:'4680'} if __name__ == '__main__': @@ -42,9 +42,10 @@ if __name__ == '__main__': index = split[0] input_file = split[1] target = re.search('/(\d*)-', input_file).group()[1:-1] + output_file = opt.result_dir + '/' + index + '.0.bin' output = np.fromfile(output_file, np.float32) res = np.argmax(output) - print('Predicted:', lable[res], 'Target:', target) + print('Predicted:', label[res], 'Target:', target) total += 1 if label[res] != target: error += 1 diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pth2onnx.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pth2onnx.py similarity index 88% rename from ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pth2onnx.py rename to ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pth2onnx.py index ea69d2af94..2e830e2802 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pth2onnx.py +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pth2onnx.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys + import torch import torchaudio from speechbrain.pretrained import EncoderClassifier @@ -29,12 +31,13 @@ class Xvector(torch.nn.Module): return res model = Xvector(classifier) -feats = torch.randn([1, 1800, 23]) +batch_size=int(sys.argv[1]) +feats = torch.randn([batch_size, 1800, 23]) torch.onnx.export( model, feats, - 'tdnn.onnx', + 'tdnn_bs%d.onnx'%(batch_size), input_names=['feats'], output_names=['output'], export_params=True, diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pyacl_infer.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pyacl_infer.py similarity index 76% rename from ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pyacl_infer.py rename to ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pyacl_infer.py index 15ff286129..87967a5d26 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pyacl_infer.py +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pyacl_infer.py @@ -86,6 +86,7 @@ if __name__ == '__main__': # 参数解析 parser = argparse.ArgumentParser() parser.add_argument('--model_path', required=True) + parser.add_argument('--batch_size', required=True) parser.add_argument('--device_id', required=True, type=int) parser.add_argument('--cpu_run', required=True, choices=['True', 'False']) parser.add_argument('--sync_infer', required=True, choices=['True', 'False']) @@ -129,27 +130,59 @@ if __name__ == '__main__': total_infer_time = 0 total_infer_time_workspace = 0 total_infer_num = 0 - for key, values in tqdm(inputs_info.items()): + dataset = {} + dims_infos = {} + bs = int(opt.batch_size) + for key, values in inputs_info.items(): # 构造输入 inputs = [] dims = [] for idx, value in enumerate(values): x = np.fromfile(value['path'], dtype=input_dtypes[idx]).reshape(value['shape']) - inputs.append(x) - dims.extend(value['shape']) + inputs.append((key,x)) + dims.extend((bs, value['shape'][1], value['shape'][2])) dims_info = {'dimCount': len(dims), 'name': '', 'dims': dims} + # (1, 1500, 23) {'dimCount': 3, 'name': '', 'dims': [1, 1500, 23]} + length = inputs[0][1].shape[1] + dataset[length] = dataset.get(length,[]) + inputs + dims_infos[length] = dims_infos.get(length,dims_info) + + total_inputs = [] + total_keys = [] + for k in sorted(dataset.keys()): + total_len = len(dataset[k]) + batch_input = [] + 
batch_key = [] + for i, (key, ipt) in enumerate(dataset[k]): + batch_input.append(ipt) + batch_key.append(key) + if (i+1) % bs == 0: + total_inputs.append(batch_input) + total_keys.append(batch_key) + batch_input = [] + batch_key = [] + if batch_input != []: + total_inputs.append(batch_input) + total_keys.append(batch_key) + + for i, b_ipt in tqdm(enumerate(total_inputs)): + batch_input = np.squeeze(np.array(b_ipt), axis=1) + if batch_input.shape[0] < bs: + batch_input = np.pad(batch_input, [(0, bs-batch_input.shape[0]), (0, 0), (0, 0)], mode='constant') + # 推理得到输出 - output = om_model(inputs, dims_info) + # (bs, 28) + output = om_model([batch_input], dims_infos[batch_input.shape[1]]) + total_infer_num += 1 # 保存文件 - if opt.res_save_type == 'bin': - for idx, data in enumerate(output): - data.tofile(os.path.join(opt.infer_res_save_path, key + '.' + str(idx) + '.bin')) - else: - for idx, data in enumerate(output): - np.save(os.path.join(opt.infer_res_save_path, key + '.' + str(idx) + '.npy'), data) + for j, key in enumerate(total_keys[i]): + if opt.res_save_type == 'bin': + output[0][j].tofile(os.path.join(opt.infer_res_save_path, key + '.' + str(0) + '.bin')) + else: + np.save(os.path.join(opt.infer_res_save_path, key + '.' + str(0) + '.npy'), output[0][j]) # 计算时间 total_infer_time += measurements['per_infer_time_ns'] -- Gitee
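
For readers following the dynamic-batching change in tdnn_pyacl_infer.py above, the sketch below restates the idea in isolation: utterance features are bucketed by sequence length, emitted as fixed-size batches, and the final partial batch of each bucket is zero-padded so the om model always sees a full batch of shape (batch_size, seq_len, 23); only the rows belonging to real keys are later written to result files. This is a minimal standalone illustration, not code from the patch — the helper name `make_batches` and the fake keys/shapes in the usage example are assumptions.

```python
import numpy as np


def make_batches(inputs, batch_size):
    """Group (key, feature) pairs by sequence length and emit fixed-size batches.

    inputs: list of (key, np.ndarray of shape [1, seq_len, feat_dim]) pairs.
    Returns a list of (keys, batch) tuples where batch has shape
    [batch_size, seq_len, feat_dim]; the last batch of each length bucket
    is zero-padded up to batch_size.
    """
    # Bucket by sequence length so every batch has a single, consistent shape.
    buckets = {}
    for key, feat in inputs:
        buckets.setdefault(feat.shape[1], []).append((key, feat))

    batches = []
    for seq_len in sorted(buckets):
        items = buckets[seq_len]
        for start in range(0, len(items), batch_size):
            chunk = items[start:start + batch_size]
            keys = [k for k, _ in chunk]
            batch = np.concatenate([f for _, f in chunk], axis=0)
            if batch.shape[0] < batch_size:
                # Zero-pad the trailing partial batch; padded rows carry no key
                # and are simply ignored when results are saved per key.
                pad = batch_size - batch.shape[0]
                batch = np.pad(batch, [(0, pad), (0, 0), (0, 0)], mode='constant')
            batches.append((keys, batch))
    return batches


if __name__ == '__main__':
    # Three fake utterances; two share a length and land in the same bucket.
    fake = [('utt0', np.zeros((1, 1500, 23), np.float32)),
            ('utt1', np.zeros((1, 1500, 23), np.float32)),
            ('utt2', np.zeros((1, 1200, 23), np.float32))]
    for keys, batch in make_batches(fake, batch_size=2):
        print(keys, batch.shape)  # only the first len(keys) rows are real data
```

Padding the last batch, rather than exporting an extra small-batch model, keeps a single fixed-batch om file usable for every length bucket; since outputs are saved one file per original key, the padded rows are discarded automatically.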