From 4c028eec983ef1422d36c63a84bd91c67fa010fc Mon Sep 17 00:00:00 2001
From: dingli
Date: Mon, 21 Mar 2022 13:06:53 +0000
Subject: [PATCH] update TDNN files for Ascend710

---
 .../built-in/audio/TDNN_for_Pytorch/ReadMe.md | 90 +-
 .../audio/TDNN_for_Pytorch/acl_net.py | 521 +++++-----
 .../built-in/audio/TDNN_for_Pytorch/atc.sh | 18 +-
 .../audio/TDNN_for_Pytorch/hyperparams.yaml | 65 ++
 .../audio/TDNN_for_Pytorch/interfaces.py | 958 ------------------
 .../built-in/audio/TDNN_for_Pytorch/mo.py | 32 +
 .../audio/TDNN_for_Pytorch/modify.patch | 270 +++++
 .../audio/TDNN_for_Pytorch/modify_onnx.py | 51 +
 .../audio/TDNN_for_Pytorch/om_infer.sh | 6 +-
 .../audio/TDNN_for_Pytorch/requirements.txt | 8 +
 .../TDNN_for_Pytorch/tdnn_postprocess.py | 5 +-
 .../{pth2onnx.py => tdnn_pth2onnx.py} | 7 +-
 .../{pyacl_infer.py => tdnn_pyacl_infer.py} | 53 +-
 13 files changed, 827 insertions(+), 1257 deletions(-)
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/hyperparams.yaml
 delete mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/interfaces.py
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/mo.py
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify.patch
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify_onnx.py
 create mode 100644 ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/requirements.txt
 rename ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/{pth2onnx.py => tdnn_pth2onnx.py} (88%)
 rename ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/{pyacl_infer.py => tdnn_pyacl_infer.py} (76%)

diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/ReadMe.md b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/ReadMe.md
index 27b8c44c13..4ea7fca8b3 100644
--- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/ReadMe.md
+++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/ReadMe.md
@@ -1,31 +1,59 @@
-File descriptions:
-
-1. atc.sh: model conversion script; generates the dynamic-gear (dynamic dims) model
-
-2. pth2onnx.py: converts the ckpt checkpoint file to an ONNX file
-
-3. acl_net.py: module that pyACL inference depends on
-
-4. interfaces.py: replaces the file of the same name under speechbrain/pretrained
-
-5. om_infer.sh: pyACL inference launch script
-
-6. pyacl_infer.py: pyACL inference code
-
-7. tdnn_postprocess.py: post-processing script
-
-8. tdnn_preprocess.py: pre-processing script
-
-
-
-End-to-end inference steps:
-
-(1) Clone the SpeechBrain source code, change line 349 of speechbrain/nnet/CNN.py to padding_mode='constant', obtain the trained weight folder best model from Ascend ModelZoo, then go into templates/speaker_id and run the pth2onnx.py script to generate the tdnn.onnx model
-
-(2) Prepare the dataset: comment out line 174 of speechbrain/templates/speaker_id/mini_librispeech_prepare.py, then run the pre-processing script, python3 tdnn_preprocess.py, to convert the dataset into binary files
-
-(3) Run the ATC script, bash atc.sh tdnn.onnx tdnn, to generate tdnn.om
-
-(4) Run the inference script, bash om_infer.sh; the inference results are written to the result directory
-
- (5) Run the post-processing script, python3 tdnn_postprocess.py, to obtain the model accuracy
\ No newline at end of file
+# TDNN PyTorch Offline Inference Guide
+
+## 1 Environment Preparation
+
+1. Obtain, modify, and install the open-source model code
+
+```shell
+git clone https://github.com/speechbrain/speechbrain.git
+cd speechbrain
+git checkout develop
+git reset --hard 51a2becdcf3a337578a9307a0b2fc3906bf20391
+export PYTHONPATH=`pwd`:$PYTHONPATH
+cd ..
+git clone https://gitee.com/Ronnie_zheng/MagicONNX.git
+cd MagicONNX && git checkout 8d62ae9dde478f35bece4b3d04eef573448411c9
+pip install .
+```
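+
+Optionally, as a quick sanity check (not a required step), you can confirm that both freshly installed packages resolve from the current environment; the import names below match the code used later in this package (speechbrain, magiconnx.OnnxGraph):
+
+```shell
+# Both commands should print without ImportError
+python3 -c "import speechbrain; print(speechbrain.__file__)"
+python3 -c "from magiconnx import OnnxGraph; print('MagicONNX OK')"
+```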
+Put the files from this source package into speechbrain/templates/speaker_id, then apply the patch:
+```shell
+cd speechbrain
+git apply --reject --whitespace=fix templates/speaker_id/modify.patch
+```
+
+2. Obtain the weight files
+
+https://www.hiascend.com/zh/software/modelzoo/detail/1/f4f4103245624c1a8637f8a5eadd950c
+Place the model weight folder best_model under speechbrain/templates/speaker_id, and put the hyperparams.yaml file into best_model.
+
+3. Obtain the dataset
+
+The dataset is downloaded automatically during preprocessing:
+```shell
+python3 tdnn_preprocess.py
+```
+
+## 2 Model Conversion
+```shell
+# Generate tdnn_bs64.onnx
+python3 tdnn_pth2onnx.py 64
+# Optimize the ONNX model
+python3 -m onnxsim tdnn_bs64.onnx tdnn_bs64s.onnx
+python3 modify_onnx.py tdnn_bs64s.onnx
+# Generate the OM model
+bash atc.sh tdnn_bs64s.onnx
+```
+
+## 3 Offline Inference
+
+```shell
+bash om_infer.sh 64
+python3 tdnn_postprocess.py
+```
+**Evaluation results:**
+
+Because TensorRT does not support the original model, only the performance of the modified model can be compared with the baseline.
+| Model | pth accuracy | 710 offline inference accuracy | Baseline performance | 710 performance |
+| :------: | :------: | :------: | :------: | :------: |
+| TDNN bs64 | 99.93% | 99.93% | - | 2467 fps |
+| TDNN (modified) bs64 | - | - | 2345.179 fps | 3815.886 fps |
\ No newline at end of file
diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/acl_net.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/acl_net.py
index 8de5653478..d7f893b3f3 100644
--- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/acl_net.py
+++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/acl_net.py
@@ -1,245 +1,276 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -import numpy as np -import acl -import functools - -# error code -ACL_ERROR_NONE = 0 - -# memory malloc code -ACL_MEM_MALLOC_HUGE_FIRST = 0 -ACL_MEM_MALLOC_HUGE_ONLY = 1 -ACL_MEM_MALLOC_NORMAL_ONLY = 2 - -# memory copy code -ACL_MEMCPY_HOST_TO_HOST = 0 -ACL_MEMCPY_HOST_TO_DEVICE = 1 -ACL_MEMCPY_DEVICE_TO_HOST = 2 -ACL_MEMCPY_DEVICE_TO_DEVICE = 3 - -ACL_DTYPE = { - 0: 'float32', - 1: 'float16', - 2: 'int8', - 3: 'int32', - 4: 'uint8', - 6: 'int16', - 7: 'uint16', - 8: 'uint32', - 9: 'int64', - 10: 'uint64', - 11: 'float64', - 12: 'bool', -} - -buffer_method = { - "in": acl.mdl.get_input_size_by_index, - "out": acl.mdl.get_output_size_by_index, - "outhost": acl.mdl.get_output_size_by_index -} - -def check_ret(message, ret): - if ret != ACL_ERROR_NONE: - raise Exception("{} failed ret = {}".format(message, ret)) - - -class Net(object): - def __init__(self, context, model_path, device_id=0, first=True, config_path=None): - self.device_id = device_id - self.model_path = model_path - self.model_id = None - self.context = context - - self.input_data = [] - self.output_data = [] - self.output_data_host = [] - self.model_desc = None - self.load_input_dataset = None - self.load_output_dataset = None - - self._init_resource(first, config_path) - - - def __call__(self, ori_data): - return self.forward(ori_data) - - - def __del__(self): - ret = acl.mdl.unload(self.model_id) - check_ret("acl.mdl.unload", ret) - if self.model_desc: - acl.mdl.destroy_desc(self.model_desc) - self.model_desc = None - - while self.input_data: - item = self.input_data.pop() - ret = acl.rt.free(item["buffer"]) - check_ret("acl.rt.free", ret) - - while self.output_data: - item = self.output_data.pop() - ret = acl.rt.free(item["buffer"]) - check_ret("acl.rt.free", ret) - - - def _init_resource(self, first=False, config_path=None): - # load_model - self.model_id, ret = acl.mdl.load_from_file(self.model_path) - check_ret("acl.mdl.load_from_file", ret) - - self.model_desc = acl.mdl.create_desc() - self._get_model_info() - - - def _get_model_info(self,): - ret = acl.mdl.get_desc(self.model_desc, self.model_id) - check_ret("acl.mdl.get_desc", ret) - input_size = acl.mdl.get_num_inputs(self.model_desc) - output_size = acl.mdl.get_num_outputs(self.model_desc) - self._gen_data_buffer(input_size, des="in") - self._gen_data_buffer(output_size, des="out") - self._gen_dataset_output_host(output_size, des="outhost") - - - def _gen_data_buffer(self, size, des): - func = buffer_method[des] - for i in range(size): - temp_buffer_size = func(self.model_desc, i) - temp_buffer, ret = acl.rt.malloc(temp_buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) - check_ret("acl.rt.malloc", ret) - - if des == "in": - self.input_data.append({"buffer": temp_buffer, - "size": temp_buffer_size}) - elif des == "out": - self.output_data.append({"buffer": temp_buffer, - "size": temp_buffer_size}) - - - def _gen_dataset_output_host(self, size, des): - func = buffer_method[des] - for i in range(size): - temp_buffer_size = func(self.model_desc, i) - temp_buffer, ret = acl.rt.malloc_host(temp_buffer_size) - check_ret("acl.rt.malloc_host", ret) - - self.output_data_host.append({"buffer": temp_buffer, - "size": temp_buffer_size}) - - - def _data_interaction(self, dataset, policy=ACL_MEMCPY_HOST_TO_DEVICE): - temp_data_buffer = self.input_data \ - if policy == ACL_MEMCPY_HOST_TO_DEVICE \ - else self.output_data - output_malloc_cost = 0 - idx = 0 - - if len(dataset) == 0 and policy == ACL_MEMCPY_DEVICE_TO_HOST: - dataset = self.output_data_host - - for i, item in 
enumerate(temp_data_buffer): - if policy == ACL_MEMCPY_HOST_TO_DEVICE: - ptr = acl.util.numpy_to_ptr(dataset[i]) - ret = acl.rt.memcpy(item["buffer"], item["size"], ptr, item["size"], policy) - check_ret("acl.rt.memcpy", ret) - - else: - ptr = dataset[i]["buffer"] - ret = acl.rt.memcpy(ptr, item["size"], item["buffer"], item["size"], policy) - check_ret("acl.rt.memcpy", ret) - - - def _gen_dataset(self, type_str="input"): - dataset = acl.mdl.create_dataset() - - temp_dataset = None - if type_str == "in": - self.load_input_dataset = dataset - temp_dataset = self.input_data - else: - self.load_output_dataset = dataset - temp_dataset = self.output_data - - for item in temp_dataset: - data = acl.create_data_buffer(item["buffer"], item["size"]) - if data is None: - ret = acl.destroy_data_buffer(dataset) - check_ret("acl.destroy_data_buffer", ret) - - _, ret = acl.mdl.add_dataset_buffer(dataset, data) - if ret != ACL_ERROR_NONE: - ret = acl.destroy_data_buffer(dataset) - check_ret("acl.destroy_data_buffer", ret) - - - def _data_from_host_to_device(self, images): - self._data_interaction(images, ACL_MEMCPY_HOST_TO_DEVICE) - self._gen_dataset("in") - self._gen_dataset("out") - - - def _data_from_device_to_host(self): - res = [] - self._data_interaction(res, ACL_MEMCPY_DEVICE_TO_HOST) - output = self.get_result(self.output_data_host) - return output - - - def _destroy_databuffer(self): - for dataset in [self.load_input_dataset, self.load_output_dataset]: - if not dataset: - continue - - num = acl.mdl.get_dataset_num_buffers(dataset) - for i in range(num): - data_buf = acl.mdl.get_dataset_buffer(dataset, i) - if data_buf: - ret = acl.destroy_data_buffer(data_buf) - check_ret("acl.destroy_data_buffer", ret) - ret = acl.mdl.destroy_dataset(dataset) - check_ret("acl.mdl.destroy_dataset", ret) - - def forward(self, input_data): - if not isinstance(input_data, (list, tuple)): - input_data = [input_data] - - self._data_from_host_to_device(input_data) - ret = acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset) - check_ret("acl.mdl.execute", ret) - - self._destroy_databuffer() - result = self._data_from_device_to_host() - return result - - - def get_result(self, output_data): - dataset = [] - for i in range(len(output_data)): - dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, i) - check_ret("acl.mdl.get_cur_output_dims", ret) - - data_shape = dims.get("dims") - data_type = acl.mdl.get_output_data_type(self.model_desc, i) - data_len = functools.reduce(lambda x, y: x * y, data_shape) - ftype = np.dtype(ACL_DTYPE.get(data_type)) - - size = output_data[i]["size"] - ptr = output_data[i]["buffer"] - data = acl.util.ptr_to_numpy(ptr, (size,), 1) - np_array = np.frombuffer(bytearray(data[:data_len * ftype.itemsize]), dtype=ftype, count=data_len) - np_array = np_array.reshape(data_shape) - dataset.append(np_array) - return dataset \ No newline at end of file +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
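+
+# Overview of the module below: AclModel wraps a pyACL inference session.
+# __init__ calls acl.init, binds the target device, creates a context and a
+# stream, loads the OM model from file, and pre-allocates device buffers for
+# every model input and output. forward() packs the host numpy inputs,
+# model_exe_with_dynamic_dims() selects the matching dynamic gear through the
+# 'ascend_mbatch_shape_data' input, and the outputs are copied back to host
+# and reshaped using the dtype/shape reported by the model description.
+# MeasureTime records execute latency in nanoseconds into `measurements`.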
+ +import acl +import functools +import numpy as np +import torch +import time + +# error code +ACL_ERROR_NONE = 0 + +# rule for memory copy +ACL_MEMCPY_HOST_TO_HOST = 0 +ACL_MEMCPY_HOST_TO_DEVICE = 1 +ACL_MEMCPY_DEVICE_TO_HOST = 2 +ACL_MEMCPY_DEVICE_TO_DEVICE = 3 + +# dtype +ACL_DTYPE = { + 0: 'float32', + 1: 'float16', + 2: 'int8', + 3: 'int32', + 4: 'uint8', + 6: 'int16', + 7: 'uint16', + 8: 'uint32', + 9: 'int64', + 10: 'uint64', + 11: 'float64', + 12: 'bool', +} + + +def check_ret(message, ret): + if ret != ACL_ERROR_NONE: + raise Exception(f"{message} failed ret={ret}") + + +class MeasureTime(): + def __init__(self, measurements, key, cpu_run=True): + self.measurements = measurements + self.key = key + self.cpu_run = cpu_run + + def __enter__(self): + if not self.cpu_run: + torch.cuda.synchronize() + self.t0 = time.perf_counter_ns() + + def __exit__(self, exc_type, exc_value, exc_traceback): + if not self.cpu_run: + torch.cuda.synchronize() + self.measurements[self.key] = time.perf_counter_ns() - self.t0 + + +class AclModel(object): + def __init__(self, device_id, model_path, sync_infer, measurements, key, cpu_run): + self.device_id = device_id + self.sync_infer = sync_infer + self.out_bufs_ptr = [] + self.output_sizes = [] + self.input_sizes = [] + self.input_bufs_ptr = [] + + self.measurements = measurements + self.key = key + self.cpu_run = cpu_run + + ret = acl.init() + check_ret("acl.init", ret) + ret = acl.rt.set_device(self.device_id) + check_ret("acl.rt.set_device", ret) + self.context, ret = acl.rt.create_context(self.device_id) + check_ret("acl.rt.create_context", ret) + self.model_id, ret = acl.mdl.load_from_file(model_path) + check_ret("acl.mdl.load_from_file", ret) + + self.model_desc = acl.mdl.create_desc() + assert self.model_desc is not None + acl.mdl.get_desc(self.model_desc, self.model_id) + self.dataset_in = acl.mdl.create_dataset() + assert self.dataset_in is not None + self.dataset_out = acl.mdl.create_dataset() + assert self.dataset_out is not None + self.in_size, self.out_size = 0, 0 + self.stm, ret = acl.rt.create_stream() + assert ret == 0 + + self.desc_init() + self.dataset_init() + + def __call__(self, ori_data, dim): + return self.forward(ori_data, dim) + + def __del__(self): + # unload model + if self.model_id: + ret = acl.mdl.unload(self.model_id) + assert ret == 0 + + # destroy model desc + ret = acl.mdl.destroy_desc(self.model_desc) + assert ret == 0 + + self.destroy_data_set(self.dataset_in) + self.destroy_data_set(self.dataset_out) + + # destroy input/output tensor + for i in range(len(self.input_bufs_ptr)): + acl.rt.free(self.input_bufs_ptr[i]["buffer"]) + self.input_bufs_ptr[i] = None + + for i in range(len(self.out_bufs_ptr)): + acl.rt.free(self.out_bufs_ptr[i]["buffer"]) + self.out_bufs_ptr[i] = None + + ret = acl.rt.destroy_stream(self.stm) + assert ret == 0 + + def desc_init(self): + tensor_size = acl.mdl.get_num_inputs(self.model_desc) + if not tensor_size: + raise Exception("get_num_inputs failed") + self.in_size = tensor_size + + for i in range(tensor_size): + size = acl.mdl.get_input_size_by_index(self.model_desc, i) + data, ret = acl.rt.malloc(size, 0) + assert ret == 0 + + self.input_bufs_ptr.append({'size': size, 'buffer': data}) + self.input_sizes.append(size) + + tensor_size = acl.mdl.get_num_outputs(self.model_desc) + self.out_size = tensor_size + for i in range(tensor_size): + dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, i) + assert ret == 0 + size = acl.mdl.get_output_size_by_index(self.model_desc, i) + + data, ret = 
acl.rt.malloc(size, 0) + assert ret == 0 + + self.output_sizes.append(size) + self.out_bufs_ptr.append({'size': size, 'buffer': data}) + + def dataset_init(self): + self.create_data_set(self.dataset_in, self.input_bufs_ptr, self.input_sizes) + self.create_data_set(self.dataset_out, self.out_bufs_ptr, self.output_sizes) + + def create_data_set(self, dataset, bufs_ptr_list, size_list): + # create dataset buffer then add to dataset + for i in range(len(size_list)): + buffer = acl.create_data_buffer(bufs_ptr_list[i]["buffer"], size_list[i]) + if not buffer: + self.destroy_data_set(dataset) + raise Exception("create_data_buffer failed") + + # add to dataset + _, ret = acl.mdl.add_dataset_buffer(dataset, buffer) + if ret != 0: + self.destroy_data_set(dataset) + raise Exception("add_dataset_buffer failed, ret = {}".format(ret)) + + return dataset + + def destroy_data_set(self, dataset): + data_buf_num = acl.mdl.get_dataset_num_buffers(dataset) + for i in range(data_buf_num): + # get data buffer by index + data_buf = acl.mdl.get_dataset_buffer(dataset, i) + if data_buf is not None: + acl.destroy_data_buffer(data_buf) + + acl.mdl.destroy_dataset(dataset) + + def copy_data_to_device(self, data): + for i in range(len(data)): + ptr, np = acl.util.numpy_contiguous_to_ptr(data[i]["buffer"]) + acl.rt.memcpy(self.input_bufs_ptr[i]["buffer"], data[i]["size"], ptr, + data[i]["size"], ACL_MEMCPY_HOST_TO_DEVICE) + + def copy_output_to_host(self): + output_data = [] + for i in range(len(self.out_bufs_ptr)): + temp = dict() + temp["size"] = self.out_bufs_ptr[i]["size"] + temp["buffer"], ret = acl.rt.malloc_host(temp["size"]) + output_data.append(temp) + acl.rt.memcpy(temp["buffer"], temp["size"], self.out_bufs_ptr[i]["buffer"], + temp["size"], ACL_MEMCPY_DEVICE_TO_HOST) + + return output_data + + def model_exe(self): + with MeasureTime(self.measurements, self.key, self.cpu_run): + ret = acl.mdl.execute(self.model_id, self.dataset_in, self.dataset_out) + assert ret == 0 + output_data = self.copy_output_to_host() + dataset = [] + for i in range(len(output_data)): + dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, i) + data_shape = dims.get("dims") + data_type = acl.mdl.get_output_data_type(self.model_desc, i) + data_len = functools.reduce(lambda x, y: x * y, data_shape) + ftype = np.dtype(ACL_DTYPE.get(data_type)) + + size = output_data[i]["size"] + ptr = output_data[i]["buffer"] + data = acl.util.ptr_to_numpy(ptr, (size,), 1) + np_arr = np.frombuffer(bytearray(data[:data_len * ftype.itemsize]), dtype=ftype, count=data_len) + np_arr = np_arr.reshape(data_shape) + dataset.append(np_arr) + return dataset + + def model_exe_async(self): + with MeasureTime(self.measurements, self.key, self.cpu_run): + ret = acl.mdl.execute_async(self.model_id, self.dataset_in, self.dataset_out, self.stm) + assert ret == 0 + ret = acl.rt.synchronize_stream(self.stm) + assert ret == 0 + output_data = self.copy_output_to_host() + + dataset = [] + for i in range(len(output_data)): + dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, i) + # check_ret("acl.mdl.get_cur_output_dims", ret) + data_shape = dims.get("dims") + + data_type = acl.mdl.get_output_data_type(self.model_desc, i) + data_len = functools.reduce(lambda x, y: x * y, data_shape) + ftype = np.dtype(ACL_DTYPE.get(data_type)) + + size = output_data[i]["size"] + ptr = output_data[i]["buffer"] + data = acl.util.ptr_to_numpy(ptr, (size,), 1) + np_arr = np.frombuffer(bytearray(data[:data_len * ftype.itemsize]), dtype=ftype, count=data_len) + np_arr = 
np_arr.reshape(data_shape) + dataset.append(np_arr) + return dataset + + def model_exe_with_dynamic_dims(self, input_data, dims): + index, ret = acl.mdl.get_input_index_by_name(self.model_desc, 'ascend_mbatch_shape_data') + ret = acl.mdl.set_input_dynamic_dims(self.model_id, self.dataset_in, index, dims) + gear_count, ret = acl.mdl.get_input_dynamic_gear_count(self.model_desc, -1) + dims_out, ret = acl.mdl.get_input_dynamic_dims(self.model_desc, -1, gear_count) + self.copy_data_to_device(input_data) + if self.sync_infer is True: + res = self.model_exe() + else: + res = self.model_exe_async() + + return res + + def forward(self, input_data, dims): + input_data_dic = [] + for i in range(len(input_data)): + temp = {} + temp["size"] = input_data[i].size * input_data[i].itemsize + temp["buffer"] = input_data[i] + input_data_dic.append(temp) + result = self.model_exe_with_dynamic_dims(input_data_dic, dims) + return result diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/atc.sh b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/atc.sh index a2e9250855..de48fc986b 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/atc.sh +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/atc.sh @@ -1,9 +1,13 @@ #!/bin/bash -export install_path=/usr/local/Ascend/ascend-toolkit/latest -export PATH=/usr/local/python3.7.5/bin:${install_path}/atc/ccec_compiler/bin:${install_path}/atc/bin:$PATH -export PYTHONPATH=${install_path}/atc/python/site-packages:$PYTHONPATH -export LD_LIBRARY_PATH=${install_path}/atc/lib64:${install_path}/acllib/lib64:$LD_LIBRARY_PATH -export ASCEND_OPP_PATH=${install_path}/opp -#export DUMP_GE_GRAPH=2 +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +model=$1 +bs=`echo ${model} | tr -cd "[0-9]" ` + +if [ `echo $model | grep "mod"` ] +then + atc --model=$model --framework=5 --input_format=ND --input_shape="feats:${bs},-1,23;random:${bs},1500" --dynamic_dims='200;300;400;500;600;700;800;900;1000;1100;1200;1300;1400;1500;1600;1700;1800' --output=./tdnn_bs${bs}_mods --soc_version=Ascend710 --log=error +else + atc --model=$model --framework=5 --input_format=ND --input_shape="feats:${bs},-1,23" --dynamic_dims='200;300;400;500;600;700;800;900;1000;1100;1200;1300;1400;1500;1600;1700;1800' --output=./tdnn_bs${bs}s --soc_version=Ascend710 --log=error +fi -atc --model=$1 --framework=5 --input_format=ND --input_shape="feats:1,-1,23" --dynamic_dims='200;300;400;500;600;700;800;900;1000;1100;1200;1300;1400;1500;1600;1700;1800' --output=$2 --soc_version=Ascend310 --log=info \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/hyperparams.yaml b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/hyperparams.yaml new file mode 100644 index 0000000000..fc899ca068 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/hyperparams.yaml @@ -0,0 +1,65 @@ +# ################################# +# Basic inference parameters for speaker-id. We have first a network that +# computes some embeddings. 
On the top of that, we employ a classifier.
+#
+# Author:
+#  * Mirco Ravanelli 2021
+# #################################
+
+# pretrain folders:
+pretrained_path: best_model
+
+
+# Model parameters
+n_mels: 23
+sample_rate: 16000
+n_classes: 28 # In this case, we have 28 speakers
+emb_dim: 512 # dimensionality of the embeddings
+
+# Feature extraction
+compute_features: !new:speechbrain.lobes.features.Fbank
+    n_mels: !ref <n_mels>
+
+# Mean and std normalization of the input features
+mean_var_norm: !new:speechbrain.processing.features.InputNormalization
+    norm_type: sentence
+    std_norm: False
+
+# To design a custom model, either just edit the simple CustomModel
+# class that's listed here, or replace this `!new` call with a line
+# pointing to a different file you've defined.
+embedding_model: !new:custom_model.Xvector
+    in_channels: !ref <n_mels>
+    activation: !name:torch.nn.LeakyReLU
+    tdnn_blocks: 5
+    tdnn_channels: [512, 512, 512, 512, 1500]
+    tdnn_kernel_sizes: [5, 3, 3, 1, 1]
+    tdnn_dilations: [1, 2, 3, 1, 1]
+    lin_neurons: !ref <emb_dim>
+
+classifier: !new:custom_model.Classifier
+    input_shape: [null, null, !ref <emb_dim>]
+    activation: !name:torch.nn.LeakyReLU
+    lin_blocks: 1
+    lin_neurons: !ref <emb_dim>
+    out_neurons: !ref <n_classes>
+
+label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
+
+# Objects in "modules" dict will have their parameters moved to the correct
+# device, as well as having train()/eval() called on them by the Brain class.
+modules:
+    compute_features: !ref <compute_features>
+    embedding_model: !ref <embedding_model>
+    classifier: !ref <classifier>
+    mean_var_norm: !ref <mean_var_norm>
+
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        embedding_model: !ref <embedding_model>
+        classifier: !ref <classifier>
+        label_encoder: !ref <label_encoder>
+    paths:
+        embedding_model: !ref <pretrained_path>/embedding_model.ckpt
+        classifier: !ref <pretrained_path>/classifier.ckpt
+        label_encoder: !ref <pretrained_path>/label_encoder.txt
diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/interfaces.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/interfaces.py
deleted file mode 100644
index ead6a0634d..0000000000
--- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/interfaces.py
+++ /dev/null
@@ -1,958 +0,0 @@
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Defines interfaces for simple inference with pretrained models
-
-Authors:
- * Aku Rouhe 2021
- * Peter Plantinga 2021
- * Loren Lugosch 2020
- * Mirco Ravanelli 2020
- * Titouan Parcollet 2021
-"""
-import torch
-import torchaudio
-from types import SimpleNamespace
-from torch.nn import SyncBatchNorm
-from torch.nn import DataParallel as DP
-from hyperpyyaml import load_hyperpyyaml
-from speechbrain.pretrained.fetching import fetch
-from speechbrain.dataio.preprocess import AudioNormalizer
-import torch.nn.functional as F
-from torch.nn.parallel import DistributedDataParallel as DDP
-from speechbrain.utils.data_utils import split_path
-from speechbrain.utils.distributed import run_on_main
-
-
-class Pretrained:
-    """Takes a trained model and makes predictions on new data.
-
-    This is a base class which handles some common boilerplate.
- It intentionally has an interface similar to ``Brain`` - these base - classes handle similar things. - - Subclasses of Pretrained should implement the actual logic of how - the pretrained system runs, and add methods with descriptive names - (e.g. transcribe_file() for ASR). - - Arguments - --------- - modules : dict of str:torch.nn.Module pairs - The Torch modules that make up the learned system. These can be treated - in special ways (put on the right device, frozen, etc.) - hparams : dict - Each key:value pair should consist of a string key and a hyperparameter - that is used within the overridden methods. These will - be accessible via an ``hparams`` attribute, using "dot" notation: - e.g., self.hparams.model(x). - run_opts : dict - Options parsed from command line. See ``speechbrain.parse_arguments()``. - List that are supported here: - * device - * data_parallel_count - * data_parallel_backend - * distributed_launch - * distributed_backend - * jit_module_keys - freeze_params : bool - To freeze (requires_grad=False) parameters or not. Normally in inference - you want to freeze the params. Also calls .eval() on all modules. - """ - - HPARAMS_NEEDED = [] - MODULES_NEEDED = [] - - def __init__( - self, modules=None, hparams=None, run_opts=None, freeze_params=True - ): - - # Arguments passed via the run opts dictionary. Set a limited - # number of these, since some don't apply to inference. - run_opt_defaults = { - "device": "cpu", - "data_parallel_count": -1, - "data_parallel_backend": False, - "distributed_launch": False, - "distributed_backend": "nccl", - "jit_module_keys": None, - } - for arg, default in run_opt_defaults.items(): - if run_opts is not None and arg in run_opts: - setattr(self, arg, run_opts[arg]) - else: - # If any arg from run_opt_defaults exist in hparams and - # not in command line args "run_opts" - if hparams is not None and arg in hparams: - setattr(self, arg, hparams[arg]) - else: - setattr(self, arg, default) - - # Put modules on the right device, accessible with dot notation - self.modules = torch.nn.ModuleDict(modules) - for mod in self.modules: - self.modules[mod].to(self.device) - - for mod in self.MODULES_NEEDED: - if mod not in modules: - raise ValueError(f"Need modules['{mod}']") - - # Check MODULES_NEEDED and HPARAMS_NEEDED and - # make hyperparams available with dot notation - if self.HPARAMS_NEEDED and hparams is None: - raise ValueError("Need to provide hparams dict.") - if hparams is not None: - # Also first check that all required params are found: - for hp in self.HPARAMS_NEEDED: - if hp not in hparams: - raise ValueError(f"Need hparams['{hp}']") - self.hparams = SimpleNamespace(**hparams) - - # Prepare modules for computation, e.g. jit - self._prepare_modules(freeze_params) - - # Audio normalization - self.audio_normalizer = hparams.get( - "audio_normalizer", AudioNormalizer() - ) - - def _prepare_modules(self, freeze_params): - """Prepare modules for computation, e.g. jit. - - Arguments - --------- - freeze_params : bool - Whether to freeze the parameters and call ``eval()``. - """ - - # Make jit-able - self._compile_jit() - self._wrap_distributed() - - # If we don't want to backprop, freeze the pretrained parameters - if freeze_params: - self.modules.eval() - for p in self.modules.parameters(): - p.requires_grad = False - - def load_audio(self, path, savedir="."): - """Load an audio file with this model"s input spec - - When using a speech model, it is important to use the same type of data, - as was used to train the model. 
This means for example using the same - sampling rate and number of channels. It is, however, possible to - convert a file from a higher sampling rate to a lower one (downsampling). - Similarly, it is simple to downmix a stereo file to mono. - The path can be a local path, a web url, or a link to a huggingface repo. - """ - source, fl = split_path(path) - path = fetch(fl, source=source, savedir=savedir) - signal, sr = torchaudio.load(path, channels_first=False) - return self.audio_normalizer(signal, sr) - - def _compile_jit(self): - """Compile requested modules with ``torch.jit.script``.""" - if self.jit_module_keys is None: - return - - for name in self.jit_module_keys: - if name not in self.modules: - raise ValueError( - "module " + name + " cannot be jit compiled because " - "it is not defined in your hparams file." - ) - module = torch.jit.script(self.modules[name]) - self.modules[name] = module.to(self.device) - - def _wrap_distributed(self): - """Wrap modules with distributed wrapper when requested.""" - if not self.distributed_launch and not self.data_parallel_backend: - return - elif self.distributed_launch: - for name, module in self.modules.items(): - if any(p.requires_grad for p in module.parameters()): - # for ddp, all module must run on same GPU - module = SyncBatchNorm.convert_sync_batchnorm(module) - module = DDP(module, device_ids=[self.device]) - self.modules[name] = module - else: - # data_parallel_backend - for name, module in self.modules.items(): - if any(p.requires_grad for p in module.parameters()): - # if distributed_count = -1 then use all gpus - # otherwise, specify the set of gpu to use - if self.data_parallel_count == -1: - module = DP(module) - else: - module = DP( - module, - [i for i in range(self.data_parallel_count)], - ) - self.modules[name] = module - - @classmethod - def from_hparams( - cls, - source, - hparams_file="hyperparams.yaml", - overrides={}, - savedir=None, - use_auth_token=False, - **kwargs, - ): - """Fetch and load based from outside source based on HyperPyYAML file - - The source can be a location on the filesystem or online/huggingface - - The hyperparams file should contain a "modules" key, which is a - dictionary of torch modules used for computation. - - The hyperparams file should contain a "pretrainer" key, which is a - speechbrain.utils.parameter_transfer.Pretrainer - - Arguments - --------- - source : str - The location to use for finding the model. See - ``speechbrain.pretrained.fetching.fetch`` for details. - hparams_file : str - The name of the hyperparameters file to use for constructing - the modules necessary for inference. Must contain two keys: - "modules" and "pretrainer", as described. - overrides : dict - Any changes to make to the hparams file when it is loaded. - savedir : str or Path - Where to put the pretraining material. If not given, will use - ./pretrained_models/-hash(source). - use_auth_token : bool (default: False) - If true Hugginface's auth_token will be used to load private models from the HuggingFace Hub, - default is False because majority of models are public. 
- """ - if savedir is None: - clsname = cls.__name__ - savedir = f"./pretrained_models/{clsname}-{hash(source)}" - hparams_local_path = fetch( - hparams_file, source, savedir, use_auth_token - ) - - # Load the modules: - with open(hparams_local_path) as fin: - hparams = load_hyperpyyaml(fin, overrides) - - # Pretraining: - pretrainer = hparams["pretrainer"] - pretrainer.set_collect_in(savedir) - # For distributed setups, have this here: - run_on_main(pretrainer.collect_files, kwargs={"default_source": source}) - # Load on the CPU. Later the params can be moved elsewhere by specifying - # run_opts={"device": ...} - pretrainer.load_collected(device="cpu") - - # Now return the system - return cls(hparams["modules"], hparams, **kwargs) - - -class EndToEndSLU(Pretrained): - """A end-to-end SLU model. - - The class can be used either to run only the encoder (encode()) to extract - features or to run the entire model (decode()) to map the speech to its semantics. - - Example - ------- - >>> from speechbrain.pretrained import EndToEndSLU - >>> tmpdir = getfixture("tmpdir") - >>> slu_model = EndToEndSLU.from_hparams( - ... source="speechbrain/slu-timers-and-such-direct-librispeech-asr", - ... savedir=tmpdir, - ... ) - >>> slu_model.decode_file("samples/audio_samples/example6.wav") - "{'intent': 'SimpleMath', 'slots': {'number1': 37.67, 'number2': 75.7, 'op': ' minus '}}" - """ - - HPARAMS_NEEDED = ["tokenizer", "asr_model_source"] - MODULES_NEEDED = [ - "slu_enc", - "beam_searcher", - ] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.tokenizer = self.hparams.tokenizer - self.asr_model = EncoderDecoderASR.from_hparams( - source=self.hparams.asr_model_source, - run_opts={"device": self.device}, - ) - - def decode_file(self, path): - """Maps the given audio file to a string representing the - semantic dictionary for the utterance. - - Arguments - --------- - path : str - Path to audio file to decode. - - Returns - ------- - str - The predicted semantics. - """ - waveform = self.load_audio(path) - waveform = waveform.to(self.device) - # Fake a batch: - batch = waveform.unsqueeze(0) - rel_length = torch.tensor([1.0]) - predicted_words, predicted_tokens = self.decode_batch(batch, rel_length) - return predicted_words[0] - - def encode_batch(self, wavs, wav_lens): - """Encodes the input audio into a sequence of hidden states - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. - - Returns - ------- - torch.tensor - The encoded batch - """ - wavs = wavs.float() - wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) - with torch.no_grad(): - ASR_encoder_out = self.asr_model.encode_batch( - wavs.detach(), wav_lens - ) - encoder_out = self.modules.slu_enc(ASR_encoder_out) - return encoder_out - - def decode_batch(self, wavs, wav_lens): - """Maps the input audio to its semantics - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. 
- - Returns - ------- - list - Each waveform in the batch decoded. - tensor - Each predicted token id. - """ - with torch.no_grad(): - wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) - encoder_out = self.encode_batch(wavs, wav_lens) - predicted_tokens, scores = self.modules.beam_searcher( - encoder_out, wav_lens - ) - predicted_words = [ - self.tokenizer.decode_ids(token_seq) - for token_seq in predicted_tokens - ] - return predicted_words, predicted_tokens - - -class EncoderDecoderASR(Pretrained): - """A ready-to-use Encoder-Decoder ASR model - - The class can be used either to run only the encoder (encode()) to extract - features or to run the entire encoder-decoder model - (transcribe()) to transcribe speech. The given YAML must contains the fields - specified in the *_NEEDED[] lists. - - Example - ------- - >>> from speechbrain.pretrained import EncoderDecoderASR - >>> tmpdir = getfixture("tmpdir") - >>> asr_model = EncoderDecoderASR.from_hparams( - ... source="speechbrain/asr-crdnn-rnnlm-librispeech", - ... savedir=tmpdir, - ... ) - >>> asr_model.transcribe_file("samples/audio_samples/example2.flac") - "MY FATHER HAS REVEALED THE CULPRIT'S NAME" - """ - - HPARAMS_NEEDED = ["tokenizer"] - MODULES_NEEDED = [ - "encoder", - "decoder", - ] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.tokenizer = self.hparams.tokenizer - - def transcribe_file(self, path): - """Transcribes the given audiofile into a sequence of words. - - Arguments - --------- - path : str - Path to audio file which to transcribe. - - Returns - ------- - str - The audiofile transcription produced by this ASR system. - """ - waveform = self.load_audio(path) - # Fake a batch: - batch = waveform.unsqueeze(0) - rel_length = torch.tensor([1.0]) - predicted_words, predicted_tokens = self.transcribe_batch( - batch, rel_length - ) - return predicted_words[0] - - def encode_batch(self, wavs, wav_lens): - """Encodes the input audio into a sequence of hidden states - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. - - Returns - ------- - torch.tensor - The encoded batch - """ - wavs = wavs.float() - wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) - encoder_out = self.modules.encoder(wavs, wav_lens) - return encoder_out - - def transcribe_batch(self, wavs, wav_lens): - """Transcribes the input audio into a sequence of words - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. 
- - Returns - ------- - list - Each waveform in the batch transcribed. - tensor - Each predicted token id. - """ - with torch.no_grad(): - wav_lens = wav_lens.to(self.device) - encoder_out = self.encode_batch(wavs, wav_lens) - predicted_tokens, scores = self.modules.decoder( - encoder_out, wav_lens - ) - predicted_words = [ - self.tokenizer.decode_ids(token_seq) - for token_seq in predicted_tokens - ] - return predicted_words, predicted_tokens - - -class EncoderClassifier(Pretrained): - """A ready-to-use class for utterance-level classification (e.g, speaker-id, - language-id, emotion recognition, keyword spotting, etc). - - The class assumes that an encoder called "embedding_model" and a model - called "classifier" are defined in the yaml file. If you want to - convert the predicted index into a corresponding text label, please - provide the path of the label_encoder in a variable called 'lab_encoder_file' - within the yaml. - - The class can be used either to run only the encoder (encode_batch()) to - extract embeddings or to run a classification step (classify_batch()). - ``` - - Example - ------- - >>> import torchaudio - >>> from speechbrain.pretrained import EncoderClassifier - >>> # Model is downloaded from the speechbrain HuggingFace repo - >>> tmpdir = getfixture("tmpdir") - >>> classifier = EncoderClassifier.from_hparams( - ... source="speechbrain/spkrec-ecapa-voxceleb", - ... savedir=tmpdir, - ... ) - - >>> # Compute embeddings - >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav") - >>> embeddings = classifier.encode_batch(signal) - - >>> # Classification - >>> prediction = classifier .classify_batch(signal) - """ - - MODULES_NEEDED = [ - "compute_features", - "mean_var_norm", - "embedding_model", - "classifier", - ] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def extract_feats(self, wavs, wav_lens=None): - # wav to feats - wavs = wavs.to('cpu').float() - if wav_lens is None: - wav_lens = torch.ones(wavs.shape[0], device='cpu') - - feats = self.modules.compute_features(wavs) - feats = self.modules.mean_var_norm(feats, wav_lens) - - return feats - - def feats_classify(self, feats, wav_lens=None): - emb = self.modules.embedding_model(feats, wav_lens) - out_prob = self.modules.classifier(emb).squeeze(1) - - return out_prob - - def encode_batch(self, wavs, wav_lens=None, normalize=False): - """Encodes the input audio into a single vector embedding. - - The waveforms should already be in the model's desired format. - You can call: - ``normalized = .normalizer(signal, sample_rate)`` - to get a correctly converted signal in most cases. - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. Make sure the sample rate is fs=16000 Hz. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. - normalize : bool - If True, it normalizes the embeddings with the statistics - contained in mean_var_norm_emb. 
- - Returns - ------- - torch.tensor - The encoded batch - """ - # Manage single waveforms in input - if len(wavs.shape) == 1: - wavs = wavs.unsqueeze(0) - - # Assign full length if wav_lens is not assigned - if wav_lens is None: - wav_lens = torch.ones(wavs.shape[0], device=self.device) - - # Storing waveform in the specified device - wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) - wavs = wavs.float() - - # Computing features and embeddings - feats = self.modules.compute_features(wavs) - feats = self.modules.mean_var_norm(feats, wav_lens) - embeddings = self.modules.embedding_model(feats, wav_lens) - if normalize: - embeddings = self.hparams.mean_var_norm_emb( - embeddings, torch.ones(embeddings.shape[0], device=self.device) - ) - return embeddings - - def classify_batch(self, wavs, wav_lens=None): - """Performs classification on the top of the encoded features. - - It returns the posterior probabilities, the index and, if the label - encoder is specified it also the text label. - - Arguments - --------- - wavs : torch.tensor - Batch of waveforms [batch, time, channels] or [batch, time] - depending on the model. Make sure the sample rate is fs=16000 Hz. - wav_lens : torch.tensor - Lengths of the waveforms relative to the longest one in the - batch, tensor of shape [batch]. The longest one should have - relative length 1.0 and others len(waveform) / max_length. - Used for ignoring padding. - - Returns - ------- - out_prob - The log posterior probabilities of each class ([batch, N_class]) - score: - It is the value of the log-posterior for the best class ([batch,]) - index - The indexes of the best class ([batch,]) - text_lab: - List with the text labels corresponding to the indexes. - (label encoder should be provided). - """ - emb = self.encode_batch(wavs, wav_lens) - out_prob = self.modules.classifier(emb).squeeze(1) - score, index = torch.max(out_prob, dim=-1) - text_lab = self.hparams.label_encoder.decode_torch(index) - return out_prob, score, index, text_lab - - def classify_file(self, path): - """Classifies the given audiofile into the given set of labels. - - Arguments - --------- - path : str - Path to audio file to classify. - - Returns - ------- - out_prob - The log posterior probabilities of each class ([batch, N_class]) - score: - It is the value of the log-posterior for the best class ([batch,]) - index - The indexes of the best class ([batch,]) - text_lab: - List with the text labels corresponding to the indexes. - (label encoder should be provided). - """ - waveform = self.load_audio(path) - # Fake a batch: - batch = waveform.unsqueeze(0) - rel_length = torch.tensor([1.0]) - emb = self.encode_batch(batch, rel_length) - out_prob = self.modules.classifier(emb).squeeze(1) - score, index = torch.max(out_prob, dim=-1) - text_lab = self.hparams.label_encoder.decode_torch(index) - return out_prob, score, index, text_lab - - -class SpeakerRecognition(EncoderClassifier): - """A ready-to-use model for speaker recognition. It can be used to - perform speaker verification with verify_batch(). - - ``` - Example - ------- - >>> import torchaudio - >>> from speechbrain.pretrained import SpeakerRecognition - >>> # Model is downloaded from the speechbrain HuggingFace repo - >>> tmpdir = getfixture("tmpdir") - >>> verification = SpeakerRecognition.from_hparams( - ... source="speechbrain/spkrec-ecapa-voxceleb", - ... savedir=tmpdir, - ... 
) - - >>> # Perform verification - >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav") - >>> signal2, fs = torchaudio.load("samples/audio_samples/example2.flac") - >>> score, prediction = verification.verify_batch(signal, signal2) - """ - - MODULES_NEEDED = [ - "compute_features", - "mean_var_norm", - "embedding_model", - "mean_var_norm_emb", - ] - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.similarity = torch.nn.CosineSimilarity(dim=-1, eps=1e-6) - - def verify_batch( - self, wavs1, wavs2, wav1_lens=None, wav2_lens=None, threshold=0.25 - ): - """Performs speaker verification with cosine distance. - - It returns the score and the decision (0 different speakers, - 1 same speakers). - - Arguments - --------- - wavs1 : Torch.Tensor - Tensor containing the speech waveform1 (batch, time). - Make sure the sample rate is fs=16000 Hz. - wavs2 : Torch.Tensor - Tensor containing the speech waveform2 (batch, time). - Make sure the sample rate is fs=16000 Hz. - wav1_lens: Torch.Tensor - Tensor containing the relative length for each sentence - in the length (e.g., [0.8 0.6 1.0]) - wav2_lens: Torch.Tensor - Tensor containing the relative length for each sentence - in the length (e.g., [0.8 0.6 1.0]) - threshold: Float - Threshold applied to the cosine distance to decide if the - speaker is different (0) or the same (1). - - Returns - ------- - score - The score associated to the binary verification output - (cosine distance). - prediction - The prediction is 1 if the two signals in input are from the same - speaker and 0 otherwise. - """ - emb1 = self.encode_batch(wavs1, wav1_lens, normalize=True) - emb2 = self.encode_batch(wavs2, wav2_lens, normalize=True) - score = self.similarity(emb1, emb2) - return score, score > threshold - - def verify_files(self, path_x, path_y): - """Speaker verification with cosine distance - - Returns the score and the decision (0 different speakers, - 1 same speakers). - - Returns - ------- - score - The score associated to the binary verification output - (cosine distance). - prediction - The prediction is 1 if the two signals in input are from the same - speaker and 0 otherwise. - """ - waveform_x = self.load_audio(path_x) - waveform_y = self.load_audio(path_y) - # Fake batches: - batch_x = waveform_x.unsqueeze(0) - batch_y = waveform_y.unsqueeze(0) - # Verify: - score, decision = self.verify_batch(batch_x, batch_y) - # Squeeze: - return score[0], decision[0] - - -class SepformerSeparation(Pretrained): - """A "ready-to-use" speech separation model. - - Uses Sepformer architecture. - - Example - ------- - >>> tmpdir = getfixture("tmpdir") - >>> model = SepformerSeparation.from_hparams( - ... source="speechbrain/sepformer-wsj02mix", - ... savedir=tmpdir) - >>> mix = torch.randn(1, 400) - >>> est_sources = model.separate_batch(mix) - >>> print(est_sources.shape) - torch.Size([1, 400, 2]) - """ - - MODULES_NEEDED = ["encoder", "masknet", "decoder"] - - def separate_batch(self, mix): - """Run source separation on batch of audio. - - Arguments - --------- - mix : torch.tensor - The mixture of sources. 
- - Returns - ------- - tensor - Separated sources - """ - - # Separation - mix = mix.to(self.device) - mix_w = self.modules.encoder(mix) - est_mask = self.modules.masknet(mix_w) - mix_w = torch.stack([mix_w] * self.hparams.num_spks) - sep_h = mix_w * est_mask - - # Decoding - est_source = torch.cat( - [ - self.modules.decoder(sep_h[i]).unsqueeze(-1) - for i in range(self.hparams.num_spks) - ], - dim=-1, - ) - - # T changed after conv1d in encoder, fix it here - T_origin = mix.size(1) - T_est = est_source.size(1) - if T_origin > T_est: - est_source = F.pad(est_source, (0, 0, 0, T_origin - T_est)) - else: - est_source = est_source[:, :T_origin, :] - return est_source - - def separate_file(self, path, savedir="."): - """Separate sources from file. - - Arguments - --------- - path : str - Path to file which has a mixture of sources. It can be a local - path, a web url, or a huggingface repo. - savedir : path - Path where to store the wav signals (when downloaded from the web). - Returns - ------- - tensor - Separated sources - """ - source, fl = split_path(path) - path = fetch(fl, source=source, savedir=savedir) - - batch, fs_file = torchaudio.load(path) - batch = batch.to(self.device) - fs_model = self.hparams.sample_rate - - # resample the data if needed - if fs_file != fs_model: - print( - "Resampling the audio from {} Hz to {} Hz".format( - fs_file, fs_model - ) - ) - tf = torchaudio.transforms.Resample( - orig_freq=fs_file, new_freq=fs_model - ) - batch = batch.mean(dim=0, keepdim=True) - batch = tf(batch) - - est_sources = self.separate_batch(batch) - est_sources = est_sources / est_sources.max(dim=1, keepdim=True)[0] - return est_sources - - -class SpectralMaskEnhancement(Pretrained): - """A ready-to-use model for speech enhancement. - - Arguments - --------- - See ``Pretrained``. - - Example - ------- - >>> import torchaudio - >>> from speechbrain.pretrained import SpectralMaskEnhancement - >>> # Model is downloaded from the speechbrain HuggingFace repo - >>> tmpdir = getfixture("tmpdir") - >>> enhancer = SpectralMaskEnhancement.from_hparams( - ... source="speechbrain/mtl-mimic-voicebank", - ... savedir=tmpdir, - ... ) - >>> noisy, fs = torchaudio.load("samples/audio_samples/example_noisy.wav") - >>> # Channel dimension is interpreted as batch dimension here - >>> enhanced = enhancer.enhance_batch(noisy) - """ - - HPARAMS_NEEDED = ["compute_stft", "spectral_magnitude", "resynth"] - MODULES_NEEDED = ["enhance_model"] - - def compute_features(self, wavs): - """Compute the log spectral magnitude features for masking. - - Arguments - --------- - wavs : torch.tensor - A batch of waveforms to convert to log spectral mags. - """ - feats = self.hparams.compute_stft(wavs) - feats = self.hparams.spectral_magnitude(feats) - return torch.log1p(feats) - - def enhance_batch(self, noisy, lengths=None): - """Enhance a batch of noisy waveforms. - - Arguments - --------- - noisy : torch.tensor - A batch of waveforms to perform enhancement on. - lengths : torch.tensor - The lengths of the waveforms if the enhancement model handles them. - - Returns - ------- - torch.tensor - A batch of enhanced waveforms of the same shape as input. - """ - noisy = noisy.to(self.device) - noisy_features = self.compute_features(noisy) - - # Perform masking-based enhancement, multiplying output with input. 
- if lengths is not None: - mask = self.modules.enhance_model(noisy_features, lengths=lengths) - else: - mask = self.modules.enhance_model(noisy_features) - enhanced = torch.mul(mask, noisy_features) - - # Return resynthesized waveforms - return self.hparams.resynth(torch.expm1(enhanced), noisy) - - def enhance_file(self, filename, output_filename=None): - """Enhance a wav file. - - Arguments - --------- - filename : str - Location on disk to load file for enhancement. - output_filename : str - If provided, writes enhanced data to this file. - """ - noisy = self.load_audio(filename) - noisy = noisy.to(self.device) - - # Fake a batch: - batch = noisy.unsqueeze(0) - enhanced = self.enhance_batch(batch) - - if output_filename is not None: - torchaudio.save(output_filename, enhanced, channels_first=False) - - return enhanced.squeeze(0) diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/mo.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/mo.py new file mode 100644 index 0000000000..cad5851dcb --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/mo.py @@ -0,0 +1,32 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +from magiconnx import OnnxGraph + +bs = sys.argv[1] +model_name = 'tdnn_bs%s'%bs +graph = OnnxGraph(model_name+'.onnx') +ph = graph.add_placeholder('random','float32',[64,1500]) + +rm = graph.get_nodes("ReduceMin")[0] +rm.inputs = ['random'] +sub = graph.get_nodes("Sub")[-1] +sub.inputs = ['random', rm.outputs[0]] + +rn = graph.get_nodes("RandomNormalLike")[0] +graph.del_node(rn.name, auto_connection=False) + +graph.save('%s_mod.onnx'%model_name) \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify.patch b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify.patch new file mode 100644 index 0000000000..d246c60425 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify.patch @@ -0,0 +1,270 @@ +diff --git a/speechbrain/nnet/CNN.py b/speechbrain/nnet/CNN.py +index 1745846..8750680 100644 +--- a/speechbrain/nnet/CNN.py ++++ b/speechbrain/nnet/CNN.py +@@ -346,7 +346,7 @@ class Conv1d(nn.Module): + padding="same", + groups=1, + bias=True, +- padding_mode="reflect", ++ padding_mode="constant", + skip_transpose=False, + ): + super().__init__() +diff --git a/speechbrain/pretrained/interfaces.py b/speechbrain/pretrained/interfaces.py +index e2521ec..ead6a06 100644 +--- a/speechbrain/pretrained/interfaces.py ++++ b/speechbrain/pretrained/interfaces.py +@@ -1,3 +1,17 @@ ++# Copyright 2021 Huawei Technologies Co., Ltd ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++# See the License for the specific language governing permissions and ++# limitations under the License. ++ + """Defines interfaces for simple inference with pretrained models + + Authors: +@@ -85,7 +99,10 @@ class Pretrained: + setattr(self, arg, default) + + # Put modules on the right device, accessible with dot notation +- self.modules = torch.nn.ModuleDict(modules).to(self.device) ++ self.modules = torch.nn.ModuleDict(modules) ++ for mod in self.modules: ++ self.modules[mod].to(self.device) ++ + for mod in self.MODULES_NEEDED: + if mod not in modules: + raise ValueError(f"Need modules['{mod}']") +@@ -93,7 +110,7 @@ class Pretrained: + # Check MODULES_NEEDED and HPARAMS_NEEDED and + # make hyperparams available with dot notation + if self.HPARAMS_NEEDED and hparams is None: +- raise ValueError(f"Need to provide hparams dict.") ++ raise ValueError("Need to provide hparams dict.") + if hparams is not None: + # Also first check that all required params are found: + for hp in self.HPARAMS_NEEDED: +@@ -190,6 +207,7 @@ class Pretrained: + hparams_file="hyperparams.yaml", + overrides={}, + savedir=None, ++ use_auth_token=False, + **kwargs, + ): + """Fetch and load based from outside source based on HyperPyYAML file +@@ -215,12 +233,17 @@ class Pretrained: + Any changes to make to the hparams file when it is loaded. + savedir : str or Path + Where to put the pretraining material. If not given, will use +- ./pretrained_checkpoints/-hash(source). ++ ./pretrained_models/-hash(source). ++ use_auth_token : bool (default: False) ++ If true Hugginface's auth_token will be used to load private models from the HuggingFace Hub, ++ default is False because majority of models are public. + """ + if savedir is None: + clsname = cls.__name__ +- savedir = f"./pretrained_checkpoints/{clsname}-{hash(source)}" +- hparams_local_path = fetch(hparams_file, source, savedir) ++ savedir = f"./pretrained_models/{clsname}-{hash(source)}" ++ hparams_local_path = fetch( ++ hparams_file, source, savedir, use_auth_token ++ ) + + # Load the modules: + with open(hparams_local_path) as fin: +@@ -257,7 +280,7 @@ class EndToEndSLU(Pretrained): + "{'intent': 'SimpleMath', 'slots': {'number1': 37.67, 'number2': 75.7, 'op': ' minus '}}" + """ + +- HPARAMS_NEEDED = ["tokenizer", "asr_model"] ++ HPARAMS_NEEDED = ["tokenizer", "asr_model_source"] + MODULES_NEEDED = [ + "slu_enc", + "beam_searcher", +@@ -266,6 +289,10 @@ class EndToEndSLU(Pretrained): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.tokenizer = self.hparams.tokenizer ++ self.asr_model = EncoderDecoderASR.from_hparams( ++ source=self.hparams.asr_model_source, ++ run_opts={"device": self.device}, ++ ) + + def decode_file(self, path): + """Maps the given audio file to a string representing the +@@ -282,6 +309,7 @@ class EndToEndSLU(Pretrained): + The predicted semantics. 
+ """ + waveform = self.load_audio(path) ++ waveform = waveform.to(self.device) + # Fake a batch: + batch = waveform.unsqueeze(0) + rel_length = torch.tensor([1.0]) +@@ -310,10 +338,10 @@ class EndToEndSLU(Pretrained): + wavs = wavs.float() + wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) + with torch.no_grad(): +- ASR_encoder_out = self.hparams.asr_model.encode_batch( ++ ASR_encoder_out = self.asr_model.encode_batch( + wavs.detach(), wav_lens + ) +- encoder_out = self.hparams.slu_enc(ASR_encoder_out) ++ encoder_out = self.modules.slu_enc(ASR_encoder_out) + return encoder_out + + def decode_batch(self, wavs, wav_lens): +@@ -338,7 +366,7 @@ class EndToEndSLU(Pretrained): + Each predicted token id. + """ + with torch.no_grad(): +- wav_lens = wav_lens.to(self.device) ++ wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device) + encoder_out = self.encode_batch(wavs, wav_lens) + predicted_tokens, scores = self.modules.beam_searcher( + encoder_out, wav_lens +@@ -512,6 +540,23 @@ class EncoderClassifier(Pretrained): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) ++ ++ def extract_feats(self, wavs, wav_lens=None): ++ # wav to feats ++ wavs = wavs.to('cpu').float() ++ if wav_lens is None: ++ wav_lens = torch.ones(wavs.shape[0], device='cpu') ++ ++ feats = self.modules.compute_features(wavs) ++ feats = self.modules.mean_var_norm(feats, wav_lens) ++ ++ return feats ++ ++ def feats_classify(self, feats, wav_lens=None): ++ emb = self.modules.embedding_model(feats, wav_lens) ++ out_prob = self.modules.classifier(emb).squeeze(1) ++ ++ return out_prob + + def encode_batch(self, wavs, wav_lens=None, normalize=False): + """Encodes the input audio into a single vector embedding. +@@ -595,7 +640,36 @@ class EncoderClassifier(Pretrained): + out_prob = self.modules.classifier(emb).squeeze(1) + score, index = torch.max(out_prob, dim=-1) + text_lab = self.hparams.label_encoder.decode_torch(index) ++ return out_prob, score, index, text_lab + ++ def classify_file(self, path): ++ """Classifies the given audiofile into the given set of labels. ++ ++ Arguments ++ --------- ++ path : str ++ Path to audio file to classify. ++ ++ Returns ++ ------- ++ out_prob ++ The log posterior probabilities of each class ([batch, N_class]) ++ score: ++ It is the value of the log-posterior for the best class ([batch,]) ++ index ++ The indexes of the best class ([batch,]) ++ text_lab: ++ List with the text labels corresponding to the indexes. ++ (label encoder should be provided). 
++ """ ++ waveform = self.load_audio(path) ++ # Fake a batch: ++ batch = waveform.unsqueeze(0) ++ rel_length = torch.tensor([1.0]) ++ emb = self.encode_batch(batch, rel_length) ++ out_prob = self.modules.classifier(emb).squeeze(1) ++ score, index = torch.max(out_prob, dim=-1) ++ text_lab = self.hparams.label_encoder.decode_torch(index) + return out_prob, score, index, text_lab + + +@@ -732,6 +806,7 @@ class SepformerSeparation(Pretrained): + """ + + # Separation ++ mix = mix.to(self.device) + mix_w = self.modules.encoder(mix) + est_mask = self.modules.masknet(mix_w) + mix_w = torch.stack([mix_w] * self.hparams.num_spks) +@@ -774,6 +849,7 @@ class SepformerSeparation(Pretrained): + path = fetch(fl, source=source, savedir=savedir) + + batch, fs_file = torchaudio.load(path) ++ batch = batch.to(self.device) + fs_model = self.hparams.sample_rate + + # resample the data if needed +@@ -846,6 +922,7 @@ class SpectralMaskEnhancement(Pretrained): + torch.tensor + A batch of enhanced waveforms of the same shape as input. + """ ++ noisy = noisy.to(self.device) + noisy_features = self.compute_features(noisy) + + # Perform masking-based enhancement, multiplying output with input. +@@ -869,6 +946,7 @@ class SpectralMaskEnhancement(Pretrained): + If provided, writes enhanced data to this file. + """ + noisy = self.load_audio(filename) ++ noisy = noisy.to(self.device) + + # Fake a batch: + batch = noisy.unsqueeze(0) +diff --git a/templates/speaker_id/custom_model.py b/templates/speaker_id/custom_model.py +index 9a78a37..3a67eae 100644 +--- a/templates/speaker_id/custom_model.py ++++ b/templates/speaker_id/custom_model.py +@@ -76,9 +76,10 @@ class Xvector(torch.nn.Module): + out_channels=out_channels, + kernel_size=tdnn_kernel_sizes[block_index], + dilation=tdnn_dilations[block_index], ++ skip_transpose=True, + ), + activation(), +- BatchNorm1d(input_size=out_channels), ++ BatchNorm1d(input_size=out_channels,skip_transpose=True), + ] + ) + in_channels = tdnn_channels[block_index] +@@ -105,8 +106,12 @@ class Xvector(torch.nn.Module): + --------- + x : torch.Tensor + """ ++ x = x.transpose(1, -1) + + for layer in self.blocks: ++ if type(layer) == type(StatisticsPooling()): ++ x = x.transpose(1, -1) ++ + try: + x = layer(x, lengths=lens) + except TypeError: +diff --git a/templates/speaker_id/mini_librispeech_prepare.py b/templates/speaker_id/mini_librispeech_prepare.py +index c22add8..7a777df 100644 +--- a/templates/speaker_id/mini_librispeech_prepare.py ++++ b/templates/speaker_id/mini_librispeech_prepare.py +@@ -171,7 +171,7 @@ def split_sets(wav_list, split_ratio): + dictionary containing train, valid, and test splits. + """ + # Random shuffle of the list +- random.shuffle(wav_list) ++ # random.shuffle(wav_list) + tot_split = sum(split_ratio) + tot_snts = len(wav_list) + data_split = {} diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify_onnx.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify_onnx.py new file mode 100644 index 0000000000..ab8273817c --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/modify_onnx.py @@ -0,0 +1,51 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import numpy as np +import onnx + +from magiconnx import OnnxGraph + +model_name = sys.argv[1] +graph = OnnxGraph(model_name) + +axes = onnx.helper.make_attribute("axes", [0,1]) +rd_min = graph.get_nodes("ReduceMin")[0] +rd_min._node.attribute.append(axes) +rd_max = graph.get_nodes("ReduceMax")[0] +rd_max._node.attribute.append(axes) + +us = graph.add_node('Unsq_1', 'Unsqueeze', {'axes': [2]}) +graph.insert_node(graph.get_nodes("Conv")[0].name, us, mode='before') +sq = graph.add_node('Sq_291', 'Squeeze', {'axes': [2]}) +graph.insert_node(graph.get_nodes('BatchNormalization')[4].name, sq, mode='after') + +convs = graph.get_nodes("Conv") +for conv in convs: + print(conv.name) + dil = conv['dilations'][0] + ks = conv['kernel_shape'][0] + pds = conv['pads'][0] + stri = conv['strides'][0] + conv['dilations'] = [1, dil] + conv['kernel_shape'] = [1, ks] + conv['pads'] = [0, pds, 0, pds] + conv['strides'] = [1, stri] + conv_w = graph[conv.inputs[1]].value + conv_w = np.expand_dims(conv_w, axis=-2) + graph[conv.inputs[1]].value = conv_w + +graph.save(model_name) \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/om_infer.sh b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/om_infer.sh index 1271031bb4..55d9c9adf2 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/om_infer.sh +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/om_infer.sh @@ -1,6 +1,8 @@ -install_path=/usr/local/Ascend/ascend-toolkit/latest +install_path=/home/dl/ascend-toolkit/latest export PYTHONUNBUFFERD=1 export PYTHONPATH=${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH export LD_LIBRARY_PATH=${install_path}/acllib/lib64/:$LD_LIBRARY_PATH -python3.7 pyacl_infer.py --model_path=./tdnn_dynamic.om --device_id=4 --cpu_run=True --sync_infer=True --workspace=10 --input_info_file_path=mini_librispeech_test.input_info_file_path --input_dtypes=float32 --infer_res_save_path=./result --res_save_type=bin \ No newline at end of file +bs=$1 + +python3.7 tdnn_pyacl_infer.py --model_path=tdnn_bs${bs}s.om --batch_size=${bs} --device_id=0 --cpu_run=True --sync_infer=True --workspace=10 --input_info_file_path=mini_librispeech_test.info --input_dtypes=float32 --infer_res_save_path=./result --res_save_type=bin \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/requirements.txt new file mode 100644 index 0000000000..561b93c4e7 --- /dev/null +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/requirements.txt @@ -0,0 +1,8 @@ +onnx==1.10.2 +torch==1.10.0 +torchaudio==0.10.2 +tqdm==4.63.0 +HyperPyYAML==1.0.0 +huggingface-hub==0.4.0 + + diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_postprocess.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_postprocess.py index 057c419d0b..c046198734 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_postprocess.py +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_postprocess.py @@ -15,7 +15,7 @@ import os import re import argparse -import numpy as numpy +import numpy as np label = {0:'163', 1:'7367', 2:'332', 3:'1970', 
4:'4640', 5:'8629', 6:'6848', 7:'1088', 8:'460', 9:'6272', 10:'7312', 11:'2136', 12:'1867', 13:'669', 14:'3526', 15:'3664', 16:'3242', 17:'19', 18:'32', 19:'5789', 20:'118', 21:'226', 22:'7859', 23:'3947', 24:'1898', 25:'2416', 26:'1737', 27:'4680'} if __name__ == '__main__': @@ -42,9 +42,10 @@ if __name__ == '__main__': index = split[0] input_file = split[1] target = re.search('/(\d*)-', input_file).group()[1:-1] + output_file = opt.result_dir + '/' + index + '.0.bin' output = np.fromfile(output_file, np.float32) res = np.argmax(output) - print('Predicted:', lable[res], 'Target:', target) + print('Predicted:', label[res], 'Target:', target) total += 1 if label[res] != target: error += 1 diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pth2onnx.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pth2onnx.py similarity index 88% rename from ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pth2onnx.py rename to ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pth2onnx.py index ea69d2af94..2e830e2802 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pth2onnx.py +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pth2onnx.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys + import torch import torchaudio from speechbrain.pretrained import EncoderClassifier @@ -29,12 +31,13 @@ class Xvector(torch.nn.Module): return res model = Xvector(classifier) -feats = torch.randn([1, 1800, 23]) +batch_size=int(sys.argv[1]) +feats = torch.randn([batch_size, 1800, 23]) torch.onnx.export( model, feats, - 'tdnn.onnx', + 'tdnn_bs%d.onnx'%(batch_size), input_names=['feats'], output_names=['output'], export_params=True, diff --git a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pyacl_infer.py b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pyacl_infer.py similarity index 76% rename from ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pyacl_infer.py rename to ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pyacl_infer.py index 15ff286129..87967a5d26 100644 --- a/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/pyacl_infer.py +++ b/ACL_PyTorch/built-in/audio/TDNN_for_Pytorch/tdnn_pyacl_infer.py @@ -86,6 +86,7 @@ if __name__ == '__main__': # 参数解析 parser = argparse.ArgumentParser() parser.add_argument('--model_path', required=True) + parser.add_argument('--batch_size', required=True) parser.add_argument('--device_id', required=True, type=int) parser.add_argument('--cpu_run', required=True, choices=['True', 'False']) parser.add_argument('--sync_infer', required=True, choices=['True', 'False']) @@ -129,27 +130,59 @@ if __name__ == '__main__': total_infer_time = 0 total_infer_time_workspace = 0 total_infer_num = 0 - for key, values in tqdm(inputs_info.items()): + dataset = {} + dims_infos = {} + bs = int(opt.batch_size) + for key, values in inputs_info.items(): # 构造输入 inputs = [] dims = [] for idx, value in enumerate(values): x = np.fromfile(value['path'], dtype=input_dtypes[idx]).reshape(value['shape']) - inputs.append(x) - dims.extend(value['shape']) + inputs.append((key,x)) + dims.extend((bs, value['shape'][1], value['shape'][2])) dims_info = {'dimCount': len(dims), 'name': '', 'dims': dims} + # (1, 1500, 23) {'dimCount': 3, 'name': '', 'dims': [1, 1500, 23]} + length = inputs[0][1].shape[1] + dataset[length] = dataset.get(length,[]) + inputs + dims_infos[length] = dims_infos.get(length,dims_info) + + total_inputs = [] + total_keys = [] + for k in sorted(dataset.keys()): + total_len = len(dataset[k]) + batch_input = [] + 
batch_key = [] + for i, (key, ipt) in enumerate(dataset[k]): + batch_input.append(ipt) + batch_key.append(key) + if (i+1) % bs == 0: + total_inputs.append(batch_input) + total_keys.append(batch_key) + batch_input = [] + batch_key = [] + if batch_input != []: + total_inputs.append(batch_input) + total_keys.append(batch_key) + + for i, b_ipt in tqdm(enumerate(total_inputs)): + batch_input = np.squeeze(np.array(b_ipt), axis=1) + if batch_input.shape[0] < bs: + batch_input = np.pad(batch_input, [(0, bs-batch_input.shape[0]), (0, 0), (0, 0)], mode='constant') + # 推理得到输出 - output = om_model(inputs, dims_info) + # (bs, 28) + output = om_model([batch_input], dims_infos[batch_input.shape[1]]) + total_infer_num += 1 # 保存文件 - if opt.res_save_type == 'bin': - for idx, data in enumerate(output): - data.tofile(os.path.join(opt.infer_res_save_path, key + '.' + str(idx) + '.bin')) - else: - for idx, data in enumerate(output): - np.save(os.path.join(opt.infer_res_save_path, key + '.' + str(idx) + '.npy'), data) + for j, key in enumerate(total_keys[i]): + if opt.res_save_type == 'bin': + output[0][j].tofile(os.path.join(opt.infer_res_save_path, key + '.' + str(0) + '.bin')) + else: + np.save(os.path.join(opt.infer_res_save_path, key + '.' + str(0) + '.npy'), output[0][j]) # 计算时间 total_infer_time += measurements['per_infer_time_ns'] -- Gitee
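
For readers following the dynamic-batching change in tdnn_pyacl_infer.py above, the sketch below restates the idea in isolation: utterance features are bucketed by sequence length, emitted as fixed-size batches, and the final partial batch of each bucket is zero-padded so the om model always sees a full batch of shape (batch_size, seq_len, 23); only the rows belonging to real keys are later written to result files. This is a minimal standalone illustration, not code from the patch — the helper name `make_batches` and the fake keys/shapes in the usage example are assumptions.

```python
import numpy as np


def make_batches(inputs, batch_size):
    """Group (key, feature) pairs by sequence length and emit fixed-size batches.

    inputs: list of (key, np.ndarray of shape [1, seq_len, feat_dim]) pairs.
    Returns a list of (keys, batch) tuples where batch has shape
    [batch_size, seq_len, feat_dim]; the last batch of each length bucket
    is zero-padded up to batch_size.
    """
    # Bucket by sequence length so every batch has a single, consistent shape.
    buckets = {}
    for key, feat in inputs:
        buckets.setdefault(feat.shape[1], []).append((key, feat))

    batches = []
    for seq_len in sorted(buckets):
        items = buckets[seq_len]
        for start in range(0, len(items), batch_size):
            chunk = items[start:start + batch_size]
            keys = [k for k, _ in chunk]
            batch = np.concatenate([f for _, f in chunk], axis=0)
            if batch.shape[0] < batch_size:
                # Zero-pad the trailing partial batch; padded rows carry no key
                # and are simply ignored when results are saved per key.
                pad = batch_size - batch.shape[0]
                batch = np.pad(batch, [(0, pad), (0, 0), (0, 0)], mode='constant')
            batches.append((keys, batch))
    return batches


if __name__ == '__main__':
    # Three fake utterances; two share a length and land in the same bucket.
    fake = [('utt0', np.zeros((1, 1500, 23), np.float32)),
            ('utt1', np.zeros((1, 1500, 23), np.float32)),
            ('utt2', np.zeros((1, 1200, 23), np.float32))]
    for keys, batch in make_batches(fake, batch_size=2):
        print(keys, batch.shape)  # only the first len(keys) rows are real data
```

Padding the last batch, rather than exporting an extra small-batch model, keeps a single fixed-batch om file usable for every length bucket; since outputs are saved one file per original key, the padded rows are discarded automatically.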