diff --git a/ACL_PyTorch/contrib/audio/WeNet/.keep b/ACL_PyTorch/contrib/audio/WeNet/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ACL_PyTorch/contrib/audio/WeNet/README.md b/ACL_PyTorch/contrib/audio/WeNet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..96f0f4b5584ada66f418fcf35471f6c9f4be75e7
--- /dev/null
+++ b/ACL_PyTorch/contrib/audio/WeNet/README.md
@@ -0,0 +1,103 @@
+# WeNet PyTorch Offline Inference Guide
+
+## 1 Environment Preparation
+
+1. Install the required dependencies. The test environment may already have different versions of some of these libraries installed, so this command is not recommended when testing manually:
+
+```
+pip3 install -r requirements.txt
+```
+
+2. Obtain, modify, and install the open-source model code:
+
+```
+git clone https://github.com/wenet-e2e/wenet.git
+cd wenet
+git reset 9c4e305bcc24a06932f6a65c8147429d8406cc63 --hard
+```
+
+3. Download the network weights and export ONNX
+
+Download the archive from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell/20210601_u2pp_conformer_exp.tar.gz, extract it, and place the extracted files under wenet/examples/aishell/s0/exp/conformer_u2; create this folder if it does not exist.
+
+Place all the provided diff files in the wenet root directory.
+Run patch -p1 < export_onnx.diff to adapt the ONNX export code.
+Place the provided export_onnx.py, process_encoder_data_noflash.py, recognize_attenstion_rescoring.py and static.py in wenet/wenet/bin/.
+Place the provided slice_helper.py and acl_net.py in wenet/wenet/transformer, and the provided sh scripts in wenet/examples/aishell/s0/.
+Run bash export_onnx.sh exp/conformer_u2/train.yaml exp/conformer_u2/final.pt; the ONNX files are exported to the onnx folder in the current directory.
+
+4. Run the scripts to convert the ONNX models to OM models
+
+First modify the graphs with the graph-editing tool om_gener (https://gitee.com/liurf_hw/om_gener). After installing it, run the following commands:
+
+python3 adaptdecoder.py generates decoder_final.onnx
+
+python3 adaptnoflashencoder.py generates no_flash_encoder_revise.onnx
+
+Configure the environment variables and convert the models to OM files with the atc tool; the commands are given in the provided decoder.sh and no_flash_encoder.sh scripts, which generate the corresponding OM files when run.
+
+5. Dataset download:
+
+   In wenet/examples/aishell/s0/, run bash run.sh --stage -1 --stop_stage -1 to download the dataset.
+
+   Run bash run.sh --stage 0 --stop_stage 0 to process the dataset.
+
+   Run bash run.sh --stage 1 --stop_stage 1 to process the dataset.
+
+   Run bash run.sh --stage 2 --stop_stage 2 to process the dataset.
+
+   Run bash run.sh --stage 3 --stop_stage 3 to process the dataset.
+
+## 2 Offline Inference
+
+Dynamic shape scenario:
+
+   Generate the encoder output data for the non-streaming scenario (cd to the wenet root directory):
+
+   ```
+   git checkout .
+   patch -p1 < get_no_flash_encoder_out.diff
+   cd examples/aishell/s0/
+   bash run_no_flash_encoder_out.sh
+   ```
+
+   In wenet/bin/process_encoder_data_noflash.py:
+   --bin_path   directory where the bin files produced by the encoder are saved
+   --model_path path of the non-streaming encoder OM model
+   --json_path  json file that stores the shape information of the encoder bin files
+   Obtain the decoder results for the non-streaming scenario (cd to the wenet root directory):
+
+   ```
+   git checkout .
+   patch -p1 < getwer.diff
+   cd examples/aishell/s0/
+   bash run_attention_rescoring.sh
+   ```
+
+   In wenet/bin/recognize_attenstion_rescoring.py:
+   --bin_path   bin files produced by the non-streaming encoder OM model, i.e. the bin file path from the previous step
+   --model_path path of the decoder OM model
+   --json_path  json file with the shape information of the encoder bin files, i.e. the json file from the previous step
+   Check the last few lines of wenet/examples/aishell/s0/exp/conformer/test_attention_rescoring/wer for the overall accuracy.
+   Run infer.py to obtain the fps performance figure.
+
+Static shape scenario (non-streaming only):
+
+Convert ONNX to OM:
+
+```
+bash static_encoder.sh
+bash static_decoder.sh
+```
+
+Accuracy test:
+
+First run export ASCEND_GLOBAL_LOG_LEVEL=3, and point self.encoder_ascend and self.decoder_ascend in acc.diff at the statically converted encoder and decoder OM models.
+```
+git checkout .
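+# Optional sanity check (assumes acc.diff was copied to the wenet root as
+# described in section 1): confirm the patch applies cleanly before applying it.
+git apply --check acc.diff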
+patch -p1 < acc.diff +cd examples/aishell/s0/ +bash static.sh +``` + +性能:在wenet/examples/aishell/s0/exp/conformer/test_attention_rescoring/text文件最后一行有fps性能数据 diff --git a/ACL_PyTorch/contrib/audio/WeNet/acc.diff b/ACL_PyTorch/contrib/audio/WeNet/acc.diff new file mode 100644 index 0000000000000000000000000000000000000000..b4d2d5ea3c3acf37998d2a0c8bd940ac21416ccf --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/acc.diff @@ -0,0 +1,358 @@ +diff --git a/wenet/dataset/dataset.py b/wenet/dataset/dataset.py +index 4f0ff39..4ce97a4 100644 +--- a/wenet/dataset/dataset.py ++++ b/wenet/dataset/dataset.py +@@ -27,7 +27,7 @@ import torchaudio.sox_effects as sox_effects + import yaml + from PIL import Image + from PIL.Image import BICUBIC +-from torch.nn.utils.rnn import pad_sequence ++#from torch.nn.utils.rnn import pad_sequence + from torch.utils.data import Dataset, DataLoader + + import wenet.dataset.kaldi_io as kaldi_io +@@ -36,7 +36,69 @@ from wenet.utils.common import IGNORE_ID + + torchaudio.set_audio_backend("sox_io") + ++def _pad_sequence(sequences, batch_first=False, padding_value=0, mul_shape = None): ++ r"""Pad a list of variable length Tensors with ``padding_value`` ++ ++ ``pad_sequence`` stacks a list of Tensors along a new dimension, ++ and pads them to equal length. For example, if the input is list of ++ sequences with size ``L x *`` and if batch_first is False, and ``T x B x *`` ++ otherwise. ++ ++ `B` is batch size. It is equal to the number of elements in ``sequences``. ++ `T` is length of the longest sequence. ++ `L` is length of the sequence. ++ `*` is any number of trailing dimensions, including none. ++ ++ Example: ++ >>> from torch.nn.utils.rnn import pad_sequence ++ >>> a = torch.ones(25, 300) ++ >>> b = torch.ones(22, 300) ++ >>> c = torch.ones(15, 300) ++ >>> pad_sequence([a, b, c]).size() ++ torch.Size([25, 3, 300]) ++ ++ Note: ++ This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` ++ where `T` is the length of the longest sequence. This function assumes ++ trailing dimensions and type of all the Tensors in sequences are same. ++ ++ Arguments: ++ sequences (list[Tensor]): list of variable length sequences. ++ batch_first (bool, optional): output will be in ``B x T x *`` if True, or in ++ ``T x B x *`` otherwise ++ padding_value (float, optional): value for padded elements. Default: 0. + ++ Returns: ++ Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. ++ Tensor of size ``B x T x *`` otherwise ++ """ ++ ++ # assuming trailing dimensions and type of all the Tensors ++ # in sequences are same and fetching those from sequences[0] ++ ++ max_size = sequences[0].size() ++ trailing_dims = max_size[1:] ++ ++ max_len = max([s.size(0) for s in sequences]) ++ if mul_shape is not None: ++ for in_shape in mul_shape: ++ if max_len < in_shape: ++ max_len = in_shape ++ break ++ if batch_first: ++ out_dims = (len(sequences), max_len) + trailing_dims ++ else: ++ out_dims = (max_len, len(sequences)) + trailing_dims ++ ++ out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) ++ for i, tensor in enumerate(sequences): ++ length = tensor.size(0) ++ # use index notation to prevent duplicate references to the tensor ++ if batch_first: ++ out_tensor[i, :length, ...] = tensor ++ else: ++ out_tensor[:length, i, ...] 
= tensor ++ return out_tensor + def _spec_augmentation(x, + warp_for_time=False, + num_t_mask=2, +@@ -187,6 +249,7 @@ def _extract_feature(batch, speed_perturb, wav_distortion_conf, + Returns: + (keys, feats, labels) + """ ++ + keys = [] + feats = [] + lengths = [] +@@ -331,13 +394,14 @@ class CollateFunc(object): + self.spec_sub = spec_sub + self.spec_sub_conf = spec_sub_conf + ++ ++ + def __call__(self, batch): + assert (len(batch) == 1) + if self.raw_wav: + keys, xs, ys = _extract_feature(batch[0], self.speed_perturb, + self.wav_distortion_conf, + self.feature_extraction_conf) +- + else: + keys, xs, ys = _load_feature(batch[0]) + +@@ -359,27 +423,31 @@ class CollateFunc(object): + if self.spec_aug: + xs = [_spec_augmentation(x, **self.spec_aug_conf) for x in xs] + +- # padding +- xs_lengths = torch.from_numpy( +- np.array([x.shape[0] for x in xs], dtype=np.int32)) ++ + + # pad_sequence will FAIL in case xs is empty ++ mul_shape = [262, 326, 390, 454, 518, 582, 646, 710, 774, 838, 902, 966, 1028, 1284, 1478] + if len(xs) > 0: +- xs_pad = pad_sequence([torch.from_numpy(x).float() for x in xs], +- True, 0) ++ xs_pad = _pad_sequence([torch.from_numpy(x).float() for x in xs], ++ True, 0, mul_shape) + else: + xs_pad = torch.Tensor(xs) ++ # padding ++ xs_lengths = torch.from_numpy( ++ np.array([x.shape[0] for x in xs_pad], dtype=np.int32)) ++ + if train_flag: + ys_lengths = torch.from_numpy( + np.array([y.shape[0] for y in ys], dtype=np.int32)) + if len(ys) > 0: +- ys_pad = pad_sequence([torch.from_numpy(y).int() for y in ys], ++ ys_pad = _pad_sequence([torch.from_numpy(y).int() for y in ys], + True, IGNORE_ID) + else: + ys_pad = torch.Tensor(ys) + else: + ys_pad = None + ys_lengths = None ++ + return keys, xs_pad, ys_pad, xs_lengths, ys_lengths + + +@@ -430,7 +498,6 @@ class AudioDataset(Dataset): + """ + assert batch_type in ['static', 'dynamic'] + data = [] +- + # Open in utf8 mode since meet encoding problem + with codecs.open(data_file, 'r', encoding='utf-8') as f: + for line in f: +diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py +index 73990fa..50358ca 100644 +--- a/wenet/transformer/asr_model.py ++++ b/wenet/transformer/asr_model.py +@@ -32,8 +32,74 @@ from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, + reverse_pad_list) + from wenet.utils.mask import (make_pad_mask, mask_finished_preds, + mask_finished_scores, subsequent_mask) ++from wenet.transformer.acl_net import Net ++import time ++import acl ++ ++def _pad_sequence(sequences, batch_first=False, padding_value=0, mul_shape = None): ++ r"""Pad a list of variable length Tensors with ``padding_value`` ++ ++ ``pad_sequence`` stacks a list of Tensors along a new dimension, ++ and pads them to equal length. For example, if the input is list of ++ sequences with size ``L x *`` and if batch_first is False, and ``T x B x *`` ++ otherwise. ++ ++ `B` is batch size. It is equal to the number of elements in ``sequences``. ++ `T` is length of the longest sequence. ++ `L` is length of the sequence. ++ `*` is any number of trailing dimensions, including none. ++ ++ Example: ++ >>> from torch.nn.utils.rnn import pad_sequence ++ >>> a = torch.ones(25, 300) ++ >>> b = torch.ones(22, 300) ++ >>> c = torch.ones(15, 300) ++ >>> pad_sequence([a, b, c]).size() ++ torch.Size([25, 3, 300]) ++ ++ Note: ++ This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` ++ where `T` is the length of the longest sequence. 
This function assumes ++ trailing dimensions and type of all the Tensors in sequences are same. ++ ++ Arguments: ++ sequences (list[Tensor]): list of variable length sequences. ++ batch_first (bool, optional): output will be in ``B x T x *`` if True, or in ++ ``T x B x *`` otherwise ++ padding_value (float, optional): value for padded elements. Default: 0. ++ ++ Returns: ++ Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. ++ Tensor of size ``B x T x *`` otherwise ++ """ ++ ++ # assuming trailing dimensions and type of all the Tensors ++ # in sequences are same and fetching those from sequences[0] ++ ++ max_size = sequences[0].size() ++ trailing_dims = max_size[1:] ++ ++ max_len = max([s.size(0) for s in sequences]) ++ if mul_shape is not None: ++ for in_shape in mul_shape: ++ if max_len < in_shape: ++ max_len = in_shape ++ break + +- ++ if batch_first: ++ out_dims = (len(sequences), max_len) + trailing_dims ++ else: ++ out_dims = (max_len, len(sequences)) + trailing_dims ++ ++ out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) ++ for i, tensor in enumerate(sequences): ++ length = tensor.size(0) ++ # use index notation to prevent duplicate references to the tensor ++ if batch_first: ++ out_tensor[i, :length, ...] = tensor ++ else: ++ out_tensor[:length, i, ...] = tensor ++ return out_tensor + class ASRModel(torch.nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + def __init__( +@@ -60,6 +126,13 @@ class ASRModel(torch.nn.Module): + self.reverse_weight = reverse_weight + + self.encoder = encoder ++ self.device_id = 0 ++ ret = acl.init() ++ ret = acl.rt.set_device(self.device_id) ++ context, ret = acl.rt.create_context(self.device_id) ++ self.encoder_ascend = Net(model_path="/home/zry2/wenet/examples/aishell/s0/onnx/encoder_fendang_262_1478_static.om", device_id=self.device_id) ++ self.decoder_ascend = Net(model_path="/home/zry2/wenet/examples/aishell/s0/onnx/decoder_fendang.om", device_id=self.device_id) ++ self.encoder_out_shape = [] + self.decoder = decoder + self.ctc = ctc + self.criterion_att = LabelSmoothingLoss( +@@ -168,13 +241,21 @@ class ASRModel(torch.nn.Module): + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + else: +- encoder_out, encoder_mask = self.encoder( +- speech, +- speech_lengths, +- decoding_chunk_size=decoding_chunk_size, +- num_decoding_left_chunks=num_decoding_left_chunks +- ) # (B, maxlen, encoder_dim) +- return encoder_out, encoder_mask ++ st = time.time() ++ ++ # encoder_out, encoder_mask = self.encoder( ++ # speech, ++ # speech_lengths, ++ # decoding_chunk_size=decoding_chunk_size, ++ # num_decoding_left_chunks=num_decoding_left_chunks ++ # ) # (B, maxlen, encoder_dim) ++ speech = speech.numpy() ++ speech_lengths = speech_lengths.numpy().astype("int32") ++ dims1 = {'dimCount': 4, 'name': '', 'dims': [1, speech.shape[1], 80, 1]} ++ y, exe_time = self.encoder_ascend([speech, speech_lengths], dims = dims1) ++ encoder_out = torch.from_numpy(y[0]) ++ encoder_mask = torch.from_numpy(y[1]) ++ return encoder_out, encoder_mask, exe_time + + def recognize( + self, +@@ -361,13 +442,17 @@ class ASRModel(torch.nn.Module): + assert batch_size == 1 + # Let's assume B = batch_size and N = beam_size + # 1. 
Encoder forward and get CTC score +- encoder_out, encoder_mask = self._forward_encoder( ++ encoder_out, encoder_mask, encoder_t = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) ++ mul_shape = [96, 144, 384] ++ ++ encoder_out = _pad_sequence(encoder_out, True, 0, mul_shape) + ctc_probs = self.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) ++ + ctc_probs = ctc_probs.squeeze(0) + # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) + cur_hyps = [(tuple(), (0.0, -float('inf')))] +@@ -409,7 +494,7 @@ class ASRModel(torch.nn.Module): + reverse=True) + cur_hyps = next_hyps[:beam_size] + hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] +- return hyps, encoder_out ++ return hyps, encoder_out, encoder_t + + def ctc_prefix_beam_search( + self, +@@ -485,7 +570,7 @@ class ASRModel(torch.nn.Module): + # For attention rescoring we only support batch_size=1 + assert batch_size == 1 + # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size +- hyps, encoder_out = self._ctc_prefix_beam_search( ++ hyps, encoder_out, encoder_t = self._ctc_prefix_beam_search( + speech, speech_lengths, beam_size, decoding_chunk_size, + num_decoding_left_chunks, simulate_streaming) + +@@ -510,9 +595,19 @@ class ASRModel(torch.nn.Module): + r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) + r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, + self.ignore_id) +- decoder_out, r_decoder_out, _ = self.decoder( +- encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, +- reverse_weight) # (beam_size, max_hyps_len, vocab_size) ++ ++ encoder_out = encoder_out.numpy() ++ encoder_mask = encoder_mask.numpy() ++ hyps_pad = hyps_pad.numpy() ++ hyps_lens = hyps_lens.numpy().astype("int32") ++ r_hyps_pad = r_hyps_pad.numpy() ++ dims2 = {'dimCount': 11, 'name': '', 'dims': [10, encoder_out.shape[1], 256, 10, 1, encoder_out.shape[1], 10, r_hyps_pad.shape[1], 10, 10, r_hyps_pad.shape[1]]} ++ ++ y, exe_time = self.decoder_ascend([encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad], dims=dims2) ++ batch_t = encoder_t + exe_time ++ decoder_out = torch.from_numpy(y[0]) ++ r_decoder_out = torch.from_numpy(y[1]) ++ + decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) + decoder_out = decoder_out.cpu().numpy() + # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a +@@ -539,7 +634,7 @@ class ASRModel(torch.nn.Module): + if score > best_score: + best_score = score + best_index = i +- return hyps[best_index][0] ++ return hyps[best_index][0], batch_t + + @torch.jit.export + def subsampling_rate(self) -> int: +diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py +index e342ed4..c8e18d5 100644 +--- a/wenet/transformer/encoder.py ++++ b/wenet/transformer/encoder.py +@@ -157,6 +157,8 @@ class BaseEncoder(torch.nn.Module): + decoding_chunk_size, + self.static_chunk_size, + num_decoding_left_chunks) ++ ++ + for layer in self.encoders: + xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + if self.normalize_before: diff --git a/ACL_PyTorch/contrib/audio/WeNet/acl_net.py b/ACL_PyTorch/contrib/audio/WeNet/acl_net.py new file mode 100644 index 0000000000000000000000000000000000000000..db5cc314aadb64531bef5b8a8d8fcc268b3ed84d --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/acl_net.py @@ -0,0 +1,391 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. 
+# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ + + +import numpy as np +import acl +import functools +import time + +# error code +ACL_ERROR_NONE = 0 + +# memory malloc code +ACL_MEM_MALLOC_HUGE_FIRST = 0 +ACL_MEM_MALLOC_HUGE_ONLY = 1 +ACL_MEM_MALLOC_NORMAL_ONLY = 2 + +# memory copy code +ACL_MEMCPY_HOST_TO_HOST = 0 +ACL_MEMCPY_HOST_TO_DEVICE = 1 +ACL_MEMCPY_DEVICE_TO_HOST = 2 +ACL_MEMCPY_DEVICE_TO_DEVICE = 3 + +# format +ACL_FORMAT_NCHW = 0 +ACL_DTYPE = { + 0: 'float32', + 1: 'float16', + 2: 'int8', + 3: 'int32', + 4: 'uint8', + 6: 'int16', + 7: 'uint16', + 8: 'uint32', + 9: 'int64', + 10: 'uint64', + 11: 'float64', + 12: 'bool', +} + +ACL_DTYPE_INDEX = { + 'float32': 0, + 'float16': 1, + 'int8': 2, + 'int32': 3, + 'uint8': 4, + 'int16': 6, + 'uint16': 7, + 'uint32': 8, + 'int64': 9, + 'uint64': 10, + 'float64': 11, + 'bool': 12, +} + + +def check_ret(message, ret): + if ret != ACL_ERROR_NONE: + raise Exception("{} failed ret = {}".format(message, ret)) + + +def check_input_type(input_type, model_input_type): + for i in range(len(input_type)): + if ACL_DTYPE_INDEX.get(input_type[i]) != model_input_type[i]: + raise Exception("real input {} input_type:{} model_input_type:{} not same".format(i, input_type[i], \ + ACL_DTYPE.get(model_input_type[i]))) + + +class Net(object): + def __init__(self, model_path, device_id, check_input=False, output_data_shape=None): + self.check_input = check_input + self.dynamic = False + self.device_id = device_id + self.model_path = model_path + self.model_id = None + # if self.ascend_mbatch_shape_data = True, the model is static with multi input shape + self.ascend_mbatch_shape_data = False + self.input_data_type = [] + self.model_input_data_type = [] + self.model_input_data_format = [] + self.model_output_data_type = [] + self.output_data_shape = output_data_shape + self.output_shape = [] + self.buffer_method = { + "in": acl.mdl.get_input_size_by_index, + "out": acl.mdl.get_output_size_by_index, + "outhost": 
acl.mdl.get_output_size_by_index + } + + self.input_data = [] + self.output_data = [] + self.output_data_host = [] + self.model_desc = None + self.load_input_dataset = None + self.load_output_dataset = None + self.input_size = None + self.output_size = None + self.exe_t = 0 + self._init_resource() + + def __call__(self, ori_data, dims=None): + return self.forward(ori_data, dims) + + def __del__(self): + ret = acl.mdl.unload(self.model_id) + check_ret("acl.mdl.unload", ret) + if self.model_desc: + acl.mdl.destroy_desc(self.model_desc) + self.model_desc = None + if not self.dynamic: + self._release_data_buffer() + + def _release_data_buffer(self): + while self.input_data: + item = self.input_data.pop() + ret = acl.rt.free(item["buffer"]) + check_ret("acl.rt.free", ret) + + while self.output_data: + item = self.output_data.pop() + ret = acl.rt.free(item["buffer"]) + check_ret("acl.rt.free", ret) + + while self.output_data_host: + item = self.output_data_host.pop() + ret = acl.rt.free_host(item["buffer"]) + check_ret("acl.rt.free_host", ret) + + def _init_resource(self): + # load_model + self.model_id, ret = acl.mdl.load_from_file(self.model_path) + check_ret("acl.mdl.load_from_file", ret) + + self.model_desc = acl.mdl.create_desc() + self._get_model_info() + + def _get_model_info(self): + ret = acl.mdl.get_desc(self.model_desc, self.model_id) + check_ret("acl.mdl.get_desc", ret) + self.input_size = acl.mdl.get_num_inputs(self.model_desc) + # get the input format, data_type and get the model static or not + for i in range(self.input_size): + data_type = acl.mdl.get_input_data_type(self.model_desc, i) + self.model_input_data_type.append(data_type) + data_format = acl.mdl.get_input_format(self.model_desc, i) + self.model_input_data_format.append(data_format) + dims_input, ret = acl.mdl.get_input_dims(self.model_desc, i) + # check if the model has ascend_mbatch_shape_data + if i == self.input_size - 1 and dims_input["name"] == "ascend_mbatch_shape_data": + self.dynamic = False + self.ascend_mbatch_shape_data = True + elif -1 in dims_input["dims"]: + self.dynamic = True + self.output_size = acl.mdl.get_num_outputs(self.model_desc) + for j in range(self.output_size): + data_type = acl.mdl.get_output_data_type(self.model_desc, j) + self.model_output_data_type.append(data_type) + dims_output, ret = acl.mdl.get_output_dims(self.model_desc, j) + if -1 in dims_output["dims"]: + self.dynamic = True + if self.output_data_shape is None and self.dynamic: + self.output_data_shape = 500000000 + if not self.dynamic: + self._prepare_data_buffer_in() + self._prepare_data_buffer_out() + self._prepare_data_buffer_host() + + def _gen_data_buffer(self, size, des, data=None): + func = self.buffer_method[des] + for i in range(size): + if not self.dynamic: + temp_buffer_size = func(self.model_desc, i) + else: + if des == "in": + input_size = np.prod(np.array(data[i]).shape) + temp_buffer_size = Net.gen_data_size(input_size, dtype=ACL_DTYPE.get(self.model_input_data_type[i])) + elif des == "out": + temp_buffer_size = Net.gen_data_size(data, dtype=ACL_DTYPE.get(self.model_output_data_type[i])) + + temp_buffer, ret = acl.rt.malloc(temp_buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) + check_ret("acl.rt.malloc", ret) + acl.rt.memset(temp_buffer, temp_buffer_size, 0, temp_buffer_size) + if des == "in": + self.input_data.append({"buffer": temp_buffer, + "size": temp_buffer_size}) + elif des == "out": + self.output_data.append({"buffer": temp_buffer, + "size": temp_buffer_size}) + + def _gen_dataset_output_host(self, size, des, 
data=None): + func = self.buffer_method[des] + for i in range(size): + if not self.dynamic: + temp_buffer_size = func(self.model_desc, i) + else: + temp_buffer_size = Net.gen_data_size(data, ACL_DTYPE.get(self.model_output_data_type[i])) + temp_buffer, ret = acl.rt.malloc_host(temp_buffer_size) + check_ret("acl.rt.malloc_host", ret) + + self.output_data_host.append({"buffer": temp_buffer, + "size": temp_buffer_size}) + + def _data_interaction(self, dataset, policy=ACL_MEMCPY_HOST_TO_DEVICE): + + temp_data_buffer = self.input_data \ + if policy == ACL_MEMCPY_HOST_TO_DEVICE \ + else self.output_data + if len(dataset) == 0 and policy == ACL_MEMCPY_DEVICE_TO_HOST: + dataset = self.output_data_host + for i in range(len(dataset)): + if policy == ACL_MEMCPY_HOST_TO_DEVICE: + ptr = acl.util.numpy_to_ptr(dataset[i]) + if self.ascend_mbatch_shape_data: + malloc_size = dataset[i].size * dataset[i].itemsize + else: + malloc_size = temp_data_buffer[i]["size"] + ret = acl.rt.memcpy(temp_data_buffer[i]["buffer"], malloc_size, ptr, malloc_size, policy) + check_ret("acl.rt.memcpy", ret) + + else: + ptr = dataset[i]["buffer"] + ret = acl.rt.memcpy(ptr, temp_data_buffer[i]["size"], temp_data_buffer[i]["buffer"], + temp_data_buffer[i]["size"], policy) + check_ret("acl.rt.memcpy", ret) + + def _gen_dataset(self, type_str="input", input_shapes=None): + dataset = acl.mdl.create_dataset() + temp_dataset = None + if type_str == "in": + self.load_input_dataset = dataset + temp_dataset = self.input_data + + else: + self.load_output_dataset = dataset + temp_dataset = self.output_data + + for i, item in enumerate(temp_dataset): + data = acl.create_data_buffer(item["buffer"], item["size"]) + if data is None: + ret = acl.destroy_data_buffer(dataset) + check_ret("acl.destroy_data_buffer", ret) + + _, ret = acl.mdl.add_dataset_buffer(dataset, data) + if ret != ACL_ERROR_NONE: + ret = acl.destroy_data_buffer(dataset) + check_ret("acl.destroy_data_buffer", ret) + + if type_str == "in" and not self.ascend_mbatch_shape_data: + # set dynamic dataset tensor desc + input_shape = input_shapes[i] + input_desc = acl.create_tensor_desc(self.model_input_data_type[i], input_shape, + self.model_input_data_format[i]) + dataset, ret = acl.mdl.set_dataset_tensor_desc(dataset, input_desc, i) + if ret != ACL_ERROR_NONE: + ret = acl.destroy_data_buffer(dataset) + check_ret("acl.destroy_data_buffer", ret) + + def _data_from_host_to_device(self, images): + self._data_interaction(images, ACL_MEMCPY_HOST_TO_DEVICE) + input_shapes = [list(data.shape) for data in images] + self._gen_dataset("in", input_shapes) + self._gen_dataset("out") + + def _data_from_device_to_host(self, input_data, output_shape): + res = [] + self._data_interaction(res, ACL_MEMCPY_DEVICE_TO_HOST) + output = self.get_result(self.output_data_host, input_data, output_shape) + return output + + def _get_output_shape(self): + output_shape = [] + num = acl.mdl.get_dataset_num_buffers(self.load_output_dataset) + for output_index in range(num): + if self.dynamic: + outpu_desc = acl.mdl.get_dataset_tensor_desc(self.load_output_dataset, output_index) + temp_output_shape = [] + dim_nums = acl.get_tensor_desc_num_dims(outpu_desc) + for i in range(dim_nums): + dim, ret = acl.get_tensor_desc_dim_v2(outpu_desc, i) + temp_output_shape.append(dim) + output_shape.append(temp_output_shape) + else: + dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, output_index) + data_shape = dims.get("dims") + output_shape.append(data_shape) + + return output_shape + + def _destroy_databuffer(self): + 
for dataset in [self.load_input_dataset, self.load_output_dataset]: + if not dataset: + continue + + num = acl.mdl.get_dataset_num_buffers(dataset) + for i in range(num): + data_buf = acl.mdl.get_dataset_buffer(dataset, i) + if data_buf: + ret = acl.destroy_data_buffer(data_buf) + check_ret("acl.destroy_data_buffer", ret) + ret = acl.mdl.destroy_dataset(dataset) + check_ret("acl.mdl.destroy_dataset", ret) + + def _prepare_data_buffer_in(self, input_data=None): + self._gen_data_buffer(self.input_size, des="in", data=input_data) + + def _prepare_data_buffer_out(self, input_data=None): + self._gen_data_buffer(self.output_size, des="out", data=input_data) + + def _prepare_data_buffer_host(self, input_data=None): + self._gen_dataset_output_host(self.output_size, des="outhost", data=input_data) + + def forward(self, input_data, dims=None): + if not isinstance(input_data, (list, tuple)): + input_data = [input_data] + if self.check_input: + self.input_data_type = [] + for data in input_data: + self.input_data_type.append(str(data.dtype)) + check_input_type(self.input_data_type, self.model_input_data_type) + if self.dynamic: + self._prepare_data_buffer_in(input_data) + self._prepare_data_buffer_out(self.output_data_shape) + self._prepare_data_buffer_host(self.output_data_shape) + self._data_from_host_to_device(input_data) + + if self.ascend_mbatch_shape_data: + if dims is None: + raise Exception("the model is static multi shape model, dims can not be None") + index, ret = acl.mdl.get_input_index_by_name(self.model_desc, 'ascend_mbatch_shape_data') + ret = acl.mdl.set_input_dynamic_dims(self.model_id, self.load_input_dataset, index, dims) + check_ret("acl.mdl.set_input_dynamic_dims", ret) + st = time.time() + ret = acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset) + self.exe_t = time.time() - st + check_ret("acl.mdl.execute", ret) + # get output shape + output_shape = self._get_output_shape() + self._destroy_databuffer() + result = self._data_from_device_to_host(input_data=input_data, output_shape=output_shape) + if self.dynamic: + self._release_data_buffer() + return result + + def get_result(self, output_data, data, output_shape): + dataset = [] + for i in range(len(output_data)): + # fix dynamic batch size + data_type = acl.mdl.get_output_data_type(self.model_desc, i) + data_len = functools.reduce(lambda x, y: x * y, output_shape[i]) + ftype = np.dtype(ACL_DTYPE.get(data_type)) + size = output_data[i]["size"] + ptr = output_data[i]["buffer"] + data = acl.util.ptr_to_numpy(ptr, (size,), 1) + np_array = np.frombuffer(bytearray(data[:data_len * ftype.itemsize]), dtype=ftype, count=data_len) + np_array = np_array.reshape(output_shape[i]) + dataset.append(np_array) + return dataset, self.exe_t * 1000 + + @staticmethod + def gen_data_size(size, dtype): + dtype = np.dtype(dtype) + return int(size * dtype.itemsize) diff --git a/ACL_PyTorch/contrib/audio/WeNet/adaptdecoder.py b/ACL_PyTorch/contrib/audio/WeNet/adaptdecoder.py new file mode 100644 index 0000000000000000000000000000000000000000..deca1909985a784b5e37562a751cd12e5103b530 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/adaptdecoder.py @@ -0,0 +1,96 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. 
+# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ + +from gener_core.mod_modify.onnx_graph import OXGraph +from gener_core.mod_modify.onnx_node import OXNode +from gener_core.mod_modify.interface import AttrType as AT + +from gener_core.mod_modify.onnx_graph import OXGraph +from gener_core.mod_modify.onnx_node import OXNode +from gener_core.mod_modify.interface import AttrType as AT + +mod = OXGraph("decoder.onnx") +Expand_lists = mod.get_nodes_by_optype("Expand") +for i in range(len(Expand_lists)): + now_expand = mod.get_node(Expand_lists[i]) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_first_input_now = mod.get_node(now_expand.input_name[0]) + now_expand.set_input_node(0, [cast_node]) + cast_node.set_input_node(0, [Expand_first_input_now]) + +Less_lists = mod.get_nodes_by_optype("Less") +for i in range(len(Less_lists)): + now_expand = mod.get_node(Less_lists[i]) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_second_input_now = mod.get_node(now_expand.input_name[1]) + now_expand.set_input_node(1, [cast_node]) + cast_node.set_input_node(0, [Expand_second_input_now]) + +Greater_lists = mod.get_nodes_by_optype("Greater") +for greater_node in Greater_lists: + now_expand = mod.get_node(greater_node) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_second_input_now = mod.get_node(now_expand.input_name[1]) + now_expand.set_input_node(1, [cast_node]) + cast_node.set_input_node(0, [Expand_second_input_now]) + +not_change_cast = [] +Range_lists = mod.get_nodes_by_optype("Range") +for range_node in Range_lists: + now_expand = mod.get_node(range_node) + Expand_first_input_now = mod.get_node(now_expand.input_name[1]) + not_change_cast.append(Expand_first_input_now.name) + +to = 6 +Cast = mod.get_nodes_by_optype("Cast") +for cast_node in Cast: + now_Cast = mod.get_node(cast_node) + if 
now_Cast.get_attr("to", AT.INT) == 7 and now_Cast.name not in not_change_cast: + now_Cast.set_attr({"to": (AT.INT, to)}) + +Equal = mod.get_nodes_by_optype("Equal") +for equal_node in Equal: + now_equal = mod.get_node(equal_node) + now_ends = mod.get_node(now_equal.input_name[1]) + if now_ends.op_type in ("Initializer", "Constant") and now_ends.const_value.dtype == "int64": + print("now_ends.dtype:", now_ends.const_value.dtype) + val = now_ends.const_value.astype("int32") + now_ends.set_const_value(val) + +mod.save_new_model("decoder_final.onnx") + diff --git a/ACL_PyTorch/contrib/audio/WeNet/adaptnoflashencoder.py b/ACL_PyTorch/contrib/audio/WeNet/adaptnoflashencoder.py new file mode 100644 index 0000000000000000000000000000000000000000..8d1675b1d6d6ce9327a4066297817b1914f47610 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/adaptnoflashencoder.py @@ -0,0 +1,81 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ============================================================================ + +from gener_core.mod_modify.onnx_graph import OXGraph +from gener_core.mod_modify.onnx_node import OXNode +from gener_core.mod_modify.interface import AttrType as AT +import numpy as np + +mod = OXGraph("no_flash_encoder.onnx") +Expand_lists = mod.get_nodes_by_optype("Less") +for i in range(len(Expand_lists)): + now_expand = mod.get_node(Expand_lists[i]) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_first_input_now = mod.get_node(now_expand.input_name[1]) + now_expand.set_input_node(1, [cast_node]) + cast_node.set_input_node(0, [Expand_first_input_now]) + +Equal = mod.get_nodes_by_optype("Equal") +for equal_node in Equal: + now_equal = mod.get_node(equal_node) + now_ends = mod.get_node(now_equal.input_name[1]) + if now_ends.op_type in ("Initializer", "Constant") and now_ends.const_value.dtype == "int64": + print("now_ends.dtype:", now_ends.const_value.dtype) + val = now_ends.const_value.astype("int32") + now_ends.set_const_value(val) + +Expand_lists = ["Expand_20"] +for expand_node in Expand_lists: + now_expand = mod.get_node(expand_node) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_first_input_now = mod.get_node(now_expand.input_name[0]) + now_expand.set_input_node(0, [cast_node]) + cast_node.set_input_node(0, [Expand_first_input_now]) + +not_change_cast = [] +Range_lists = mod.get_nodes_by_optype("Range") +for range_node in Range_lists: + now_expand = mod.get_node(range_node) + Expand_first_input_now = mod.get_node(now_expand.input_name[1]) + not_change_cast.append(Expand_first_input_now.name) + +to = 6 +Cast = mod.get_nodes_by_optype("Cast") +for i in range(len(Cast)): + now_Cast = mod.get_node(Cast[i]) + if now_Cast.get_attr("to", AT.INT) == 7 and now_Cast.name not in not_change_cast: + now_Cast.set_attr({"to": (AT.INT, to)}) +mod.save_new_model("no_flash_encoder_revise.onnx") diff --git a/ACL_PyTorch/contrib/audio/WeNet/asr_model.py b/ACL_PyTorch/contrib/audio/WeNet/asr_model.py new file mode 100644 index 0000000000000000000000000000000000000000..11f16147d8a43df03edfdd4bb310383f0c701102 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/asr_model.py @@ -0,0 +1,720 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
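+
+# Note: in this variant of asr_model.py the encoder and decoder modules are
+# expected to return their execution time as an extra value; _forward_encoder
+# and _ctc_prefix_beam_search propagate encoder_t, and attention_rescoring
+# returns (best_hyp, encoder_t + decoder_t) so the caller can compute fps.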
+ +from collections import defaultdict +from typing import List, Optional, Tuple + +import torch + +from torch.nn.utils.rnn import pad_sequence + +from wenet.transformer.cmvn import GlobalCMVN +from wenet.transformer.ctc import CTC +from wenet.transformer.decoder import (TransformerDecoder, + BiTransformerDecoder) +from wenet.transformer.encoder import ConformerEncoder +from wenet.transformer.encoder import TransformerEncoder +from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss +from wenet.utils.cmvn import load_cmvn +from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, + remove_duplicates_and_blank, th_accuracy, + reverse_pad_list) +from wenet.utils.mask import (make_pad_mask, mask_finished_preds, + mask_finished_scores, subsequent_mask) + +class ASRModel(torch.nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + def __init__( + self, + vocab_size: int, + encoder: TransformerEncoder, + decoder: TransformerDecoder, + ctc: CTC, + ctc_weight: float = 0.5, + ignore_id: int = IGNORE_ID, + reverse_weight: float = 0.0, + lsm_weight: float = 0.0, + length_normalized_loss: bool = False, + ): + assert 0.0 <= ctc_weight <= 1.0, ctc_weight + + super().__init__() + # note that eos is the same as sos (equivalent ID) + self.sos = vocab_size - 1 + self.eos = vocab_size - 1 + self.vocab_size = vocab_size + self.ignore_id = ignore_id + self.ctc_weight = ctc_weight + self.reverse_weight = reverse_weight + + self.encoder = encoder + self.decoder = decoder + self.ctc = ctc + self.criterion_att = LabelSmoothingLoss( + size=vocab_size, + padding_idx=ignore_id, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, + ) + + def forward( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + text: torch.Tensor, + text_lengths: torch.Tensor, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor]]: + """Frontend + Encoder + Decoder + Calc loss + + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + """ + print("***") + assert text_lengths.dim() == 1, text_lengths.shape + # Check that batch_size is unified + assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == + text_lengths.shape[0]), (speech.shape, speech_lengths.shape, + text.shape, text_lengths.shape) + # 1. Encoder + encoder_out, encoder_mask = self.encoder(speech, speech_lengths) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + + # 2a. Attention-decoder branch + if self.ctc_weight != 1.0: + loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, + text, text_lengths) + else: + loss_att = None + + # 2b. CTC branch + if self.ctc_weight != 0.0: + loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, + text_lengths) + else: + loss_ctc = None + + if loss_ctc is None: + loss = loss_att + elif loss_att is None: + loss = loss_ctc + else: + loss = self.ctc_weight * loss_ctc + (1 - + self.ctc_weight) * loss_att + return loss, loss_att, loss_ctc + + def _calc_att_loss( + self, + encoder_out: torch.Tensor, + encoder_mask: torch.Tensor, + ys_pad: torch.Tensor, + ys_pad_lens: torch.Tensor, + ) -> Tuple[torch.Tensor, float]: + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, + self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # reverse the seq, used for right to left decoder + r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) + r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, + self.ignore_id) + # 1. 
Forward decoder + decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, + ys_in_pad, ys_in_lens, + r_ys_in_pad, + self.reverse_weight) + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + r_loss_att = torch.tensor(0.0) + if self.reverse_weight > 0.0: + r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) + loss_att = loss_att * ( + 1 - self.reverse_weight) + r_loss_att * self.reverse_weight + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, + ) + return loss_att, acc_att + + def _forward_encoder( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Let's assume B = batch_size + # 1. Encoder + if simulate_streaming and decoding_chunk_size > 0: + encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( + speech, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + else: + encoder_out, encoder_mask, encoder_t = self.encoder( + speech, + speech_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + return encoder_out, encoder_mask, encoder_t + + def recognize( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + beam_size: int = 10, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> torch.Tensor: + """ Apply beam search on attention decoder + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + + Returns: + torch.Tensor: decoding result, (batch, max_result_len) + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + device = speech.device + batch_size = speech.shape[0] + + # Let's assume B = batch_size and N = beam_size + # 1. Encoder + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + encoder_dim = encoder_out.size(2) + running_size = batch_size * beam_size + encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( + running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) + encoder_mask = encoder_mask.unsqueeze(1).repeat( + 1, beam_size, 1, 1).view(running_size, 1, + maxlen) # (B*N, 1, max_len) + + hyps = torch.ones([running_size, 1], dtype=torch.long, + device=device).fill_(self.sos) # (B*N, 1) + scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), + dtype=torch.float) + scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( + device) # (B*N, 1) + end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) + cache: Optional[List[torch.Tensor]] = None + # 2. 
Decoder forward step by step + for i in range(1, maxlen + 1): + # Stop if all batch and all beam produce eos + if end_flag.sum() == running_size: + break + # 2.1 Forward decoder step + hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( + running_size, 1, 1).to(device) # (B*N, i, i) + # logp: (B*N, vocab) + logp, cache = self.decoder.forward_one_step( + encoder_out, encoder_mask, hyps, hyps_mask, cache) + # 2.2 First beam prune: select topk best prob at current time + top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) + top_k_logp = mask_finished_scores(top_k_logp, end_flag) + top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) + # 2.3 Second beam prune: select topk score with history + scores = scores + top_k_logp # (B*N, N), broadcast add + scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) + scores, offset_k_index = scores.topk(k=beam_size) # (B, N) + scores = scores.view(-1, 1) # (B*N, 1) + # 2.4. Compute base index in top_k_index, + # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), + # then find offset_k_index in top_k_index + base_k_index = torch.arange(batch_size, device=device).view( + -1, 1).repeat([1, beam_size]) # (B, N) + base_k_index = base_k_index * beam_size * beam_size + best_k_index = base_k_index.view(-1) + offset_k_index.view( + -1) # (B*N) + + # 2.5 Update best hyps + best_k_pred = torch.index_select(top_k_index.view(-1), + dim=-1, + index=best_k_index) # (B*N) + best_hyps_index = best_k_index // beam_size + last_best_k_hyps = torch.index_select( + hyps, dim=0, index=best_hyps_index) # (B*N, i) + hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), + dim=1) # (B*N, i+1) + + # 2.6 Update end flag + end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) + + # 3. Select best of best + scores = scores.view(batch_size, beam_size) + # TODO: length normalization + best_index = torch.argmax(scores, dim=-1).long() + best_hyps_index = best_index + torch.arange( + batch_size, dtype=torch.long, device=device) * beam_size + best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) + best_hyps = best_hyps[:, 1:] + return best_hyps + + def ctc_greedy_search( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> List[List[int]]: + """ Apply CTC greedy search + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[List[int]]: best path result + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + batch_size = speech.shape[0] + # Let's assume B = batch_size + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + ctc_probs = self.ctc.log_softmax( + encoder_out) # (B, maxlen, vocab_size) + topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) + topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) + mask = make_pad_mask(encoder_out_lens) # (B, maxlen) + topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) + hyps = [hyp.tolist() for hyp in topk_index] + hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] + return hyps + + def _ctc_prefix_beam_search( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + beam_size: int, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> Tuple[List[List[int]], torch.Tensor]: + """ CTC prefix beam search inner implementation + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + + Returns: + List[List[int]]: nbest results + torch.Tensor: encoder output, (1, max_len, encoder_dim), + it will be used for rescoring in attention rescoring mode + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + batch_size = speech.shape[0] + # For CTC prefix beam search, we only support batch_size=1 + assert batch_size == 1 + # Let's assume B = batch_size and N = beam_size + # 1. Encoder forward and get CTC score + encoder_out, encoder_mask, encoder_t = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + ctc_probs = self.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) + ctc_probs = ctc_probs.squeeze(0) + # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) + cur_hyps = [(tuple(), (0.0, -float('inf')))] + # 2. 
CTC beam search step by step + for t in range(0, maxlen): + logp = ctc_probs[t] # (vocab_size,) + # key: prefix, value (pb, pnb), default value(-inf, -inf) + next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) + # 2.1 First beam prune: select topk best + top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) + for s in top_k_index: + s = s.item() + ps = logp[s].item() + for prefix, (pb, pnb) in cur_hyps: + last = prefix[-1] if len(prefix) > 0 else None + if s == 0: # blank + n_pb, n_pnb = next_hyps[prefix] + n_pb = log_add([n_pb, pb + ps, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + elif s == last: + # Update *ss -> *s; + n_pb, n_pnb = next_hyps[prefix] + n_pnb = log_add([n_pnb, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + # Update *s-s -> *ss, - is for blank + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + else: + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + + # 2.2 Second beam prune + next_hyps = sorted(next_hyps.items(), + key=lambda x: log_add(list(x[1])), + reverse=True) + cur_hyps = next_hyps[:beam_size] + hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] + return hyps, encoder_out, encoder_t + + def ctc_prefix_beam_search( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + beam_size: int, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> List[int]: + """ Apply CTC prefix beam search + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + + Returns: + List[int]: CTC prefix beam search nbest results + """ + hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, + beam_size, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) + return hyps[0][0] + + def attention_rescoring( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + beam_size: int, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + ctc_weight: float = 0.0, + simulate_streaming: bool = False, + reverse_weight: float = 0.0, + ) -> List[int]: + """ Apply attention rescoring decoding, CTC prefix beam search + is applied first to get nbest, then we resoring the nbest on + attention decoder with corresponding encoder out + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + reverse_weight (float): right to left decoder weight + ctc_weight (float): ctc score weight + + Returns: + List[int]: Attention rescoring result + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + if reverse_weight > 0.0: + # decoder should be a bitransformer decoder if reverse_weight > 0.0 + assert hasattr(self.decoder, 'right_decoder') + device = speech.device + batch_size = speech.shape[0] + # For attention rescoring we only support batch_size=1 + assert batch_size == 1 + # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size + hyps, encoder_out, encoder_t = self._ctc_prefix_beam_search( + speech, speech_lengths, beam_size, decoding_chunk_size, + num_decoding_left_chunks, simulate_streaming) + + assert len(hyps) == beam_size + hyps_pad = pad_sequence([ + torch.tensor(hyp[0], device=device, dtype=torch.long) + for hyp in hyps + ], True, self.ignore_id) # (beam_size, max_hyps_len) + ori_hyps_pad = hyps_pad + hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], + device=device, + dtype=torch.long) # (beam_size,) + hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) + hyps_lens = hyps_lens + 1 # Add at begining + encoder_out = encoder_out.repeat(beam_size, 1, 1) + encoder_mask = torch.ones(beam_size, + 1, + encoder_out.size(1), + dtype=torch.bool, + device=device) + # used for right to left decoder + r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) + r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, + self.ignore_id) + + decoder_out, r_decoder_out, _, decoder_t = self.decoder( + encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, + reverse_weight) # (beam_size, max_hyps_len, vocab_size) + + decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) + decoder_out = decoder_out.cpu().numpy() + # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a + # conventional transformer decoder. 
+ r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) + r_decoder_out = r_decoder_out.cpu().numpy() + # Only use decoder score for rescoring + best_score = -float('inf') + best_index = 0 + for i, hyp in enumerate(hyps): + score = 0.0 + for j, w in enumerate(hyp[0]): + score += decoder_out[i][j][w] + score += decoder_out[i][len(hyp[0])][self.eos] + # add right to left decoder score + if reverse_weight > 0: + r_score = 0.0 + for j, w in enumerate(hyp[0]): + r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] + r_score += r_decoder_out[i][len(hyp[0])][self.eos] + score = score * (1 - reverse_weight) + r_score * reverse_weight + # add ctc score + score += hyp[1] * ctc_weight + if score > best_score: + best_score = score + best_index = i + return hyps[best_index][0], encoder_t+decoder_t + + @torch.jit.export + def subsampling_rate(self) -> int: + """ Export interface for c++ call, return subsampling_rate of the + model + """ + return self.encoder.embed.subsampling_rate + + @torch.jit.export + def right_context(self) -> int: + """ Export interface for c++ call, return right_context of the model + """ + return self.encoder.embed.right_context + + @torch.jit.export + def sos_symbol(self) -> int: + """ Export interface for c++ call, return sos symbol id of the model + """ + return self.sos + + @torch.jit.export + def eos_symbol(self) -> int: + """ Export interface for c++ call, return eos symbol id of the model + """ + return self.eos + + @torch.jit.export + def forward_encoder_chunk( + self, + xs: torch.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[torch.Tensor] = None, + elayers_output_cache: Optional[List[torch.Tensor]] = None, + conformer_cnn_cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], + List[torch.Tensor]]: + """ Export interface for c++ call, give input chunk xs, and return + output from time 0 to current chunk. + + Args: + xs (torch.Tensor): chunk input + subsampling_cache (Optional[torch.Tensor]): subsampling cache + elayers_output_cache (Optional[List[torch.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer + cnn cache + + Returns: + torch.Tensor: output, it ranges from time 0 to current chunk. 
+            torch.Tensor: subsampling cache
+            List[torch.Tensor]: attention cache
+            List[torch.Tensor]: conformer cnn cache
+
+        """
+        return self.encoder.forward_chunk(xs, offset, required_cache_size,
+                                          subsampling_cache,
+                                          elayers_output_cache,
+                                          conformer_cnn_cache)
+
+    @torch.jit.export
+    def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor:
+        """ Export interface for c++ call, apply linear transform and log
+            softmax before ctc
+        Args:
+            xs (torch.Tensor): encoder output
+
+        Returns:
+            torch.Tensor: activation before ctc
+
+        """
+        return self.ctc.log_softmax(xs)
+
+    @torch.jit.export
+    def is_bidirectional_decoder(self) -> bool:
+        """
+        Returns:
+            bool: True if the decoder has a right-to-left branch
+        """
+        if hasattr(self.decoder, 'right_decoder'):
+            return True
+        else:
+            return False
+
+    @torch.jit.export
+    def forward_attention_decoder(
+        self,
+        hyps: torch.Tensor,
+        hyps_lens: torch.Tensor,
+        encoder_out: torch.Tensor,
+        reverse_weight: float = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """ Export interface for c++ call, forward decoder with multiple
+            hypothesis from ctc prefix beam search and one encoder output
+        Args:
+            hyps (torch.Tensor): hyps from ctc prefix beam search, already
+                padded with sos at the beginning
+            hyps_lens (torch.Tensor): length of each hyp in hyps
+            encoder_out (torch.Tensor): corresponding encoder output
+            r_hyps (torch.Tensor): hyps from ctc prefix beam search, already
+                padded with eos at the beginning, which is used for the right
+                to left decoder
+            reverse_weight: used for verifying whether the right to left
+                decoder is used, > 0 will use it.
+
+        Returns:
+            torch.Tensor: decoder output
+        """
+        assert encoder_out.size(0) == 1
+        num_hyps = hyps.size(0)
+        assert hyps_lens.size(0) == num_hyps
+        encoder_out = encoder_out.repeat(num_hyps, 1, 1)
+        encoder_mask = torch.ones(num_hyps,
+                                  1,
+                                  encoder_out.size(1),
+                                  dtype=torch.bool,
+                                  device=encoder_out.device)
+        # input for right to left decoder
+        # this hyps_lens has counted the <sos> token, we need to minus it.
+        r_hyps_lens = hyps_lens - 1
+        # this hyps has included the <sos> token, so it should be stripped
+        # to convert back to the original hyps.
+        r_hyps = hyps[:, 1:]
+        r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id))
+        r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id)
+        decoder_out, r_decoder_out, _ = self.decoder(
+            encoder_out, encoder_mask, hyps, hyps_lens, r_hyps,
+            reverse_weight)  # (num_hyps, max_hyps_len, vocab_size)
+        decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1)
+
+        # right to left decoder may not be used during the decoding process,
+        # which depends on reverse_weight param.
+ # r_dccoder_out will be 0.0, if reverse_weight is 0.0 + r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) + return decoder_out, r_decoder_out + + +def init_asr_model(configs): + if configs['cmvn_file'] is not None: + mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) + global_cmvn = GlobalCMVN( + torch.from_numpy(mean).float(), + torch.from_numpy(istd).float()) + else: + global_cmvn = None + + input_dim = configs['input_dim'] + vocab_size = configs['output_dim'] + + encoder_type = configs.get('encoder', 'conformer') + decoder_type = configs.get('decoder', 'bitransformer') + + if encoder_type == 'conformer': + encoder = ConformerEncoder(input_dim, + global_cmvn=global_cmvn, + **configs['encoder_conf']) + else: + encoder = TransformerEncoder(input_dim, + global_cmvn=global_cmvn, + **configs['encoder_conf']) + if decoder_type == 'transformer': + decoder = TransformerDecoder(vocab_size, encoder.output_size(), + **configs['decoder_conf']) + else: + assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 + assert configs['decoder_conf']['r_num_blocks'] > 0 + decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), + **configs['decoder_conf']) + ctc = CTC(vocab_size, encoder.output_size()) + model = ASRModel( + vocab_size=vocab_size, + encoder=encoder, + decoder=decoder, + ctc=ctc, + **configs['model_conf'], + ) + return model diff --git a/ACL_PyTorch/contrib/audio/WeNet/decoder.py b/ACL_PyTorch/contrib/audio/WeNet/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b9296e3d13f58c0af7d3f6628b120d1eed6ac76e --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/decoder.py @@ -0,0 +1,288 @@ +# Copyright 2021 Mobvoi Inc. All Rights Reserved. +# Author: di.wu@mobvoi.com (DI WU) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +"""Decoder definition.""" +from typing import Tuple, List, Optional + +import torch +from typeguard import check_argument_types + +from wenet.transformer.attention import MultiHeadedAttention +from wenet.transformer.decoder_layer import DecoderLayer +from wenet.transformer.embedding import PositionalEncoding +from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward +from wenet.utils.mask import (subsequent_mask, make_pad_mask) + +import time + +class TransformerDecoder(torch.nn.Module): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. 
+ concat_after: whether to concat attention layer's input and output + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + normalize_before: bool = True, + concat_after: bool = False, + ): + assert check_argument_types() + super().__init__() + attention_dim = encoder_output_size + + if input_layer == "embed": + self.embed = torch.nn.Sequential( + torch.nn.Embedding(vocab_size, attention_dim), + PositionalEncoding(attention_dim, positional_dropout_rate), + ) + else: + raise ValueError(f"only 'embed' is supported: {input_layer}") + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-12) + self.use_output_layer = use_output_layer + self.output_layer = torch.nn.Linear(attention_dim, vocab_size) + self.num_blocks = num_blocks + self.decoders = torch.nn.ModuleList([ + DecoderLayer( + attention_dim, + MultiHeadedAttention(attention_heads, attention_dim, + self_attention_dropout_rate), + MultiHeadedAttention(attention_heads, attention_dim, + src_attention_dropout_rate), + PositionwiseFeedForward(attention_dim, linear_units, + dropout_rate), + dropout_rate, + normalize_before, + concat_after, + ) for _ in range(self.num_blocks) + ]) + + def forward( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + r_ys_in_pad: Optional[torch.Tensor] = None, + reverse_weight: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward decoder. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: not used in transformer decoder, in order to unify api + with bidirectional decoder + reverse_weight: not used in transformer decoder, in order to unify + api with bidirectional decode + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + torch.tensor(0.0), in order to unify api with bidirectional decoder + olens: (batch, ) + """ + tgt = ys_in_pad + + # tgt_mask: (B, 1, L) + tgt_mask = (~make_pad_mask(ys_in_lens).unsqueeze(1)).to(tgt.device) + # m: (1, L, L) + m = subsequent_mask(tgt_mask.size(-1), + device=tgt_mask.device).unsqueeze(0) + # tgt_mask: (B, L, L) + tgt_mask = tgt_mask & m + x, _ = self.embed(tgt) + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, + memory_mask) + if self.normalize_before: + x = self.after_norm(x) + if self.use_output_layer: + x = self.output_layer(x) + olens = tgt_mask.sum(1) + return x, torch.tensor(0.0), olens + + def forward_one_step( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward one step. + This is only used for decoding. 
+ Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, maxlen_out, token) + """ + x, _ = self.embed(tgt) + new_cache = [] + for i, decoder in enumerate(self.decoders): + if cache is None: + c = None + else: + c = cache[i] + x, tgt_mask, memory, memory_mask = decoder(x, + tgt_mask, + memory, + memory_mask, + cache=c) + new_cache.append(x) + if self.normalize_before: + y = self.after_norm(x[:, -1]) + else: + y = x[:, -1] + if self.use_output_layer: + y = torch.log_softmax(self.output_layer(y), dim=-1) + return y, new_cache + + +class BiTransformerDecoder(torch.nn.Module): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + r_num_blocks: the number of right to left decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + concat_after: whether to concat attention layer's input and output + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + r_num_blocks: int = 0, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + normalize_before: bool = True, + concat_after: bool = False, + ): + + assert check_argument_types() + super().__init__() + self.left_decoder = TransformerDecoder( + vocab_size, encoder_output_size, attention_heads, linear_units, + num_blocks, dropout_rate, positional_dropout_rate, + self_attention_dropout_rate, src_attention_dropout_rate, + input_layer, use_output_layer, normalize_before, concat_after) + + self.right_decoder = TransformerDecoder( + vocab_size, encoder_output_size, attention_heads, linear_units, + r_num_blocks, dropout_rate, positional_dropout_rate, + self_attention_dropout_rate, src_attention_dropout_rate, + input_layer, use_output_layer, normalize_before, concat_after) + + def forward( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + r_ys_in_pad: torch.Tensor, + reverse_weight: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward decoder. 
+ Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), + used for right to left decoder + reverse_weight: used for right to left decoder + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + r_x: x: decoded token score (right to left decoder) + before softmax (batch, maxlen_out, vocab_size) + if use_output_layer is True, + olens: (batch, ) + """ + st = time.time() + l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, + ys_in_lens) + r_x = torch.tensor(0.0) + if reverse_weight > 0.0: + r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, + ys_in_lens) + return l_x, r_x, olens, time.time()-st + + def forward_one_step( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward one step. + This is only used for decoding. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, maxlen_out, token) + """ + return self.left_decoder.forward_one_step(memory, memory_mask, tgt, + tgt_mask, cache) diff --git a/ACL_PyTorch/contrib/audio/WeNet/decoder.sh b/ACL_PyTorch/contrib/audio/WeNet/decoder.sh new file mode 100644 index 0000000000000000000000000000000000000000..0e33e8da1590f87817c404f399bbebb1ac0a60d2 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/decoder.sh @@ -0,0 +1,13 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} +atc --model=decoder_final.onnx --framework=5 --output=decoder_final --input_format=ND \ + --input_shape_range="memory:[10,1~1500,256];memory_mask:[10,1,1~1500];ys_in_pad:[10,1~1500];ys_in_lens:[10];r_ys_in_pad:[10,1~1500]" --out_nodes="Add_488:0;Add_977:0" --log=error --soc_version=Ascend310 + + diff --git a/ACL_PyTorch/contrib/audio/WeNet/encoder.py b/ACL_PyTorch/contrib/audio/WeNet/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..035d64f4c8f932eb452694d1413757308cd56ee9 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/encoder.py @@ -0,0 +1,452 @@ 
+#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# Author: di.wu@mobvoi.com (DI WU) +"""Encoder definition.""" +from typing import Tuple, List, Optional + +import torch +from typeguard import check_argument_types + +from wenet.transformer.attention import MultiHeadedAttention +from wenet.transformer.attention import RelPositionMultiHeadedAttention +from wenet.transformer.convolution import ConvolutionModule +from wenet.transformer.embedding import PositionalEncoding +from wenet.transformer.embedding import RelPositionalEncoding +from wenet.transformer.embedding import NoPositionalEncoding +from wenet.transformer.encoder_layer import TransformerEncoderLayer +from wenet.transformer.encoder_layer import ConformerEncoderLayer +from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward +from wenet.transformer.subsampling import Conv2dSubsampling4 +from wenet.transformer.subsampling import Conv2dSubsampling6 +from wenet.transformer.subsampling import Conv2dSubsampling8 +from wenet.transformer.subsampling import LinearNoSubsampling +from wenet.utils.common import get_activation +from wenet.utils.mask import make_pad_mask +from wenet.utils.mask import add_optional_chunk_mask + +import time + +class BaseEncoder(torch.nn.Module): + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + concat_after: bool = False, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + ): + """ + Args: + input_size (int): input dim + output_size (int): dimension of attention + attention_heads (int): the number of heads of multi head attention + linear_units (int): the hidden units number of position-wise feed + forward + num_blocks (int): the number of decoder blocks + dropout_rate (float): dropout rate + attention_dropout_rate (float): dropout rate in attention + positional_dropout_rate (float): dropout rate after adding + positional encoding + input_layer (str): input layer type. + optional [linear, conv2d, conv2d6, conv2d8] + pos_enc_layer_type (str): Encoder positional encoding layer type. + opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] + normalize_before (bool): + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + concat_after (bool): whether to concat attention layer's input + and output. 
+ True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + static_chunk_size (int): chunk size for static chunk training and + decoding + use_dynamic_chunk (bool): whether use dynamic chunk size for + training or not, You can only use fixed chunk(chunk_size > 0) + or dyanmic chunk size(use_dynamic_chunk = True) + global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module + use_dynamic_left_chunk (bool): whether use dynamic left chunk in + dynamic chunk training + """ + assert check_argument_types() + super().__init__() + self._output_size = output_size + + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "rel_pos": + pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "no_pos": + pos_enc_class = NoPositionalEncoding + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + if input_layer == "linear": + subsampling_class = LinearNoSubsampling + elif input_layer == "conv2d": + subsampling_class = Conv2dSubsampling4 + elif input_layer == "conv2d6": + subsampling_class = Conv2dSubsampling6 + elif input_layer == "conv2d8": + subsampling_class = Conv2dSubsampling8 + else: + raise ValueError("unknown input_layer: " + input_layer) + + self.global_cmvn = global_cmvn + self.embed = subsampling_class( + input_size, + output_size, + dropout_rate, + pos_enc_class(output_size, positional_dropout_rate), + ) + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-12) + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: torch.Tensor, + xs_lens: torch.Tensor, + decoding_chunk_size: int = 0, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Embed positions in tensor. + + Args: + xs: padded input tensor (B, T, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. 
+ >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor xs, and subsampled masks + xs: padded output tensor (B, T' ~= T/subsample_rate, D) + masks: torch.Tensor batch padding mask after subsample + (B, 1, T' ~= T/subsample_rate) + """ + st = time.time() + masks = ~make_pad_mask(xs_lens).unsqueeze(1) # (B, 1, T) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, + decoding_chunk_size, + self.static_chunk_size, + num_decoding_left_chunks) + for layer in self.encoders: + xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + if self.normalize_before: + xs = self.after_norm(xs) + # Here we assume the mask is not changed in encoder layers, so just + # return the masks before encoder layers, and the masks will be used + # for cross attention with decoder later + return xs, masks, time.time()-st + + def forward_chunk( + self, + xs: torch.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[torch.Tensor] = None, + elayers_output_cache: Optional[List[torch.Tensor]] = None, + conformer_cnn_cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], + List[torch.Tensor]]: + """ Forward just one chunk + + Args: + xs (torch.Tensor): chunk input + offset (int): current offset in encoder output time stamp + required_cache_size (int): cache size required for next chunk + compuation + >=0: actual cache size + <0: means all history cache is required + subsampling_cache (Optional[torch.Tensor]): subsampling cache + elayers_output_cache (Optional[List[torch.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer + cnn cache + + Returns: + torch.Tensor: output of current input xs + torch.Tensor: subsampling cache required for next chunk computation + List[torch.Tensor]: encoder layers output cache required for next + chunk computation + List[torch.Tensor]: conformer cnn cache + + """ + assert xs.size(0) == 1 + # tmp_masks is just for interface compatibility + tmp_masks = torch.ones(1, + xs.size(1), + device=xs.device, + dtype=torch.bool) + tmp_masks = tmp_masks.unsqueeze(1) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) + if subsampling_cache is not None: + cache_size = subsampling_cache.size(1) + xs = torch.cat((subsampling_cache, xs), dim=1) + else: + cache_size = 0 + pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1)) + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = xs.size(1) + else: + next_cache_start = max(xs.size(1) - required_cache_size, 0) + r_subsampling_cache = xs[:, next_cache_start:, :] + # Real mask for transformer/conformer layers + masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool) + masks = masks.unsqueeze(1) + r_elayers_output_cache = [] + r_conformer_cnn_cache = [] + for i, layer in enumerate(self.encoders): + if elayers_output_cache is None: + attn_cache = None + else: + attn_cache = elayers_output_cache[i] + if conformer_cnn_cache is None: + cnn_cache = None + else: + cnn_cache = conformer_cnn_cache[i] + xs, _, new_cnn_cache = layer(xs, + masks, + pos_emb, + output_cache=attn_cache, + cnn_cache=cnn_cache) + 
r_elayers_output_cache.append(xs[:, next_cache_start:, :]) + r_conformer_cnn_cache.append(new_cnn_cache) + if self.normalize_before: + xs = self.after_norm(xs) + + return (xs[:, cache_size:, :], r_subsampling_cache, + r_elayers_output_cache, r_conformer_cnn_cache) + + def forward_chunk_by_chunk( + self, + xs: torch.Tensor, + decoding_chunk_size: int, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ Forward input chunk by chunk with chunk_size like a streaming + fashion + + Here we should pay special attention to computation cache in the + streaming style forward chunk by chunk. Three things should be taken + into account for computation in the current network: + 1. transformer/conformer encoder layers output cache + 2. convolution in conformer + 3. convolution in subsampling + + However, we don't implement subsampling cache for: + 1. We can control subsampling module to output the right result by + overlapping input instead of cache left context, even though it + wastes some computation, but subsampling only takes a very + small fraction of computation in the whole model. + 2. Typically, there are several covolution layers with subsampling + in subsampling module, it is tricky and complicated to do cache + with different convolution layers with different subsampling + rate. + 3. Currently, nn.Sequential is used to stack all the convolution + layers in subsampling, we need to rewrite it to make it work + with cache, which is not prefered. + Args: + xs (torch.Tensor): (1, max_len, dim) + chunk_size (int): decoding chunk size + """ + assert decoding_chunk_size > 0 + # The model is trained by static or dynamic chunk + assert self.static_chunk_size > 0 or self.use_dynamic_chunk + subsampling = self.embed.subsampling_rate + context = self.embed.right_context + 1 # Add current frame + stride = subsampling * decoding_chunk_size + decoding_window = (decoding_chunk_size - 1) * subsampling + context + num_frames = xs.size(1) + subsampling_cache: Optional[torch.Tensor] = None + elayers_output_cache: Optional[List[torch.Tensor]] = None + conformer_cnn_cache: Optional[List[torch.Tensor]] = None + outputs = [] + offset = 0 + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + + # Feed forward overlap input step by step + for cur in range(0, num_frames - context + 1, stride): + end = min(cur + decoding_window, num_frames) + chunk_xs = xs[:, cur:end, :] + (y, subsampling_cache, elayers_output_cache, + conformer_cnn_cache) = self.forward_chunk(chunk_xs, offset, + required_cache_size, + subsampling_cache, + elayers_output_cache, + conformer_cnn_cache) + outputs.append(y) + offset += y.size(1) + ys = torch.cat(outputs, 1) + masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool) + masks = masks.unsqueeze(1) + return ys, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module.""" + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + concat_after: bool = False, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + ): + """ Construct TransformerEncoder + + See Encoder for the meaning of each parameter. 
+ """ + assert check_argument_types() + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + concat_after, static_chunk_size, use_dynamic_chunk, + global_cmvn, use_dynamic_left_chunk) + self.encoders = torch.nn.ModuleList([ + TransformerEncoderLayer( + output_size, + MultiHeadedAttention(attention_heads, output_size, + attention_dropout_rate), + PositionwiseFeedForward(output_size, linear_units, + dropout_rate), dropout_rate, + normalize_before, concat_after) for _ in range(num_blocks) + ]) + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module.""" + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "rel_pos", + normalize_before: bool = True, + concat_after: bool = False, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + positionwise_conv_kernel_size: int = 1, + macaron_style: bool = True, + selfattention_layer_type: str = "rel_selfattn", + activation_type: str = "swish", + use_cnn_module: bool = True, + cnn_module_kernel: int = 15, + causal: bool = False, + cnn_module_norm: str = "batch_norm", + ): + """Construct ConformerEncoder + + Args: + input_size to use_dynamic_chunk, see in BaseEncoder + positionwise_conv_kernel_size (int): Kernel size of positionwise + conv1d layer. + macaron_style (bool): Whether to use macaron style for + positionwise layer. + selfattention_layer_type (str): Encoder attention layer type, + the parameter has no effect now, it's just for configure + compatibility. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernel size of convolution module. + causal (bool): whether to use causal convolution or not. 
+ """ + assert check_argument_types() + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + concat_after, static_chunk_size, use_dynamic_chunk, + global_cmvn, use_dynamic_left_chunk) + activation = get_activation(activation_type) + + # self-attention module definition + if pos_enc_layer_type == "no_pos": + encoder_selfattn_layer = MultiHeadedAttention + else: + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + ) + # feed-forward module definition + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + activation, + ) + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (output_size, cnn_module_kernel, activation, + cnn_module_norm, causal) + + self.encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer( + *positionwise_layer_args) if macaron_style else None, + convolution_layer( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + ) for _ in range(num_blocks) + ]) diff --git a/ACL_PyTorch/contrib/audio/WeNet/env.sh b/ACL_PyTorch/contrib/audio/WeNet/env.sh new file mode 100644 index 0000000000000000000000000000000000000000..33243beff29297566d7412a08452ad6657a8d6fa --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/env.sh @@ -0,0 +1,9 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} diff --git a/ACL_PyTorch/contrib/audio/WeNet/export_onnx.diff b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.diff new file mode 100644 index 0000000000000000000000000000000000000000..9d954caedcedd69e9b6034db160c8fbbb8a53737 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.diff @@ -0,0 +1,794 @@ +diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py +index 73990fa..68c8299 100644 +--- a/wenet/transformer/asr_model.py ++++ b/wenet/transformer/asr_model.py +@@ -245,7 +245,7 @@ class ASRModel(torch.nn.Module): + top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) + top_k_logp = mask_finished_scores(top_k_logp, end_flag) + top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) +- # 2.3 Second beam prune: select topk score with history ++ # 2.3 Seconde beam prune: select topk score with history + scores = scores + top_k_logp # (B*N, N), broadcast add + scores = scores.view(batch_size, beam_size * beam_size) # (B, 
N*N) + scores, offset_k_index = scores.topk(k=beam_size) # (B, N) +@@ -570,13 +570,12 @@ class ASRModel(torch.nn.Module): + def forward_encoder_chunk( + self, + xs: torch.Tensor, +- offset: int, +- required_cache_size: int, ++ offset: torch.Tensor, ++ required_cache_size: torch.Tensor, + subsampling_cache: Optional[torch.Tensor] = None, +- elayers_output_cache: Optional[List[torch.Tensor]] = None, +- conformer_cnn_cache: Optional[List[torch.Tensor]] = None, +- ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], +- List[torch.Tensor]]: ++ elayers_output_cache: Optional[torch.Tensor] = None, ++ conformer_cnn_cache: Optional[torch.Tensor] = None, ++ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ Export interface for c++ call, give input chunk xs, and return + output from time 0 to current chunk. + +@@ -675,6 +674,10 @@ class ASRModel(torch.nn.Module): + r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) + return decoder_out, r_decoder_out + ++ @torch.jit.export ++ def test(self,) -> str: ++ return "test" ++ + + def init_asr_model(configs): + if configs['cmvn_file'] is not None: +diff --git a/wenet/transformer/decoder.py b/wenet/transformer/decoder.py +index f41f7e4..40c1a57 100644 +--- a/wenet/transformer/decoder.py ++++ b/wenet/transformer/decoder.py +@@ -57,8 +57,7 @@ class TransformerDecoder(torch.nn.Module): + if input_layer == "embed": + self.embed = torch.nn.Sequential( + torch.nn.Embedding(vocab_size, attention_dim), +- PositionalEncoding(attention_dim, positional_dropout_rate), +- ) ++ PositionalEncoding(attention_dim, positional_dropout_rate)) + else: + raise ValueError(f"only 'embed' is supported: {input_layer}") + +@@ -81,6 +80,10 @@ class TransformerDecoder(torch.nn.Module): + concat_after, + ) for _ in range(self.num_blocks) + ]) ++ self.onnx_mode = False ++ ++ def set_onnx_mode(self, onnx_mode=False): ++ self.onnx_mode = onnx_mode + + def forward( + self, +@@ -111,13 +114,15 @@ class TransformerDecoder(torch.nn.Module): + tgt = ys_in_pad + + # tgt_mask: (B, 1, L) +- tgt_mask = (~make_pad_mask(ys_in_lens).unsqueeze(1)).to(tgt.device) ++ tgt_mask = (~make_pad_mask(ys_in_lens, ys_in_pad).unsqueeze(1)).to(tgt.device) + # m: (1, L, L) + m = subsequent_mask(tgt_mask.size(-1), + device=tgt_mask.device).unsqueeze(0) + # tgt_mask: (B, L, L) +- tgt_mask = tgt_mask & m +- x, _ = self.embed(tgt) ++ # tgt_mask = tgt_mask & m ++ tgt_mask = torch.mul(tgt_mask, m) ++ x = self.embed[0](tgt) ++ x, _ = self.embed[1](x, onnx_mode=self.onnx_mode) + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, + memory_mask) +@@ -225,6 +230,13 @@ class BiTransformerDecoder(torch.nn.Module): + self_attention_dropout_rate, src_attention_dropout_rate, + input_layer, use_output_layer, normalize_before, concat_after) + ++ self.onnx_mode = False ++ ++ def set_onnx_mode(self, onnx_mode=False): ++ self.onnx_mode = onnx_mode ++ self.left_decoder.set_onnx_mode(onnx_mode) ++ self.right_decoder.set_onnx_mode(onnx_mode) ++ + def forward( + self, + memory: torch.Tensor, +@@ -252,6 +264,7 @@ class BiTransformerDecoder(torch.nn.Module): + if use_output_layer is True, + olens: (batch, ) + """ ++ reverse_weight = 0.3 + l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, + ys_in_lens) + r_x = torch.tensor(0.0) +diff --git a/wenet/transformer/decoder_layer.py b/wenet/transformer/decoder_layer.py +index 25bb281..59dd174 100644 +--- a/wenet/transformer/decoder_layer.py ++++ b/wenet/transformer/decoder_layer.py +@@ -17,7 +17,7 
@@ class DecoderLayer(nn.Module): + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. +- src_attn (torch.nn.Module): Inter-attention module instance. ++ src_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. +@@ -61,7 +61,8 @@ class DecoderLayer(nn.Module): + tgt_mask: torch.Tensor, + memory: torch.Tensor, + memory_mask: torch.Tensor, +- cache: Optional[torch.Tensor] = None ++ cache: Optional[torch.Tensor] = None, ++ onnx_mode: Optional[bool] = False + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute decoded features. + +diff --git a/wenet/transformer/embedding.py b/wenet/transformer/embedding.py +index a47afd9..0a6794c 100644 +--- a/wenet/transformer/embedding.py ++++ b/wenet/transformer/embedding.py +@@ -9,6 +9,7 @@ import math + from typing import Tuple + + import torch ++from wenet.transformer.slice_helper import slice_helper2 + + + class PositionalEncoding(torch.nn.Module): +@@ -45,7 +46,8 @@ class PositionalEncoding(torch.nn.Module): + + def forward(self, + x: torch.Tensor, +- offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: ++ offset: torch.Tensor = torch.tensor(0), ++ onnx_mode: bool = False) -> Tuple[torch.Tensor, torch.Tensor]: + """Add positional encoding. + + Args: +@@ -56,13 +58,21 @@ class PositionalEncoding(torch.nn.Module): + torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) + torch.Tensor: for compatibility to RelPositionalEncoding + """ +- assert offset + x.size(1) < self.max_len ++ # assert offset + x.size(1) < self.max_len + self.pe = self.pe.to(x.device) +- pos_emb = self.pe[:, offset:offset + x.size(1)] ++ # pos_emb = self.pe[:, offset:offset + x.size(1)] ++ if onnx_mode: ++ pos_emb = slice_helper2(self.pe, offset, offset + x.size(1)) ++ else: ++ pos_emb = self.pe[:, offset:offset + x.size(1)] + x = x * self.xscale + pos_emb + return self.dropout(x), self.dropout(pos_emb) + +- def position_encoding(self, offset: int, size: int) -> torch.Tensor: ++ def position_encoding(self, ++ offset: torch.Tensor, ++ size: torch.Tensor, ++ onnx_mode: bool = False, ++ ) -> torch.Tensor: + """ For getting encoding in a streaming fashion + + Attention!!!!! +@@ -79,7 +89,12 @@ class PositionalEncoding(torch.nn.Module): + torch.Tensor: Corresponding encoding + """ + assert offset + size < self.max_len +- return self.dropout(self.pe[:, offset:offset + size]) ++ if onnx_mode: ++ # pe = torch.cat([self.pe[:, [0]], slice_helper2(self.pe, offset, offset + size - 1)], dim=1) ++ pe = slice_helper2(self.pe, offset, offset + size) ++ else: ++ pe = self.pe[:, offset:offset + size] ++ return self.dropout(pe) + + + class RelPositionalEncoding(PositionalEncoding): +@@ -96,7 +111,8 @@ class RelPositionalEncoding(PositionalEncoding): + + def forward(self, + x: torch.Tensor, +- offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: ++ offset: torch.Tensor, ++ onnx_mode: bool = False) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). +@@ -104,10 +120,16 @@ class RelPositionalEncoding(PositionalEncoding): + torch.Tensor: Encoded tensor (batch, time, `*`). + torch.Tensor: Positional embedding tensor (1, time, `*`). 
+ """ +- assert offset + x.size(1) < self.max_len ++ # assert offset + x.size(1) < self.max_len + self.pe = self.pe.to(x.device) + x = x * self.xscale +- pos_emb = self.pe[:, offset:offset + x.size(1)] ++ if onnx_mode: ++ # end = offset.item() + x.size(1) ++ # pos_emb = torch.index_select(self.pe, 1, torch.tensor(range(x.size(1)))) ++ pos_emb = slice_helper2(self.pe, offset, offset + x.size(1)) ++ # pos_emb = slice_helper3(pos_emb, x.size(1)) ++ else: ++ pos_emb = self.pe[:, offset:offset + x.size(1)] + return self.dropout(x), self.dropout(pos_emb) + + +diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py +index e342ed4..9b4f968 100644 +--- a/wenet/transformer/encoder.py ++++ b/wenet/transformer/encoder.py +@@ -6,6 +6,8 @@ + """Encoder definition.""" + from typing import Tuple, List, Optional + ++import numpy as np ++import onnxruntime + import torch + from typeguard import check_argument_types + +@@ -18,6 +20,7 @@ from wenet.transformer.embedding import NoPositionalEncoding + from wenet.transformer.encoder_layer import TransformerEncoderLayer + from wenet.transformer.encoder_layer import ConformerEncoderLayer + from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward ++from wenet.transformer.slice_helper import slice_helper3, get_next_cache_start + from wenet.transformer.subsampling import Conv2dSubsampling4 + from wenet.transformer.subsampling import Conv2dSubsampling6 + from wenet.transformer.subsampling import Conv2dSubsampling8 +@@ -26,6 +29,8 @@ from wenet.utils.common import get_activation + from wenet.utils.mask import make_pad_mask + from wenet.utils.mask import add_optional_chunk_mask + ++def to_numpy(x): ++ return x.detach().numpy() + + class BaseEncoder(torch.nn.Module): + def __init__( +@@ -116,10 +121,14 @@ class BaseEncoder(torch.nn.Module): + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk ++ self.onnx_mode = False + + def output_size(self) -> int: + return self._output_size + ++ def set_onnx_mode(self, onnx_mode=False): ++ self.onnx_mode = onnx_mode ++ + def forward( + self, + xs: torch.Tensor, +@@ -130,7 +139,7 @@ class BaseEncoder(torch.nn.Module): + """Embed positions in tensor. + + Args: +- xs: padded input tensor (B, T, D) ++ xs: padded input tensor (B, L, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. 
+@@ -141,16 +150,18 @@ class BaseEncoder(torch.nn.Module): + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: +- encoder output tensor xs, and subsampled masks +- xs: padded output tensor (B, T' ~= T/subsample_rate, D) +- masks: torch.Tensor batch padding mask after subsample +- (B, 1, T' ~= T/subsample_rate) ++ encoder output tensor, lens and mask + """ +- masks = ~make_pad_mask(xs_lens).unsqueeze(1) # (B, 1, T) ++ decoding_chunk_size = 1 ++ num_decoding_left_chunks = 1 ++ self.use_dynamic_chunk = False ++ self.use_dynamic_left_chunk = False ++ self.static_chunk_size = 0 ++ masks = ~make_pad_mask(xs_lens, xs).unsqueeze(1) # (B, 1, L) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) +- mask_pad = masks # (B, 1, T/subsample_rate) ++ mask_pad = masks + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, +@@ -169,13 +180,12 @@ class BaseEncoder(torch.nn.Module): + def forward_chunk( + self, + xs: torch.Tensor, +- offset: int, +- required_cache_size: int, ++ offset_tensor: torch.Tensor = torch.tensor(0), ++ required_cache_size_tensor: torch.Tensor = torch.tensor(0), + subsampling_cache: Optional[torch.Tensor] = None, +- elayers_output_cache: Optional[List[torch.Tensor]] = None, +- conformer_cnn_cache: Optional[List[torch.Tensor]] = None, +- ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], +- List[torch.Tensor]]: ++ elayers_output_cache: Optional[torch.Tensor] = None, ++ conformer_cnn_cache: Optional[torch.Tensor] = None, ++ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ Forward just one chunk + + Args: +@@ -199,6 +209,7 @@ class BaseEncoder(torch.nn.Module): + List[torch.Tensor]: conformer cnn cache + + """ ++ required_cache_size_tensor = torch.tensor(-1) + assert xs.size(0) == 1 + # tmp_masks is just for interface compatibility + tmp_masks = torch.ones(1, +@@ -208,30 +219,53 @@ class BaseEncoder(torch.nn.Module): + tmp_masks = tmp_masks.unsqueeze(1) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) +- xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) ++ # if self.onnx_mode: ++ # offset_tensor = offset_tensor - torch.tensor(1) ++ xs, pos_emb, _ = self.embed(xs, tmp_masks, offset_tensor, self.onnx_mode) + if subsampling_cache is not None: + cache_size = subsampling_cache.size(1) + xs = torch.cat((subsampling_cache, xs), dim=1) + else: + cache_size = 0 +- pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1)) +- if required_cache_size < 0: +- next_cache_start = 0 +- elif required_cache_size == 0: +- next_cache_start = xs.size(1) ++ # if self.onnx_mode: ++ # cache_size = cache_size - 1 ++ # if self.onnx_mode: ++ # # subsampling_cache append dummy var, remove it here ++ # xs = xs[:, 1:, :] ++ # cache_size = cache_size - 1 ++ if isinstance(xs.size(1), int): ++ xs_size_1 = torch.tensor(xs.size(1)) + else: +- next_cache_start = max(xs.size(1) - required_cache_size, 0) +- r_subsampling_cache = xs[:, next_cache_start:, :] ++ xs_size_1 = xs.size(1).clone().detach() ++ pos_emb = self.embed.position_encoding(offset_tensor - cache_size, ++ xs_size_1, ++ self.onnx_mode) ++ next_cache_start = get_next_cache_start(required_cache_size_tensor, xs) ++ r_subsampling_cache = slice_helper3(xs, next_cache_start) ++ # if self.onnx_mode: ++ # next_cache_start_1 = get_next_cache_start(required_cache_size_tensor, xs) ++ # r_subsampling_cache = slice_helper3(xs, next_cache_start_1) ++ # else: ++ # required_cache_size = 
required_cache_size_tensor.detach().item() ++ # if required_cache_size < 0: ++ # next_cache_start = 0 ++ # elif required_cache_size == 0: ++ # next_cache_start = xs.size(1) ++ # else: ++ # next_cache_start = max(xs.size(1) - required_cache_size, 0) ++ # r_subsampling_cache = xs[:, next_cache_start:, :] + # Real mask for transformer/conformer layers + masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool) + masks = masks.unsqueeze(1) +- r_elayers_output_cache = [] +- r_conformer_cnn_cache = [] ++ r_elayers_output_cache = None ++ r_conformer_cnn_cache = None + for i, layer in enumerate(self.encoders): + if elayers_output_cache is None: + attn_cache = None + else: + attn_cache = elayers_output_cache[i] ++ # if self.onnx_mode and attn_cache is not None: ++ # attn_cache = attn_cache[:, 1:, :] + if conformer_cnn_cache is None: + cnn_cache = None + else: +@@ -240,13 +274,32 @@ class BaseEncoder(torch.nn.Module): + masks, + pos_emb, + output_cache=attn_cache, +- cnn_cache=cnn_cache) +- r_elayers_output_cache.append(xs[:, next_cache_start:, :]) +- r_conformer_cnn_cache.append(new_cnn_cache) ++ cnn_cache=cnn_cache, ++ onnx_mode=self.onnx_mode) ++ if self.onnx_mode: ++ layer_output_cache = slice_helper3(xs, next_cache_start) ++ else: ++ layer_output_cache = xs[:, next_cache_start:, :] ++ if i == 0: ++ r_elayers_output_cache = layer_output_cache.unsqueeze(0) ++ r_conformer_cnn_cache = new_cnn_cache.unsqueeze(0) ++ else: ++ # r_elayers_output_cache.append(xs[:, next_cache_start:, :]) ++ r_elayers_output_cache = torch.cat((r_elayers_output_cache, layer_output_cache.unsqueeze(0)), 0) ++ # r_conformer_cnn_cache.append(new_cnn_cache) ++ r_conformer_cnn_cache = torch.cat((r_conformer_cnn_cache, new_cnn_cache.unsqueeze(0)), 0) + if self.normalize_before: + xs = self.after_norm(xs) +- +- return (xs[:, cache_size:, :], r_subsampling_cache, ++ if self.onnx_mode: ++ cache_size = cache_size - 1 ++ if isinstance(cache_size, int): ++ cache_size_1 = torch.tensor(cache_size) ++ else: ++ cache_size_1 = cache_size.clone().detach() ++ output = slice_helper3(xs, cache_size_1) ++ else: ++ output = xs[:, cache_size:, :] ++ return (output, r_subsampling_cache, + r_elayers_output_cache, r_conformer_cnn_cache) + + def forward_chunk_by_chunk( +@@ -290,24 +343,54 @@ class BaseEncoder(torch.nn.Module): + decoding_window = (decoding_chunk_size - 1) * subsampling + context + num_frames = xs.size(1) + subsampling_cache: Optional[torch.Tensor] = None +- elayers_output_cache: Optional[List[torch.Tensor]] = None +- conformer_cnn_cache: Optional[List[torch.Tensor]] = None ++ elayers_output_cache: Optional[torch.Tensor] = None ++ conformer_cnn_cache: Optional[torch.Tensor] = None + outputs = [] + offset = 0 + required_cache_size = decoding_chunk_size * num_decoding_left_chunks ++ print("required_cache_size:", required_cache_size) ++ encoder_session = onnxruntime.InferenceSession("onnx/encoder.onnx") ++ ++ subsampling_cache_onnx = torch.zeros(1, 1, 256, requires_grad=False) ++ elayers_output_cache_onnx = torch.zeros(12, 1, 1, 256, requires_grad=False) ++ conformer_cnn_cache_onnx = torch.zeros(12, 1, 256, 7, requires_grad=False) + + # Feed forward overlap input step by step + for cur in range(0, num_frames - context + 1, stride): + end = min(cur + decoding_window, num_frames) + chunk_xs = xs[:, cur:end, :] ++ ++ if offset > 0: ++ offset = offset - 1 + (y, subsampling_cache, elayers_output_cache, +- conformer_cnn_cache) = self.forward_chunk(chunk_xs, offset, +- required_cache_size, ++ conformer_cnn_cache) = 
self.forward_chunk(chunk_xs, torch.tensor(offset), ++ torch.tensor(required_cache_size), + subsampling_cache, + elayers_output_cache, + conformer_cnn_cache) +- outputs.append(y) ++ ++ offset = offset + 1 ++ encoder_inputs = { ++ encoder_session.get_inputs()[0].name: chunk_xs.numpy(), ++ encoder_session.get_inputs()[1].name: np.array(offset), ++ encoder_session.get_inputs()[2].name: subsampling_cache_onnx.numpy(), ++ encoder_session.get_inputs()[3].name: elayers_output_cache_onnx.numpy(), ++ encoder_session.get_inputs()[4].name: conformer_cnn_cache_onnx.numpy(), ++ } ++ ort_outs = encoder_session.run(None, encoder_inputs) ++ y_onnx, subsampling_cache_onnx, elayers_output_cache_onnx, conformer_cnn_cache_onnx = \ ++ torch.from_numpy(ort_outs[0][:, 1:, :]), torch.from_numpy(ort_outs[1]), \ ++ torch.from_numpy(ort_outs[2]), torch.from_numpy(ort_outs[3]) ++ ++ np.testing.assert_allclose(to_numpy(y), ort_outs[0][:, 1:, :], rtol=1e-03, atol=1e-03) ++ np.testing.assert_allclose(to_numpy(subsampling_cache), ort_outs[1][:, 1:, :], rtol=1e-03, atol=1e-03) ++ np.testing.assert_allclose(to_numpy(elayers_output_cache), ort_outs[2][:, :, 1:, :], rtol=1e-03, atol=1e-03) ++ np.testing.assert_allclose(to_numpy(conformer_cnn_cache), ort_outs[3], rtol=1e-03, atol=1e-03) ++ ++ outputs.append(y_onnx) ++ # outputs.append(y) + offset += y.size(1) ++ # break + ys = torch.cat(outputs, 1) + masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool) + masks = masks.unsqueeze(1) +diff --git a/wenet/transformer/encoder_layer.py b/wenet/transformer/encoder_layer.py +index db8696d..0be079c 100644 +--- a/wenet/transformer/encoder_layer.py ++++ b/wenet/transformer/encoder_layer.py +@@ -9,6 +9,7 @@ from typing import Optional, Tuple + + import torch + from torch import nn ++from wenet.transformer.slice_helper import slice_helper + + + class TransformerEncoderLayer(nn.Module): +@@ -53,6 +54,9 @@ class TransformerEncoderLayer(nn.Module): + # concat_linear may be not used in forward fuction, + # but will be saved in the *.pt + self.concat_linear = nn.Linear(size + size, size) ++ ++ def set_onnx_mode(self, onnx_mode=False): ++ self.onnx_mode = onnx_mode + + def forward( + self, +@@ -92,9 +96,14 @@ class TransformerEncoderLayer(nn.Module): + assert output_cache.size(2) == self.size + assert output_cache.size(1) < x.size(1) + chunk = x.size(1) - output_cache.size(1) +- x_q = x[:, -chunk:, :] +- residual = residual[:, -chunk:, :] +- mask = mask[:, -chunk:, :] ++ if self.onnx_mode: ++ x_q = slice_helper(x, chunk) ++ residual = slice_helper(residual, chunk) ++ mask = slice_helper(mask, chunk) ++ else: ++ x_q = x[:, -chunk:, :] ++ residual = residual[:, -chunk:, :] ++ mask = mask[:, -chunk:, :] + + if self.concat_after: + x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1) +@@ -184,6 +193,7 @@ class ConformerEncoderLayer(nn.Module): + mask_pad: Optional[torch.Tensor] = None, + output_cache: Optional[torch.Tensor] = None, + cnn_cache: Optional[torch.Tensor] = None, ++ onnx_mode: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute encoded features. + +@@ -193,7 +203,6 @@ class ConformerEncoderLayer(nn.Module): + pos_emb (torch.Tensor): positional encoding, must not be None + for ConformerEncoderLayer. + mask_pad (torch.Tensor): batch padding mask used for conv module. +- (#batch, 1,time) + output_cache (torch.Tensor): Cache tensor of the output + (#batch, time2, size), time2 < time in x. 
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer +@@ -202,6 +211,14 @@ class ConformerEncoderLayer(nn.Module): + torch.Tensor: Mask tensor (#batch, time). + """ + ++ if onnx_mode: ++ x = x[:, 1:, :] ++ mask = mask[:, :, 1:] ++ # pos_emb_ = pos_emb[:, 1:, :] ++ pos_emb_ = pos_emb[:, :-1, :] ++ else: ++ pos_emb_ = pos_emb ++ + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x +@@ -223,12 +240,26 @@ class ConformerEncoderLayer(nn.Module): + assert output_cache.size(0) == x.size(0) + assert output_cache.size(2) == self.size + assert output_cache.size(1) < x.size(1) +- chunk = x.size(1) - output_cache.size(1) +- x_q = x[:, -chunk:, :] +- residual = residual[:, -chunk:, :] +- mask = mask[:, -chunk:, :] + +- x_att = self.self_attn(x_q, x, x, mask, pos_emb) ++ # chunk = x.size(1) - output_cache.size(1) ++ if onnx_mode: ++ chunk = x.size(1) - output_cache.size(1) + 1 ++ if isinstance(chunk, int): ++ chunk_1 = torch.tensor(chunk) ++ else: ++ chunk_1 = chunk.clone().detach() ++ # chunk = torch.tensor(chunk) ++ # print(type(chunk)) ++ x_q = slice_helper(x, chunk_1) ++ residual = slice_helper(residual, chunk_1) ++ mask = slice_helper(mask, chunk_1) ++ else: ++ chunk = x.size(1) - output_cache.size(1) ++ x_q = x[:, -chunk:, :] ++ residual = residual[:, -chunk:, :] ++ mask = mask[:, -chunk:, :] ++ ++ x_att = self.self_attn(x_q, x, x, mask, pos_emb_) + if self.concat_after: + x_concat = torch.cat((x, x_att), dim=-1) + x = residual + self.concat_linear(x_concat) +diff --git a/wenet/transformer/subsampling.py b/wenet/transformer/subsampling.py +index b890f70..a978424 100644 +--- a/wenet/transformer/subsampling.py ++++ b/wenet/transformer/subsampling.py +@@ -16,8 +16,11 @@ class BaseSubsampling(torch.nn.Module): + self.right_context = 0 + self.subsampling_rate = 1 + +- def position_encoding(self, offset: int, size: int) -> torch.Tensor: +- return self.pos_enc.position_encoding(offset, size) ++ def position_encoding(self, ++ offset: torch.Tensor, ++ size: torch.Tensor, ++ onnx_mode: bool = False) -> torch.Tensor: ++ return self.pos_enc.position_encoding(offset, size, onnx_mode) + + + class LinearNoSubsampling(BaseSubsampling): +@@ -89,16 +92,17 @@ class Conv2dSubsampling4(BaseSubsampling): + torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) + self.pos_enc = pos_enc_class + # The right context for every conv layer is computed by: +- # (kernel_size - 1) * frame_rate_of_this_layer ++ # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer + self.subsampling_rate = 4 +- # 6 = (3 - 1) * 1 + (3 - 1) * 2 ++ # 6 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2 + self.right_context = 6 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, +- offset: int = 0 ++ offset: torch.Tensor = torch.tensor(0), ++ onnx_mode: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. 
+ +@@ -118,7 +122,7 @@ class Conv2dSubsampling4(BaseSubsampling): + x = self.conv(x) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) +- x, pos_emb = self.pos_enc(x, offset) ++ x, pos_emb = self.pos_enc(x, offset, onnx_mode) + return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] + + +@@ -143,9 +147,9 @@ class Conv2dSubsampling6(BaseSubsampling): + self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), + odim) + self.pos_enc = pos_enc_class +- # 10 = (3 - 1) * 1 + (5 - 1) * 2 ++ # 14 = (3 - 1) / 2 * 2 * 1 + (5 - 1) / 2 * 3 * 2 + self.subsampling_rate = 6 +- self.right_context = 10 ++ self.right_context = 14 + + def forward( + self, +@@ -198,7 +202,7 @@ class Conv2dSubsampling8(BaseSubsampling): + odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) + self.pos_enc = pos_enc_class + self.subsampling_rate = 8 +- # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 ++ # 14 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2 + (3 - 1) / 2 * 2 * 4 + self.right_context = 14 + + def forward( +diff --git a/wenet/utils/mask.py b/wenet/utils/mask.py +index c2bb50f..d23bd95 100644 +--- a/wenet/utils/mask.py ++++ b/wenet/utils/mask.py +@@ -5,6 +5,15 @@ + + import torch + ++def tril_onnx(x, diagonal: torch.Tensor = torch.tensor(0)): ++ m,n = x.shape[0], x.shape[1] ++ arange = torch.arange(n, device = x.device) ++ mask = arange.expand(m, n) ++ mask_maker = torch.arange(m, device = x.device).unsqueeze(-1) ++ if diagonal: ++ mask_maker = mask_maker + diagonal ++ mask = mask <= mask_maker ++ return mask * x + + def subsequent_mask( + size: int, +@@ -35,13 +44,17 @@ def subsequent_mask( + [1, 1, 0], + [1, 1, 1]] + """ +- ret = torch.ones(size, size, device=device, dtype=torch.bool) +- return torch.tril(ret, out=ret) ++ # ret = torch.ones(size, size, device=device, dtype=torch.bool) ++ # return torch.tril(ret, out=ret) ++ # to export onnx, we change the code as follows ++ ret = torch.ones(size, size, device=device) ++ #return torch.tril(ret, out=ret) ++ return tril_onnx(ret) + + + def subsequent_chunk_mask( +- size: int, +- chunk_size: int, ++ size: torch.tensor(0), ++ chunk_size: torch.tensor(0), + num_left_chunks: int = -1, + device: torch.device = torch.device("cpu"), + ) -> torch.Tensor: +@@ -67,6 +80,18 @@ def subsequent_chunk_mask( + [1, 1, 1, 1]] + """ + ret = torch.zeros(size, size, device=device, dtype=torch.bool) ++ row_index = torch.arange(size, device = device) ++ index = row_index.expand(size, size) ++ expand_size = torch.ones((size), device = device)*size ++ #expand_size = expand_size.long() ++ if num_left_chunks < 0: ++ start1 = torch.tensor(0) ++ else: ++ start1 = torch.max((torch.floor_divide(row_index, chunk_size)-num_left_chunks).float()*chunk_size, torch.tensor(0.0)).long().view(size,1) ++ ending = torch.min((torch.floor_divide(row_index, chunk_size)+1).float()*chunk_size, expand_size.float()).long().view(size,1) ++ ret[torch.where(index < ending)] = True ++ ret[torch.where(index < start1)] = False ++ ''' + for i in range(size): + if num_left_chunks < 0: + start = 0 +@@ -74,6 +99,8 @@ def subsequent_chunk_mask( + start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) + ending = min((i // chunk_size + 1) * chunk_size, size) + ret[i, start:ending] = True ++ print("ret:", ret) ++ ''' + return ret + + +@@ -107,18 +134,18 @@ def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, + """ + # Whether to use chunk mask or not + if use_dynamic_chunk: +- max_len = xs.size(1) ++ max_len = xs.shape[1] + if decoding_chunk_size < 0: + chunk_size = 
max_len + num_left_chunks = -1 + elif decoding_chunk_size > 0: +- chunk_size = decoding_chunk_size ++ chunk_size = torch.tensor(decoding_chunk_size) + num_left_chunks = num_decoding_left_chunks + else: + # chunk size is either [1, 25] or full context(max_len). + # Since we use 4 times subsampling and allow up to 1s(100 frames) + # delay, the maximum frame is 100 / 4 = 25. +- chunk_size = torch.randint(1, max_len, (1, )).item() ++ chunk_size = torch.randint(1, max_len, (1, )) + num_left_chunks = -1 + if chunk_size > max_len // 2: + chunk_size = max_len +@@ -128,14 +155,14 @@ def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, + max_left_chunks = (max_len - 1) // chunk_size + num_left_chunks = torch.randint(0, max_left_chunks, + (1, )).item() +- chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, ++ chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size, + num_left_chunks, + xs.device) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) + chunk_masks = masks & chunk_masks # (B, L, L) + elif static_chunk_size > 0: + num_left_chunks = num_decoding_left_chunks +- chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, ++ chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size, + num_left_chunks, + xs.device) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) +@@ -145,7 +172,7 @@ def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, + return chunk_masks + + +-def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: ++def make_pad_mask(lengths: torch.Tensor, xs: torch.Tensor) -> torch.Tensor: + """Make mask tensor containing indices of padded part. + + See description of make_non_pad_mask. +@@ -162,8 +189,11 @@ def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] + """ +- batch_size = int(lengths.size(0)) +- max_len = int(lengths.max().item()) ++ # batch_size = int(lengths.size(0)) ++ # max_len = int(lengths.max().item()) ++ # to export the decoder onnx and avoid the constant fold ++ batch_size = xs.shape[0] ++ max_len = xs.shape[1] + seq_range = torch.arange(0, + max_len, + dtype=torch.int64, diff --git a/ACL_PyTorch/contrib/audio/WeNet/export_onnx.py b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..63617b9c52c735e89e475262e9987f7f95042d37 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.py @@ -0,0 +1,193 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ + + +from __future__ import print_function + +import argparse +import os + +import torch +import onnx, onnxruntime +import yaml +import numpy as np + +from wenet.transformer.asr_model import init_asr_model +from wenet.transformer.decoder import TransformerDecoder, BiTransformerDecoder +from wenet.utils.checkpoint import load_checkpoint + + +def to_numpy(x): + return x.detach().numpy() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='export your script model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + # parser.add_argument('--output_file', required=True, help='output file') + parser.add_argument('--output_onnx_file', required=True, help='output onnx file') + args = parser.parse_args() + # No need gpu for model export + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + model = init_asr_model(configs) + print(model) + + load_checkpoint(model, args.checkpoint) + # Export jit torch script model + + model.eval() + + #export the none flash model + encoder = model.encoder + xs = torch.randn(1, 131, 80, requires_grad=False) + xs_lens = torch.tensor([131], dtype=torch.int32) + onnx_encoder_path = os.path.join(args.output_onnx_file, 'no_flash_encoder.onnx') + torch.onnx.export(encoder, + (xs, xs_lens), + onnx_encoder_path, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=['xs_input', 'xs_input_lens'], + output_names=['xs_output', 'masks_output'], + dynamic_axes={'xs_input': [1], 'xs_input_lens': [0], + 'xs_output': [1], 'masks_output': [2]}, + verbose=True + ) + onnx_model = onnx.load(onnx_encoder_path) + onnx.checker.check_model(onnx_model) + print("encoder onnx_model check pass!") + + ort_session = onnxruntime.InferenceSession(onnx_encoder_path) + ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(xs), + ort_session.get_inputs()[1].name: to_numpy(xs_lens), + } + ort_outs = ort_session.run(None, ort_inputs) + y1, y2 = encoder(xs, xs_lens) + # np.testing.assert_allclose(to_numpy(y1), ort_outs[0], rtol=1e-05, atol=1e-05) + # np.testing.assert_allclose(to_numpy(y2), ort_outs[1], rtol=1e-05, atol=1e-05) + print("Exported no flash encoder model has been tested with ONNXRuntime, and the result looks good!") + + #export the flash encoder + encoder = model.encoder + encoder.forward = encoder.forward_chunk + + batch_size = 1 + audio_len = 131 + x = torch.randn(batch_size, audio_len, 80, requires_grad=False) + offset = torch.tensor(1) + decoding_chunk_size = 16 + num_decoding_left_chunks = -1 + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + required_cache_size = torch.tensor(required_cache_size) + subsampling_cache = torch.randn(batch_size, 1, 256, requires_grad=False) + elayers_cache = torch.randn(12, batch_size, 1, 256, 
requires_grad=False) + conformer_cnn_cache = torch.randn(12, batch_size, 256, 7, requires_grad=False) + + + encoder.set_onnx_mode(False) + y, subsampling_cache_output, elayers_cache_output, conformer_cnn_cache_output = encoder(x, torch.tensor(0), \ + required_cache_size, None, None, conformer_cnn_cache) + + encoder.set_onnx_mode(True) + onnx_encoder_path = os.path.join(args.output_onnx_file, 'encoder.onnx') + torch.onnx.export(encoder, + (x, offset, required_cache_size, subsampling_cache, elayers_cache, conformer_cnn_cache), + onnx_encoder_path, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=['input', 'offset', 'required_cache_size', 'subsampling_cache', 'elayers_cache', \ + 'conformer_cnn_cache'], + output_names=['output', 'subsampling_cache_output', 'elayers_cache_output', \ + 'conformer_cnn_cache_output'], + dynamic_axes={'input': [1], 'subsampling_cache':[1], 'elayers_cache':[2], + 'output': [1]}, + verbose=True + ) + + onnx_model = onnx.load(onnx_encoder_path) + onnx.checker.check_model(onnx_model) + print("encoder onnx_model check pass!") + + ort_session = onnxruntime.InferenceSession(onnx_encoder_path) + ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x), + ort_session.get_inputs()[1].name: to_numpy(offset), + ort_session.get_inputs()[2].name: to_numpy(subsampling_cache), + ort_session.get_inputs()[3].name: to_numpy(elayers_cache), + ort_session.get_inputs()[4].name: to_numpy(conformer_cnn_cache), + } + ort_outs = ort_session.run(None, ort_inputs) + print("Exported encoder model has been tested with ONNXRuntime, and the result looks good!") + + #export decoder onnx + + decoder = model.decoder + decoder.set_onnx_mode(True) + onnx_decoder_path = os.path.join(args.output_onnx_file, 'decoder.onnx') + memory = torch.randn(10, 131, 256) + memory_mask = torch.ones(10, 1, 131).bool() + ys_in_pad = torch.randint(0, 4232, (10, 50)).long() + ys_in_lens = torch.tensor([13, 13, 13, 13, 13, 13, 13, 13, 50, 13], dtype=torch.int32) + r_ys_in_pad = torch.randint(0, 4232, (10, 50)).long() + + if isinstance(decoder, TransformerDecoder): + torch.onnx.export(decoder, + (memory, memory_mask, ys_in_pad, ys_in_lens), + onnx_decoder_path, + export_params=True, + opset_version=12, + do_constant_folding=True, + input_names=['memory', 'memory_mask', 'ys_in_pad', 'ys_in_lens'], + output_names=['l_x', 'r_x'], + dynamic_axes={'memory': [1], 'memory_mask':[2], 'ys_in_pad':[1], + 'ys_in_lens': [0]}, + verbose=True + ) + elif isinstance(decoder, BiTransformerDecoder): + print("BI mode") + torch.onnx.export(decoder, + (memory, memory_mask, ys_in_pad, ys_in_lens, r_ys_in_pad), + onnx_decoder_path, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=['memory', 'memory_mask', 'ys_in_pad', 'ys_in_lens', 'r_ys_in_pad'], + output_names=['l_x', 'r_x', 'olens'], + dynamic_axes={'memory': [1], 'memory_mask':[2], 'ys_in_pad':[1], + 'ys_in_lens': [0], 'r_ys_in_pad':[1]}, + verbose=True + ) + diff --git a/ACL_PyTorch/contrib/audio/WeNet/export_onnx.sh b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.sh new file mode 100644 index 0000000000000000000000000000000000000000..bebe2a07ef04b7fb044fdb57dcac8bc1337e3894 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. 
./path.sh || exit 1; + +yaml_path=$1 +decode_checkpoint=$2 + +mkdir onnx +python3 wenet/bin/export_onnx.py \ + --config $yaml_path \ + --checkpoint $decode_checkpoint \ + --output_onnx_file onnx diff --git a/ACL_PyTorch/contrib/audio/WeNet/get_no_flash_encoder_out.diff b/ACL_PyTorch/contrib/audio/WeNet/get_no_flash_encoder_out.diff new file mode 100644 index 0000000000000000000000000000000000000000..b209edb704a1cc238363120b779cbf3dc2556058 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/get_no_flash_encoder_out.diff @@ -0,0 +1,38 @@ +diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py +index 73990fa..e2f3555 100644 +--- a/wenet/transformer/asr_model.py ++++ b/wenet/transformer/asr_model.py +@@ -175,6 +175,33 @@ class ASRModel(torch.nn.Module): + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + return encoder_out, encoder_mask ++ ++ def get_no_flash_encoder_out( ++ self, ++ encoder_model_noflash, ++ batch_idx: int, ++ speech: torch.Tensor, ++ speech_lengths: torch.Tensor, ++ beam_size: int, ++ decoding_chunk_size: int = -1, ++ num_decoding_left_chunks: int = -1, ++ ctc_weight: float = 0.0, ++ simulate_streaming: bool = False, ++ reverse_weight: float = 0.0, ++ ) -> List[int]: ++ assert speech.shape[0] == speech_lengths.shape[0] ++ assert decoding_chunk_size != 0 ++ if reverse_weight > 0.0: ++ # decoder should be a bitransformer decoder if reverse_weight > 0.0 ++ assert hasattr(self.decoder, 'right_decoder') ++ device = speech.device ++ batch_size = speech.shape[0] ++ # For attention rescoring we only support batch_size=1 ++ assert batch_size == 1 ++ y, exe_time = encoder_model_noflash( ++ [speech.numpy(), speech_lengths.numpy().astype("int32")]) # (beam_size, max_hyps_len, vocab_size) ++ encoder_out, encoder_mask = torch.from_numpy(y[0]), torch.from_numpy(y[1]) ++ return encoder_out, encoder_mask, exe_time + + def recognize( + self, diff --git a/ACL_PyTorch/contrib/audio/WeNet/getwer.diff b/ACL_PyTorch/contrib/audio/WeNet/getwer.diff new file mode 100644 index 0000000000000000000000000000000000000000..633513671cfe36099d7139f6f2245c7511d39b69 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/getwer.diff @@ -0,0 +1,174 @@ +diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py +index 73990fa..82337ca 100644 +--- a/wenet/transformer/asr_model.py ++++ b/wenet/transformer/asr_model.py +@@ -33,7 +33,8 @@ from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, + from wenet.utils.mask import (make_pad_mask, mask_finished_preds, + mask_finished_scores, subsequent_mask) + +- ++import numpy as np ++import os + class ASRModel(torch.nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + def __init__( +@@ -443,6 +444,159 @@ class ASRModel(torch.nn.Module): + simulate_streaming) + return hyps[0][0] + ++ def get_wer( ++ self, ++ batch_idx, ++ bin_path, ++ json_data, ++ decoder_model, ++ speech: torch.Tensor, ++ speech_lengths: torch.Tensor, ++ beam_size: int, ++ decoding_chunk_size: int = -1, ++ num_decoding_left_chunks: int = -1, ++ ctc_weight: float = 0.0, ++ simulate_streaming: bool = False, ++ reverse_weight: float = 0.0, ++ ) -> List[int]: ++ """ Apply attention rescoring decoding, CTC prefix beam search ++ is applied first to get nbest, then we resoring the nbest on ++ attention decoder with corresponding encoder out ++ ++ Args: ++ speech (torch.Tensor): (batch, max_len, feat_dim) ++ speech_length (torch.Tensor): (batch, ) ++ beam_size (int): beam size for beam search ++ decoding_chunk_size 
(int): decoding chunk for dynamic chunk ++ trained model. ++ <0: for decoding, use full chunk. ++ >0: for decoding, use fixed chunk size as set. ++ 0: used for training, it's prohibited here ++ simulate_streaming (bool): whether do encoder forward in a ++ streaming fashion ++ reverse_weight (float): right to left decoder weight ++ ctc_weight (float): ctc score weight ++ ++ Returns: ++ List[int]: Attention rescoring result ++ """ ++ assert speech.shape[0] == speech_lengths.shape[0] ++ assert decoding_chunk_size != 0 ++ if reverse_weight > 0.0: ++ # decoder should be a bitransformer decoder if reverse_weight > 0.0 ++ assert hasattr(self.decoder, 'right_decoder') ++ device = speech.device ++ batch_size = speech.shape[0] ++ # For attention rescoring we only support batch_size=1 ++ assert batch_size == 1 ++ # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size ++ # Let's assume B = batch_size and N = beam_size ++ # 1. Encoder forward and get CTC score ++ encoder_out_file = "encoder_out_{}.bin".format(batch_idx) ++ encoder_out_path = os.path.join(bin_path, encoder_out_file) ++ encoder_out = np.fromfile(encoder_out_path,dtype = np.float32).reshape( ++ json_data["encoder_out_{}".format(batch_idx)]) ++ encoder_mask_file = "encoder_mask_{}.bin".format(batch_idx) ++ encoder_mask_path = os.path.join(bin_path, encoder_mask_file) ++ encoder_mask = np.fromfile(encoder_mask_path, dtype = np.bool).reshape(json_data["encoder_mask_{}".format(batch_idx)]) ++ encoder_out = torch.from_numpy(encoder_out) ++ maxlen = encoder_out.size(1) ++ ctc_probs = self.ctc.log_softmax( ++ encoder_out) # (1, maxlen, vocab_size) ++ ctc_probs = ctc_probs.squeeze(0) ++ # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) ++ cur_hyps = [(tuple(), (0.0, -float('inf')))] ++ # 2. 
CTC beam search step by step ++ for t in range(0, maxlen): ++ logp = ctc_probs[t] # (vocab_size,) ++ # key: prefix, value (pb, pnb), default value(-inf, -inf) ++ next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) ++ # 2.1 First beam prune: select topk best ++ top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) ++ for s in top_k_index: ++ s = s.item() ++ ps = logp[s].item() ++ for prefix, (pb, pnb) in cur_hyps: ++ last = prefix[-1] if len(prefix) > 0 else None ++ if s == 0: # blank ++ n_pb, n_pnb = next_hyps[prefix] ++ n_pb = log_add([n_pb, pb + ps, pnb + ps]) ++ next_hyps[prefix] = (n_pb, n_pnb) ++ elif s == last: ++ # Update *ss -> *s; ++ n_pb, n_pnb = next_hyps[prefix] ++ n_pnb = log_add([n_pnb, pnb + ps]) ++ next_hyps[prefix] = (n_pb, n_pnb) ++ # Update *s-s -> *ss, - is for blank ++ n_prefix = prefix + (s, ) ++ n_pb, n_pnb = next_hyps[n_prefix] ++ n_pnb = log_add([n_pnb, pb + ps]) ++ next_hyps[n_prefix] = (n_pb, n_pnb) ++ else: ++ n_prefix = prefix + (s, ) ++ n_pb, n_pnb = next_hyps[n_prefix] ++ n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) ++ next_hyps[n_prefix] = (n_pb, n_pnb) ++ ++ # 2.2 Second beam prune ++ next_hyps = sorted(next_hyps.items(), ++ key=lambda x: log_add(list(x[1])), ++ reverse=True) ++ cur_hyps = next_hyps[:beam_size] ++ hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] ++ ++ assert len(hyps) == beam_size ++ hyps_pad = pad_sequence([ ++ torch.tensor(hyp[0], device=device, dtype=torch.long) ++ for hyp in hyps ++ ], True, self.ignore_id) # (beam_size, max_hyps_len) ++ ori_hyps_pad = hyps_pad ++ hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], ++ device=device, ++ dtype=torch.long) # (beam_size,) ++ hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) ++ hyps_lens = hyps_lens + 1 # Add at begining ++ encoder_out = encoder_out.repeat(beam_size, 1, 1) ++ encoder_mask = torch.ones(beam_size, ++ 1, ++ encoder_out.size(1), ++ dtype=torch.bool, ++ device=device) ++ # used for right to left decoder ++ r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) ++ r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, ++ self.ignore_id) ++ y, exe_time = decoder_model([encoder_out.numpy(), encoder_mask.numpy(), hyps_pad.numpy(), hyps_lens.numpy().astype("int32"), ++ r_hyps_pad.numpy()]) # (beam_size, max_hyps_len, vocab_size) ++ decoder_out, r_decoder_out = torch.from_numpy(y[0]), torch.from_numpy(y[1]) ++ decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) ++ decoder_out = decoder_out.cpu().numpy() ++ # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a ++ # conventional transformer decoder. 
++ r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) ++ r_decoder_out = r_decoder_out.cpu().numpy() ++ # Only use decoder score for rescoring ++ best_score = -float('inf') ++ best_index = 0 ++ for i, hyp in enumerate(hyps): ++ score = 0.0 ++ for j, w in enumerate(hyp[0]): ++ score += decoder_out[i][j][w] ++ score += decoder_out[i][len(hyp[0])][self.eos] ++ # add right to left decoder score ++ if reverse_weight > 0: ++ r_score = 0.0 ++ for j, w in enumerate(hyp[0]): ++ r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] ++ r_score += r_decoder_out[i][len(hyp[0])][self.eos] ++ score = score * (1 - reverse_weight) + r_score * reverse_weight ++ # add ctc score ++ score += hyp[1] * ctc_weight ++ if score > best_score: ++ best_score = score ++ best_index = i ++ return hyps[best_index][0], exe_time ++ + def attention_rescoring( + self, + speech: torch.Tensor, diff --git a/ACL_PyTorch/contrib/audio/WeNet/infer.py b/ACL_PyTorch/contrib/audio/WeNet/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..95945eb542dcaba0b22d194f6f8421d7a7af1ff1 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/infer.py @@ -0,0 +1,24 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +t1 = {} +with open("t1.json", 'r') as load_f: + t1 = json.load(load_f) + +t2 = {} +with open("t2.json", 'r') as load_f: + t2 = json.load(load_f) + +perf = t1["t1"] + t2["t2"] +print("fps:", 1000 / perf) diff --git a/ACL_PyTorch/contrib/audio/WeNet/no_flash_encoder.sh b/ACL_PyTorch/contrib/audio/WeNet/no_flash_encoder.sh new file mode 100644 index 0000000000000000000000000000000000000000..34cac6e77703a78b02aae672e75a533b9b7b2c3f --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/no_flash_encoder.sh @@ -0,0 +1,11 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} +atc --model=no_flash_encoder_revise.onnx --framework=5 --output=no_flash_encoder_revise --input_format=ND --input_shape_range="xs_input:[1,-1,80];xs_input_lens:[-1]" --log=error --soc_version=Ascend310 + diff --git a/ACL_PyTorch/contrib/audio/WeNet/process_encoder_data_noflash.py b/ACL_PyTorch/contrib/audio/WeNet/process_encoder_data_noflash.py new file mode 100644 index 0000000000000000000000000000000000000000..709d6f199db3fd81d919030a6bccd2f85a6e35b7 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/process_encoder_data_noflash.py @@ -0,0 +1,209 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ + + +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import AudioDataset, CollateFunc +from wenet.transformer.asr_model import init_asr_model +from wenet.utils.checkpoint import load_checkpoint +#from wenet.transformer.acl_init import decoder_model, device_id +import json +import os +import acl +from wenet.transformer.acl_net import Net +def dic2json(input_dict, json_path): + json_str = json.dumps(input_dict) + with open(json_path, 'a') as json_file: + json_file.write(json_str) +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--bin_path', type=str, default="./encoder_data_noflash", help='encoder bin images dir') + parser.add_argument('--model_path', type=str, default="no_flash_encoder_revise.om", help='encoder bin images dir') + parser.add_argument('--json_path', type=str, default="encoder_noflash_all.json", help='encoder bin images dir') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for attention rescoring decode mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + args = parser.parse_args() + print(args) + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + #init acl + ret = acl.init() + device_id = 0 + #check_ret('acl.init', ret) + ret = acl.rt.set_device(device_id) + #check_ret('acl.rt.set_device', ret) + context, ret = acl.rt.create_context(device_id) + #check_ret('acl.rt.create_context', ret) + + decoder_output_data_shape = 42330000 + encoder_model_noflash = Net( + model_path=args.model_path, + output_data_shape=decoder_output_data_shape, + device_id=device_id, ) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + raw_wav = configs['raw_wav'] + # Init dataset and data loader + # Init dataset and data loader + test_collate_conf = copy.deepcopy(configs['collate_conf']) + test_collate_conf['spec_aug'] = False + test_collate_conf['spec_sub'] = False + test_collate_conf['feature_dither'] = False + test_collate_conf['speed_perturb'] = False + if raw_wav: + test_collate_conf['wav_distortion_conf']['wav_distortion_rate'] = 0 + test_collate_func = CollateFunc(**test_collate_conf, raw_wav=raw_wav) + dataset_conf = configs.get('dataset_conf', {}) + dataset_conf['batch_size'] = args.batch_size + dataset_conf['batch_type'] = 'static' + dataset_conf['sort'] = False + test_dataset = AudioDataset(args.test_data, + **dataset_conf, + raw_wav=raw_wav) + test_data_loader = DataLoader(test_dataset, + collate_fn=test_collate_func, + shuffle=False, + batch_size=1, + num_workers=0) + + # Init asr model from configs + model = init_asr_model(configs) + + # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + + model.eval() + + #init acl + if os.path.exists(args.json_path): + os.remove(args.json_path) + total_t = 0 + encoder_dic = {} + import time + for batch_idx, batch in enumerate(test_data_loader): + print("batch_idx", batch_idx) + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + target_lengths = target_lengths.to(device) + assert (feats.size(0) == 1) + encoder_out, encoder_mask, exe_time = model.get_no_flash_encoder_out( + encoder_model_noflash, + batch_idx, + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight) + total_t += exe_time + encoder_dic["encoder_out_"+ str(batch_idx)] = [encoder_out.shape[0], encoder_out.shape[1],encoder_out.shape[2]] + encoder_dic["encoder_mask_"+ str(batch_idx)] = [encoder_mask.shape[0], encoder_mask.shape[1],encoder_mask.shape[2]] + 
encoder_out.numpy().tofile(os.path.join(args.bin_path, "encoder_out_{}.bin".format(batch_idx))) + encoder_mask.numpy().tofile(os.path.join(args.bin_path, "encoder_mask_{}.bin".format(batch_idx))) + ave_t = total_t / (batch_idx + 1) + dic_perf = {} + dic_perf["t1"] = ave_t + dic2json(dic_perf, "t1.json") + dic2json(encoder_dic, args.json_path) + diff --git a/ACL_PyTorch/contrib/audio/WeNet/recognize.py b/ACL_PyTorch/contrib/audio/WeNet/recognize.py new file mode 100644 index 0000000000000000000000000000000000000000..817248c572124388c35f8ca316ec69ed3cda4fd6 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/recognize.py @@ -0,0 +1,204 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import AudioDataset, CollateFunc +from wenet.transformer.asr_model import init_asr_model +from wenet.utils.checkpoint import load_checkpoint + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--mode', + choices=[ + 'attention', 'ctc_greedy_search', + 'ctc_prefix_beam_search', 'attention_rescoring' + ], + default='attention', + help='decoding mode') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for attention rescoring decode mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + args = parser.parse_args() + print(args) + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' + ] and args.batch_size > 1: + logging.fatal( + 'decoding mode {} must be running with batch_size == 1'.format( + args.mode)) + sys.exit(1) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + raw_wav = configs['raw_wav'] + # Init dataset and data loader + # Init dataset and data loader + test_collate_conf = copy.deepcopy(configs['collate_conf']) + test_collate_conf['spec_aug'] = False + test_collate_conf['spec_sub'] = False + test_collate_conf['feature_dither'] = False + test_collate_conf['speed_perturb'] = False + if raw_wav: + test_collate_conf['wav_distortion_conf']['wav_distortion_rate'] = 0 + test_collate_func = CollateFunc(**test_collate_conf, raw_wav=raw_wav) + dataset_conf = configs.get('dataset_conf', {}) + dataset_conf['batch_size'] = args.batch_size + dataset_conf['batch_type'] = 'static' + dataset_conf['sort'] = False + test_dataset = AudioDataset(args.test_data, + **dataset_conf, + raw_wav=raw_wav) + test_data_loader = DataLoader(test_dataset, + collate_fn=test_collate_func, + shuffle=False, + batch_size=1, + num_workers=0) + + # Init asr model from configs + model = init_asr_model(configs) + + # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + + model.eval() + total_t = 0 + with torch.no_grad(), open(args.result_file, 'w') as fout: + for batch_idx, batch in enumerate(test_data_loader): + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + target_lengths = target_lengths.to(device) + if args.mode == 'attention': + hyps = model.recognize( + feats, + feats_lengths, + beam_size=args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + hyps = [hyp.tolist() for hyp in hyps] + elif args.mode == 'ctc_greedy_search': + hyps = model.ctc_greedy_search( + feats, + feats_lengths, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + # ctc_prefix_beam_search and attention_rescoring only return one + # result in List[int], change it to List[List[int]] for compatible + # with other batch decoding mode + elif args.mode == 'ctc_prefix_beam_search': + assert (feats.size(0) == 1) + hyp = model.ctc_prefix_beam_search( + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + 
num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + hyps = [hyp] + elif args.mode == 'attention_rescoring': + assert (feats.size(0) == 1) + hyp, exe_t = model.attention_rescoring( + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight) + hyps = [hyp] + total_t += exe_t + print(exe_t) + for i, key in enumerate(keys): + content = '' + for w in hyps[i]: + if w == eos: + break + content += char_dict[w] + logging.info('{} {}'.format(key, content)) + fout.write('{} {}\n'.format(key, content)) + print("mean_fps: ", 1/(total_t/(batch_idx+1))) + print("mean_time: ", total_t/(batch_idx+1)) + fout.write("mean_time: "+str(total_t/(batch_idx+1))) + fout.write("mean_fps: "+str(1/(total_t/(batch_idx+1)))) diff --git a/ACL_PyTorch/contrib/audio/WeNet/recognize_attenstion_rescoring.py b/ACL_PyTorch/contrib/audio/WeNet/recognize_attenstion_rescoring.py new file mode 100644 index 0000000000000000000000000000000000000000..fb63a5e9cbb05e01d1beecb5e9d111e664f2c9d1 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/recognize_attenstion_rescoring.py @@ -0,0 +1,205 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ============================================================================ + + + + +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import AudioDataset, CollateFunc +from wenet.transformer.asr_model import init_asr_model +from wenet.utils.checkpoint import load_checkpoint +import acl +from wenet.transformer.acl_net import Net +import json +import os + +def dic2json(input_dict, json_path): + json_str = json.dumps(input_dict) + with open(json_path, 'a') as json_file: + json_file.write(json_str) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--bin_path', type=str, default="./encoder_data_noflash", help='encoder bin images dir') + parser.add_argument('--model_path', type=str, default="./onnx/decoder_final.om", help='encoder bin images dir') + parser.add_argument('--json_path', type=str, default="encoder_noflash_all.json", help='encoder bin images dir') + parser.add_argument('--penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for attention rescoring decode mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + args = parser.parse_args() + print(args) + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + raw_wav = configs['raw_wav'] + test_collate_conf = copy.deepcopy(configs['collate_conf']) + test_collate_conf['spec_aug'] = False + test_collate_conf['spec_sub'] = False + test_collate_conf['feature_dither'] = False + test_collate_conf['speed_perturb'] = False + if raw_wav: + test_collate_conf['wav_distortion_conf']['wav_distortion_rate'] = 0 + test_collate_func = CollateFunc(**test_collate_conf, raw_wav=raw_wav) + dataset_conf = configs.get('dataset_conf', {}) + dataset_conf['batch_size'] = args.batch_size + dataset_conf['batch_type'] = 'static' + dataset_conf['sort'] = False + test_dataset = AudioDataset(args.test_data, + **dataset_conf, + raw_wav=raw_wav) + test_data_loader = DataLoader(test_dataset, + collate_fn=test_collate_func, + shuffle=False, + batch_size=1, + num_workers=0) + + # Init asr model from configs + model = init_asr_model(configs) + # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + + model.eval() + total_t = 0 + #init acl + ret = acl.init() + device_id = 0 + ret = acl.rt.set_device(device_id) + context, ret = acl.rt.create_context(device_id) + decoder_output_data_shape = 42330000 + decoder_model = Net(model_path =args.model_path, output_data_shape = decoder_output_data_shape, device_id = device_id, ) + json_data = {} + with open(args.json_path, 'r') as load_f: + json_data = json.load(load_f) + bin_path = args.bin_path + with torch.no_grad(), open(args.result_file, 'w') as fout: + for batch_idx, batch in enumerate(test_data_loader): + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + target_lengths = target_lengths.to(device) + assert (feats.size(0) == 1) + hyp, exe_time = model.get_wer( + batch_idx, + bin_path, + json_data, + decoder_model, + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight) + total_t += exe_time + hyps = [hyp] + for i, key in enumerate(keys): + content = '' + for w in hyps[i]: + if w == eos: + break + content += char_dict[w] + logging.info('{} {}'.format(key, content)) + fout.write('{} {}\n'.format(key, content)) + ave_t = total_t / (batch_idx + 1) + dic_perf = {} + dic_perf["t2"] = ave_t + if "no" in args.bin_path: + dic2json(dic_perf, "t2.json") diff --git a/ACL_PyTorch/contrib/audio/WeNet/requirements.txt 
b/ACL_PyTorch/contrib/audio/WeNet/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4e8c3570cf18cb61706d5dcf65e1c0e918168bc --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/requirements.txt @@ -0,0 +1,9 @@ +torch==1.9.0 +onnx==1.10.0 +onnxruntime==1.8.1 +torchaudio==0.9.0 +sympy +pyyaml +decorator +typeguard +pillow \ No newline at end of file diff --git a/ACL_PyTorch/contrib/audio/WeNet/run_attention_rescoring.sh b/ACL_PyTorch/contrib/audio/WeNet/run_attention_rescoring.sh new file mode 100644 index 0000000000000000000000000000000000000000..521b5820d9f36422682f15a1a6f86a20a7a89934 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/run_attention_rescoring.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. ./path.sh || exit 1; + +# Use this to control how many gpu you use, It's 1-gpu training if you specify +# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl +# communication. More details can be found in +# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html +# export NCCL_SOCKET_IFNAME=ens4f1 +export NCCL_DEBUG=INFO +stage=0 # start from 0 if you need to start from data preparation +stop_stage=6 +# The num of nodes or machines used for multi-machine training +# Default 1 for single machine/node +# NFS will be needed if you want run multi-machine training +num_nodes=1 +# The rank of each node or machine, range from 0 to num_nodes -1 +# The first node/machine sets node_rank 0, the second one sets node_rank 1 +# the third one set node_rank 2, and so on. Default 0 +node_rank=0 +# data +data=/export/data/asr-data/OpenSLR/33/ +data_url=www.openslr.org/resources/33 + +nj=16 +feat_dir=raw_wav +dict=data/dict/lang_char.txt + +train_set=train +# Optional train_config +# 1. conf/train_transformer.yaml: Standard transformer +# 2. conf/train_conformer.yaml: Standard conformer +# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer +# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer +# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding +# 6. conf/train_u2++_conformer.yaml: U2++ conformer +# 7. conf/train_u2++_transformer.yaml: U2++ transformer +train_config=conf/train_conformer.yaml +cmvn=true +dir=exp/conformer +checkpoint= + +# use average_checkpoint will get better result +average_checkpoint=false +decode_checkpoint=$dir/final.pt +average_num=30 +decode_modes="attention_rescoring" + +. 
tools/parse_options.sh || exit 1; + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Test model, please specify the model you want to test by --checkpoint + if [ ${average_checkpoint} == true ]; then + decode_checkpoint=$dir/avg_${average_num}.pt + echo "do model average and final checkpoint is $decode_checkpoint" + python3 wenet/bin/average_model.py \ + --dst_model $decode_checkpoint \ + --src_path $dir \ + --num ${average_num} \ + --val_best + fi + # Specify decoding_chunk_size if it's a unified dynamic chunk trained model + # -1 for full chunk + decoding_chunk_size= + ctc_weight=0.3 + reverse_weight=0.3 + for mode in ${decode_modes}; do + { + test_dir=$dir/test_${mode} + mkdir -p $test_dir + python3 wenet/bin/recognize_attenstion_rescoring.py --gpu -1 \ + --config $dir/train.yaml \ + --test_data $feat_dir/test/format.data \ + --checkpoint $decode_checkpoint \ + --beam_size 10 \ + --batch_size 1 \ + --penalty 0.0 \ + --dict $dict \ + --ctc_weight $ctc_weight \ + --reverse_weight $reverse_weight \ + --result_file $test_dir/text \ + --simulate_streaming \ + --decoding_chunk_size 1 + ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} + python3 tools/compute-wer.py --char=1 --v=1 \ + $feat_dir/test/text $test_dir/text > $test_dir/wer + } & + done + wait + +fi + diff --git a/ACL_PyTorch/contrib/audio/WeNet/run_no_flash_encoder_out.sh b/ACL_PyTorch/contrib/audio/WeNet/run_no_flash_encoder_out.sh new file mode 100644 index 0000000000000000000000000000000000000000..d41edbd972b4bcfca7d7f89375a30634a03ead30 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/run_no_flash_encoder_out.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. ./path.sh || exit 1; + +# Use this to control how many gpu you use, It's 1-gpu training if you specify +# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl +# communication. More details can be found in +# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html +# export NCCL_SOCKET_IFNAME=ens4f1 +export NCCL_DEBUG=INFO +stage=0 # start from 0 if you need to start from data preparation +stop_stage=6 +# The num of nodes or machines used for multi-machine training +# Default 1 for single machine/node +# NFS will be needed if you want run multi-machine training +num_nodes=1 +# The rank of each node or machine, range from 0 to num_nodes -1 +# The first node/machine sets node_rank 0, the second one sets node_rank 1 +# the third one set node_rank 2, and so on. Default 0 +node_rank=0 +# data +data=/export/data/asr-data/OpenSLR/33/ +data_url=www.openslr.org/resources/33 + +nj=16 +feat_dir=raw_wav +dict=data/dict/lang_char.txt + +train_set=train +# Optional train_config +# 1. conf/train_transformer.yaml: Standard transformer +# 2. conf/train_conformer.yaml: Standard conformer +# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer +# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer +# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding +# 6. conf/train_u2++_conformer.yaml: U2++ conformer +# 7. 
conf/train_u2++_transformer.yaml: U2++ transformer +train_config=conf/train_conformer.yaml +cmvn=true +dir=exp/conformer +checkpoint= + +# use average_checkpoint will get better result +average_checkpoint=false +decode_checkpoint=$dir/final.pt +average_num=30 +decode_modes="attention_rescoring" + +. tools/parse_options.sh || exit 1; + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Test model, please specify the model you want to test by --checkpoint + if [ ${average_checkpoint} == true ]; then + decode_checkpoint=$dir/avg_${average_num}.pt + echo "do model average and final checkpoint is $decode_checkpoint" + python3 wenet/bin/average_model.py \ + --dst_model $decode_checkpoint \ + --src_path $dir \ + --num ${average_num} \ + --val_best + fi + # Specify decoding_chunk_size if it's a unified dynamic chunk trained model + # -1 for full chunk + decoding_chunk_size= + ctc_weight=0.3 + reverse_weight=0.3 + + test_dir=$dir/test_${mode} + mkdir -p $test_dir + python3 wenet/bin/process_encoder_data_noflash.py --gpu -1 \ + --config $dir/train.yaml \ + --test_data $feat_dir/test/format.data \ + --checkpoint $decode_checkpoint \ + --beam_size 10 \ + --batch_size 1 \ + --penalty 0.0 \ + --dict $dict \ + --ctc_weight $ctc_weight \ + --reverse_weight $reverse_weight \ + --result_file $test_dir/text \ + --decoding_chunk_size 1 + ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} +fi + diff --git a/ACL_PyTorch/contrib/audio/WeNet/run_static.sh b/ACL_PyTorch/contrib/audio/WeNet/run_static.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0c967ee8e3eac1042150ba0f4b4f498bf7380a6 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/run_static.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. ./path.sh || exit 1; + +# Use this to control how many gpu you use, It's 1-gpu training if you specify +# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl +# communication. More details can be found in +# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html +# export NCCL_SOCKET_IFNAME=ens4f1 +export NCCL_DEBUG=INFO +stage=5 # start from 0 if you need to start from data preparation +stop_stage=5 +# The num of nodes or machines used for multi-machine training +# Default 1 for single machine/node +# NFS will be needed if you want run multi-machine training +num_nodes=1 +# The rank of each node or machine, range from 0 to num_nodes -1 +# The first node/machine sets node_rank 0, the second one sets node_rank 1 +# the third one set node_rank 2, and so on. Default 0 +node_rank=0 +# data +data=/export/data/asr-data/OpenSLR/33/ +data_url=www.openslr.org/resources/33 + +nj=16 +feat_dir=raw_wav +dict=data/dict/lang_char.txt + +train_set=train +# Optional train_config +# 1. conf/train_transformer.yaml: Standard transformer +# 2. conf/train_conformer.yaml: Standard conformer +# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer +# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer +# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding +# 6. conf/train_u2++_conformer.yaml: U2++ conformer +# 7. 
conf/train_u2++_transformer.yaml: U2++ transformer +train_config=conf/train_conformer.yaml +cmvn=true +dir=exp/conformer +checkpoint= + +# use average_checkpoint will get better result +average_checkpoint=false +decode_checkpoint=$dir/final.pt +average_num=30 +decode_modes="attention_rescoring" + +. tools/parse_options.sh || exit 1; + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Test model, please specify the model you want to test by --checkpoint + if [ ${average_checkpoint} == true ]; then + decode_checkpoint=$dir/avg_${average_num}.pt + echo "do model average and final checkpoint is $decode_checkpoint" + python3 wenet/bin/average_model.py \ + --dst_model $decode_checkpoint \ + --src_path $dir \ + --num ${average_num} \ + --val_best + fi + # Specify decoding_chunk_size if it's a unified dynamic chunk trained model + # -1 for full chunk + decoding_chunk_size= + ctc_weight=0.5 + reverse_weight=0.0 + for mode in ${decode_modes}; do + { + test_dir=$dir/test_${mode} + mkdir -p $test_dir + python3 wenet/bin/static.py --gpu -1 \ + --mode $mode \ + --config $dir/train.yaml \ + --test_data $feat_dir/test/format.data \ + --checkpoint $decode_checkpoint \ + --beam_size 10 \ + --batch_size 1 \ + --penalty 0.0 \ + --dict $dict \ + --ctc_weight $ctc_weight \ + --reverse_weight $reverse_weight \ + --result_file $test_dir/text \ + ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} + python3 tools/compute-wer.py --char=1 --v=1 \ + $feat_dir/test/text $test_dir/text > $test_dir/wer + } & + done + wait + +fi + diff --git a/ACL_PyTorch/contrib/audio/WeNet/slice_helper.py b/ACL_PyTorch/contrib/audio/WeNet/slice_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..62a92e1b56173db2aae1acad8a6cdf34b532d6d7 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/slice_helper.py @@ -0,0 +1,68 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ============================================================================ + +import torch + +@torch.jit.script +def slice_helper(x, offset): + return x[:, -offset: , : ] + +@torch.jit.script +def slice_helper2(x: torch.Tensor, start: torch.Tensor, end: torch.Tensor): + start = start.long() + end = end.long() + return x[:, start:end] + +@torch.jit.script +def slice_helper3(x, start): + return x[:, start:] + +@torch.jit.script +def get_item(x): + item = x.detach().item() + output = torch.tensor(item) + return output + +@torch.jit.script +def get_next_cache_start(required_cache_size: torch.Tensor, xs: torch.Tensor): + # required_cache_size = required_cache_size_tensor.detach().item() + next_cache_start = 0 + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = xs.size(1) + else: + if xs.size(1) - required_cache_size < 0: + next_cache_start = 0 + else: + next_cache_start = xs.size(1) - required_cache_size + return torch.tensor(next_cache_start, dtype=torch.int64) diff --git a/ACL_PyTorch/contrib/audio/WeNet/static.py b/ACL_PyTorch/contrib/audio/WeNet/static.py new file mode 100644 index 0000000000000000000000000000000000000000..edb1f67aeab88e12a7f28293bd069ac2de92664d --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/static.py @@ -0,0 +1,201 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
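Note before the body of static.py below: the @torch.jit.script helpers in slice_helper.py above replace Python-int slicing and .item() calls with tensor-valued indexing, so the encoder's cache-trimming logic stays inside the scripted graph instead of being folded into constants at export time. The snippet below is only a behavioural sketch of those helpers on dummy tensors; the `from slice_helper import ...` path is an assumption (adjust it to wherever slice_helper.py is placed), and the shapes in the comments are purely illustrative.

```
# Minimal sketch (not part of the patch): exercising the TorchScript helpers
# from slice_helper.py on dummy tensors. Assumes slice_helper.py is on the
# import path; adjust the import to wherever the file is placed.
import torch

from slice_helper import slice_helper, slice_helper2, get_next_cache_start

x = torch.arange(24, dtype=torch.float32).reshape(1, 6, 4)  # (batch, time, dim)

# Keep only the last `offset` frames along the time axis.
print(slice_helper(x, torch.tensor(2)).shape)  # torch.Size([1, 2, 4])

# Slice [start, end) along dim 1 with tensor-valued bounds.
print(slice_helper2(x, torch.tensor(1), torch.tensor(4)).shape)  # torch.Size([1, 3, 4])

# Where the next cache window starts for a required cache size of 3 frames.
print(get_next_cache_start(torch.tensor(3), x))  # tensor(3)
```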
+ +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import AudioDataset, CollateFunc +from wenet.transformer.asr_model import init_asr_model +from wenet.utils.checkpoint import load_checkpoint + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--mode', + choices=[ + 'attention', 'ctc_greedy_search', + 'ctc_prefix_beam_search', 'attention_rescoring' + ], + default='attention', + help='decoding mode') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for attention rescoring decode mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + args = parser.parse_args() + print(args) + total_t = 0 + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' + ] and args.batch_size > 1: + logging.fatal( + 'decoding mode {} must be running with batch_size == 1'.format( + args.mode)) + sys.exit(1) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + raw_wav = configs['raw_wav'] + # Init dataset and data loader + # Init dataset and data loader + test_collate_conf = copy.deepcopy(configs['collate_conf']) + test_collate_conf['spec_aug'] = False + test_collate_conf['spec_sub'] = False + test_collate_conf['feature_dither'] = False + test_collate_conf['speed_perturb'] = False + if raw_wav: + test_collate_conf['wav_distortion_conf']['wav_distortion_rate'] = 0 + test_collate_func = CollateFunc(**test_collate_conf, raw_wav=raw_wav) + dataset_conf = configs.get('dataset_conf', {}) + dataset_conf['batch_size'] = args.batch_size + dataset_conf['batch_type'] = 'static' + dataset_conf['sort'] = False + test_dataset = AudioDataset(args.test_data, + **dataset_conf, + raw_wav=raw_wav) + test_data_loader = DataLoader(test_dataset, + collate_fn=test_collate_func, + shuffle=False, + batch_size=1, + num_workers=0) + + # Init asr model from configs + model = init_asr_model(configs) + 
+ # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + + model.eval() + with torch.no_grad(), open(args.result_file, 'w') as fout: + for batch_idx, batch in enumerate(test_data_loader): + print("batch_idx:", batch_idx) + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + target_lengths = target_lengths.to(device) + if args.mode == 'attention': + hyps = model.recognize( + feats, + feats_lengths, + beam_size=args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + hyps = [hyp.tolist() for hyp in hyps] + elif args.mode == 'ctc_greedy_search': + hyps = model.ctc_greedy_search( + feats, + feats_lengths, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + # ctc_prefix_beam_search and attention_rescoring only return one + # result in List[int], change it to List[List[int]] for compatible + # with other batch decoding mode + elif args.mode == 'ctc_prefix_beam_search': + assert (feats.size(0) == 1) + hyp = model.ctc_prefix_beam_search( + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + hyps = [hyp] + elif args.mode == 'attention_rescoring': + assert (feats.size(0) == 1) + hyp = model.attention_rescoring( + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight) + total_t += hyp[1] + hyps = [hyp] + for i, key in enumerate(keys): + content = '' + for w in hyps[i][0]: + if w == eos: + break + content += char_dict[w] + logging.info('{} {}'.format(key, content)) + fout.write('{} {}\n'.format(key, content)) + fout.write('FPS:{}\n'.format(1000/(total_t/(batch_idx+1)))) diff --git a/ACL_PyTorch/contrib/audio/WeNet/static_decoder.sh b/ACL_PyTorch/contrib/audio/WeNet/static_decoder.sh new file mode 100644 index 0000000000000000000000000000000000000000..75e40f6223aa7694cc0b052f1230645a2d6b0c8a --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/static_decoder.sh @@ -0,0 +1,22 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export 
ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} +atc --model=decoder_final.onnx --framework=5 --output=decoder_fendang --input_format=ND \ +--input_shape="memory:10,-1,256;memory_mask:10,1,-1;ys_in_pad:10,-1;ys_in_lens:10;r_ys_in_pad:10,-1" --log=error \ +--dynamic_dims="96,96,3,3;96,96,4,4;96,96,5,5;96,96,6,6;96,96,7,7;96,96,8,8;96,96,9,9;96,96,10,10;96,96,11,11;\ +96,96,12,12;96,96,13,13;96,96,14,14;96,96,15,15;96,96,16,16;96,96,17,17;96,96,18,18;96,96,19,19;96,96,20,20;\ +96,96,21,21;96,96,22,22;96,96,23,23;144,144,6,6;144,144,7,7;144,144,8,8;144,144,9,9;144,144,10,10;144,144,11,11;\ +144,144,12,12;144,144,13,13;144,144,14,14;144,144,15,15;144,144,16,16;144,144,17,17;144,144,18,18;144,144,19,19;\ +144,144,20,20;144,144,21,21;144,144,22,22;144,144,23,23;144,144,24,24;144,144,25,25;144,144,26,26;144,144,27,27;\ +144,144,28,28;384,384,9,9;384,384,10,10;384,384,11,11;384,384,12,12;384,384,13,13;384,384,14,14;384,384,15,15;\ +384,384,16,16;384,384,17,17;384,384,18,18;384,384,19,19;384,384,20,20;384,384,21,21;384,384,22,22;384,384,23,23;\ +384,384,24,24;384,384,25,25;384,384,26,26;384,384,27,27;384,384,28,28;384,384,29,29;384,384,30,30;384,384,31,31;\ +384,384,32,32;384,384,33,33;384,384,34,34;384,384,35,35;384,384,36,36;384,384,37,37;384,384,38,38;384,384,39,39;384,384,40,40;384,384,41,41;" \ +--soc_version=Ascend310 + diff --git a/ACL_PyTorch/contrib/audio/WeNet/static_encoder.sh b/ACL_PyTorch/contrib/audio/WeNet/static_encoder.sh new file mode 100644 index 0000000000000000000000000000000000000000..7e409da26b160d6be35174e81e334269b0d531fb --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/static_encoder.sh @@ -0,0 +1,13 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} +atc --model=no_flash_encoder_revise.onnx --framework=5 --output=encoder_fendang_262_1478_static --input_format=ND \ +--input_shape="xs_input:1,-1,80;xs_input_lens:1" --log=error \ +--dynamic_dims="262;326;390;454;518;582;646;710;774;838;902;966;1028;1284;1478" \ +--soc_version=Ascend310
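The static encoder OM produced by static_encoder.sh only accepts the time lengths listed in its --dynamic_dims gears (262, 326, ..., 1478 frames for xs_input of shape 1 x T x 80), so the fbank features fed to it have to be padded up to one of those lengths at inference time. The helper below is an illustrative sketch of that padding, not part of the patch; the name pad_to_gear and the choice of the smallest gear that fits are assumptions for demonstration.

```
# Illustrative sketch (not part of the patch): pad (1, T, 80) fbank features
# along the time axis to the smallest gear accepted by the static encoder OM.
import torch
import torch.nn.functional as F

ENCODER_GEARS = [262, 326, 390, 454, 518, 582, 646, 710, 774,
                 838, 902, 966, 1028, 1284, 1478]

def pad_to_gear(feats, gears=ENCODER_GEARS):
    """Zero-pad the time dimension of (1, T, 80) features up to a gear."""
    t = feats.size(1)
    target = next((g for g in gears if g >= t), None)
    if target is None:
        raise ValueError(f"utterance of {t} frames exceeds the largest gear {gears[-1]}")
    # F.pad pads the last dim first: (last lo, last hi, time lo, time hi)
    return F.pad(feats, (0, 0, 0, target - t))

# Example: a 300-frame utterance is padded up to the 326 gear.
feats = torch.randn(1, 300, 80)
print(pad_to_gear(feats).shape)  # torch.Size([1, 326, 80])
```

The decoder OM from static_decoder.sh is gear-based in the same way: its --dynamic_dims pairs three encoder-output lengths (96, 144, 384) with a range of hypothesis lengths, and the leading 10 in its input shapes corresponds to the beam size of 10 used in the run scripts.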