From 7bd35585e52f7fb7c530e40e4c68dd260fbb6e88 Mon Sep 17 00:00:00 2001
From: pengaoao
Date: Wed, 23 Mar 2022 19:40:45 +0800
Subject: [PATCH 1/3] fix

---
 .../audio/Wenet_for_Pytorch/README.md         |  24 +++-
 .../get_no_flash_encoder_out.diff             | 123 ++++++++++++++++--
 .../process_encoder_data_flash.py             |   8 +-
 .../process_encoder_data_noflash.py           |   1 +
 .../recognize_attenstion_rescoring.py         |   1 +
 .../audio/Wenet_for_Pytorch/requirements.txt  |   8 +-
 6 files changed, 142 insertions(+), 23 deletions(-)

diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
index e3d5cafbe8..d316562557 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
@@ -18,13 +18,17 @@ git reset 9c4e305bcc24a06932f6a65c8147429d8406cc63 --hard
 
 3. Download the network weights and export the ONNX models
 
-Download link: http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell/20210601_u2pp_conformer_exp.tar.gz Download the archive, extract it, and put the extracted files under wenet/examples/aishell/s0/exp/conformer_u2; create that folder if it does not exist.
+Download link: http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell/20210601_u2pp_conformer_exp.tar.gz
+
+Download the archive, extract it, and put the extracted files under wenet/examples/aishell/s0/exp/conformer_u2; create that folder if it does not exist.
 
 First, put all of the provided diff files into the wenet root directory and run patch -p1 < export_onnx.diff to adapt the ONNX-export code. Put the provided export_onnx.py, process_encoder_data_flash.py, process_encoder_data_noflash.py, recognize_attenstion_rescoring.py, and static.py files under wenet/wenet/bin/, put the provided slice_helper.py and acl_net.py files under wenet/wenet/transformer, and put the provided sh scripts under wenet/examples/aishell/s0/. Then run bash export_onnx.sh exp/conformer_u2/train.yaml exp/conformer_u2/final.pt; the ONNX files are exported into the onnx folder of the current directory.
 
 4. Run the scripts to convert the ONNX models to OM models
 
-First edit the graphs with the graph-modification tool om_gener (available at https://gitee.com/liurf_hw/om_gener). After installing it, modify the models with the following commands:
+First edit the graphs with the graph-modification tool om_gener (available at https://gitee.com/liurf_hw/om_gener).
+
+After installing it, put the generated ONNX files and the scripts in the same directory, then modify the models with the following commands:
 
 python3 adaptdecoder.py generates decoder_final.onnx
@@ -60,12 +64,16 @@ python3 adaptnoflashencoder.py generates no_flash_encoder_revise.onnx
 
 ```
 git checkout .
-patch –p1 < get_no_flash_encoder_out.diff
+patch -p1 < get_no_flash_encoder_out.diff
 cd examples/aishell/s0/
 bash run_no_flash_encoder_out.sh
 ```
 
-Note for the steps above: in wenet/bin/process_encoder_data_noflash.py, --bin_path, --model_path, and --json_path hold, respectively, the bin files produced by the encoder, the location of the non-streaming encoder OM model, and the shape information of the encoder bin files. To obtain the decoder results in the non-streaming scenario, cd to the wenet root directory:
+Note for the steps above: in wenet/bin/process_encoder_data_noflash.py, --bin_path, --model_path, and --json_path hold, respectively, the bin files produced by the encoder, the location of the non-streaming encoder OM model, and the shape information of the encoder bin files.
+
+
+
+To obtain the decoder results in the non-streaming scenario, cd to the wenet root directory:
 
 ```
 git checkout .
@@ -87,13 +95,15 @@ python3 adaptnoflashencoder.py generates no_flash_encoder_revise.onnx
 bash run_encoder_out.sh
 ```
 
-Note for the steps above: in wenet/bin/process_encoder_data_flash.py, --bin_path and --json_path hold the bin files produced by the encoder and the shape information of the encoder bin files; also update the path of the streaming OM model in the encoder_model parameter of the init function of the BaseEncoder class in wenet/transformer/encoder.py.
+Note for the steps above: in wenet/bin/process_encoder_data_flash.py, --bin_path, --model_path, and --json_path hold, respectively, the bin files produced by the encoder, the model path, and the shape information of the encoder bin files.
+
+
 To obtain the decoder results in the streaming scenario, cd to the wenet root directory:
 
 ```
 git checkout .
-patch –p1 < getwer.diff
+patch -p1 < getwer.diff
 cd examples/aishell/s0/
 bash run_attention_rescoring.sh
 ```
@@ -102,7 +112,7 @@ python3 adaptnoflashencoder.py generates no_flash_encoder_revise.onnx
 
 | Model | Official pth accuracy | 710/310 offline inference accuracy | GPU perf | 710 perf | 310 perf |
 | :---: | :----------------------------: | :-------------------------: | :-----: | :-----: | ------- |
-| wenet | GPU streaming: 5.94%, non-streaming: 4.64% | streaming: 5.66%, non-streaming: 5.66% | 66fps | 7.69 | 11.6fps |
+| wenet | GPU streaming: 5.94%, non-streaming: 4.64% | streaming: 5.66%, non-streaming: 5.66% |  | 7.69 | 11.6fps |
 
 The generated t1.json and t2.json files hold the encoder and decoder latencies, respectively; sum the two by running python3.7.5 infer_perf.py.
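Before the code diffs that follow, it helps to see the calling convention they all share: the patched scripts drive the OM models through the Net wrapper in the provided acl_net.py. The sketch below is illustrative only and not part of the patch; the keyword arguments, the (outputs, execution time) return pair, and the del-based teardown are taken from the diffs in this series, while the model path and input shapes are made-up placeholders.

```python
# Sketch: run the non-streaming encoder OM model through the acl_net.Net
# wrapper, mirroring the init/teardown sequence used by the patched scripts.
import acl
import numpy as np
from wenet.transformer.acl_net import Net

ret = acl.init()                                  # initialize the ACL runtime
device_id = 0
ret = acl.rt.set_device(device_id)
context, ret = acl.rt.create_context(device_id)

encoder_model = Net(model_path="no_flash_encoder_revise.om",  # placeholder path
                    output_data_shape=4233000,    # buffer size from process_encoder_data_flash.py;
                                                  # may differ per model
                    device_id=device_id)

speech = np.random.randn(1, 131, 80).astype(np.float32)  # dummy (B, T, feat) batch
speech_lengths = np.array([131], dtype="int32")

# The wrapper takes a list of input arrays and returns (outputs, exec time).
y, exe_time = encoder_model([speech, speech_lengths])
encoder_out, encoder_mask = y[0], y[1]

del encoder_model                                 # release device buffers, as the scripts do
```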
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/get_no_flash_encoder_out.diff b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/get_no_flash_encoder_out.diff
index b209edb704..66a596cf41 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/get_no_flash_encoder_out.diff
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/get_no_flash_encoder_out.diff
@@ -1,16 +1,31 @@
 diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py
-index 73990fa..e2f3555 100644
+index 73990fa..bb3e10f 100644
 --- a/wenet/transformer/asr_model.py
 +++ b/wenet/transformer/asr_model.py
-@@ -175,6 +175,33 @@ class ASRModel(torch.nn.Module):
-             num_decoding_left_chunks=num_decoding_left_chunks
+@@ -158,6 +158,7 @@ class ASRModel(torch.nn.Module):
+         decoding_chunk_size: int = -1,
+         num_decoding_left_chunks: int = -1,
+         simulate_streaming: bool = False,
++        encoder_model=None
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         # Let's assume B = batch_size
+         # 1. Encoder
+@@ -165,7 +166,8 @@
+             encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk(
+                 speech,
+                 decoding_chunk_size=decoding_chunk_size,
+-                num_decoding_left_chunks=num_decoding_left_chunks
++                num_decoding_left_chunks=num_decoding_left_chunks,
++                encoder_model=encoder_model
             )  # (B, maxlen, encoder_dim)
-         return encoder_out, encoder_mask
-+
-+    def get_no_flash_encoder_out(
+         else:
+             encoder_out, encoder_mask = self.encoder(
+@@ -443,6 +445,57 @@ class ASRModel(torch.nn.Module):
+                                             simulate_streaming)
+         return hyps[0][0]
+ 
++    def get_encoder_flash_data(
 +        self,
-+        encoder_model_noflash,
-+        batch_idx: int,
 +        speech: torch.Tensor,
 +        speech_lengths: torch.Tensor,
 +        beam_size: int,
@@ -19,7 +34,29 @@ index 73990fa..e2f3555 100644
 +        ctc_weight: float = 0.0,
 +        simulate_streaming: bool = False,
 +        reverse_weight: float = 0.0,
++        encoder_model=None
 +    ) -> List[int]:
++        """ Apply attention rescoring decoding: CTC prefix beam search
++        is applied first to get the nbest, then we rescore the nbest on
++        the attention decoder with the corresponding encoder out
++
++        Args:
++            speech (torch.Tensor): (batch, max_len, feat_dim)
++            speech_length (torch.Tensor): (batch, )
++            beam_size (int): beam size for beam search
++            decoding_chunk_size (int): decoding chunk for dynamic chunk
++                trained model.
++                <0: for decoding, use full chunk.
++                >0: for decoding, use fixed chunk size as set.
++                0: used for training, it's prohibited here
++            simulate_streaming (bool): whether to do the encoder forward in a
++                streaming fashion
++            reverse_weight (float): right-to-left decoder weight
++            ctc_weight (float): ctc score weight
++
++        Returns:
++            List[int]: Attention rescoring result
++        """
 +        assert speech.shape[0] == speech_lengths.shape[0]
 +        assert decoding_chunk_size != 0
 +        if reverse_weight > 0.0:
@@ -29,10 +66,68 @@ index 73990fa..e2f3555 100644
 +        batch_size = speech.shape[0]
 +        # For attention rescoring we only support batch_size=1
 +        assert batch_size == 1
-+        y, exe_time = encoder_model_noflash(
-+            [speech.numpy(), speech_lengths.numpy().astype("int32")])  # (beam_size, max_hyps_len, vocab_size)
-+        encoder_out, encoder_mask = torch.from_numpy(y[0]), torch.from_numpy(y[1])
-+        return encoder_out, encoder_mask, exe_time
- 
-     def recognize(
++        # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size
++
++        encoder_out, encoder_mask = self._forward_encoder(
++            speech, speech_lengths, decoding_chunk_size,
++            num_decoding_left_chunks, simulate_streaming,
++            encoder_model=encoder_model)  # (B, maxlen, encoder_dim)
++        return encoder_out, encoder_mask
++
++
+     def attention_rescoring(
          self,
+         speech: torch.Tensor,
+diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py
+index e342ed4..21b536a 100644
+--- a/wenet/transformer/encoder.py
++++ b/wenet/transformer/encoder.py
+@@ -26,7 +26,9 @@ from wenet.utils.common import get_activation
+ from wenet.utils.mask import make_pad_mask
+ from wenet.utils.mask import add_optional_chunk_mask
+ 
+-
++import acl
++from wenet.transformer.acl_net import Net
++import numpy as np
+ class BaseEncoder(torch.nn.Module):
+     def __init__(
+         self,
+@@ -254,6 +256,7 @@ class BaseEncoder(torch.nn.Module):
+         xs: torch.Tensor,
+         decoding_chunk_size: int,
+         num_decoding_left_chunks: int = -1,
++        encoder_model=None
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         """ Forward input chunk by chunk with chunk_size like a streaming
+             fashion
+@@ -295,17 +298,22 @@ class BaseEncoder(torch.nn.Module):
+         outputs = []
+         offset = 0
+         required_cache_size = decoding_chunk_size * num_decoding_left_chunks
+-
++        subsampling_cache_om = torch.zeros(1, 1, 256, requires_grad=False)
++        elayers_output_cache_om = torch.zeros(12, 1, 1, 256, requires_grad=False)
++        conformer_cnn_cache_om = torch.zeros(12, 1, 256, 7, requires_grad=False)
+         # Feed forward overlap input step by step
+         for cur in range(0, num_frames - context + 1, stride):
+             end = min(cur + decoding_window, num_frames)
+             chunk_xs = xs[:, cur:end, :]
+-            (y, subsampling_cache, elayers_output_cache,
+-             conformer_cnn_cache) = self.forward_chunk(chunk_xs, offset,
+-                                                       required_cache_size,
+-                                                       subsampling_cache,
+-                                                       elayers_output_cache,
+-                                                       conformer_cnn_cache)
++            if offset > 0:
++                offset = offset - 1
++            offset = offset + 1
++            encoder_output, exe_time = encoder_model(
++                [chunk_xs.cpu().numpy(), np.array(offset), subsampling_cache_om.cpu().numpy(), \
++                 elayers_output_cache_om.cpu().numpy(), conformer_cnn_cache_om.cpu().numpy()])
++            y, subsampling_cache_om, elayers_output_cache_om, conformer_cnn_cache_om = \
++                torch.from_numpy(encoder_output[0][:, 1:, :]), torch.from_numpy(encoder_output[1]), \
++                torch.from_numpy(encoder_output[2]), torch.from_numpy(encoder_output[3])
+             outputs.append(y)
+             offset += y.size(1)
+         ys = torch.cat(outputs, 1)
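The encoder.py hunk above swaps forward_chunk for a call into the streaming OM model, re-feeding three zero-initialized caches on every chunk. The following standalone sketch restates that bookkeeping outside the diff; encoder_model (a Net instance, see the earlier sketch) and chunks are stand-ins, the cache shapes are copied from the hunk, and the offset adjustment plus the [:, 1:, :] slice follow our reading of it (the OM output appears to carry one extra leading frame).

```python
import numpy as np
import torch

# Dummy decoding windows; real ones come from slicing the padded feature
# tensor, as in the hunk above. encoder_model wraps the streaming encoder OM.
chunks = [torch.randn(1, 67, 80) for _ in range(3)]

# Zero-initialized caches, shapes from the hunk: 12 conformer layers,
# 256-dim output, CNN cache width 7.
subsampling_cache = torch.zeros(1, 1, 256)
elayers_output_cache = torch.zeros(12, 1, 1, 256)
conformer_cnn_cache = torch.zeros(12, 1, 256, 7)

outputs, offset = [], 0
for chunk_xs in chunks:
    if offset == 0:          # same net effect as the hunk's decrement/increment pair
        offset = 1
    encoder_output, exe_time = encoder_model(
        [chunk_xs.numpy(), np.array(offset), subsampling_cache.numpy(),
         elayers_output_cache.numpy(), conformer_cnn_cache.numpy()])
    y = torch.from_numpy(encoder_output[0][:, 1:, :])   # drop the leading frame
    subsampling_cache = torch.from_numpy(encoder_output[1])
    elayers_output_cache = torch.from_numpy(encoder_output[2])
    conformer_cnn_cache = torch.from_numpy(encoder_output[3])
    outputs.append(y)
    offset += y.size(1)      # advance by the frames actually emitted
ys = torch.cat(outputs, 1)   # (1, total_frames, 256)
```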
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
index 1626fca9db..a23ad1e129 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
@@ -48,6 +48,7 @@ from wenet.transformer.asr_model import init_asr_model
 from wenet.utils.checkpoint import load_checkpoint
 #from wenet.transformer.acl_init import decoder_model, device_id
 import acl
+from wenet.transformer.acl_net import Net
 import json
 import os
 
@@ -97,6 +98,7 @@ if __name__ == '__main__':
                         action='store_true',
                         help='simulate streaming inference')
     parser.add_argument('--bin_path', type=str, default="./encoder_data", help='encoder bin images dir')
+    parser.add_argument('--model_path', type=str, default="./encoder_revise.om", help='encoder om model path')
     parser.add_argument('--json_path', type=str, default="encoder.json", help='encoder bin images dir')
     parser.add_argument('--reverse_weight',
                         type=float,
@@ -114,6 +116,8 @@ if __name__ == '__main__':
     device_id = 0
     ret = acl.rt.set_device(device_id)
     context, ret = acl.rt.create_context(device_id)
+    output_shape = 4233000
+    encoder_model = Net(model_path=args.model_path, output_data_shape=output_shape, device_id=device_id)
     with open(args.config, 'r') as fin:
         configs = yaml.load(fin, Loader=yaml.FullLoader)
     raw_wav = configs['raw_wav']
@@ -177,12 +181,14 @@ if __name__ == '__main__':
             num_decoding_left_chunks=args.num_decoding_left_chunks,
             ctc_weight=args.ctc_weight,
             simulate_streaming=args.simulate_streaming,
-            reverse_weight=args.reverse_weight)
+            reverse_weight=args.reverse_weight,
+            encoder_model=encoder_model)
         encoder_dic["encoder_out_" + str(batch_idx)] = [encoder_out.shape[0], encoder_out.shape[1], encoder_out.shape[2]]
         encoder_dic["encoder_mask_" + str(batch_idx)] = [encoder_mask.shape[0], encoder_mask.shape[1], encoder_mask.shape[2]]
         encoder_out.numpy().tofile(os.path.join(args.bin_path, "encoder_out_{}.bin".format(batch_idx)))
         encoder_mask.numpy().tofile(os.path.join(args.bin_path, "encoder_mask_{}.bin".format(batch_idx)))
     dic2json(encoder_dic, args.json_path)
+    del encoder_model
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
index 709d6f199d..12a46b08f0 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
@@ -206,4 +206,5 @@ if __name__ == '__main__':
     dic_perf["t1"] = ave_t
     dic2json(dic_perf, "t1.json")
     dic2json(encoder_dic, args.json_path)
+    del encoder_model_noflash
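Both scripts above write the encoder outputs as raw .bin dumps and record their shapes through dic2json, since a flat binary file is not self-describing. A minimal sketch of reading one record back, assuming float32 dumps (torch's default dtype) and the encoder_out_<i>/encoder_mask_<i> key layout the scripts write; the paths match the script defaults:

```python
import json
import os

import numpy as np

bin_path = "./encoder_data"        # matches the --bin_path default
batch_idx = 0

with open("encoder.json") as f:    # matches the --json_path default
    shapes = json.load(f)

# Recover the tensor: the shape triple stored in the JSON is the only thing
# that makes the flat binary dump reloadable downstream.
encoder_out = np.fromfile(
    os.path.join(bin_path, "encoder_out_{}.bin".format(batch_idx)),
    dtype=np.float32).reshape(shapes["encoder_out_{}".format(batch_idx)])
```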
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
index bdc02ad3a4..1c8850d6e7 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
@@ -203,3 +203,4 @@ if __name__ == '__main__':
     dic_perf["t2"] = ave_t
     if "no" in args.bin_path:
         dic2json(dic_perf, "t2.json")
+    del decoder_model
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/requirements.txt
index baf9a5ad64..aff4bc95fc 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/requirements.txt
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/requirements.txt
@@ -4,4 +4,10 @@ onnxruntime==1.8.1
 torchaudio==0.9.0
 numpy==1.18.5
 Pillow==7.2.0
-
+flake8==3.8.2
+Pillow
+pyyaml>=5.1
+sentencepiece
+typeguard
+textgrid
+pytest
-- 
Gitee
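With patch 1 applied, process_encoder_data_noflash.py writes the average encoder time to t1.json and recognize_attenstion_rescoring.py writes the average decoder time to t2.json; the README says to sum the two via infer_perf.py. A sketch of that summation, assuming each file holds the single-entry layout that dic_perf in the diffs suggests:

```python
import json

with open("t1.json") as f:
    t1 = json.load(f)["t1"]   # average encoder time
with open("t2.json") as f:
    t2 = json.load(f)["t2"]   # average decoder time

# End-to-end latency is simply the sum of the two stages.
print("encoder: {:.4f}, decoder: {:.4f}, total: {:.4f}".format(t1, t2, t1 + t2))
```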
From 3c1dbe3bfccd80beebd40663bfd496140e613c79 Mon Sep 17 00:00:00 2001
From: pengaoao
Date: Wed, 23 Mar 2022 19:44:44 +0800
Subject: [PATCH 2/3] fix

---
 .../audio/Wenet_for_Pytorch/process_encoder_data_flash.py   | 2 ++
 .../audio/Wenet_for_Pytorch/process_encoder_data_noflash.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
index a23ad1e129..6d98eb6a3d 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
@@ -111,6 +111,8 @@ if __name__ == '__main__':
                         format='%(asctime)s %(levelname)s %(message)s')
     os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
 
+    if not os.path.exists(args.bin_path):
+        os.mkdir(args.bin_path)
     #init acl
     ret = acl.init()
     device_id = 0
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
index 12a46b08f0..cf295c927b 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
@@ -110,6 +110,8 @@ if __name__ == '__main__':
                         format='%(asctime)s %(levelname)s %(message)s')
     os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
 
+    if not os.path.exists(args.bin_path):
+        os.mkdir(args.bin_path)
     #init acl
     ret = acl.init()
     device_id = 0
-- 
Gitee

From 82bca5995a805c672c1f29cb6ff64ed305f524d2 Mon Sep 17 00:00:00 2001
From: pengaoao
Date: Wed, 23 Mar 2022 19:48:16 +0800
Subject: [PATCH 3/3] fix

---
 ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
index d316562557..80803daa07 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
@@ -8,7 +8,9 @@
 pip3.7 install -r requirements.txt
 ```
 
-2. Obtain, modify, and install the open-source model code
+Install any other required packages as needed.
+
+2. Obtain, modify, and install the open-source model code
 
 ```
 git clone https://github.com/wenet-e2e/wenet.git
-- 
Gitee
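A closing note on the directory guard added in patch 2 above: os.path.exists followed by os.mkdir works for the default single-level ./encoder_data, but it races if two runs start together and fails for nested paths. An equivalent, race-free idiom from the standard library, offered here only as a design alternative:

```python
import os

bin_path = "./encoder_data"            # --bin_path default from the scripts
os.makedirs(bin_path, exist_ok=True)   # no exists()/mkdir() race, handles nesting
```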