From 7bd35585e52f7fb7c530e40e4c68dd260fbb6e88 Mon Sep 17 00:00:00 2001
From: pengaoao
Date: Wed, 23 Mar 2022 19:40:45 +0800
Subject: [PATCH 1/3] fix

---
 .../audio/Wenet_for_Pytorch/README.md         |  24 +++-
 .../get_no_flash_encoder_out.diff             | 123 ++++++++++++++++--
 .../process_encoder_data_flash.py             |   8 +-
 .../process_encoder_data_noflash.py           |   1 +
 .../recognize_attenstion_rescoring.py         |   1 +
 .../audio/Wenet_for_Pytorch/requirements.txt  |   8 +-
 6 files changed, 142 insertions(+), 23 deletions(-)

diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
index e3d5cafbe8..d316562557 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
@@ -18,13 +18,17 @@ git reset 9c4e305bcc24a06932f6a65c8147429d8406cc63 --hard
 
 3. Download the network weights and export the ONNX models
 
-Download link: http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell/20210601_u2pp_conformer_exp.tar.gz Download the archive, extract it, and put the extracted files under wenet/examples/aishell/s0/exp/conformer_u2; create that folder if it does not exist.
+Download link: http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell/20210601_u2pp_conformer_exp.tar.gz
+
+Download the archive, extract it, and put the extracted files under wenet/examples/aishell/s0/exp/conformer_u2; create that folder if it does not exist.
 
 First, put all of the provided diff files into the wenet root directory and run patch -p1 < export_onnx.diff to adapt the ONNX-export code. Put the provided export_onnx.py, process_encoder_data_flash.py, process_encoder_data_noflash.py, recognize_attenstion_rescoring.py, and static.py files under wenet/wenet/bin/, put the provided slice_helper.py and acl_net.py files under wenet/wenet/transformer, and put the provided sh scripts under wenet/examples/aishell/s0/. Then run bash export_onnx.sh exp/conformer_u2/train.yaml exp/conformer_u2/final.pt; the ONNX files are exported into the onnx folder of the current directory.
 
 4. Run the scripts to convert the ONNX models to OM models
 
-First edit the graphs with the graph-modification tool om_gener (available at https://gitee.com/liurf_hw/om_gener). After installing it, modify the models with the following commands:
+First edit the graphs with the graph-modification tool om_gener (available at https://gitee.com/liurf_hw/om_gener).
+
+After installing it, put the generated ONNX files and the scripts in the same directory, then modify the models with the following commands:
 
 python3 adaptdecoder.py generates decoder_final.onnx
@@ -60,12 +64,16 @@ python3 adaptnoflashencoder.py generates no_flash_encoder_revise.onnx
 
 ```
 git checkout .
-patch –p1 < get_no_flash_encoder_out.diff
+patch -p1 < get_no_flash_encoder_out.diff
 cd examples/aishell/s0/
 bash run_no_flash_encoder_out.sh
 ```
 
-Note for the steps above: in wenet/bin/process_encoder_data_noflash.py, --bin_path, --model_path, and --json_path hold, respectively, the bin files produced by the encoder, the location of the non-streaming encoder OM model, and the shape information of the encoder bin files. To obtain the decoder results in the non-streaming scenario, cd to the wenet root directory:
+Note for the steps above: in wenet/bin/process_encoder_data_noflash.py, --bin_path, --model_path, and --json_path hold, respectively, the bin files produced by the encoder, the location of the non-streaming encoder OM model, and the shape information of the encoder bin files.
+
+
+
+To obtain the decoder results in the non-streaming scenario, cd to the wenet root directory:
 
 ```
 git checkout .
@@ -87,13 +95,15 @@ python3 adaptnoflashencoder.py generates no_flash_encoder_revise.onnx
 bash run_encoder_out.sh
 ```
 
-Note for the steps above: in wenet/bin/process_encoder_data_flash.py, --bin_path and --json_path hold the bin files produced by the encoder and the shape information of the encoder bin files; also update the path of the streaming OM model in the encoder_model parameter of the init function of the BaseEncoder class in wenet/transformer/encoder.py.
+Note for the steps above: in wenet/bin/process_encoder_data_flash.py, --bin_path, --model_path, and --json_path hold, respectively, the bin files produced by the encoder, the model path, and the shape information of the encoder bin files.
+
+
 To obtain the decoder results in the streaming scenario, cd to the wenet root directory:
 
 ```
 git checkout .
-patch –p1 < getwer.diff
+patch -p1 < getwer.diff
 cd examples/aishell/s0/
 bash run_attention_rescoring.sh
 ```
@@ -102,7 +112,7 @@ python3 adaptnoflashencoder.py generates no_flash_encoder_revise.onnx
 
 | Model | Official pth accuracy | 710/310 offline inference accuracy | GPU perf | 710 perf | 310 perf |
 | :---: | :----------------------------: | :-------------------------: | :-----: | :-----: | ------- |
-| wenet | GPU streaming: 5.94%, non-streaming: 4.64% | streaming: 5.66%, non-streaming: 5.66% | 66fps | 7.69 | 11.6fps |
+| wenet | GPU streaming: 5.94%, non-streaming: 4.64% | streaming: 5.66%, non-streaming: 5.66% |  | 7.69 | 11.6fps |
 
 The generated t1.json and t2.json files hold the encoder and decoder latencies, respectively; sum the two by running python3.7.5 infer_perf.py.
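Before the code diffs that follow, it helps to see the calling convention they all share: the patched scripts drive the OM models through the Net wrapper in the provided acl_net.py. The sketch below is illustrative only and not part of the patch; the keyword arguments, the (outputs, execution time) return pair, and the del-based teardown are taken from the diffs in this series, while the model path and input shapes are made-up placeholders.

```python
# Sketch: run the non-streaming encoder OM model through the acl_net.Net
# wrapper, mirroring the init/teardown sequence used by the patched scripts.
import acl
import numpy as np
from wenet.transformer.acl_net import Net

ret = acl.init()                                  # initialize the ACL runtime
device_id = 0
ret = acl.rt.set_device(device_id)
context, ret = acl.rt.create_context(device_id)

encoder_model = Net(model_path="no_flash_encoder_revise.om",  # placeholder path
                    output_data_shape=4233000,    # buffer size from process_encoder_data_flash.py;
                                                  # may differ per model
                    device_id=device_id)

speech = np.random.randn(1, 131, 80).astype(np.float32)  # dummy (B, T, feat) batch
speech_lengths = np.array([131], dtype="int32")

# The wrapper takes a list of input arrays and returns (outputs, exec time).
y, exe_time = encoder_model([speech, speech_lengths])
encoder_out, encoder_mask = y[0], y[1]

del encoder_model                                 # release device buffers, as the scripts do
```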
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/get_no_flash_encoder_out.diff b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/get_no_flash_encoder_out.diff
index b209edb704..66a596cf41 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/get_no_flash_encoder_out.diff
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/get_no_flash_encoder_out.diff
@@ -1,16 +1,31 @@
 diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py
-index 73990fa..e2f3555 100644
+index 73990fa..bb3e10f 100644
 --- a/wenet/transformer/asr_model.py
 +++ b/wenet/transformer/asr_model.py
-@@ -175,6 +175,33 @@ class ASRModel(torch.nn.Module):
-             num_decoding_left_chunks=num_decoding_left_chunks
+@@ -158,6 +158,7 @@ class ASRModel(torch.nn.Module):
+         decoding_chunk_size: int = -1,
+         num_decoding_left_chunks: int = -1,
+         simulate_streaming: bool = False,
++        encoder_model=None
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         # Let's assume B = batch_size
+         # 1. Encoder
+@@ -165,7 +166,8 @@
+             encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk(
+                 speech,
+                 decoding_chunk_size=decoding_chunk_size,
+-                num_decoding_left_chunks=num_decoding_left_chunks
++                num_decoding_left_chunks=num_decoding_left_chunks,
++                encoder_model=encoder_model
             )  # (B, maxlen, encoder_dim)
-         return encoder_out, encoder_mask
-+
-+    def get_no_flash_encoder_out(
+         else:
+             encoder_out, encoder_mask = self.encoder(
+@@ -443,6 +445,57 @@ class ASRModel(torch.nn.Module):
+                                             simulate_streaming)
+         return hyps[0][0]
+ 
++    def get_encoder_flash_data(
 +        self,
-+        encoder_model_noflash,
-+        batch_idx: int,
 +        speech: torch.Tensor,
 +        speech_lengths: torch.Tensor,
 +        beam_size: int,
@@ -19,7 +34,29 @@ index 73990fa..e2f3555 100644
 +        ctc_weight: float = 0.0,
 +        simulate_streaming: bool = False,
 +        reverse_weight: float = 0.0,
++        encoder_model=None
 +    ) -> List[int]:
++        """ Apply attention rescoring decoding: CTC prefix beam search
++        is applied first to get the nbest, then we rescore the nbest on
++        the attention decoder with the corresponding encoder out
++
++        Args:
++            speech (torch.Tensor): (batch, max_len, feat_dim)
++            speech_length (torch.Tensor): (batch, )
++            beam_size (int): beam size for beam search
++            decoding_chunk_size (int): decoding chunk for dynamic chunk
++                trained model.
++                <0: for decoding, use full chunk.
++                >0: for decoding, use fixed chunk size as set.
++                0: used for training, it's prohibited here
++            simulate_streaming (bool): whether to do the encoder forward in a
++                streaming fashion
++            reverse_weight (float): right-to-left decoder weight
++            ctc_weight (float): ctc score weight
++
++        Returns:
++            List[int]: Attention rescoring result
++        """
 +        assert speech.shape[0] == speech_lengths.shape[0]
 +        assert decoding_chunk_size != 0
 +        if reverse_weight > 0.0:
@@ -29,10 +66,68 @@ index 73990fa..e2f3555 100644
 +        batch_size = speech.shape[0]
 +        # For attention rescoring we only support batch_size=1
 +        assert batch_size == 1
-+        y, exe_time = encoder_model_noflash(
-+            [speech.numpy(), speech_lengths.numpy().astype("int32")])  # (beam_size, max_hyps_len, vocab_size)
-+        encoder_out, encoder_mask = torch.from_numpy(y[0]), torch.from_numpy(y[1])
-+        return encoder_out, encoder_mask, exe_time
- 
-     def recognize(
++        # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size
++
++        encoder_out, encoder_mask = self._forward_encoder(
++            speech, speech_lengths, decoding_chunk_size,
++            num_decoding_left_chunks, simulate_streaming,
++            encoder_model=encoder_model)  # (B, maxlen, encoder_dim)
++        return encoder_out, encoder_mask
++
++
+     def attention_rescoring(
          self,
+         speech: torch.Tensor,
+diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py
+index e342ed4..21b536a 100644
+--- a/wenet/transformer/encoder.py
++++ b/wenet/transformer/encoder.py
+@@ -26,7 +26,9 @@ from wenet.utils.common import get_activation
+ from wenet.utils.mask import make_pad_mask
+ from wenet.utils.mask import add_optional_chunk_mask
+ 
+-
++import acl
++from wenet.transformer.acl_net import Net
++import numpy as np
+ class BaseEncoder(torch.nn.Module):
+     def __init__(
+         self,
+@@ -254,6 +256,7 @@ class BaseEncoder(torch.nn.Module):
+         xs: torch.Tensor,
+         decoding_chunk_size: int,
+         num_decoding_left_chunks: int = -1,
++        encoder_model=None
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         """ Forward input chunk by chunk with chunk_size like a streaming
+             fashion
+@@ -295,17 +298,22 @@ class BaseEncoder(torch.nn.Module):
+         outputs = []
+         offset = 0
+         required_cache_size = decoding_chunk_size * num_decoding_left_chunks
+-
++        subsampling_cache_om = torch.zeros(1, 1, 256, requires_grad=False)
++        elayers_output_cache_om = torch.zeros(12, 1, 1, 256, requires_grad=False)
++        conformer_cnn_cache_om = torch.zeros(12, 1, 256, 7, requires_grad=False)
+         # Feed forward overlap input step by step
+         for cur in range(0, num_frames - context + 1, stride):
+             end = min(cur + decoding_window, num_frames)
+             chunk_xs = xs[:, cur:end, :]
+-            (y, subsampling_cache, elayers_output_cache,
+-             conformer_cnn_cache) = self.forward_chunk(chunk_xs, offset,
+-                                                       required_cache_size,
+-                                                       subsampling_cache,
+-                                                       elayers_output_cache,
+-                                                       conformer_cnn_cache)
++            if offset > 0:
++                offset = offset - 1
++            offset = offset + 1
++            encoder_output, exe_time = encoder_model(
++                [chunk_xs.cpu().numpy(), np.array(offset), subsampling_cache_om.cpu().numpy(), \
++                 elayers_output_cache_om.cpu().numpy(), conformer_cnn_cache_om.cpu().numpy()])
++            y, subsampling_cache_om, elayers_output_cache_om, conformer_cnn_cache_om = \
++                torch.from_numpy(encoder_output[0][:, 1:, :]), torch.from_numpy(encoder_output[1]), \
++                torch.from_numpy(encoder_output[2]), torch.from_numpy(encoder_output[3])
+             outputs.append(y)
+             offset += y.size(1)
+         ys = torch.cat(outputs, 1)
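The encoder.py hunk above swaps forward_chunk for a call into the streaming OM model, re-feeding three zero-initialized caches on every chunk. The following standalone sketch restates that bookkeeping outside the diff; encoder_model (a Net instance, see the earlier sketch) and chunks are stand-ins, the cache shapes are copied from the hunk, and the offset adjustment plus the [:, 1:, :] slice follow our reading of it (the OM output appears to carry one extra leading frame).

```python
import numpy as np
import torch

# Dummy decoding windows; real ones come from slicing the padded feature
# tensor, as in the hunk above. encoder_model wraps the streaming encoder OM.
chunks = [torch.randn(1, 67, 80) for _ in range(3)]

# Zero-initialized caches, shapes from the hunk: 12 conformer layers,
# 256-dim output, CNN cache width 7.
subsampling_cache = torch.zeros(1, 1, 256)
elayers_output_cache = torch.zeros(12, 1, 1, 256)
conformer_cnn_cache = torch.zeros(12, 1, 256, 7)

outputs, offset = [], 0
for chunk_xs in chunks:
    if offset == 0:          # same net effect as the hunk's decrement/increment pair
        offset = 1
    encoder_output, exe_time = encoder_model(
        [chunk_xs.numpy(), np.array(offset), subsampling_cache.numpy(),
         elayers_output_cache.numpy(), conformer_cnn_cache.numpy()])
    y = torch.from_numpy(encoder_output[0][:, 1:, :])   # drop the leading frame
    subsampling_cache = torch.from_numpy(encoder_output[1])
    elayers_output_cache = torch.from_numpy(encoder_output[2])
    conformer_cnn_cache = torch.from_numpy(encoder_output[3])
    outputs.append(y)
    offset += y.size(1)      # advance by the frames actually emitted
ys = torch.cat(outputs, 1)   # (1, total_frames, 256)
```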
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
index 1626fca9db..a23ad1e129 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
@@ -48,6 +48,7 @@ from wenet.transformer.asr_model import init_asr_model
 from wenet.utils.checkpoint import load_checkpoint
 #from wenet.transformer.acl_init import decoder_model, device_id
 import acl
+from wenet.transformer.acl_net import Net
 import json
 import os
 
@@ -97,6 +98,7 @@ if __name__ == '__main__':
                         action='store_true',
                         help='simulate streaming inference')
     parser.add_argument('--bin_path', type=str, default="./encoder_data", help='encoder bin images dir')
+    parser.add_argument('--model_path', type=str, default="./encoder_revise.om", help='encoder om model path')
     parser.add_argument('--json_path', type=str, default="encoder.json", help='encoder bin images dir')
     parser.add_argument('--reverse_weight',
                         type=float,
@@ -114,6 +116,8 @@ if __name__ == '__main__':
     device_id = 0
     ret = acl.rt.set_device(device_id)
     context, ret = acl.rt.create_context(device_id)
+    output_shape = 4233000
+    encoder_model = Net(model_path=args.model_path, output_data_shape=output_shape, device_id=device_id)
     with open(args.config, 'r') as fin:
         configs = yaml.load(fin, Loader=yaml.FullLoader)
     raw_wav = configs['raw_wav']
@@ -177,12 +181,14 @@ if __name__ == '__main__':
             num_decoding_left_chunks=args.num_decoding_left_chunks,
             ctc_weight=args.ctc_weight,
             simulate_streaming=args.simulate_streaming,
-            reverse_weight=args.reverse_weight)
+            reverse_weight=args.reverse_weight,
+            encoder_model=encoder_model)
         encoder_dic["encoder_out_" + str(batch_idx)] = [encoder_out.shape[0], encoder_out.shape[1], encoder_out.shape[2]]
         encoder_dic["encoder_mask_" + str(batch_idx)] = [encoder_mask.shape[0], encoder_mask.shape[1], encoder_mask.shape[2]]
         encoder_out.numpy().tofile(os.path.join(args.bin_path, "encoder_out_{}.bin".format(batch_idx)))
         encoder_mask.numpy().tofile(os.path.join(args.bin_path, "encoder_mask_{}.bin".format(batch_idx)))
     dic2json(encoder_dic, args.json_path)
+    del encoder_model
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
index 709d6f199d..12a46b08f0 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
@@ -206,4 +206,5 @@ if __name__ == '__main__':
     dic_perf["t1"] = ave_t
     dic2json(dic_perf, "t1.json")
     dic2json(encoder_dic, args.json_path)
+    del encoder_model_noflash
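Both scripts above write the encoder outputs as raw .bin dumps and record their shapes through dic2json, since a flat binary file is not self-describing. A minimal sketch of reading one record back, assuming float32 dumps (torch's default dtype) and the encoder_out_<i>/encoder_mask_<i> key layout the scripts write; the paths match the script defaults:

```python
import json
import os

import numpy as np

bin_path = "./encoder_data"        # matches the --bin_path default
batch_idx = 0

with open("encoder.json") as f:    # matches the --json_path default
    shapes = json.load(f)

# Recover the tensor: the shape triple stored in the JSON is the only thing
# that makes the flat binary dump reloadable downstream.
encoder_out = np.fromfile(
    os.path.join(bin_path, "encoder_out_{}.bin".format(batch_idx)),
    dtype=np.float32).reshape(shapes["encoder_out_{}".format(batch_idx)])
```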
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
index bdc02ad3a4..1c8850d6e7 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
@@ -203,3 +203,4 @@ if __name__ == '__main__':
     dic_perf["t2"] = ave_t
     if "no" in args.bin_path:
         dic2json(dic_perf, "t2.json")
+    del decoder_model
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/requirements.txt b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/requirements.txt
index baf9a5ad64..aff4bc95fc 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/requirements.txt
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/requirements.txt
@@ -4,4 +4,10 @@ onnxruntime==1.8.1
 torchaudio==0.9.0
 numpy==1.18.5
 Pillow==7.2.0
-
+flake8==3.8.2
+Pillow
+pyyaml>=5.1
+sentencepiece
+typeguard
+textgrid
+pytest
-- 
Gitee
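With patch 1 applied, process_encoder_data_noflash.py writes the average encoder time to t1.json and recognize_attenstion_rescoring.py writes the average decoder time to t2.json; the README says to sum the two via infer_perf.py. A sketch of that summation, assuming each file holds the single-entry layout that dic_perf in the diffs suggests:

```python
import json

with open("t1.json") as f:
    t1 = json.load(f)["t1"]   # average encoder time
with open("t2.json") as f:
    t2 = json.load(f)["t2"]   # average decoder time

# End-to-end latency is simply the sum of the two stages.
print("encoder: {:.4f}, decoder: {:.4f}, total: {:.4f}".format(t1, t2, t1 + t2))
```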
From 3c1dbe3bfccd80beebd40663bfd496140e613c79 Mon Sep 17 00:00:00 2001
From: pengaoao
Date: Wed, 23 Mar 2022 19:44:44 +0800
Subject: [PATCH 2/3] fix

---
 .../audio/Wenet_for_Pytorch/process_encoder_data_flash.py   | 2 ++
 .../audio/Wenet_for_Pytorch/process_encoder_data_noflash.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
index a23ad1e129..6d98eb6a3d 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
@@ -111,6 +111,8 @@ if __name__ == '__main__':
                         format='%(asctime)s %(levelname)s %(message)s')
     os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
 
+    if not os.path.exists(args.bin_path):
+        os.mkdir(args.bin_path)
     #init acl
     ret = acl.init()
     device_id = 0
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
index 12a46b08f0..cf295c927b 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
@@ -110,6 +110,8 @@ if __name__ == '__main__':
                         format='%(asctime)s %(levelname)s %(message)s')
     os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
 
+    if not os.path.exists(args.bin_path):
+        os.mkdir(args.bin_path)
     #init acl
     ret = acl.init()
     device_id = 0
-- 
Gitee

From 82bca5995a805c672c1f29cb6ff64ed305f524d2 Mon Sep 17 00:00:00 2001
From: pengaoao
Date: Wed, 23 Mar 2022 19:48:16 +0800
Subject: [PATCH 3/3] fix

---
 ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
index d316562557..80803daa07 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
@@ -8,7 +8,9 @@
 pip3.7 install -r requirements.txt
 ```
 
-2. Obtain, modify, and install the open-source model code
+Install any other required packages as needed.
+
+2. Obtain, modify, and install the open-source model code
 
 ```
 git clone https://github.com/wenet-e2e/wenet.git
-- 
Gitee
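A closing note on the directory guard added in patch 2 above: os.path.exists followed by os.mkdir works for the default single-level ./encoder_data, but it races if two runs start together and fails for nested paths. An equivalent, race-free idiom from the standard library, offered here only as a design alternative:

```python
import os

bin_path = "./encoder_data"            # --bin_path default from the scripts
os.makedirs(bin_path, exist_ok=True)   # no exists()/mkdir() race, handles nesting
```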