diff --git a/ACL_PyTorch/contrib/audio/WeNet/.keep b/ACL_PyTorch/contrib/audio/WeNet/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/ACL_PyTorch/contrib/audio/WeNet/README.md b/ACL_PyTorch/contrib/audio/WeNet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..96f0f4b5584ada66f418fcf35471f6c9f4be75e7
--- /dev/null
+++ b/ACL_PyTorch/contrib/audio/WeNet/README.md
@@ -0,0 +1,103 @@
+# WeNet PyTorch Offline Inference Guide
+
+## 1 Environment Preparation
+
+1. Install the required dependencies. The test environment may already have different versions of some of these libraries installed, so this command is not recommended when testing manually:
+
+```
+pip3 install -r requirements.txt
+```
+
+2. Obtain, modify, and install the open-source model code:
+
+```
+git clone https://github.com/wenet-e2e/wenet.git
+cd wenet
+git reset 9c4e305bcc24a06932f6a65c8147429d8406cc63 --hard
+```
+
+3. Download the network weights and export ONNX
+
+Download the archive from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/aishell/20210601_u2pp_conformer_exp.tar.gz, extract it, and place the extracted files under wenet/examples/aishell/s0/exp/conformer_u2; create this folder if it does not exist.
+
+Place all the provided diff files in the wenet root directory.
+Run patch -p1 < export_onnx.diff to adapt the ONNX export code.
+Place the provided export_onnx.py, process_encoder_data_noflash.py, recognize_attenstion_rescoring.py and static.py in wenet/wenet/bin/.
+Place the provided slice_helper.py and acl_net.py in wenet/wenet/transformer, and the provided sh scripts in wenet/examples/aishell/s0/.
+Run bash export_onnx.sh exp/conformer_u2/train.yaml exp/conformer_u2/final.pt; the ONNX files are exported to the onnx folder in the current directory.
+
+4. Run the scripts to convert the ONNX models to OM models
+
+First modify the graphs with the graph-editing tool om_gener (https://gitee.com/liurf_hw/om_gener). After installing it, run the following commands:
+
+python3 adaptdecoder.py generates decoder_final.onnx
+
+python3 adaptnoflashencoder.py generates no_flash_encoder_revise.onnx
+
+Configure the environment variables and convert the models to OM files with the atc tool; the commands are given in the provided decoder.sh and no_flash_encoder.sh scripts, which generate the corresponding OM files when run.
+
+5. Dataset download:
+
+   In wenet/examples/aishell/s0/, run bash run.sh --stage -1 --stop_stage -1 to download the dataset.
+
+   Run bash run.sh --stage 0 --stop_stage 0 to process the dataset.
+
+   Run bash run.sh --stage 1 --stop_stage 1 to process the dataset.
+
+   Run bash run.sh --stage 2 --stop_stage 2 to process the dataset.
+
+   Run bash run.sh --stage 3 --stop_stage 3 to process the dataset.
+
+## 2 Offline Inference
+
+Dynamic shape scenario:
+
+   Generate the encoder output data for the non-streaming scenario (cd to the wenet root directory):
+
+   ```
+   git checkout .
+   patch -p1 < get_no_flash_encoder_out.diff
+   cd examples/aishell/s0/
+   bash run_no_flash_encoder_out.sh
+   ```
+
+   In wenet/bin/process_encoder_data_noflash.py:
+   --bin_path   directory where the bin files produced by the encoder are saved
+   --model_path path of the non-streaming encoder OM model
+   --json_path  json file that stores the shape information of the encoder bin files
+   Obtain the decoder results for the non-streaming scenario (cd to the wenet root directory):
+
+   ```
+   git checkout .
+   patch -p1 < getwer.diff
+   cd examples/aishell/s0/
+   bash run_attention_rescoring.sh
+   ```
+
+   In wenet/bin/recognize_attenstion_rescoring.py:
+   --bin_path   bin files produced by the non-streaming encoder OM model, i.e. the bin file path from the previous step
+   --model_path path of the decoder OM model
+   --json_path  json file with the shape information of the encoder bin files, i.e. the json file from the previous step
+   Check the last few lines of wenet/examples/aishell/s0/exp/conformer/test_attention_rescoring/wer for the overall accuracy.
+   Run infer.py to obtain the fps performance figure.
+
+Static shape scenario (non-streaming only):
+
+Convert ONNX to OM:
+
+```
+bash static_encoder.sh
+bash static_decoder.sh
+```
+
+Accuracy test:
+
+First run export ASCEND_GLOBAL_LOG_LEVEL=3, and point self.encoder_ascend and self.decoder_ascend in acc.diff at the statically converted encoder and decoder OM models.
+```
+git checkout .
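+# Optional sanity check (assumes acc.diff was copied to the wenet root as
+# described in section 1): confirm the patch applies cleanly before applying it.
+git apply --check acc.diff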
+patch -p1 < acc.diff +cd examples/aishell/s0/ +bash static.sh +``` + +性能:在wenet/examples/aishell/s0/exp/conformer/test_attention_rescoring/text文件最后一行有fps性能数据 diff --git a/ACL_PyTorch/contrib/audio/WeNet/acc.diff b/ACL_PyTorch/contrib/audio/WeNet/acc.diff new file mode 100644 index 0000000000000000000000000000000000000000..b4d2d5ea3c3acf37998d2a0c8bd940ac21416ccf --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/acc.diff @@ -0,0 +1,358 @@ +diff --git a/wenet/dataset/dataset.py b/wenet/dataset/dataset.py +index 4f0ff39..4ce97a4 100644 +--- a/wenet/dataset/dataset.py ++++ b/wenet/dataset/dataset.py +@@ -27,7 +27,7 @@ import torchaudio.sox_effects as sox_effects + import yaml + from PIL import Image + from PIL.Image import BICUBIC +-from torch.nn.utils.rnn import pad_sequence ++#from torch.nn.utils.rnn import pad_sequence + from torch.utils.data import Dataset, DataLoader + + import wenet.dataset.kaldi_io as kaldi_io +@@ -36,7 +36,69 @@ from wenet.utils.common import IGNORE_ID + + torchaudio.set_audio_backend("sox_io") + ++def _pad_sequence(sequences, batch_first=False, padding_value=0, mul_shape = None): ++ r"""Pad a list of variable length Tensors with ``padding_value`` ++ ++ ``pad_sequence`` stacks a list of Tensors along a new dimension, ++ and pads them to equal length. For example, if the input is list of ++ sequences with size ``L x *`` and if batch_first is False, and ``T x B x *`` ++ otherwise. ++ ++ `B` is batch size. It is equal to the number of elements in ``sequences``. ++ `T` is length of the longest sequence. ++ `L` is length of the sequence. ++ `*` is any number of trailing dimensions, including none. ++ ++ Example: ++ >>> from torch.nn.utils.rnn import pad_sequence ++ >>> a = torch.ones(25, 300) ++ >>> b = torch.ones(22, 300) ++ >>> c = torch.ones(15, 300) ++ >>> pad_sequence([a, b, c]).size() ++ torch.Size([25, 3, 300]) ++ ++ Note: ++ This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` ++ where `T` is the length of the longest sequence. This function assumes ++ trailing dimensions and type of all the Tensors in sequences are same. ++ ++ Arguments: ++ sequences (list[Tensor]): list of variable length sequences. ++ batch_first (bool, optional): output will be in ``B x T x *`` if True, or in ++ ``T x B x *`` otherwise ++ padding_value (float, optional): value for padded elements. Default: 0. + ++ Returns: ++ Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. ++ Tensor of size ``B x T x *`` otherwise ++ """ ++ ++ # assuming trailing dimensions and type of all the Tensors ++ # in sequences are same and fetching those from sequences[0] ++ ++ max_size = sequences[0].size() ++ trailing_dims = max_size[1:] ++ ++ max_len = max([s.size(0) for s in sequences]) ++ if mul_shape is not None: ++ for in_shape in mul_shape: ++ if max_len < in_shape: ++ max_len = in_shape ++ break ++ if batch_first: ++ out_dims = (len(sequences), max_len) + trailing_dims ++ else: ++ out_dims = (max_len, len(sequences)) + trailing_dims ++ ++ out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) ++ for i, tensor in enumerate(sequences): ++ length = tensor.size(0) ++ # use index notation to prevent duplicate references to the tensor ++ if batch_first: ++ out_tensor[i, :length, ...] = tensor ++ else: ++ out_tensor[:length, i, ...] 
= tensor ++ return out_tensor + def _spec_augmentation(x, + warp_for_time=False, + num_t_mask=2, +@@ -187,6 +249,7 @@ def _extract_feature(batch, speed_perturb, wav_distortion_conf, + Returns: + (keys, feats, labels) + """ ++ + keys = [] + feats = [] + lengths = [] +@@ -331,13 +394,14 @@ class CollateFunc(object): + self.spec_sub = spec_sub + self.spec_sub_conf = spec_sub_conf + ++ ++ + def __call__(self, batch): + assert (len(batch) == 1) + if self.raw_wav: + keys, xs, ys = _extract_feature(batch[0], self.speed_perturb, + self.wav_distortion_conf, + self.feature_extraction_conf) +- + else: + keys, xs, ys = _load_feature(batch[0]) + +@@ -359,27 +423,31 @@ class CollateFunc(object): + if self.spec_aug: + xs = [_spec_augmentation(x, **self.spec_aug_conf) for x in xs] + +- # padding +- xs_lengths = torch.from_numpy( +- np.array([x.shape[0] for x in xs], dtype=np.int32)) ++ + + # pad_sequence will FAIL in case xs is empty ++ mul_shape = [262, 326, 390, 454, 518, 582, 646, 710, 774, 838, 902, 966, 1028, 1284, 1478] + if len(xs) > 0: +- xs_pad = pad_sequence([torch.from_numpy(x).float() for x in xs], +- True, 0) ++ xs_pad = _pad_sequence([torch.from_numpy(x).float() for x in xs], ++ True, 0, mul_shape) + else: + xs_pad = torch.Tensor(xs) ++ # padding ++ xs_lengths = torch.from_numpy( ++ np.array([x.shape[0] for x in xs_pad], dtype=np.int32)) ++ + if train_flag: + ys_lengths = torch.from_numpy( + np.array([y.shape[0] for y in ys], dtype=np.int32)) + if len(ys) > 0: +- ys_pad = pad_sequence([torch.from_numpy(y).int() for y in ys], ++ ys_pad = _pad_sequence([torch.from_numpy(y).int() for y in ys], + True, IGNORE_ID) + else: + ys_pad = torch.Tensor(ys) + else: + ys_pad = None + ys_lengths = None ++ + return keys, xs_pad, ys_pad, xs_lengths, ys_lengths + + +@@ -430,7 +498,6 @@ class AudioDataset(Dataset): + """ + assert batch_type in ['static', 'dynamic'] + data = [] +- + # Open in utf8 mode since meet encoding problem + with codecs.open(data_file, 'r', encoding='utf-8') as f: + for line in f: +diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py +index 73990fa..50358ca 100644 +--- a/wenet/transformer/asr_model.py ++++ b/wenet/transformer/asr_model.py +@@ -32,8 +32,74 @@ from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, + reverse_pad_list) + from wenet.utils.mask import (make_pad_mask, mask_finished_preds, + mask_finished_scores, subsequent_mask) ++from wenet.transformer.acl_net import Net ++import time ++import acl ++ ++def _pad_sequence(sequences, batch_first=False, padding_value=0, mul_shape = None): ++ r"""Pad a list of variable length Tensors with ``padding_value`` ++ ++ ``pad_sequence`` stacks a list of Tensors along a new dimension, ++ and pads them to equal length. For example, if the input is list of ++ sequences with size ``L x *`` and if batch_first is False, and ``T x B x *`` ++ otherwise. ++ ++ `B` is batch size. It is equal to the number of elements in ``sequences``. ++ `T` is length of the longest sequence. ++ `L` is length of the sequence. ++ `*` is any number of trailing dimensions, including none. ++ ++ Example: ++ >>> from torch.nn.utils.rnn import pad_sequence ++ >>> a = torch.ones(25, 300) ++ >>> b = torch.ones(22, 300) ++ >>> c = torch.ones(15, 300) ++ >>> pad_sequence([a, b, c]).size() ++ torch.Size([25, 3, 300]) ++ ++ Note: ++ This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` ++ where `T` is the length of the longest sequence. 
This function assumes ++ trailing dimensions and type of all the Tensors in sequences are same. ++ ++ Arguments: ++ sequences (list[Tensor]): list of variable length sequences. ++ batch_first (bool, optional): output will be in ``B x T x *`` if True, or in ++ ``T x B x *`` otherwise ++ padding_value (float, optional): value for padded elements. Default: 0. ++ ++ Returns: ++ Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``. ++ Tensor of size ``B x T x *`` otherwise ++ """ ++ ++ # assuming trailing dimensions and type of all the Tensors ++ # in sequences are same and fetching those from sequences[0] ++ ++ max_size = sequences[0].size() ++ trailing_dims = max_size[1:] ++ ++ max_len = max([s.size(0) for s in sequences]) ++ if mul_shape is not None: ++ for in_shape in mul_shape: ++ if max_len < in_shape: ++ max_len = in_shape ++ break + +- ++ if batch_first: ++ out_dims = (len(sequences), max_len) + trailing_dims ++ else: ++ out_dims = (max_len, len(sequences)) + trailing_dims ++ ++ out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value) ++ for i, tensor in enumerate(sequences): ++ length = tensor.size(0) ++ # use index notation to prevent duplicate references to the tensor ++ if batch_first: ++ out_tensor[i, :length, ...] = tensor ++ else: ++ out_tensor[:length, i, ...] = tensor ++ return out_tensor + class ASRModel(torch.nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + def __init__( +@@ -60,6 +126,13 @@ class ASRModel(torch.nn.Module): + self.reverse_weight = reverse_weight + + self.encoder = encoder ++ self.device_id = 0 ++ ret = acl.init() ++ ret = acl.rt.set_device(self.device_id) ++ context, ret = acl.rt.create_context(self.device_id) ++ self.encoder_ascend = Net(model_path="/home/zry2/wenet/examples/aishell/s0/onnx/encoder_fendang_262_1478_static.om", device_id=self.device_id) ++ self.decoder_ascend = Net(model_path="/home/zry2/wenet/examples/aishell/s0/onnx/decoder_fendang.om", device_id=self.device_id) ++ self.encoder_out_shape = [] + self.decoder = decoder + self.ctc = ctc + self.criterion_att = LabelSmoothingLoss( +@@ -168,13 +241,21 @@ class ASRModel(torch.nn.Module): + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + else: +- encoder_out, encoder_mask = self.encoder( +- speech, +- speech_lengths, +- decoding_chunk_size=decoding_chunk_size, +- num_decoding_left_chunks=num_decoding_left_chunks +- ) # (B, maxlen, encoder_dim) +- return encoder_out, encoder_mask ++ st = time.time() ++ ++ # encoder_out, encoder_mask = self.encoder( ++ # speech, ++ # speech_lengths, ++ # decoding_chunk_size=decoding_chunk_size, ++ # num_decoding_left_chunks=num_decoding_left_chunks ++ # ) # (B, maxlen, encoder_dim) ++ speech = speech.numpy() ++ speech_lengths = speech_lengths.numpy().astype("int32") ++ dims1 = {'dimCount': 4, 'name': '', 'dims': [1, speech.shape[1], 80, 1]} ++ y, exe_time = self.encoder_ascend([speech, speech_lengths], dims = dims1) ++ encoder_out = torch.from_numpy(y[0]) ++ encoder_mask = torch.from_numpy(y[1]) ++ return encoder_out, encoder_mask, exe_time + + def recognize( + self, +@@ -361,13 +442,17 @@ class ASRModel(torch.nn.Module): + assert batch_size == 1 + # Let's assume B = batch_size and N = beam_size + # 1. 
Encoder forward and get CTC score +- encoder_out, encoder_mask = self._forward_encoder( ++ encoder_out, encoder_mask, encoder_t = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) ++ mul_shape = [96, 144, 384] ++ ++ encoder_out = _pad_sequence(encoder_out, True, 0, mul_shape) + ctc_probs = self.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) ++ + ctc_probs = ctc_probs.squeeze(0) + # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) + cur_hyps = [(tuple(), (0.0, -float('inf')))] +@@ -409,7 +494,7 @@ class ASRModel(torch.nn.Module): + reverse=True) + cur_hyps = next_hyps[:beam_size] + hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] +- return hyps, encoder_out ++ return hyps, encoder_out, encoder_t + + def ctc_prefix_beam_search( + self, +@@ -485,7 +570,7 @@ class ASRModel(torch.nn.Module): + # For attention rescoring we only support batch_size=1 + assert batch_size == 1 + # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size +- hyps, encoder_out = self._ctc_prefix_beam_search( ++ hyps, encoder_out, encoder_t = self._ctc_prefix_beam_search( + speech, speech_lengths, beam_size, decoding_chunk_size, + num_decoding_left_chunks, simulate_streaming) + +@@ -510,9 +595,19 @@ class ASRModel(torch.nn.Module): + r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) + r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, + self.ignore_id) +- decoder_out, r_decoder_out, _ = self.decoder( +- encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, +- reverse_weight) # (beam_size, max_hyps_len, vocab_size) ++ ++ encoder_out = encoder_out.numpy() ++ encoder_mask = encoder_mask.numpy() ++ hyps_pad = hyps_pad.numpy() ++ hyps_lens = hyps_lens.numpy().astype("int32") ++ r_hyps_pad = r_hyps_pad.numpy() ++ dims2 = {'dimCount': 11, 'name': '', 'dims': [10, encoder_out.shape[1], 256, 10, 1, encoder_out.shape[1], 10, r_hyps_pad.shape[1], 10, 10, r_hyps_pad.shape[1]]} ++ ++ y, exe_time = self.decoder_ascend([encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad], dims=dims2) ++ batch_t = encoder_t + exe_time ++ decoder_out = torch.from_numpy(y[0]) ++ r_decoder_out = torch.from_numpy(y[1]) ++ + decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) + decoder_out = decoder_out.cpu().numpy() + # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a +@@ -539,7 +634,7 @@ class ASRModel(torch.nn.Module): + if score > best_score: + best_score = score + best_index = i +- return hyps[best_index][0] ++ return hyps[best_index][0], batch_t + + @torch.jit.export + def subsampling_rate(self) -> int: +diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py +index e342ed4..c8e18d5 100644 +--- a/wenet/transformer/encoder.py ++++ b/wenet/transformer/encoder.py +@@ -157,6 +157,8 @@ class BaseEncoder(torch.nn.Module): + decoding_chunk_size, + self.static_chunk_size, + num_decoding_left_chunks) ++ ++ + for layer in self.encoders: + xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + if self.normalize_before: diff --git a/ACL_PyTorch/contrib/audio/WeNet/acl_net.py b/ACL_PyTorch/contrib/audio/WeNet/acl_net.py new file mode 100644 index 0000000000000000000000000000000000000000..db5cc314aadb64531bef5b8a8d8fcc268b3ed84d --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/acl_net.py @@ -0,0 +1,391 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. 
+# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ + + +import numpy as np +import acl +import functools +import time + +# error code +ACL_ERROR_NONE = 0 + +# memory malloc code +ACL_MEM_MALLOC_HUGE_FIRST = 0 +ACL_MEM_MALLOC_HUGE_ONLY = 1 +ACL_MEM_MALLOC_NORMAL_ONLY = 2 + +# memory copy code +ACL_MEMCPY_HOST_TO_HOST = 0 +ACL_MEMCPY_HOST_TO_DEVICE = 1 +ACL_MEMCPY_DEVICE_TO_HOST = 2 +ACL_MEMCPY_DEVICE_TO_DEVICE = 3 + +# format +ACL_FORMAT_NCHW = 0 +ACL_DTYPE = { + 0: 'float32', + 1: 'float16', + 2: 'int8', + 3: 'int32', + 4: 'uint8', + 6: 'int16', + 7: 'uint16', + 8: 'uint32', + 9: 'int64', + 10: 'uint64', + 11: 'float64', + 12: 'bool', +} + +ACL_DTYPE_INDEX = { + 'float32': 0, + 'float16': 1, + 'int8': 2, + 'int32': 3, + 'uint8': 4, + 'int16': 6, + 'uint16': 7, + 'uint32': 8, + 'int64': 9, + 'uint64': 10, + 'float64': 11, + 'bool': 12, +} + + +def check_ret(message, ret): + if ret != ACL_ERROR_NONE: + raise Exception("{} failed ret = {}".format(message, ret)) + + +def check_input_type(input_type, model_input_type): + for i in range(len(input_type)): + if ACL_DTYPE_INDEX.get(input_type[i]) != model_input_type[i]: + raise Exception("real input {} input_type:{} model_input_type:{} not same".format(i, input_type[i], \ + ACL_DTYPE.get(model_input_type[i]))) + + +class Net(object): + def __init__(self, model_path, device_id, check_input=False, output_data_shape=None): + self.check_input = check_input + self.dynamic = False + self.device_id = device_id + self.model_path = model_path + self.model_id = None + # if self.ascend_mbatch_shape_data = True, the model is static with multi input shape + self.ascend_mbatch_shape_data = False + self.input_data_type = [] + self.model_input_data_type = [] + self.model_input_data_format = [] + self.model_output_data_type = [] + self.output_data_shape = output_data_shape + self.output_shape = [] + self.buffer_method = { + "in": acl.mdl.get_input_size_by_index, + "out": acl.mdl.get_output_size_by_index, + "outhost": 
acl.mdl.get_output_size_by_index + } + + self.input_data = [] + self.output_data = [] + self.output_data_host = [] + self.model_desc = None + self.load_input_dataset = None + self.load_output_dataset = None + self.input_size = None + self.output_size = None + self.exe_t = 0 + self._init_resource() + + def __call__(self, ori_data, dims=None): + return self.forward(ori_data, dims) + + def __del__(self): + ret = acl.mdl.unload(self.model_id) + check_ret("acl.mdl.unload", ret) + if self.model_desc: + acl.mdl.destroy_desc(self.model_desc) + self.model_desc = None + if not self.dynamic: + self._release_data_buffer() + + def _release_data_buffer(self): + while self.input_data: + item = self.input_data.pop() + ret = acl.rt.free(item["buffer"]) + check_ret("acl.rt.free", ret) + + while self.output_data: + item = self.output_data.pop() + ret = acl.rt.free(item["buffer"]) + check_ret("acl.rt.free", ret) + + while self.output_data_host: + item = self.output_data_host.pop() + ret = acl.rt.free_host(item["buffer"]) + check_ret("acl.rt.free_host", ret) + + def _init_resource(self): + # load_model + self.model_id, ret = acl.mdl.load_from_file(self.model_path) + check_ret("acl.mdl.load_from_file", ret) + + self.model_desc = acl.mdl.create_desc() + self._get_model_info() + + def _get_model_info(self): + ret = acl.mdl.get_desc(self.model_desc, self.model_id) + check_ret("acl.mdl.get_desc", ret) + self.input_size = acl.mdl.get_num_inputs(self.model_desc) + # get the input format, data_type and get the model static or not + for i in range(self.input_size): + data_type = acl.mdl.get_input_data_type(self.model_desc, i) + self.model_input_data_type.append(data_type) + data_format = acl.mdl.get_input_format(self.model_desc, i) + self.model_input_data_format.append(data_format) + dims_input, ret = acl.mdl.get_input_dims(self.model_desc, i) + # check if the model has ascend_mbatch_shape_data + if i == self.input_size - 1 and dims_input["name"] == "ascend_mbatch_shape_data": + self.dynamic = False + self.ascend_mbatch_shape_data = True + elif -1 in dims_input["dims"]: + self.dynamic = True + self.output_size = acl.mdl.get_num_outputs(self.model_desc) + for j in range(self.output_size): + data_type = acl.mdl.get_output_data_type(self.model_desc, j) + self.model_output_data_type.append(data_type) + dims_output, ret = acl.mdl.get_output_dims(self.model_desc, j) + if -1 in dims_output["dims"]: + self.dynamic = True + if self.output_data_shape is None and self.dynamic: + self.output_data_shape = 500000000 + if not self.dynamic: + self._prepare_data_buffer_in() + self._prepare_data_buffer_out() + self._prepare_data_buffer_host() + + def _gen_data_buffer(self, size, des, data=None): + func = self.buffer_method[des] + for i in range(size): + if not self.dynamic: + temp_buffer_size = func(self.model_desc, i) + else: + if des == "in": + input_size = np.prod(np.array(data[i]).shape) + temp_buffer_size = Net.gen_data_size(input_size, dtype=ACL_DTYPE.get(self.model_input_data_type[i])) + elif des == "out": + temp_buffer_size = Net.gen_data_size(data, dtype=ACL_DTYPE.get(self.model_output_data_type[i])) + + temp_buffer, ret = acl.rt.malloc(temp_buffer_size, ACL_MEM_MALLOC_HUGE_FIRST) + check_ret("acl.rt.malloc", ret) + acl.rt.memset(temp_buffer, temp_buffer_size, 0, temp_buffer_size) + if des == "in": + self.input_data.append({"buffer": temp_buffer, + "size": temp_buffer_size}) + elif des == "out": + self.output_data.append({"buffer": temp_buffer, + "size": temp_buffer_size}) + + def _gen_dataset_output_host(self, size, des, 
data=None): + func = self.buffer_method[des] + for i in range(size): + if not self.dynamic: + temp_buffer_size = func(self.model_desc, i) + else: + temp_buffer_size = Net.gen_data_size(data, ACL_DTYPE.get(self.model_output_data_type[i])) + temp_buffer, ret = acl.rt.malloc_host(temp_buffer_size) + check_ret("acl.rt.malloc_host", ret) + + self.output_data_host.append({"buffer": temp_buffer, + "size": temp_buffer_size}) + + def _data_interaction(self, dataset, policy=ACL_MEMCPY_HOST_TO_DEVICE): + + temp_data_buffer = self.input_data \ + if policy == ACL_MEMCPY_HOST_TO_DEVICE \ + else self.output_data + if len(dataset) == 0 and policy == ACL_MEMCPY_DEVICE_TO_HOST: + dataset = self.output_data_host + for i in range(len(dataset)): + if policy == ACL_MEMCPY_HOST_TO_DEVICE: + ptr = acl.util.numpy_to_ptr(dataset[i]) + if self.ascend_mbatch_shape_data: + malloc_size = dataset[i].size * dataset[i].itemsize + else: + malloc_size = temp_data_buffer[i]["size"] + ret = acl.rt.memcpy(temp_data_buffer[i]["buffer"], malloc_size, ptr, malloc_size, policy) + check_ret("acl.rt.memcpy", ret) + + else: + ptr = dataset[i]["buffer"] + ret = acl.rt.memcpy(ptr, temp_data_buffer[i]["size"], temp_data_buffer[i]["buffer"], + temp_data_buffer[i]["size"], policy) + check_ret("acl.rt.memcpy", ret) + + def _gen_dataset(self, type_str="input", input_shapes=None): + dataset = acl.mdl.create_dataset() + temp_dataset = None + if type_str == "in": + self.load_input_dataset = dataset + temp_dataset = self.input_data + + else: + self.load_output_dataset = dataset + temp_dataset = self.output_data + + for i, item in enumerate(temp_dataset): + data = acl.create_data_buffer(item["buffer"], item["size"]) + if data is None: + ret = acl.destroy_data_buffer(dataset) + check_ret("acl.destroy_data_buffer", ret) + + _, ret = acl.mdl.add_dataset_buffer(dataset, data) + if ret != ACL_ERROR_NONE: + ret = acl.destroy_data_buffer(dataset) + check_ret("acl.destroy_data_buffer", ret) + + if type_str == "in" and not self.ascend_mbatch_shape_data: + # set dynamic dataset tensor desc + input_shape = input_shapes[i] + input_desc = acl.create_tensor_desc(self.model_input_data_type[i], input_shape, + self.model_input_data_format[i]) + dataset, ret = acl.mdl.set_dataset_tensor_desc(dataset, input_desc, i) + if ret != ACL_ERROR_NONE: + ret = acl.destroy_data_buffer(dataset) + check_ret("acl.destroy_data_buffer", ret) + + def _data_from_host_to_device(self, images): + self._data_interaction(images, ACL_MEMCPY_HOST_TO_DEVICE) + input_shapes = [list(data.shape) for data in images] + self._gen_dataset("in", input_shapes) + self._gen_dataset("out") + + def _data_from_device_to_host(self, input_data, output_shape): + res = [] + self._data_interaction(res, ACL_MEMCPY_DEVICE_TO_HOST) + output = self.get_result(self.output_data_host, input_data, output_shape) + return output + + def _get_output_shape(self): + output_shape = [] + num = acl.mdl.get_dataset_num_buffers(self.load_output_dataset) + for output_index in range(num): + if self.dynamic: + outpu_desc = acl.mdl.get_dataset_tensor_desc(self.load_output_dataset, output_index) + temp_output_shape = [] + dim_nums = acl.get_tensor_desc_num_dims(outpu_desc) + for i in range(dim_nums): + dim, ret = acl.get_tensor_desc_dim_v2(outpu_desc, i) + temp_output_shape.append(dim) + output_shape.append(temp_output_shape) + else: + dims, ret = acl.mdl.get_cur_output_dims(self.model_desc, output_index) + data_shape = dims.get("dims") + output_shape.append(data_shape) + + return output_shape + + def _destroy_databuffer(self): + 
for dataset in [self.load_input_dataset, self.load_output_dataset]: + if not dataset: + continue + + num = acl.mdl.get_dataset_num_buffers(dataset) + for i in range(num): + data_buf = acl.mdl.get_dataset_buffer(dataset, i) + if data_buf: + ret = acl.destroy_data_buffer(data_buf) + check_ret("acl.destroy_data_buffer", ret) + ret = acl.mdl.destroy_dataset(dataset) + check_ret("acl.mdl.destroy_dataset", ret) + + def _prepare_data_buffer_in(self, input_data=None): + self._gen_data_buffer(self.input_size, des="in", data=input_data) + + def _prepare_data_buffer_out(self, input_data=None): + self._gen_data_buffer(self.output_size, des="out", data=input_data) + + def _prepare_data_buffer_host(self, input_data=None): + self._gen_dataset_output_host(self.output_size, des="outhost", data=input_data) + + def forward(self, input_data, dims=None): + if not isinstance(input_data, (list, tuple)): + input_data = [input_data] + if self.check_input: + self.input_data_type = [] + for data in input_data: + self.input_data_type.append(str(data.dtype)) + check_input_type(self.input_data_type, self.model_input_data_type) + if self.dynamic: + self._prepare_data_buffer_in(input_data) + self._prepare_data_buffer_out(self.output_data_shape) + self._prepare_data_buffer_host(self.output_data_shape) + self._data_from_host_to_device(input_data) + + if self.ascend_mbatch_shape_data: + if dims is None: + raise Exception("the model is static multi shape model, dims can not be None") + index, ret = acl.mdl.get_input_index_by_name(self.model_desc, 'ascend_mbatch_shape_data') + ret = acl.mdl.set_input_dynamic_dims(self.model_id, self.load_input_dataset, index, dims) + check_ret("acl.mdl.set_input_dynamic_dims", ret) + st = time.time() + ret = acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset) + self.exe_t = time.time() - st + check_ret("acl.mdl.execute", ret) + # get output shape + output_shape = self._get_output_shape() + self._destroy_databuffer() + result = self._data_from_device_to_host(input_data=input_data, output_shape=output_shape) + if self.dynamic: + self._release_data_buffer() + return result + + def get_result(self, output_data, data, output_shape): + dataset = [] + for i in range(len(output_data)): + # fix dynamic batch size + data_type = acl.mdl.get_output_data_type(self.model_desc, i) + data_len = functools.reduce(lambda x, y: x * y, output_shape[i]) + ftype = np.dtype(ACL_DTYPE.get(data_type)) + size = output_data[i]["size"] + ptr = output_data[i]["buffer"] + data = acl.util.ptr_to_numpy(ptr, (size,), 1) + np_array = np.frombuffer(bytearray(data[:data_len * ftype.itemsize]), dtype=ftype, count=data_len) + np_array = np_array.reshape(output_shape[i]) + dataset.append(np_array) + return dataset, self.exe_t * 1000 + + @staticmethod + def gen_data_size(size, dtype): + dtype = np.dtype(dtype) + return int(size * dtype.itemsize) diff --git a/ACL_PyTorch/contrib/audio/WeNet/adaptdecoder.py b/ACL_PyTorch/contrib/audio/WeNet/adaptdecoder.py new file mode 100644 index 0000000000000000000000000000000000000000..deca1909985a784b5e37562a751cd12e5103b530 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/adaptdecoder.py @@ -0,0 +1,96 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. 
+# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ + +from gener_core.mod_modify.onnx_graph import OXGraph +from gener_core.mod_modify.onnx_node import OXNode +from gener_core.mod_modify.interface import AttrType as AT + +from gener_core.mod_modify.onnx_graph import OXGraph +from gener_core.mod_modify.onnx_node import OXNode +from gener_core.mod_modify.interface import AttrType as AT + +mod = OXGraph("decoder.onnx") +Expand_lists = mod.get_nodes_by_optype("Expand") +for i in range(len(Expand_lists)): + now_expand = mod.get_node(Expand_lists[i]) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_first_input_now = mod.get_node(now_expand.input_name[0]) + now_expand.set_input_node(0, [cast_node]) + cast_node.set_input_node(0, [Expand_first_input_now]) + +Less_lists = mod.get_nodes_by_optype("Less") +for i in range(len(Less_lists)): + now_expand = mod.get_node(Less_lists[i]) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_second_input_now = mod.get_node(now_expand.input_name[1]) + now_expand.set_input_node(1, [cast_node]) + cast_node.set_input_node(0, [Expand_second_input_now]) + +Greater_lists = mod.get_nodes_by_optype("Greater") +for greater_node in Greater_lists: + now_expand = mod.get_node(greater_node) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_second_input_now = mod.get_node(now_expand.input_name[1]) + now_expand.set_input_node(1, [cast_node]) + cast_node.set_input_node(0, [Expand_second_input_now]) + +not_change_cast = [] +Range_lists = mod.get_nodes_by_optype("Range") +for range_node in Range_lists: + now_expand = mod.get_node(range_node) + Expand_first_input_now = mod.get_node(now_expand.input_name[1]) + not_change_cast.append(Expand_first_input_now.name) + +to = 6 +Cast = mod.get_nodes_by_optype("Cast") +for cast_node in Cast: + now_Cast = mod.get_node(cast_node) + if 
now_Cast.get_attr("to", AT.INT) == 7 and now_Cast.name not in not_change_cast: + now_Cast.set_attr({"to": (AT.INT, to)}) + +Equal = mod.get_nodes_by_optype("Equal") +for equal_node in Equal: + now_equal = mod.get_node(equal_node) + now_ends = mod.get_node(now_equal.input_name[1]) + if now_ends.op_type in ("Initializer", "Constant") and now_ends.const_value.dtype == "int64": + print("now_ends.dtype:", now_ends.const_value.dtype) + val = now_ends.const_value.astype("int32") + now_ends.set_const_value(val) + +mod.save_new_model("decoder_final.onnx") + diff --git a/ACL_PyTorch/contrib/audio/WeNet/adaptnoflashencoder.py b/ACL_PyTorch/contrib/audio/WeNet/adaptnoflashencoder.py new file mode 100644 index 0000000000000000000000000000000000000000..8d1675b1d6d6ce9327a4066297817b1914f47610 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/adaptnoflashencoder.py @@ -0,0 +1,81 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ============================================================================ + +from gener_core.mod_modify.onnx_graph import OXGraph +from gener_core.mod_modify.onnx_node import OXNode +from gener_core.mod_modify.interface import AttrType as AT +import numpy as np + +mod = OXGraph("no_flash_encoder.onnx") +Expand_lists = mod.get_nodes_by_optype("Less") +for i in range(len(Expand_lists)): + now_expand = mod.get_node(Expand_lists[i]) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_first_input_now = mod.get_node(now_expand.input_name[1]) + now_expand.set_input_node(1, [cast_node]) + cast_node.set_input_node(0, [Expand_first_input_now]) + +Equal = mod.get_nodes_by_optype("Equal") +for equal_node in Equal: + now_equal = mod.get_node(equal_node) + now_ends = mod.get_node(now_equal.input_name[1]) + if now_ends.op_type in ("Initializer", "Constant") and now_ends.const_value.dtype == "int64": + print("now_ends.dtype:", now_ends.const_value.dtype) + val = now_ends.const_value.astype("int32") + now_ends.set_const_value(val) + +Expand_lists = ["Expand_20"] +for expand_node in Expand_lists: + now_expand = mod.get_node(expand_node) + cast_node = mod.add_new_node(now_expand.name + "_cast", "Cast", + {"to": (AT.INT, 6) + }) + Expand_first_input_now = mod.get_node(now_expand.input_name[0]) + now_expand.set_input_node(0, [cast_node]) + cast_node.set_input_node(0, [Expand_first_input_now]) + +not_change_cast = [] +Range_lists = mod.get_nodes_by_optype("Range") +for range_node in Range_lists: + now_expand = mod.get_node(range_node) + Expand_first_input_now = mod.get_node(now_expand.input_name[1]) + not_change_cast.append(Expand_first_input_now.name) + +to = 6 +Cast = mod.get_nodes_by_optype("Cast") +for i in range(len(Cast)): + now_Cast = mod.get_node(Cast[i]) + if now_Cast.get_attr("to", AT.INT) == 7 and now_Cast.name not in not_change_cast: + now_Cast.set_attr({"to": (AT.INT, to)}) +mod.save_new_model("no_flash_encoder_revise.onnx") diff --git a/ACL_PyTorch/contrib/audio/WeNet/asr_model.py b/ACL_PyTorch/contrib/audio/WeNet/asr_model.py new file mode 100644 index 0000000000000000000000000000000000000000..11f16147d8a43df03edfdd4bb310383f0c701102 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/asr_model.py @@ -0,0 +1,720 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
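+
+# Note: in this variant of asr_model.py the encoder and decoder modules are
+# expected to return their execution time as an extra value; _forward_encoder
+# and _ctc_prefix_beam_search propagate encoder_t, and attention_rescoring
+# returns (best_hyp, encoder_t + decoder_t) so the caller can compute fps.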
+ +from collections import defaultdict +from typing import List, Optional, Tuple + +import torch + +from torch.nn.utils.rnn import pad_sequence + +from wenet.transformer.cmvn import GlobalCMVN +from wenet.transformer.ctc import CTC +from wenet.transformer.decoder import (TransformerDecoder, + BiTransformerDecoder) +from wenet.transformer.encoder import ConformerEncoder +from wenet.transformer.encoder import TransformerEncoder +from wenet.transformer.label_smoothing_loss import LabelSmoothingLoss +from wenet.utils.cmvn import load_cmvn +from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, + remove_duplicates_and_blank, th_accuracy, + reverse_pad_list) +from wenet.utils.mask import (make_pad_mask, mask_finished_preds, + mask_finished_scores, subsequent_mask) + +class ASRModel(torch.nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + def __init__( + self, + vocab_size: int, + encoder: TransformerEncoder, + decoder: TransformerDecoder, + ctc: CTC, + ctc_weight: float = 0.5, + ignore_id: int = IGNORE_ID, + reverse_weight: float = 0.0, + lsm_weight: float = 0.0, + length_normalized_loss: bool = False, + ): + assert 0.0 <= ctc_weight <= 1.0, ctc_weight + + super().__init__() + # note that eos is the same as sos (equivalent ID) + self.sos = vocab_size - 1 + self.eos = vocab_size - 1 + self.vocab_size = vocab_size + self.ignore_id = ignore_id + self.ctc_weight = ctc_weight + self.reverse_weight = reverse_weight + + self.encoder = encoder + self.decoder = decoder + self.ctc = ctc + self.criterion_att = LabelSmoothingLoss( + size=vocab_size, + padding_idx=ignore_id, + smoothing=lsm_weight, + normalize_length=length_normalized_loss, + ) + + def forward( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + text: torch.Tensor, + text_lengths: torch.Tensor, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], + Optional[torch.Tensor]]: + """Frontend + Encoder + Decoder + Calc loss + + Args: + speech: (Batch, Length, ...) + speech_lengths: (Batch, ) + text: (Batch, Length) + text_lengths: (Batch,) + """ + print("***") + assert text_lengths.dim() == 1, text_lengths.shape + # Check that batch_size is unified + assert (speech.shape[0] == speech_lengths.shape[0] == text.shape[0] == + text_lengths.shape[0]), (speech.shape, speech_lengths.shape, + text.shape, text_lengths.shape) + # 1. Encoder + encoder_out, encoder_mask = self.encoder(speech, speech_lengths) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + + # 2a. Attention-decoder branch + if self.ctc_weight != 1.0: + loss_att, acc_att = self._calc_att_loss(encoder_out, encoder_mask, + text, text_lengths) + else: + loss_att = None + + # 2b. CTC branch + if self.ctc_weight != 0.0: + loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, + text_lengths) + else: + loss_ctc = None + + if loss_ctc is None: + loss = loss_att + elif loss_att is None: + loss = loss_ctc + else: + loss = self.ctc_weight * loss_ctc + (1 - + self.ctc_weight) * loss_att + return loss, loss_att, loss_ctc + + def _calc_att_loss( + self, + encoder_out: torch.Tensor, + encoder_mask: torch.Tensor, + ys_pad: torch.Tensor, + ys_pad_lens: torch.Tensor, + ) -> Tuple[torch.Tensor, float]: + ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, + self.ignore_id) + ys_in_lens = ys_pad_lens + 1 + + # reverse the seq, used for right to left decoder + r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id)) + r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos, + self.ignore_id) + # 1. 
Forward decoder + decoder_out, r_decoder_out, _ = self.decoder(encoder_out, encoder_mask, + ys_in_pad, ys_in_lens, + r_ys_in_pad, + self.reverse_weight) + # 2. Compute attention loss + loss_att = self.criterion_att(decoder_out, ys_out_pad) + r_loss_att = torch.tensor(0.0) + if self.reverse_weight > 0.0: + r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad) + loss_att = loss_att * ( + 1 - self.reverse_weight) + r_loss_att * self.reverse_weight + acc_att = th_accuracy( + decoder_out.view(-1, self.vocab_size), + ys_out_pad, + ignore_label=self.ignore_id, + ) + return loss_att, acc_att + + def _forward_encoder( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Let's assume B = batch_size + # 1. Encoder + if simulate_streaming and decoding_chunk_size > 0: + encoder_out, encoder_mask = self.encoder.forward_chunk_by_chunk( + speech, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + else: + encoder_out, encoder_mask, encoder_t = self.encoder( + speech, + speech_lengths, + decoding_chunk_size=decoding_chunk_size, + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + return encoder_out, encoder_mask, encoder_t + + def recognize( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + beam_size: int = 10, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> torch.Tensor: + """ Apply beam search on attention decoder + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + + Returns: + torch.Tensor: decoding result, (batch, max_result_len) + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + device = speech.device + batch_size = speech.shape[0] + + # Let's assume B = batch_size and N = beam_size + # 1. Encoder + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + encoder_dim = encoder_out.size(2) + running_size = batch_size * beam_size + encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( + running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) + encoder_mask = encoder_mask.unsqueeze(1).repeat( + 1, beam_size, 1, 1).view(running_size, 1, + maxlen) # (B*N, 1, max_len) + + hyps = torch.ones([running_size, 1], dtype=torch.long, + device=device).fill_(self.sos) # (B*N, 1) + scores = torch.tensor([0.0] + [-float('inf')] * (beam_size - 1), + dtype=torch.float) + scores = scores.to(device).repeat([batch_size]).unsqueeze(1).to( + device) # (B*N, 1) + end_flag = torch.zeros_like(scores, dtype=torch.bool, device=device) + cache: Optional[List[torch.Tensor]] = None + # 2. 
Decoder forward step by step + for i in range(1, maxlen + 1): + # Stop if all batch and all beam produce eos + if end_flag.sum() == running_size: + break + # 2.1 Forward decoder step + hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( + running_size, 1, 1).to(device) # (B*N, i, i) + # logp: (B*N, vocab) + logp, cache = self.decoder.forward_one_step( + encoder_out, encoder_mask, hyps, hyps_mask, cache) + # 2.2 First beam prune: select topk best prob at current time + top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) + top_k_logp = mask_finished_scores(top_k_logp, end_flag) + top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) + # 2.3 Second beam prune: select topk score with history + scores = scores + top_k_logp # (B*N, N), broadcast add + scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) + scores, offset_k_index = scores.topk(k=beam_size) # (B, N) + scores = scores.view(-1, 1) # (B*N, 1) + # 2.4. Compute base index in top_k_index, + # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), + # then find offset_k_index in top_k_index + base_k_index = torch.arange(batch_size, device=device).view( + -1, 1).repeat([1, beam_size]) # (B, N) + base_k_index = base_k_index * beam_size * beam_size + best_k_index = base_k_index.view(-1) + offset_k_index.view( + -1) # (B*N) + + # 2.5 Update best hyps + best_k_pred = torch.index_select(top_k_index.view(-1), + dim=-1, + index=best_k_index) # (B*N) + best_hyps_index = best_k_index // beam_size + last_best_k_hyps = torch.index_select( + hyps, dim=0, index=best_hyps_index) # (B*N, i) + hyps = torch.cat((last_best_k_hyps, best_k_pred.view(-1, 1)), + dim=1) # (B*N, i+1) + + # 2.6 Update end flag + end_flag = torch.eq(hyps[:, -1], self.eos).view(-1, 1) + + # 3. Select best of best + scores = scores.view(batch_size, beam_size) + # TODO: length normalization + best_index = torch.argmax(scores, dim=-1).long() + best_hyps_index = best_index + torch.arange( + batch_size, dtype=torch.long, device=device) * beam_size + best_hyps = torch.index_select(hyps, dim=0, index=best_hyps_index) + best_hyps = best_hyps[:, 1:] + return best_hyps + + def ctc_greedy_search( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> List[List[int]]: + """ Apply CTC greedy search + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[List[int]]: best path result + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + batch_size = speech.shape[0] + # Let's assume B = batch_size + encoder_out, encoder_mask = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + encoder_out_lens = encoder_mask.squeeze(1).sum(1) + ctc_probs = self.ctc.log_softmax( + encoder_out) # (B, maxlen, vocab_size) + topk_prob, topk_index = ctc_probs.topk(1, dim=2) # (B, maxlen, 1) + topk_index = topk_index.view(batch_size, maxlen) # (B, maxlen) + mask = make_pad_mask(encoder_out_lens) # (B, maxlen) + topk_index = topk_index.masked_fill_(mask, self.eos) # (B, maxlen) + hyps = [hyp.tolist() for hyp in topk_index] + hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] + return hyps + + def _ctc_prefix_beam_search( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + beam_size: int, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> Tuple[List[List[int]], torch.Tensor]: + """ CTC prefix beam search inner implementation + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + + Returns: + List[List[int]]: nbest results + torch.Tensor: encoder output, (1, max_len, encoder_dim), + it will be used for rescoring in attention rescoring mode + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + batch_size = speech.shape[0] + # For CTC prefix beam search, we only support batch_size=1 + assert batch_size == 1 + # Let's assume B = batch_size and N = beam_size + # 1. Encoder forward and get CTC score + encoder_out, encoder_mask, encoder_t = self._forward_encoder( + speech, speech_lengths, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) # (B, maxlen, encoder_dim) + maxlen = encoder_out.size(1) + ctc_probs = self.ctc.log_softmax( + encoder_out) # (1, maxlen, vocab_size) + ctc_probs = ctc_probs.squeeze(0) + # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) + cur_hyps = [(tuple(), (0.0, -float('inf')))] + # 2. 
CTC beam search step by step + for t in range(0, maxlen): + logp = ctc_probs[t] # (vocab_size,) + # key: prefix, value (pb, pnb), default value(-inf, -inf) + next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) + # 2.1 First beam prune: select topk best + top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) + for s in top_k_index: + s = s.item() + ps = logp[s].item() + for prefix, (pb, pnb) in cur_hyps: + last = prefix[-1] if len(prefix) > 0 else None + if s == 0: # blank + n_pb, n_pnb = next_hyps[prefix] + n_pb = log_add([n_pb, pb + ps, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + elif s == last: + # Update *ss -> *s; + n_pb, n_pnb = next_hyps[prefix] + n_pnb = log_add([n_pnb, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + # Update *s-s -> *ss, - is for blank + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + else: + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + + # 2.2 Second beam prune + next_hyps = sorted(next_hyps.items(), + key=lambda x: log_add(list(x[1])), + reverse=True) + cur_hyps = next_hyps[:beam_size] + hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] + return hyps, encoder_out, encoder_t + + def ctc_prefix_beam_search( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + beam_size: int, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + simulate_streaming: bool = False, + ) -> List[int]: + """ Apply CTC prefix beam search + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + + Returns: + List[int]: CTC prefix beam search nbest results + """ + hyps, _ = self._ctc_prefix_beam_search(speech, speech_lengths, + beam_size, decoding_chunk_size, + num_decoding_left_chunks, + simulate_streaming) + return hyps[0][0] + + def attention_rescoring( + self, + speech: torch.Tensor, + speech_lengths: torch.Tensor, + beam_size: int, + decoding_chunk_size: int = -1, + num_decoding_left_chunks: int = -1, + ctc_weight: float = 0.0, + simulate_streaming: bool = False, + reverse_weight: float = 0.0, + ) -> List[int]: + """ Apply attention rescoring decoding, CTC prefix beam search + is applied first to get nbest, then we resoring the nbest on + attention decoder with corresponding encoder out + + Args: + speech (torch.Tensor): (batch, max_len, feat_dim) + speech_length (torch.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + reverse_weight (float): right to left decoder weight + ctc_weight (float): ctc score weight + + Returns: + List[int]: Attention rescoring result + """ + assert speech.shape[0] == speech_lengths.shape[0] + assert decoding_chunk_size != 0 + if reverse_weight > 0.0: + # decoder should be a bitransformer decoder if reverse_weight > 0.0 + assert hasattr(self.decoder, 'right_decoder') + device = speech.device + batch_size = speech.shape[0] + # For attention rescoring we only support batch_size=1 + assert batch_size == 1 + # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size + hyps, encoder_out, encoder_t = self._ctc_prefix_beam_search( + speech, speech_lengths, beam_size, decoding_chunk_size, + num_decoding_left_chunks, simulate_streaming) + + assert len(hyps) == beam_size + hyps_pad = pad_sequence([ + torch.tensor(hyp[0], device=device, dtype=torch.long) + for hyp in hyps + ], True, self.ignore_id) # (beam_size, max_hyps_len) + ori_hyps_pad = hyps_pad + hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], + device=device, + dtype=torch.long) # (beam_size,) + hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) + hyps_lens = hyps_lens + 1 # Add at begining + encoder_out = encoder_out.repeat(beam_size, 1, 1) + encoder_mask = torch.ones(beam_size, + 1, + encoder_out.size(1), + dtype=torch.bool, + device=device) + # used for right to left decoder + r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) + r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, + self.ignore_id) + + decoder_out, r_decoder_out, _, decoder_t = self.decoder( + encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad, + reverse_weight) # (beam_size, max_hyps_len, vocab_size) + + decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) + decoder_out = decoder_out.cpu().numpy() + # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a + # conventional transformer decoder. 
+ r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) + r_decoder_out = r_decoder_out.cpu().numpy() + # Only use decoder score for rescoring + best_score = -float('inf') + best_index = 0 + for i, hyp in enumerate(hyps): + score = 0.0 + for j, w in enumerate(hyp[0]): + score += decoder_out[i][j][w] + score += decoder_out[i][len(hyp[0])][self.eos] + # add right to left decoder score + if reverse_weight > 0: + r_score = 0.0 + for j, w in enumerate(hyp[0]): + r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] + r_score += r_decoder_out[i][len(hyp[0])][self.eos] + score = score * (1 - reverse_weight) + r_score * reverse_weight + # add ctc score + score += hyp[1] * ctc_weight + if score > best_score: + best_score = score + best_index = i + return hyps[best_index][0], encoder_t+decoder_t + + @torch.jit.export + def subsampling_rate(self) -> int: + """ Export interface for c++ call, return subsampling_rate of the + model + """ + return self.encoder.embed.subsampling_rate + + @torch.jit.export + def right_context(self) -> int: + """ Export interface for c++ call, return right_context of the model + """ + return self.encoder.embed.right_context + + @torch.jit.export + def sos_symbol(self) -> int: + """ Export interface for c++ call, return sos symbol id of the model + """ + return self.sos + + @torch.jit.export + def eos_symbol(self) -> int: + """ Export interface for c++ call, return eos symbol id of the model + """ + return self.eos + + @torch.jit.export + def forward_encoder_chunk( + self, + xs: torch.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[torch.Tensor] = None, + elayers_output_cache: Optional[List[torch.Tensor]] = None, + conformer_cnn_cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], + List[torch.Tensor]]: + """ Export interface for c++ call, give input chunk xs, and return + output from time 0 to current chunk. + + Args: + xs (torch.Tensor): chunk input + subsampling_cache (Optional[torch.Tensor]): subsampling cache + elayers_output_cache (Optional[List[torch.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer + cnn cache + + Returns: + torch.Tensor: output, it ranges from time 0 to current chunk. 
+            torch.Tensor: subsampling cache
+            List[torch.Tensor]: attention cache
+            List[torch.Tensor]: conformer cnn cache
+
+        """
+        return self.encoder.forward_chunk(xs, offset, required_cache_size,
+                                          subsampling_cache,
+                                          elayers_output_cache,
+                                          conformer_cnn_cache)
+
+    @torch.jit.export
+    def ctc_activation(self, xs: torch.Tensor) -> torch.Tensor:
+        """ Export interface for c++ call, apply linear transform and log
+            softmax before ctc
+        Args:
+            xs (torch.Tensor): encoder output
+
+        Returns:
+            torch.Tensor: activation before ctc
+
+        """
+        return self.ctc.log_softmax(xs)
+
+    @torch.jit.export
+    def is_bidirectional_decoder(self) -> bool:
+        """
+        Returns:
+            bool: True if the decoder has a right-to-left branch
+        """
+        if hasattr(self.decoder, 'right_decoder'):
+            return True
+        else:
+            return False
+
+    @torch.jit.export
+    def forward_attention_decoder(
+        self,
+        hyps: torch.Tensor,
+        hyps_lens: torch.Tensor,
+        encoder_out: torch.Tensor,
+        reverse_weight: float = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """ Export interface for c++ call, forward decoder with multiple
+            hypothesis from ctc prefix beam search and one encoder output
+        Args:
+            hyps (torch.Tensor): hyps from ctc prefix beam search, already
+                padded with sos at the beginning
+            hyps_lens (torch.Tensor): length of each hyp in hyps
+            encoder_out (torch.Tensor): corresponding encoder output
+            r_hyps (torch.Tensor): hyps from ctc prefix beam search, already
+                padded with eos at the beginning, which is used for the right
+                to left decoder
+            reverse_weight: used for verifying whether the right to left
+                decoder is used, > 0 will use it.
+
+        Returns:
+            torch.Tensor: decoder output
+        """
+        assert encoder_out.size(0) == 1
+        num_hyps = hyps.size(0)
+        assert hyps_lens.size(0) == num_hyps
+        encoder_out = encoder_out.repeat(num_hyps, 1, 1)
+        encoder_mask = torch.ones(num_hyps,
+                                  1,
+                                  encoder_out.size(1),
+                                  dtype=torch.bool,
+                                  device=encoder_out.device)
+        # input for right to left decoder
+        # this hyps_lens has counted the <sos> token, we need to minus it.
+        r_hyps_lens = hyps_lens - 1
+        # this hyps has included the <sos> token, so it should be stripped
+        # to convert back to the original hyps.
+        r_hyps = hyps[:, 1:]
+        r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id))
+        r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id)
+        decoder_out, r_decoder_out, _ = self.decoder(
+            encoder_out, encoder_mask, hyps, hyps_lens, r_hyps,
+            reverse_weight)  # (num_hyps, max_hyps_len, vocab_size)
+        decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1)
+
+        # right to left decoder may not be used during the decoding process,
+        # which depends on reverse_weight param.
+ # r_dccoder_out will be 0.0, if reverse_weight is 0.0 + r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) + return decoder_out, r_decoder_out + + +def init_asr_model(configs): + if configs['cmvn_file'] is not None: + mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) + global_cmvn = GlobalCMVN( + torch.from_numpy(mean).float(), + torch.from_numpy(istd).float()) + else: + global_cmvn = None + + input_dim = configs['input_dim'] + vocab_size = configs['output_dim'] + + encoder_type = configs.get('encoder', 'conformer') + decoder_type = configs.get('decoder', 'bitransformer') + + if encoder_type == 'conformer': + encoder = ConformerEncoder(input_dim, + global_cmvn=global_cmvn, + **configs['encoder_conf']) + else: + encoder = TransformerEncoder(input_dim, + global_cmvn=global_cmvn, + **configs['encoder_conf']) + if decoder_type == 'transformer': + decoder = TransformerDecoder(vocab_size, encoder.output_size(), + **configs['decoder_conf']) + else: + assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0 + assert configs['decoder_conf']['r_num_blocks'] > 0 + decoder = BiTransformerDecoder(vocab_size, encoder.output_size(), + **configs['decoder_conf']) + ctc = CTC(vocab_size, encoder.output_size()) + model = ASRModel( + vocab_size=vocab_size, + encoder=encoder, + decoder=decoder, + ctc=ctc, + **configs['model_conf'], + ) + return model diff --git a/ACL_PyTorch/contrib/audio/WeNet/decoder.py b/ACL_PyTorch/contrib/audio/WeNet/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b9296e3d13f58c0af7d3f6628b120d1eed6ac76e --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/decoder.py @@ -0,0 +1,288 @@ +# Copyright 2021 Mobvoi Inc. All Rights Reserved. +# Author: di.wu@mobvoi.com (DI WU) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +"""Decoder definition.""" +from typing import Tuple, List, Optional + +import torch +from typeguard import check_argument_types + +from wenet.transformer.attention import MultiHeadedAttention +from wenet.transformer.decoder_layer import DecoderLayer +from wenet.transformer.embedding import PositionalEncoding +from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward +from wenet.utils.mask import (subsequent_mask, make_pad_mask) + +import time + +class TransformerDecoder(torch.nn.Module): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. 
+ concat_after: whether to concat attention layer's input and output + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + normalize_before: bool = True, + concat_after: bool = False, + ): + assert check_argument_types() + super().__init__() + attention_dim = encoder_output_size + + if input_layer == "embed": + self.embed = torch.nn.Sequential( + torch.nn.Embedding(vocab_size, attention_dim), + PositionalEncoding(attention_dim, positional_dropout_rate), + ) + else: + raise ValueError(f"only 'embed' is supported: {input_layer}") + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(attention_dim, eps=1e-12) + self.use_output_layer = use_output_layer + self.output_layer = torch.nn.Linear(attention_dim, vocab_size) + self.num_blocks = num_blocks + self.decoders = torch.nn.ModuleList([ + DecoderLayer( + attention_dim, + MultiHeadedAttention(attention_heads, attention_dim, + self_attention_dropout_rate), + MultiHeadedAttention(attention_heads, attention_dim, + src_attention_dropout_rate), + PositionwiseFeedForward(attention_dim, linear_units, + dropout_rate), + dropout_rate, + normalize_before, + concat_after, + ) for _ in range(self.num_blocks) + ]) + + def forward( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + r_ys_in_pad: Optional[torch.Tensor] = None, + reverse_weight: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward decoder. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: not used in transformer decoder, in order to unify api + with bidirectional decoder + reverse_weight: not used in transformer decoder, in order to unify + api with bidirectional decode + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + torch.tensor(0.0), in order to unify api with bidirectional decoder + olens: (batch, ) + """ + tgt = ys_in_pad + + # tgt_mask: (B, 1, L) + tgt_mask = (~make_pad_mask(ys_in_lens).unsqueeze(1)).to(tgt.device) + # m: (1, L, L) + m = subsequent_mask(tgt_mask.size(-1), + device=tgt_mask.device).unsqueeze(0) + # tgt_mask: (B, L, L) + tgt_mask = tgt_mask & m + x, _ = self.embed(tgt) + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, + memory_mask) + if self.normalize_before: + x = self.after_norm(x) + if self.use_output_layer: + x = self.output_layer(x) + olens = tgt_mask.sum(1) + return x, torch.tensor(0.0), olens + + def forward_one_step( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward one step. + This is only used for decoding. 
+ Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, maxlen_out, token) + """ + x, _ = self.embed(tgt) + new_cache = [] + for i, decoder in enumerate(self.decoders): + if cache is None: + c = None + else: + c = cache[i] + x, tgt_mask, memory, memory_mask = decoder(x, + tgt_mask, + memory, + memory_mask, + cache=c) + new_cache.append(x) + if self.normalize_before: + y = self.after_norm(x[:, -1]) + else: + y = x[:, -1] + if self.use_output_layer: + y = torch.log_softmax(self.output_layer(y), dim=-1) + return y, new_cache + + +class BiTransformerDecoder(torch.nn.Module): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + r_num_blocks: the number of right to left decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding or ScaledPositionalEncoding + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + concat_after: whether to concat attention layer's input and output + True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + def __init__( + self, + vocab_size: int, + encoder_output_size: int, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + r_num_blocks: int = 0, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + self_attention_dropout_rate: float = 0.0, + src_attention_dropout_rate: float = 0.0, + input_layer: str = "embed", + use_output_layer: bool = True, + normalize_before: bool = True, + concat_after: bool = False, + ): + + assert check_argument_types() + super().__init__() + self.left_decoder = TransformerDecoder( + vocab_size, encoder_output_size, attention_heads, linear_units, + num_blocks, dropout_rate, positional_dropout_rate, + self_attention_dropout_rate, src_attention_dropout_rate, + input_layer, use_output_layer, normalize_before, concat_after) + + self.right_decoder = TransformerDecoder( + vocab_size, encoder_output_size, attention_heads, linear_units, + r_num_blocks, dropout_rate, positional_dropout_rate, + self_attention_dropout_rate, src_attention_dropout_rate, + input_layer, use_output_layer, normalize_before, concat_after) + + def forward( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, + ys_in_lens: torch.Tensor, + r_ys_in_pad: torch.Tensor, + reverse_weight: float = 0.0, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Forward decoder. 
+ Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoder memory mask, (batch, 1, maxlen_in) + ys_in_pad: padded input token ids, int64 (batch, maxlen_out) + ys_in_lens: input lengths of this batch (batch) + r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out), + used for right to left decoder + reverse_weight: used for right to left decoder + Returns: + (tuple): tuple containing: + x: decoded token score before softmax (batch, maxlen_out, + vocab_size) if use_output_layer is True, + r_x: x: decoded token score (right to left decoder) + before softmax (batch, maxlen_out, vocab_size) + if use_output_layer is True, + olens: (batch, ) + """ + st = time.time() + l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, + ys_in_lens) + r_x = torch.tensor(0.0) + if reverse_weight > 0.0: + r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad, + ys_in_lens) + return l_x, r_x, olens, time.time()-st + + def forward_one_step( + self, + memory: torch.Tensor, + memory_mask: torch.Tensor, + tgt: torch.Tensor, + tgt_mask: torch.Tensor, + cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward one step. + This is only used for decoding. + Args: + memory: encoded memory, float32 (batch, maxlen_in, feat) + memory_mask: encoded memory mask, (batch, 1, maxlen_in) + tgt: input token ids, int64 (batch, maxlen_out) + tgt_mask: input token mask, (batch, maxlen_out) + dtype=torch.uint8 in PyTorch 1.2- + dtype=torch.bool in PyTorch 1.2+ (include 1.2) + cache: cached output list of (batch, max_time_out-1, size) + Returns: + y, cache: NN output value and cache per `self.decoders`. + y.shape` is (batch, maxlen_out, token) + """ + return self.left_decoder.forward_one_step(memory, memory_mask, tgt, + tgt_mask, cache) diff --git a/ACL_PyTorch/contrib/audio/WeNet/decoder.sh b/ACL_PyTorch/contrib/audio/WeNet/decoder.sh new file mode 100644 index 0000000000000000000000000000000000000000..0e33e8da1590f87817c404f399bbebb1ac0a60d2 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/decoder.sh @@ -0,0 +1,13 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} +atc --model=decoder_final.onnx --framework=5 --output=decoder_final --input_format=ND \ + --input_shape_range="memory:[10,1~1500,256];memory_mask:[10,1,1~1500];ys_in_pad:[10,1~1500];ys_in_lens:[10];r_ys_in_pad:[10,1~1500]" --out_nodes="Add_488:0;Add_977:0" --log=error --soc_version=Ascend310 + + diff --git a/ACL_PyTorch/contrib/audio/WeNet/encoder.py b/ACL_PyTorch/contrib/audio/WeNet/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..035d64f4c8f932eb452694d1413757308cd56ee9 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/encoder.py @@ -0,0 +1,452 @@ 
+#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +# Author: di.wu@mobvoi.com (DI WU) +"""Encoder definition.""" +from typing import Tuple, List, Optional + +import torch +from typeguard import check_argument_types + +from wenet.transformer.attention import MultiHeadedAttention +from wenet.transformer.attention import RelPositionMultiHeadedAttention +from wenet.transformer.convolution import ConvolutionModule +from wenet.transformer.embedding import PositionalEncoding +from wenet.transformer.embedding import RelPositionalEncoding +from wenet.transformer.embedding import NoPositionalEncoding +from wenet.transformer.encoder_layer import TransformerEncoderLayer +from wenet.transformer.encoder_layer import ConformerEncoderLayer +from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward +from wenet.transformer.subsampling import Conv2dSubsampling4 +from wenet.transformer.subsampling import Conv2dSubsampling6 +from wenet.transformer.subsampling import Conv2dSubsampling8 +from wenet.transformer.subsampling import LinearNoSubsampling +from wenet.utils.common import get_activation +from wenet.utils.mask import make_pad_mask +from wenet.utils.mask import add_optional_chunk_mask + +import time + +class BaseEncoder(torch.nn.Module): + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + concat_after: bool = False, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + ): + """ + Args: + input_size (int): input dim + output_size (int): dimension of attention + attention_heads (int): the number of heads of multi head attention + linear_units (int): the hidden units number of position-wise feed + forward + num_blocks (int): the number of decoder blocks + dropout_rate (float): dropout rate + attention_dropout_rate (float): dropout rate in attention + positional_dropout_rate (float): dropout rate after adding + positional encoding + input_layer (str): input layer type. + optional [linear, conv2d, conv2d6, conv2d8] + pos_enc_layer_type (str): Encoder positional encoding layer type. + opitonal [abs_pos, scaled_abs_pos, rel_pos, no_pos] + normalize_before (bool): + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + concat_after (bool): whether to concat attention layer's input + and output. 
+ True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + static_chunk_size (int): chunk size for static chunk training and + decoding + use_dynamic_chunk (bool): whether use dynamic chunk size for + training or not, You can only use fixed chunk(chunk_size > 0) + or dyanmic chunk size(use_dynamic_chunk = True) + global_cmvn (Optional[torch.nn.Module]): Optional GlobalCMVN module + use_dynamic_left_chunk (bool): whether use dynamic left chunk in + dynamic chunk training + """ + assert check_argument_types() + super().__init__() + self._output_size = output_size + + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "rel_pos": + pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "no_pos": + pos_enc_class = NoPositionalEncoding + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + if input_layer == "linear": + subsampling_class = LinearNoSubsampling + elif input_layer == "conv2d": + subsampling_class = Conv2dSubsampling4 + elif input_layer == "conv2d6": + subsampling_class = Conv2dSubsampling6 + elif input_layer == "conv2d8": + subsampling_class = Conv2dSubsampling8 + else: + raise ValueError("unknown input_layer: " + input_layer) + + self.global_cmvn = global_cmvn + self.embed = subsampling_class( + input_size, + output_size, + dropout_rate, + pos_enc_class(output_size, positional_dropout_rate), + ) + + self.normalize_before = normalize_before + self.after_norm = torch.nn.LayerNorm(output_size, eps=1e-12) + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: torch.Tensor, + xs_lens: torch.Tensor, + decoding_chunk_size: int = 0, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Embed positions in tensor. + + Args: + xs: padded input tensor (B, T, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. 
+ >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor xs, and subsampled masks + xs: padded output tensor (B, T' ~= T/subsample_rate, D) + masks: torch.Tensor batch padding mask after subsample + (B, 1, T' ~= T/subsample_rate) + """ + st = time.time() + masks = ~make_pad_mask(xs_lens).unsqueeze(1) # (B, 1, T) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) + mask_pad = masks # (B, 1, T/subsample_rate) + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, + decoding_chunk_size, + self.static_chunk_size, + num_decoding_left_chunks) + for layer in self.encoders: + xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + if self.normalize_before: + xs = self.after_norm(xs) + # Here we assume the mask is not changed in encoder layers, so just + # return the masks before encoder layers, and the masks will be used + # for cross attention with decoder later + return xs, masks, time.time()-st + + def forward_chunk( + self, + xs: torch.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[torch.Tensor] = None, + elayers_output_cache: Optional[List[torch.Tensor]] = None, + conformer_cnn_cache: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], + List[torch.Tensor]]: + """ Forward just one chunk + + Args: + xs (torch.Tensor): chunk input + offset (int): current offset in encoder output time stamp + required_cache_size (int): cache size required for next chunk + compuation + >=0: actual cache size + <0: means all history cache is required + subsampling_cache (Optional[torch.Tensor]): subsampling cache + elayers_output_cache (Optional[List[torch.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[torch.Tensor]]): conformer + cnn cache + + Returns: + torch.Tensor: output of current input xs + torch.Tensor: subsampling cache required for next chunk computation + List[torch.Tensor]: encoder layers output cache required for next + chunk computation + List[torch.Tensor]: conformer cnn cache + + """ + assert xs.size(0) == 1 + # tmp_masks is just for interface compatibility + tmp_masks = torch.ones(1, + xs.size(1), + device=xs.device, + dtype=torch.bool) + tmp_masks = tmp_masks.unsqueeze(1) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) + if subsampling_cache is not None: + cache_size = subsampling_cache.size(1) + xs = torch.cat((subsampling_cache, xs), dim=1) + else: + cache_size = 0 + pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1)) + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = xs.size(1) + else: + next_cache_start = max(xs.size(1) - required_cache_size, 0) + r_subsampling_cache = xs[:, next_cache_start:, :] + # Real mask for transformer/conformer layers + masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool) + masks = masks.unsqueeze(1) + r_elayers_output_cache = [] + r_conformer_cnn_cache = [] + for i, layer in enumerate(self.encoders): + if elayers_output_cache is None: + attn_cache = None + else: + attn_cache = elayers_output_cache[i] + if conformer_cnn_cache is None: + cnn_cache = None + else: + cnn_cache = conformer_cnn_cache[i] + xs, _, new_cnn_cache = layer(xs, + masks, + pos_emb, + output_cache=attn_cache, + cnn_cache=cnn_cache) + 
r_elayers_output_cache.append(xs[:, next_cache_start:, :]) + r_conformer_cnn_cache.append(new_cnn_cache) + if self.normalize_before: + xs = self.after_norm(xs) + + return (xs[:, cache_size:, :], r_subsampling_cache, + r_elayers_output_cache, r_conformer_cnn_cache) + + def forward_chunk_by_chunk( + self, + xs: torch.Tensor, + decoding_chunk_size: int, + num_decoding_left_chunks: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ Forward input chunk by chunk with chunk_size like a streaming + fashion + + Here we should pay special attention to computation cache in the + streaming style forward chunk by chunk. Three things should be taken + into account for computation in the current network: + 1. transformer/conformer encoder layers output cache + 2. convolution in conformer + 3. convolution in subsampling + + However, we don't implement subsampling cache for: + 1. We can control subsampling module to output the right result by + overlapping input instead of cache left context, even though it + wastes some computation, but subsampling only takes a very + small fraction of computation in the whole model. + 2. Typically, there are several covolution layers with subsampling + in subsampling module, it is tricky and complicated to do cache + with different convolution layers with different subsampling + rate. + 3. Currently, nn.Sequential is used to stack all the convolution + layers in subsampling, we need to rewrite it to make it work + with cache, which is not prefered. + Args: + xs (torch.Tensor): (1, max_len, dim) + chunk_size (int): decoding chunk size + """ + assert decoding_chunk_size > 0 + # The model is trained by static or dynamic chunk + assert self.static_chunk_size > 0 or self.use_dynamic_chunk + subsampling = self.embed.subsampling_rate + context = self.embed.right_context + 1 # Add current frame + stride = subsampling * decoding_chunk_size + decoding_window = (decoding_chunk_size - 1) * subsampling + context + num_frames = xs.size(1) + subsampling_cache: Optional[torch.Tensor] = None + elayers_output_cache: Optional[List[torch.Tensor]] = None + conformer_cnn_cache: Optional[List[torch.Tensor]] = None + outputs = [] + offset = 0 + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + + # Feed forward overlap input step by step + for cur in range(0, num_frames - context + 1, stride): + end = min(cur + decoding_window, num_frames) + chunk_xs = xs[:, cur:end, :] + (y, subsampling_cache, elayers_output_cache, + conformer_cnn_cache) = self.forward_chunk(chunk_xs, offset, + required_cache_size, + subsampling_cache, + elayers_output_cache, + conformer_cnn_cache) + outputs.append(y) + offset += y.size(1) + ys = torch.cat(outputs, 1) + masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool) + masks = masks.unsqueeze(1) + return ys, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module.""" + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "abs_pos", + normalize_before: bool = True, + concat_after: bool = False, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + ): + """ Construct TransformerEncoder + + See Encoder for the meaning of each parameter. 
+ """ + assert check_argument_types() + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + concat_after, static_chunk_size, use_dynamic_chunk, + global_cmvn, use_dynamic_left_chunk) + self.encoders = torch.nn.ModuleList([ + TransformerEncoderLayer( + output_size, + MultiHeadedAttention(attention_heads, output_size, + attention_dropout_rate), + PositionwiseFeedForward(output_size, linear_units, + dropout_rate), dropout_rate, + normalize_before, concat_after) for _ in range(num_blocks) + ]) + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module.""" + def __init__( + self, + input_size: int, + output_size: int = 256, + attention_heads: int = 4, + linear_units: int = 2048, + num_blocks: int = 6, + dropout_rate: float = 0.1, + positional_dropout_rate: float = 0.1, + attention_dropout_rate: float = 0.0, + input_layer: str = "conv2d", + pos_enc_layer_type: str = "rel_pos", + normalize_before: bool = True, + concat_after: bool = False, + static_chunk_size: int = 0, + use_dynamic_chunk: bool = False, + global_cmvn: torch.nn.Module = None, + use_dynamic_left_chunk: bool = False, + positionwise_conv_kernel_size: int = 1, + macaron_style: bool = True, + selfattention_layer_type: str = "rel_selfattn", + activation_type: str = "swish", + use_cnn_module: bool = True, + cnn_module_kernel: int = 15, + causal: bool = False, + cnn_module_norm: str = "batch_norm", + ): + """Construct ConformerEncoder + + Args: + input_size to use_dynamic_chunk, see in BaseEncoder + positionwise_conv_kernel_size (int): Kernel size of positionwise + conv1d layer. + macaron_style (bool): Whether to use macaron style for + positionwise layer. + selfattention_layer_type (str): Encoder attention layer type, + the parameter has no effect now, it's just for configure + compatibility. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernel size of convolution module. + causal (bool): whether to use causal convolution or not. 
+ """ + assert check_argument_types() + super().__init__(input_size, output_size, attention_heads, + linear_units, num_blocks, dropout_rate, + positional_dropout_rate, attention_dropout_rate, + input_layer, pos_enc_layer_type, normalize_before, + concat_after, static_chunk_size, use_dynamic_chunk, + global_cmvn, use_dynamic_left_chunk) + activation = get_activation(activation_type) + + # self-attention module definition + if pos_enc_layer_type == "no_pos": + encoder_selfattn_layer = MultiHeadedAttention + else: + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = ( + attention_heads, + output_size, + attention_dropout_rate, + ) + # feed-forward module definition + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = ( + output_size, + linear_units, + dropout_rate, + activation, + ) + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (output_size, cnn_module_kernel, activation, + cnn_module_norm, causal) + + self.encoders = torch.nn.ModuleList([ + ConformerEncoderLayer( + output_size, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer( + *positionwise_layer_args) if macaron_style else None, + convolution_layer( + *convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + ) for _ in range(num_blocks) + ]) diff --git a/ACL_PyTorch/contrib/audio/WeNet/env.sh b/ACL_PyTorch/contrib/audio/WeNet/env.sh new file mode 100644 index 0000000000000000000000000000000000000000..33243beff29297566d7412a08452ad6657a8d6fa --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/env.sh @@ -0,0 +1,9 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} diff --git a/ACL_PyTorch/contrib/audio/WeNet/export_onnx.diff b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.diff new file mode 100644 index 0000000000000000000000000000000000000000..9d954caedcedd69e9b6034db160c8fbbb8a53737 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.diff @@ -0,0 +1,794 @@ +diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py +index 73990fa..68c8299 100644 +--- a/wenet/transformer/asr_model.py ++++ b/wenet/transformer/asr_model.py +@@ -245,7 +245,7 @@ class ASRModel(torch.nn.Module): + top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) + top_k_logp = mask_finished_scores(top_k_logp, end_flag) + top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) +- # 2.3 Second beam prune: select topk score with history ++ # 2.3 Seconde beam prune: select topk score with history + scores = scores + top_k_logp # (B*N, N), broadcast add + scores = scores.view(batch_size, beam_size * beam_size) # (B, 
N*N) + scores, offset_k_index = scores.topk(k=beam_size) # (B, N) +@@ -570,13 +570,12 @@ class ASRModel(torch.nn.Module): + def forward_encoder_chunk( + self, + xs: torch.Tensor, +- offset: int, +- required_cache_size: int, ++ offset: torch.Tensor, ++ required_cache_size: torch.Tensor, + subsampling_cache: Optional[torch.Tensor] = None, +- elayers_output_cache: Optional[List[torch.Tensor]] = None, +- conformer_cnn_cache: Optional[List[torch.Tensor]] = None, +- ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], +- List[torch.Tensor]]: ++ elayers_output_cache: Optional[torch.Tensor] = None, ++ conformer_cnn_cache: Optional[torch.Tensor] = None, ++ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ Export interface for c++ call, give input chunk xs, and return + output from time 0 to current chunk. + +@@ -675,6 +674,10 @@ class ASRModel(torch.nn.Module): + r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) + return decoder_out, r_decoder_out + ++ @torch.jit.export ++ def test(self,) -> str: ++ return "test" ++ + + def init_asr_model(configs): + if configs['cmvn_file'] is not None: +diff --git a/wenet/transformer/decoder.py b/wenet/transformer/decoder.py +index f41f7e4..40c1a57 100644 +--- a/wenet/transformer/decoder.py ++++ b/wenet/transformer/decoder.py +@@ -57,8 +57,7 @@ class TransformerDecoder(torch.nn.Module): + if input_layer == "embed": + self.embed = torch.nn.Sequential( + torch.nn.Embedding(vocab_size, attention_dim), +- PositionalEncoding(attention_dim, positional_dropout_rate), +- ) ++ PositionalEncoding(attention_dim, positional_dropout_rate)) + else: + raise ValueError(f"only 'embed' is supported: {input_layer}") + +@@ -81,6 +80,10 @@ class TransformerDecoder(torch.nn.Module): + concat_after, + ) for _ in range(self.num_blocks) + ]) ++ self.onnx_mode = False ++ ++ def set_onnx_mode(self, onnx_mode=False): ++ self.onnx_mode = onnx_mode + + def forward( + self, +@@ -111,13 +114,15 @@ class TransformerDecoder(torch.nn.Module): + tgt = ys_in_pad + + # tgt_mask: (B, 1, L) +- tgt_mask = (~make_pad_mask(ys_in_lens).unsqueeze(1)).to(tgt.device) ++ tgt_mask = (~make_pad_mask(ys_in_lens, ys_in_pad).unsqueeze(1)).to(tgt.device) + # m: (1, L, L) + m = subsequent_mask(tgt_mask.size(-1), + device=tgt_mask.device).unsqueeze(0) + # tgt_mask: (B, L, L) +- tgt_mask = tgt_mask & m +- x, _ = self.embed(tgt) ++ # tgt_mask = tgt_mask & m ++ tgt_mask = torch.mul(tgt_mask, m) ++ x = self.embed[0](tgt) ++ x, _ = self.embed[1](x, onnx_mode=self.onnx_mode) + for layer in self.decoders: + x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory, + memory_mask) +@@ -225,6 +230,13 @@ class BiTransformerDecoder(torch.nn.Module): + self_attention_dropout_rate, src_attention_dropout_rate, + input_layer, use_output_layer, normalize_before, concat_after) + ++ self.onnx_mode = False ++ ++ def set_onnx_mode(self, onnx_mode=False): ++ self.onnx_mode = onnx_mode ++ self.left_decoder.set_onnx_mode(onnx_mode) ++ self.right_decoder.set_onnx_mode(onnx_mode) ++ + def forward( + self, + memory: torch.Tensor, +@@ -252,6 +264,7 @@ class BiTransformerDecoder(torch.nn.Module): + if use_output_layer is True, + olens: (batch, ) + """ ++ reverse_weight = 0.3 + l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad, + ys_in_lens) + r_x = torch.tensor(0.0) +diff --git a/wenet/transformer/decoder_layer.py b/wenet/transformer/decoder_layer.py +index 25bb281..59dd174 100644 +--- a/wenet/transformer/decoder_layer.py ++++ b/wenet/transformer/decoder_layer.py +@@ -17,7 +17,7 
@@ class DecoderLayer(nn.Module): + size (int): Input dimension. + self_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. +- src_attn (torch.nn.Module): Inter-attention module instance. ++ src_attn (torch.nn.Module): Self-attention module instance. + `MultiHeadedAttention` instance can be used as the argument. + feed_forward (torch.nn.Module): Feed-forward module instance. + `PositionwiseFeedForward` instance can be used as the argument. +@@ -61,7 +61,8 @@ class DecoderLayer(nn.Module): + tgt_mask: torch.Tensor, + memory: torch.Tensor, + memory_mask: torch.Tensor, +- cache: Optional[torch.Tensor] = None ++ cache: Optional[torch.Tensor] = None, ++ onnx_mode: Optional[bool] = False + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute decoded features. + +diff --git a/wenet/transformer/embedding.py b/wenet/transformer/embedding.py +index a47afd9..0a6794c 100644 +--- a/wenet/transformer/embedding.py ++++ b/wenet/transformer/embedding.py +@@ -9,6 +9,7 @@ import math + from typing import Tuple + + import torch ++from wenet.transformer.slice_helper import slice_helper2 + + + class PositionalEncoding(torch.nn.Module): +@@ -45,7 +46,8 @@ class PositionalEncoding(torch.nn.Module): + + def forward(self, + x: torch.Tensor, +- offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: ++ offset: torch.Tensor = torch.tensor(0), ++ onnx_mode: bool = False) -> Tuple[torch.Tensor, torch.Tensor]: + """Add positional encoding. + + Args: +@@ -56,13 +58,21 @@ class PositionalEncoding(torch.nn.Module): + torch.Tensor: Encoded tensor. Its shape is (batch, time, ...) + torch.Tensor: for compatibility to RelPositionalEncoding + """ +- assert offset + x.size(1) < self.max_len ++ # assert offset + x.size(1) < self.max_len + self.pe = self.pe.to(x.device) +- pos_emb = self.pe[:, offset:offset + x.size(1)] ++ # pos_emb = self.pe[:, offset:offset + x.size(1)] ++ if onnx_mode: ++ pos_emb = slice_helper2(self.pe, offset, offset + x.size(1)) ++ else: ++ pos_emb = self.pe[:, offset:offset + x.size(1)] + x = x * self.xscale + pos_emb + return self.dropout(x), self.dropout(pos_emb) + +- def position_encoding(self, offset: int, size: int) -> torch.Tensor: ++ def position_encoding(self, ++ offset: torch.Tensor, ++ size: torch.Tensor, ++ onnx_mode: bool = False, ++ ) -> torch.Tensor: + """ For getting encoding in a streaming fashion + + Attention!!!!! +@@ -79,7 +89,12 @@ class PositionalEncoding(torch.nn.Module): + torch.Tensor: Corresponding encoding + """ + assert offset + size < self.max_len +- return self.dropout(self.pe[:, offset:offset + size]) ++ if onnx_mode: ++ # pe = torch.cat([self.pe[:, [0]], slice_helper2(self.pe, offset, offset + size - 1)], dim=1) ++ pe = slice_helper2(self.pe, offset, offset + size) ++ else: ++ pe = self.pe[:, offset:offset + size] ++ return self.dropout(pe) + + + class RelPositionalEncoding(PositionalEncoding): +@@ -96,7 +111,8 @@ class RelPositionalEncoding(PositionalEncoding): + + def forward(self, + x: torch.Tensor, +- offset: int = 0) -> Tuple[torch.Tensor, torch.Tensor]: ++ offset: torch.Tensor, ++ onnx_mode: bool = False) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute positional encoding. + Args: + x (torch.Tensor): Input tensor (batch, time, `*`). +@@ -104,10 +120,16 @@ class RelPositionalEncoding(PositionalEncoding): + torch.Tensor: Encoded tensor (batch, time, `*`). + torch.Tensor: Positional embedding tensor (1, time, `*`). 
+ """ +- assert offset + x.size(1) < self.max_len ++ # assert offset + x.size(1) < self.max_len + self.pe = self.pe.to(x.device) + x = x * self.xscale +- pos_emb = self.pe[:, offset:offset + x.size(1)] ++ if onnx_mode: ++ # end = offset.item() + x.size(1) ++ # pos_emb = torch.index_select(self.pe, 1, torch.tensor(range(x.size(1)))) ++ pos_emb = slice_helper2(self.pe, offset, offset + x.size(1)) ++ # pos_emb = slice_helper3(pos_emb, x.size(1)) ++ else: ++ pos_emb = self.pe[:, offset:offset + x.size(1)] + return self.dropout(x), self.dropout(pos_emb) + + +diff --git a/wenet/transformer/encoder.py b/wenet/transformer/encoder.py +index e342ed4..9b4f968 100644 +--- a/wenet/transformer/encoder.py ++++ b/wenet/transformer/encoder.py +@@ -6,6 +6,8 @@ + """Encoder definition.""" + from typing import Tuple, List, Optional + ++import numpy as np ++import onnxruntime + import torch + from typeguard import check_argument_types + +@@ -18,6 +20,7 @@ from wenet.transformer.embedding import NoPositionalEncoding + from wenet.transformer.encoder_layer import TransformerEncoderLayer + from wenet.transformer.encoder_layer import ConformerEncoderLayer + from wenet.transformer.positionwise_feed_forward import PositionwiseFeedForward ++from wenet.transformer.slice_helper import slice_helper3, get_next_cache_start + from wenet.transformer.subsampling import Conv2dSubsampling4 + from wenet.transformer.subsampling import Conv2dSubsampling6 + from wenet.transformer.subsampling import Conv2dSubsampling8 +@@ -26,6 +29,8 @@ from wenet.utils.common import get_activation + from wenet.utils.mask import make_pad_mask + from wenet.utils.mask import add_optional_chunk_mask + ++def to_numpy(x): ++ return x.detach().numpy() + + class BaseEncoder(torch.nn.Module): + def __init__( +@@ -116,10 +121,14 @@ class BaseEncoder(torch.nn.Module): + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk ++ self.onnx_mode = False + + def output_size(self) -> int: + return self._output_size + ++ def set_onnx_mode(self, onnx_mode=False): ++ self.onnx_mode = onnx_mode ++ + def forward( + self, + xs: torch.Tensor, +@@ -130,7 +139,7 @@ class BaseEncoder(torch.nn.Module): + """Embed positions in tensor. + + Args: +- xs: padded input tensor (B, T, D) ++ xs: padded input tensor (B, L, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. 
+@@ -141,16 +150,18 @@ class BaseEncoder(torch.nn.Module): + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: +- encoder output tensor xs, and subsampled masks +- xs: padded output tensor (B, T' ~= T/subsample_rate, D) +- masks: torch.Tensor batch padding mask after subsample +- (B, 1, T' ~= T/subsample_rate) ++ encoder output tensor, lens and mask + """ +- masks = ~make_pad_mask(xs_lens).unsqueeze(1) # (B, 1, T) ++ decoding_chunk_size = 1 ++ num_decoding_left_chunks = 1 ++ self.use_dynamic_chunk = False ++ self.use_dynamic_left_chunk = False ++ self.static_chunk_size = 0 ++ masks = ~make_pad_mask(xs_lens, xs).unsqueeze(1) # (B, 1, L) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + xs, pos_emb, masks = self.embed(xs, masks) +- mask_pad = masks # (B, 1, T/subsample_rate) ++ mask_pad = masks + chunk_masks = add_optional_chunk_mask(xs, masks, + self.use_dynamic_chunk, + self.use_dynamic_left_chunk, +@@ -169,13 +180,12 @@ class BaseEncoder(torch.nn.Module): + def forward_chunk( + self, + xs: torch.Tensor, +- offset: int, +- required_cache_size: int, ++ offset_tensor: torch.Tensor = torch.tensor(0), ++ required_cache_size_tensor: torch.Tensor = torch.tensor(0), + subsampling_cache: Optional[torch.Tensor] = None, +- elayers_output_cache: Optional[List[torch.Tensor]] = None, +- conformer_cnn_cache: Optional[List[torch.Tensor]] = None, +- ) -> Tuple[torch.Tensor, torch.Tensor, List[torch.Tensor], +- List[torch.Tensor]]: ++ elayers_output_cache: Optional[torch.Tensor] = None, ++ conformer_cnn_cache: Optional[torch.Tensor] = None, ++ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ Forward just one chunk + + Args: +@@ -199,6 +209,7 @@ class BaseEncoder(torch.nn.Module): + List[torch.Tensor]: conformer cnn cache + + """ ++ required_cache_size_tensor = torch.tensor(-1) + assert xs.size(0) == 1 + # tmp_masks is just for interface compatibility + tmp_masks = torch.ones(1, +@@ -208,30 +219,53 @@ class BaseEncoder(torch.nn.Module): + tmp_masks = tmp_masks.unsqueeze(1) + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) +- xs, pos_emb, _ = self.embed(xs, tmp_masks, offset) ++ # if self.onnx_mode: ++ # offset_tensor = offset_tensor - torch.tensor(1) ++ xs, pos_emb, _ = self.embed(xs, tmp_masks, offset_tensor, self.onnx_mode) + if subsampling_cache is not None: + cache_size = subsampling_cache.size(1) + xs = torch.cat((subsampling_cache, xs), dim=1) + else: + cache_size = 0 +- pos_emb = self.embed.position_encoding(offset - cache_size, xs.size(1)) +- if required_cache_size < 0: +- next_cache_start = 0 +- elif required_cache_size == 0: +- next_cache_start = xs.size(1) ++ # if self.onnx_mode: ++ # cache_size = cache_size - 1 ++ # if self.onnx_mode: ++ # # subsampling_cache append dummy var, remove it here ++ # xs = xs[:, 1:, :] ++ # cache_size = cache_size - 1 ++ if isinstance(xs.size(1), int): ++ xs_size_1 = torch.tensor(xs.size(1)) + else: +- next_cache_start = max(xs.size(1) - required_cache_size, 0) +- r_subsampling_cache = xs[:, next_cache_start:, :] ++ xs_size_1 = xs.size(1).clone().detach() ++ pos_emb = self.embed.position_encoding(offset_tensor - cache_size, ++ xs_size_1, ++ self.onnx_mode) ++ next_cache_start = get_next_cache_start(required_cache_size_tensor, xs) ++ r_subsampling_cache = slice_helper3(xs, next_cache_start) ++ # if self.onnx_mode: ++ # next_cache_start_1 = get_next_cache_start(required_cache_size_tensor, xs) ++ # r_subsampling_cache = slice_helper3(xs, next_cache_start_1) ++ # else: ++ # required_cache_size = 
required_cache_size_tensor.detach().item() ++ # if required_cache_size < 0: ++ # next_cache_start = 0 ++ # elif required_cache_size == 0: ++ # next_cache_start = xs.size(1) ++ # else: ++ # next_cache_start = max(xs.size(1) - required_cache_size, 0) ++ # r_subsampling_cache = xs[:, next_cache_start:, :] + # Real mask for transformer/conformer layers + masks = torch.ones(1, xs.size(1), device=xs.device, dtype=torch.bool) + masks = masks.unsqueeze(1) +- r_elayers_output_cache = [] +- r_conformer_cnn_cache = [] ++ r_elayers_output_cache = None ++ r_conformer_cnn_cache = None + for i, layer in enumerate(self.encoders): + if elayers_output_cache is None: + attn_cache = None + else: + attn_cache = elayers_output_cache[i] ++ # if self.onnx_mode and attn_cache is not None: ++ # attn_cache = attn_cache[:, 1:, :] + if conformer_cnn_cache is None: + cnn_cache = None + else: +@@ -240,13 +274,32 @@ class BaseEncoder(torch.nn.Module): + masks, + pos_emb, + output_cache=attn_cache, +- cnn_cache=cnn_cache) +- r_elayers_output_cache.append(xs[:, next_cache_start:, :]) +- r_conformer_cnn_cache.append(new_cnn_cache) ++ cnn_cache=cnn_cache, ++ onnx_mode=self.onnx_mode) ++ if self.onnx_mode: ++ layer_output_cache = slice_helper3(xs, next_cache_start) ++ else: ++ layer_output_cache = xs[:, next_cache_start:, :] ++ if i == 0: ++ r_elayers_output_cache = layer_output_cache.unsqueeze(0) ++ r_conformer_cnn_cache = new_cnn_cache.unsqueeze(0) ++ else: ++ # r_elayers_output_cache.append(xs[:, next_cache_start:, :]) ++ r_elayers_output_cache = torch.cat((r_elayers_output_cache, layer_output_cache.unsqueeze(0)), 0) ++ # r_conformer_cnn_cache.append(new_cnn_cache) ++ r_conformer_cnn_cache = torch.cat((r_conformer_cnn_cache, new_cnn_cache.unsqueeze(0)), 0) + if self.normalize_before: + xs = self.after_norm(xs) +- +- return (xs[:, cache_size:, :], r_subsampling_cache, ++ if self.onnx_mode: ++ cache_size = cache_size - 1 ++ if isinstance(cache_size, int): ++ cache_size_1 = torch.tensor(cache_size) ++ else: ++ cache_size_1 = cache_size.clone().detach() ++ output = slice_helper3(xs, cache_size_1) ++ else: ++ output = xs[:, cache_size:, :] ++ return (output, r_subsampling_cache, + r_elayers_output_cache, r_conformer_cnn_cache) + + def forward_chunk_by_chunk( +@@ -290,24 +343,54 @@ class BaseEncoder(torch.nn.Module): + decoding_window = (decoding_chunk_size - 1) * subsampling + context + num_frames = xs.size(1) + subsampling_cache: Optional[torch.Tensor] = None +- elayers_output_cache: Optional[List[torch.Tensor]] = None +- conformer_cnn_cache: Optional[List[torch.Tensor]] = None ++ elayers_output_cache: Optional[torch.Tensor] = None ++ conformer_cnn_cache: Optional[torch.Tensor] = None + outputs = [] + offset = 0 + required_cache_size = decoding_chunk_size * num_decoding_left_chunks ++ print("required_cache_size:", required_cache_size) ++ encoder_session = onnxruntime.InferenceSession("onnx/encoder.onnx") ++ ++ subsampling_cache_onnx = torch.zeros(1, 1, 256, requires_grad=False) ++ elayers_output_cache_onnx = torch.zeros(12, 1, 1, 256, requires_grad=False) ++ conformer_cnn_cache_onnx = torch.zeros(12, 1, 256, 7, requires_grad=False) + + # Feed forward overlap input step by step + for cur in range(0, num_frames - context + 1, stride): + end = min(cur + decoding_window, num_frames) + chunk_xs = xs[:, cur:end, :] ++ ++ if offset > 0: ++ offset = offset - 1 + (y, subsampling_cache, elayers_output_cache, +- conformer_cnn_cache) = self.forward_chunk(chunk_xs, offset, +- required_cache_size, ++ conformer_cnn_cache) = 
self.forward_chunk(chunk_xs, torch.tensor(offset), ++ torch.tensor(required_cache_size), + subsampling_cache, + elayers_output_cache, + conformer_cnn_cache) +- outputs.append(y) ++ ++ offset = offset + 1 ++ encoder_inputs = { ++ encoder_session.get_inputs()[0].name: chunk_xs.numpy(), ++ encoder_session.get_inputs()[1].name: np.array(offset), ++ encoder_session.get_inputs()[2].name: subsampling_cache_onnx.numpy(), ++ encoder_session.get_inputs()[3].name: elayers_output_cache_onnx.numpy(), ++ encoder_session.get_inputs()[4].name: conformer_cnn_cache_onnx.numpy(), ++ } ++ ort_outs = encoder_session.run(None, encoder_inputs) ++ y_onnx, subsampling_cache_onnx, elayers_output_cache_onnx, conformer_cnn_cache_onnx = \ ++ torch.from_numpy(ort_outs[0][:, 1:, :]), torch.from_numpy(ort_outs[1]), \ ++ torch.from_numpy(ort_outs[2]), torch.from_numpy(ort_outs[3]) ++ ++ np.testing.assert_allclose(to_numpy(y), ort_outs[0][:, 1:, :], rtol=1e-03, atol=1e-03) ++ np.testing.assert_allclose(to_numpy(subsampling_cache), ort_outs[1][:, 1:, :], rtol=1e-03, atol=1e-03) ++ np.testing.assert_allclose(to_numpy(elayers_output_cache), ort_outs[2][:, :, 1:, :], rtol=1e-03, atol=1e-03) ++ np.testing.assert_allclose(to_numpy(conformer_cnn_cache), ort_outs[3], rtol=1e-03, atol=1e-03) ++ ++ outputs.append(y_onnx) ++ # outputs.append(y) + offset += y.size(1) ++ # break + ys = torch.cat(outputs, 1) + masks = torch.ones(1, ys.size(1), device=ys.device, dtype=torch.bool) + masks = masks.unsqueeze(1) +diff --git a/wenet/transformer/encoder_layer.py b/wenet/transformer/encoder_layer.py +index db8696d..0be079c 100644 +--- a/wenet/transformer/encoder_layer.py ++++ b/wenet/transformer/encoder_layer.py +@@ -9,6 +9,7 @@ from typing import Optional, Tuple + + import torch + from torch import nn ++from wenet.transformer.slice_helper import slice_helper + + + class TransformerEncoderLayer(nn.Module): +@@ -53,6 +54,9 @@ class TransformerEncoderLayer(nn.Module): + # concat_linear may be not used in forward fuction, + # but will be saved in the *.pt + self.concat_linear = nn.Linear(size + size, size) ++ ++ def set_onnx_mode(self, onnx_mode=False): ++ self.onnx_mode = onnx_mode + + def forward( + self, +@@ -92,9 +96,14 @@ class TransformerEncoderLayer(nn.Module): + assert output_cache.size(2) == self.size + assert output_cache.size(1) < x.size(1) + chunk = x.size(1) - output_cache.size(1) +- x_q = x[:, -chunk:, :] +- residual = residual[:, -chunk:, :] +- mask = mask[:, -chunk:, :] ++ if self.onnx_mode: ++ x_q = slice_helper(x, chunk) ++ residual = slice_helper(residual, chunk) ++ mask = slice_helper(mask, chunk) ++ else: ++ x_q = x[:, -chunk:, :] ++ residual = residual[:, -chunk:, :] ++ mask = mask[:, -chunk:, :] + + if self.concat_after: + x_concat = torch.cat((x, self.self_attn(x_q, x, x, mask)), dim=-1) +@@ -184,6 +193,7 @@ class ConformerEncoderLayer(nn.Module): + mask_pad: Optional[torch.Tensor] = None, + output_cache: Optional[torch.Tensor] = None, + cnn_cache: Optional[torch.Tensor] = None, ++ onnx_mode: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Compute encoded features. + +@@ -193,7 +203,6 @@ class ConformerEncoderLayer(nn.Module): + pos_emb (torch.Tensor): positional encoding, must not be None + for ConformerEncoderLayer. + mask_pad (torch.Tensor): batch padding mask used for conv module. +- (#batch, 1,time) + output_cache (torch.Tensor): Cache tensor of the output + (#batch, time2, size), time2 < time in x. 
+ cnn_cache (torch.Tensor): Convolution cache in conformer layer +@@ -202,6 +211,14 @@ class ConformerEncoderLayer(nn.Module): + torch.Tensor: Mask tensor (#batch, time). + """ + ++ if onnx_mode: ++ x = x[:, 1:, :] ++ mask = mask[:, :, 1:] ++ # pos_emb_ = pos_emb[:, 1:, :] ++ pos_emb_ = pos_emb[:, :-1, :] ++ else: ++ pos_emb_ = pos_emb ++ + # whether to use macaron style + if self.feed_forward_macaron is not None: + residual = x +@@ -223,12 +240,26 @@ class ConformerEncoderLayer(nn.Module): + assert output_cache.size(0) == x.size(0) + assert output_cache.size(2) == self.size + assert output_cache.size(1) < x.size(1) +- chunk = x.size(1) - output_cache.size(1) +- x_q = x[:, -chunk:, :] +- residual = residual[:, -chunk:, :] +- mask = mask[:, -chunk:, :] + +- x_att = self.self_attn(x_q, x, x, mask, pos_emb) ++ # chunk = x.size(1) - output_cache.size(1) ++ if onnx_mode: ++ chunk = x.size(1) - output_cache.size(1) + 1 ++ if isinstance(chunk, int): ++ chunk_1 = torch.tensor(chunk) ++ else: ++ chunk_1 = chunk.clone().detach() ++ # chunk = torch.tensor(chunk) ++ # print(type(chunk)) ++ x_q = slice_helper(x, chunk_1) ++ residual = slice_helper(residual, chunk_1) ++ mask = slice_helper(mask, chunk_1) ++ else: ++ chunk = x.size(1) - output_cache.size(1) ++ x_q = x[:, -chunk:, :] ++ residual = residual[:, -chunk:, :] ++ mask = mask[:, -chunk:, :] ++ ++ x_att = self.self_attn(x_q, x, x, mask, pos_emb_) + if self.concat_after: + x_concat = torch.cat((x, x_att), dim=-1) + x = residual + self.concat_linear(x_concat) +diff --git a/wenet/transformer/subsampling.py b/wenet/transformer/subsampling.py +index b890f70..a978424 100644 +--- a/wenet/transformer/subsampling.py ++++ b/wenet/transformer/subsampling.py +@@ -16,8 +16,11 @@ class BaseSubsampling(torch.nn.Module): + self.right_context = 0 + self.subsampling_rate = 1 + +- def position_encoding(self, offset: int, size: int) -> torch.Tensor: +- return self.pos_enc.position_encoding(offset, size) ++ def position_encoding(self, ++ offset: torch.Tensor, ++ size: torch.Tensor, ++ onnx_mode: bool = False) -> torch.Tensor: ++ return self.pos_enc.position_encoding(offset, size, onnx_mode) + + + class LinearNoSubsampling(BaseSubsampling): +@@ -89,16 +92,17 @@ class Conv2dSubsampling4(BaseSubsampling): + torch.nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) + self.pos_enc = pos_enc_class + # The right context for every conv layer is computed by: +- # (kernel_size - 1) * frame_rate_of_this_layer ++ # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer + self.subsampling_rate = 4 +- # 6 = (3 - 1) * 1 + (3 - 1) * 2 ++ # 6 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2 + self.right_context = 6 + + def forward( + self, + x: torch.Tensor, + x_mask: torch.Tensor, +- offset: int = 0 ++ offset: torch.Tensor = torch.tensor(0), ++ onnx_mode: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Subsample x. 
+ +@@ -118,7 +122,7 @@ class Conv2dSubsampling4(BaseSubsampling): + x = self.conv(x) + b, c, t, f = x.size() + x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f)) +- x, pos_emb = self.pos_enc(x, offset) ++ x, pos_emb = self.pos_enc(x, offset, onnx_mode) + return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] + + +@@ -143,9 +147,9 @@ class Conv2dSubsampling6(BaseSubsampling): + self.linear = torch.nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), + odim) + self.pos_enc = pos_enc_class +- # 10 = (3 - 1) * 1 + (5 - 1) * 2 ++ # 14 = (3 - 1) / 2 * 2 * 1 + (5 - 1) / 2 * 3 * 2 + self.subsampling_rate = 6 +- self.right_context = 10 ++ self.right_context = 14 + + def forward( + self, +@@ -198,7 +202,7 @@ class Conv2dSubsampling8(BaseSubsampling): + odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim) + self.pos_enc = pos_enc_class + self.subsampling_rate = 8 +- # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4 ++ # 14 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2 + (3 - 1) / 2 * 2 * 4 + self.right_context = 14 + + def forward( +diff --git a/wenet/utils/mask.py b/wenet/utils/mask.py +index c2bb50f..d23bd95 100644 +--- a/wenet/utils/mask.py ++++ b/wenet/utils/mask.py +@@ -5,6 +5,15 @@ + + import torch + ++def tril_onnx(x, diagonal: torch.Tensor = torch.tensor(0)): ++ m,n = x.shape[0], x.shape[1] ++ arange = torch.arange(n, device = x.device) ++ mask = arange.expand(m, n) ++ mask_maker = torch.arange(m, device = x.device).unsqueeze(-1) ++ if diagonal: ++ mask_maker = mask_maker + diagonal ++ mask = mask <= mask_maker ++ return mask * x + + def subsequent_mask( + size: int, +@@ -35,13 +44,17 @@ def subsequent_mask( + [1, 1, 0], + [1, 1, 1]] + """ +- ret = torch.ones(size, size, device=device, dtype=torch.bool) +- return torch.tril(ret, out=ret) ++ # ret = torch.ones(size, size, device=device, dtype=torch.bool) ++ # return torch.tril(ret, out=ret) ++ # to export onnx, we change the code as follows ++ ret = torch.ones(size, size, device=device) ++ #return torch.tril(ret, out=ret) ++ return tril_onnx(ret) + + + def subsequent_chunk_mask( +- size: int, +- chunk_size: int, ++ size: torch.tensor(0), ++ chunk_size: torch.tensor(0), + num_left_chunks: int = -1, + device: torch.device = torch.device("cpu"), + ) -> torch.Tensor: +@@ -67,6 +80,18 @@ def subsequent_chunk_mask( + [1, 1, 1, 1]] + """ + ret = torch.zeros(size, size, device=device, dtype=torch.bool) ++ row_index = torch.arange(size, device = device) ++ index = row_index.expand(size, size) ++ expand_size = torch.ones((size), device = device)*size ++ #expand_size = expand_size.long() ++ if num_left_chunks < 0: ++ start1 = torch.tensor(0) ++ else: ++ start1 = torch.max((torch.floor_divide(row_index, chunk_size)-num_left_chunks).float()*chunk_size, torch.tensor(0.0)).long().view(size,1) ++ ending = torch.min((torch.floor_divide(row_index, chunk_size)+1).float()*chunk_size, expand_size.float()).long().view(size,1) ++ ret[torch.where(index < ending)] = True ++ ret[torch.where(index < start1)] = False ++ ''' + for i in range(size): + if num_left_chunks < 0: + start = 0 +@@ -74,6 +99,8 @@ def subsequent_chunk_mask( + start = max((i // chunk_size - num_left_chunks) * chunk_size, 0) + ending = min((i // chunk_size + 1) * chunk_size, size) + ret[i, start:ending] = True ++ print("ret:", ret) ++ ''' + return ret + + +@@ -107,18 +134,18 @@ def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, + """ + # Whether to use chunk mask or not + if use_dynamic_chunk: +- max_len = xs.size(1) ++ max_len = xs.shape[1] + if decoding_chunk_size < 0: + chunk_size = 
max_len + num_left_chunks = -1 + elif decoding_chunk_size > 0: +- chunk_size = decoding_chunk_size ++ chunk_size = torch.tensor(decoding_chunk_size) + num_left_chunks = num_decoding_left_chunks + else: + # chunk size is either [1, 25] or full context(max_len). + # Since we use 4 times subsampling and allow up to 1s(100 frames) + # delay, the maximum frame is 100 / 4 = 25. +- chunk_size = torch.randint(1, max_len, (1, )).item() ++ chunk_size = torch.randint(1, max_len, (1, )) + num_left_chunks = -1 + if chunk_size > max_len // 2: + chunk_size = max_len +@@ -128,14 +155,14 @@ def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, + max_left_chunks = (max_len - 1) // chunk_size + num_left_chunks = torch.randint(0, max_left_chunks, + (1, )).item() +- chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size, ++ chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size, + num_left_chunks, + xs.device) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) + chunk_masks = masks & chunk_masks # (B, L, L) + elif static_chunk_size > 0: + num_left_chunks = num_decoding_left_chunks +- chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size, ++ chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size, + num_left_chunks, + xs.device) # (L, L) + chunk_masks = chunk_masks.unsqueeze(0) # (1, L, L) +@@ -145,7 +172,7 @@ def add_optional_chunk_mask(xs: torch.Tensor, masks: torch.Tensor, + return chunk_masks + + +-def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: ++def make_pad_mask(lengths: torch.Tensor, xs: torch.Tensor) -> torch.Tensor: + """Make mask tensor containing indices of padded part. + + See description of make_non_pad_mask. +@@ -162,8 +189,11 @@ def make_pad_mask(lengths: torch.Tensor) -> torch.Tensor: + [0, 0, 0, 1, 1], + [0, 0, 1, 1, 1]] + """ +- batch_size = int(lengths.size(0)) +- max_len = int(lengths.max().item()) ++ # batch_size = int(lengths.size(0)) ++ # max_len = int(lengths.max().item()) ++ # to export the decoder onnx and avoid the constant fold ++ batch_size = xs.shape[0] ++ max_len = xs.shape[1] + seq_range = torch.arange(0, + max_len, + dtype=torch.int64, diff --git a/ACL_PyTorch/contrib/audio/WeNet/export_onnx.py b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..63617b9c52c735e89e475262e9987f7f95042d37 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.py @@ -0,0 +1,193 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ + + +from __future__ import print_function + +import argparse +import os + +import torch +import onnx, onnxruntime +import yaml +import numpy as np + +from wenet.transformer.asr_model import init_asr_model +from wenet.transformer.decoder import TransformerDecoder, BiTransformerDecoder +from wenet.utils.checkpoint import load_checkpoint + + +def to_numpy(x): + return x.detach().numpy() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='export your script model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + # parser.add_argument('--output_file', required=True, help='output file') + parser.add_argument('--output_onnx_file', required=True, help='output onnx file') + args = parser.parse_args() + # No need gpu for model export + os.environ['CUDA_VISIBLE_DEVICES'] = '-1' + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + model = init_asr_model(configs) + print(model) + + load_checkpoint(model, args.checkpoint) + # Export jit torch script model + + model.eval() + + #export the none flash model + encoder = model.encoder + xs = torch.randn(1, 131, 80, requires_grad=False) + xs_lens = torch.tensor([131], dtype=torch.int32) + onnx_encoder_path = os.path.join(args.output_onnx_file, 'no_flash_encoder.onnx') + torch.onnx.export(encoder, + (xs, xs_lens), + onnx_encoder_path, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=['xs_input', 'xs_input_lens'], + output_names=['xs_output', 'masks_output'], + dynamic_axes={'xs_input': [1], 'xs_input_lens': [0], + 'xs_output': [1], 'masks_output': [2]}, + verbose=True + ) + onnx_model = onnx.load(onnx_encoder_path) + onnx.checker.check_model(onnx_model) + print("encoder onnx_model check pass!") + + ort_session = onnxruntime.InferenceSession(onnx_encoder_path) + ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(xs), + ort_session.get_inputs()[1].name: to_numpy(xs_lens), + } + ort_outs = ort_session.run(None, ort_inputs) + y1, y2 = encoder(xs, xs_lens) + # np.testing.assert_allclose(to_numpy(y1), ort_outs[0], rtol=1e-05, atol=1e-05) + # np.testing.assert_allclose(to_numpy(y2), ort_outs[1], rtol=1e-05, atol=1e-05) + print("Exported no flash encoder model has been tested with ONNXRuntime, and the result looks good!") + + #export the flash encoder + encoder = model.encoder + encoder.forward = encoder.forward_chunk + + batch_size = 1 + audio_len = 131 + x = torch.randn(batch_size, audio_len, 80, requires_grad=False) + offset = torch.tensor(1) + decoding_chunk_size = 16 + num_decoding_left_chunks = -1 + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + required_cache_size = torch.tensor(required_cache_size) + subsampling_cache = torch.randn(batch_size, 1, 256, requires_grad=False) + elayers_cache = torch.randn(12, batch_size, 1, 256, 
requires_grad=False) + conformer_cnn_cache = torch.randn(12, batch_size, 256, 7, requires_grad=False) + + + encoder.set_onnx_mode(False) + y, subsampling_cache_output, elayers_cache_output, conformer_cnn_cache_output = encoder(x, torch.tensor(0), \ + required_cache_size, None, None, conformer_cnn_cache) + + encoder.set_onnx_mode(True) + onnx_encoder_path = os.path.join(args.output_onnx_file, 'encoder.onnx') + torch.onnx.export(encoder, + (x, offset, required_cache_size, subsampling_cache, elayers_cache, conformer_cnn_cache), + onnx_encoder_path, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=['input', 'offset', 'required_cache_size', 'subsampling_cache', 'elayers_cache', \ + 'conformer_cnn_cache'], + output_names=['output', 'subsampling_cache_output', 'elayers_cache_output', \ + 'conformer_cnn_cache_output'], + dynamic_axes={'input': [1], 'subsampling_cache':[1], 'elayers_cache':[2], + 'output': [1]}, + verbose=True + ) + + onnx_model = onnx.load(onnx_encoder_path) + onnx.checker.check_model(onnx_model) + print("encoder onnx_model check pass!") + + ort_session = onnxruntime.InferenceSession(onnx_encoder_path) + ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(x), + ort_session.get_inputs()[1].name: to_numpy(offset), + ort_session.get_inputs()[2].name: to_numpy(subsampling_cache), + ort_session.get_inputs()[3].name: to_numpy(elayers_cache), + ort_session.get_inputs()[4].name: to_numpy(conformer_cnn_cache), + } + ort_outs = ort_session.run(None, ort_inputs) + print("Exported encoder model has been tested with ONNXRuntime, and the result looks good!") + + #export decoder onnx + + decoder = model.decoder + decoder.set_onnx_mode(True) + onnx_decoder_path = os.path.join(args.output_onnx_file, 'decoder.onnx') + memory = torch.randn(10, 131, 256) + memory_mask = torch.ones(10, 1, 131).bool() + ys_in_pad = torch.randint(0, 4232, (10, 50)).long() + ys_in_lens = torch.tensor([13, 13, 13, 13, 13, 13, 13, 13, 50, 13], dtype=torch.int32) + r_ys_in_pad = torch.randint(0, 4232, (10, 50)).long() + + if isinstance(decoder, TransformerDecoder): + torch.onnx.export(decoder, + (memory, memory_mask, ys_in_pad, ys_in_lens), + onnx_decoder_path, + export_params=True, + opset_version=12, + do_constant_folding=True, + input_names=['memory', 'memory_mask', 'ys_in_pad', 'ys_in_lens'], + output_names=['l_x', 'r_x'], + dynamic_axes={'memory': [1], 'memory_mask':[2], 'ys_in_pad':[1], + 'ys_in_lens': [0]}, + verbose=True + ) + elif isinstance(decoder, BiTransformerDecoder): + print("BI mode") + torch.onnx.export(decoder, + (memory, memory_mask, ys_in_pad, ys_in_lens, r_ys_in_pad), + onnx_decoder_path, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=['memory', 'memory_mask', 'ys_in_pad', 'ys_in_lens', 'r_ys_in_pad'], + output_names=['l_x', 'r_x', 'olens'], + dynamic_axes={'memory': [1], 'memory_mask':[2], 'ys_in_pad':[1], + 'ys_in_lens': [0], 'r_ys_in_pad':[1]}, + verbose=True + ) + diff --git a/ACL_PyTorch/contrib/audio/WeNet/export_onnx.sh b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.sh new file mode 100644 index 0000000000000000000000000000000000000000..bebe2a07ef04b7fb044fdb57dcac8bc1337e3894 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/export_onnx.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. 
./path.sh || exit 1; + +yaml_path=$1 +decode_checkpoint=$2 + +mkdir onnx +python3 wenet/bin/export_onnx.py \ + --config $yaml_path \ + --checkpoint $decode_checkpoint \ + --output_onnx_file onnx diff --git a/ACL_PyTorch/contrib/audio/WeNet/get_no_flash_encoder_out.diff b/ACL_PyTorch/contrib/audio/WeNet/get_no_flash_encoder_out.diff new file mode 100644 index 0000000000000000000000000000000000000000..b209edb704a1cc238363120b779cbf3dc2556058 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/get_no_flash_encoder_out.diff @@ -0,0 +1,38 @@ +diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py +index 73990fa..e2f3555 100644 +--- a/wenet/transformer/asr_model.py ++++ b/wenet/transformer/asr_model.py +@@ -175,6 +175,33 @@ class ASRModel(torch.nn.Module): + num_decoding_left_chunks=num_decoding_left_chunks + ) # (B, maxlen, encoder_dim) + return encoder_out, encoder_mask ++ ++ def get_no_flash_encoder_out( ++ self, ++ encoder_model_noflash, ++ batch_idx: int, ++ speech: torch.Tensor, ++ speech_lengths: torch.Tensor, ++ beam_size: int, ++ decoding_chunk_size: int = -1, ++ num_decoding_left_chunks: int = -1, ++ ctc_weight: float = 0.0, ++ simulate_streaming: bool = False, ++ reverse_weight: float = 0.0, ++ ) -> List[int]: ++ assert speech.shape[0] == speech_lengths.shape[0] ++ assert decoding_chunk_size != 0 ++ if reverse_weight > 0.0: ++ # decoder should be a bitransformer decoder if reverse_weight > 0.0 ++ assert hasattr(self.decoder, 'right_decoder') ++ device = speech.device ++ batch_size = speech.shape[0] ++ # For attention rescoring we only support batch_size=1 ++ assert batch_size == 1 ++ y, exe_time = encoder_model_noflash( ++ [speech.numpy(), speech_lengths.numpy().astype("int32")]) # (beam_size, max_hyps_len, vocab_size) ++ encoder_out, encoder_mask = torch.from_numpy(y[0]), torch.from_numpy(y[1]) ++ return encoder_out, encoder_mask, exe_time + + def recognize( + self, diff --git a/ACL_PyTorch/contrib/audio/WeNet/getwer.diff b/ACL_PyTorch/contrib/audio/WeNet/getwer.diff new file mode 100644 index 0000000000000000000000000000000000000000..633513671cfe36099d7139f6f2245c7511d39b69 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/getwer.diff @@ -0,0 +1,174 @@ +diff --git a/wenet/transformer/asr_model.py b/wenet/transformer/asr_model.py +index 73990fa..82337ca 100644 +--- a/wenet/transformer/asr_model.py ++++ b/wenet/transformer/asr_model.py +@@ -33,7 +33,8 @@ from wenet.utils.common import (IGNORE_ID, add_sos_eos, log_add, + from wenet.utils.mask import (make_pad_mask, mask_finished_preds, + mask_finished_scores, subsequent_mask) + +- ++import numpy as np ++import os + class ASRModel(torch.nn.Module): + """CTC-attention hybrid Encoder-Decoder model""" + def __init__( +@@ -443,6 +444,159 @@ class ASRModel(torch.nn.Module): + simulate_streaming) + return hyps[0][0] + ++ def get_wer( ++ self, ++ batch_idx, ++ bin_path, ++ json_data, ++ decoder_model, ++ speech: torch.Tensor, ++ speech_lengths: torch.Tensor, ++ beam_size: int, ++ decoding_chunk_size: int = -1, ++ num_decoding_left_chunks: int = -1, ++ ctc_weight: float = 0.0, ++ simulate_streaming: bool = False, ++ reverse_weight: float = 0.0, ++ ) -> List[int]: ++ """ Apply attention rescoring decoding, CTC prefix beam search ++ is applied first to get nbest, then we resoring the nbest on ++ attention decoder with corresponding encoder out ++ ++ Args: ++ speech (torch.Tensor): (batch, max_len, feat_dim) ++ speech_length (torch.Tensor): (batch, ) ++ beam_size (int): beam size for beam search ++ decoding_chunk_size 
(int): decoding chunk for dynamic chunk ++ trained model. ++ <0: for decoding, use full chunk. ++ >0: for decoding, use fixed chunk size as set. ++ 0: used for training, it's prohibited here ++ simulate_streaming (bool): whether do encoder forward in a ++ streaming fashion ++ reverse_weight (float): right to left decoder weight ++ ctc_weight (float): ctc score weight ++ ++ Returns: ++ List[int]: Attention rescoring result ++ """ ++ assert speech.shape[0] == speech_lengths.shape[0] ++ assert decoding_chunk_size != 0 ++ if reverse_weight > 0.0: ++ # decoder should be a bitransformer decoder if reverse_weight > 0.0 ++ assert hasattr(self.decoder, 'right_decoder') ++ device = speech.device ++ batch_size = speech.shape[0] ++ # For attention rescoring we only support batch_size=1 ++ assert batch_size == 1 ++ # encoder_out: (1, maxlen, encoder_dim), len(hyps) = beam_size ++ # Let's assume B = batch_size and N = beam_size ++ # 1. Encoder forward and get CTC score ++ encoder_out_file = "encoder_out_{}.bin".format(batch_idx) ++ encoder_out_path = os.path.join(bin_path, encoder_out_file) ++ encoder_out = np.fromfile(encoder_out_path,dtype = np.float32).reshape( ++ json_data["encoder_out_{}".format(batch_idx)]) ++ encoder_mask_file = "encoder_mask_{}.bin".format(batch_idx) ++ encoder_mask_path = os.path.join(bin_path, encoder_mask_file) ++ encoder_mask = np.fromfile(encoder_mask_path, dtype = np.bool).reshape(json_data["encoder_mask_{}".format(batch_idx)]) ++ encoder_out = torch.from_numpy(encoder_out) ++ maxlen = encoder_out.size(1) ++ ctc_probs = self.ctc.log_softmax( ++ encoder_out) # (1, maxlen, vocab_size) ++ ctc_probs = ctc_probs.squeeze(0) ++ # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) ++ cur_hyps = [(tuple(), (0.0, -float('inf')))] ++ # 2. 
CTC beam search step by step ++ for t in range(0, maxlen): ++ logp = ctc_probs[t] # (vocab_size,) ++ # key: prefix, value (pb, pnb), default value(-inf, -inf) ++ next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) ++ # 2.1 First beam prune: select topk best ++ top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) ++ for s in top_k_index: ++ s = s.item() ++ ps = logp[s].item() ++ for prefix, (pb, pnb) in cur_hyps: ++ last = prefix[-1] if len(prefix) > 0 else None ++ if s == 0: # blank ++ n_pb, n_pnb = next_hyps[prefix] ++ n_pb = log_add([n_pb, pb + ps, pnb + ps]) ++ next_hyps[prefix] = (n_pb, n_pnb) ++ elif s == last: ++ # Update *ss -> *s; ++ n_pb, n_pnb = next_hyps[prefix] ++ n_pnb = log_add([n_pnb, pnb + ps]) ++ next_hyps[prefix] = (n_pb, n_pnb) ++ # Update *s-s -> *ss, - is for blank ++ n_prefix = prefix + (s, ) ++ n_pb, n_pnb = next_hyps[n_prefix] ++ n_pnb = log_add([n_pnb, pb + ps]) ++ next_hyps[n_prefix] = (n_pb, n_pnb) ++ else: ++ n_prefix = prefix + (s, ) ++ n_pb, n_pnb = next_hyps[n_prefix] ++ n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) ++ next_hyps[n_prefix] = (n_pb, n_pnb) ++ ++ # 2.2 Second beam prune ++ next_hyps = sorted(next_hyps.items(), ++ key=lambda x: log_add(list(x[1])), ++ reverse=True) ++ cur_hyps = next_hyps[:beam_size] ++ hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] ++ ++ assert len(hyps) == beam_size ++ hyps_pad = pad_sequence([ ++ torch.tensor(hyp[0], device=device, dtype=torch.long) ++ for hyp in hyps ++ ], True, self.ignore_id) # (beam_size, max_hyps_len) ++ ori_hyps_pad = hyps_pad ++ hyps_lens = torch.tensor([len(hyp[0]) for hyp in hyps], ++ device=device, ++ dtype=torch.long) # (beam_size,) ++ hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id) ++ hyps_lens = hyps_lens + 1 # Add at begining ++ encoder_out = encoder_out.repeat(beam_size, 1, 1) ++ encoder_mask = torch.ones(beam_size, ++ 1, ++ encoder_out.size(1), ++ dtype=torch.bool, ++ device=device) ++ # used for right to left decoder ++ r_hyps_pad = reverse_pad_list(ori_hyps_pad, hyps_lens, self.ignore_id) ++ r_hyps_pad, _ = add_sos_eos(r_hyps_pad, self.sos, self.eos, ++ self.ignore_id) ++ y, exe_time = decoder_model([encoder_out.numpy(), encoder_mask.numpy(), hyps_pad.numpy(), hyps_lens.numpy().astype("int32"), ++ r_hyps_pad.numpy()]) # (beam_size, max_hyps_len, vocab_size) ++ decoder_out, r_decoder_out = torch.from_numpy(y[0]), torch.from_numpy(y[1]) ++ decoder_out = torch.nn.functional.log_softmax(decoder_out, dim=-1) ++ decoder_out = decoder_out.cpu().numpy() ++ # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a ++ # conventional transformer decoder. 
++ r_decoder_out = torch.nn.functional.log_softmax(r_decoder_out, dim=-1) ++ r_decoder_out = r_decoder_out.cpu().numpy() ++ # Only use decoder score for rescoring ++ best_score = -float('inf') ++ best_index = 0 ++ for i, hyp in enumerate(hyps): ++ score = 0.0 ++ for j, w in enumerate(hyp[0]): ++ score += decoder_out[i][j][w] ++ score += decoder_out[i][len(hyp[0])][self.eos] ++ # add right to left decoder score ++ if reverse_weight > 0: ++ r_score = 0.0 ++ for j, w in enumerate(hyp[0]): ++ r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w] ++ r_score += r_decoder_out[i][len(hyp[0])][self.eos] ++ score = score * (1 - reverse_weight) + r_score * reverse_weight ++ # add ctc score ++ score += hyp[1] * ctc_weight ++ if score > best_score: ++ best_score = score ++ best_index = i ++ return hyps[best_index][0], exe_time ++ + def attention_rescoring( + self, + speech: torch.Tensor, diff --git a/ACL_PyTorch/contrib/audio/WeNet/infer.py b/ACL_PyTorch/contrib/audio/WeNet/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..95945eb542dcaba0b22d194f6f8421d7a7af1ff1 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/infer.py @@ -0,0 +1,24 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +t1 = {} +with open("t1.json", 'r') as load_f: + t1 = json.load(load_f) + +t2 = {} +with open("t2.json", 'r') as load_f: + t2 = json.load(load_f) + +perf = t1["t1"] + t2["t2"] +print("fps:", 1000 / perf) diff --git a/ACL_PyTorch/contrib/audio/WeNet/no_flash_encoder.sh b/ACL_PyTorch/contrib/audio/WeNet/no_flash_encoder.sh new file mode 100644 index 0000000000000000000000000000000000000000..34cac6e77703a78b02aae672e75a533b9b7b2c3f --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/no_flash_encoder.sh @@ -0,0 +1,11 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} +atc --model=no_flash_encoder_revise.onnx --framework=5 --output=no_flash_encoder_revise --input_format=ND --input_shape_range="xs_input:[1,-1,80];xs_input_lens:[-1]" --log=error --soc_version=Ascend310 + diff --git a/ACL_PyTorch/contrib/audio/WeNet/process_encoder_data_noflash.py b/ACL_PyTorch/contrib/audio/WeNet/process_encoder_data_noflash.py new file mode 100644 index 0000000000000000000000000000000000000000..709d6f199db3fd81d919030a6bccd2f85a6e35b7 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/process_encoder_data_noflash.py @@ -0,0 +1,209 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ + + +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import AudioDataset, CollateFunc +from wenet.transformer.asr_model import init_asr_model +from wenet.utils.checkpoint import load_checkpoint +#from wenet.transformer.acl_init import decoder_model, device_id +import json +import os +import acl +from wenet.transformer.acl_net import Net +def dic2json(input_dict, json_path): + json_str = json.dumps(input_dict) + with open(json_path, 'a') as json_file: + json_file.write(json_str) +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--bin_path', type=str, default="./encoder_data_noflash", help='encoder bin images dir') + parser.add_argument('--model_path', type=str, default="no_flash_encoder_revise.om", help='encoder bin images dir') + parser.add_argument('--json_path', type=str, default="encoder_noflash_all.json", help='encoder bin images dir') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for attention rescoring decode mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + args = parser.parse_args() + print(args) + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + #init acl + ret = acl.init() + device_id = 0 + #check_ret('acl.init', ret) + ret = acl.rt.set_device(device_id) + #check_ret('acl.rt.set_device', ret) + context, ret = acl.rt.create_context(device_id) + #check_ret('acl.rt.create_context', ret) + + decoder_output_data_shape = 42330000 + encoder_model_noflash = Net( + model_path=args.model_path, + output_data_shape=decoder_output_data_shape, + device_id=device_id, ) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + raw_wav = configs['raw_wav'] + # Init dataset and data loader + # Init dataset and data loader + test_collate_conf = copy.deepcopy(configs['collate_conf']) + test_collate_conf['spec_aug'] = False + test_collate_conf['spec_sub'] = False + test_collate_conf['feature_dither'] = False + test_collate_conf['speed_perturb'] = False + if raw_wav: + test_collate_conf['wav_distortion_conf']['wav_distortion_rate'] = 0 + test_collate_func = CollateFunc(**test_collate_conf, raw_wav=raw_wav) + dataset_conf = configs.get('dataset_conf', {}) + dataset_conf['batch_size'] = args.batch_size + dataset_conf['batch_type'] = 'static' + dataset_conf['sort'] = False + test_dataset = AudioDataset(args.test_data, + **dataset_conf, + raw_wav=raw_wav) + test_data_loader = DataLoader(test_dataset, + collate_fn=test_collate_func, + shuffle=False, + batch_size=1, + num_workers=0) + + # Init asr model from configs + model = init_asr_model(configs) + + # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + + model.eval() + + #init acl + if os.path.exists(args.json_path): + os.remove(args.json_path) + total_t = 0 + encoder_dic = {} + import time + for batch_idx, batch in enumerate(test_data_loader): + print("batch_idx", batch_idx) + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + target_lengths = target_lengths.to(device) + assert (feats.size(0) == 1) + encoder_out, encoder_mask, exe_time = model.get_no_flash_encoder_out( + encoder_model_noflash, + batch_idx, + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight) + total_t += exe_time + encoder_dic["encoder_out_"+ str(batch_idx)] = [encoder_out.shape[0], encoder_out.shape[1],encoder_out.shape[2]] + encoder_dic["encoder_mask_"+ str(batch_idx)] = [encoder_mask.shape[0], encoder_mask.shape[1],encoder_mask.shape[2]] + 
encoder_out.numpy().tofile(os.path.join(args.bin_path, "encoder_out_{}.bin".format(batch_idx))) + encoder_mask.numpy().tofile(os.path.join(args.bin_path, "encoder_mask_{}.bin".format(batch_idx))) + ave_t = total_t / (batch_idx + 1) + dic_perf = {} + dic_perf["t1"] = ave_t + dic2json(dic_perf, "t1.json") + dic2json(encoder_dic, args.json_path) + diff --git a/ACL_PyTorch/contrib/audio/WeNet/recognize.py b/ACL_PyTorch/contrib/audio/WeNet/recognize.py new file mode 100644 index 0000000000000000000000000000000000000000..817248c572124388c35f8ca316ec69ed3cda4fd6 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/recognize.py @@ -0,0 +1,204 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import AudioDataset, CollateFunc +from wenet.transformer.asr_model import init_asr_model +from wenet.utils.checkpoint import load_checkpoint + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--mode', + choices=[ + 'attention', 'ctc_greedy_search', + 'ctc_prefix_beam_search', 'attention_rescoring' + ], + default='attention', + help='decoding mode') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for attention rescoring decode mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + args = parser.parse_args() + print(args) + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' + ] and args.batch_size > 1: + logging.fatal( + 'decoding mode {} must be running with batch_size == 1'.format( + args.mode)) + sys.exit(1) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + raw_wav = configs['raw_wav'] + # Init dataset and data loader + # Init dataset and data loader + test_collate_conf = copy.deepcopy(configs['collate_conf']) + test_collate_conf['spec_aug'] = False + test_collate_conf['spec_sub'] = False + test_collate_conf['feature_dither'] = False + test_collate_conf['speed_perturb'] = False + if raw_wav: + test_collate_conf['wav_distortion_conf']['wav_distortion_rate'] = 0 + test_collate_func = CollateFunc(**test_collate_conf, raw_wav=raw_wav) + dataset_conf = configs.get('dataset_conf', {}) + dataset_conf['batch_size'] = args.batch_size + dataset_conf['batch_type'] = 'static' + dataset_conf['sort'] = False + test_dataset = AudioDataset(args.test_data, + **dataset_conf, + raw_wav=raw_wav) + test_data_loader = DataLoader(test_dataset, + collate_fn=test_collate_func, + shuffle=False, + batch_size=1, + num_workers=0) + + # Init asr model from configs + model = init_asr_model(configs) + + # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + + model.eval() + total_t = 0 + with torch.no_grad(), open(args.result_file, 'w') as fout: + for batch_idx, batch in enumerate(test_data_loader): + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + target_lengths = target_lengths.to(device) + if args.mode == 'attention': + hyps = model.recognize( + feats, + feats_lengths, + beam_size=args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + hyps = [hyp.tolist() for hyp in hyps] + elif args.mode == 'ctc_greedy_search': + hyps = model.ctc_greedy_search( + feats, + feats_lengths, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + # ctc_prefix_beam_search and attention_rescoring only return one + # result in List[int], change it to List[List[int]] for compatible + # with other batch decoding mode + elif args.mode == 'ctc_prefix_beam_search': + assert (feats.size(0) == 1) + hyp = model.ctc_prefix_beam_search( + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + 
num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + hyps = [hyp] + elif args.mode == 'attention_rescoring': + assert (feats.size(0) == 1) + hyp, exe_t = model.attention_rescoring( + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight) + hyps = [hyp] + total_t += exe_t + print(exe_t) + for i, key in enumerate(keys): + content = '' + for w in hyps[i]: + if w == eos: + break + content += char_dict[w] + logging.info('{} {}'.format(key, content)) + fout.write('{} {}\n'.format(key, content)) + print("mean_fps: ", 1/(total_t/(batch_idx+1))) + print("mean_time: ", total_t/(batch_idx+1)) + fout.write("mean_time: "+str(total_t/(batch_idx+1))) + fout.write("mean_fps: "+str(1/(total_t/(batch_idx+1)))) diff --git a/ACL_PyTorch/contrib/audio/WeNet/recognize_attenstion_rescoring.py b/ACL_PyTorch/contrib/audio/WeNet/recognize_attenstion_rescoring.py new file mode 100644 index 0000000000000000000000000000000000000000..fb63a5e9cbb05e01d1beecb5e9d111e664f2c9d1 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/recognize_attenstion_rescoring.py @@ -0,0 +1,205 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ============================================================================ + + + + +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import AudioDataset, CollateFunc +from wenet.transformer.asr_model import init_asr_model +from wenet.utils.checkpoint import load_checkpoint +import acl +from wenet.transformer.acl_net import Net +import json +import os + +def dic2json(input_dict, json_path): + json_str = json.dumps(input_dict) + with open(json_path, 'a') as json_file: + json_file.write(json_str) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--bin_path', type=str, default="./encoder_data_noflash", help='encoder bin images dir') + parser.add_argument('--model_path', type=str, default="./onnx/decoder_final.om", help='encoder bin images dir') + parser.add_argument('--json_path', type=str, default="encoder_noflash_all.json", help='encoder bin images dir') + parser.add_argument('--penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for attention rescoring decode mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + args = parser.parse_args() + print(args) + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + raw_wav = configs['raw_wav'] + test_collate_conf = copy.deepcopy(configs['collate_conf']) + test_collate_conf['spec_aug'] = False + test_collate_conf['spec_sub'] = False + test_collate_conf['feature_dither'] = False + test_collate_conf['speed_perturb'] = False + if raw_wav: + test_collate_conf['wav_distortion_conf']['wav_distortion_rate'] = 0 + test_collate_func = CollateFunc(**test_collate_conf, raw_wav=raw_wav) + dataset_conf = configs.get('dataset_conf', {}) + dataset_conf['batch_size'] = args.batch_size + dataset_conf['batch_type'] = 'static' + dataset_conf['sort'] = False + test_dataset = AudioDataset(args.test_data, + **dataset_conf, + raw_wav=raw_wav) + test_data_loader = DataLoader(test_dataset, + collate_fn=test_collate_func, + shuffle=False, + batch_size=1, + num_workers=0) + + # Init asr model from configs + model = init_asr_model(configs) + # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + + model.eval() + total_t = 0 + #init acl + ret = acl.init() + device_id = 0 + ret = acl.rt.set_device(device_id) + context, ret = acl.rt.create_context(device_id) + decoder_output_data_shape = 42330000 + decoder_model = Net(model_path =args.model_path, output_data_shape = decoder_output_data_shape, device_id = device_id, ) + json_data = {} + with open(args.json_path, 'r') as load_f: + json_data = json.load(load_f) + bin_path = args.bin_path + with torch.no_grad(), open(args.result_file, 'w') as fout: + for batch_idx, batch in enumerate(test_data_loader): + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + target_lengths = target_lengths.to(device) + assert (feats.size(0) == 1) + hyp, exe_time = model.get_wer( + batch_idx, + bin_path, + json_data, + decoder_model, + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight) + total_t += exe_time + hyps = [hyp] + for i, key in enumerate(keys): + content = '' + for w in hyps[i]: + if w == eos: + break + content += char_dict[w] + logging.info('{} {}'.format(key, content)) + fout.write('{} {}\n'.format(key, content)) + ave_t = total_t / (batch_idx + 1) + dic_perf = {} + dic_perf["t2"] = ave_t + if "no" in args.bin_path: + dic2json(dic_perf, "t2.json") diff --git a/ACL_PyTorch/contrib/audio/WeNet/requirements.txt 
b/ACL_PyTorch/contrib/audio/WeNet/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4e8c3570cf18cb61706d5dcf65e1c0e918168bc --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/requirements.txt @@ -0,0 +1,9 @@ +torch==1.9.0 +onnx==1.10.0 +onnxruntime==1.8.1 +torchaudio==0.9.0 +sympy +pyyaml +decorator +typeguard +pillow \ No newline at end of file diff --git a/ACL_PyTorch/contrib/audio/WeNet/run_attention_rescoring.sh b/ACL_PyTorch/contrib/audio/WeNet/run_attention_rescoring.sh new file mode 100644 index 0000000000000000000000000000000000000000..521b5820d9f36422682f15a1a6f86a20a7a89934 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/run_attention_rescoring.sh @@ -0,0 +1,95 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. ./path.sh || exit 1; + +# Use this to control how many gpu you use, It's 1-gpu training if you specify +# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl +# communication. More details can be found in +# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html +# export NCCL_SOCKET_IFNAME=ens4f1 +export NCCL_DEBUG=INFO +stage=0 # start from 0 if you need to start from data preparation +stop_stage=6 +# The num of nodes or machines used for multi-machine training +# Default 1 for single machine/node +# NFS will be needed if you want run multi-machine training +num_nodes=1 +# The rank of each node or machine, range from 0 to num_nodes -1 +# The first node/machine sets node_rank 0, the second one sets node_rank 1 +# the third one set node_rank 2, and so on. Default 0 +node_rank=0 +# data +data=/export/data/asr-data/OpenSLR/33/ +data_url=www.openslr.org/resources/33 + +nj=16 +feat_dir=raw_wav +dict=data/dict/lang_char.txt + +train_set=train +# Optional train_config +# 1. conf/train_transformer.yaml: Standard transformer +# 2. conf/train_conformer.yaml: Standard conformer +# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer +# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer +# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding +# 6. conf/train_u2++_conformer.yaml: U2++ conformer +# 7. conf/train_u2++_transformer.yaml: U2++ transformer +train_config=conf/train_conformer.yaml +cmvn=true +dir=exp/conformer +checkpoint= + +# use average_checkpoint will get better result +average_checkpoint=false +decode_checkpoint=$dir/final.pt +average_num=30 +decode_modes="attention_rescoring" + +. 
tools/parse_options.sh || exit 1; + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Test model, please specify the model you want to test by --checkpoint + if [ ${average_checkpoint} == true ]; then + decode_checkpoint=$dir/avg_${average_num}.pt + echo "do model average and final checkpoint is $decode_checkpoint" + python3 wenet/bin/average_model.py \ + --dst_model $decode_checkpoint \ + --src_path $dir \ + --num ${average_num} \ + --val_best + fi + # Specify decoding_chunk_size if it's a unified dynamic chunk trained model + # -1 for full chunk + decoding_chunk_size= + ctc_weight=0.3 + reverse_weight=0.3 + for mode in ${decode_modes}; do + { + test_dir=$dir/test_${mode} + mkdir -p $test_dir + python3 wenet/bin/recognize_attenstion_rescoring.py --gpu -1 \ + --config $dir/train.yaml \ + --test_data $feat_dir/test/format.data \ + --checkpoint $decode_checkpoint \ + --beam_size 10 \ + --batch_size 1 \ + --penalty 0.0 \ + --dict $dict \ + --ctc_weight $ctc_weight \ + --reverse_weight $reverse_weight \ + --result_file $test_dir/text \ + --simulate_streaming \ + --decoding_chunk_size 1 + ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} + python3 tools/compute-wer.py --char=1 --v=1 \ + $feat_dir/test/text $test_dir/text > $test_dir/wer + } & + done + wait + +fi + diff --git a/ACL_PyTorch/contrib/audio/WeNet/run_no_flash_encoder_out.sh b/ACL_PyTorch/contrib/audio/WeNet/run_no_flash_encoder_out.sh new file mode 100644 index 0000000000000000000000000000000000000000..d41edbd972b4bcfca7d7f89375a30634a03ead30 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/run_no_flash_encoder_out.sh @@ -0,0 +1,87 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. ./path.sh || exit 1; + +# Use this to control how many gpu you use, It's 1-gpu training if you specify +# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl +# communication. More details can be found in +# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html +# export NCCL_SOCKET_IFNAME=ens4f1 +export NCCL_DEBUG=INFO +stage=0 # start from 0 if you need to start from data preparation +stop_stage=6 +# The num of nodes or machines used for multi-machine training +# Default 1 for single machine/node +# NFS will be needed if you want run multi-machine training +num_nodes=1 +# The rank of each node or machine, range from 0 to num_nodes -1 +# The first node/machine sets node_rank 0, the second one sets node_rank 1 +# the third one set node_rank 2, and so on. Default 0 +node_rank=0 +# data +data=/export/data/asr-data/OpenSLR/33/ +data_url=www.openslr.org/resources/33 + +nj=16 +feat_dir=raw_wav +dict=data/dict/lang_char.txt + +train_set=train +# Optional train_config +# 1. conf/train_transformer.yaml: Standard transformer +# 2. conf/train_conformer.yaml: Standard conformer +# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer +# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer +# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding +# 6. conf/train_u2++_conformer.yaml: U2++ conformer +# 7. 
conf/train_u2++_transformer.yaml: U2++ transformer +train_config=conf/train_conformer.yaml +cmvn=true +dir=exp/conformer +checkpoint= + +# use average_checkpoint will get better result +average_checkpoint=false +decode_checkpoint=$dir/final.pt +average_num=30 +decode_modes="attention_rescoring" + +. tools/parse_options.sh || exit 1; + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Test model, please specify the model you want to test by --checkpoint + if [ ${average_checkpoint} == true ]; then + decode_checkpoint=$dir/avg_${average_num}.pt + echo "do model average and final checkpoint is $decode_checkpoint" + python3 wenet/bin/average_model.py \ + --dst_model $decode_checkpoint \ + --src_path $dir \ + --num ${average_num} \ + --val_best + fi + # Specify decoding_chunk_size if it's a unified dynamic chunk trained model + # -1 for full chunk + decoding_chunk_size= + ctc_weight=0.3 + reverse_weight=0.3 + + test_dir=$dir/test_${mode} + mkdir -p $test_dir + python3 wenet/bin/process_encoder_data_noflash.py --gpu -1 \ + --config $dir/train.yaml \ + --test_data $feat_dir/test/format.data \ + --checkpoint $decode_checkpoint \ + --beam_size 10 \ + --batch_size 1 \ + --penalty 0.0 \ + --dict $dict \ + --ctc_weight $ctc_weight \ + --reverse_weight $reverse_weight \ + --result_file $test_dir/text \ + --decoding_chunk_size 1 + ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} +fi + diff --git a/ACL_PyTorch/contrib/audio/WeNet/run_static.sh b/ACL_PyTorch/contrib/audio/WeNet/run_static.sh new file mode 100644 index 0000000000000000000000000000000000000000..f0c967ee8e3eac1042150ba0f4b4f498bf7380a6 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/run_static.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. ./path.sh || exit 1; + +# Use this to control how many gpu you use, It's 1-gpu training if you specify +# just 1gpu, otherwise it's is multiple gpu training based on DDP in pytorch +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl +# communication. More details can be found in +# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html +# export NCCL_SOCKET_IFNAME=ens4f1 +export NCCL_DEBUG=INFO +stage=5 # start from 0 if you need to start from data preparation +stop_stage=5 +# The num of nodes or machines used for multi-machine training +# Default 1 for single machine/node +# NFS will be needed if you want run multi-machine training +num_nodes=1 +# The rank of each node or machine, range from 0 to num_nodes -1 +# The first node/machine sets node_rank 0, the second one sets node_rank 1 +# the third one set node_rank 2, and so on. Default 0 +node_rank=0 +# data +data=/export/data/asr-data/OpenSLR/33/ +data_url=www.openslr.org/resources/33 + +nj=16 +feat_dir=raw_wav +dict=data/dict/lang_char.txt + +train_set=train +# Optional train_config +# 1. conf/train_transformer.yaml: Standard transformer +# 2. conf/train_conformer.yaml: Standard conformer +# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer +# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer +# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding +# 6. conf/train_u2++_conformer.yaml: U2++ conformer +# 7. 
conf/train_u2++_transformer.yaml: U2++ transformer +train_config=conf/train_conformer.yaml +cmvn=true +dir=exp/conformer +checkpoint= + +# use average_checkpoint will get better result +average_checkpoint=false +decode_checkpoint=$dir/final.pt +average_num=30 +decode_modes="attention_rescoring" + +. tools/parse_options.sh || exit 1; + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Test model, please specify the model you want to test by --checkpoint + if [ ${average_checkpoint} == true ]; then + decode_checkpoint=$dir/avg_${average_num}.pt + echo "do model average and final checkpoint is $decode_checkpoint" + python3 wenet/bin/average_model.py \ + --dst_model $decode_checkpoint \ + --src_path $dir \ + --num ${average_num} \ + --val_best + fi + # Specify decoding_chunk_size if it's a unified dynamic chunk trained model + # -1 for full chunk + decoding_chunk_size= + ctc_weight=0.5 + reverse_weight=0.0 + for mode in ${decode_modes}; do + { + test_dir=$dir/test_${mode} + mkdir -p $test_dir + python3 wenet/bin/static.py --gpu -1 \ + --mode $mode \ + --config $dir/train.yaml \ + --test_data $feat_dir/test/format.data \ + --checkpoint $decode_checkpoint \ + --beam_size 10 \ + --batch_size 1 \ + --penalty 0.0 \ + --dict $dict \ + --ctc_weight $ctc_weight \ + --reverse_weight $reverse_weight \ + --result_file $test_dir/text \ + ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} + python3 tools/compute-wer.py --char=1 --v=1 \ + $feat_dir/test/text $test_dir/text > $test_dir/wer + } & + done + wait + +fi + diff --git a/ACL_PyTorch/contrib/audio/WeNet/slice_helper.py b/ACL_PyTorch/contrib/audio/WeNet/slice_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..62a92e1b56173db2aae1acad8a6cdf34b532d6d7 --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/slice_helper.py @@ -0,0 +1,68 @@ +# BSD 3-Clause License +# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# ============================================================================ + +import torch + +@torch.jit.script +def slice_helper(x, offset): + return x[:, -offset: , : ] + +@torch.jit.script +def slice_helper2(x: torch.Tensor, start: torch.Tensor, end: torch.Tensor): + start = start.long() + end = end.long() + return x[:, start:end] + +@torch.jit.script +def slice_helper3(x, start): + return x[:, start:] + +@torch.jit.script +def get_item(x): + item = x.detach().item() + output = torch.tensor(item) + return output + +@torch.jit.script +def get_next_cache_start(required_cache_size: torch.Tensor, xs: torch.Tensor): + # required_cache_size = required_cache_size_tensor.detach().item() + next_cache_start = 0 + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = xs.size(1) + else: + if xs.size(1) - required_cache_size < 0: + next_cache_start = 0 + else: + next_cache_start = xs.size(1) - required_cache_size + return torch.tensor(next_cache_start, dtype=torch.int64) diff --git a/ACL_PyTorch/contrib/audio/WeNet/static.py b/ACL_PyTorch/contrib/audio/WeNet/static.py new file mode 100644 index 0000000000000000000000000000000000000000..edb1f67aeab88e12a7f28293bd069ac2de92664d --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/static.py @@ -0,0 +1,201 @@ +# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
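Note before the body of static.py below: the @torch.jit.script helpers in slice_helper.py above replace Python-int slicing and .item() calls with tensor-valued indexing, so the encoder's cache-trimming logic stays inside the scripted graph instead of being folded into constants at export time. The snippet below is only a behavioural sketch of those helpers on dummy tensors; the `from slice_helper import ...` path is an assumption (adjust it to wherever slice_helper.py is placed), and the shapes in the comments are purely illustrative.

```
# Minimal sketch (not part of the patch): exercising the TorchScript helpers
# from slice_helper.py on dummy tensors. Assumes slice_helper.py is on the
# import path; adjust the import to wherever the file is placed.
import torch

from slice_helper import slice_helper, slice_helper2, get_next_cache_start

x = torch.arange(24, dtype=torch.float32).reshape(1, 6, 4)  # (batch, time, dim)

# Keep only the last `offset` frames along the time axis.
print(slice_helper(x, torch.tensor(2)).shape)  # torch.Size([1, 2, 4])

# Slice [start, end) along dim 1 with tensor-valued bounds.
print(slice_helper2(x, torch.tensor(1), torch.tensor(4)).shape)  # torch.Size([1, 3, 4])

# Where the next cache window starts for a required cache size of 3 frames.
print(get_next_cache_start(torch.tensor(3), x))  # tensor(3)
```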
+ +from __future__ import print_function + +import argparse +import copy +import logging +import os +import sys + +import torch +import yaml +from torch.utils.data import DataLoader + +from wenet.dataset.dataset import AudioDataset, CollateFunc +from wenet.transformer.asr_model import init_asr_model +from wenet.utils.checkpoint import load_checkpoint + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--dict', required=True, help='dict file') + parser.add_argument('--beam_size', + type=int, + default=10, + help='beam size for search') + parser.add_argument('--penalty', + type=float, + default=0.0, + help='length penalty') + parser.add_argument('--result_file', required=True, help='asr result file') + parser.add_argument('--batch_size', + type=int, + default=16, + help='asr result file') + parser.add_argument('--mode', + choices=[ + 'attention', 'ctc_greedy_search', + 'ctc_prefix_beam_search', 'attention_rescoring' + ], + default='attention', + help='decoding mode') + parser.add_argument('--ctc_weight', + type=float, + default=0.0, + help='ctc weight for attention rescoring decode mode') + parser.add_argument('--decoding_chunk_size', + type=int, + default=-1, + help='''decoding chunk size, + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + 0: used for training, it's prohibited here''') + parser.add_argument('--num_decoding_left_chunks', + type=int, + default=-1, + help='number of left chunks for decoding') + parser.add_argument('--simulate_streaming', + action='store_true', + help='simulate streaming inference') + parser.add_argument('--reverse_weight', + type=float, + default=0.0, + help='''right to left weight for attention rescoring + decode mode''') + args = parser.parse_args() + print(args) + total_t = 0 + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + if args.mode in ['ctc_prefix_beam_search', 'attention_rescoring' + ] and args.batch_size > 1: + logging.fatal( + 'decoding mode {} must be running with batch_size == 1'.format( + args.mode)) + sys.exit(1) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + raw_wav = configs['raw_wav'] + # Init dataset and data loader + # Init dataset and data loader + test_collate_conf = copy.deepcopy(configs['collate_conf']) + test_collate_conf['spec_aug'] = False + test_collate_conf['spec_sub'] = False + test_collate_conf['feature_dither'] = False + test_collate_conf['speed_perturb'] = False + if raw_wav: + test_collate_conf['wav_distortion_conf']['wav_distortion_rate'] = 0 + test_collate_func = CollateFunc(**test_collate_conf, raw_wav=raw_wav) + dataset_conf = configs.get('dataset_conf', {}) + dataset_conf['batch_size'] = args.batch_size + dataset_conf['batch_type'] = 'static' + dataset_conf['sort'] = False + test_dataset = AudioDataset(args.test_data, + **dataset_conf, + raw_wav=raw_wav) + test_data_loader = DataLoader(test_dataset, + collate_fn=test_collate_func, + shuffle=False, + batch_size=1, + num_workers=0) + + # Init asr model from configs + model = init_asr_model(configs) + 
+ # Load dict + char_dict = {} + with open(args.dict, 'r') as fin: + for line in fin: + arr = line.strip().split() + assert len(arr) == 2 + char_dict[int(arr[1])] = arr[0] + eos = len(char_dict) - 1 + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + + model.eval() + with torch.no_grad(), open(args.result_file, 'w') as fout: + for batch_idx, batch in enumerate(test_data_loader): + print("batch_idx:", batch_idx) + keys, feats, target, feats_lengths, target_lengths = batch + feats = feats.to(device) + target = target.to(device) + feats_lengths = feats_lengths.to(device) + target_lengths = target_lengths.to(device) + if args.mode == 'attention': + hyps = model.recognize( + feats, + feats_lengths, + beam_size=args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + hyps = [hyp.tolist() for hyp in hyps] + elif args.mode == 'ctc_greedy_search': + hyps = model.ctc_greedy_search( + feats, + feats_lengths, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + # ctc_prefix_beam_search and attention_rescoring only return one + # result in List[int], change it to List[List[int]] for compatible + # with other batch decoding mode + elif args.mode == 'ctc_prefix_beam_search': + assert (feats.size(0) == 1) + hyp = model.ctc_prefix_beam_search( + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + simulate_streaming=args.simulate_streaming) + hyps = [hyp] + elif args.mode == 'attention_rescoring': + assert (feats.size(0) == 1) + hyp = model.attention_rescoring( + feats, + feats_lengths, + args.beam_size, + decoding_chunk_size=args.decoding_chunk_size, + num_decoding_left_chunks=args.num_decoding_left_chunks, + ctc_weight=args.ctc_weight, + simulate_streaming=args.simulate_streaming, + reverse_weight=args.reverse_weight) + total_t += hyp[1] + hyps = [hyp] + for i, key in enumerate(keys): + content = '' + for w in hyps[i][0]: + if w == eos: + break + content += char_dict[w] + logging.info('{} {}'.format(key, content)) + fout.write('{} {}\n'.format(key, content)) + fout.write('FPS:{}\n'.format(1000/(total_t/(batch_idx+1)))) diff --git a/ACL_PyTorch/contrib/audio/WeNet/static_decoder.sh b/ACL_PyTorch/contrib/audio/WeNet/static_decoder.sh new file mode 100644 index 0000000000000000000000000000000000000000..75e40f6223aa7694cc0b052f1230645a2d6b0c8a --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/static_decoder.sh @@ -0,0 +1,22 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export 
ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} +atc --model=decoder_final.onnx --framework=5 --output=decoder_fendang --input_format=ND \ +--input_shape="memory:10,-1,256;memory_mask:10,1,-1;ys_in_pad:10,-1;ys_in_lens:10;r_ys_in_pad:10,-1" --log=error \ +--dynamic_dims="96,96,3,3;96,96,4,4;96,96,5,5;96,96,6,6;96,96,7,7;96,96,8,8;96,96,9,9;96,96,10,10;96,96,11,11;\ +96,96,12,12;96,96,13,13;96,96,14,14;96,96,15,15;96,96,16,16;96,96,17,17;96,96,18,18;96,96,19,19;96,96,20,20;\ +96,96,21,21;96,96,22,22;96,96,23,23;144,144,6,6;144,144,7,7;144,144,8,8;144,144,9,9;144,144,10,10;144,144,11,11;\ +144,144,12,12;144,144,13,13;144,144,14,14;144,144,15,15;144,144,16,16;144,144,17,17;144,144,18,18;144,144,19,19;\ +144,144,20,20;144,144,21,21;144,144,22,22;144,144,23,23;144,144,24,24;144,144,25,25;144,144,26,26;144,144,27,27;\ +144,144,28,28;384,384,9,9;384,384,10,10;384,384,11,11;384,384,12,12;384,384,13,13;384,384,14,14;384,384,15,15;\ +384,384,16,16;384,384,17,17;384,384,18,18;384,384,19,19;384,384,20,20;384,384,21,21;384,384,22,22;384,384,23,23;\ +384,384,24,24;384,384,25,25;384,384,26,26;384,384,27,27;384,384,28,28;384,384,29,29;384,384,30,30;384,384,31,31;\ +384,384,32,32;384,384,33,33;384,384,34,34;384,384,35,35;384,384,36,36;384,384,37,37;384,384,38,38;384,384,39,39;384,384,40,40;384,384,41,41;" \ +--soc_version=Ascend310 + diff --git a/ACL_PyTorch/contrib/audio/WeNet/static_encoder.sh b/ACL_PyTorch/contrib/audio/WeNet/static_encoder.sh new file mode 100644 index 0000000000000000000000000000000000000000..7e409da26b160d6be35174e81e334269b0d531fb --- /dev/null +++ b/ACL_PyTorch/contrib/audio/WeNet/static_encoder.sh @@ -0,0 +1,13 @@ +export install_path=/usr/local/Ascend/ascend-toolkit/latest +export PATH=${install_path}/atc/bin:${install_path}/bin:${install_path}/atc/ccec_compiler/bin:$PATH +export LD_LIBRARY_PATH=${install_path}/lib64:${install_path}/atc/lib64:${install_path}/acllib/lib64:${install_path}/compiler/lib64/plugin/opskernel:${install_path}/compiler/lib64/plugin/nnengine:$LD_LIBRARY_PATH +export PYTHONPATH=${install_path}/latest/python/site-packages:${install_path}/opp/op_impl/built-in/ai_core/tbe:${install_path}/atc/python/site-packages:${install_path}/pyACL/python/site-packages/acl:$PYTHONPATH +export ASCEND_AICPU_PATH=${install_path} +export ASCEND_OPP_PATH=${install_path}/opp +export TOOLCHAIN_HOME=${install_path}/toolkit +export ASCEND_AUTOML_PATH=${install_path}/tools +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/:${LD_LIBRARY_PATH} +atc --model=no_flash_encoder_revise.onnx --framework=5 --output=encoder_fendang_262_1478_static --input_format=ND \ +--input_shape="xs_input:1,-1,80;xs_input_lens:1" --log=error \ +--dynamic_dims="262;326;390;454;518;582;646;710;774;838;902;966;1028;1284;1478" \ +--soc_version=Ascend310
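The static encoder OM produced by static_encoder.sh only accepts the time lengths listed in its --dynamic_dims gears (262, 326, ..., 1478 frames for xs_input of shape 1 x T x 80), so the fbank features fed to it have to be padded up to one of those lengths at inference time. The helper below is an illustrative sketch of that padding, not part of the patch; the name pad_to_gear and the choice of the smallest gear that fits are assumptions for demonstration.

```
# Illustrative sketch (not part of the patch): pad (1, T, 80) fbank features
# along the time axis to the smallest gear accepted by the static encoder OM.
import torch
import torch.nn.functional as F

ENCODER_GEARS = [262, 326, 390, 454, 518, 582, 646, 710, 774,
                 838, 902, 966, 1028, 1284, 1478]

def pad_to_gear(feats, gears=ENCODER_GEARS):
    """Zero-pad the time dimension of (1, T, 80) features up to a gear."""
    t = feats.size(1)
    target = next((g for g in gears if g >= t), None)
    if target is None:
        raise ValueError(f"utterance of {t} frames exceeds the largest gear {gears[-1]}")
    # F.pad pads the last dim first: (last lo, last hi, time lo, time hi)
    return F.pad(feats, (0, 0, 0, target - t))

# Example: a 300-frame utterance is padded up to the 326 gear.
feats = torch.randn(1, 300, 80)
print(pad_to_gear(feats).shape)  # torch.Size([1, 326, 80])
```

The decoder OM from static_decoder.sh is gear-based in the same way: its --dynamic_dims pairs three encoder-output lengths (96, 144, 384) with a range of hypothesis lengths, and the leading 10 in its input shapes corresponds to the beam size of 10 used in the run scripts.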