From 2e553218926fd8790346de0a5f1cf81de29f1e3a Mon Sep 17 00:00:00 2001 From: brjiang Date: Wed, 14 Aug 2024 16:00:57 +0800 Subject: [PATCH 01/12] =?UTF-8?q?=E6=96=B0=E5=A2=9EParaformer=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 133 ++++++++ .../built-in/audio/Paraformer/compile.py | 99 ++++++ .../built-in/audio/Paraformer/mindie.patch | 288 ++++++++++++++++++ .../audio/Paraformer/mindie_auto_model.py | 278 +++++++++++++++++ .../built-in/audio/Paraformer/mindie_cif.py | 173 +++++++++++ .../Paraformer/mindie_encoder_decoder.py | 121 ++++++++ .../audio/Paraformer/mindie_paraformer.py | 251 +++++++++++++++ .../built-in/audio/Paraformer/mindie_punc.py | 199 ++++++++++++ 8 files changed, 1542 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md new file mode 100644 index 0000000000..2bacbe5e0a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -0,0 +1,133 @@ +# stable-diffusionxl-controlnet模型-推理指导 + +- [概述](#概述) +- [推理环境准备](#推理环境准备) +- [快速上手](#快速上手) + - [获取源码](#获取源码) + - [模型推理](#模型推理) + +# 概述 + +该工程使用mindietorch部署paraformer模型 + +- 模型路径: + ```bash + https://modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch + ``` + +- 参考实现: + ```bash + https://github.com/modelscope/FunASR + ``` + +# 推理环境准备 + +- 该模型需要以下插件与驱动 + + **表 1** 版本配套表 + + | 配套 | 版本 | 环境准备指导 | + | ------ | ------- | ------------ | + | Python | 3.10.13 | - | + | torch | 2.1.0+cpu | - | + | torch_audio | 2.1.0+cpu | - | + | CANN | 8.0.RC2 | - | + | MindIE | 1.0.RC2.B091 | - | + +# 快速上手 +## 获取源码 + +1. 安装mindie包 + + ```bash + # 安装mindie + chmod +x ./Ascend-mindie_xxx.run + ./Ascend-mindie_xxx.run --install + source /usr/local/Ascend/mindie/set_env.sh + ``` + +2. 安装torch_npu + + 请根据[昇腾文档](https://www.hiascend.com/document/detail/zh/mindie/10RC2/mindietorch/Torchdev/mindie_torch0017.html)安装合适版本的torch_npu,并手动编译libtorch_npu_bridge.so + +3. 获取Funasr源码 + + ``` + git clone https://github.com/modelscope/FunASR.git + cd ./FunASR + git reset fdac68e1d09645c48adf540d6091b194bac71075 --hard + cd .. + ``` + +4. 修改Funasr的源码,先将patch文件移动至Funasr的工程路径下,而后将patch应用到代码中(若patch应用失败,则需要手动进行修改) + ``` + mv mindie.patch ./FunASR + cd ./FunASR + git apply mindie.patch --reject + cd .. + ``` + +5. 
获取模型文件 + +将[Paraformer](https://modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)的模型文件下载到本地,并保存在./model文件夹下 + +将[vad](https://modelscope.cn/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)的模型文件下载到本地,并保存在./model_vad文件夹下 + +将[punc](https://modelscope.cn/models/iic/punc_ct-transformer_cn-en-common-vocab471067-large/files)的模型文件下载到本地,并保存在./model_punc文件夹下 + +目录结构如下所示 + + ``` + Paraformer + ├── FunASR + └── model + └── model.pt + └── config.yaml + └── ... + └── model_vad + └── model.pt + └── config.yaml + └── ... + └── model_punc + └── model.pt + └── config.yaml + └── ... + ``` + +6. 安装Funasr的依赖 + ``` + sudo apt install ffmpeg + pip install jieba + ``` + +## 模型推理 + +1. 模型编译及样本测试 + 执行下述命令进行模型编译 + ```bash + python compile.py \ + --model ./model \ + --model_vad ./model_vad \ + --model_punc ./model_punc \ + --compiled_encoder ./compiled_model/compiled_encoder.pt \ + --compiled_decoder ./compiled_model/compiled_decoder.pt \ + --compiled_cif ./compiled_model/compiled_cif.pt \ + --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ + --compiled_punc ./compiled_model/compiled_punc.ts \ + --batch_size 16 \ + --sample_path ./model/example/asr_example.wav \ + --skip_compile + ``` + + 参数说明: + - --model:预训练模型路径 + - --model_vad:VAD预训练模型路径,若不使用VAD模型则设置为None + - --model_punc:PUNC预训练模型路径,若不使用PUNC模型则设置为None + - --compiled_encoder:编译后的encoder模型的路径 + - --compiled_decoder:编译后的decoder模型的路径 + - --compiled_cif:编译后的cif函数的路径 + - --compiled_cif_timestamp:编译后的cif_timestamp函数的路径 + - --compiled_punc:编译后的punc的路径 + - --batch_size:Paraformer模型所使用的batch_size + - --sample_path:用于测试模型的样本音频路径 + - --skip_compile:是否进行模型编译,若已经完成编译,后续使用可使用该参数跳过编译,若为第一次使用不能指定该参数 \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py new file mode 100644 index 0000000000..5629dada48 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
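+# Usage sketch (paths follow the README example; all flags are defined in the argparse
+# section below):
+#   python compile.py --model ./model --model_vad ./model_vad --model_punc ./model_punc \
+#       --batch_size 16 --sample_path ./model/example/asr_example.wav
+# The script first compiles the Paraformer sub-models (encoder, decoder, cif,
+# cif_timestamp and punc) with MindIE-Torch unless --skip_compile is given, then builds
+# MindieAutoModel and runs a timed end-to-end test on the sample audio, printing
+# per-stage and average time consumption.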
+ +import argparse + +import torch +import torch_npu +import mindietorch + +from mindie_auto_model import MindieAutoModel + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--skip_compile", action="store_true", + help="whether to skip compiling sub-models in Paraformer") + parser.add_argument("--model", default="./model", + help="path of pretrained model") + parser.add_argument("--model_vad", default="./model_vad", + help="path of pretrained vad model") + parser.add_argument("--model_punc", default="./model_punc", + help="path of pretrained punc model") + parser.add_argument("--compiled_encoder", default="./compiled_model/compiled_encoder.ts", + help="path to save compiled encoder") + parser.add_argument("--compiled_decoder", default="./compiled_model/compiled_decoder.ts", + help="path to save compiled decoder") + parser.add_argument("--compiled_cif", default="./compiled_model/compiled_cif.ts", + help="path to save compiled cif function") + parser.add_argument("--compiled_cif_timestamp", default="./compiled_model/compiled_cif_timestamp.ts", + help="path to save compiled cif timestamp function") + parser.add_argument("--compiled_punc", default="./compiled_model/compiled_punc.ts", + help="path to save compiled punc model") + parser.add_argument("--batch_size", default=16, type=int, + help="batch size of paraformer model") + parser.add_argument("--sample_path", default="./audio/test_1.wav", + help="path of sample audio") + args = parser.parse_args() + + mindietorch.set_device(0) + + # use mindietorch to compile sub-models in Paraformer + if not args.skip_compile: + print("Begin compiling sub-models") + MindieAutoModel.export_model(model=args.model_punc, compiled_path=args.compiled_punc, compile_type="punc") + MindieAutoModel.export_model(model=args.model, compiled_encoder=args.compiled_encoder, + compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, + compiled_cif_timestamp=args.compiled_cif_timestamp, + cif_interval=200, cif_timestamp_interval=500, compile_type="paraformer") + print("Finish compiling sub-models") + else: + print("Use existing compiled model") + + # initialize auto model + # note: ncpu means the number of threads used for intraop parallelism on the CPU, which is relevant to speed of vad model + model = MindieAutoModel(model=args.model, vad_model=args.model_vad, punc_model=args.model_punc, + compiled_encoder=args.compiled_encoder, compiled_decoder=args.compiled_decoder, + compiled_cif=args.compiled_cif, compiled_cif_timestamp=args.compiled_cif_timestamp, + compiled_punc=args.compiled_punc, paraformer_batch_size=args.batch_size, + cif_interval=200, cif_timestamp_interval=500, ncpu=16) + + # warm up + print("Begin warming up") + for i in range(5): + _ = model.generate(input=args.sample_path) + + # run with sample audio + iteration_num = 100 + print("Begin runing with sample audio with {} iterations".format(iteration_num)) + + total_dict_time = {} + for i in range(iteration_num): + res, time_stats = model.generate(input=args.sample_path) + + print("Iteration {} Model output: {}".format(i, res[0]["text"])) + print("Iteration {} Time comsumption:".format(i)) + print(" ".join(f"{key}: {value:.3f}s" for key, value in time_stats.items())) + for key, value in time_stats.items(): + if key not in total_dict_time: + total_dict_time[key] = float(value) + else: + total_dict_time[key] += float(value) + + # display average time consumption + print("\nAverage time comsumption") + for key, value in total_dict_time.items(): + print(key, 
": {:.3f}s".format(float(value) / iteration_num)) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch new file mode 100644 index 0000000000..23d3207807 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch @@ -0,0 +1,288 @@ +diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py +index 01e6aaf6..2081ebb1 100644 +--- a/funasr/auto/auto_model.py ++++ b/funasr/auto/auto_model.py +@@ -277,9 +277,9 @@ class AutoModel: + asr_result_list = [] + num_samples = len(data_list) + disable_pbar = self.kwargs.get("disable_pbar", False) +- pbar = ( +- tqdm(colour="blue", total=num_samples, dynamic_ncols=True) if not disable_pbar else None +- ) ++ # pbar = ( ++ # tqdm(colour="blue", total=num_samples, dynamic_ncols=True) if not disable_pbar else None ++ # ) + time_speech_total = 0.0 + time_escape_total = 0.0 + for beg_idx in range(0, num_samples, batch_size): +@@ -311,20 +311,22 @@ class AutoModel: + speed_stats["batch_size"] = f"{len(results)}" + speed_stats["rtf"] = f"{(time_escape) / batch_data_time:0.3f}" + description = f"{speed_stats}, " +- if pbar: +- pbar.update(1) +- pbar.set_description(description) ++ # if pbar: ++ # pbar.update(1) ++ # pbar.set_description(description) + time_speech_total += batch_data_time + time_escape_total += time_escape + +- if pbar: +- # pbar.update(1) +- pbar.set_description(f"rtf_avg: {time_escape_total/time_speech_total:0.3f}") ++ # if pbar: ++ # # pbar.update(1) ++ # pbar.set_description(f"rtf_avg: {time_escape_total/time_speech_total:0.3f}") + torch.cuda.empty_cache() + return asr_result_list + + def inference_with_vad(self, input, input_len=None, **cfg): + kwargs = self.kwargs ++ time_stats = {"input_speech_time": 0.0, "end_to_end_time": 0.0, "vad_time" : 0.0, ++ "paraformer_time": 0.0, "punc_time": 0.0} + # step.1: compute the vad model + deep_update(self.vad_kwargs, cfg) + beg_vad = time.time() +@@ -332,6 +334,7 @@ class AutoModel: + input, input_len=input_len, model=self.vad_model, kwargs=self.vad_kwargs, **cfg + ) + end_vad = time.time() ++ time_stats["vad_time"] = end_vad - beg_vad + + # FIX(gcf): concat the vad clips for sense vocie model for better aed + if kwargs.get("merge_vad", False): +@@ -366,7 +369,7 @@ class AutoModel: + speech_lengths = len(speech) + n = len(vadsegments) + data_with_index = [(vadsegments[i], i) for i in range(n)] +- sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0]) ++ sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0], reverse=True) + results_sorted = [] + + if not len(sorted_data): +@@ -377,7 +380,7 @@ class AutoModel: + if len(sorted_data) > 0 and len(sorted_data[0]) > 0: + batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0]) + +- beg_idx = 0 ++ batch_size = 0 + beg_asr_total = time.time() + time_speech_total_per_sample = speech_lengths / 16000 + time_speech_total_all_samples += time_speech_total_per_sample +@@ -385,28 +388,19 @@ class AutoModel: + # pbar_sample = tqdm(colour="blue", total=n, dynamic_ncols=True) + + all_segments = [] +- max_len_in_batch = 0 +- end_idx = 1 +- for j, _ in enumerate(range(0, n)): +- # pbar_sample.update(1) +- sample_length = sorted_data[j][0][1] - sorted_data[j][0][0] +- potential_batch_length = max(max_len_in_batch, sample_length) * (j + 1 - beg_idx) +- # batch_size_ms_cum += sorted_data[j][0][1] - sorted_data[j][0][0] +- if ( +- j < n - 1 +- and sample_length < batch_size_threshold_ms +- and 
potential_batch_length < batch_size +- ): +- max_len_in_batch = max(max_len_in_batch, sample_length) +- end_idx += 1 +- continue +- ++ batch_segments = kwargs["paraformer_batch_size"] ++ print("Num of vadsegments: {}, Paraformer batch_size: {}".format(n, batch_segments)) ++ loop_num = n // batch_segments if n % batch_segments == 0 else n // batch_segments + 1 ++ beg_idx = 0 ++ end_idx = batch_segments ++ for j in range(loop_num): + speech_j, speech_lengths_j = slice_padding_audio_samples( + speech, speech_lengths, sorted_data[beg_idx:end_idx] + ) +- results = self.inference( ++ results_batch = self.inference_with_asr( + speech_j, input_len=None, model=model, kwargs=kwargs, **cfg + ) ++ results = results_batch[0] + if self.spk_model is not None: + # compose vad segments: [[start_time_sec, end_time_sec, speech], [...]] + for _b in range(len(speech_j)): +@@ -425,8 +419,7 @@ class AutoModel: + ) + results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"] + beg_idx = end_idx +- end_idx += 1 +- max_len_in_batch = sample_length ++ end_idx += batch_segments + if len(results) < 1: + continue + results_sorted.extend(results) +@@ -478,6 +471,10 @@ class AutoModel: + if not len(result["text"].strip()): + continue + return_raw_text = kwargs.get("return_raw_text", False) ++ ++ end_paraformer = time.time() ++ time_stats["paraformer_time"] = end_paraformer - beg_asr_total ++ + # step.3 compute punc model + raw_text = None + if self.punc_model is not None: +@@ -489,6 +486,8 @@ class AutoModel: + if return_raw_text: + result["raw_text"] = raw_text + result["text"] = punc_res[0]["text"] ++ end_punc = time.time() ++ time_stats["punc_time"] = end_punc - end_paraformer + + # speaker embedding cluster after resorted + if self.spk_model is not None and kwargs.get("return_spk_res", True): +@@ -575,12 +574,13 @@ class AutoModel: + f"time_escape: {time_escape_total_per_sample:0.3f}" + ) + +- # end_total = time.time() +- # time_escape_total_all_samples = end_total - beg_total ++ end_total = time.time() ++ time_stats["end_to_end_time"] = end_total - beg_vad ++ time_stats["input_speech_time"] = time_speech_total_all_samples + # print(f"rtf_avg_all: {time_escape_total_all_samples / time_speech_total_all_samples:0.3f}, " + # f"time_speech_all: {time_speech_total_all_samples: 0.3f}, " + # f"time_escape_all: {time_escape_total_all_samples:0.3f}") +- return results_ret_list ++ return results_ret_list, time_stats + + def export(self, input=None, **cfg): + """ +diff --git a/funasr/models/bicif_paraformer/cif_predictor.py b/funasr/models/bicif_paraformer/cif_predictor.py +index ca98cdc2..4d61ab85 100644 +--- a/funasr/models/bicif_paraformer/cif_predictor.py ++++ b/funasr/models/bicif_paraformer/cif_predictor.py +@@ -238,6 +238,7 @@ class CifPredictorV3(torch.nn.Module): + elif self.tail_threshold > 0.0: + hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask) + ++ return hidden, alphas, token_num + acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold) + if target_length is None and self.tail_threshold > 0.0: + token_num_int = torch.max(token_num).type(torch.int32).item() +@@ -282,6 +283,7 @@ class CifPredictorV3(torch.nn.Module): + _token_num = alphas2.sum(-1) + if token_num is not None: + alphas2 *= (token_num / _token_num)[:, None].repeat(1, alphas2.size(1)) ++ return alphas2 + # re-downsample + ds_alphas = alphas2.reshape(b, -1, self.upsample_times).sum(-1) + ds_cif_peak = cif_wo_hidden(ds_alphas, self.threshold - 1e-4) +diff --git a/funasr/models/sanm/attention.py 
b/funasr/models/sanm/attention.py +index c7e8a8e0..0f1862ae 100644 +--- a/funasr/models/sanm/attention.py ++++ b/funasr/models/sanm/attention.py +@@ -275,13 +275,15 @@ class MultiHeadedAttentionSANM(nn.Module): + "inf" + ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) + scores = scores.masked_fill(mask, min_value) +- self.attn = torch.softmax(scores, dim=-1).masked_fill( +- mask, 0.0 +- ) # (batch, head, time1, time2) ++ # self.attn = torch.softmax(scores, dim=-1).masked_fill( ++ # mask, 0.0 ++ # ) # (batch, head, time1, time2) ++ attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) + else: +- self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) ++ # self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) ++ attn = torch.softmax(scores, dim=-1) + +- p_attn = self.dropout(self.attn) ++ p_attn = self.dropout(attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) +@@ -683,18 +685,22 @@ class MultiHeadedAttentionCrossAtt(nn.Module): + # logging.info( + # "scores: {}, mask_size: {}".format(scores.size(), mask.size())) + scores = scores.masked_fill(mask, min_value) +- self.attn = torch.softmax(scores, dim=-1).masked_fill( +- mask, 0.0 +- ) # (batch, head, time1, time2) ++ # self.attn = torch.softmax(scores, dim=-1).masked_fill( ++ # mask, 0.0 ++ # ) # (batch, head, time1, time2) ++ attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) + else: + self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) +- p_attn = self.dropout(self.attn) ++ attn = torch.softmax(scores, dim=-1) ++ # p_attn = self.dropout(self.attn) ++ p_attn = self.dropout(attn) + x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) + x = ( + x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) + ) # (batch, time1, d_model) + if ret_attn: +- return self.linear_out(x), self.attn # (batch, time1, d_model) ++ # return self.linear_out(x), self.attn # (batch, time1, d_model) ++ return self.linear_out(x), attn + return self.linear_out(x) # (batch, time1, d_model) + + def forward(self, x, memory, memory_mask, ret_attn=False): +diff --git a/funasr/models/sanm/encoder.py b/funasr/models/sanm/encoder.py +index b2a442bd..c8044cee 100644 +--- a/funasr/models/sanm/encoder.py ++++ b/funasr/models/sanm/encoder.py +@@ -15,7 +15,7 @@ import torch.nn.functional as F + + import numpy as np + from funasr.train_utils.device_funcs import to_device +-from funasr.models.transformer.utils.nets_utils import make_pad_mask ++from funasr.models.transformer.utils.nets_utils import make_pad_mask, make_pad_mask_new + from funasr.models.sanm.attention import MultiHeadedAttention, MultiHeadedAttentionSANM + from funasr.models.transformer.embedding import ( + SinusoidalPositionEncoder, +@@ -374,7 +374,8 @@ class SANMEncoder(nn.Module): + Returns: + position embedded tensor and mask + """ +- masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) ++ # masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) ++ masks = (~make_pad_mask_new(ilens)[:, None, :]).to(xs_pad.device) + xs_pad = xs_pad * self.output_size() ** 0.5 + if self.embed is None: + xs_pad = xs_pad +diff --git a/funasr/models/transformer/utils/nets_utils.py b/funasr/models/transformer/utils/nets_utils.py +index 29d23ee5..19693c9e 100644 +--- a/funasr/models/transformer/utils/nets_utils.py ++++ b/funasr/models/transformer/utils/nets_utils.py +@@ -218,6 +218,15 @@ def make_pad_mask(lengths, 
xs=None, length_dim=-1, maxlen=None): + return mask + + ++def make_pad_mask_new(lengths): ++ maxlen = lengths.max() ++ row_vector = torch.arange(0, maxlen, 1).to(lengths.device) ++ matrix = torch.unsqueeze(lengths, dim=-1) ++ mask = row_vector >= matrix ++ mask = mask.detach() ++ return mask ++ ++ + def make_non_pad_mask(lengths, xs=None, length_dim=-1): + """Make mask tensor containing indices of non-padded part. + +diff --git a/funasr/models/transformer/utils/repeat.py b/funasr/models/transformer/utils/repeat.py +index a44c1a01..0935d854 100644 +--- a/funasr/models/transformer/utils/repeat.py ++++ b/funasr/models/transformer/utils/repeat.py +@@ -28,8 +28,9 @@ class MultiSequential(torch.nn.Sequential): + """Repeat.""" + _probs = torch.empty(len(self)).uniform_() + for idx, m in enumerate(self): +- if not self.training or (_probs[idx] >= self.layer_drop_rate): +- args = m(*args) ++ # if not self.training or (_probs[idx] >= self.layer_drop_rate): ++ # args = m(*args) ++ args = m(*args) + return args + + diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py new file mode 100644 index 0000000000..8251f7533f --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py @@ -0,0 +1,278 @@ +import sys +sys.path.append("./FunASR") + +import torch +import time +import logging + +from mindie_paraformer import MindieBiCifParaformer +from mindie_encoder_decoder import MindieEncoder, MindieDecoder +from mindie_punc import MindiePunc, MindieCTTransformer +from mindie_cif import MindieCifTimestamp, MindieCif +from funasr.auto.auto_model import AutoModel, download_model, tables, deep_update, \ + load_pretrained_model, prepare_data_iterator + + +class MindieAutoModel(AutoModel): + def __init__(self, **kwargs): + log_level = getattr(logging, kwargs.get("log_level", "INFO").upper()) + logging.basicConfig(level=log_level) + + if not kwargs.get("disable_log", True): + tables.print() + + kwargs["compile_type"] = "paraformer" + model, kwargs = self.build_model_with_mindie(**kwargs) + + # if vad_model is not None, build vad model else None + vad_model = kwargs.get("vad_model", None) + vad_kwargs = {} if kwargs.get("vad_kwargs", {}) is None else kwargs.get("vad_kwargs", {}) + if vad_model is not None: + logging.info("Building VAD model.") + vad_kwargs["model"] = vad_model + vad_kwargs["model_revision"] = kwargs.get("vad_model_revision", "master") + vad_kwargs["device"] = "cpu" + vad_model, vad_kwargs = self.build_model(**vad_kwargs) + + # if punc_model is not None, build punc model else None + punc_model = kwargs.get("punc_model", None) + punc_kwargs = {} if kwargs.get("punc_kwargs", {}) is None else kwargs.get("punc_kwargs", {}) + if punc_model is not None: + logging.info("Building punc model.") + punc_kwargs["model"] = punc_model + punc_kwargs["model_revision"] = kwargs.get("punc_model_revision", "master") + punc_kwargs["device"] = "npu" + punc_kwargs["compile_type"] = "punc" + punc_kwargs["compiled_punc"] = kwargs["compiled_punc"] + punc_model, punc_kwargs = self.build_model_with_mindie(**punc_kwargs) + + self.kwargs = kwargs + self.model = model + self.vad_model = vad_model + self.vad_kwargs = vad_kwargs + self.punc_model = punc_model + self.punc_kwargs = punc_kwargs + self.spk_model = None + self.spk_kwargs = {} + self.model_path = kwargs.get("model_path") + + def generate(self, input, input_len=None, **cfg): + if self.vad_model is None: + return self.inference_with_asr(input, 
input_len=input_len, **cfg) + + else: + return self.inference_with_vad(input, input_len=input_len, **cfg) + + @staticmethod + def export_model(**kwargs): + # load model config + assert "model" in kwargs + if "model_conf" not in kwargs: + print("download models from model hub: {}".format(kwargs.get("hub", "ms"))) + kwargs = download_model(**kwargs) + + kwargs["batch_size"] = 1 + kwargs["device"] = "cpu" + + # build tokenizer + tokenizer = kwargs.get("tokenizer", None) + if tokenizer is not None: + tokenizer_class = tables.tokenizer_classes.get(tokenizer) + tokenizer = tokenizer_class(**kwargs.get("tokenizer_conf", {})) + kwargs["token_list"] = ( + tokenizer.token_list if hasattr(tokenizer, "token_list") else None + ) + kwargs["token_list"] = ( + tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else kwargs["token_list"] + ) + vocab_size = len(kwargs["token_list"]) if kwargs["token_list"] is not None else -1 + if vocab_size == -1 and hasattr(tokenizer, "get_vocab_size"): + vocab_size = tokenizer.get_vocab_size() + else: + vocab_size = -1 + kwargs["tokenizer"] = tokenizer + + # build frontend + frontend = kwargs.get("frontend", None) + kwargs["input_size"] = None + if frontend is not None: + frontend_class = tables.frontend_classes.get(frontend) + frontend = frontend_class(**kwargs.get("frontend_conf", {})) + kwargs["input_size"] = ( + frontend.output_size() if hasattr(frontend, "output_size") else None + ) + kwargs["frontend"] = frontend + + model_conf = {} + deep_update(model_conf, kwargs.get("model_conf", {})) + deep_update(model_conf, kwargs) + init_param = kwargs.get("init_param", None) + + if kwargs["compile_type"] == "punc": + model = MindiePunc(**model_conf, vocab_size=vocab_size) + model.eval() + print(f"Loading pretrained params from {init_param}") + load_pretrained_model( + model=model, + path=init_param, + ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), + oss_bucket=kwargs.get("oss_bucket", None), + scope_map=kwargs.get("scope_map", []), + excludes=kwargs.get("excludes", None), + ) + MindiePunc.export_ts(model, kwargs["compiled_path"]) + else: + # compile encoder + encoder = MindieEncoder(**model_conf, vocab_size=vocab_size) + encoder.eval() + print(f"Loading pretrained params from {init_param}") + load_pretrained_model( + model=encoder, + path=init_param, + ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), + oss_bucket=kwargs.get("oss_bucket", None), + scope_map=kwargs.get("scope_map", []), + excludes=kwargs.get("excludes", None), + ) + MindieEncoder.export_ts(encoder, kwargs["compiled_encoder"]) + + # compile decoder + decoder = MindieDecoder(**model_conf, vocab_size=vocab_size) + decoder.eval() + print(f"Loading pretrained params from {init_param}") + load_pretrained_model( + model=decoder, + path=init_param, + ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), + oss_bucket=kwargs.get("oss_bucket", None), + scope_map=kwargs.get("scope_map", []), + excludes=kwargs.get("excludes", None), + ) + MindieDecoder.export_ts(decoder, kwargs["compiled_decoder"]) + + # compile cif + mindie_cif = MindieCif(decoder.predictor.threshold, kwargs["cif_interval"]) + mindie_cif.export_ts(kwargs["compiled_cif"]) + + # compile cif_timestamp + mindie_cif_timestamp = MindieCifTimestamp(decoder.predictor.threshold - 1e-4, kwargs["cif_timestamp_interval"]) + mindie_cif_timestamp.export_ts(kwargs["compiled_cif_timestamp"]) + + def build_model_with_mindie(self, **kwargs): + assert "model" in kwargs + if "model_conf" not in kwargs: + logging.info("download 
models from model hub: {}".format(kwargs.get("hub", "ms"))) + kwargs = download_model(**kwargs) + + torch.set_num_threads(kwargs.get("ncpu", 4)) + + # build tokenizer + tokenizer = kwargs.get("tokenizer", None) + if tokenizer is not None: + tokenizer_class = tables.tokenizer_classes.get(tokenizer) + tokenizer = tokenizer_class(**kwargs.get("tokenizer_conf", {})) + kwargs["token_list"] = ( + tokenizer.token_list if hasattr(tokenizer, "token_list") else None + ) + kwargs["token_list"] = ( + tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else kwargs["token_list"] + ) + vocab_size = len(kwargs["token_list"]) if kwargs["token_list"] is not None else -1 + if vocab_size == -1 and hasattr(tokenizer, "get_vocab_size"): + vocab_size = tokenizer.get_vocab_size() + else: + vocab_size = -1 + kwargs["tokenizer"] = tokenizer + + # build frontend + frontend = kwargs.get("frontend", None) + kwargs["input_size"] = None + if frontend is not None: + frontend_class = tables.frontend_classes.get(frontend) + frontend = frontend_class(**kwargs.get("frontend_conf", {})) + kwargs["input_size"] = ( + frontend.output_size() if hasattr(frontend, "output_size") else None + ) + kwargs["frontend"] = frontend + + # build model + model_conf = {} + deep_update(model_conf, kwargs.get("model_conf", {})) + deep_update(model_conf, kwargs) + + if kwargs["compile_type"] == "punc": + model = MindieCTTransformer(**model_conf, vocab_size=vocab_size) + else: + model = MindieBiCifParaformer(**model_conf, vocab_size=vocab_size) + + # init_param + init_param = kwargs.get("init_param", None) + logging.info(f"Loading pretrained params from {init_param}") + load_pretrained_model( + model=model, + path=init_param, + ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), + oss_bucket=kwargs.get("oss_bucket", None), + scope_map=kwargs.get("scope_map", []), + excludes=kwargs.get("excludes", None), + ) + + model.to("npu") + + return model, kwargs + + def inference_with_asr(self, input, input_len=None, model=None, kwargs=None, key=None, **cfg): + kwargs = self.kwargs if kwargs is None else kwargs + deep_update(kwargs, cfg) + model = self.model if model is None else model + model.eval() + + batch_size = kwargs.get("batch_size", 1) + + key_list, data_list = prepare_data_iterator( + input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key + ) + + time_stats = {"input_speech_time": 0.0, "end_to_end_time": 0.0, "pure_infer_time": 0.0, + "load_data": 0.0, "encoder": 0.0, "predictor": 0.0, "decoder": 0.0, + "predictor_timestamp": 0.0, "post_process": 0.0} + asr_result_list = [] + num_samples = len(data_list) + + for beg_idx in range(0, num_samples, batch_size): + end_idx = min(num_samples, beg_idx + batch_size) + data_batch = data_list[beg_idx:end_idx] + key_batch = key_list[beg_idx:end_idx] + batch = {"data_in": data_batch, "key": key_batch} + + if (end_idx - beg_idx) == 1 and kwargs.get("data_type", None) == "fbank": # fbank + batch["data_in"] = data_batch[0] + batch["data_lengths"] = input_len + + with torch.no_grad(): + time1 = time.perf_counter() + res = model.inference_with_npu(**batch, **kwargs) + time2 = time.perf_counter() + if isinstance(res, (list, tuple)): + results = res[0] if len(res) > 0 else [{"text": ""}] + meta_data = res[1] if len(res) > 1 else {} + + asr_result_list.extend(results) + + # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item() + batch_data_time = meta_data.get("batch_data_time", -1) + time_escape = time2 - time1 + + time_stats["load_data"] += 
meta_data.get("load_data", 0.0) + time_stats["encoder"] += meta_data.get("encoder", 0.0) + time_stats["predictor"] += meta_data.get("calc_predictor", 0.0) + time_stats["decoder"] += meta_data.get("decoder", 0.0) + time_stats["predictor_timestamp"] += meta_data.get("calc_predictor_timestamp", 0.0) + time_stats["post_process"] += meta_data.get("post_process", 0.0) + time_stats["end_to_end_time"] += time_escape + + time_stats["input_speech_time"] += batch_data_time + + time_stats["pure_infer_time"] = time_stats["end_to_end_time"] - time_stats["load_data"] + + return asr_result_list, time_stats \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py new file mode 100644 index 0000000000..71b761187d --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py @@ -0,0 +1,173 @@ +import sys +sys.path.append("./FunASR") + +import torch +import mindietorch + +from mindie_paraformer import precision_eval + + +def cif(hidden, alphas, integrate, frame, threshold): + batch_size, len_time, hidden_size = hidden.size() + + # intermediate vars along time + list_fires = [] + list_frames = [] + + constant = torch.ones([batch_size], device=hidden.device) + for t in range(len_time): + alpha = alphas[:, t] + distribution_completion = constant - integrate + + integrate += alpha + list_fires.append(integrate) + + fire_place = integrate >= threshold + integrate = torch.where( + fire_place, integrate - constant, integrate + ) + cur = torch.where(fire_place, distribution_completion, alpha) + remainds = alpha - cur + + frame += cur[:, None] * hidden[:, t, :] + list_frames.append(frame) + frame = torch.where( + fire_place[:, None].repeat(1, hidden_size), remainds[:, None] * hidden[:, t, :], frame + ) + + fires = torch.stack(list_fires, 1) + frames = torch.stack(list_frames, 1) + + return fires, frames, integrate, frame + + +def cif_wo_hidden(alphas, integrate, threshold): + batch_size, len_time = alphas.size() + + list_fires = [] + + constant = torch.ones([batch_size], device=alphas.device) * threshold + + for t in range(len_time): + alpha = alphas[:, t] + + integrate += alpha + list_fires.append(integrate) + + fire_place = integrate >= threshold + integrate = torch.where( + fire_place, + integrate - constant, + integrate, + ) + + fire_list = [] + for i in range(0, len(list_fires), 500): + batch = list_fires[i:i + 500] + fire = torch.stack(batch, 1) + fire_list.append(fire) + + fires = torch.cat(fire_list, 1) + return fires, integrate + + +class MindieCif(torch.nn.Module): + def __init__(self, threshold, seq_len): + super().__init__() + self.threshold = threshold + self.seq_len = seq_len + + def forward(self, hidden, alphas, integrate, frame): + fires, frames, integrate_new, frame_new = cif(hidden, alphas, integrate, frame, self.threshold) + return fires, frames, integrate_new, frame_new + + def export_ts(self, path="./compiled_cif.ts"): + print("Begin trace cif!") + + input_shape1 = (1, self.seq_len, 512) + input_shape2 = (1, self.seq_len) + input_shape3 = (1, ) + input_shape4 = (1, 512) + + hidden = torch.randn(input_shape1, dtype=torch.float32) + alphas = torch.randn(input_shape2, dtype=torch.float32) + integrate = torch.randn(input_shape3, dtype=torch.float32) + frame = torch.randn(input_shape4, dtype=torch.float32) + compile_inputs = [mindietorch.Input(shape = input_shape1, dtype = torch.float32), + mindietorch.Input(shape = input_shape2, dtype = torch.float32), + mindietorch.Input(shape = 
input_shape3, dtype = torch.float32), + mindietorch.Input(shape = input_shape4, dtype = torch.float32)] + + export_model = torch.jit.trace(self, example_inputs=(hidden, alphas, integrate, frame)) + print("Finish trace cif") + + compiled_model = mindietorch.compile( + export_model, + inputs = compile_inputs, + precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + soc_version = "Ascend310P3", + ir = "ts" + ) + print("mindietorch compile done !") + compiled_model.save(path) + # compiled_model = torch.jit.load(path) + + print("start to check the percision of cif model.") + sample_hidden = torch.randn(input_shape1, dtype=torch.float32) + sample_alphas = torch.randn(input_shape2, dtype=torch.float32) + sample_integrate = torch.randn(input_shape3, dtype=torch.float32) + sample_frame = torch.randn(input_shape4, dtype=torch.float32) + mrt_res = compiled_model(sample_hidden.to("npu"), sample_alphas.to("npu"), + sample_integrate.to("npu"), sample_frame.to("npu")) + print("mindie infer done !") + ref_res = cif(sample_hidden, sample_alphas, sample_integrate, sample_frame, self.threshold) + print("torch infer done !") + + precision_eval(mrt_res, ref_res) + + +class MindieCifTimestamp(torch.nn.Module): + def __init__(self, threshold, seq_len): + super().__init__() + self.threshold = threshold + self.seq_len = seq_len + + def forward(self, us_alphas, integrate): + us_peaks, integrate_new = cif_wo_hidden(us_alphas, integrate, self.threshold) + + return us_peaks, integrate_new + + def export_ts(self, path="./compiled_cif_timestamp.ts"): + print("Begin trace cif_timestamp!") + + input_shape1 = (1, self.seq_len) + input_shape2 = (1, ) + + us_alphas = torch.randn(input_shape1, dtype=torch.float32) + integrate = torch.randn(input_shape2, dtype=torch.float32) + compile_inputs = [mindietorch.Input(shape = input_shape1, dtype = torch.float32), + mindietorch.Input(shape = input_shape2, dtype = torch.float32)] + + export_model = torch.jit.trace(self, example_inputs=(us_alphas, integrate)) + print("Finish trace cif_timestamp") + + compiled_model = mindietorch.compile( + export_model, + inputs = compile_inputs, + precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + soc_version = "Ascend310P3", + ir = "ts" + ) + print("mindietorch compile done !") + compiled_model.save(path) + # compiled_model = torch.jit.load(path) + + print("start to check the percision of cif_timestamp model.") + sample_input1 = torch.randn(input_shape1, dtype=torch.float32) + sample_input2 = torch.randn(input_shape2, dtype=torch.float32) + mrt_res = compiled_model(sample_input1.to("npu"), sample_input2.to("npu")) + print("mindie infer done !") + ref_res = cif_wo_hidden(sample_input1, sample_input2, self.threshold) + print("torch infer done !") + + precision_eval(mrt_res, ref_res) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py new file mode 100644 index 0000000000..55ace9f3d4 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -0,0 +1,121 @@ +import sys +sys.path.append("./FunASR") + +import torch +import mindietorch + +from mindie_paraformer import precision_eval +from funasr.models.bicif_paraformer.model import Paraformer + + +class MindieEncoder(Paraformer): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + def forward(self, speech, speech_length): + encoder_out, encoder_out_lens = self.encode(speech, 
speech_length) + + return encoder_out, encoder_out_lens + + @staticmethod + def export_ts(encoder, path="./compiled_encoder.ts"): + print("Begin trace encoder!") + + input_shape = (2, 50, 560) + min_shape = (-1, -1, 560) + max_shape = (-1, -1, 560) + input_speech = torch.randn(input_shape, dtype=torch.float32) + input_speech_lens = torch.tensor([50, 25], dtype=torch.int32) + compile_inputs = [mindietorch.Input(min_shape = min_shape, max_shape = max_shape, dtype = torch.float32), + mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32)] + + export_model = torch.jit.trace(encoder, example_inputs=(input_speech, input_speech_lens)) + print("Finish trace encoder") + + compiled_model = mindietorch.compile( + export_model, + inputs = compile_inputs, + precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + soc_version = "Ascend310P3", + ir = "ts" + ) + print("mindietorch compile done !") + compiled_model.save(path) + # compiled_model = torch.jit.load(path) + + print("start to check the percision of encoder.") + sample_speech = torch.randn((4, 100, 560), dtype=torch.float32) + sample_speech_lens = torch.tensor([100, 50, 100, 25], dtype=torch.int32) + mrt_res = compiled_model(sample_speech.to("npu"), sample_speech_lens.to("npu")) + print("mindie infer done !") + ref_res = encoder(sample_speech, sample_speech_lens) + print("torch infer done !") + + precision_eval(mrt_res, ref_res) + + +class MindieDecoder(Paraformer): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + def forward(self, encoder_out, encoder_out_lens, sematic_embeds, pre_token_length): + decoder_outs = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, pre_token_length) + decoder_out = decoder_outs[0] + decoder_out = torch.log_softmax(decoder_out, dim=-1) + return decoder_out + + @staticmethod + def export_ts(decoder, path="./compiled_decoder.ts"): + print("Begin trace decoder!") + + input_shape1 = (2, 939, 512) + min_shape1 = (-1, -1, 512) + max_shape1 = (-1, -1, 512) + + input_shape2 = (2, 261, 512) + min_shape2 = (-1, -1, 512) + max_shape2 = (-1, -1, 512) + + encoder_out = torch.randn(input_shape1, dtype=torch.float32) + encoder_out_lens = torch.tensor([939, 500], dtype=torch.int32) + sematic_embeds = torch.randn(input_shape2, dtype=torch.float32) + sematic_embeds_lens = torch.tensor([261, 100], dtype=torch.int32) + + compile_inputs = [mindietorch.Input(min_shape = min_shape1, max_shape = max_shape1, dtype = torch.float32), + mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32), + mindietorch.Input(min_shape = min_shape2, max_shape = max_shape2, dtype = torch.float32), + mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32)] + + export_model = torch.jit.trace(decoder, example_inputs=(encoder_out, encoder_out_lens, sematic_embeds, sematic_embeds_lens)) + print("Finish trace decoder") + + compiled_model = mindietorch.compile( + export_model, + inputs = compile_inputs, + precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + soc_version = "Ascend310P3", + ir = "ts" + ) + print("mindietorch compile done !") + compiled_model.save(path) + # compiled_model = torch.jit.load(path) + + print("start to check the percision of decoder.") + sample_encoder = torch.randn((4, 150, 512), dtype=torch.float32) + sample_encoder_lens = torch.tensor([150, 100, 150, 50], dtype=torch.int32) + sample_sematic = torch.randn((4, 50, 512), dtype=torch.float32) + sample_sematic_lens = torch.tensor([50, 30, 50, 10], 
dtype=torch.int32) + mrt_res = compiled_model(sample_encoder.to("npu"), sample_encoder_lens.to("npu"), sample_sematic.to("npu"), sample_sematic_lens.to("npu")) + print("mindie infer done !") + ref_res = decoder(sample_encoder, sample_encoder_lens, sample_sematic, sample_sematic_lens) + print("torch infer done !") + + precision_eval(mrt_res, ref_res) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py new file mode 100644 index 0000000000..bc4f38335b --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py @@ -0,0 +1,251 @@ +import sys +sys.path.append("./FunASR") + +import copy +import time +import torch +import torch_npu +import torch.nn.functional as F + +from funasr.models.bicif_paraformer.model import BiCifParaformer, load_audio_text_image_video, \ + extract_fbank, Hypothesis, ts_prediction_lfr6_standard, postprocess_utils +from funasr.models.transformer.utils.nets_utils import make_pad_mask_new + + +COSINE_THRESHOLD = 0.999 +def cosine_similarity(gt_tensor, pred_tensor): + gt_tensor = gt_tensor.flatten().to(torch.float32) + pred_tensor = pred_tensor.flatten().to(torch.float32) + if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0: + if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True): + return 1.0 + res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6) + res = res.cpu().detach().item() + return res + + +def precision_eval(mrt_res, ref_res): + if not isinstance(mrt_res, (list, tuple)): + mrt_res = [mrt_res, ] + if not isinstance(ref_res, (list, tuple)): + ref_res = [ref_res, ] + + com_res = True + for j, a in zip(mrt_res, ref_res): + res = cosine_similarity(j.to("cpu"), a) + print(res) + if res < COSINE_THRESHOLD: + com_res = False + + if com_res: + print("Compare success ! NPU model have the same output with CPU model !") + else: + print("Compare failed ! 
Outputs of NPU model are not the same with CPU model !") + + +class MindieBiCifParaformer(BiCifParaformer): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.mindie_encoder = torch.jit.load(kwargs["compiled_encoder"]) + self.mindie_decoder = torch.jit.load(kwargs["compiled_decoder"]) + self.mindie_cif = torch.jit.load(kwargs["compiled_cif"]) + self.mindie_cif_timestamp = torch.jit.load(kwargs["compiled_cif_timestamp"]) + + def inference_with_npu( + self, + data_in, + data_lengths=None, + key: list = None, + tokenizer=None, + frontend=None, + **kwargs, + ): + stream = torch_npu.npu.current_stream() + # Step1: load input data + time1 = time.perf_counter() + meta_data = {} + + is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc != None + is_use_lm = ( + kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None + ) + if self.beam_search is None and (is_use_lm or is_use_ctc): + self.init_beam_search(**kwargs) + self.nbest = kwargs.get("nbest", 1) + audio_sample_list = load_audio_text_image_video( + data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000) + ) + + speech, speech_lengths = extract_fbank( + audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend + ) + print("Input shape: ", speech.shape) + speech = speech.to("npu") + speech_lengths = speech_lengths.to("npu") + meta_data["batch_data_time"] = ( + speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 + ) + + time2 = time.perf_counter() + meta_data["load_data"] = time2 - time1 + + # Step2: run with compiled encoder + encoder_out, encoder_out_lens = self.mindie_encoder(speech, speech_lengths) + encoder_out_lens = encoder_out_lens.to(torch.int32) + + encoder_out_mask = ( + ~make_pad_mask_new(encoder_out_lens)[:, None, :] + ).to(encoder_out.device) + hidden, alphas, pre_token_length = ( + self.predictor(encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id) + ) + pre_token_length = pre_token_length.round().to(torch.int32) + stream.synchronize() + time3 = time.perf_counter() + meta_data["encoder"] = time3 - time2 + + + # Step3: divide dynamic loop into multiple smaller loops for calculation + # each with a number of iterations based on kwargs["cif_interval"] + batch_size, len_time, hidden_size = hidden.size() + loop_num = len_time // kwargs["cif_interval"] + 1 + padding_len = loop_num * kwargs["cif_interval"] + padding_size = padding_len - len_time + padded_hidden = F.pad(hidden, (0, 0, 0, padding_size), "constant", 0) + padded_alphas = F.pad(alphas, (0, padding_size), "constant", 0) + + fires_batch = [] + frames_batch = [] + for b in range(batch_size): + fires_list = [] + frames_list = [] + integrate = torch.zeros([1, ], device=hidden.device) + frame = torch.zeros([1, hidden_size], device=hidden.device) + for i in range(loop_num): + cur_hidden = padded_hidden[b : b + 1, i * kwargs["cif_interval"] : (i + 1) * kwargs["cif_interval"], :] + cur_alphas = padded_alphas[b : b + 1, i * kwargs["cif_interval"] : (i + 1) * kwargs["cif_interval"]] + cur_fires, cur_frames, integrate, frame = self.mindie_cif(cur_hidden, cur_alphas, integrate, frame) + fires_list.append(cur_fires) + frames_list.append(cur_frames) + fire = torch.cat(fires_list, 1) + frame = torch.cat(frames_list, 1) + fires_batch.append(fire) + frames_batch.append(frame) + fires = torch.cat(fires_batch, 0) + frames = torch.cat(frames_batch, 0) + + list_ls = [] + len_labels = torch.round(alphas.sum(-1)).int() + max_label_len = len_labels.max() 
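+        # Gather the frames emitted at CIF firing positions (fire >= threshold) for each
+        # sample, then zero-pad every sample to the longest label length in the batch to
+        # form the acoustic embeddings consumed by the decoder.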
+ for b in range(batch_size): + fire = fires[b, :len_time] + l = torch.index_select(frames[b, :len_time, :], 0, torch.nonzero(fire >= self.predictor.threshold).squeeze()) + pad_l = torch.zeros([max_label_len - l.size(0), hidden_size], device=hidden.device) + list_ls.append(torch.cat([l, pad_l], 0)) + acoustic_embeds = torch.stack(list_ls, 0) + token_num_int = torch.max(pre_token_length) + pre_acoustic_embeds = acoustic_embeds[:, :token_num_int, :] + + if torch.max(pre_token_length) < 1: + return [] + stream.synchronize() + time4 = time.perf_counter() + meta_data["calc_predictor"] = time4 - time3 + + + # Step4: run with compiled decoder + decoder_out = self.mindie_decoder(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length) + stream.synchronize() + time5 = time.perf_counter() + meta_data["decoder"] = time5 - time4 + + + # Step5: divide dynamic loop into multiple smaller loops for calculation + # each with a number of iterations based on kwargs["cif_timestamp_interval"] + us_alphas = self.predictor.get_upsample_timestamp(encoder_out, encoder_out_mask, pre_token_length) + len_alphas = us_alphas.shape[1] + loop_num = len_alphas // kwargs["cif_timestamp_interval"] + 1 + padding_len = loop_num * kwargs["cif_timestamp_interval"] + padding_size = padding_len - len_alphas + padded_alphas = F.pad(us_alphas, (0, padding_size), "constant", 0) + + peak_batch = [] + for b in range(batch_size): + peak_list = [] + integrate_alphas = torch.zeros([1], device=alphas.device) + for i in range(loop_num): + cur_alphas = padded_alphas[b:b+1, i * kwargs["cif_timestamp_interval"] : (i + 1) * kwargs["cif_timestamp_interval"]] + peak, integrate_alphas = self.mindie_cif_timestamp(cur_alphas, integrate_alphas) + peak_list.append(peak) + us_peak = torch.cat(peak_list, 1)[:, :len_alphas] + peak_batch.append(us_peak) + us_peaks = torch.cat(peak_batch, 0) + + stream.synchronize() + time6 = time.perf_counter() + meta_data["calc_predictor_timestamp"] = time6 - time5 + + + # Step6: post process + results = [] + b, n, d = decoder_out.size() + for i in range(b): + x = encoder_out[i, : encoder_out_lens[i], :] + am_scores = decoder_out[i, : pre_token_length[i], :] + + yseq = am_scores.argmax(dim=-1) + score = am_scores.max(dim=-1)[0] + score = torch.sum(score, dim=-1) + + # pad with mask tokens to ensure compatibility with sos/eos tokens + yseq = torch.tensor([self.sos] + yseq.tolist() + [self.eos], device=yseq.device) + + nbest_hyps = [Hypothesis(yseq=yseq, score=score)] + + for nbest_idx, hyp in enumerate(nbest_hyps): + # remove sos/eos and get results + last_pos = -1 + if isinstance(hyp.yseq, list): + token_int = hyp.yseq[1:last_pos] + else: + token_int = hyp.yseq[1:last_pos].tolist() + + # remove blank symbol id, which is assumed to be 0 + token_int = list( + filter( + lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int + ) + ) + + # Change integer-ids to tokens + token = tokenizer.ids2tokens(token_int) + + _, timestamp = ts_prediction_lfr6_standard( + us_alphas[i][: encoder_out_lens[i] * 3], + us_peaks[i][: encoder_out_lens[i] * 3], + copy.copy(token), + vad_offset=kwargs.get("begin_time", 0), + ) + + text_postprocessed, time_stamp_postprocessed, word_lists = ( + postprocess_utils.sentence_postprocess(token, timestamp) + ) + + result_i = { + "key": key[i], + "text": text_postprocessed, + "timestamp": time_stamp_postprocessed, + } + + results.append(result_i) + + stream.synchronize() + time7 = time.perf_counter() + meta_data["post_process"] = time7 - time6 + + return results, meta_data 
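The two "divide dynamic loop into multiple smaller loops" steps above follow one pattern: pad the sequence to a multiple of a fixed interval, push it through a statically shaped compiled module chunk by chunk, and carry the running CIF state between calls. Below is a minimal, self-contained sketch of that pattern for illustration only; `CHUNK`, `process_chunk` and `run_in_chunks` are hypothetical names (not part of FunASR or this patch), the threshold 1.0 stands in for the CIF firing threshold, and the real code additionally carries the accumulated `frame` state through the compiled `mindie_cif` module.

```python
import torch
import torch.nn.functional as F

CHUNK = 200  # fixed chunk length, playing the role of cif_interval


def process_chunk(alphas_chunk, integrate):
    """Stand-in for a compiled fixed-shape module: consume CHUNK steps, return new state."""
    fires = []
    for t in range(alphas_chunk.shape[1]):
        integrate = integrate + alphas_chunk[:, t]
        fires.append(integrate.clone())
        # 1.0 is an illustrative firing threshold
        integrate = torch.where(integrate >= 1.0, integrate - 1.0, integrate)
    return torch.stack(fires, dim=1), integrate


def run_in_chunks(alphas):
    """Pad the full sequence to a multiple of CHUNK and feed it chunk by chunk."""
    length = alphas.shape[1]
    loops = length // CHUNK + 1
    padded = F.pad(alphas, (0, loops * CHUNK - length))
    state = torch.zeros(alphas.shape[0])
    outputs = []
    for i in range(loops):
        out, state = process_chunk(padded[:, i * CHUNK:(i + 1) * CHUNK], state)
        outputs.append(out)
    return torch.cat(outputs, dim=1)[:, :length]  # drop the padded tail


print(run_in_chunks(torch.rand(1, 450)).shape)  # torch.Size([1, 450])
```

In compile.py the intervals are fixed at 200 (cif) and 500 (cif_timestamp), so every chunk seen by the compiled modules has the same static shape regardless of audio length.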
diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py new file mode 100644 index 0000000000..87c7635264 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py @@ -0,0 +1,199 @@ +import sys +sys.path.append("./FunASR") + +import torch +import mindietorch + +from mindie_paraformer import precision_eval +from funasr.utils.load_utils import load_audio_text_image_video +from funasr.models.ct_transformer.model import CTTransformer +from funasr.models.ct_transformer.utils import split_to_mini_sentence, split_words + + +class MindiePunc(CTTransformer): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + def forward(self, text, text_lengths): + y, _ = self.punc_forward(text, text_lengths) + _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1) + punctuations = torch.squeeze(indices, dim=1) + + return punctuations + + @staticmethod + def export_ts(punc, path="./compiled_punc.ts"): + print("Begin trace punc!") + + input_shape = (1, 20) + min_shape = (1, -1) + max_shape = (1, -1) + input_speech = torch.randint(1, 10, input_shape, dtype=torch.int32) + input_speech_lengths = torch.tensor([20, ], dtype=torch.int32) + compile_inputs = [mindietorch.Input(min_shape = min_shape, max_shape = max_shape, dtype = torch.int32), + mindietorch.Input(min_shape = (1, ), max_shape = (1, ), dtype = torch.int32)] + + export_model = torch.jit.trace(punc, example_inputs=(input_speech, input_speech_lengths)) + print("Finish trace punc") + + compiled_model = mindietorch.compile( + export_model, + inputs = compile_inputs, + precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + soc_version = "Ascend310P3", + ir = "ts" + ) + print("mindietorch compile done !") + compiled_model.save(path) + # compiled_model = torch.jit.load(path) + + print("start to check the percision of punc model.") + sample_speech = torch.randint(1, 10, (1, 10), dtype=torch.int32) + sample_speech_lengths = torch.tensor([10, ], dtype=torch.int32) + mrt_res = compiled_model(sample_speech.to("npu"), sample_speech_lengths.to("npu")) + print("mindie infer done !") + ref_res = punc(sample_speech, sample_speech_lengths) + print("torch infer done !") + + precision_eval(mrt_res, ref_res) + + +class MindieCTTransformer(CTTransformer): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.mindie_punc = torch.jit.load(kwargs["compiled_punc"]) + + def inference( + self, + data_in, + data_lengths=None, + key: list = None, + tokenizer=None, + frontend=None, + **kwargs, + ): + assert len(data_in) == 1 + text = load_audio_text_image_video(data_in, data_type=kwargs.get("kwargs", "text"))[0] + + split_size = kwargs.get("split_size", 20) + + tokens = split_words(text, jieba_usr_dict=self.jieba_usr_dict) + tokens_int = tokenizer.encode(tokens) + + mini_sentences = split_to_mini_sentence(tokens, split_size) + mini_sentences_id = split_to_mini_sentence(tokens_int, split_size) + assert len(mini_sentences) == len(mini_sentences_id) + + mini_sentences_id = [torch.unsqueeze(torch.tensor(id, dtype=torch.int32), 0).to("npu") for id in mini_sentences_id] + + cache_sent = [] + cache_sent_id = torch.tensor([[]], dtype=torch.int32).to("npu") + new_mini_sentence = "" + cache_pop_trigger_limit = 200 + results = [] + meta_data = {} + + for mini_sentence_i in range(len(mini_sentences)): + mini_sentence = mini_sentences[mini_sentence_i] + mini_sentence_id = mini_sentences_id[mini_sentence_i] + 
mini_sentence = cache_sent + mini_sentence + mini_sentence_id = torch.cat([cache_sent_id, mini_sentence_id], dim=1) + + text = mini_sentence_id + text_lengths = torch.tensor([text.shape[1], ], dtype=torch.int32, device=text.device) + punctuations = self.mindie_punc(text, text_lengths) + + assert punctuations.size()[0] == len(mini_sentence) + + # Search for the last Period/QuestionMark as cache + if mini_sentence_i < len(mini_sentences) - 1: + sentenceEnd = -1 + last_comma_index = -1 + for i in range(len(punctuations) - 2, 1, -1): + if ( + self.punc_list[punctuations[i]] == "。" + or self.punc_list[punctuations[i]] == "?" + ): + sentenceEnd = i + break + if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",": + last_comma_index = i + + if ( + sentenceEnd < 0 + and len(mini_sentence) > cache_pop_trigger_limit + and last_comma_index >= 0 + ): + # The sentence it too long, cut off at a comma. + sentenceEnd = last_comma_index + punctuations[sentenceEnd] = self.sentence_end_id + cache_sent = mini_sentence[sentenceEnd + 1 :] + cache_sent_id = mini_sentence_id[:, sentenceEnd + 1 :] + mini_sentence = mini_sentence[0 : sentenceEnd + 1] + punctuations = punctuations[0 : sentenceEnd + 1] + + words_with_punc = [] + for i in range(len(mini_sentence)): + if ( + i == 0 + or self.punc_list[punctuations[i - 1]] == "。" + or self.punc_list[punctuations[i - 1]] == "?" + ) and len(mini_sentence[i][0].encode()) == 1: + mini_sentence[i] = mini_sentence[i].capitalize() + if i == 0: + if len(mini_sentence[i][0].encode()) == 1: + mini_sentence[i] = " " + mini_sentence[i] + if i > 0: + if ( + len(mini_sentence[i][0].encode()) == 1 + and len(mini_sentence[i - 1][0].encode()) == 1 + ): + mini_sentence[i] = " " + mini_sentence[i] + words_with_punc.append(mini_sentence[i]) + if self.punc_list[punctuations[i]] != "_": + punc_res = self.punc_list[punctuations[i]] + if len(mini_sentence[i][0].encode()) == 1: + if punc_res == ",": + punc_res = "," + elif punc_res == "。": + punc_res = "." + elif punc_res == "?": + punc_res = "?" + words_with_punc.append(punc_res) + new_mini_sentence += "".join(words_with_punc) + # Add Period for the end of the sentence + new_mini_sentence_out = new_mini_sentence + if mini_sentence_i == len(mini_sentences) - 1: + if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、": + new_mini_sentence_out = new_mini_sentence[:-1] + "。" + elif new_mini_sentence[-1] == ",": + new_mini_sentence_out = new_mini_sentence[:-1] + "." + elif ( + new_mini_sentence[-1] != "。" + and new_mini_sentence[-1] != "?" + and len(new_mini_sentence[-1].encode()) != 1 + ): + new_mini_sentence_out = new_mini_sentence + "。" + if len(punctuations): + punctuations[-1] = 2 + elif ( + new_mini_sentence[-1] != "." + and new_mini_sentence[-1] != "?" + and len(new_mini_sentence[-1].encode()) == 1 + ): + new_mini_sentence_out = new_mini_sentence + "." 
+ if len(punctuations): + punctuations[-1] = 2 + + result_i = {"key": key[0], "text": new_mini_sentence_out} + results.append(result_i) + return results, meta_data -- Gitee From 3db33ce4080b6eaf27fbd3e43832933d4f7ef847 Mon Sep 17 00:00:00 2001 From: brjiang Date: Mon, 19 Aug 2024 11:03:13 +0800 Subject: [PATCH 02/12] =?UTF-8?q?=E5=88=A0=E9=99=A4torch=5Fnpu?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 16 ++-- .../built-in/audio/Paraformer/compile.py | 3 +- .../audio/Paraformer/mindie_auto_model.py | 4 +- .../built-in/audio/Paraformer/mindie_cif.py | 5 +- .../Paraformer/mindie_encoder_decoder.py | 21 ++++- .../audio/Paraformer/mindie_paraformer.py | 77 ++++++++----------- .../built-in/audio/Paraformer/mindie_punc.py | 9 ++- 7 files changed, 68 insertions(+), 67 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md index 2bacbe5e0a..ebf7dfae4a 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -46,11 +46,7 @@ source /usr/local/Ascend/mindie/set_env.sh ``` -2. 安装torch_npu - - 请根据[昇腾文档](https://www.hiascend.com/document/detail/zh/mindie/10RC2/mindietorch/Torchdev/mindie_torch0017.html)安装合适版本的torch_npu,并手动编译libtorch_npu_bridge.so - -3. 获取Funasr源码 +2. 获取Funasr源码 ``` git clone https://github.com/modelscope/FunASR.git @@ -59,7 +55,7 @@ cd .. ``` -4. 修改Funasr的源码,先将patch文件移动至Funasr的工程路径下,而后将patch应用到代码中(若patch应用失败,则需要手动进行修改) +3. 修改Funasr的源码,先将patch文件移动至Funasr的工程路径下,而后将patch应用到代码中(若patch应用失败,则需要手动进行修改) ``` mv mindie.patch ./FunASR cd ./FunASR @@ -67,7 +63,7 @@ cd .. ``` -5. 获取模型文件 +4. 获取模型文件 将[Paraformer](https://modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)的模型文件下载到本地,并保存在./model文件夹下 @@ -94,13 +90,17 @@ └── ... ``` -6. 安装Funasr的依赖 +5. 安装Funasr的依赖 ``` sudo apt install ffmpeg pip install jieba ``` ## 模型推理 +0. 设置mindie内存池上限为12G,执行如下命令设置环境变量 + ``` + export TORCH_AIE_NPU_CACHE_MAX_SIZE=12 + ``` 1. 
模型编译及样本测试 执行下述命令进行模型编译 diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py index 5629dada48..224f4a11e1 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py @@ -17,7 +17,6 @@ import argparse import torch -import torch_npu import mindietorch from mindie_auto_model import MindieAutoModel @@ -77,7 +76,7 @@ if __name__ == "__main__": _ = model.generate(input=args.sample_path) # run with sample audio - iteration_num = 100 + iteration_num = 10 print("Begin runing with sample audio with {} iterations".format(iteration_num)) total_dict_time = {} diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py index 8251f7533f..0e33b0b2e4 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py @@ -41,7 +41,7 @@ class MindieAutoModel(AutoModel): logging.info("Building punc model.") punc_kwargs["model"] = punc_model punc_kwargs["model_revision"] = kwargs.get("punc_model_revision", "master") - punc_kwargs["device"] = "npu" + punc_kwargs["device"] = "cpu" punc_kwargs["compile_type"] = "punc" punc_kwargs["compiled_punc"] = kwargs["compiled_punc"] punc_model, punc_kwargs = self.build_model_with_mindie(**punc_kwargs) @@ -217,8 +217,6 @@ class MindieAutoModel(AutoModel): excludes=kwargs.get("excludes", None), ) - model.to("npu") - return model, kwargs def inference_with_asr(self, input, input_len=None, model=None, kwargs=None, key=None, **cfg): diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py index 71b761187d..71f18f6c03 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py @@ -79,7 +79,10 @@ class MindieCif(torch.nn.Module): def forward(self, hidden, alphas, integrate, frame): fires, frames, integrate_new, frame_new = cif(hidden, alphas, integrate, frame, self.threshold) - return fires, frames, integrate_new, frame_new + + frame = torch.index_select(frames[0, :, :], 0, torch.nonzero(fires[0, :] >= self.threshold).squeeze(1)) + + return frame, integrate_new, frame_new def export_ts(self, path="./compiled_cif.ts"): print("Begin trace cif!") diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py index 55ace9f3d4..db8e59fa41 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -6,6 +6,7 @@ import mindietorch from mindie_paraformer import precision_eval from funasr.models.bicif_paraformer.model import Paraformer +from funasr.models.transformer.utils.nets_utils import make_pad_mask_new class MindieEncoder(Paraformer): @@ -19,7 +20,16 @@ class MindieEncoder(Paraformer): def forward(self, speech, speech_length): encoder_out, encoder_out_lens = self.encode(speech, speech_length) - return encoder_out, encoder_out_lens + encoder_out_lens = encoder_out_lens.to(torch.int32) + + encoder_out_mask = ( + ~make_pad_mask_new(encoder_out_lens)[:, None, :] + ).to(encoder_out.device) + hidden, alphas, pre_token_length = ( + self.predictor(encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id) + ) + + return 
encoder_out, encoder_out_lens, hidden, alphas, pre_token_length @staticmethod def export_ts(encoder, path="./compiled_encoder.ts"): @@ -70,7 +80,14 @@ class MindieDecoder(Paraformer): decoder_outs = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, pre_token_length) decoder_out = decoder_outs[0] decoder_out = torch.log_softmax(decoder_out, dim=-1) - return decoder_out + + encoder_out_mask = ( + ~make_pad_mask_new(encoder_out_lens)[:, None, :] + ).to(encoder_out.device) + + us_alphas = self.predictor.get_upsample_timestamp(encoder_out, encoder_out_mask, pre_token_length) + + return decoder_out, us_alphas @staticmethod def export_ts(decoder, path="./compiled_decoder.ts"): diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py index bc4f38335b..73876484f0 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py @@ -4,7 +4,6 @@ sys.path.append("./FunASR") import copy import time import torch -import torch_npu import torch.nn.functional as F from funasr.models.bicif_paraformer.model import BiCifParaformer, load_audio_text_image_video, \ @@ -65,7 +64,6 @@ class MindieBiCifParaformer(BiCifParaformer): frontend=None, **kwargs, ): - stream = torch_npu.npu.current_stream() # Step1: load input data time1 = time.perf_counter() meta_data = {} @@ -85,27 +83,23 @@ class MindieBiCifParaformer(BiCifParaformer): audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend ) print("Input shape: ", speech.shape) - speech = speech.to("npu") - speech_lengths = speech_lengths.to("npu") meta_data["batch_data_time"] = ( speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 ) + speech = speech.to("npu") + speech_lengths = speech_lengths.to("npu") time2 = time.perf_counter() meta_data["load_data"] = time2 - time1 # Step2: run with compiled encoder - encoder_out, encoder_out_lens = self.mindie_encoder(speech, speech_lengths) - encoder_out_lens = encoder_out_lens.to(torch.int32) - - encoder_out_mask = ( - ~make_pad_mask_new(encoder_out_lens)[:, None, :] - ).to(encoder_out.device) - hidden, alphas, pre_token_length = ( - self.predictor(encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id) - ) + encoder_out, encoder_out_lens, hidden, alphas, pre_token_length = self.mindie_encoder(speech, speech_lengths) + + hidden = hidden.to("cpu") + alphas = alphas.to("cpu") + pre_token_length = pre_token_length.to("cpu") + pre_token_length = pre_token_length.round().to(torch.int32) - stream.synchronize() time3 = time.perf_counter() meta_data["encoder"] = time3 - time2 @@ -119,55 +113,43 @@ class MindieBiCifParaformer(BiCifParaformer): padded_hidden = F.pad(hidden, (0, 0, 0, padding_size), "constant", 0) padded_alphas = F.pad(alphas, (0, padding_size), "constant", 0) - fires_batch = [] + len_labels = torch.round(alphas.sum(-1)).int() + max_label_len = len_labels.max() + frames_batch = [] for b in range(batch_size): - fires_list = [] frames_list = [] - integrate = torch.zeros([1, ], device=hidden.device) - frame = torch.zeros([1, hidden_size], device=hidden.device) + integrate = torch.zeros([1, ]).to("npu") + frame = torch.zeros([1, hidden_size]).to("npu") for i in range(loop_num): cur_hidden = padded_hidden[b : b + 1, i * kwargs["cif_interval"] : (i + 1) * kwargs["cif_interval"], :] cur_alphas = padded_alphas[b : b + 1, i * kwargs["cif_interval"] : (i + 1) * kwargs["cif_interval"]] - 
cur_fires, cur_frames, integrate, frame = self.mindie_cif(cur_hidden, cur_alphas, integrate, frame) - fires_list.append(cur_fires) - frames_list.append(cur_frames) - fire = torch.cat(fires_list, 1) - frame = torch.cat(frames_list, 1) - fires_batch.append(fire) - frames_batch.append(frame) - fires = torch.cat(fires_batch, 0) - frames = torch.cat(frames_batch, 0) - - list_ls = [] - len_labels = torch.round(alphas.sum(-1)).int() - max_label_len = len_labels.max() - for b in range(batch_size): - fire = fires[b, :len_time] - l = torch.index_select(frames[b, :len_time, :], 0, torch.nonzero(fire >= self.predictor.threshold).squeeze()) - pad_l = torch.zeros([max_label_len - l.size(0), hidden_size], device=hidden.device) - list_ls.append(torch.cat([l, pad_l], 0)) - acoustic_embeds = torch.stack(list_ls, 0) + cur_frames, integrate, frame = self.mindie_cif(cur_hidden.to("npu"), cur_alphas.to("npu"), integrate, frame) + frames_list.append(cur_frames.to("cpu")) + frame = torch.cat(frames_list, 0) + pad_frame = torch.zeros([max_label_len - frame.size(0), hidden_size], device=hidden.device) + frames_batch.append(torch.cat([frame, pad_frame], 0)) + + acoustic_embeds = torch.stack(frames_batch, 0) token_num_int = torch.max(pre_token_length) pre_acoustic_embeds = acoustic_embeds[:, :token_num_int, :] if torch.max(pre_token_length) < 1: return [] - stream.synchronize() time4 = time.perf_counter() meta_data["calc_predictor"] = time4 - time3 # Step4: run with compiled decoder - decoder_out = self.mindie_decoder(encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length) - stream.synchronize() + decoder_out, us_alphas = self.mindie_decoder(encoder_out, encoder_out_lens, + pre_acoustic_embeds.to("npu"), pre_token_length.to("npu")) + us_alphas = us_alphas.to("cpu") time5 = time.perf_counter() meta_data["decoder"] = time5 - time4 # Step5: divide dynamic loop into multiple smaller loops for calculation # each with a number of iterations based on kwargs["cif_timestamp_interval"] - us_alphas = self.predictor.get_upsample_timestamp(encoder_out, encoder_out_mask, pre_token_length) len_alphas = us_alphas.shape[1] loop_num = len_alphas // kwargs["cif_timestamp_interval"] + 1 padding_len = loop_num * kwargs["cif_timestamp_interval"] @@ -177,25 +159,27 @@ class MindieBiCifParaformer(BiCifParaformer): peak_batch = [] for b in range(batch_size): peak_list = [] - integrate_alphas = torch.zeros([1], device=alphas.device) + integrate_alphas = torch.zeros([1]).to("npu") for i in range(loop_num): cur_alphas = padded_alphas[b:b+1, i * kwargs["cif_timestamp_interval"] : (i + 1) * kwargs["cif_timestamp_interval"]] - peak, integrate_alphas = self.mindie_cif_timestamp(cur_alphas, integrate_alphas) - peak_list.append(peak) + peak, integrate_alphas = self.mindie_cif_timestamp(cur_alphas.to("npu"), integrate_alphas) + peak_list.append(peak.to("cpu")) us_peak = torch.cat(peak_list, 1)[:, :len_alphas] peak_batch.append(us_peak) us_peaks = torch.cat(peak_batch, 0) - stream.synchronize() time6 = time.perf_counter() meta_data["calc_predictor_timestamp"] = time6 - time5 # Step6: post process + decoder_out = decoder_out.to("cpu") + us_alphas = us_alphas.to("cpu") + us_peaks = us_peaks.to("cpu") + encoder_out_lens = encoder_out_lens.to("cpu") results = [] b, n, d = decoder_out.size() for i in range(b): - x = encoder_out[i, : encoder_out_lens[i], :] am_scores = decoder_out[i, : pre_token_length[i], :] yseq = am_scores.argmax(dim=-1) @@ -244,7 +228,6 @@ class MindieBiCifParaformer(BiCifParaformer): results.append(result_i) - 
stream.synchronize() time7 = time.perf_counter() meta_data["post_process"] = time7 - time6 diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py index 87c7635264..4aa895f9f7 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py @@ -93,10 +93,10 @@ class MindieCTTransformer(CTTransformer): mini_sentences_id = split_to_mini_sentence(tokens_int, split_size) assert len(mini_sentences) == len(mini_sentences_id) - mini_sentences_id = [torch.unsqueeze(torch.tensor(id, dtype=torch.int32), 0).to("npu") for id in mini_sentences_id] + mini_sentences_id = [torch.unsqueeze(torch.tensor(id, dtype=torch.int32), 0) for id in mini_sentences_id] cache_sent = [] - cache_sent_id = torch.tensor([[]], dtype=torch.int32).to("npu") + cache_sent_id = torch.tensor([[]], dtype=torch.int32) new_mini_sentence = "" cache_pop_trigger_limit = 200 results = [] @@ -108,9 +108,10 @@ class MindieCTTransformer(CTTransformer): mini_sentence = cache_sent + mini_sentence mini_sentence_id = torch.cat([cache_sent_id, mini_sentence_id], dim=1) - text = mini_sentence_id - text_lengths = torch.tensor([text.shape[1], ], dtype=torch.int32, device=text.device) + text = mini_sentence_id.to("npu") + text_lengths = torch.tensor([text.shape[1], ], dtype=torch.int32).to("npu") punctuations = self.mindie_punc(text, text_lengths) + punctuations = punctuations.to("cpu") assert punctuations.size()[0] == len(mini_sentence) -- Gitee From 735c4e61be2aa5a95f8d98dfa7c5ef29ff723f5b Mon Sep 17 00:00:00 2001 From: brjiang Date: Mon, 19 Aug 2024 17:10:24 +0800 Subject: [PATCH 03/12] =?UTF-8?q?=E6=96=B0=E5=A2=9Etrace=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 22 ++- .../built-in/audio/Paraformer/compile.py | 4 +- .../audio/Paraformer/mindie_auto_model.py | 2 +- .../Paraformer/mindie_encoder_decoder.py | 27 ++-- .../audio/Paraformer/trace_decoder.py | 137 ++++++++++++++++++ 5 files changed, 177 insertions(+), 15 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md index ebf7dfae4a..e16b89fc5d 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -97,12 +97,28 @@ ``` ## 模型推理 -0. 设置mindie内存池上限为12G,执行如下命令设置环境变量 +1. 设置mindie内存池上限为12G,执行如下命令设置环境变量 ``` export TORCH_AIE_NPU_CACHE_MAX_SIZE=12 ``` -1. 模型编译及样本测试 +2. (可选) 若CPU为aarch64的架构,则在编译decoder时会出现RuntimeError: could not create a primitive descriptor for a matmul primitive,此时需要新创建一个Python环境(推荐使用conda创建),而后使用如下命令安装torch 2.2.1 + ``` + pip install torch==2.2.1 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cpu + ``` + + 而后,执行如下脚本将decoder序列化 + ``` + python trace_decoder.py --model ./model --traced_decoder ./compiled_model/traced_decoder.pt + ``` + 参数说明: + - --model:预训练模型路径 + - --traced_decoder: 序列化后的decoder保存路径 + + 该步骤仅获得一个序列化后的decoder模型,后续模型编译仍需回到原始环境 + + +3. 
模型编译及样本测试 执行下述命令进行模型编译 ```bash python compile.py \ @@ -114,6 +130,7 @@ --compiled_cif ./compiled_model/compiled_cif.pt \ --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ --compiled_punc ./compiled_model/compiled_punc.ts \ + --traced_decoder ./compiled_model/traced_decoder.pt \ --batch_size 16 \ --sample_path ./model/example/asr_example.wav \ --skip_compile @@ -128,6 +145,7 @@ - --compiled_cif:编译后的cif函数的路径 - --compiled_cif_timestamp:编译后的cif_timestamp函数的路径 - --compiled_punc:编译后的punc的路径 + - --traced_decoder:预先序列化的decoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 - --batch_size:Paraformer模型所使用的batch_size - --sample_path:用于测试模型的样本音频路径 - --skip_compile:是否进行模型编译,若已经完成编译,后续使用可使用该参数跳过编译,若为第一次使用不能指定该参数 \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py index 224f4a11e1..ebe823195c 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py @@ -42,6 +42,8 @@ if __name__ == "__main__": help="path to save compiled cif timestamp function") parser.add_argument("--compiled_punc", default="./compiled_model/compiled_punc.ts", help="path to save compiled punc model") + parser.add_argument("--traced_decoder", default=None, + help="path to save compiled punc model") parser.add_argument("--batch_size", default=16, type=int, help="batch size of paraformer model") parser.add_argument("--sample_path", default="./audio/test_1.wav", @@ -56,7 +58,7 @@ if __name__ == "__main__": MindieAutoModel.export_model(model=args.model_punc, compiled_path=args.compiled_punc, compile_type="punc") MindieAutoModel.export_model(model=args.model, compiled_encoder=args.compiled_encoder, compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, - compiled_cif_timestamp=args.compiled_cif_timestamp, + compiled_cif_timestamp=args.compiled_cif_timestamp, traced_decoder=args.traced_decoder, cif_interval=200, cif_timestamp_interval=500, compile_type="paraformer") print("Finish compiling sub-models") else: diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py index 0e33b0b2e4..93f1a6c65d 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py @@ -148,7 +148,7 @@ class MindieAutoModel(AutoModel): scope_map=kwargs.get("scope_map", []), excludes=kwargs.get("excludes", None), ) - MindieDecoder.export_ts(decoder, kwargs["compiled_decoder"]) + MindieDecoder.export_ts(decoder, kwargs["compiled_decoder"], kwargs["traced_decoder"]) # compile cif mindie_cif = MindieCif(decoder.predictor.threshold, kwargs["cif_interval"]) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py index db8e59fa41..3a9b7436c5 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -1,4 +1,5 @@ import sys +import os sys.path.append("./FunASR") import torch @@ -90,7 +91,7 @@ class MindieDecoder(Paraformer): return decoder_out, us_alphas @staticmethod - def export_ts(decoder, path="./compiled_decoder.ts"): + def export_ts(decoder, path="./compiled_decoder.ts", traced_path="./traced_decoder.ts"): print("Begin trace decoder!") input_shape1 = (2, 939, 512) @@ -101,18 +102,22 @@ class 
MindieDecoder(Paraformer): min_shape2 = (-1, -1, 512) max_shape2 = (-1, -1, 512) - encoder_out = torch.randn(input_shape1, dtype=torch.float32) - encoder_out_lens = torch.tensor([939, 500], dtype=torch.int32) - sematic_embeds = torch.randn(input_shape2, dtype=torch.float32) - sematic_embeds_lens = torch.tensor([261, 100], dtype=torch.int32) + if traced_path is not None and os.path.exists(traced_path): + export_model = torch.load(traced_path) + print("load existing traced_decoder") + else: + encoder_out = torch.randn(input_shape1, dtype=torch.float32) + encoder_out_lens = torch.tensor([939, 500], dtype=torch.int32) + sematic_embeds = torch.randn(input_shape2, dtype=torch.float32) + sematic_embeds_lens = torch.tensor([261, 100], dtype=torch.int32) + + export_model = torch.jit.trace(decoder, example_inputs=(encoder_out, encoder_out_lens, sematic_embeds, sematic_embeds_lens)) + print("Finish trace decoder") compile_inputs = [mindietorch.Input(min_shape = min_shape1, max_shape = max_shape1, dtype = torch.float32), - mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32), - mindietorch.Input(min_shape = min_shape2, max_shape = max_shape2, dtype = torch.float32), - mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32)] - - export_model = torch.jit.trace(decoder, example_inputs=(encoder_out, encoder_out_lens, sematic_embeds, sematic_embeds_lens)) - print("Finish trace decoder") + mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32), + mindietorch.Input(min_shape = min_shape2, max_shape = max_shape2, dtype = torch.float32), + mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32)] compiled_model = mindietorch.compile( export_model, diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py new file mode 100644 index 0000000000..155a7912e7 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import torch + +import sys +sys.path.append("./FunASR") + +from funasr.auto.auto_model import AutoModel, download_model, tables, deep_update, \ + load_pretrained_model, prepare_data_iterator +from funasr.models.bicif_paraformer.model import Paraformer +from funasr.models.transformer.utils.nets_utils import make_pad_mask_new + + +class ParaformerDecoder(Paraformer): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + def forward(self, encoder_out, encoder_out_lens, sematic_embeds, pre_token_length): + decoder_outs = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, pre_token_length) + decoder_out = decoder_outs[0] + decoder_out = torch.log_softmax(decoder_out, dim=-1) + + encoder_out_mask = ( + ~make_pad_mask_new(encoder_out_lens)[:, None, :] + ).to(encoder_out.device) + + us_alphas = self.predictor.get_upsample_timestamp(encoder_out, encoder_out_mask, pre_token_length) + + return decoder_out, us_alphas + + def trace_model(decoder, path="./traced_decoder.ts"): + print("Begin trace decoder!") + + input_shape1 = (2, 939, 512) + input_shape2 = (2, 261, 512) + + encoder_out = torch.randn(input_shape1, dtype=torch.float32) + encoder_out_lens = torch.tensor([939, 500], dtype=torch.int32) + sematic_embeds = torch.randn(input_shape2, dtype=torch.float32) + sematic_embeds_lens = torch.tensor([261, 100], dtype=torch.int32) + + trace_model = torch.jit.trace(decoder, example_inputs=(encoder_out, encoder_out_lens, sematic_embeds, sematic_embeds_lens)) + trace_model.save(path) + print("Finish trace decoder") + + +class AutoModelDecoder(AutoModel): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + @staticmethod + def trace_decoder(**kwargs): + # load model config + assert "model" in kwargs + if "model_conf" not in kwargs: + print("download models from model hub: {}".format(kwargs.get("hub", "ms"))) + kwargs = download_model(**kwargs) + + kwargs["batch_size"] = 1 + kwargs["device"] = "cpu" + + # build tokenizer + tokenizer = kwargs.get("tokenizer", None) + if tokenizer is not None: + tokenizer_class = tables.tokenizer_classes.get(tokenizer) + tokenizer = tokenizer_class(**kwargs.get("tokenizer_conf", {})) + kwargs["token_list"] = ( + tokenizer.token_list if hasattr(tokenizer, "token_list") else None + ) + kwargs["token_list"] = ( + tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else kwargs["token_list"] + ) + vocab_size = len(kwargs["token_list"]) if kwargs["token_list"] is not None else -1 + if vocab_size == -1 and hasattr(tokenizer, "get_vocab_size"): + vocab_size = tokenizer.get_vocab_size() + else: + vocab_size = -1 + kwargs["tokenizer"] = tokenizer + + # build frontend + frontend = kwargs.get("frontend", None) + kwargs["input_size"] = None + if frontend is not None: + frontend_class = tables.frontend_classes.get(frontend) + frontend = frontend_class(**kwargs.get("frontend_conf", {})) + kwargs["input_size"] = ( + frontend.output_size() if hasattr(frontend, "output_size") else None + ) + kwargs["frontend"] = frontend + + model_conf = {} + deep_update(model_conf, kwargs.get("model_conf", {})) + deep_update(model_conf, kwargs) + init_param = kwargs.get("init_param", None) + + decoder = ParaformerDecoder(**model_conf, vocab_size=vocab_size) + decoder.eval() + print(f"Loading pretrained params from {init_param}") + load_pretrained_model( + model=decoder, + path=init_param, + ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), + oss_bucket=kwargs.get("oss_bucket", None), + scope_map=kwargs.get("scope_map", 
[]), + excludes=kwargs.get("excludes", None), + ) + ParaformerDecoder.trace_model(decoder, kwargs["traced_decoder"]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", default="./model", + help="path of pretrained model") + parser.add_argument("--traced_decoder", default="./compiled_model/traced_decoder.ts", + help="path to save compiled decoder") + args = parser.parse_args() + + AutoModelDecoder.trace_decoder(model=args.model, traced_decoder=args.traced_decoder) \ No newline at end of file -- Gitee From dcf2d3ed2e4fe182fef59a93e1db26af1e374b60 Mon Sep 17 00:00:00 2001 From: brjiang Date: Mon, 19 Aug 2024 18:03:10 +0800 Subject: [PATCH 04/12] =?UTF-8?q?=E9=80=82=E9=85=8Dencoder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 16 ++++-- .../built-in/audio/Paraformer/compile.py | 7 ++- .../audio/Paraformer/mindie_auto_model.py | 2 +- .../Paraformer/mindie_encoder_decoder.py | 20 +++++--- .../audio/Paraformer/trace_decoder.py | 51 ++++++++++++++++++- 5 files changed, 80 insertions(+), 16 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md index e16b89fc5d..c4fb83e671 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -102,20 +102,24 @@ export TORCH_AIE_NPU_CACHE_MAX_SIZE=12 ``` -2. (可选) 若CPU为aarch64的架构,则在编译decoder时会出现RuntimeError: could not create a primitive descriptor for a matmul primitive,此时需要新创建一个Python环境(推荐使用conda创建),而后使用如下命令安装torch 2.2.1 +2. (可选) 若CPU为aarch64的架构,则在编译encoder和decoder时会出现RuntimeError: could not create a primitive descriptor for a matmul primitive,此时需要新创建一个Python环境(推荐使用conda创建),而后使用如下命令安装torch 2.2.1 ``` - pip install torch==2.2.1 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cpu + pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu ``` 而后,执行如下脚本将decoder序列化 - ``` - python trace_decoder.py --model ./model --traced_decoder ./compiled_model/traced_decoder.pt + ```bash + python trace_decoder.py \ + --model ./model \ + --traced_encoder ./compiled_model/traced_encoder.pt \ + --traced_decoder ./compiled_model/traced_decoder.pt ``` 参数说明: - --model:预训练模型路径 + - --traced_encoder: 序列化后的encoder保存路径 - --traced_decoder: 序列化后的decoder保存路径 - 该步骤仅获得一个序列化后的decoder模型,后续模型编译仍需回到原始环境 + 该步骤获得序列化后的encoder和decoder模型,后续模型编译仍需回到原始环境 3. 
模型编译及样本测试 @@ -130,6 +134,7 @@ --compiled_cif ./compiled_model/compiled_cif.pt \ --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ --compiled_punc ./compiled_model/compiled_punc.ts \ + --traced_encoder ./compiled_model/traced_encoder.pt \ --traced_decoder ./compiled_model/traced_decoder.pt \ --batch_size 16 \ --sample_path ./model/example/asr_example.wav \ @@ -145,6 +150,7 @@ - --compiled_cif:编译后的cif函数的路径 - --compiled_cif_timestamp:编译后的cif_timestamp函数的路径 - --compiled_punc:编译后的punc的路径 + - --traced_encoder:预先序列化的encoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 - --traced_decoder:预先序列化的decoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 - --batch_size:Paraformer模型所使用的batch_size - --sample_path:用于测试模型的样本音频路径 diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py index ebe823195c..3610458379 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py @@ -42,8 +42,10 @@ if __name__ == "__main__": help="path to save compiled cif timestamp function") parser.add_argument("--compiled_punc", default="./compiled_model/compiled_punc.ts", help="path to save compiled punc model") + parser.add_argument("--traced_encoder", default=None, + help="path to save traced encoder model") parser.add_argument("--traced_decoder", default=None, - help="path to save compiled punc model") + help="path to save traced decoder model") parser.add_argument("--batch_size", default=16, type=int, help="batch size of paraformer model") parser.add_argument("--sample_path", default="./audio/test_1.wav", @@ -58,7 +60,8 @@ if __name__ == "__main__": MindieAutoModel.export_model(model=args.model_punc, compiled_path=args.compiled_punc, compile_type="punc") MindieAutoModel.export_model(model=args.model, compiled_encoder=args.compiled_encoder, compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, - compiled_cif_timestamp=args.compiled_cif_timestamp, traced_decoder=args.traced_decoder, + compiled_cif_timestamp=args.compiled_cif_timestamp, + traced_encoder=args.traced_encoder, traced_decoder=args.traced_decoder, cif_interval=200, cif_timestamp_interval=500, compile_type="paraformer") print("Finish compiling sub-models") else: diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py index 93f1a6c65d..b70627eb97 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py @@ -134,7 +134,7 @@ class MindieAutoModel(AutoModel): scope_map=kwargs.get("scope_map", []), excludes=kwargs.get("excludes", None), ) - MindieEncoder.export_ts(encoder, kwargs["compiled_encoder"]) + MindieEncoder.export_ts(encoder, kwargs["compiled_encoder"], kwargs["traced_encoder"]) # compile decoder decoder = MindieDecoder(**model_conf, vocab_size=vocab_size) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py index 3a9b7436c5..57c17f21f3 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -33,19 +33,25 @@ class MindieEncoder(Paraformer): return encoder_out, encoder_out_lens, hidden, alphas, pre_token_length @staticmethod - def export_ts(encoder, path="./compiled_encoder.ts"): + def export_ts(encoder, 
path="./compiled_encoder.ts", traced_path="./traced_encoder.ts"): print("Begin trace encoder!") input_shape = (2, 50, 560) min_shape = (-1, -1, 560) max_shape = (-1, -1, 560) - input_speech = torch.randn(input_shape, dtype=torch.float32) - input_speech_lens = torch.tensor([50, 25], dtype=torch.int32) - compile_inputs = [mindietorch.Input(min_shape = min_shape, max_shape = max_shape, dtype = torch.float32), - mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32)] + + + if traced_path is not None and os.path.exists(traced_path): + export_model = torch.load(traced_path) + print("load existing traced_encoder") + else: + input_speech = torch.randn(input_shape, dtype=torch.float32) + input_speech_lens = torch.tensor([50, 25], dtype=torch.int32) - export_model = torch.jit.trace(encoder, example_inputs=(input_speech, input_speech_lens)) - print("Finish trace encoder") + export_model = torch.jit.trace(encoder, example_inputs=(input_speech, input_speech_lens)) + + compile_inputs = [mindietorch.Input(min_shape = min_shape, max_shape = max_shape, dtype = torch.float32), + mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32)] compiled_model = mindietorch.compile( export_model, diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py index 155a7912e7..44bb0e00f8 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py @@ -26,6 +26,40 @@ from funasr.models.bicif_paraformer.model import Paraformer from funasr.models.transformer.utils.nets_utils import make_pad_mask_new +class ParaformerEncoder(Paraformer): + def __init__( + self, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + def forward(self, speech, speech_length): + encoder_out, encoder_out_lens = self.encode(speech, speech_length) + + encoder_out_lens = encoder_out_lens.to(torch.int32) + + encoder_out_mask = ( + ~make_pad_mask_new(encoder_out_lens)[:, None, :] + ).to(encoder_out.device) + hidden, alphas, pre_token_length = ( + self.predictor(encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id) + ) + + return encoder_out, encoder_out_lens, hidden, alphas, pre_token_length + + def trace_model(encoder, path="./traced_encoder.ts"): + print("Begin trace encoder!") + + input_shape = (2, 50, 560) + input_speech = torch.randn(input_shape, dtype=torch.float32) + input_speech_lens = torch.tensor([50, 25], dtype=torch.int32) + + trace_model = torch.jit.trace(encoder, example_inputs=(input_speech, input_speech_lens)) + trace_model.save(path) + print("Finish trace encoder") + + class ParaformerDecoder(Paraformer): def __init__( self, @@ -112,6 +146,19 @@ class AutoModelDecoder(AutoModel): deep_update(model_conf, kwargs) init_param = kwargs.get("init_param", None) + encoder = ParaformerEncoder(**model_conf, vocab_size=vocab_size) + encoder.eval() + print(f"Loading pretrained params from {init_param}") + load_pretrained_model( + model=encoder, + path=init_param, + ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), + oss_bucket=kwargs.get("oss_bucket", None), + scope_map=kwargs.get("scope_map", []), + excludes=kwargs.get("excludes", None), + ) + ParaformerEncoder.trace_model(encoder, kwargs["traced_encoder"]) + decoder = ParaformerDecoder(**model_conf, vocab_size=vocab_size) decoder.eval() print(f"Loading pretrained params from {init_param}") @@ -130,8 +177,10 @@ if __name__ == "__main__": parser = 
argparse.ArgumentParser() parser.add_argument("--model", default="./model", help="path of pretrained model") + parser.add_argument("--traced_encoder", default="./compiled_model/traced_encoder.ts", + help="path to save compiled decoder") parser.add_argument("--traced_decoder", default="./compiled_model/traced_decoder.ts", help="path to save compiled decoder") args = parser.parse_args() - AutoModelDecoder.trace_decoder(model=args.model, traced_decoder=args.traced_decoder) \ No newline at end of file + AutoModelDecoder.trace_decoder(model=args.model, traced_encoder=args.traced_encoder, traced_decoder=args.traced_decoder) \ No newline at end of file -- Gitee From 8507c27eb784c17b009cd64bd2f2778775b12682 Mon Sep 17 00:00:00 2001 From: brjiang Date: Tue, 20 Aug 2024 10:40:12 +0800 Subject: [PATCH 05/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=B2=BE=E5=BA=A6?= =?UTF-8?q?=E9=AA=8C=E8=AF=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py | 4 ++-- .../built-in/audio/Paraformer/mindie_encoder_decoder.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py index 71f18f6c03..309dc4a93a 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py @@ -123,7 +123,7 @@ class MindieCif(torch.nn.Module): mrt_res = compiled_model(sample_hidden.to("npu"), sample_alphas.to("npu"), sample_integrate.to("npu"), sample_frame.to("npu")) print("mindie infer done !") - ref_res = cif(sample_hidden, sample_alphas, sample_integrate, sample_frame, self.threshold) + ref_res = self.forward(sample_hidden, sample_alphas, sample_integrate, sample_frame) print("torch infer done !") precision_eval(mrt_res, ref_res) @@ -170,7 +170,7 @@ class MindieCifTimestamp(torch.nn.Module): sample_input2 = torch.randn(input_shape2, dtype=torch.float32) mrt_res = compiled_model(sample_input1.to("npu"), sample_input2.to("npu")) print("mindie infer done !") - ref_res = cif_wo_hidden(sample_input1, sample_input2, self.threshold) + ref_res = self.forward(sample_input1, sample_input2) print("torch infer done !") precision_eval(mrt_res, ref_res) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py index 57c17f21f3..4138447068 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -69,7 +69,7 @@ class MindieEncoder(Paraformer): sample_speech_lens = torch.tensor([100, 50, 100, 25], dtype=torch.int32) mrt_res = compiled_model(sample_speech.to("npu"), sample_speech_lens.to("npu")) print("mindie infer done !") - ref_res = encoder(sample_speech, sample_speech_lens) + ref_res = export_model(sample_speech, sample_speech_lens) print("torch infer done !") precision_eval(mrt_res, ref_res) @@ -143,7 +143,7 @@ class MindieDecoder(Paraformer): sample_sematic_lens = torch.tensor([50, 30, 50, 10], dtype=torch.int32) mrt_res = compiled_model(sample_encoder.to("npu"), sample_encoder_lens.to("npu"), sample_sematic.to("npu"), sample_sematic_lens.to("npu")) print("mindie infer done !") - ref_res = decoder(sample_encoder, sample_encoder_lens, sample_sematic, sample_sematic_lens) + ref_res = export_model(sample_encoder, 
sample_encoder_lens, sample_sematic, sample_sematic_lens) print("torch infer done !") precision_eval(mrt_res, ref_res) \ No newline at end of file -- Gitee From 08d7d532b531eb49f5440585f3d385e807cf3615 Mon Sep 17 00:00:00 2001 From: brjiang Date: Wed, 21 Aug 2024 10:47:43 +0800 Subject: [PATCH 06/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 18 ++++++++++-------- .../audio/Paraformer/mindie_encoder_decoder.py | 12 ++++++------ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md index c4fb83e671..ef40dfa10a 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -1,4 +1,4 @@ -# stable-diffusionxl-controlnet模型-推理指导 +# Paraformer模型-推理指导 - [概述](#概述) - [推理环境准备](#推理环境准备) @@ -65,13 +65,13 @@ 4. 获取模型文件 -将[Paraformer](https://modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)的模型文件下载到本地,并保存在./model文件夹下 + 将[Paraformer](https://modelscope.cn/models/iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/files)的模型文件下载到本地,并保存在./model文件夹下 -将[vad](https://modelscope.cn/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)的模型文件下载到本地,并保存在./model_vad文件夹下 + 将[vad](https://modelscope.cn/models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch/files)的模型文件下载到本地,并保存在./model_vad文件夹下 -将[punc](https://modelscope.cn/models/iic/punc_ct-transformer_cn-en-common-vocab471067-large/files)的模型文件下载到本地,并保存在./model_punc文件夹下 + 将[punc](https://modelscope.cn/models/iic/punc_ct-transformer_cn-en-common-vocab471067-large/files)的模型文件下载到本地,并保存在./model_punc文件夹下 -目录结构如下所示 + 目录结构如下所示 ``` Paraformer @@ -93,7 +93,8 @@ 5. 安装Funasr的依赖 ``` sudo apt install ffmpeg - pip install jieba + pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cpu + pip install jieba omegaconf kaldiio librosa tqdm hydra-core six attrs psutil tornado ``` ## 模型推理 @@ -102,9 +103,10 @@ export TORCH_AIE_NPU_CACHE_MAX_SIZE=12 ``` -2. (可选) 若CPU为aarch64的架构,则在编译encoder和decoder时会出现RuntimeError: could not create a primitive descriptor for a matmul primitive,此时需要新创建一个Python环境(推荐使用conda创建),而后使用如下命令安装torch 2.2.1 +2. (可选) 若CPU为aarch64的架构,则在编译encoder和decoder时会出现RuntimeError: could not create a primitive descriptor for a matmul primitive,此时需要新创建一个Python环境(推荐使用conda创建),而后使用如下命令安装torch 2.2.1及相关依赖 ``` pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu + pip install omegaconf kaldiio librosa tqdm hydra-core six ``` 而后,执行如下脚本将decoder序列化 @@ -123,7 +125,7 @@ 3. 
模型编译及样本测试 - 执行下述命令进行模型编译 + 执行下述命令进行模型编译(如编译后的模型保存于compiled_model目录下,需要首先mkdir compiled_model) ```bash python compile.py \ --model ./model \ diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py index 4138447068..25282d55ac 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -69,10 +69,10 @@ class MindieEncoder(Paraformer): sample_speech_lens = torch.tensor([100, 50, 100, 25], dtype=torch.int32) mrt_res = compiled_model(sample_speech.to("npu"), sample_speech_lens.to("npu")) print("mindie infer done !") - ref_res = export_model(sample_speech, sample_speech_lens) - print("torch infer done !") + # ref_res = export_model(sample_speech, sample_speech_lens) + # print("torch infer done !") - precision_eval(mrt_res, ref_res) + # precision_eval(mrt_res, ref_res) class MindieDecoder(Paraformer): @@ -143,7 +143,7 @@ class MindieDecoder(Paraformer): sample_sematic_lens = torch.tensor([50, 30, 50, 10], dtype=torch.int32) mrt_res = compiled_model(sample_encoder.to("npu"), sample_encoder_lens.to("npu"), sample_sematic.to("npu"), sample_sematic_lens.to("npu")) print("mindie infer done !") - ref_res = export_model(sample_encoder, sample_encoder_lens, sample_sematic, sample_sematic_lens) - print("torch infer done !") + # ref_res = export_model(sample_encoder, sample_encoder_lens, sample_sematic, sample_sematic_lens) + # print("torch infer done !") - precision_eval(mrt_res, ref_res) \ No newline at end of file + # precision_eval(mrt_res, ref_res) \ No newline at end of file -- Gitee From 12778d9e1ff8775c05de10469ffcfeae142d7e7b Mon Sep 17 00:00:00 2001 From: brjiang Date: Tue, 27 Aug 2024 09:25:57 +0800 Subject: [PATCH 07/12] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=BC=96=E8=AF=91?= =?UTF-8?q?=E5=8F=8A=E6=B5=8B=E8=AF=95=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 38 +++++++-- .../built-in/audio/Paraformer/compile.py | 68 +++------------ .../built-in/audio/Paraformer/test.py | 85 +++++++++++++++++++ 3 files changed, 127 insertions(+), 64 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md index ef40dfa10a..f5d21ad229 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -124,7 +124,7 @@ 该步骤获得序列化后的encoder和decoder模型,后续模型编译仍需回到原始环境 -3. 模型编译及样本测试 +3. 
模型编译 执行下述命令进行模型编译(如编译后的模型保存于compiled_model目录下,需要首先mkdir compiled_model) ```bash python compile.py \ @@ -137,10 +137,35 @@ --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ --compiled_punc ./compiled_model/compiled_punc.ts \ --traced_encoder ./compiled_model/traced_encoder.pt \ - --traced_decoder ./compiled_model/traced_decoder.pt \ + --traced_decoder ./compiled_model/traced_decoder.pt + ``` + + 参数说明: + - --model:预训练模型路径 + - --model_vad:VAD预训练模型路径,若不使用VAD模型则设置为None + - --model_punc:PUNC预训练模型路径,若不使用PUNC模型则设置为None + - --compiled_encoder:编译后的encoder模型的保存路径 + - --compiled_decoder:编译后的decoder模型的保存路径 + - --compiled_cif:编译后的cif函数的保存路径 + - --compiled_cif_timestamp:编译后的cif_timestamp函数的保存路径 + - --compiled_punc:编译后的punc的保存路径 + - --traced_encoder:预先序列化的encoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 + - --traced_decoder:预先序列化的decoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 + +4. 样本测试 + 执行下述命令进行音频样本测试 + ```bash + python test.py \ + --model ./model \ + --model_vad ./model_vad \ + --model_punc ./model_punc \ + --compiled_encoder ./compiled_model/compiled_encoder.pt \ + --compiled_decoder ./compiled_model/compiled_decoder.pt \ + --compiled_cif ./compiled_model/compiled_cif.pt \ + --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ + --compiled_punc ./compiled_model/compiled_punc.ts \ --batch_size 16 \ - --sample_path ./model/example/asr_example.wav \ - --skip_compile + --sample_path ./model/example ``` 参数说明: @@ -152,8 +177,5 @@ - --compiled_cif:编译后的cif函数的路径 - --compiled_cif_timestamp:编译后的cif_timestamp函数的路径 - --compiled_punc:编译后的punc的路径 - - --traced_encoder:预先序列化的encoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 - - --traced_decoder:预先序列化的decoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 - --batch_size:Paraformer模型所使用的batch_size - - --sample_path:用于测试模型的样本音频路径 - - --skip_compile:是否进行模型编译,若已经完成编译,后续使用可使用该参数跳过编译,若为第一次使用不能指定该参数 \ No newline at end of file + - --sample_path:测试音频的路径或所在的文件夹路径,若为文件夹路径则会遍历文件夹下的所有音频文件 \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py index 3610458379..6776879ac5 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py @@ -24,21 +24,19 @@ from mindie_auto_model import MindieAutoModel if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--skip_compile", action="store_true", - help="whether to skip compiling sub-models in Paraformer") parser.add_argument("--model", default="./model", help="path of pretrained model") parser.add_argument("--model_vad", default="./model_vad", help="path of pretrained vad model") parser.add_argument("--model_punc", default="./model_punc", help="path of pretrained punc model") - parser.add_argument("--compiled_encoder", default="./compiled_model/compiled_encoder.ts", + parser.add_argument("--compiled_encoder", default="./compiled_model/compiled_encoder.pt", help="path to save compiled encoder") - parser.add_argument("--compiled_decoder", default="./compiled_model/compiled_decoder.ts", + parser.add_argument("--compiled_decoder", default="./compiled_model/compiled_decoder.pt", help="path to save compiled decoder") - parser.add_argument("--compiled_cif", default="./compiled_model/compiled_cif.ts", + parser.add_argument("--compiled_cif", default="./compiled_model/compiled_cif.pt", help="path to save compiled cif function") - parser.add_argument("--compiled_cif_timestamp", default="./compiled_model/compiled_cif_timestamp.ts", + 
parser.add_argument("--compiled_cif_timestamp", default="./compiled_model/compiled_cif_timestamp.pt", help="path to save compiled cif timestamp function") parser.add_argument("--compiled_punc", default="./compiled_model/compiled_punc.ts", help="path to save compiled punc model") @@ -46,58 +44,16 @@ if __name__ == "__main__": help="path to save traced encoder model") parser.add_argument("--traced_decoder", default=None, help="path to save traced decoder model") - parser.add_argument("--batch_size", default=16, type=int, - help="batch size of paraformer model") - parser.add_argument("--sample_path", default="./audio/test_1.wav", - help="path of sample audio") args = parser.parse_args() mindietorch.set_device(0) # use mindietorch to compile sub-models in Paraformer - if not args.skip_compile: - print("Begin compiling sub-models") - MindieAutoModel.export_model(model=args.model_punc, compiled_path=args.compiled_punc, compile_type="punc") - MindieAutoModel.export_model(model=args.model, compiled_encoder=args.compiled_encoder, - compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, - compiled_cif_timestamp=args.compiled_cif_timestamp, - traced_encoder=args.traced_encoder, traced_decoder=args.traced_decoder, - cif_interval=200, cif_timestamp_interval=500, compile_type="paraformer") - print("Finish compiling sub-models") - else: - print("Use existing compiled model") - - # initialize auto model - # note: ncpu means the number of threads used for intraop parallelism on the CPU, which is relevant to speed of vad model - model = MindieAutoModel(model=args.model, vad_model=args.model_vad, punc_model=args.model_punc, - compiled_encoder=args.compiled_encoder, compiled_decoder=args.compiled_decoder, - compiled_cif=args.compiled_cif, compiled_cif_timestamp=args.compiled_cif_timestamp, - compiled_punc=args.compiled_punc, paraformer_batch_size=args.batch_size, - cif_interval=200, cif_timestamp_interval=500, ncpu=16) - - # warm up - print("Begin warming up") - for i in range(5): - _ = model.generate(input=args.sample_path) - - # run with sample audio - iteration_num = 10 - print("Begin runing with sample audio with {} iterations".format(iteration_num)) - - total_dict_time = {} - for i in range(iteration_num): - res, time_stats = model.generate(input=args.sample_path) - - print("Iteration {} Model output: {}".format(i, res[0]["text"])) - print("Iteration {} Time comsumption:".format(i)) - print(" ".join(f"{key}: {value:.3f}s" for key, value in time_stats.items())) - for key, value in time_stats.items(): - if key not in total_dict_time: - total_dict_time[key] = float(value) - else: - total_dict_time[key] += float(value) - - # display average time consumption - print("\nAverage time comsumption") - for key, value in total_dict_time.items(): - print(key, ": {:.3f}s".format(float(value) / iteration_num)) \ No newline at end of file + print("Begin compiling sub-models") + MindieAutoModel.export_model(model=args.model_punc, compiled_path=args.compiled_punc, compile_type="punc") + MindieAutoModel.export_model(model=args.model, compiled_encoder=args.compiled_encoder, + compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, + compiled_cif_timestamp=args.compiled_cif_timestamp, + traced_encoder=args.traced_encoder, traced_decoder=args.traced_decoder, + cif_interval=200, cif_timestamp_interval=500, compile_type="paraformer") + print("Finish compiling sub-models") \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py 
b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py
new file mode 100644
index 0000000000..226a2add1c
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+
+import mindietorch
+
+from mindie_auto_model import MindieAutoModel
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="./model",
+                        help="path of pretrained model")
+    parser.add_argument("--model_vad", default="./model_vad",
+                        help="path of pretrained vad model")
+    parser.add_argument("--model_punc", default="./model_punc",
+                        help="path of pretrained punc model")
+    parser.add_argument("--compiled_encoder", default="./compiled_model/compiled_encoder.pt",
+                        help="path to save compiled encoder")
+    parser.add_argument("--compiled_decoder", default="./compiled_model/compiled_decoder.pt",
+                        help="path to save compiled decoder")
+    parser.add_argument("--compiled_cif", default="./compiled_model/compiled_cif.pt",
+                        help="path to save compiled cif function")
+    parser.add_argument("--compiled_cif_timestamp", default="./compiled_model/compiled_cif_timestamp.pt",
+                        help="path to save compiled cif timestamp function")
+    parser.add_argument("--compiled_punc", default="./compiled_model/compiled_punc.ts",
+                        help="path to save compiled punc model")
+    parser.add_argument("--batch_size", default=16, type=int,
+                        help="batch size of paraformer model")
+    parser.add_argument("--sample_path", default="./audio/",
+                        help="directory or path of sample audio")
+    args = parser.parse_args()
+
+    mindietorch.set_device(0)
+
+    valid_extensions = ['.wav']
+    audio_files = []
+
+    if os.path.isfile(args.sample_path):
+        if any(args.sample_path.endswith(ext) for ext in valid_extensions):
+            audio_files.append(args.sample_path)
+    elif os.path.isdir(args.sample_path):
+        for root, dirs, files in os.walk(args.sample_path):
+            for file in files:
+                if any(file.endswith(ext) for ext in valid_extensions):
+                    audio_files.append(os.path.join(root, file))
+
+    if len(audio_files) == 0:
+        print("There is no valid wav file in sample_path.")
+    else:
+        # initialize auto model
+        model = MindieAutoModel(model=args.model, vad_model=args.model_vad, punc_model=args.model_punc,
+                                compiled_encoder=args.compiled_encoder, compiled_decoder=args.compiled_decoder,
+                                compiled_cif=args.compiled_cif, compiled_cif_timestamp=args.compiled_cif_timestamp,
+                                compiled_punc=args.compiled_punc, paraformer_batch_size=args.batch_size,
+                                cif_interval=200, cif_timestamp_interval=500)
+
+        # warm up
+        print("Begin warming up")
+        _ = model.generate(input=audio_files[0])
+        print("Finish warming up")
+
+        # iterate over audio files
+        for wav_file in audio_files:
+            print("Begin evaluating {}".format(wav_file))
+
+            res, time_stats = model.generate(input=wav_file)
+            print("Model output: {}".format(res[0]["text"]))
+            print("Time consumption:")
+            
print(" ".join(f"{key}: {value:.3f}s" for key, value in time_stats.items())) \ No newline at end of file -- Gitee From 972daa13dd55213c66a0a793057d343e04191a36 Mon Sep 17 00:00:00 2001 From: brjiang Date: Thu, 26 Sep 2024 21:09:30 +0800 Subject: [PATCH 08/12] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E7=BC=96=E8=AF=91VAD?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 12 +++ .../built-in/audio/Paraformer/compile_vad.py | 100 ++++++++++++++++++ .../audio/Paraformer/mindie_auto_model.py | 4 +- .../built-in/audio/Paraformer/test.py | 6 +- 4 files changed, 119 insertions(+), 3 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md index f5d21ad229..cc86138454 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -151,6 +151,17 @@ - --compiled_punc:编译后的punc的保存路径 - --traced_encoder:预先序列化的encoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 - --traced_decoder:预先序列化的decoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 + + 执行下面的命令进行vad模型的编译优化 + ```bash + python compile_vad.py \ + --model_vad ./model_vad \ + --compiled_vad ./compiled_model/compiled_vad.ts + ``` + + 参数说明: + - --model_vad:VAD预训练模型路径 + - --compiled_vad:编译后的vad的保存路径 4. 样本测试 执行下述命令进行音频样本测试 @@ -164,6 +175,7 @@ --compiled_cif ./compiled_model/compiled_cif.pt \ --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ --compiled_punc ./compiled_model/compiled_punc.ts \ + --compiled_vad ./compiled_model/compiled_vad.ts \ --batch_size 16 \ --sample_path ./model/example ``` diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py new file mode 100644 index 0000000000..410daadac9 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py @@ -0,0 +1,100 @@ +import sys +sys.path.append("./FunASR") + +import time +import argparse + +from funasr import AutoModel + +import torch +import mindietorch + + +COSINE_THRESHOLD = 0.999 +def cosine_similarity(gt_tensor, pred_tensor): + gt_tensor = gt_tensor.flatten().to(torch.float32) + pred_tensor = pred_tensor.flatten().to(torch.float32) + if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0: + if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True): + return 1.0 + res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6) + res = res.cpu().detach().item() + return res + + +def precision_eval(mrt_res, ref_res): + if not isinstance(mrt_res, (list, tuple)): + mrt_res = [mrt_res, ] + if not isinstance(ref_res, (list, tuple)): + ref_res = [ref_res, ] + + com_res = True + for j, a in zip(mrt_res, ref_res): + res = cosine_similarity(j.to("cpu"), a) + print(res) + if res < COSINE_THRESHOLD: + com_res = False + + if com_res: + print("Compare success ! NPU model have the same output with CPU model !") + else: + print("Compare failed ! 
Outputs of NPU model are not the same with CPU model !")
+
+class MindieVAD(torch.nn.Module):
+    def __init__(self, vad_path):
+        super().__init__()
+
+        self.model = AutoModel(model=vad_path)
+        self.model.model.encoder.eval()
+        for para in self.model.model.encoder.parameters():
+            para.requires_grad = False
+
+    def forward(self, feat):
+        result = self.model.model.encoder(feat, {})
+
+        return result
+
+
+def export(args):
+    vad = MindieVAD(args.model_vad)
+
+    print("Begin trace vad!")
+    input_shape = (1, 5996, 400)
+    min_shape = (1, -1, 400)
+    max_shape = (1, -1, 400)
+    input_feat = torch.randn(input_shape, dtype=torch.float32)
+    compile_inputs = [mindietorch.Input(min_shape = min_shape, max_shape = max_shape, dtype = torch.float32)]
+
+    export_model = torch.jit.trace(vad, input_feat)
+    print("Finish trace vad")
+
+    compiled_model = mindietorch.compile(
+        export_model,
+        inputs = compile_inputs,
+        precision_policy = mindietorch.PrecisionPolicy.PREF_FP16,
+        soc_version = "Ascend310P3", # 需要根据设备进行修改
+        ir = "ts"
+    )
+    print("mindietorch compile done !")
+    compiled_model.save(args.compiled_vad)
+    # compiled_model = torch.jit.load(args.compiled_vad)
+
+    print("start to check the precision of vad.")
+    sample_feat = torch.randn(input_shape, dtype=torch.float32)
+    mrt_res = compiled_model(sample_feat.to("npu"))
+    print("mindie infer done !")
+    ref_res = vad(sample_feat)
+    print("torch infer done !")
+
+    precision_eval(mrt_res, ref_res)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_vad", default="./model_vad",
+                        help="path of pretrained vad model")
+    parser.add_argument("--compiled_vad", default="./compiled_model/compiled_vad.ts",
+                        help="path to save compiled vad")
+    args = parser.parse_args()
+
+    export(args)
diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py
index b70627eb97..c02daff479 100644
--- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py
+++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py
@@ -31,8 +31,10 @@ class MindieAutoModel(AutoModel):
             logging.info("Building VAD model.")
             vad_kwargs["model"] = vad_model
             vad_kwargs["model_revision"] = kwargs.get("vad_model_revision", "master")
-            vad_kwargs["device"] = "cpu"
             vad_model, vad_kwargs = self.build_model(**vad_kwargs)
+            vad_kwargs["device"] = "npu"
+            compiled_vad = torch.jit.load(kwargs["compiled_vad"])
+            vad_model.encoder = compiled_vad
 
             # if punc_model is not None, build punc model else None
             punc_model = kwargs.get("punc_model", None)
diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py
index 226a2add1c..3a078e42dc 100644
--- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py
+++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py
@@ -40,6 +40,8 @@ if __name__ == "__main__":
                         help="path to save compiled cif timestamp function")
     parser.add_argument("--compiled_punc", default="./compiled_model/compiled_punc.ts",
                         help="path to save compiled punc model")
+    parser.add_argument("--compiled_vad", default="./compiled_model/compiled_vad.ts",
+                        help="path to save compiled vad model")
     parser.add_argument("--batch_size", default=16, type=int,
                         help="batch size of paraformer model")
     parser.add_argument("--sample_path", default="./audio/",
                         help="directory or path of sample audio")
@@ -67,8 +69,8 @@ if __name__ == "__main__":
         model = MindieAutoModel(model=args.model, vad_model=args.model_vad, punc_model=args.model_punc,
compiled_encoder=args.compiled_encoder, compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, compiled_cif_timestamp=args.compiled_cif_timestamp, - compiled_punc=args.compiled_punc, paraformer_batch_size=args.batch_size, - cif_interval=200, cif_timestamp_interval=500) + compiled_punc=args.compiled_punc, compiled_vad=args.compiled_vad, + paraformer_batch_size=args.batch_size, cif_interval=200, cif_timestamp_interval=500) # warm up print("Begin warming up") -- Gitee From 6bad2d9f31cfbd257aba1bfd019130de671fbf28 Mon Sep 17 00:00:00 2001 From: brjiang Date: Tue, 8 Oct 2024 17:59:53 +0800 Subject: [PATCH 09/12] =?UTF-8?q?=E6=96=B0=E5=A2=9Edefault=5Fbuffer=5Fsize?= =?UTF-8?q?=5Fvec=E7=BC=96=E8=AF=91=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py | 1 + MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py | 2 ++ .../built-in/audio/Paraformer/mindie_encoder_decoder.py | 2 ++ .../MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py | 2 +- MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py | 1 + 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py index 410daadac9..5f71207153 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py @@ -72,6 +72,7 @@ def export(args): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + default_buffer_size_vec = [50, ], soc_version = "Ascend310P3", # 需要根据设备进行修改 ir = "ts" ) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py index 309dc4a93a..0e3d6940c5 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py @@ -108,6 +108,7 @@ class MindieCif(torch.nn.Module): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + default_buffer_size_vec = [1, 5, 5], soc_version = "Ascend310P3", ir = "ts" ) @@ -158,6 +159,7 @@ class MindieCifTimestamp(torch.nn.Module): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + default_buffer_size_vec = [1, 5], soc_version = "Ascend310P3", ir = "ts" ) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py index 25282d55ac..93d5f295f9 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -57,6 +57,7 @@ class MindieEncoder(Paraformer): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + default_buffer_size_vec = [200, 1, 5, 200, 1], soc_version = "Ascend310P3", ir = "ts" ) @@ -129,6 +130,7 @@ class MindieDecoder(Paraformer): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + default_buffer_size_vec = [400, 5], soc_version = "Ascend310P3", ir = "ts" ) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py index 73876484f0..0b82da0aec 100644 --- 
a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py @@ -142,7 +142,7 @@ class MindieBiCifParaformer(BiCifParaformer): # Step4: run with compiled decoder decoder_out, us_alphas = self.mindie_decoder(encoder_out, encoder_out_lens, - pre_acoustic_embeds.to("npu"), pre_token_length.to("npu")) + pre_acoustic_embeds.contiguous().to("npu"), pre_token_length.contiguous().to("npu")) us_alphas = us_alphas.to("cpu") time5 = time.perf_counter() meta_data["decoder"] = time5 - time4 diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py index 4aa895f9f7..54810f6797 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py @@ -44,6 +44,7 @@ class MindiePunc(CTTransformer): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + default_buffer_size_vec = [1, ], soc_version = "Ascend310P3", ir = "ts" ) -- Gitee From 45a2c6fb9fa5c95e5e9cdc25184eb07da19e5586 Mon Sep 17 00:00:00 2001 From: brjiang Date: Wed, 9 Oct 2024 09:42:15 +0800 Subject: [PATCH 10/12] =?UTF-8?q?VAD=E6=A8=A1=E5=9E=8B=E4=BD=BF=E7=94=A8PR?= =?UTF-8?q?EF=5FFP32=E7=BC=96=E8=AF=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py index 5f71207153..f7c874283d 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py @@ -71,7 +71,7 @@ def export(args): compiled_model = mindietorch.compile( export_model, inputs = compile_inputs, - precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, + precision_policy = mindietorch.PrecisionPolicy.PREF_FP32, default_buffer_size_vec = [50, ], soc_version = "Ascend310P3", # 需要根据设备进行修改 ir = "ts" -- Gitee From 4c7e42437a2f4ff39623870a8d019262f59399bd Mon Sep 17 00:00:00 2001 From: brjiang Date: Wed, 16 Oct 2024 16:53:04 +0800 Subject: [PATCH 11/12] =?UTF-8?q?=E6=9B=B4=E6=96=B0patch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 2 +- .../built-in/audio/Paraformer/mindie.patch | 31 +++++++++++++++++-- .../built-in/audio/Paraformer/mindie_cif.py | 4 +-- .../Paraformer/mindie_encoder_decoder.py | 4 +-- .../built-in/audio/Paraformer/mindie_punc.py | 2 +- 5 files changed, 35 insertions(+), 8 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md index cc86138454..9599dba64b 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -59,7 +59,7 @@ ``` mv mindie.patch ./FunASR cd ./FunASR - git apply mindie.patch --reject + git apply mindie.patch --ignore-whitespace cd .. 
``` diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch index 23d3207807..2f78ed4e84 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch @@ -155,7 +155,7 @@ index 01e6aaf6..2081ebb1 100644 def export(self, input=None, **cfg): """ diff --git a/funasr/models/bicif_paraformer/cif_predictor.py b/funasr/models/bicif_paraformer/cif_predictor.py -index ca98cdc2..4d61ab85 100644 +index ca98cdc2..65a08cdb 100644 --- a/funasr/models/bicif_paraformer/cif_predictor.py +++ b/funasr/models/bicif_paraformer/cif_predictor.py @@ -238,6 +238,7 @@ class CifPredictorV3(torch.nn.Module): @@ -166,14 +166,41 @@ index ca98cdc2..4d61ab85 100644 acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold) if target_length is None and self.tail_threshold > 0.0: token_num_int = torch.max(token_num).type(torch.int32).item() -@@ -282,6 +283,7 @@ class CifPredictorV3(torch.nn.Module): +@@ -282,6 +283,8 @@ class CifPredictorV3(torch.nn.Module): _token_num = alphas2.sum(-1) if token_num is not None: alphas2 *= (token_num / _token_num)[:, None].repeat(1, alphas2.size(1)) ++ + return alphas2 # re-downsample ds_alphas = alphas2.reshape(b, -1, self.upsample_times).sum(-1) ds_cif_peak = cif_wo_hidden(ds_alphas, self.threshold - 1e-4) +diff --git a/funasr/models/fsmn_vad_streaming/encoder.py b/funasr/models/fsmn_vad_streaming/encoder.py +index 6668c5d5..6b7c80fc 100755 +--- a/funasr/models/fsmn_vad_streaming/encoder.py ++++ b/funasr/models/fsmn_vad_streaming/encoder.py +@@ -231,7 +231,7 @@ class FSMN(nn.Module): + pass + + def forward( +- self, input: torch.Tensor, cache: Dict[str, torch.Tensor] ++ self, input: torch.Tensor, cache: Dict[str, torch.Tensor] = {} + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """ + Args: +diff --git a/funasr/models/fsmn_vad_streaming/model.py b/funasr/models/fsmn_vad_streaming/model.py +index 04689bed..665b93cd 100644 +--- a/funasr/models/fsmn_vad_streaming/model.py ++++ b/funasr/models/fsmn_vad_streaming/model.py +@@ -348,7 +348,7 @@ class FsmnVADStreaming(nn.Module): + ) + + def ComputeScores(self, feats: torch.Tensor, cache: dict = {}) -> None: +- scores = self.encoder(feats, cache=cache["encoder"]).to("cpu") # return B * T * D ++ scores = self.encoder(feats).to("cpu") # return B * T * D + assert ( + scores.shape[1] == feats.shape[1] + ), "The shape between feats and scores does not match" diff --git a/funasr/models/sanm/attention.py b/funasr/models/sanm/attention.py index c7e8a8e0..0f1862ae 100644 --- a/funasr/models/sanm/attention.py diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py index 0e3d6940c5..e706e3e79c 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py @@ -108,7 +108,7 @@ class MindieCif(torch.nn.Module): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, - default_buffer_size_vec = [1, 5, 5], + default_buffer_size_vec = [1, 10, 10], soc_version = "Ascend310P3", ir = "ts" ) @@ -159,7 +159,7 @@ class MindieCifTimestamp(torch.nn.Module): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, - default_buffer_size_vec = [1, 5], + default_buffer_size_vec = [1, 10], soc_version = "Ascend310P3", ir = "ts" ) diff --git 
a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py index 93d5f295f9..7cbb952329 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -57,7 +57,7 @@ class MindieEncoder(Paraformer): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, - default_buffer_size_vec = [200, 1, 5, 200, 1], + default_buffer_size_vec = [400, 1, 1, 400, 1], soc_version = "Ascend310P3", ir = "ts" ) @@ -130,7 +130,7 @@ class MindieDecoder(Paraformer): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, - default_buffer_size_vec = [400, 5], + default_buffer_size_vec = [800, 10], soc_version = "Ascend310P3", ir = "ts" ) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py index 54810f6797..73b131a9fb 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py @@ -44,7 +44,7 @@ class MindiePunc(CTTransformer): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, - default_buffer_size_vec = [1, ], + default_buffer_size_vec = [10, ], soc_version = "Ascend310P3", ir = "ts" ) -- Gitee From 82694ac9c699aefc691ea655aa53c318d8e4d3d7 Mon Sep 17 00:00:00 2001 From: brjiang Date: Tue, 26 Nov 2024 15:57:47 +0800 Subject: [PATCH 12/12] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E5=AF=B9=E4=BA=8EVAD?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E7=9A=84=E4=BC=98=E5=8C=96=EF=BC=8C=E6=95=B4?= =?UTF-8?q?=E7=90=86=E5=B9=B6=E4=BC=98=E5=8C=96=E4=BB=A3=E7=A0=81=E8=84=9A?= =?UTF-8?q?=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/audio/Paraformer/README.md | 139 +++- .../built-in/audio/Paraformer/compile.py | 24 +- .../built-in/audio/Paraformer/compile_vad.py | 101 --- .../built-in/audio/Paraformer/mindie.patch | 688 ++++++++++++++---- .../audio/Paraformer/mindie_auto_model.py | 127 +--- .../built-in/audio/Paraformer/mindie_cif.py | 30 +- .../Paraformer/mindie_encoder_decoder.py | 99 +-- .../built-in/audio/Paraformer/mindie_fa.patch | 121 +++ .../audio/Paraformer/mindie_paraformer.py | 31 +- .../built-in/audio/Paraformer/mindie_punc.py | 28 +- .../built-in/audio/Paraformer/mindie_vad.py | 51 ++ .../built-in/audio/Paraformer/test.py | 21 +- .../audio/Paraformer/test_accuracy.py | 70 ++ .../audio/Paraformer/test_performance.py | 97 +++ ...ce_decoder.py => trace_encoder_decoder.py} | 152 ++-- 15 files changed, 1191 insertions(+), 588 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_fa.patch create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_vad.py create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/test_accuracy.py create mode 100644 MindIE/MindIE-Torch/built-in/audio/Paraformer/test_performance.py rename MindIE/MindIE-Torch/built-in/audio/Paraformer/{trace_decoder.py => trace_encoder_decoder.py} (37%) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md index 9599dba64b..5efeea18b6 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md +++ 
b/MindIE/MindIE-Torch/built-in/audio/Paraformer/README.md @@ -8,7 +8,7 @@ # 概述 -该工程使用mindietorch部署paraformer模型 +该工程使用mindietorch部署Paraformer语音识别模型,同时该工程还适配了VAD音频切分模型以及PUNC标点符号模型,三个模型可组成VAD+Paraformer+PUNC的pipeline,实现对于长音频的识别 - 模型路径: ```bash @@ -31,8 +31,8 @@ | Python | 3.10.13 | - | | torch | 2.1.0+cpu | - | | torch_audio | 2.1.0+cpu | - | - | CANN | 8.0.RC2 | - | - | MindIE | 1.0.RC2.B091 | - | + | CANN | 8.0.RC3 | - | + | MindIE | 1.0.RC3 | - | # 快速上手 ## 获取源码 @@ -55,11 +55,17 @@ cd .. ``` -3. 修改Funasr的源码,先将patch文件移动至Funasr的工程路径下,而后将patch应用到代码中(若patch应用失败,则需要手动进行修改) +3. 修改Funasr的源码,将patch应用到代码中(若patch应用失败,则需要手动进行修改) ``` - mv mindie.patch ./FunASR cd ./FunASR - git apply mindie.patch --ignore-whitespace + git apply ../mindie.patch --ignore-whitespace + cd .. + ``` + + (可选)若为Atlas 800I A2服务器,可以使用如下命令将Attention替换为Flash Attention,可以提升Paraformer模型的性能 + ``` + cd ./FunASR + git apply ../mindie_fa.patch --ignore-whitespace cd .. ``` @@ -92,26 +98,26 @@ 5. 安装Funasr的依赖 ``` - sudo apt install ffmpeg + apt install ffmpeg pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cpu pip install jieba omegaconf kaldiio librosa tqdm hydra-core six attrs psutil tornado ``` -## 模型推理 -1. 设置mindie内存池上限为12G,执行如下命令设置环境变量 - ``` - export TORCH_AIE_NPU_CACHE_MAX_SIZE=12 - ``` +6. 安装配套版本的torch_npu,同时参考[昇腾文档](https://www.hiascend.com/document/detail/zh/mindie/10RC3/mindietorch/Torchdev/mindie_torch0018.html)兼容mindie和torch_npu + +7. (可选) 若要进行精度或性能测试,可下载数据集[AISHELL-1](https://www.aishelltech.com/kysjcp)并保存于任意路径 -2. (可选) 若CPU为aarch64的架构,则在编译encoder和decoder时会出现RuntimeError: could not create a primitive descriptor for a matmul primitive,此时需要新创建一个Python环境(推荐使用conda创建),而后使用如下命令安装torch 2.2.1及相关依赖 +# 模型编译 +1. (可选)模型序列化 + 若CPU为aarch64的架构,则在编译encoder和decoder时会出现RuntimeError: could not create a primitive descriptor for a matmul primitive,此时需要新创建一个Python环境(推荐使用conda创建),使用如下命令安装torch 2.2.1及相关依赖 ``` pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cpu pip install omegaconf kaldiio librosa tqdm hydra-core six ``` - 而后,执行如下脚本将decoder序列化 + 而后,执行如下脚本将encoder和decoder序列化 ```bash - python trace_decoder.py \ + python trace_encoder_decoder.py \ --model ./model \ --traced_encoder ./compiled_model/traced_encoder.pt \ --traced_decoder ./compiled_model/traced_decoder.pt @@ -124,7 +130,7 @@ 该步骤获得序列化后的encoder和decoder模型,后续模型编译仍需回到原始环境 -3. 模型编译 +2. 模型编译 执行下述命令进行模型编译(如编译后的模型保存于compiled_model目录下,需要首先mkdir compiled_model) ```bash python compile.py \ @@ -135,9 +141,11 @@ --compiled_decoder ./compiled_model/compiled_decoder.pt \ --compiled_cif ./compiled_model/compiled_cif.pt \ --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ - --compiled_punc ./compiled_model/compiled_punc.ts \ + --compiled_vad ./compiled_model/compiled_vad.pt \ + --compiled_punc ./compiled_model/compiled_punc.pt \ --traced_encoder ./compiled_model/traced_encoder.pt \ - --traced_decoder ./compiled_model/traced_decoder.pt + --traced_decoder ./compiled_model/traced_decoder.pt \ + --soc_version Ascendxxx ``` 参数说明: @@ -148,46 +156,103 @@ - --compiled_decoder:编译后的decoder模型的保存路径 - --compiled_cif:编译后的cif函数的保存路径 - --compiled_cif_timestamp:编译后的cif_timestamp函数的保存路径 + - --compiled_vad:编译后的vad的保存路径 - --compiled_punc:编译后的punc的保存路径 - --traced_encoder:预先序列化的encoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 - --traced_decoder:预先序列化的decoder模型的路径,若并未执行第2步提前编译模型,则无需指定该参数 - - 执行下面的命令进行vad模型的编译优化 + - --soc_version:昇腾芯片的型号,输入格式为Ascendxxx(其中xxx请使用npu-smi info命令查看) + + +## 模型推理 +1. 
设置mindie内存池上限为12G,执行如下命令设置环境变量 + ``` + export INF_NAN_MODE_ENABLE=0 + export TORCH_AIE_NPU_CACHE_MAX_SIZE=12 + ``` + +2. 样本测试 + 执行下述命令进行音频样本测试,该脚本将会测试VAD+Paraformer+PUNC整个Pipeline,脚本单次只会读取一个音频文件,音频文件可以为任意长度 ```bash - python compile_vad.py \ + python test.py \ + --model ./model \ --model_vad ./model_vad \ - --compiled_vad ./compiled_model/compiled_vad.ts + --model_punc ./model_punc \ + --compiled_encoder ./compiled_model/compiled_encoder.pt \ + --compiled_decoder ./compiled_model/compiled_decoder.pt \ + --compiled_cif ./compiled_model/compiled_cif.pt \ + --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ + --compiled_punc ./compiled_model/compiled_punc.pt \ + --compiled_vad ./compiled_model/compiled_vad.pt \ + --paraformer_batch_size 16 \ + --sample_path ./model/example \ + --soc_version Ascendxxx ``` 参数说明: + - --model:预训练模型路径 - --model_vad:VAD预训练模型路径 - - --compiled_vad:编译后的vad的保存路径 + - --model_punc:PUNC预训练模型路径 + - --compiled_encoder:编译后的encoder模型的路径 + - --compiled_decoder:编译后的decoder模型的路径 + - --compiled_cif:编译后的cif函数的路径 + - --compiled_cif_timestamp:编译后的cif_timestamp函数的路径 + - --compiled_punc:编译后的punc模型的路径 + - --compiled_vad:编译后的vad模型的路径 + - --paraformer_batch_size:Paraformer模型所使用的batch_size + - --sample_path:测试音频的路径或所在的文件夹路径,若为文件夹路径则会遍历文件夹下的所有音频文件 + - --soc_version:昇腾芯片的型号,输入格式为Ascendxxx(其中xxx请使用npu-smi info命令查看) -4. 样本测试 - 执行下述命令进行音频样本测试 - ```bash - python test.py \ +3. 性能测试 + 执行下述命令对于Paraformer进行性能测试,该脚本仅针对Paraformer模型进行测试,batch_size参数用于控制同时处理的最大音频数量(例如设置为64,则会在sample_path下同时读取64个音频,并组合成一个输入进行处理),但需要注意音频的长度不能过长,否则可能超出NPU的显存 + ``` + python test_performance.py \ --model ./model \ - --model_vad ./model_vad \ - --model_punc ./model_punc \ --compiled_encoder ./compiled_model/compiled_encoder.pt \ --compiled_decoder ./compiled_model/compiled_decoder.pt \ --compiled_cif ./compiled_model/compiled_cif.pt \ --compiled_cif_timestamp ./compiled_model/compiled_cif_timestamp.pt \ - --compiled_punc ./compiled_model/compiled_punc.ts \ - --compiled_vad ./compiled_model/compiled_vad.ts \ - --batch_size 16 \ - --sample_path ./model/example + --batch_size 64 \ + --result_path ./aishell_test_result.txt \ + --sample_path /path/to/AISHELL-1/wav/test \ + --soc_version Ascendxxx ``` 参数说明: - --model:预训练模型路径 - - --model_vad:VAD预训练模型路径,若不使用VAD模型则设置为None - - --model_punc:PUNC预训练模型路径,若不使用PUNC模型则设置为None - --compiled_encoder:编译后的encoder模型的路径 - --compiled_decoder:编译后的decoder模型的路径 - --compiled_cif:编译后的cif函数的路径 - --compiled_cif_timestamp:编译后的cif_timestamp函数的路径 - - --compiled_punc:编译后的punc的路径 - --batch_size:Paraformer模型所使用的batch_size - - --sample_path:测试音频的路径或所在的文件夹路径,若为文件夹路径则会遍历文件夹下的所有音频文件 \ No newline at end of file + - --sample_path:AISHELL-1测试集音频所在路径,模型会递归查找该路径下的所有音频文件 + - --result_path:测试音频的推理结果的保存路径 + - --soc_version:昇腾芯片的型号,输入格式为Ascendxxx(其中xxx请使用npu-smi info命令查看) + + +4. 
精度测试 + + 利用如下命令安装中文文本精度对比库nltk + ``` + pip install nltk + ``` + + 需要首先执行第4步完成性能测试,而后利用性能测试保存到result_path的结果进行精度验证,执行如下命令 + ``` + python test_accuracy.py \ + --result_path ./aishell_test_result.txt \ + --ref_path /path/to/AISHELL-1/transcript/aishell_transcript_v0.8.txt + ``` + + 参数说明: + - --result_path:测试音频的推理结果的保存路径 + - --ref_path:AISHELL-1测试音频的Ground Truth所在路径 + + +## 模型精度及性能 + +该模型在Atlas 310I pro和Atlas 800I A2上的参考性能及精度如下所示(其中性能数据为Paraformer模型纯推理性能,并非端到端推理性能) + +| NPU | batch_size | rtx_avg | cer | +|----------------|------------|---------|---------| +| Atlas 310I pro | 16 | 217.175 | 0.0198 | +| Atlas 800I A2 | 64 | 461.775 | 0.0198 | \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py index 6776879ac5..2287b61da3 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile.py @@ -38,22 +38,30 @@ if __name__ == "__main__": help="path to save compiled cif function") parser.add_argument("--compiled_cif_timestamp", default="./compiled_model/compiled_cif_timestamp.pt", help="path to save compiled cif timestamp function") - parser.add_argument("--compiled_punc", default="./compiled_model/compiled_punc.ts", + parser.add_argument("--compiled_punc", default="./compiled_model/compiled_punc.pt", + help="path to save compiled punc model") + parser.add_argument("--compiled_vad", default="./compiled_model/compiled_vad.pt", help="path to save compiled punc model") parser.add_argument("--traced_encoder", default=None, help="path to save traced encoder model") parser.add_argument("--traced_decoder", default=None, help="path to save traced decoder model") + parser.add_argument("--soc_version", required=True, type=str, + help="soc version of Ascend") args = parser.parse_args() mindietorch.set_device(0) # use mindietorch to compile sub-models in Paraformer - print("Begin compiling sub-models") - MindieAutoModel.export_model(model=args.model_punc, compiled_path=args.compiled_punc, compile_type="punc") + print("Begin compiling sub-models.") + MindieAutoModel.export_model(model=args.model_vad, compiled_path=args.compiled_vad, + compile_type="vad", soc_version=args.soc_version) + MindieAutoModel.export_model(model=args.model_punc, compiled_path=args.compiled_punc, + compile_type="punc", soc_version=args.soc_version) MindieAutoModel.export_model(model=args.model, compiled_encoder=args.compiled_encoder, - compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, - compiled_cif_timestamp=args.compiled_cif_timestamp, - traced_encoder=args.traced_encoder, traced_decoder=args.traced_decoder, - cif_interval=200, cif_timestamp_interval=500, compile_type="paraformer") - print("Finish compiling sub-models") \ No newline at end of file + compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, + compiled_cif_timestamp=args.compiled_cif_timestamp, + traced_encoder=args.traced_encoder, traced_decoder=args.traced_decoder, + cif_interval=200, cif_timestamp_interval=500, + compile_type="paraformer", soc_version=args.soc_version) + print("Finish compiling sub-models.") \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py deleted file mode 100644 index f7c874283d..0000000000 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/compile_vad.py +++ /dev/null @@ -1,101 +0,0 @@ -import sys -sys.path.append("./FunASR") - 
-import time -import argparse - -from funasr import AutoModel - -import torch -import mindietorch - - -COSINE_THRESHOLD = 0.999 -def cosine_similarity(gt_tensor, pred_tensor): - gt_tensor = gt_tensor.flatten().to(torch.float32) - pred_tensor = pred_tensor.flatten().to(torch.float32) - if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0: - if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True): - return 1.0 - res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6) - res = res.cpu().detach().item() - return res - - -def precision_eval(mrt_res, ref_res): - if not isinstance(mrt_res, (list, tuple)): - mrt_res = [mrt_res, ] - if not isinstance(ref_res, (list, tuple)): - ref_res = [ref_res, ] - - com_res = True - for j, a in zip(mrt_res, ref_res): - res = cosine_similarity(j.to("cpu"), a) - print(res) - if res < COSINE_THRESHOLD: - com_res = False - - if com_res: - print("Compare success ! NPU model have the same output with CPU model !") - else: - print("Compare failed ! Outputs of NPU model are not the same with CPU model !") - -class MindieVAD(torch.nn.Module): - def __init__(self, vad_path): - super().__init__() - - self.model = AutoModel(model=vad_path) - self.model.model.encoder.eval() - for para in self.model.model.encoder.parameters(): - para.requires_grad = False - - def forward(self, feat): - result = self.model.model.encoder(feat, {}) - - return result - - -def export(args): - vad = MindieVAD(args.model_vad) - - print("Begin trace vad!") - input_shape = (1, 5996, 400) - min_shape = (1, -1, 400) - max_shape = (1, -1, 400) - input_feat = torch.randn(input_shape, dtype=torch.float32) - compile_inputs = [mindietorch.Input(min_shape = min_shape, max_shape = max_shape, dtype = torch.float32)] - - export_model = torch.jit.trace(vad, input_feat) - print("Finish trace vad") - - compiled_model = mindietorch.compile( - export_model, - inputs = compile_inputs, - precision_policy = mindietorch.PrecisionPolicy.PREF_FP32, - default_buffer_size_vec = [50, ], - soc_version = "Ascend310P3", # 需要根据设备进行修改 - ir = "ts" - ) - print("mindietorch compile done !") - compiled_model.save(args.compiled_vad) - # compiled_model = torch.jit.load(args.compiled_vad) - - print("start to check the percision of vad.") - sample_feat = torch.randn(input_shape, dtype=torch.float32) - mrt_res = compiled_model(sample_feat.to("npu")) - print("mindie infer done !") - ref_res = vad(sample_feat) - print("torch infer done !") - - precision_eval(mrt_res, ref_res) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model_vad", default="./model_vad", - help="path of pretrained vad model") - parser.add_argument("--compiled_vad", default="./compiled_model/compiled_vad.ts", - help="path to save compiled vad") - args = parser.parse_args() - - export(args) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch index 2f78ed4e84..a485f5eb86 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie.patch @@ -1,8 +1,18 @@ diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py -index 01e6aaf6..2081ebb1 100644 +index 01e6aaf6..0a80f1c0 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py -@@ -277,9 +277,9 @@ class AutoModel: +@@ -171,7 +171,8 @@ class AutoModel: + self.spk_kwargs = spk_kwargs + self.model_path = kwargs.get("model_path") + +- def 
build_model(self, **kwargs): ++ @staticmethod ++ def build_model(**kwargs): + assert "model" in kwargs + if "model_conf" not in kwargs: + logging.info("download models from model hub: {}".format(kwargs.get("hub", "ms"))) +@@ -277,9 +278,9 @@ class AutoModel: asr_result_list = [] num_samples = len(data_list) disable_pbar = self.kwargs.get("disable_pbar", False) @@ -15,7 +25,7 @@ index 01e6aaf6..2081ebb1 100644 time_speech_total = 0.0 time_escape_total = 0.0 for beg_idx in range(0, num_samples, batch_size): -@@ -311,20 +311,22 @@ class AutoModel: +@@ -311,27 +312,32 @@ class AutoModel: speed_stats["batch_size"] = f"{len(results)}" speed_stats["rtf"] = f"{(time_escape) / batch_data_time:0.3f}" description = f"{speed_stats}, " @@ -42,17 +52,38 @@ index 01e6aaf6..2081ebb1 100644 + time_stats = {"input_speech_time": 0.0, "end_to_end_time": 0.0, "vad_time" : 0.0, + "paraformer_time": 0.0, "punc_time": 0.0} # step.1: compute the vad model ++ print("Start using VAD model to segment input audios.") deep_update(self.vad_kwargs, cfg) beg_vad = time.time() -@@ -332,6 +334,7 @@ class AutoModel: + res = self.inference( input, input_len=input_len, model=self.vad_model, kwargs=self.vad_kwargs, **cfg ) end_vad = time.time() + time_stats["vad_time"] = end_vad - beg_vad ++ print("Finish segmenting audios within {:.3f} seconds.".format(time_stats["vad_time"])) # FIX(gcf): concat the vad clips for sense vocie model for better aed if kwargs.get("merge_vad", False): -@@ -366,7 +369,7 @@ class AutoModel: +@@ -352,12 +358,13 @@ class AutoModel: + time_speech_total_all_samples = 1e-6 + + beg_total = time.time() +- pbar_total = ( +- tqdm(colour="red", total=len(res), dynamic_ncols=True) +- if not kwargs.get("disable_pbar", False) +- else None +- ) ++ # pbar_total = ( ++ # tqdm(colour="red", total=len(res), dynamic_ncols=True) ++ # if not kwargs.get("disable_pbar", False) ++ # else None ++ # ) + for i in range(len(res)): ++ print("Begin processing audio with Paraformer and PUNC model.") + key = res[i]["key"] + vadsegments = res[i]["value"] + input_i = data_list[i] +@@ -366,7 +373,7 @@ class AutoModel: speech_lengths = len(speech) n = len(vadsegments) data_with_index = [(vadsegments[i], i) for i in range(n)] @@ -61,16 +92,7 @@ index 01e6aaf6..2081ebb1 100644 results_sorted = [] if not len(sorted_data): -@@ -377,7 +380,7 @@ class AutoModel: - if len(sorted_data) > 0 and len(sorted_data[0]) > 0: - batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0]) - -- beg_idx = 0 -+ batch_size = 0 - beg_asr_total = time.time() - time_speech_total_per_sample = speech_lengths / 16000 - time_speech_total_all_samples += time_speech_total_per_sample -@@ -385,28 +388,19 @@ class AutoModel: +@@ -385,26 +392,15 @@ class AutoModel: # pbar_sample = tqdm(colour="blue", total=n, dynamic_ncols=True) all_segments = [] @@ -89,25 +111,20 @@ index 01e6aaf6..2081ebb1 100644 - max_len_in_batch = max(max_len_in_batch, sample_length) - end_idx += 1 - continue -- + batch_segments = kwargs["paraformer_batch_size"] -+ print("Num of vadsegments: {}, Paraformer batch_size: {}".format(n, batch_segments)) + loop_num = n // batch_segments if n % batch_segments == 0 else n // batch_segments + 1 -+ beg_idx = 0 + end_idx = batch_segments + + for j in range(loop_num): speech_j, speech_lengths_j = slice_padding_audio_samples( speech, speech_lengths, sorted_data[beg_idx:end_idx] ) - results = self.inference( -+ results_batch = self.inference_with_asr( ++ results, meta_data = self.inference_with_asr( speech_j, input_len=None, model=model, 
kwargs=kwargs, **cfg ) -+ results = results_batch[0] if self.spk_model is not None: - # compose vad segments: [[start_time_sec, end_time_sec, speech], [...]] - for _b in range(len(speech_j)): -@@ -425,8 +419,7 @@ class AutoModel: +@@ -425,8 +421,7 @@ class AutoModel: ) results[_b]["spk_embedding"] = spk_res[0]["spk_embedding"] beg_idx = end_idx @@ -117,32 +134,55 @@ index 01e6aaf6..2081ebb1 100644 if len(results) < 1: continue results_sorted.extend(results) -@@ -478,6 +471,10 @@ class AutoModel: +@@ -478,6 +473,13 @@ class AutoModel: if not len(result["text"].strip()): continue return_raw_text = kwargs.get("return_raw_text", False) + + end_paraformer = time.time() -+ time_stats["paraformer_time"] = end_paraformer - beg_asr_total ++ time_stats["paraformer_time"] = time_stats["paraformer_time"] + end_paraformer - beg_asr_total ++ print("\tFinish recognizing audio using Paraformer within {:.3f} seconds, " ++ "which contains {} segments and batch_size is {}." ++ .format(time_stats["paraformer_time"], n, batch_segments)) + # step.3 compute punc model raw_text = None if self.punc_model is not None: -@@ -489,6 +486,8 @@ class AutoModel: +@@ -489,6 +491,9 @@ class AutoModel: if return_raw_text: result["raw_text"] = raw_text result["text"] = punc_res[0]["text"] + end_punc = time.time() -+ time_stats["punc_time"] = end_punc - end_paraformer ++ time_stats["punc_time"] = time_stats["punc_time"] + end_punc - end_paraformer ++ print("\tFinish adding punctuation using PUNC model within {:.3f} seconds.".format(time_stats["punc_time"])) # speaker embedding cluster after resorted if self.spk_model is not None and kwargs.get("return_spk_res", True): -@@ -575,12 +574,13 @@ class AutoModel: - f"time_escape: {time_escape_total_per_sample:0.3f}" - ) - +@@ -567,20 +572,24 @@ class AutoModel: + results_ret_list.append(result) + end_asr_total = time.time() + time_escape_total_per_sample = end_asr_total - beg_asr_total +- if pbar_total: +- pbar_total.update(1) +- pbar_total.set_description( +- f"rtf_avg: {time_escape_total_per_sample / time_speech_total_per_sample:0.3f}, " +- f"time_speech: {time_speech_total_per_sample: 0.3f}, " +- f"time_escape: {time_escape_total_per_sample:0.3f}" +- ) +- - # end_total = time.time() - # time_escape_total_all_samples = end_total - beg_total ++ print("Finish processing audio which is {:.3f} seconds. " ++ "Time consumption of Paraformer and PUNC is {:.3f} seconds." 
++ .format(time_speech_total_per_sample, time_escape_total_per_sample)) ++ # if pbar_total: ++ # pbar_total.update(1) ++ # pbar_total.set_description( ++ # f"rtf_avg: {time_escape_total_per_sample / time_speech_total_per_sample:0.3f}, " ++ # f"time_speech: {time_speech_total_per_sample: 0.3f}, " ++ # f"time_escape: {time_escape_total_per_sample:0.3f}" ++ # ) ++ + end_total = time.time() + time_stats["end_to_end_time"] = end_total - beg_vad + time_stats["input_speech_time"] = time_speech_total_all_samples @@ -154,27 +194,111 @@ index 01e6aaf6..2081ebb1 100644 def export(self, input=None, **cfg): """ +diff --git a/funasr/frontends/wav_frontend.py b/funasr/frontends/wav_frontend.py +index a4002df5..e62f3baf 100644 +--- a/funasr/frontends/wav_frontend.py ++++ b/funasr/frontends/wav_frontend.py +@@ -134,7 +134,7 @@ class WavFrontend(nn.Module): + mat = kaldi.fbank( + waveform, + num_mel_bins=self.n_mels, +- frame_length=self.frame_length, ++ frame_length=min(self.frame_length, waveform_length/self.fs*1000), + frame_shift=self.frame_shift, + dither=self.dither, + energy_floor=0.0, +@@ -282,30 +282,42 @@ class WavFrontendOnline(nn.Module): + Apply lfr with data + """ + +- LFR_inputs = [] +- # inputs = torch.vstack((inputs_lfr_cache, inputs)) +- T = inputs.shape[0] # include the right context +- T_lfr = int( +- np.ceil((T - (lfr_m - 1) // 2) / lfr_n) +- ) # minus the right context: (lfr_m - 1) // 2 +- splice_idx = T_lfr +- for i in range(T_lfr): +- if lfr_m <= T - i * lfr_n: +- LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).view(1, -1)) +- else: # process last LFR frame +- if is_final: +- num_padding = lfr_m - (T - i * lfr_n) +- frame = (inputs[i * lfr_n :]).view(-1) +- for _ in range(num_padding): +- frame = torch.hstack((frame, inputs[-1])) +- LFR_inputs.append(frame) +- else: +- # update splice_idx and break the circle +- splice_idx = i +- break +- splice_idx = min(T - 1, splice_idx * lfr_n) ++ T, D = inputs.shape ++ if T == 0: ++ return torch.empty(0, D * lfr_m), inputs, 0 ++ ++ # Calculate the number of LFR frames ++ T_lfr = (T - lfr_m) // lfr_n + 1 ++ if T_lfr <= 0: ++ T_lfr = 0 ++ required_length = (T_lfr - 1) * lfr_n + lfr_m ++ ++ # Handle padding ++ if required_length > T: ++ if is_final: ++ pad_amount = required_length - T ++ padding_frame = inputs[-1].unsqueeze(0).expand(pad_amount, D) ++ inputs_padded = torch.cat([inputs, padding_frame], dim=0) ++ else: ++ T_lfr -= 1 ++ required_length = (T_lfr - 1) * lfr_n + lfr_m ++ inputs_padded = inputs[:required_length] ++ else: ++ inputs_padded = inputs[:required_length] ++ ++ # Transpose to [D, T_speed] ++ inputs_padded = inputs_padded.transpose(0, 1) ++ # Apply unfold ++ frames = inputs_padded.unfold(1, lfr_m, lfr_n) # Shape: [D, T_lfr, lfr_m] ++ # Permute to [T_lfr, lfr_m, D] ++ frames = frames.permute(1, 2, 0) ++ # Flatten frames ++ LFR_outputs = frames.contiguous().view(T_lfr, -1) ++ # Update splice_idx and cache ++ splice_idx = T_lfr * lfr_n ++ splice_idx = min(splice_idx, T) ++ + lfr_splice_cache = inputs[splice_idx:, :] +- LFR_outputs = torch.vstack(LFR_inputs) + return LFR_outputs.type(torch.float32), lfr_splice_cache, splice_idx + + @staticmethod diff --git a/funasr/models/bicif_paraformer/cif_predictor.py b/funasr/models/bicif_paraformer/cif_predictor.py -index ca98cdc2..65a08cdb 100644 +index ca98cdc2..796b24dc 100644 --- a/funasr/models/bicif_paraformer/cif_predictor.py +++ b/funasr/models/bicif_paraformer/cif_predictor.py -@@ -238,6 +238,7 @@ class CifPredictorV3(torch.nn.Module): - elif self.tail_threshold > 0.0: - 
hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask) +@@ -412,11 +412,12 @@ class CifPredictorV3Export(torch.nn.Module): + mask = mask.squeeze(-1) + hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, mask=mask) + return hidden, alphas, token_num - acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold) - if target_length is None and self.tail_threshold > 0.0: - token_num_int = torch.max(token_num).type(torch.int32).item() -@@ -282,6 +283,8 @@ class CifPredictorV3(torch.nn.Module): + acoustic_embeds, cif_peak = cif_export(hidden, alphas, self.threshold) + + return acoustic_embeds, token_num, alphas, cif_peak + +- def get_upsample_timestmap(self, hidden, mask=None, token_num=None): ++ def get_upsample_timestamp(self, hidden, mask=None, token_num=None): + h = hidden + b = hidden.shape[0] + context = h.transpose(1, 2) +@@ -437,6 +438,7 @@ class CifPredictorV3Export(torch.nn.Module): + alphas2 = alphas2.squeeze(-1) _token_num = alphas2.sum(-1) - if token_num is not None: - alphas2 *= (token_num / _token_num)[:, None].repeat(1, alphas2.size(1)) -+ + alphas2 *= (token_num / _token_num)[:, None].repeat(1, alphas2.size(1)) + return alphas2 - # re-downsample - ds_alphas = alphas2.reshape(b, -1, self.upsample_times).sum(-1) - ds_cif_peak = cif_wo_hidden(ds_alphas, self.threshold - 1e-4) + # upsampled alphas and cif_peak + us_alphas = alphas2 + us_cif_peak = cif_wo_hidden_export(us_alphas, self.threshold - 1e-4) diff --git a/funasr/models/fsmn_vad_streaming/encoder.py b/funasr/models/fsmn_vad_streaming/encoder.py index 6668c5d5..6b7c80fc 100755 --- a/funasr/models/fsmn_vad_streaming/encoder.py @@ -189,11 +313,109 @@ index 6668c5d5..6b7c80fc 100755 """ Args: diff --git a/funasr/models/fsmn_vad_streaming/model.py b/funasr/models/fsmn_vad_streaming/model.py -index 04689bed..665b93cd 100644 +index 04689bed..9db5bfd0 100644 --- a/funasr/models/fsmn_vad_streaming/model.py +++ b/funasr/models/fsmn_vad_streaming/model.py -@@ -348,7 +348,7 @@ class FsmnVADStreaming(nn.Module): - ) +@@ -18,19 +18,21 @@ from funasr.utils.datadir_writer import DatadirWriter + from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank + + ++# 表示VAD状态机的状态,包括起点未检测、语音片段中、终点已检测 + class VadStateMachine(Enum): + kVadInStateStartPointNotDetected = 1 + kVadInStateInSpeechSegment = 2 + kVadInStateEndPointDetected = 3 + + ++# 表示每一帧的状态,是语音还是静音 + class FrameState(Enum): + kFrameStateInvalid = -1 + kFrameStateSpeech = 1 + kFrameStateSil = 0 + + +-# final voice/unvoice state per frame ++# 表示音频状态的变化,如从静音到语音,从语音到静音等(final voice/unvoice state per frame) + class AudioChangeState(Enum): + kChangeStateSpeech2Speech = 0 + kChangeStateSpeech2Sil = 1 +@@ -40,6 +42,7 @@ class AudioChangeState(Enum): + kChangeStateInvalid = 5 + + ++# 表示VAD的检测模式,支持单一和多重语音检测模式 + class VadDetectMode(Enum): + kVadSingleUtteranceDetectMode = 0 + kVadMutipleUtteranceDetectMode = 1 +@@ -299,6 +302,9 @@ class FsmnVADStreaming(nn.Module): + self.encoder = encoder + self.encoder_conf = encoder_conf + ++ self.ten_sil_pdf_ids = torch.tensor([0]) ++ self.tol_idx = 0 ++ + def ResetDetection(self, cache: dict = {}): + cache["stats"].continous_silence_frame_count = 0 + cache["stats"].latest_confirmed_speech_frame = 0 +@@ -323,32 +329,44 @@ class FsmnVADStreaming(nn.Module): + cache["stats"].scores = cache["stats"].scores[:, real_drop_frames:, :] + + def ComputeDecibel(self, cache: dict = {}) -> None: +- frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000) +- 
frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000) ++ frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000) # 每帧的样本长度 ++ frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000) # 帧移的样本长度 ++ waveform = cache["stats"].waveform[0] ++ # 如果缓存中的"data_buf_all"为空,初始化并将当前波形作为数据缓冲区 + if cache["stats"].data_buf_all is None: +- cache["stats"].data_buf_all = cache["stats"].waveform[ +- 0 +- ] # cache["stats"].data_buf is pointed to cache["stats"].waveform[0] ++ cache["stats"].data_buf_all = waveform + cache["stats"].data_buf = cache["stats"].data_buf_all ++ cache["stats"].prev_waveform_length = 0 # 初始化,用于记录之前处理的波形长度 + else: +- cache["stats"].data_buf_all = torch.cat( +- (cache["stats"].data_buf_all, cache["stats"].waveform[0]) +- ) +- for offset in range( +- 0, cache["stats"].waveform.shape[1] - frame_sample_length + 1, frame_shift_length +- ): +- cache["stats"].decibel.append( +- 10 +- * math.log10( +- (cache["stats"].waveform[0][offset : offset + frame_sample_length]) +- .square() +- .sum() +- + 0.000001 +- ) +- ) ++ # 如果"data_buf_all"不为空,将新的波形数据拼接到已有的缓冲区 ++ cache["stats"].data_buf_all = torch.cat((cache["stats"].data_buf_all, waveform)) ++ cache["stats"].data_buf = cache["stats"].data_buf_all ++ ++ total_waveform_length = cache["stats"].data_buf_all.shape[0] ++ num_total_frames = (total_waveform_length - frame_sample_length) ++ num_existing_frames = len(cache["stats"].decibel) # 已有的帧数 ++ num_new_frames = num_total_frames - num_existing_frames # 需要处理的新帧数 ++ ++ if num_new_frames > 0: ++ # 计算新帧起始索引 ++ start_index = num_existing_frames * frame_shift_length ++ end_index = start_index + num_new_frames * frame_shift_length + frame_sample_length - frame_shift_length ++ ++ # 获取波形数据 ++ processing_waveform = cache["stats"].data_buf_all[start_index:end_index] ++ ++ # 使用unfold将波形数据切分为帧,形状为[num_new_frames, frame_sample_length] ++ frames = processing_waveform.unfold(0, frame_sample_length, frame_shift_length) ++ frame_energies = frames.pow(2).sum(dim=1) + 1e-6 # 形状为[num_new_frames] ++ decibels = 10 * torch.log10(frame_energies) # 计算分贝值 ++ cache["stats"].decibel.extend(decibels.tolist()) # 结果添加到cache["stats"].decibel ++ ++ # 更新prev_waveform_length,指向已处理的波形位置 ++ cache["stats"].prev_waveform_length = start_index + num_new_frames * frame_shift_length ++ def ComputeScores(self, feats: torch.Tensor, cache: dict = {}) -> None: - scores = self.encoder(feats, cache=cache["encoder"]).to("cpu") # return B * T * D @@ -201,102 +423,314 @@ index 04689bed..665b93cd 100644 assert ( scores.shape[1] == feats.shape[1] ), "The shape between feats and scores does not match" -diff --git a/funasr/models/sanm/attention.py b/funasr/models/sanm/attention.py -index c7e8a8e0..0f1862ae 100644 ---- a/funasr/models/sanm/attention.py -+++ b/funasr/models/sanm/attention.py -@@ -275,13 +275,15 @@ class MultiHeadedAttentionSANM(nn.Module): - "inf" - ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) - scores = scores.masked_fill(mask, min_value) -- self.attn = torch.softmax(scores, dim=-1).masked_fill( -- mask, 0.0 -- ) # (batch, head, time1, time2) -+ # self.attn = torch.softmax(scores, dim=-1).masked_fill( -+ # mask, 0.0 -+ # ) # (batch, head, time1, time2) -+ attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) +@@ -498,13 +516,14 @@ class FsmnVADStreaming(nn.Module): + return vad_latency + + def GetFrameState(self, t: int, cache: dict = {}): +- frame_state = FrameState.kFrameStateInvalid ++ frame_state = 
FrameState.kFrameStateInvalid # 初始化当前帧的状态为无效状态 + cur_decibel = cache["stats"].decibel[t] + cur_snr = cur_decibel - cache["stats"].noise_average_decibel + # for each frame, calc log posterior probability of each state ++ # 判断当前帧的分贝值是否小于预设的分贝阈值,如果是,则认为是静音帧 + if cur_decibel < self.vad_opts.decibel_thres: +- frame_state = FrameState.kFrameStateSil +- self.DetectOneFrame(frame_state, t, False, cache=cache) ++ frame_state = FrameState.kFrameStateSil # 设置帧状态为静音 ++ self.DetectOneFrame(frame_state, t, False, cache=cache) # 检测静音帧 + return frame_state + + sum_score = 0.0 +@@ -512,14 +531,18 @@ class FsmnVADStreaming(nn.Module): + assert len(cache["stats"].sil_pdf_ids) == self.vad_opts.silence_pdf_num + if len(cache["stats"].sil_pdf_ids) > 0: + assert len(cache["stats"].scores) == 1 # 只支持batch_size = 1的测试 +- sil_pdf_scores = [ +- cache["stats"].scores[0][t][sil_pdf_id] for sil_pdf_id in cache["stats"].sil_pdf_ids +- ] ++ ++ scores_tensor = cache["stats"].scores[0][t] ++ sil_pdf_scores = scores_tensor[self.ten_sil_pdf_ids].tolist() # 使用张量索引直接获取sil_pdf_scores ++ ++ # 计算噪声的概率,使用对数概率并乘以语音与噪声的比例 + sum_score = sum(sil_pdf_scores) + noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio + total_score = 1.0 + sum_score = total_score - sum_score + speech_prob = math.log(sum_score) ++ ++ # 如果需要输出帧的概率,则将噪声和语音概率保存到缓存中 + if self.vad_opts.output_frame_probs: + frame_prob = E2EVadFrameProb() + frame_prob.noise_prob = noise_prob +@@ -527,16 +550,22 @@ class FsmnVADStreaming(nn.Module): + frame_prob.score = sum_score + frame_prob.frame_id = t + cache["stats"].frame_probs.append(frame_prob) ++ ++ # 判断当前帧是否为语音帧,基于语音和噪声的概率以及设定的阈值 + if math.exp(speech_prob) >= math.exp(noise_prob) + cache["stats"].speech_noise_thres: ++ # 如果信噪比和分贝值都超过了阈值,则认为是语音帧 + if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres: +- frame_state = FrameState.kFrameStateSpeech ++ frame_state = FrameState.kFrameStateSpeech # 设置帧状态为语音 + else: +- frame_state = FrameState.kFrameStateSil ++ frame_state = FrameState.kFrameStateSil # 设置帧状态为静音 ++ # 如果语音概率低于噪声概率,直接将帧状态设置为静音 else: -- self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) -+ # self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) -+ attn = torch.softmax(scores, dim=-1) - -- p_attn = self.dropout(self.attn) -+ p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = ( - x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) -@@ -683,18 +685,22 @@ class MultiHeadedAttentionCrossAtt(nn.Module): - # logging.info( - # "scores: {}, mask_size: {}".format(scores.size(), mask.size())) - scores = scores.masked_fill(mask, min_value) -- self.attn = torch.softmax(scores, dim=-1).masked_fill( -- mask, 0.0 -- ) # (batch, head, time1, time2) -+ # self.attn = torch.softmax(scores, dim=-1).masked_fill( -+ # mask, 0.0 -+ # ) # (batch, head, time1, time2) -+ attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) + frame_state = FrameState.kFrameStateSil ++ # 更新噪声的平均分贝值,用于后续帧的信噪比计算 + if cache["stats"].noise_average_decibel < -99.9: + cache["stats"].noise_average_decibel = cur_decibel + else: ++ # 平滑更新噪声的平均分贝值,基于一定数量的帧 + cache["stats"].noise_average_decibel = ( + cur_decibel + + cache["stats"].noise_average_decibel +@@ -556,21 +585,26 @@ class FsmnVADStreaming(nn.Module): + # if len(cache) == 0: + # self.AllResetDetection() + # self.waveform = waveform # compute decibel for each frame +- cache["stats"].waveform = waveform +- is_streaming_input = kwargs.get("is_streaming_input", 
True) ++ cache["stats"].waveform = waveform # 将输入的音频波形存入缓存的统计部分 ++ is_streaming_input = kwargs.get("is_streaming_input", True) # 是否为流式输入,默认是流式模式 ++ # 计算当前音频的分贝值,并更新缓存中的相关统计数据 + self.ComputeDecibel(cache=cache) ++ # 根据提取的音频特征计算得分 + self.ComputeScores(feats, cache=cache) ++ # 如果当前音频段不是最后一个段,则检测常规帧 + if not is_final: + self.DetectCommonFrames(cache=cache) else: - self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) -- p_attn = self.dropout(self.attn) -+ attn = torch.softmax(scores, dim=-1) -+ # p_attn = self.dropout(self.attn) -+ p_attn = self.dropout(attn) - x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) - x = ( - x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) - ) # (batch, time1, d_model) - if ret_attn: -- return self.linear_out(x), self.attn # (batch, time1, d_model) -+ # return self.linear_out(x), self.attn # (batch, time1, d_model) -+ return self.linear_out(x), attn - return self.linear_out(x) # (batch, time1, d_model) - - def forward(self, x, memory, memory_mask, ret_attn=False): + self.DetectLastFrames(cache=cache) + segments = [] ++ # 遍历每个批次的特征数据,当前只支持batch_size=1 + for batch_num in range(0, feats.shape[0]): # only support batch_size = 1 now + segment_batch = [] + if len(cache["stats"].output_data_buf) > 0: + for i in range( + cache["stats"].output_data_buf_offset, len(cache["stats"].output_data_buf) + ): ++ # 流式输入情况 + if ( + is_streaming_input + ): # in this case, return [beg, -1], [], [-1, end], [beg, end] +@@ -594,22 +628,24 @@ class FsmnVADStreaming(nn.Module): + end_ms = -1 + cache["stats"].next_seg = False + segment = [start_ms, end_ms] +- ++ # 非流式输入情况 + else: # in this case, return [beg, end] +- ++ # 如果当前段没有起始或者结束点,并且不是最后一个段,则跳过 + if not is_final and ( + not cache["stats"].output_data_buf[i].contain_seg_start_point + or not cache["stats"].output_data_buf[i].contain_seg_end_point + ): + continue ++ # 获取当前段的起始和结束时间 + segment = [ + cache["stats"].output_data_buf[i].start_ms, + cache["stats"].output_data_buf[i].end_ms, + ] ++ # 更新缓存中的偏移量 + cache["stats"].output_data_buf_offset += 1 # need update this parameter + +- segment_batch.append(segment) +- ++ segment_batch.append(segment) # 将当前段加入批次段列表 ++ # 如果当前批次中有有效的段,加入到总体段列表中 + if segment_batch: + segments.append(segment_batch) + # if is_final: +@@ -655,24 +691,28 @@ class FsmnVADStreaming(nn.Module): + cache: dict = {}, + **kwargs, + ): +- ++ # 初始化缓存,如果缓存为空,初始化缓存以进行推理 + if len(cache) == 0: + self.init_cache(cache, **kwargs) + + meta_data = {} ++ # 获取分块大小,默认为60000 + chunk_size = kwargs.get("chunk_size", 60000) # 50ms +- chunk_stride_samples = int(chunk_size * frontend.fs / 1000) ++ chunk_stride_samples = int(chunk_size * frontend.fs / 1000) # 计算每个分块的步长,以采样率frontend.fs为基础,计算出每块的音频样本数 + + time1 = time.perf_counter() ++ # 判断是否为流式输入,依据chunk_size决定是否开启流式模式 + is_streaming_input = ( + kwargs.get("is_streaming_input", False) + if chunk_size >= 15000 + else kwargs.get("is_streaming_input", True) + ) ++ # 判断是否为最终块,流式输入时由"is_streaming_input"决定,非流式时总是"True" + is_final = ( + kwargs.get("is_final", False) if is_streaming_input else kwargs.get("is_final", True) + ) +- cfg = {"is_final": is_final, "is_streaming_input": is_streaming_input} ++ cfg = {"is_final": is_final, "is_streaming_input": is_streaming_input} # 传递推理配置参数,包括是否为流式输入和是否为最终块 ++ # 加载音频数据,同时支持加载文本、图像和视频,默认是"sound"类型 + audio_sample_list = load_audio_text_image_video( + data_in, + fs=frontend.fs, +@@ -685,16 +725,18 @@ class FsmnVADStreaming(nn.Module): + is_streaming_input = cfg["is_streaming_input"] + time2 = time.perf_counter() + 
meta_data["load_data"] = f"{time2 - time1:0.3f}" +- assert len(audio_sample_list) == 1, "batch_size must be set 1" +- ++ self.ten_sil_pdf_ids = torch.tensor(cache["stats"].sil_pdf_ids) ++ assert len(audio_sample_list) == 1, "batch_size must be set 1" # 确保一次只能处理一个音频样本 ++ # 将当前缓存中的音频样本与新的音频样本拼接在一起 + audio_sample = torch.cat((cache["prev_samples"], audio_sample_list[0])) + +- n = int(len(audio_sample) // chunk_stride_samples + int(_is_final)) +- m = int(len(audio_sample) % chunk_stride_samples * (1 - int(_is_final))) ++ n = int(len(audio_sample) // chunk_stride_samples + int(_is_final)) # 根据音频样本的长度和块大小计算分块数 ++ m = int(len(audio_sample) % chunk_stride_samples * (1 - int(_is_final))) # 计算最后一个块的剩余长度 + segments = [] ++ # 遍历每个分块进行推理 + for i in range(n): +- kwargs["is_final"] = _is_final and i == n - 1 +- audio_sample_i = audio_sample[i * chunk_stride_samples : (i + 1) * chunk_stride_samples] ++ kwargs["is_final"] = _is_final and i == n - 1 # 对每一个分块,确定是否为最后一块 ++ audio_sample_i = audio_sample[i * chunk_stride_samples : (i + 1) * chunk_stride_samples] # 取出当前分块的音频样本 + + # extract fbank feats + speech, speech_lengths = extract_fbank( +@@ -719,20 +761,20 @@ class FsmnVADStreaming(nn.Module): + "cache": cache, + "is_streaming_input": is_streaming_input, + } +- segments_i = self.forward(**batch) ++ segments_i = self.forward(**batch) # 前向传播 + if len(segments_i) > 0: + segments.extend(*segments_i) +- ++ # 如果当前分块是最后一块,重新初始化缓存 + cache["prev_samples"] = audio_sample[:-m] + if _is_final: + self.init_cache(cache) +- ++ # 如果输出目录存在,初始化文件写入器 + ibest_writer = None + if kwargs.get("output_dir") is not None: + if not hasattr(self, "writer"): + self.writer = DatadirWriter(kwargs.get("output_dir")) + ibest_writer = self.writer[f"{1}best_recog"] +- ++ # 最终结果列表 + results = [] + result_i = {"key": key[0], "value": segments} + # if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas": +@@ -755,34 +797,35 @@ class FsmnVADStreaming(nn.Module): + def DetectCommonFrames(self, cache: dict = {}) -> int: + if cache["stats"].vad_state_machine == VadStateMachine.kVadInStateEndPointDetected: + return 0 ++ frame_states = [FrameState.kFrameStateInvalid] * self.vad_opts.nn_eval_block_size ++ # 批量计算多个帧的状态 + for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1): +- frame_state = FrameState.kFrameStateInvalid +- frame_state = self.GetFrameState( ++ frame_states[i] = self.GetFrameState( + cache["stats"].frm_cnt - 1 - i - cache["stats"].last_drop_frames, cache=cache + ) +- self.DetectOneFrame(frame_state, cache["stats"].frm_cnt - 1 - i, False, cache=cache) ++ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1): ++ self.DetectOneFrame(frame_states[i], cache["stats"].frm_cnt - 1 - i, False, cache=cache) + + return 0 + + def DetectLastFrames(self, cache: dict = {}) -> int: +- if cache["stats"].vad_state_machine == VadStateMachine.kVadInStateEndPointDetected: +- return 0 ++ frame_states = [FrameState.kFrameStateInvalid] * self.vad_opts.nn_eval_block_size + for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1): +- frame_state = FrameState.kFrameStateInvalid +- frame_state = self.GetFrameState( ++ frame_states[i] = self.GetFrameState( + cache["stats"].frm_cnt - 1 - i - cache["stats"].last_drop_frames, cache=cache + ) ++ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1): + if i != 0: +- self.DetectOneFrame(frame_state, cache["stats"].frm_cnt - 1 - i, False, cache=cache) ++ self.DetectOneFrame(frame_states[i], cache["stats"].frm_cnt - 1 - i, False, cache=cache) + else: +- 
self.DetectOneFrame(frame_state, cache["stats"].frm_cnt - 1, True, cache=cache) +- +- return 0 ++ self.DetectOneFrame(frame_states[i], cache["stats"].frm_cnt - 1, True, cache=cache) + + def DetectOneFrame( + self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool, cache: dict = {} + ) -> None: +- tmp_cur_frm_state = FrameState.kFrameStateInvalid ++ tmp_cur_frm_state = FrameState.kFrameStateInvalid # 初始化当前帧的临时状态为无效状态 ++ ++ # 根据当前帧的状态确定临时状态 + if cur_frm_state == FrameState.kFrameStateSpeech: + if math.fabs(1.0) > self.vad_opts.fe_prior_thres: + tmp_cur_frm_state = FrameState.kFrameStateSpeech +@@ -793,7 +836,8 @@ class FsmnVADStreaming(nn.Module): + state_change = cache["windows_detector"].DetectOneFrame( + tmp_cur_frm_state, cur_frm_idx, cache=cache + ) +- frm_shift_in_ms = self.vad_opts.frame_in_ms ++ frm_shift_in_ms = self.vad_opts.frame_in_ms # 获取帧移的时间 ++ # 检测状态转换:从静音到语音 + if AudioChangeState.kChangeStateSil2Speech == state_change: + silence_frame_count = cache["stats"].continous_silence_frame_count + cache["stats"].continous_silence_frame_count = 0 +@@ -873,10 +917,12 @@ class FsmnVADStreaming(nn.Module): + self.OnVoiceEnd(0, True, False, cache=cache) + cache["stats"].vad_state_machine = VadStateMachine.kVadInStateEndPointDetected + else: +- if cur_frm_idx >= self.LatencyFrmNumAtStartPoint(cache=cache): ++ lfasp = self.LatencyFrmNumAtStartPoint(cache=cache) ++ if cur_frm_idx >= lfasp: + self.OnSilenceDetected( +- cur_frm_idx - self.LatencyFrmNumAtStartPoint(cache=cache), cache=cache ++ cur_frm_idx - lfasp, cache=cache + ) ++ # 如果处于语音段中,检查是否超时或静音段是否过长 + elif cache["stats"].vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment: + if ( + cache["stats"].continous_silence_frame_count * frm_shift_in_ms diff --git a/funasr/models/sanm/encoder.py b/funasr/models/sanm/encoder.py -index b2a442bd..c8044cee 100644 +index b2a442bd..4b244353 100644 --- a/funasr/models/sanm/encoder.py +++ b/funasr/models/sanm/encoder.py -@@ -15,7 +15,7 @@ import torch.nn.functional as F - +@@ -16,6 +16,7 @@ import torch.nn.functional as F import numpy as np from funasr.train_utils.device_funcs import to_device --from funasr.models.transformer.utils.nets_utils import make_pad_mask -+from funasr.models.transformer.utils.nets_utils import make_pad_mask, make_pad_mask_new + from funasr.models.transformer.utils.nets_utils import make_pad_mask ++from funasr.utils.torch_function import sequence_mask from funasr.models.sanm.attention import MultiHeadedAttention, MultiHeadedAttentionSANM from funasr.models.transformer.embedding import ( SinusoidalPositionEncoder, -@@ -374,7 +374,8 @@ class SANMEncoder(nn.Module): +@@ -355,6 +356,8 @@ class SANMEncoder(nn.Module): + self.tf2torch_tensor_name_prefix_torch = tf2torch_tensor_name_prefix_torch + self.tf2torch_tensor_name_prefix_tf = tf2torch_tensor_name_prefix_tf + ++ self.make_pad_mask = sequence_mask() ++ + def output_size(self) -> int: + return self._output_size + +@@ -374,7 +377,7 @@ class SANMEncoder(nn.Module): Returns: position embedded tensor and mask """ - masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) -+ # masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) -+ masks = (~make_pad_mask_new(ilens)[:, None, :]).to(xs_pad.device) ++ masks = (self.make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) xs_pad = xs_pad * self.output_size() ** 0.5 if self.embed is None: xs_pad = xs_pad -diff --git a/funasr/models/transformer/utils/nets_utils.py b/funasr/models/transformer/utils/nets_utils.py -index 29d23ee5..19693c9e 100644 ---- 
a/funasr/models/transformer/utils/nets_utils.py -+++ b/funasr/models/transformer/utils/nets_utils.py -@@ -218,6 +218,15 @@ def make_pad_mask(lengths, xs=None, length_dim=-1, maxlen=None): - return mask - - -+def make_pad_mask_new(lengths): -+ maxlen = lengths.max() -+ row_vector = torch.arange(0, maxlen, 1).to(lengths.device) -+ matrix = torch.unsqueeze(lengths, dim=-1) -+ mask = row_vector >= matrix -+ mask = mask.detach() -+ return mask -+ -+ - def make_non_pad_mask(lengths, xs=None, length_dim=-1): - """Make mask tensor containing indices of non-padded part. - diff --git a/funasr/models/transformer/utils/repeat.py b/funasr/models/transformer/utils/repeat.py index a44c1a01..0935d854 100644 --- a/funasr/models/transformer/utils/repeat.py diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py index c02daff479..7f6d829303 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_auto_model.py @@ -4,11 +4,13 @@ sys.path.append("./FunASR") import torch import time import logging +from tqdm import tqdm from mindie_paraformer import MindieBiCifParaformer from mindie_encoder_decoder import MindieEncoder, MindieDecoder from mindie_punc import MindiePunc, MindieCTTransformer from mindie_cif import MindieCifTimestamp, MindieCif +from mindie_vad import MindieVAD from funasr.auto.auto_model import AutoModel, download_model, tables, deep_update, \ load_pretrained_model, prepare_data_iterator @@ -67,98 +69,35 @@ class MindieAutoModel(AutoModel): @staticmethod def export_model(**kwargs): - # load model config - assert "model" in kwargs - if "model_conf" not in kwargs: - print("download models from model hub: {}".format(kwargs.get("hub", "ms"))) - kwargs = download_model(**kwargs) + model, kwargs = AutoModel.build_model(**kwargs) - kwargs["batch_size"] = 1 - kwargs["device"] = "cpu" - - # build tokenizer - tokenizer = kwargs.get("tokenizer", None) - if tokenizer is not None: - tokenizer_class = tables.tokenizer_classes.get(tokenizer) - tokenizer = tokenizer_class(**kwargs.get("tokenizer_conf", {})) - kwargs["token_list"] = ( - tokenizer.token_list if hasattr(tokenizer, "token_list") else None - ) - kwargs["token_list"] = ( - tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else kwargs["token_list"] - ) - vocab_size = len(kwargs["token_list"]) if kwargs["token_list"] is not None else -1 - if vocab_size == -1 and hasattr(tokenizer, "get_vocab_size"): - vocab_size = tokenizer.get_vocab_size() + if kwargs["compile_type"] == "punc": + punc = MindiePunc(model) + MindiePunc.export(punc, kwargs["compiled_path"], kwargs["soc_version"]) + elif kwargs["compile_type"] == "vad": + vad = MindieVAD(model) + MindieVAD.export(vad, kwargs["compiled_path"], kwargs["soc_version"]) else: - vocab_size = -1 - kwargs["tokenizer"] = tokenizer + import copy + from funasr.models.bicif_paraformer.export_meta import export_rebuild_model - # build frontend - frontend = kwargs.get("frontend", None) - kwargs["input_size"] = None - if frontend is not None: - frontend_class = tables.frontend_classes.get(frontend) - frontend = frontend_class(**kwargs.get("frontend_conf", {})) - kwargs["input_size"] = ( - frontend.output_size() if hasattr(frontend, "output_size") else None - ) - kwargs["frontend"] = frontend - - model_conf = {} - deep_update(model_conf, kwargs.get("model_conf", {})) - deep_update(model_conf, kwargs) - init_param = kwargs.get("init_param", 
None) + kwargs_new = copy.deepcopy(kwargs) + kwargs_new['onnx'] = False + kwargs_new["max_seq_len"] = 512 + del kwargs_new["model"] + model = export_rebuild_model(model, **kwargs_new) - if kwargs["compile_type"] == "punc": - model = MindiePunc(**model_conf, vocab_size=vocab_size) - model.eval() - print(f"Loading pretrained params from {init_param}") - load_pretrained_model( - model=model, - path=init_param, - ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), - oss_bucket=kwargs.get("oss_bucket", None), - scope_map=kwargs.get("scope_map", []), - excludes=kwargs.get("excludes", None), - ) - MindiePunc.export_ts(model, kwargs["compiled_path"]) - else: - # compile encoder - encoder = MindieEncoder(**model_conf, vocab_size=vocab_size) - encoder.eval() - print(f"Loading pretrained params from {init_param}") - load_pretrained_model( - model=encoder, - path=init_param, - ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), - oss_bucket=kwargs.get("oss_bucket", None), - scope_map=kwargs.get("scope_map", []), - excludes=kwargs.get("excludes", None), - ) - MindieEncoder.export_ts(encoder, kwargs["compiled_encoder"], kwargs["traced_encoder"]) - - # compile decoder - decoder = MindieDecoder(**model_conf, vocab_size=vocab_size) - decoder.eval() - print(f"Loading pretrained params from {init_param}") - load_pretrained_model( - model=decoder, - path=init_param, - ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), - oss_bucket=kwargs.get("oss_bucket", None), - scope_map=kwargs.get("scope_map", []), - excludes=kwargs.get("excludes", None), - ) - MindieDecoder.export_ts(decoder, kwargs["compiled_decoder"], kwargs["traced_decoder"]) + encoder = MindieEncoder(model) + MindieEncoder.export_ts(encoder, kwargs["compiled_encoder"], kwargs["soc_version"], kwargs["traced_encoder"]) + + decoder = MindieDecoder(model) + MindieDecoder.export_ts(decoder, kwargs["compiled_decoder"], kwargs["soc_version"], kwargs["traced_decoder"]) - # compile cif - mindie_cif = MindieCif(decoder.predictor.threshold, kwargs["cif_interval"]) - mindie_cif.export_ts(kwargs["compiled_cif"]) + mindie_cif = MindieCif(model.predictor.threshold, kwargs["cif_interval"]) + mindie_cif.export_ts(kwargs["compiled_cif"], kwargs["soc_version"]) - # compile cif_timestamp - mindie_cif_timestamp = MindieCifTimestamp(decoder.predictor.threshold - 1e-4, kwargs["cif_timestamp_interval"]) - mindie_cif_timestamp.export_ts(kwargs["compiled_cif_timestamp"]) + mindie_cif_timestamp = MindieCifTimestamp(model.predictor.threshold - 1e-4, kwargs["cif_timestamp_interval"]) + mindie_cif_timestamp.export_ts(kwargs["compiled_cif_timestamp"], kwargs["soc_version"]) def build_model_with_mindie(self, **kwargs): assert "model" in kwargs @@ -221,7 +160,7 @@ class MindieAutoModel(AutoModel): return model, kwargs - def inference_with_asr(self, input, input_len=None, model=None, kwargs=None, key=None, **cfg): + def inference_with_asr(self, input, input_len=None, model=None, kwargs=None, key=None, display_pbar=False, **cfg): kwargs = self.kwargs if kwargs is None else kwargs deep_update(kwargs, cfg) model = self.model if model is None else model @@ -233,12 +172,15 @@ class MindieAutoModel(AutoModel): input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key ) - time_stats = {"input_speech_time": 0.0, "end_to_end_time": 0.0, "pure_infer_time": 0.0, + time_stats = {"rtf_avg": 0.0, "input_speech_time": 0.0, "end_to_end_time": 0.0, "pure_infer_time": 0.0, "load_data": 0.0, "encoder": 0.0, "predictor": 0.0, "decoder": 0.0, 
"predictor_timestamp": 0.0, "post_process": 0.0} asr_result_list = [] num_samples = len(data_list) + if display_pbar: + pbar = tqdm(colour="blue", total=num_samples, dynamic_ncols=True) + for beg_idx in range(0, num_samples, batch_size): end_idx = min(num_samples, beg_idx + batch_size) data_batch = data_list[beg_idx:end_idx] @@ -272,7 +214,12 @@ class MindieAutoModel(AutoModel): time_stats["end_to_end_time"] += time_escape time_stats["input_speech_time"] += batch_data_time - - time_stats["pure_infer_time"] = time_stats["end_to_end_time"] - time_stats["load_data"] + + time_stats["pure_infer_time"] = time_stats["end_to_end_time"] - time_stats["load_data"] + time_stats["rtf_avg"] = time_stats["input_speech_time"] / time_stats["pure_infer_time"] + + if display_pbar: + pbar.update(batch_size) + pbar.set_description("rtf_avg:{:.3f}".format(time_stats["rtf_avg"])) return asr_result_list, time_stats \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py index e706e3e79c..1782973ed3 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_cif.py @@ -84,8 +84,8 @@ class MindieCif(torch.nn.Module): return frame, integrate_new, frame_new - def export_ts(self, path="./compiled_cif.ts"): - print("Begin trace cif!") + def export_ts(self, path="./compiled_cif.pt", soc_version="Ascendxxx"): + print("Begin tracing cif function.") input_shape1 = (1, self.seq_len, 512) input_shape2 = (1, self.seq_len) @@ -102,31 +102,28 @@ class MindieCif(torch.nn.Module): mindietorch.Input(shape = input_shape4, dtype = torch.float32)] export_model = torch.jit.trace(self, example_inputs=(hidden, alphas, integrate, frame)) - print("Finish trace cif") + print("Finish tracing cif function.") compiled_model = mindietorch.compile( export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, default_buffer_size_vec = [1, 10, 10], - soc_version = "Ascend310P3", + soc_version = soc_version, ir = "ts" ) - print("mindietorch compile done !") compiled_model.save(path) + print("Finish compiling cif function, compiled model is saved in {}.".format(path)) # compiled_model = torch.jit.load(path) - print("start to check the percision of cif model.") + print("Start checking the percision of cif function.") sample_hidden = torch.randn(input_shape1, dtype=torch.float32) sample_alphas = torch.randn(input_shape2, dtype=torch.float32) sample_integrate = torch.randn(input_shape3, dtype=torch.float32) sample_frame = torch.randn(input_shape4, dtype=torch.float32) mrt_res = compiled_model(sample_hidden.to("npu"), sample_alphas.to("npu"), sample_integrate.to("npu"), sample_frame.to("npu")) - print("mindie infer done !") ref_res = self.forward(sample_hidden, sample_alphas, sample_integrate, sample_frame) - print("torch infer done !") - precision_eval(mrt_res, ref_res) @@ -141,8 +138,8 @@ class MindieCifTimestamp(torch.nn.Module): return us_peaks, integrate_new - def export_ts(self, path="./compiled_cif_timestamp.ts"): - print("Begin trace cif_timestamp!") + def export_ts(self, path="./compiled_cif_timestamp.ts", soc_version="Ascend310P3"): + print("Begin tracing cif_timestamp function.") input_shape1 = (1, self.seq_len) input_shape2 = (1, ) @@ -153,26 +150,23 @@ class MindieCifTimestamp(torch.nn.Module): mindietorch.Input(shape = input_shape2, dtype = torch.float32)] export_model = torch.jit.trace(self, example_inputs=(us_alphas, 
integrate)) - print("Finish trace cif_timestamp") + print("Finish tracing cif_timestamp function.") compiled_model = mindietorch.compile( export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, default_buffer_size_vec = [1, 10], - soc_version = "Ascend310P3", + soc_version = soc_version, ir = "ts" ) - print("mindietorch compile done !") compiled_model.save(path) + print("Finish compiling cif_timestamp function, compiled model is saved in {}.".format(path)) # compiled_model = torch.jit.load(path) - print("start to check the percision of cif_timestamp model.") + print("Start checking the percision of cif_timestamp function.") sample_input1 = torch.randn(input_shape1, dtype=torch.float32) sample_input2 = torch.randn(input_shape2, dtype=torch.float32) mrt_res = compiled_model(sample_input1.to("npu"), sample_input2.to("npu")) - print("mindie infer done !") ref_res = self.forward(sample_input1, sample_input2) - print("torch infer done !") - precision_eval(mrt_res, ref_res) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py index 7cbb952329..39ca45313d 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_encoder_decoder.py @@ -5,50 +5,36 @@ sys.path.append("./FunASR") import torch import mindietorch -from mindie_paraformer import precision_eval -from funasr.models.bicif_paraformer.model import Paraformer -from funasr.models.transformer.utils.nets_utils import make_pad_mask_new - - -class MindieEncoder(Paraformer): - def __init__( - self, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) + +class MindieEncoder(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.eval() def forward(self, speech, speech_length): - encoder_out, encoder_out_lens = self.encode(speech, speech_length) - - encoder_out_lens = encoder_out_lens.to(torch.int32) - - encoder_out_mask = ( - ~make_pad_mask_new(encoder_out_lens)[:, None, :] - ).to(encoder_out.device) - hidden, alphas, pre_token_length = ( - self.predictor(encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id) - ) - - return encoder_out, encoder_out_lens, hidden, alphas, pre_token_length + batch = {"speech": speech, "speech_lengths": speech_length} + enc, enc_len = self.model.encoder(**batch) + mask = self.model.make_pad_mask(enc_len)[:, None, :] + hidden, alphas, pre_token_length = self.model.predictor(enc, mask) + return enc, hidden, alphas, pre_token_length @staticmethod - def export_ts(encoder, path="./compiled_encoder.ts", traced_path="./traced_encoder.ts"): - print("Begin trace encoder!") + def export_ts(encoder, path="./compiled_encoder.pt", soc_version="Ascendxxx", traced_path=None): + print("Begin tracing encoder.") input_shape = (2, 50, 560) min_shape = (-1, -1, 560) max_shape = (-1, -1, 560) - if traced_path is not None and os.path.exists(traced_path): export_model = torch.load(traced_path) - print("load existing traced_encoder") + print("Load existing traced encoder from {}".format(traced_path)) else: input_speech = torch.randn(input_shape, dtype=torch.float32) input_speech_lens = torch.tensor([50, 25], dtype=torch.int32) export_model = torch.jit.trace(encoder, example_inputs=(input_speech, input_speech_lens)) + print("Finish tracing encoder.") compile_inputs = [mindietorch.Input(min_shape = min_shape, max_shape = max_shape, dtype = 
torch.float32), mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32)] @@ -57,49 +43,40 @@ class MindieEncoder(Paraformer): export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, - default_buffer_size_vec = [400, 1, 1, 400, 1], - soc_version = "Ascend310P3", + default_buffer_size_vec = [400, 1, 400, 1], + soc_version = soc_version, ir = "ts" ) - print("mindietorch compile done !") compiled_model.save(path) + print("Finish compiling encoder, compiled model is saved in {}.".format(path)) # compiled_model = torch.jit.load(path) - print("start to check the percision of encoder.") + print("Start checking the percision of encoder.") sample_speech = torch.randn((4, 100, 560), dtype=torch.float32) sample_speech_lens = torch.tensor([100, 50, 100, 25], dtype=torch.int32) - mrt_res = compiled_model(sample_speech.to("npu"), sample_speech_lens.to("npu")) - print("mindie infer done !") - # ref_res = export_model(sample_speech, sample_speech_lens) - # print("torch infer done !") + _ = compiled_model(sample_speech.to("npu"), sample_speech_lens.to("npu")) + print("Finish checking encoder.") - # precision_eval(mrt_res, ref_res) - -class MindieDecoder(Paraformer): - def __init__( - self, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) +class MindieDecoder(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.eval() def forward(self, encoder_out, encoder_out_lens, sematic_embeds, pre_token_length): - decoder_outs = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, pre_token_length) + decoder_outs = self.model.decoder(encoder_out, encoder_out_lens, sematic_embeds, pre_token_length) decoder_out = decoder_outs[0] decoder_out = torch.log_softmax(decoder_out, dim=-1) - encoder_out_mask = ( - ~make_pad_mask_new(encoder_out_lens)[:, None, :] - ).to(encoder_out.device) + encoder_out_mask = self.model.make_pad_mask(encoder_out_lens)[:, None, :] - us_alphas = self.predictor.get_upsample_timestamp(encoder_out, encoder_out_mask, pre_token_length) + us_alphas = self.model.predictor.get_upsample_timestamp(encoder_out, encoder_out_mask, pre_token_length) return decoder_out, us_alphas @staticmethod - def export_ts(decoder, path="./compiled_decoder.ts", traced_path="./traced_decoder.ts"): - print("Begin trace decoder!") + def export_ts(decoder, path="./compiled_decoder.pt", soc_version="Ascendxxx", traced_path=None): + print("Begin tracing decoder.") input_shape1 = (2, 939, 512) min_shape1 = (-1, -1, 512) @@ -111,7 +88,7 @@ class MindieDecoder(Paraformer): if traced_path is not None and os.path.exists(traced_path): export_model = torch.load(traced_path) - print("load existing traced_decoder") + print("Load existing traced decoder from {}".format(traced_path)) else: encoder_out = torch.randn(input_shape1, dtype=torch.float32) encoder_out_lens = torch.tensor([939, 500], dtype=torch.int32) @@ -119,7 +96,7 @@ class MindieDecoder(Paraformer): sematic_embeds_lens = torch.tensor([261, 100], dtype=torch.int32) export_model = torch.jit.trace(decoder, example_inputs=(encoder_out, encoder_out_lens, sematic_embeds, sematic_embeds_lens)) - print("Finish trace decoder") + print("Finish tracing decoder.") compile_inputs = [mindietorch.Input(min_shape = min_shape1, max_shape = max_shape1, dtype = torch.float32), mindietorch.Input(min_shape = (-1, ), max_shape = (-1, ), dtype = torch.int32), @@ -131,21 +108,17 @@ class MindieDecoder(Paraformer): inputs = compile_inputs, precision_policy = 
mindietorch.PrecisionPolicy.PREF_FP16, default_buffer_size_vec = [800, 10], - soc_version = "Ascend310P3", + soc_version = soc_version, ir = "ts" ) - print("mindietorch compile done !") compiled_model.save(path) + print("Finish compiling decoder, compiled model is saved in {}.".format(path)) # compiled_model = torch.jit.load(path) - print("start to check the percision of decoder.") + print("Start checking the percision of decoder.") sample_encoder = torch.randn((4, 150, 512), dtype=torch.float32) sample_encoder_lens = torch.tensor([150, 100, 150, 50], dtype=torch.int32) sample_sematic = torch.randn((4, 50, 512), dtype=torch.float32) sample_sematic_lens = torch.tensor([50, 30, 50, 10], dtype=torch.int32) - mrt_res = compiled_model(sample_encoder.to("npu"), sample_encoder_lens.to("npu"), sample_sematic.to("npu"), sample_sematic_lens.to("npu")) - print("mindie infer done !") - # ref_res = export_model(sample_encoder, sample_encoder_lens, sample_sematic, sample_sematic_lens) - # print("torch infer done !") - - # precision_eval(mrt_res, ref_res) \ No newline at end of file + _ = compiled_model(sample_encoder.to("npu"), sample_encoder_lens.to("npu"), sample_sematic.to("npu"), sample_sematic_lens.to("npu")) + print("Finish checking decoder.") \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_fa.patch b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_fa.patch new file mode 100644 index 0000000000..6e25149718 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_fa.patch @@ -0,0 +1,121 @@ +diff --git a/funasr/models/sanm/attention.py b/funasr/models/sanm/attention.py +index c7e8a8e0..9bfccb78 100644 +--- a/funasr/models/sanm/attention.py ++++ b/funasr/models/sanm/attention.py +@@ -365,76 +365,26 @@ class MultiHeadedAttentionSANMExport(nn.Module): + + def forward(self, x, mask): + mask_3d_btd, mask_4d_bhlt = mask +- q_h, k_h, v_h, v = self.forward_qkv(x) ++ q_h, k_h, v_h, v = self.forward_qkv(x) # [b, s, h, d] + fsmn_memory = self.forward_fsmn(v, mask_3d_btd) +- q_h = q_h * self.d_k ** (-0.5) +- scores = torch.matmul(q_h, k_h.transpose(-2, -1)) +- att_outs = self.forward_attention(v_h, scores, mask_4d_bhlt) +- return att_outs + fsmn_memory + +- def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: +- new_x_shape = x.size()[:-1] + (self.h, self.d_k) +- x = x.view(new_x_shape) +- return x.permute(0, 2, 1, 3) ++ scale = self.d_k ** (-0.5) + +- def forward_qkv(self, x): +- q_k_v = self.linear_q_k_v(x) +- q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) +- q_h = self.transpose_for_scores(q) +- k_h = self.transpose_for_scores(k) +- v_h = self.transpose_for_scores(v) +- return q_h, k_h, v_h, v ++ seq_len = mask_4d_bhlt.size(-1) ++ attn_mask = mask_4d_bhlt != 0 ++ attn_mask = attn_mask.expand(-1, -1, seq_len, seq_len) ++ ++ context_layer = torch.ops.aie.flash_attention(q_h, k_h, v_h, num_head=self.h, attn_mask=attn_mask, pse=None, scale=scale, layout="BSND", type="PFA") + +- def forward_fsmn(self, inputs, mask): +- # b, t, d = inputs.size() +- # mask = torch.reshape(mask, (b, -1, 1)) +- inputs = inputs * mask +- x = inputs.transpose(1, 2) +- x = self.pad_fn(x) +- x = self.fsmn_block(x) +- x = x.transpose(1, 2) +- x = x + inputs +- x = x * mask +- return x +- +- def forward_attention(self, value, scores, mask): +- scores = scores + mask +- +- self.attn = torch.softmax(scores, dim=-1) +- context_layer = torch.matmul(self.attn, value) # (batch, head, time1, d_k) +- +- context_layer = context_layer.permute(0, 2, 1, 
3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) +- return self.linear_out(context_layer) # (batch, time1, d_model) +- +- +-class MultiHeadedAttentionSANMExport(nn.Module): +- def __init__(self, model): +- super().__init__() +- self.d_k = model.d_k +- self.h = model.h +- self.linear_out = model.linear_out +- self.linear_q_k_v = model.linear_q_k_v +- self.fsmn_block = model.fsmn_block +- self.pad_fn = model.pad_fn +- +- self.attn = None +- self.all_head_size = self.h * self.d_k +- +- def forward(self, x, mask): +- mask_3d_btd, mask_4d_bhlt = mask +- q_h, k_h, v_h, v = self.forward_qkv(x) +- fsmn_memory = self.forward_fsmn(v, mask_3d_btd) +- q_h = q_h * self.d_k ** (-0.5) +- scores = torch.matmul(q_h, k_h.transpose(-2, -1)) +- att_outs = self.forward_attention(v_h, scores, mask_4d_bhlt) ++ att_outs = self.linear_out(context_layer) + return att_outs + fsmn_memory + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.h, self.d_k) + x = x.view(new_x_shape) +- return x.permute(0, 2, 1, 3) ++ return x + + def forward_qkv(self, x): + q_k_v = self.linear_q_k_v(x) +@@ -760,14 +710,26 @@ class MultiHeadedAttentionCrossAttExport(nn.Module): + self.all_head_size = self.h * self.d_k + + def forward(self, x, memory, memory_mask, ret_attn=False): +- q, k, v = self.forward_qkv(x, memory) +- scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) +- return self.forward_attention(v, scores, memory_mask, ret_attn) ++ q, k, v = self.forward_qkv(x, memory) # [b, s, h, d] ++ ++ scale = 1 / math.sqrt(self.d_k) ++ ++ seq_len = q.size(1) ++ attn_mask = memory_mask != 0 ++ attn_mask = attn_mask.expand(-1, -1, seq_len, -1) ++ ++ context_layer = torch.ops.aie.flash_attention(q, k, v, num_head=self.h, attn_mask=attn_mask, pse=None, scale=scale, layout="BSND", type="FA") ++ ++ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) ++ context_layer = context_layer.view(new_context_layer_shape) ++ att_outs = self.linear_out(context_layer) ++ ++ return att_outs + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.h, self.d_k) + x = x.view(new_x_shape) +- return x.permute(0, 2, 1, 3) ++ return x + + def forward_qkv(self, x, memory): + q = self.linear_q(x) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py index 0b82da0aec..1d0b73cf89 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_paraformer.py @@ -8,7 +8,6 @@ import torch.nn.functional as F from funasr.models.bicif_paraformer.model import BiCifParaformer, load_audio_text_image_video, \ extract_fbank, Hypothesis, ts_prediction_lfr6_standard, postprocess_utils -from funasr.models.transformer.utils.nets_utils import make_pad_mask_new COSINE_THRESHOLD = 0.999 @@ -37,9 +36,9 @@ def precision_eval(mrt_res, ref_res): com_res = False if com_res: - print("Compare success ! NPU model have the same output with CPU model !") + print("Compare success! NPU model have the same output with CPU model!") else: - print("Compare failed ! Outputs of NPU model are not the same with CPU model !") + print("Compare failed! 
Outputs of NPU model are not the same with CPU model!") class MindieBiCifParaformer(BiCifParaformer): @@ -73,7 +72,7 @@ class MindieBiCifParaformer(BiCifParaformer): kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None ) if self.beam_search is None and (is_use_lm or is_use_ctc): - self.init_beam_search(**kwargs) + self.init_beam_search(**kwargs) self.nbest = kwargs.get("nbest", 1) audio_sample_list = load_audio_text_image_video( data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000) @@ -82,7 +81,6 @@ class MindieBiCifParaformer(BiCifParaformer): speech, speech_lengths = extract_fbank( audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend ) - print("Input shape: ", speech.shape) meta_data["batch_data_time"] = ( speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 ) @@ -93,11 +91,12 @@ class MindieBiCifParaformer(BiCifParaformer): meta_data["load_data"] = time2 - time1 # Step2: run with compiled encoder - encoder_out, encoder_out_lens, hidden, alphas, pre_token_length = self.mindie_encoder(speech, speech_lengths) + encoder_out, hidden, alphas, pre_token_length = self.mindie_encoder(speech, speech_lengths) + encoder_out_lens = speech_lengths - hidden = hidden.to("cpu") - alphas = alphas.to("cpu") - pre_token_length = pre_token_length.to("cpu") + hidden = hidden.to(kwargs["mindie_device"]) + alphas = alphas.to(kwargs["mindie_device"]) + pre_token_length = pre_token_length.to(kwargs["mindie_device"]) pre_token_length = pre_token_length.round().to(torch.int32) time3 = time.perf_counter() @@ -125,7 +124,7 @@ class MindieBiCifParaformer(BiCifParaformer): cur_hidden = padded_hidden[b : b + 1, i * kwargs["cif_interval"] : (i + 1) * kwargs["cif_interval"], :] cur_alphas = padded_alphas[b : b + 1, i * kwargs["cif_interval"] : (i + 1) * kwargs["cif_interval"]] cur_frames, integrate, frame = self.mindie_cif(cur_hidden.to("npu"), cur_alphas.to("npu"), integrate, frame) - frames_list.append(cur_frames.to("cpu")) + frames_list.append(cur_frames.to(kwargs["mindie_device"])) frame = torch.cat(frames_list, 0) pad_frame = torch.zeros([max_label_len - frame.size(0), hidden_size], device=hidden.device) frames_batch.append(torch.cat([frame, pad_frame], 0)) @@ -143,7 +142,7 @@ class MindieBiCifParaformer(BiCifParaformer): # Step4: run with compiled decoder decoder_out, us_alphas = self.mindie_decoder(encoder_out, encoder_out_lens, pre_acoustic_embeds.contiguous().to("npu"), pre_token_length.contiguous().to("npu")) - us_alphas = us_alphas.to("cpu") + us_alphas = us_alphas.to(kwargs["mindie_device"]) time5 = time.perf_counter() meta_data["decoder"] = time5 - time4 @@ -163,7 +162,7 @@ class MindieBiCifParaformer(BiCifParaformer): for i in range(loop_num): cur_alphas = padded_alphas[b:b+1, i * kwargs["cif_timestamp_interval"] : (i + 1) * kwargs["cif_timestamp_interval"]] peak, integrate_alphas = self.mindie_cif_timestamp(cur_alphas.to("npu"), integrate_alphas) - peak_list.append(peak.to("cpu")) + peak_list.append(peak.to(kwargs["mindie_device"])) us_peak = torch.cat(peak_list, 1)[:, :len_alphas] peak_batch.append(us_peak) us_peaks = torch.cat(peak_batch, 0) @@ -173,10 +172,10 @@ class MindieBiCifParaformer(BiCifParaformer): # Step6: post process - decoder_out = decoder_out.to("cpu") - us_alphas = us_alphas.to("cpu") - us_peaks = us_peaks.to("cpu") - encoder_out_lens = encoder_out_lens.to("cpu") + decoder_out = decoder_out.to(kwargs["mindie_device"]) + us_alphas = us_alphas.to(kwargs["mindie_device"]) + us_peaks = 
us_peaks.to(kwargs["mindie_device"]) + encoder_out_lens = encoder_out_lens.to(kwargs["mindie_device"]) results = [] b, n, d = decoder_out.size() for i in range(b): diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py index 73b131a9fb..25b1898b64 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_punc.py @@ -10,24 +10,21 @@ from funasr.models.ct_transformer.model import CTTransformer from funasr.models.ct_transformer.utils import split_to_mini_sentence, split_words -class MindiePunc(CTTransformer): - def __init__( - self, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) +class MindiePunc(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.eval() def forward(self, text, text_lengths): - y, _ = self.punc_forward(text, text_lengths) + y, _ = self.model.punc_forward(text, text_lengths) _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1) punctuations = torch.squeeze(indices, dim=1) return punctuations @staticmethod - def export_ts(punc, path="./compiled_punc.ts"): - print("Begin trace punc!") + def export(punc, path="./compiled_punc.pt", soc_version="Ascendxxx"): + print("Begin tracing punc model.") input_shape = (1, 20) min_shape = (1, -1) @@ -38,28 +35,25 @@ class MindiePunc(CTTransformer): mindietorch.Input(min_shape = (1, ), max_shape = (1, ), dtype = torch.int32)] export_model = torch.jit.trace(punc, example_inputs=(input_speech, input_speech_lengths)) - print("Finish trace punc") + print("Finish tracing punc model.") compiled_model = mindietorch.compile( export_model, inputs = compile_inputs, precision_policy = mindietorch.PrecisionPolicy.PREF_FP16, default_buffer_size_vec = [10, ], - soc_version = "Ascend310P3", + soc_version = soc_version, ir = "ts" ) - print("mindietorch compile done !") compiled_model.save(path) + print("Finish compiling punc model, compiled model is saved in {}.".format(path)) # compiled_model = torch.jit.load(path) - print("start to check the percision of punc model.") + print("Start checking the percision of punc model.") sample_speech = torch.randint(1, 10, (1, 10), dtype=torch.int32) sample_speech_lengths = torch.tensor([10, ], dtype=torch.int32) mrt_res = compiled_model(sample_speech.to("npu"), sample_speech_lengths.to("npu")) - print("mindie infer done !") ref_res = punc(sample_speech, sample_speech_lengths) - print("torch infer done !") - precision_eval(mrt_res, ref_res) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_vad.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_vad.py new file mode 100644 index 0000000000..753548aff1 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/mindie_vad.py @@ -0,0 +1,51 @@ +import sys +import os +sys.path.append("./FunASR") + +import torch +import mindietorch + +from mindie_paraformer import precision_eval + + +class MindieVAD(torch.nn.Module): + def __init__(self, model): + super().__init__() + model.encoder.eval() + for para in model.encoder.parameters(): + para.requires_grad = False + self.model = model + + def forward(self, feat): + result = self.model.encoder(feat, {}) + return result + + @staticmethod + def export(vad, path="./compiled_vad.pt", soc_version="Ascendxxx"): + print("Begin tracing vad model.") + input_shape = (1, 5996, 400) + min_shape = (1, -1, 400) + max_shape = (1, -1, 400) + input_feat = torch.randn(input_shape, dtype=torch.float32) + compile_inputs = 
[mindietorch.Input(min_shape = min_shape, max_shape = max_shape, dtype = torch.float32)] + + export_model = torch.jit.trace(vad, input_feat) + print("Finish tracing vad model.") + + compiled_model = mindietorch.compile( + export_model, + inputs = compile_inputs, + precision_policy = mindietorch.PrecisionPolicy.PREF_FP32, + default_buffer_size_vec = [50, ], + soc_version = soc_version, + ir = "ts" + ) + compiled_model.save(path) + print("Finish compiling vad model, compiled model is saved in {}.".format(path)) + # compiled_model = torch.jit.load(path) + + print("Start checking the percision of vad model.") + sample_feat = torch.randn(input_shape, dtype=torch.float32) + mrt_res = compiled_model(sample_feat.to("npu")) + ref_res = vad(sample_feat) + precision_eval(mrt_res, ref_res) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py index 3a078e42dc..ae1f2a40af 100644 --- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test.py @@ -17,6 +17,9 @@ import os import argparse +import torch +import torch_npu + import mindietorch from mindie_auto_model import MindieAutoModel @@ -42,10 +45,12 @@ if __name__ == "__main__": help="path to save compiled punc model") parser.add_argument("--compiled_vad", default="./compiled_model/compiled_vad.ts", help="path to save compiled vad model") - parser.add_argument("--batch_size", default=16, type=int, + parser.add_argument("--paraformer_batch_size", default=16, type=int, help="batch size of paraformer model") parser.add_argument("--sample_path", default="./audio/", help="directory or path of sample audio") + parser.add_argument("--soc_version", default="Ascendxxx", type=str, + help="soc version of Ascend") args = parser.parse_args() mindietorch.set_device(0) @@ -70,16 +75,22 @@ if __name__ == "__main__": compiled_encoder=args.compiled_encoder, compiled_decoder=args.compiled_decoder, compiled_cif=args.compiled_cif, compiled_cif_timestamp=args.compiled_cif_timestamp, compiled_punc=args.compiled_punc, compiled_vad=args.compiled_vad, - paraformer_batch_size=args.batch_size, cif_interval=200, cif_timestamp_interval=500) + paraformer_batch_size=args.paraformer_batch_size, + cif_interval=200, cif_timestamp_interval=500) + + if "910" in args.soc_version: + model.kwargs["mindie_device"] = "npu" + else: + model.kwargs["mindie_device"] = "cpu" # warm up - print("Begin warming up") + print("Begin warming up.") _ = model.generate(input=audio_files[0]) - print("Finish warming up") + print("Finish warming up.") # iterate over sample_dir for wav_file in audio_files: - print("Begin evaluating {}".format(wav_file)) + print("\nBegin evaluating {}.".format(wav_file)) res, time_stats = model.generate(input=wav_file) print("Model output: {}".format(res[0]["text"])) diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/test_accuracy.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test_accuracy.py new file mode 100644 index 0000000000..4af01ede69 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test_accuracy.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+from tqdm import tqdm
+
+import torch
+import torch_npu
+import mindietorch
+
+from mindie_auto_model import MindieAutoModel
+from nltk.metrics.distance import edit_distance
+
+
+def load_txt(file_name):
+ result = {}
+
+ with open(file_name, "r") as file:
+ for line in file:
+ parts = line.strip().split(maxsplit=1)
+
+ if len(parts) == 2:
+ result[parts[0]] = parts[1]
+
+ return result
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--result_path", default="./aishell_test_result.txt", type=str,
+ help="path of the saved inference results")
+ parser.add_argument("--ref_path", default="/path/to/AISHELL-1/transcript/aishell_transcript_v0.8.txt",
+ type=str, help="path of the reference transcript file")
+ args = parser.parse_args()
+
+ infer_result = load_txt(args.result_path)
+ ref_result = load_txt(args.ref_path)
+
+ infer_list = []
+ refer_list = []
+ for key, value in infer_result.items():
+ if key in ref_result:
+ infer_list.append(value.replace(" ", ""))
+ refer_list.append(ref_result[key].replace(" ", ""))
+
+ cer_total = 0
+ step = 0
+ for infer, refer in tqdm(zip(infer_list, refer_list)):
+ infer = [i for i in infer]
+ refer = [r for r in refer]
+ cer_total += edit_distance(infer, refer) / len(refer)
+ step += 1
+
+ cer = cer_total / step
+ accuracy = 1 - cer
+ print("character-error-rate: {:.4f}, accuracy: {:.4f}".format(cer, accuracy))
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/test_performance.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test_performance.py
new file mode 100644
index 0000000000..1d3cc39f5f
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/audio/Paraformer/test_performance.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+
+import os
+import argparse
+
+import torch
+import torch_npu
+import mindietorch
+
+from mindie_auto_model import MindieAutoModel
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model", default="./model",
+ help="path of pretrained model")
+ parser.add_argument("--compiled_encoder", default="./compiled_model/compiled_encoder.pt",
+ help="path to save compiled encoder")
+ parser.add_argument("--compiled_decoder", default="./compiled_model/compiled_decoder.pt",
+ help="path to save compiled decoder")
+ parser.add_argument("--compiled_cif", default="./compiled_model/compiled_cif.pt",
+ help="path to save compiled cif function")
+ parser.add_argument("--compiled_cif_timestamp", default="./compiled_model/compiled_cif_timestamp.pt",
+ help="path to save compiled cif timestamp function")
+ parser.add_argument("--batch_size", default=64, type=int,
+ help="batch size of paraformer model")
+ parser.add_argument("--sample_path", default="/path/to/AISHELL-1", type=str,
+ help="directory or path of sample audio")
+ parser.add_argument("--result_path", default="./aishell_test_result.txt", type=str,
+ help="path to save inference results")
+ parser.add_argument("--soc_version", default="Ascend310P3", type=str,
+ help="soc version of Ascend")
+ args = parser.parse_args()
+
+ mindietorch.set_device(0)
+
+ valid_extensions = ['.wav']
+ audio_files = []
+
+ if os.path.isfile(args.sample_path):
+ if any(args.sample_path.endswith(ext) for ext in valid_extensions):
+ audio_files.append(args.sample_path)
+ elif os.path.isdir(args.sample_path):
+ for root, dirs, files in os.walk(args.sample_path):
+ for file in files:
+ if any(file.endswith(ext) for ext in valid_extensions):
+ audio_files.append(os.path.join(root, file))
+
+ # filter out wav files which are smaller than 1KB
+ audio_files = [file for file in audio_files if os.path.getsize(file) >= 1024]
+
+ if len(audio_files) == 0:
+ print("There is no valid wav file in sample_dir.")
+ else:
+ # initialize auto model
+ model = MindieAutoModel(model=args.model,
+ compiled_encoder=args.compiled_encoder, compiled_decoder=args.compiled_decoder,
+ compiled_cif=args.compiled_cif, compiled_cif_timestamp=args.compiled_cif_timestamp,
+ batch_size=args.batch_size,
+ cif_interval=200, cif_timestamp_interval=500)
+
+ if "910" in args.soc_version:
+ model.kwargs["mindie_device"] = "npu"
+ else:
+ model.kwargs["mindie_device"] = "cpu"
+
+ # warm up
+ print("Begin warming up.")
+ for i in range(3):
+ _ = model.inference_with_asr(input=audio_files[0])
+ print("Finish warming up.")
+
+ # iterate over sample_dir
+ print("Begin evaluating.")
+
+ results, time_stats = model.inference_with_asr(input=audio_files, display_pbar=True)
+ print("Average RTF: {:.3f}".format(time_stats["rtf_avg"]))
+ print("Time consumption:")
+ print(" ".join(f"{key}: {value:.3f}s" for key, value in time_stats.items() if key != "rtf_avg"))
+
+ with open(args.result_path, "w") as f:
+ for res in results:
+ f.write("{} {}\n".format(res["key"], res["text"]))
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py b/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_encoder_decoder.py
similarity index 37%
rename from MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py
rename to MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_encoder_decoder.py
index 44bb0e00f8..91a7d3af01 100644
--- a/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_decoder.py
+++ 
b/MindIE/MindIE-Torch/built-in/audio/Paraformer/trace_encoder_decoder.py @@ -16,39 +16,36 @@ import argparse import torch +from torch import library import sys sys.path.append("./FunASR") -from funasr.auto.auto_model import AutoModel, download_model, tables, deep_update, \ - load_pretrained_model, prepare_data_iterator -from funasr.models.bicif_paraformer.model import Paraformer -from funasr.models.transformer.utils.nets_utils import make_pad_mask_new +from funasr.auto.auto_model import AutoModel -class ParaformerEncoder(Paraformer): - def __init__( - self, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) - - def forward(self, speech, speech_length): - encoder_out, encoder_out_lens = self.encode(speech, speech_length) +torch.library.define("aie::flash_attention", "(Tensor query, Tensor key, Tensor value, int num_head, " + "Tensor? attn_mask=None, Tensor? pse=None, float scale=1.0, str layout='BSH', str type='PFA') -> Tensor") - encoder_out_lens = encoder_out_lens.to(torch.int32) - - encoder_out_mask = ( - ~make_pad_mask_new(encoder_out_lens)[:, None, :] - ).to(encoder_out.device) - hidden, alphas, pre_token_length = ( - self.predictor(encoder_out, None, encoder_out_mask, ignore_id=self.ignore_id) - ) +@torch.library.impl('aie::flash_attention', "cpu") +def flash_attention_wrapper(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, num_head: int, + attn_mask: torch.Tensor = None, pse: torch.Tensor = None, scale: float = 1.0, + layout: str = 'BSH', type: str = 'PFA') -> torch.Tensor: + return query - return encoder_out, encoder_out_lens, hidden, alphas, pre_token_length +class ParaformerEncoder(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.eval() + + def forward(self, speech, speech_length): + batch = {"speech": speech, "speech_lengths": speech_length} + enc, enc_len = self.model.encoder(**batch) + mask = self.model.make_pad_mask(enc_len)[:, None, :] + hidden, alphas, pre_token_length = self.model.predictor(enc, mask) + return enc, hidden, alphas, pre_token_length - def trace_model(encoder, path="./traced_encoder.ts"): + def trace_model(encoder, path="./traced_encoder.pt"): print("Begin trace encoder!") input_shape = (2, 50, 560) @@ -60,28 +57,23 @@ class ParaformerEncoder(Paraformer): print("Finish trace encoder") -class ParaformerDecoder(Paraformer): - def __init__( - self, - *args, - **kwargs, - ): - super().__init__(*args, **kwargs) +class ParaformerDecoder(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.eval() def forward(self, encoder_out, encoder_out_lens, sematic_embeds, pre_token_length): - decoder_outs = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, pre_token_length) + decoder_outs = self.model.decoder(encoder_out, encoder_out_lens, sematic_embeds, pre_token_length) decoder_out = decoder_outs[0] decoder_out = torch.log_softmax(decoder_out, dim=-1) - encoder_out_mask = ( - ~make_pad_mask_new(encoder_out_lens)[:, None, :] - ).to(encoder_out.device) + encoder_out_mask = self.model.make_pad_mask(encoder_out_lens)[:, None, :] - us_alphas = self.predictor.get_upsample_timestamp(encoder_out, encoder_out_mask, pre_token_length) + us_alphas = self.model.predictor.get_upsample_timestamp(encoder_out, encoder_out_mask, pre_token_length) return decoder_out, us_alphas - def trace_model(decoder, path="./traced_decoder.ts"): + def trace_model(decoder, path="./traced_decoder.pt"): print("Begin trace decoder!") input_shape1 = (2, 939, 512) @@ -97,79 +89,27 @@ class 
ParaformerDecoder(Paraformer): print("Finish trace decoder") -class AutoModelDecoder(AutoModel): +class AutoModelParaformer(AutoModel): def __init__(self, **kwargs): super().__init__(**kwargs) @staticmethod - def trace_decoder(**kwargs): - # load model config - assert "model" in kwargs - if "model_conf" not in kwargs: - print("download models from model hub: {}".format(kwargs.get("hub", "ms"))) - kwargs = download_model(**kwargs) - - kwargs["batch_size"] = 1 - kwargs["device"] = "cpu" - - # build tokenizer - tokenizer = kwargs.get("tokenizer", None) - if tokenizer is not None: - tokenizer_class = tables.tokenizer_classes.get(tokenizer) - tokenizer = tokenizer_class(**kwargs.get("tokenizer_conf", {})) - kwargs["token_list"] = ( - tokenizer.token_list if hasattr(tokenizer, "token_list") else None - ) - kwargs["token_list"] = ( - tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else kwargs["token_list"] - ) - vocab_size = len(kwargs["token_list"]) if kwargs["token_list"] is not None else -1 - if vocab_size == -1 and hasattr(tokenizer, "get_vocab_size"): - vocab_size = tokenizer.get_vocab_size() - else: - vocab_size = -1 - kwargs["tokenizer"] = tokenizer - - # build frontend - frontend = kwargs.get("frontend", None) - kwargs["input_size"] = None - if frontend is not None: - frontend_class = tables.frontend_classes.get(frontend) - frontend = frontend_class(**kwargs.get("frontend_conf", {})) - kwargs["input_size"] = ( - frontend.output_size() if hasattr(frontend, "output_size") else None - ) - kwargs["frontend"] = frontend + def trace(**kwargs): + model, kwargs = AutoModel.build_model(**kwargs) - model_conf = {} - deep_update(model_conf, kwargs.get("model_conf", {})) - deep_update(model_conf, kwargs) - init_param = kwargs.get("init_param", None) - - encoder = ParaformerEncoder(**model_conf, vocab_size=vocab_size) - encoder.eval() - print(f"Loading pretrained params from {init_param}") - load_pretrained_model( - model=encoder, - path=init_param, - ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), - oss_bucket=kwargs.get("oss_bucket", None), - scope_map=kwargs.get("scope_map", []), - excludes=kwargs.get("excludes", None), - ) + import copy + from funasr.models.bicif_paraformer.export_meta import export_rebuild_model + + kwargs_new = copy.deepcopy(kwargs) + kwargs_new['onnx'] = False + kwargs_new["max_seq_len"] = 512 + del kwargs_new["model"] + model = export_rebuild_model(model, **kwargs_new) + + encoder = ParaformerEncoder(model) ParaformerEncoder.trace_model(encoder, kwargs["traced_encoder"]) - decoder = ParaformerDecoder(**model_conf, vocab_size=vocab_size) - decoder.eval() - print(f"Loading pretrained params from {init_param}") - load_pretrained_model( - model=decoder, - path=init_param, - ignore_init_mismatch=kwargs.get("ignore_init_mismatch", True), - oss_bucket=kwargs.get("oss_bucket", None), - scope_map=kwargs.get("scope_map", []), - excludes=kwargs.get("excludes", None), - ) + decoder = ParaformerDecoder(model) ParaformerDecoder.trace_model(decoder, kwargs["traced_decoder"]) @@ -177,10 +117,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model", default="./model", help="path of pretrained model") - parser.add_argument("--traced_encoder", default="./compiled_model/traced_encoder.ts", + parser.add_argument("--traced_encoder", default="./compiled_model/traced_encoder.pt", help="path to save compiled decoder") - parser.add_argument("--traced_decoder", default="./compiled_model/traced_decoder.ts", + 
parser.add_argument("--traced_decoder", default="./compiled_model/traced_decoder.pt", help="path to save compiled decoder") args = parser.parse_args() - AutoModelDecoder.trace_decoder(model=args.model, traced_encoder=args.traced_encoder, traced_decoder=args.traced_decoder) \ No newline at end of file + AutoModelParaformer.trace(model=args.model, traced_encoder=args.traced_encoder, traced_decoder=args.traced_decoder) \ No newline at end of file -- Gitee