diff --git a/ACL_PyTorch/built-in/audio/Paraformer/README.md b/ACL_PyTorch/built-in/audio/Paraformer/README.md index e7c0d7b618436d03fb707c5e746ce696ce0ea2d2..1639049be71da882a4569c837f7c0d2d9551d1b0 100644 --- a/ACL_PyTorch/built-in/audio/Paraformer/README.md +++ b/ACL_PyTorch/built-in/audio/Paraformer/README.md @@ -91,7 +91,7 @@ Paraformer是阿里达摩院语音团队提出的一种高效的非自回归端 # 解决nan值导致的精度异常问题 export INF_NAN_MODE_FORCE_DISABLE=1 - python3 infer_air.py \ + python3 infer.py \ --model_path=./speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404 \ --data=speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/example/asr_example.wav \ --hotwords="魔搭" \ diff --git a/ACL_PyTorch/built-in/audio/Paraformer/adapt-torchair.patch b/ACL_PyTorch/built-in/audio/Paraformer/adapt-torchair.patch index ad509a1b34d49a2fd465e9d6b55824a6d03385f5..8ba3e835bff8f65bd7e4f46e3ad10d4512988807 100644 --- a/ACL_PyTorch/built-in/audio/Paraformer/adapt-torchair.patch +++ b/ACL_PyTorch/built-in/audio/Paraformer/adapt-torchair.patch @@ -1,199 +1,199 @@ ---- - .../models/contextual_paraformer/decoder.py | 14 ++++--- - funasr/models/contextual_paraformer/model.py | 13 ++++++- - funasr/models/paraformer/model.py | 3 +- - funasr/models/sanm/attention.py | 38 ++++++++++++++++++- - funasr/models/sanm/encoder.py | 4 +- - 5 files changed, 59 insertions(+), 13 deletions(-) - -diff --git a/funasr/models/contextual_paraformer/decoder.py b/funasr/models/contextual_paraformer/decoder.py -index ba2ce9ad..be36352f 100644 ---- a/funasr/models/contextual_paraformer/decoder.py -+++ b/funasr/models/contextual_paraformer/decoder.py -@@ -254,10 +254,11 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder): - def forward( - self, - hs_pad: torch.Tensor, -- hlens: torch.Tensor, -+ memory_mask: torch.Tensor, - ys_in_pad: torch.Tensor, -- ys_in_lens: torch.Tensor, -+ tgt_mask: torch.Tensor, - contextual_info: torch.Tensor, -+ contextual_mask: torch.Tensor, - clas_scale: float = 1.0, - return_hidden: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor]: -@@ -279,18 +280,19 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder): - olens: (batch, ) - """ - tgt = ys_in_pad -- tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None] -+ # tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None] - - memory = hs_pad -- memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :] -+ # memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :] - - x = tgt - x, tgt_mask, memory, memory_mask, _ = self.decoders(x, tgt_mask, memory, memory_mask) - _, _, x_self_attn, x_src_attn = self.last_decoder(x, tgt_mask, memory, memory_mask) - - # contextual paraformer related -- contextual_length = torch.Tensor([contextual_info.shape[1]]).int().repeat(hs_pad.shape[0]) -- contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :] -+ # contextual_length = torch.Tensor([contextual_info.shape[1]]).int().repeat(hs_pad.shape[0]) -+ # contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :] -+ contextual_mask = contextual_mask.unsqueeze(1).eq(0).expand(-1, -1, x_self_attn.size(1), -1) - cx, tgt_mask, _, _, _ = self.bias_decoder( - x_self_attn, tgt_mask, contextual_info, memory_mask=contextual_mask - ) -diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py -index fd882202..f5a16854 100644 ---- a/funasr/models/contextual_paraformer/model.py -+++ 
b/funasr/models/contextual_paraformer/model.py -@@ -18,6 +18,7 @@ from distutils.version import LooseVersion - - from funasr.register import tables - from funasr.utils import postprocess_utils -+from funasr.models.scama import utils as myutils - from funasr.metrics.compute_acc import th_accuracy - from funasr.models.paraformer.model import Paraformer - from funasr.utils.datadir_writer import DatadirWriter -@@ -328,12 +329,20 @@ class ContextualParaformer(Paraformer): - _, (h_n, _) = self.bias_encoder(hw_embed) - hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1) - -+ # build mask before model -+ tgt_mask = myutils.sequence_mask(ys_pad_lens, device=encoder_out.device)[:, :, None] -+ memory_mask = myutils.sequence_mask(encoder_out_lens, device=encoder_out.device)[:, None, :] -+ memory_mask = memory_mask.unsqueeze(1).eq(0).expand(-1, -1, sematic_embeds.size(1), -1) -+ contextual_length = torch.Tensor([hw_embed.shape[1]]).int().repeat(encoder_out.shape[0]) -+ contextual_mask = myutils.sequence_mask(contextual_length, device=encoder_out.device)[:, None, :] -+ - decoder_outs = self.decoder( - encoder_out, -- encoder_out_lens, -+ memory_mask, - sematic_embeds, -- ys_pad_lens, -+ tgt_mask, - contextual_info=hw_embed, -+ contextual_mask=contextual_mask, - clas_scale=clas_scale, - ) - -diff --git a/funasr/models/paraformer/model.py b/funasr/models/paraformer/model.py -index 85967af3..935c6e77 100644 ---- a/funasr/models/paraformer/model.py -+++ b/funasr/models/paraformer/model.py -@@ -259,7 +259,8 @@ class Paraformer(torch.nn.Module): - speech, speech_lengths = self.normalize(speech, speech_lengths) - - # Forward encoder -- encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths) -+ mask = (~make_pad_mask(speech_lengths)[:, None :]).to(speech.device) -+ encoder_out, encoder_out_lens, _ = self.encoder(speech, mask) - if isinstance(encoder_out, tuple): - encoder_out = encoder_out[0] - -diff --git a/funasr/models/sanm/attention.py b/funasr/models/sanm/attention.py -index 47d60cb6..34ae46b5 100644 ---- a/funasr/models/sanm/attention.py -+++ b/funasr/models/sanm/attention.py -@@ -10,6 +10,7 @@ import math - - import numpy - import torch -+import torch_npu - from torch import nn - from typing import Optional, Tuple - -@@ -289,7 +290,7 @@ class MultiHeadedAttentionSANM(nn.Module): - - return self.linear_out(x) # (batch, time1, d_model) - -- def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): -+ def forward_ori(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): - """Compute scaled dot product attention. - - Args: -@@ -310,6 +311,23 @@ class MultiHeadedAttentionSANM(nn.Module): - att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) - return att_outs + fsmn_memory - -+ def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): -+ q_k_v = self.linear_q_k_v(x).half() -+ q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) -+ fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk) -+ attn_out = torch_npu.npu_prompt_flash_attention( -+ q, -+ k, -+ v, -+ scale_value=self.d_k ** (-0.5), -+ atten_mask=mask.unsqueeze(1).eq(0).repeat(1, 1, q.size(1), 1), -+ input_layout='BSH', -+ num_heads=self.h, -+ sparse_mode=1, -+ ).to(q.dtype) -+ att_outs = self.linear_out(attn_out) -+ return att_outs + fsmn_memory -+ - def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): - """Compute scaled dot product attention. 
- -@@ -697,7 +715,7 @@ class MultiHeadedAttentionCrossAtt(nn.Module): - return self.linear_out(x), attn # (batch, time1, d_model) - return self.linear_out(x) # (batch, time1, d_model) - -- def forward(self, x, memory, memory_mask, ret_attn=False): -+ def forward_ori(self, x, memory, memory_mask, ret_attn=False): - """Compute scaled dot product attention. - - Args: -@@ -716,6 +734,22 @@ class MultiHeadedAttentionCrossAtt(nn.Module): - scores = torch.matmul(q_h, k_h.transpose(-2, -1)) - return self.forward_attention(v_h, scores, memory_mask, ret_attn=ret_attn) - -+ def forward(self, x, memory, memory_mask, ret_attn=False): -+ q = self.linear_q(x) -+ k_v = self.linear_k_v(memory) -+ k, v = torch.split(k_v, int(self.h * self.d_k), dim=-1) -+ attn_out = torch_npu.npu_prompt_flash_attention( -+ q.to(torch.float16), -+ k.to(torch.float16), -+ v.to(torch.float16), -+ scale_value=self.d_k ** (-0.5), -+ atten_mask=memory_mask, -+ input_layout='BSH', -+ num_heads=self.h, -+ sparse_mode=1, -+ ).to(q.dtype) -+ return self.linear_out(attn_out) -+ - def forward_chunk(self, x, memory, cache=None, chunk_size=None, look_back=0): - """Compute scaled dot product attention. - -diff --git a/funasr/models/sanm/encoder.py b/funasr/models/sanm/encoder.py -index 0d39ca74..b062e9d8 100644 ---- a/funasr/models/sanm/encoder.py -+++ b/funasr/models/sanm/encoder.py -@@ -361,7 +361,7 @@ class SANMEncoder(nn.Module): - def forward( - self, - xs_pad: torch.Tensor, -- ilens: torch.Tensor, -+ masks: torch.Tensor, - prev_states: torch.Tensor = None, - ctc: CTC = None, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: -@@ -374,7 +374,7 @@ class SANMEncoder(nn.Module): - Returns: - position embedded tensor and mask - """ -- masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) -+ # masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) - xs_pad = xs_pad * self.output_size() ** 0.5 - if self.embed is None: - xs_pad = xs_pad +--- + .../models/contextual_paraformer/decoder.py | 14 ++++--- + funasr/models/contextual_paraformer/model.py | 13 ++++++- + funasr/models/paraformer/model.py | 3 +- + funasr/models/sanm/attention.py | 38 ++++++++++++++++++- + funasr/models/sanm/encoder.py | 4 +- + 5 files changed, 59 insertions(+), 13 deletions(-) + +diff --git a/funasr/models/contextual_paraformer/decoder.py b/funasr/models/contextual_paraformer/decoder.py +index ba2ce9ad..be36352f 100644 +--- a/funasr/models/contextual_paraformer/decoder.py ++++ b/funasr/models/contextual_paraformer/decoder.py +@@ -254,10 +254,11 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder): + def forward( + self, + hs_pad: torch.Tensor, +- hlens: torch.Tensor, ++ memory_mask: torch.Tensor, + ys_in_pad: torch.Tensor, +- ys_in_lens: torch.Tensor, ++ tgt_mask: torch.Tensor, + contextual_info: torch.Tensor, ++ contextual_mask: torch.Tensor, + clas_scale: float = 1.0, + return_hidden: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: +@@ -279,18 +280,19 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder): + olens: (batch, ) + """ + tgt = ys_in_pad +- tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None] ++ # tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None] + + memory = hs_pad +- memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :] ++ # memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :] + + x = tgt + x, tgt_mask, memory, memory_mask, _ = self.decoders(x, tgt_mask, memory, memory_mask) + _, _, x_self_attn, x_src_attn = 
self.last_decoder(x, tgt_mask, memory, memory_mask) + + # contextual paraformer related +- contextual_length = torch.Tensor([contextual_info.shape[1]]).int().repeat(hs_pad.shape[0]) +- contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :] ++ # contextual_length = torch.Tensor([contextual_info.shape[1]]).int().repeat(hs_pad.shape[0]) ++ # contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :] ++ contextual_mask = contextual_mask.unsqueeze(1).eq(0).expand(-1, -1, x_self_attn.size(1), -1) + cx, tgt_mask, _, _, _ = self.bias_decoder( + x_self_attn, tgt_mask, contextual_info, memory_mask=contextual_mask + ) +diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py +index fd882202..f5a16854 100644 +--- a/funasr/models/contextual_paraformer/model.py ++++ b/funasr/models/contextual_paraformer/model.py +@@ -18,6 +18,7 @@ from distutils.version import LooseVersion + + from funasr.register import tables + from funasr.utils import postprocess_utils ++from funasr.models.scama import utils as myutils + from funasr.metrics.compute_acc import th_accuracy + from funasr.models.paraformer.model import Paraformer + from funasr.utils.datadir_writer import DatadirWriter +@@ -328,12 +329,20 @@ class ContextualParaformer(Paraformer): + _, (h_n, _) = self.bias_encoder(hw_embed) + hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1) + ++ # build mask before model ++ tgt_mask = myutils.sequence_mask(ys_pad_lens, device=encoder_out.device)[:, :, None] ++ memory_mask = myutils.sequence_mask(encoder_out_lens, device=encoder_out.device)[:, None, :] ++ memory_mask = memory_mask.unsqueeze(1).eq(0).expand(-1, -1, sematic_embeds.size(1), -1) ++ contextual_length = torch.Tensor([hw_embed.shape[1]]).int().repeat(encoder_out.shape[0]) ++ contextual_mask = myutils.sequence_mask(contextual_length, device=encoder_out.device)[:, None, :] ++ + decoder_outs = self.decoder( + encoder_out, +- encoder_out_lens, ++ memory_mask, + sematic_embeds, +- ys_pad_lens, ++ tgt_mask, + contextual_info=hw_embed, ++ contextual_mask=contextual_mask, + clas_scale=clas_scale, + ) + +diff --git a/funasr/models/paraformer/model.py b/funasr/models/paraformer/model.py +index 85967af3..935c6e77 100644 +--- a/funasr/models/paraformer/model.py ++++ b/funasr/models/paraformer/model.py +@@ -259,7 +259,8 @@ class Paraformer(torch.nn.Module): + speech, speech_lengths = self.normalize(speech, speech_lengths) + + # Forward encoder +- encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths) ++ mask = (~make_pad_mask(speech_lengths)[:, None :]).to(speech.device) ++ encoder_out, encoder_out_lens, _ = self.encoder(speech, mask) + if isinstance(encoder_out, tuple): + encoder_out = encoder_out[0] + +diff --git a/funasr/models/sanm/attention.py b/funasr/models/sanm/attention.py +index 47d60cb6..34ae46b5 100644 +--- a/funasr/models/sanm/attention.py ++++ b/funasr/models/sanm/attention.py +@@ -10,6 +10,7 @@ import math + + import numpy + import torch ++import torch_npu + from torch import nn + from typing import Optional, Tuple + +@@ -289,7 +290,7 @@ class MultiHeadedAttentionSANM(nn.Module): + + return self.linear_out(x) # (batch, time1, d_model) + +- def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): ++ def forward_ori(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): + """Compute scaled dot product attention. 
+ + Args: +@@ -310,6 +311,23 @@ class MultiHeadedAttentionSANM(nn.Module): + att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder) + return att_outs + fsmn_memory + ++ def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None): ++ q_k_v = self.linear_q_k_v(x).half() ++ q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1) ++ fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk) ++ attn_out = torch_npu.npu_prompt_flash_attention( ++ q, ++ k, ++ v, ++ scale_value=self.d_k ** (-0.5), ++ atten_mask=mask.unsqueeze(1).eq(0).repeat(1, 1, q.size(1), 1), ++ input_layout='BSH', ++ num_heads=self.h, ++ sparse_mode=1, ++ ).to(q.dtype) ++ att_outs = self.linear_out(attn_out) ++ return att_outs + fsmn_memory ++ + def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0): + """Compute scaled dot product attention. + +@@ -697,7 +715,7 @@ class MultiHeadedAttentionCrossAtt(nn.Module): + return self.linear_out(x), attn # (batch, time1, d_model) + return self.linear_out(x) # (batch, time1, d_model) + +- def forward(self, x, memory, memory_mask, ret_attn=False): ++ def forward_ori(self, x, memory, memory_mask, ret_attn=False): + """Compute scaled dot product attention. + + Args: +@@ -716,6 +734,22 @@ class MultiHeadedAttentionCrossAtt(nn.Module): + scores = torch.matmul(q_h, k_h.transpose(-2, -1)) + return self.forward_attention(v_h, scores, memory_mask, ret_attn=ret_attn) + ++ def forward(self, x, memory, memory_mask, ret_attn=False): ++ q = self.linear_q(x) ++ k_v = self.linear_k_v(memory) ++ k, v = torch.split(k_v, int(self.h * self.d_k), dim=-1) ++ attn_out = torch_npu.npu_prompt_flash_attention( ++ q.to(torch.float16), ++ k.to(torch.float16), ++ v.to(torch.float16), ++ scale_value=self.d_k ** (-0.5), ++ atten_mask=memory_mask, ++ input_layout='BSH', ++ num_heads=self.h, ++ sparse_mode=1, ++ ).to(q.dtype) ++ return self.linear_out(attn_out) ++ + def forward_chunk(self, x, memory, cache=None, chunk_size=None, look_back=0): + """Compute scaled dot product attention. + +diff --git a/funasr/models/sanm/encoder.py b/funasr/models/sanm/encoder.py +index 0d39ca74..b062e9d8 100644 +--- a/funasr/models/sanm/encoder.py ++++ b/funasr/models/sanm/encoder.py +@@ -361,7 +361,7 @@ class SANMEncoder(nn.Module): + def forward( + self, + xs_pad: torch.Tensor, +- ilens: torch.Tensor, ++ masks: torch.Tensor, + prev_states: torch.Tensor = None, + ctc: CTC = None, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: +@@ -374,7 +374,7 @@ class SANMEncoder(nn.Module): + Returns: + position embedded tensor and mask + """ +- masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) ++ # masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device) + xs_pad = xs_pad * self.output_size() ** 0.5 + if self.embed is None: + xs_pad = xs_pad -- \ No newline at end of file diff --git a/ACL_PyTorch/built-in/audio/Paraformer/requirements.txt b/ACL_PyTorch/built-in/audio/Paraformer/requirements.txt index 930d38da6b64a0ff217573838e3f5d98e2393ac3..df2267459c3fcb6376c5ab6f44503efd1e1e84a5 100644 --- a/ACL_PyTorch/built-in/audio/Paraformer/requirements.txt +++ b/ACL_PyTorch/built-in/audio/Paraformer/requirements.txt @@ -1,4 +1,4 @@ torch==2.1.0 -torch-npu==2.1.0.post11 +torch-npu==2.1.0 numpy==1.26 torchaudio \ No newline at end of file
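
The patch above centers on two changes: sequence masks are precomputed once in model.py and passed down through the encoder/decoder signatures, and the eager scaled-dot-product attention in funasr/models/sanm/attention.py is replaced by the fused torch_npu.npu_prompt_flash_attention kernel. The sketch below is a minimal, hypothetical illustration of that call pattern, not part of the patch itself. It assumes an Ascend NPU environment with torch and torch_npu installed; the sizes (B, T, H, D), the lengths tensor, and the random q/k/v inputs are placeholders, and only arguments that appear in the patch (scale_value, atten_mask, input_layout='BSH', num_heads, sparse_mode=1) are used.

# Minimal sketch of the fused-attention call used in the patch.
# Assumption: running on an Ascend NPU with torch_npu available.
import torch
import torch_npu  # registers the "npu" device and the NPU fused operators

B, T, H, D = 1, 200, 4, 128                      # illustrative batch / seq len / heads / head dim
q = torch.randn(B, T, H * D, dtype=torch.float16).npu()
k = torch.randn(B, T, H * D, dtype=torch.float16).npu()
v = torch.randn(B, T, H * D, dtype=torch.float16).npu()

# Boolean padding mask built ahead of time, mirroring the precomputed masks in
# model.py: True marks padded key positions to be excluded from attention.
lengths = torch.tensor([180])                                     # valid frames per sample (placeholder)
pad_mask = torch.arange(T)[None, :] >= lengths[:, None]           # (B, T)
atten_mask = pad_mask[:, None, None, :].expand(B, 1, T, T).npu()  # (B, 1, T, T), like mask.unsqueeze(1).eq(0).repeat(...)

out = torch_npu.npu_prompt_flash_attention(
    q, k, v,
    scale_value=D ** -0.5,        # same 1/sqrt(d_k) scaling as the eager attention path
    atten_mask=atten_mask,
    input_layout='BSH',           # inputs packed as (batch, seq, heads * head_dim)
    num_heads=H,
    sparse_mode=1,
)
print(out.shape)                  # (B, T, H * D), same layout as the inputs

Moving the mask construction out of the encoder/decoder forward passes and into model.py, as the patch does, appears intended to keep that Python-side tensor creation outside the torchair-compiled graph; the fused attention call then consumes the masks as ordinary tensor inputs.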