diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
index 708b2243700472741188235213ccc52ecf4fe79b..91c8165508b45c54b6a6a030422ac3c3af64f9c4 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/README.md
@@ -126,13 +126,11 @@
 cd ${wenet_path}
 git checkout .
 patch -p1 < get_no_flash_encoder_out.diff
 cd ${wenet_path}/examples/aishell/s0/
-bash run_no_flash_encoder_out.sh
-mv encoder_data_noflash encoder_data
+bash run_no_flash_encoder_out.sh --bin_path encoder_data_noflash --model_path ./no_flash_encoder_revise.om --json_path encoder_noflash.json --perf_json t1.json
 ```
 
 - Get the decoder results for the non-streaming scenario:
 
-  Note: modify the json_path in wenet/bin/recognize_attension_rescoring.py; for the measured performance to be reported, json_path must contain the keyword "no"
 
 ```
@@ -140,7 +138,7 @@
 cd ${wenet_path}
 git checkout .
 patch -p1 < getwer.diff
 cd ${wenet_path}/examples/aishell/s0/
-bash run_attention_rescoring.sh
+bash run_attention_rescoring.sh --model_path ./decoder_final.om --perf_json t2.json --json_path encoder_noflash.json --bin_path encoder_data_noflash
 ```
 - Check the overall accuracy
@@ -171,7 +169,7 @@
 cd ${wenet_path}
 git checkout .
 patch -p1 < get_flash_encoder_out.diff
 cd ${wenet_path}/examples/aishell/s0/
-bash run_encoder_out.sh
+bash run_encoder_out.sh --bin_path encoder_data --model_path encoder_revise.om --json_path encoder.json
 ```
 - Get the decoder results for the streaming scenario:
@@ -184,7 +182,7 @@
 cd ${wenet_path}
 git checkout .
 patch -p1 < getwer.diff
 cd ${wenet_path}/examples/aishell/s0/
-bash run_attention_rescoring.sh
+bash run_attention_rescoring.sh --model_path ./decoder_final.om --json_path encoder.json --bin_path ./encoder_data
 ```
 - Check the overall accuracy
@@ -216,14 +214,13 @@
 cp ${code_path}/decoder_fendang.om ${wenet_path}/
 
 - Accuracy test:
 
-  Set the log level with export ASCEND_GLOBAL_LOG_LEVEL=3; in acc.diff, point the files loaded by self.encoder_ascend and self.decoder_ascend at the statically converted encoder and decoder models; in run.sh set average_checkpoint to false, change decode_modes to attention_rescoring and set stage=5; in the decode stage change python to python3.7.5 on lines 185 and 198, and change recognize.py to static.py on line 185
 ```
 cd ${wenet_path}/
 git checkout .
 patch -p1 < acc.diff
 cd ${wenet_path}/examples/aishell/s0/
-bash run.sh --stage 5 --stop_stage 5
+bash run_static.sh
 ```
 
 - Check the overall accuracy
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
index 690b4f5bdeff7dda54c5de1dab23f10345316849..ff17af1345adafa662a3b0b31f519723ebb5d1a5 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_flash.py
@@ -54,7 +54,7 @@ import os
 def dic2json(input_dict, json_path):
     json_str = json.dumps(input_dict)
-    with open(json_path, 'a') as json_file:
+    with open(json_path, 'w+') as json_file:
         json_file.write(json_str)
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='recognize with your model')
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
index 945465ca0f674f6fba9ff5791907bce2a7fa1037..88946937c40343a9240ab2b5075da58b215a131d 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/process_encoder_data_noflash.py
@@ -53,7 +53,7 @@ import acl
 from wenet.transformer.acl_net import Net
 def dic2json(input_dict, json_path):
     json_str = json.dumps(input_dict)
-    with open(json_path, 'a') as json_file:
+    with open(json_path, 'w+') as json_file:
         json_file.write(json_str)
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='recognize with your model')
@@ -77,6 +77,7 @@ if __name__ == '__main__':
     parser.add_argument('--bin_path', type=str, default="./encoder_data_noflash", help='encoder bin images dir')
     parser.add_argument('--model_path', type=str, default="no_flash_encoder_revise.om", help='encoder bin images dir')
     parser.add_argument('--json_path', type=str, default="encoder_noflash_all.json", help='encoder bin images dir')
+    parser.add_argument('--perf_json', type=str, default="t1.json", help='no flash encoder time')
     parser.add_argument('--batch_size',
                         type=int,
                         default=16,
@@ -209,6 +210,6 @@ if __name__ == '__main__':
     ave_t = total_t / (batch_idx + 1)
     dic_perf = {}
     dic_perf["t1"] = ave_t
-    dic2json(dic_perf, "t1.json")
+    dic2json(dic_perf, args.perf_json)
     dic2json(encoder_dic, args.json_path)
     del encoder_model_noflash
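The `dic2json` helper recurs in all three Python entry points (including `recognize_attenstion_rescoring.py` below); opening with `'w+'` instead of `'a'` means a rerun overwrites the result file rather than appending a second JSON object to it, which would make the file unparseable. A minimal sketch of the failure mode, using a throwaway `demo.json` path:

```python
import json

# Old 'a' mode: two runs leave two concatenated JSON objects behind,
# and json.loads() rejects the file with "Extra data".
with open('demo.json', 'a') as f:
    f.write(json.dumps({"t1": 1.0}))
with open('demo.json', 'a') as f:
    f.write(json.dumps({"t1": 2.0}))
try:
    json.loads(open('demo.json').read())
except json.JSONDecodeError as e:
    print('append mode corrupts the file:', e)

# New 'w+' mode: the file is truncated on open, so each run leaves
# exactly one valid JSON document.
with open('demo.json', 'w+') as f:
    f.write(json.dumps({"t1": 2.0}))
print(json.loads(open('demo.json').read()))  # {'t1': 2.0}
```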
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
index 1c8850d6e7cda424f39c58a1bff697269444af28..6bc3c672608154579a4be1326f1c5428c677bc8b 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/recognize_attenstion_rescoring.py
@@ -55,7 +55,7 @@ import os
 
 def dic2json(input_dict, json_path):
     json_str = json.dumps(input_dict)
-    with open(json_path, 'a') as json_file:
+    with open(json_path, 'w+') as json_file:
         json_file.write(json_str)
 
 if __name__ == '__main__':
@@ -75,6 +75,7 @@ if __name__ == '__main__':
     parser.add_argument('--bin_path', type=str, default="./encoder_data", help='encoder bin images dir')
     parser.add_argument('--model_path', type=str, default="decoder_final.om", help='encoder bin images dir')
     parser.add_argument('--json_path', type=str, default="encoder.json", help='encoder bin images dir')
+    parser.add_argument('--perf_json', type=str, default="t2.json", help='decoder time')
     parser.add_argument('--penalty',
                         type=float,
                         default=0.0,
@@ -201,6 +202,5 @@ if __name__ == '__main__':
     ave_t = total_t / (batch_idx + 1)
     dic_perf = {}
     dic_perf["t2"] = ave_t
-    if "no" in args.bin_path:
-        dic2json(dic_perf, "t2.json")
+    dic2json(dic_perf, args.perf_json)
     del decoder_model
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_attention_rescoring.sh b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_attention_rescoring.sh
index a97faa7335f16b1fbf4e6e38d8d5be90069a5af3..97743363e9b537bd11ffeed978bc487a336725b8 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_attention_rescoring.sh
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_attention_rescoring.sh
@@ -48,7 +48,10 @@ average_checkpoint=false
 decode_checkpoint=$dir/final.pt
 average_num=30
 decode_modes="attention_rescoring"
-
+model_path=./decoder_final.om
+perf_json=./t2.json
+json_path=./encoder.json
+bin_path=./encoder_data
 . tools/parse_options.sh || exit 1;
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -79,6 +82,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
       --batch_size 1 \
       --penalty 0.0 \
       --dict $dict \
+      --model_path ${model_path} \
+      --json_path ${json_path} \
+      --bin_path ${bin_path} \
+      --perf_json ${perf_json} \
       --ctc_weight $ctc_weight \
       --reverse_weight $reverse_weight \
       --result_file $test_dir/text \
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_encoder_out.sh b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_encoder_out.sh
index 4fbf960942d169039c852c31178b3b104f9b5145..1331892e2226158c9bb6664f1a8943146c6a3712 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_encoder_out.sh
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_encoder_out.sh
@@ -48,6 +48,9 @@ average_checkpoint=false
 decode_checkpoint=$dir/final.pt
 average_num=30
 decode_modes="attention_rescoring"
+bin_path=./encoder_data
+model_path=./encoder_revise.om
+json_path=./encoder.json
 
 . tools/parse_options.sh || exit 1;
 
@@ -80,6 +83,9 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
       --penalty 0.0 \
       --dict $dict \
       --ctc_weight $ctc_weight \
+      --bin_path ${bin_path} \
+      --model_path ${model_path} \
+      --json_path ${json_path} \
       --reverse_weight $reverse_weight \
       --result_file $test_dir/text \
       --simulate_streaming \
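With the `if "no" in args.bin_path` guard removed, both the encoder and decoder scripts always write their average per-batch time to an explicit `--perf_json` path: `{"t1": ...}` for the encoder run and `{"t2": ...}` for the decoder run. A hypothetical reporting helper, not part of the patch, that reads the two files back after both runs:

```python
import json

# Hypothetical helper (not in the patch): the scripts above write
# {"t1": <avg per batch>} to t1.json and {"t2": <avg per batch>} to t2.json.
def load_avg(path, key):
    with open(path) as f:
        return json.load(f)[key]

t1 = load_avg('t1.json', 't1')  # no-flash encoder, average per batch
t2 = load_avg('t2.json', 't2')  # decoder rescoring, average per batch
print(f'encoder: {t1:.4f}  decoder: {t2:.4f}  combined: {t1 + t2:.4f} (per batch)')
```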
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_no_flash_encoder_out.sh b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_no_flash_encoder_out.sh
index 0ff98b11f617008dc4396af29e4f1c43ecf0bedb..dfed9dbf2ff089e7a7bab8eab1be5cd0fde9d337 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_no_flash_encoder_out.sh
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_no_flash_encoder_out.sh
@@ -48,7 +48,10 @@ average_checkpoint=false
 decode_checkpoint=$dir/final.pt
 average_num=30
 decode_modes="attention_rescoring"
-
+bin_path=./encoder_data_noflash
+model_path=./no_flash_encoder_revise.om
+json_path=./encoder_noflash.json
+perf_json=./t1.json
 . tools/parse_options.sh || exit 1;
 
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
@@ -78,6 +81,10 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
       --batch_size 1 \
       --penalty 0.0 \
       --dict $dict \
+      --bin_path ${bin_path} \
+      --model_path ${model_path} \
+      --json_path ${json_path} \
+      --perf_json ${perf_json} \
       --ctc_weight $ctc_weight \
       --reverse_weight $reverse_weight \
       --result_file $test_dir/text \
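Each `run_*.sh` declares its new defaults (`bin_path`, `model_path`, `json_path`, `perf_json`) before sourcing `tools/parse_options.sh`, the Kaldi-style parser that lets a `--name value` command-line flag override the like-named shell variable; that is what makes the README invocations such as `bash run_no_flash_encoder_out.sh --bin_path encoder_data_noflash ...` work without editing the scripts. A rough Python analogue of the override mechanism, for illustration only:

```python
import sys

# Illustrative analogue of Kaldi's tools/parse_options.sh:
# '--name value' overrides a default declared before parsing;
# unknown option names are rejected.
def parse_options(argv, defaults):
    opts = dict(defaults)
    args = iter(argv)
    for arg in args:
        if not arg.startswith('--'):
            continue
        name = arg[2:].replace('-', '_')
        if name not in opts:
            sys.exit(f'unknown option --{name}')
        value = next(args, None)
        if value is None:
            sys.exit(f'missing value for --{name}')
        opts[name] = value
    return opts

defaults = {'bin_path': './encoder_data_noflash',
            'model_path': './no_flash_encoder_revise.om',
            'json_path': './encoder_noflash.json',
            'perf_json': './t1.json'}
print(parse_options(sys.argv[1:], defaults))
```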
diff --git a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_static.sh b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_static.sh
index 5a67f453f817cc2a4d8ce183575f4ae5ccbf6220..daf604683ddfa1885cd2d314fc9e20b986ee2812 100644
--- a/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_static.sh
+++ b/ACL_PyTorch/built-in/audio/Wenet_for_Pytorch/run_static.sh
@@ -51,12 +51,123 @@ decode_modes="attention_rescoring"
 
 . tools/parse_options.sh || exit 1;
 
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    local/download_and_untar.sh ${data} ${data_url} data_aishell
+    local/download_and_untar.sh ${data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # Data preparation
+    local/aishell_data_prep.sh ${data}/data_aishell/wav ${data}/data_aishell/transcript
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # remove the space between the text labels for Mandarin dataset
+    for x in train dev test; do
+        cp data/${x}/text data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
+            > data/${x}/text
+        rm data/${x}/text.org
+    done
+    # For wav feature, just copy the data. Fbank extraction is done in training
+    mkdir -p $feat_dir
+    for x in ${train_set} dev test; do
+        cp -r data/$x $feat_dir
+    done
+
+    tools/compute_cmvn_stats.py --num_workers 16 --train_config $train_config \
+        --in_scp data/${train_set}/wav.scp \
+        --out_cmvn $feat_dir/$train_set/global_cmvn
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # Make train dict
+    echo "Make a dictionary"
+    mkdir -p $(dirname $dict)
+    echo "<blank> 0" > ${dict}  # 0 will be used for "blank" in CTC
+    echo "<unk> 1" >> ${dict}   # <unk> must be 1
+    tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
+    num_token=$(cat $dict | wc -l)
+    echo "<sos/eos> $num_token" >> $dict  # <eos>
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    nj=32
+    # Prepare wenet required data
+    echo "Prepare data, prepare required format"
+    for x in dev test ${train_set}; do
+        tools/format_data.sh --nj ${nj} \
+            --feat-type wav --feat $feat_dir/$x/wav.scp \
+            $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp
+
+        tools/remove_longshortdata.py \
+            --min_input_len 0.5 \
+            --max_input_len 20 \
+            --max_output_len 400 \
+            --max_output_input_ratio 10.0 \
+            --data_file $feat_dir/$x/format.data.tmp \
+            --output_data_file $feat_dir/$x/format.data
+    done
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # Training
+    mkdir -p $dir
+    INIT_FILE=$dir/ddp_init
+    # You had better rm it manually before you start run.sh on first node.
+    # rm -f $INIT_FILE # delete old one before starting
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    # The number of gpus running on each node/machine
+    num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+    # Use "nccl" if it works, otherwise use "gloo"
+    dist_backend="nccl"
+    # The total number of processes/gpus, so that the master knows
+    # how many workers to wait for.
+    # More details about ddp can be found in
+    # https://pytorch.org/tutorials/intermediate/dist_tuto.html
+    world_size=`expr $num_gpus \* $num_nodes`
+    echo "total gpus is: $world_size"
+    cmvn_opts=
+    $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir
+    $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
+    # train.py will write $train_config to $dir/train.yaml with model input
+    # and output dimension; train.yaml will be used for inference or model
+    # export later
+    for ((i = 0; i < $num_gpus; ++i)); do
+    {
+        gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+        # Rank of each gpu/process, used for knowing whether it is
+        # the master or a worker.
+        rank=`expr $node_rank \* $num_gpus + $i`
+        python wenet/bin/train.py --gpu $gpu_id \
+            --config $train_config \
+            --train_data $feat_dir/$train_set/format.data \
+            --cv_data $feat_dir/dev/format.data \
+            ${checkpoint:+--checkpoint $checkpoint} \
+            --model_dir $dir \
+            --ddp.init_method $init_method \
+            --ddp.world_size $world_size \
+            --ddp.rank $rank \
+            --ddp.dist_backend $dist_backend \
+            --num_workers 2 \
+            $cmvn_opts \
+            --pin_memory
+    } &
+    done
+    wait
+fi
+
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # Test model, please specify the model you want to test by --checkpoint
     if [ ${average_checkpoint} == true ]; then
         decode_checkpoint=$dir/avg_${average_num}.pt
         echo "do model average and final checkpoint is $decode_checkpoint"
-        python3 wenet/bin/average_model.py \
+        python wenet/bin/average_model.py \
             --dst_model $decode_checkpoint \
             --src_path $dir \
             --num ${average_num} \
@@ -65,13 +176,14 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
     # -1 for full chunk
     decoding_chunk_size=
-    ctc_weight=0.3
-    reverse_weight=0.3
+    ctc_weight=0.5
+    reverse_weight=0.0
     for mode in ${decode_modes}; do
     {
         test_dir=$dir/test_${mode}
         mkdir -p $test_dir
-        python3.7 wenet/bin/static.py --gpu -1 \
+        python3.7.5 wenet/bin/static.py --gpu 0 \
+            --mode $mode \
             --config $dir/train.yaml \
             --test_data $feat_dir/test/format.data \
             --checkpoint $decode_checkpoint \
@@ -82,10 +194,8 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
             --ctc_weight $ctc_weight \
             --reverse_weight $reverse_weight \
             --result_file $test_dir/text \
-            --simulate_streaming \
-            --decoding_chunk_size 1
             ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
-        python3.7 tools/compute-wer.py --char=1 --v=1 \
+        python3.7.5 tools/compute-wer.py --char=1 --v=1 \
             $feat_dir/test/text $test_dir/text > $test_dir/wer
     } &
     done
@@ -93,3 +203,60 @@
 fi
 
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # Export the best model you want
+    python wenet/bin/export_jit.py \
+        --config $dir/train.yaml \
+        --checkpoint $dir/avg_${average_num}.pt \
+        --output_file $dir/final.zip \
+        --output_quant_file $dir/final_quant.zip
+fi
+
+# Optionally, you can add LM and test it with runtime.
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    # 7.1 Prepare dict
+    unit_file=$dict
+    mkdir -p data/local/dict
+    cp $unit_file data/local/dict/units.txt
+    tools/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \
+        data/local/dict/lexicon.txt
+    # 7.2 Train lm
+    lm=data/local/lm
+    mkdir -p $lm
+    tools/filter_scp.pl data/train/text \
+        $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text
+    local/aishell_train_lms.sh
+    # 7.3 Build decoding TLG
+    tools/fst/compile_lexicon_token_fst.sh \
+        data/local/dict data/local/tmp data/local/lang
+    tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
+    # 7.4 Decoding with runtime
+    # reverse_weight only works for u2++ models; only the left-to-right decoder is used when it is set to 0.0.
+    reverse_weight=0.0
+    chunk_size=-1
+    ./tools/decode.sh --nj 16 \
+        --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
+        --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
+        --reverse_weight $reverse_weight --chunk_size $chunk_size \
+        --fst_path data/lang_test/TLG.fst \
+        data/test/wav.scp data/test/text $dir/final.zip \
+        data/lang_test/words.txt $dir/lm_with_runtime
+    # See $dir/lm_with_runtime for wer
+fi
+
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    # Test model, please specify the model you want to use by --checkpoint
+    # alignment input
+    ali_format=$feat_dir/test/format.data
+    # alignment output
+    ali_result=$dir/ali
+    python wenet/bin/alignment.py --gpu -1 \
+        --config $dir/train.yaml \
+        --input_file $ali_format \
+        --checkpoint $checkpoint \
+        --batch_size 1 \
+        --dict $dict \
+        --result_file $ali_result \
+        --gen_praat
+fi