diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/modules/multihead_attention.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/modules/multihead_attention.py
index 9c2484f701f83ae038ff20d6c52c4b7a7de1eef4..ac8db9f8d029fb0872ceec5ea0abde5e3ee1009b 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/modules/multihead_attention.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/modules/multihead_attention.py
@@ -294,7 +294,7 @@ class MultiheadAttention(nn.Module):
         attn_weights = F.softmax(attn_weights, dim=-1)
 
         if self.training:
-            attn_weights, _, _ = torch.npu_dropoutV2(attn_weights, self.seed, p=self.dropout)
+            attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)
 
         attn = strided_bmm2(attn_weights, v)
         assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
index 48e7fb3af7decbc497fd4f94d3464f1fc9046a98..6761832491b7719b9b6b7f7c33cef6143a88ff21 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_1p.sh
@@ -62,8 +62,11 @@ export ASCEND_SLOG_PRINT_TO_STDOUT=0
 export ASCEND_GLOBAL_LOG_LEVEL_ETP=3
 export PTCOPY_ENABLE=1
 export TASK_QUEUE_ENABLE=1
-export DYNAMIC_OP="ADD#MUL"
+#export DYNAMIC_OP="ADD#MUL"
+export COMBINED_ENABLE=1
+export SCALAR_TO_HOST_MEM=1
 start_time=$(date +%s)
+
 python3 -u train_1p.py \
 $data_path \
 --device-id ${ASCEND_DEVICE_ID}\
@@ -81,7 +84,7 @@ python3 -u train_1p.py \
 --min-lr 0.0 \
 --dropout 0.1 \
 --weight-decay 0.0 \
---criterion label_smoothed_cross_entropy \
+--criterion cross_entropy \
 --label-smoothing 0.1 \
 --max-sentences 128\
 --max-tokens 102400\
@@ -97,7 +100,7 @@ python3 -u train_1p.py \
 wait
 sed -i "s|if i>100:break|if i>100:pass|g" train_1p.py
-sed -i "s|if m >=2:break|if i>100:pass|g" train_1p.py
+sed -i "s|if m >=2:break|if m>=2:pass|g" train_1p.py
 ##################获取训练数据################
 #训练结束时间,不需要修改
 end_time=$(date +%s)
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.sh
index 3e0bab82c3c675a33bd37cc027ea979a74f29e19..ced96a4427c2ead5552a0d09c962361526edce93 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_1p.sh
@@ -10,10 +10,12 @@ export ASCEND_SLOG_PRINT_TO_STDOUT=0
 export ASCEND_GLOBAL_LOG_LEVEL=3
 export PTCOPY_ENABLE=1
 export TASK_QUEUE_ENABLE=1
-export DYNAMIC_OP="ADD#MUL"
+#export DYNAMIC_OP="ADD#MUL"
+export COMBINED_ENABLE=1
+export SCALAR_TO_HOST_MEM=1
 
 python3 -u train_1p.py \
- ./data/dataset/wmt14_en_de_joined_dict/ \
+./data/dataset/wmt14_en_de_joined_dict/ \
 --device-id 7\
 --arch transformer_wmt_en_de \
 --share-all-embeddings \
@@ -29,7 +31,7 @@ python3 -u train_1p.py \
 --min-lr 0.0 \
 --dropout 0.1 \
 --weight-decay 0.0 \
---criterion label_smoothed_cross_entropy \
+--criterion cross_entropy \
 --label-smoothing 0.1 \
 --max-sentences 128\
 --max-tokens 102400\