diff --git a/PyTorch/built-in/nlp/GPT-2_for_PyTorch/gpt_patch/gpt_patch.py b/PyTorch/built-in/nlp/GPT-2_for_PyTorch/gpt_patch/gpt_patch.py
index ef205768dd82787b7b944860d6cb8255950a7e86..1377d7c655606762f20f5168eea35f675c120f97 100644
--- a/PyTorch/built-in/nlp/GPT-2_for_PyTorch/gpt_patch/gpt_patch.py
+++ b/PyTorch/built-in/nlp/GPT-2_for_PyTorch/gpt_patch/gpt_patch.py
@@ -849,7 +849,7 @@ def FusedScaleMaskSoftmaxForward(self, input, mask, norm_factor):
             input = input.float()
 
         if self.scale is not None:
-            input = input * (scale * 1.0 / norm_factor)
+            input = input * (self.scale * 1.0 / norm_factor)
 
         if self.attn_mask_type == AttnMaskType.causal:
             if self.mask_tri is None: