diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2428c49657de48394c5632ffddc1a2346c186d27
--- /dev/null
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+
+cur_path=`pwd`
+# Collective communication parameters; do not modify
+export RANK_SIZE=16
+#export MASTER_ADDR=localhost
+export MASTER_PORT=29688
+# Dataset path; keep empty here, do not modify
+data_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+
+# Network name, same as the directory name; adjust per model review
+Network="Transformer_ID0105_for_PyTorch"
+
+export BMMV2_ENABLE=1
+# Number of training epochs
+train_epochs=30
+# Training batch_size; adjust per model review
+batch_size=128
+
+
+
+# Argument validation: data_path is required; other arguments may be added or removed per model,
+# and any argument added here must be defined and initialized above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --epochs* ]];then
+        epochs=`echo ${para#*=}`
+    elif [[ $para == --conf_path* ]];then
+        conf_path=`echo ${para#*=}`
+    elif [[ $para == --server_index* ]];then
+        server_index=`echo ${para#*=}`
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=`echo ${para#*=}`
+    fi
+done
+
+# Verify that data_path was passed in; do not modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+linux_num=`find $conf_path -name "server_*.info" |wc -l`
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+
+
+# Create the DeviceID output directory; do not modify
+if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf $cur_path/output/$ASCEND_DEVICE_ID
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+fi
+
+################# Launch the training script #################
+
+
+# Substitute required parameters into the configuration
+cd $cur_path/..
+DATA_DIR=./data/dataset/wmt14_en_de_joined_dict/
+MODELDIR="./checkpoints/"
+mkdir -p "$MODELDIR"
+LOGFILE="$MODELDIR/log"
+STAT_FILE="log.txt"
+
+
+start_time=$(date +%s)
+NPUS=($(seq 0 7))
+rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'`
+export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'`
+#export NPU_WORLD_SIZE=${#NPUS[@]}
+rank=0
+for i in ${NPUS[@]}
+do
+    export NPU_CALCULATE_DEVICE=${i}
+    mkdir -p $cur_path/output/${i}/
+    export ASCEND_DEVICE_ID=${i}
+    export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'`
+    echo run process ${rank}
+
+
+    python3 train_8p.py \
+        $data_path \
+        --arch transformer_wmt_en_de \
+        --share-all-embeddings \
+        --optimizer adam \
+        --adam-beta1 0.9 \
+        --adam-beta2 0.997 \
+        --addr ${one_node_ip} \
+        --port 29990 \
+        --adam-eps "1e-9" \
+        --clip-norm 0.0 \
+        --lr-scheduler inverse_sqrt \
+        --warmup-init-lr 0.0 \
+        --warmup-updates 4000 \
+        --lr 0.0006 \
+        --min-lr 0.0 \
+        --dropout 0.1 \
+        --weight-decay 0.0 \
+        --criterion label_smoothed_cross_entropy \
+        --label-smoothing 0.1 \
+        --max-sentences 128 \
+        --max-tokens 102400 \
+        --max-epoch $train_epochs \
+        --seed 1 \
+        --save-dir $MODELDIR \
+        --stat-file $STAT_FILE \
+        --log-interval 1 \
+        --amp \
+        --device-id ${rank} \
+        --amp-level O2 > $cur_path/output/${i}/train_${i}.log 2>&1 &
+    let rank++
+done
+wait
+
+
+
+
+################## Collect training data ##################
+# Training end time; do not modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; do not modify
+echo "------------------ Final result ------------------"
+# Report performance as FPS; adjust per model review
+FPS=`grep -rns "Time" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Time" '{print$2}' |awk -F "(" '{print$1}'|tail -n +5|awk '{sum+=$1} END {print"",16*128*NR/sum}'|sed s/[[:space:]]//g`
+
+# Report training accuracy; adjust per model review
+train_accuracy=`grep -rns "Validation" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk 'END {print $6}'`
+
+# Print; do not modify
+echo "Final Performance images/sec : $FPS"
+
+# Print; do not modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance-monitoring results
+# Training case information; do not modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+# Collect performance data; do not modify
+# Throughput
+ActualFPS=${FPS}
+# Duration of a single training iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; adjust per model review
+grep -rns "Time" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Loss" '{print$2}' |awk -F "(" '{print$1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the final iteration; do not modify
+ActualLoss=`awk 'END {print}' ${cur_path}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log; do not modify
+echo "Network = ${Network}" > ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
index dc8f9b79a7bd68f19a9dfce631982cfbb5c440e3..f955e044b1c58d56b34ede041046618c8307f165 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
@@ -71,7 +71,7 @@ do
     echo run process ${rank}
 
 
-    python3 train_8p_new.py \
+    python3 train_8p.py \
     $data_path \
     --arch transformer_wmt_en_de \
     --share-all-embeddings \
@@ -104,7 +104,7 @@ do
 done
 wait
 
-
+
 
 
 ################## Collect training data ##################
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..72e841894480ff5a2c1ad4a18224ccd567e2a8dd
--- /dev/null
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+
+cur_path=`pwd`
+#nmon -s3 -c 500 -f -m $cur_path
+# Collective communication parameters; do not modify
+export RANK_SIZE=16
+#export MASTER_ADDR=localhost
+export MASTER_PORT=29688
+export HCCL_WHITELIST_DISABLE=1
+export BMMV2_ENABLE=1
+# Dataset path; keep empty here, do not modify
+data_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+
+# Network name, same as the directory name; adjust per model review
+Network="Transformer_ID0105_for_PyTorch"
+
+# Training batch_size; adjust per model review
+batch_size=128
+
+
+
+# Argument validation: data_path is required; other arguments may be added or removed per model,
+# and any argument added here must be defined and initialized above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --epochs* ]];then
+        epochs=`echo ${para#*=}`
+    elif [[ $para == --conf_path* ]];then
+        conf_path=`echo ${para#*=}`
+    elif [[ $para == --server_index* ]];then
+        server_index=`echo ${para#*=}`
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=`echo ${para#*=}`
+    fi
+done
+
+# Verify that data_path was passed in; do not modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+linux_num=`find $conf_path -name "server_*.info" |wc -l`
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+
+
+# Create the DeviceID output directory; do not modify
+if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf $cur_path/output/$ASCEND_DEVICE_ID
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+fi
+
+################# Launch the training script #################
+
+
+# Substitute required parameters into the configuration
+cd $cur_path/..
+DATA_DIR=./data/dataset/wmt14_en_de_joined_dict/
+MODELDIR="./checkpoints/"
+mkdir -p "$MODELDIR"
+LOGFILE="$MODELDIR/log"
+STAT_FILE="log.txt"
+
+sed -i "s|if i>100:pass|if i>100:break|g" train_8p.py
+sed -i "s|if m >=2:pass|if m >=2:break|g" train_8p.py
+
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3
+export PTCOPY_ENABLE=1
+export TASK_QUEUE_ENABLE=1
+export DYNAMIC_OP="ADD#MUL"
+start_time=$(date +%s)
+NPUS=($(seq 0 7))
+rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'`
+export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'`
+#export NPU_WORLD_SIZE=${#NPUS[@]}
+rank=0
+for i in ${NPUS[@]}
+do
+    export NPU_CALCULATE_DEVICE=${i}
+    mkdir -p $cur_path/output/${i}/
+    export ASCEND_DEVICE_ID=${i}
+    export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'`
+    echo run process ${rank}
+
+
+    python3 train_8p.py \
+        $data_path \
+        --arch transformer_wmt_en_de \
+        --share-all-embeddings \
+        --optimizer adam \
+        --adam-beta1 0.9 \
+        --distributed-world-size ${NPU_WORLD_SIZE} \
+        --adam-beta2 0.997 \
+        --addr ${one_node_ip} \
+        --port 29990 \
+        --adam-eps "1e-9" \
+        --clip-norm 0.0 \
+        --lr-scheduler inverse_sqrt \
+        --warmup-init-lr 0.0 \
+        --warmup-updates 4000 \
+        --lr 0.0006 \
+        --min-lr 0.0 \
+        --dropout 0.1 \
+        --weight-decay 0.0 \
+        --criterion label_smoothed_cross_entropy \
+        --label-smoothing 0.1 \
+        --max-sentences 128 \
+        --max-tokens 102400 \
+        --seed 1 \
+        --save-dir $MODELDIR \
+        --stat-file $STAT_FILE \
+        --log-interval 1 \
+        --amp \
+        --device-id ${rank} \
+        --amp-level O2 > $cur_path/output/${i}/train_${i}.log 2>&1 &
+    let rank++
+done
+wait
+sed -i "s|if i>100:break|if i>100:pass|g" train_8p.py
+sed -i "s|if m >=2:break|if m >=2:pass|g" train_8p.py
+
+
+
+################## Collect training data ##################
+# Training end time; do not modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; do not modify
+echo "------------------ Final result ------------------"
+# Report performance as FPS; adjust per model review
+FPS=`grep -rns "Time" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Time" '{print$2}' |awk -F "(" '{print$1}'|tail -n +5|awk '{sum+=$1} END {print"",16*128*NR/sum}'|sed s/[[:space:]]//g`
+# Print; do not modify
+echo "Final Performance images/sec : $FPS"
+
+# Print; do not modify
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance-monitoring results
+# Training case information; do not modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+# Collect performance data; do not modify
+# Throughput
+ActualFPS=${FPS}
+# Duration of a single training iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; adjust per model review
+grep -rns "Time" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Loss" '{print$2}' |awk -F "(" '{print$1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the final iteration; do not modify
+ActualLoss=`awk 'END {print}' ${cur_path}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log; do not modify
+echo "Network = ${Network}" > ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
index f4b26836cd96412edf282d783f2236ab1cdbceb4..91db273613365ae5ce413510a2cab5d36fd1e02d 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 cur_path=`pwd`
-nmon -s3 -c 500 -f -m $cur_path
+#nmon -s3 -c 500 -f -m $cur_path
 # Collective communication parameters; do not modify
 export RANK_SIZE=8
 export MASTER_ADDR=localhost
@@ -57,11 +57,11 @@ mkdir -p "$MODELDIR"
 LOGFILE="$MODELDIR/log"
 STAT_FILE="log.txt"
 
-sed -i "s|if i>100:pass|if i>100:break|g" train_8p_new.py
-sed -i "s|if m >=2:pass|if m >=2:break|g" train_8p_new.py
+sed -i "s|if i>100:pass|if i>100:break|g" train_8p.py
+sed -i "s|if m >=2:pass|if m >=2:break|g" train_8p.py
 
 export ASCEND_SLOG_PRINT_TO_STDOUT=0
-export ASCEND_GLOBAL_LOG_LEVEL_ETP=3
+export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3
 export PTCOPY_ENABLE=1
 export TASK_QUEUE_ENABLE=1
 export DYNAMIC_OP="ADD#MUL"
@@ -78,7 +78,7 @@ do
     echo run process ${rank}
 
 
-    python3 train_8p_new.py \
+    python3 train_8p.py \
     $data_path \
     --arch transformer_wmt_en_de \
     --share-all-embeddings \
@@ -110,9 +110,9 @@ do
     let rank++
 done
 wait
-sed -i "s|if i>100:break|if i>100:pass|g" train_8p_new.py
-sed -i "s|if m >=2:break|if m >=2:pass|g" train_8p_new.py
-
+sed -i "s|if i>100:break|if i>100:pass|g" train_8p.py
+sed -i "s|if m >=2:break|if m >=2:pass|g" train_8p.py
+
 
 
 ################## Collect training data ##################
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p.py
index ac8ca3a8fef7dc94c3f7f2fce4fe104e6fb50ed2..ccfd072d2403c944a4b45d598a6f597d1cb33ba1 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p.py
@@ -48,7 +48,16 @@ import dllogger as DLLogger
 from utils.log_helper import AggregatorBackend, setup_logger
 
 
+NPU_CALCULATE_DEVICE = 0
+if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
+    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
+if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
+    torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')
+NPU_WORLD_SIZE = int(os.getenv('NPU_WORLD_SIZE'))
+RANK = int(os.getenv('RANK'))
+torch.distributed.init_process_group('hccl', rank=RANK, world_size=NPU_WORLD_SIZE)
 MAX = 2147483647
+
 def _gen_seeds(shape):
     return np.random.uniform(1, MAX, size=shape).astype(np.float32)
 seed_shape = (32 * 1024 * 12, )
@@ -102,16 +111,18 @@ def main():
     print(args)
     os.environ['MASTER_ADDR'] = args.addr
     os.environ['MASTER_PORT'] = args.port
-    mp.spawn(main_worker, nprocs=args.distributed_world_size, args=(args.distributed_world_size, args))
+    device_id = args.device_id
+    #mp.spawn(main_worker, nprocs=args.distributed_world_size, args=(args.distributed_world_size, args))
+    main_worker(pid_idx=device_id, args=args)
 
 
-def main_worker(pid_idx, device_nums_per_node, args):
+def main_worker(pid_idx, args):
     setup_logger(args)
-
+    print('pid_idx:', str(pid_idx))
     args.distributed_rank = pid_idx
     args.device_id = args.distributed_rank
-    dist.init_process_group(backend=args.dist_backend, world_size=args.distributed_world_size, rank=args.distributed_rank)
+    #dist.init_process_group(backend=args.dist_backend, world_size=NPU_WORLD_SIZE, rank=args.distributed_rank)
 
     loc = 'npu:{}'.format(args.device_id)
     torch.npu.set_device(loc)
@@ -128,16 +139,16 @@ def main_worker(pid_idx, device_nums_per_node, args):
     seed = torch.from_numpy(seed)
     seed = seed.to(loc)
     model = build_model(args, seed=seed)
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))
 
     # Build trainer
     trainer = DDPTrainer(args, model)
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         print('| model {}, criterion {}'.format(args.arch, trainer.criterion.__class__.__name__))
         print('| training on {} NPUs'.format(args.distributed_world_size))
 
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         print('| max sentences per NPU = {}'.format(args.max_sentences))
 
     epoch_itr = data.EpochBatchIterator(
@@ -199,7 +210,7 @@ def main_worker(pid_idx, device_nums_per_node, args):
     train_meter.stop()
     DLLogger.log(step=[], data=run_summary, verbosity=0)
     DLLogger.log(step='RUN', data={'walltime': train_meter.sum}, verbosity=0)
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         print('| done training in {:.1f} seconds'.format(train_meter.sum))
 
 
@@ -219,7 +230,7 @@ def train(args, trainer, datasets, epoch_itr):
     batch_time = AverageMeter('Time', ':6.3f')
     sentence_s = AverageMeter('Sentence/s', ':6.3f')
     losses = AverageMeter('Loss', ':.4f')
-    progress = ProgressMeter(int(num_batches/args.distributed_world_size),
+    progress = ProgressMeter(int(num_batches/args.distributed_world_size/update_freq),
                              [batch_time, sentence_s,losses],
                              prefix = "Epoch: [{}]".format(epoch_itr.epoch))
@@ -242,15 +253,15 @@ def train(args, trainer, datasets, epoch_itr):
         if loss != None:
             losses.update(loss)
 
-        if i >= 2:
+        if i >= 4:
             t = time.time()
             batch_time.update((t - end)/update_freq)
             sentence_s.update(args.max_sentences/(t-end)*args.distributed_world_size)
             end = time.time()
-        if i < 2:
+        if i < 4:
             end = time.time()
-        if i >= 2:
-            if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if i >= 4:
+            if args.distributed_world_size > 1:
                 progress.display(int((i+1)/update_freq))
@@ -265,7 +276,7 @@ def train(args, trainer, datasets, epoch_itr):
 
         # Mid epoch checkpoint
         num_updates = trainer.get_num_updates()
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             if args.save_interval_updates > 0 and num_updates % args.save_interval_updates == 0:
                 valid_losses = validate(args, trainer, datasets, [first_valid])
                 save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
@@ -276,14 +287,14 @@ def train(args, trainer, datasets, epoch_itr):
         if num_updates >= max_update:
             break
 
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             if batch_time.avg > 0:
                 print("End of epoch, batch_size:", args.max_sentences, 'Time: {:.3f}'.format(batch_time.avg),
                       ' Sentence/s@all {:.3f}'.format(
                           args.max_sentences / batch_time.avg * args.distributed_world_size))
 
     # Print epoch stats and reset training meters
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         DLLogger.log(step=trainer.get_num_updates(), data={'speed': trainer.get_throughput_meter().avg}, verbosity=0)
         DLLogger.flush()
 
@@ -314,7 +325,7 @@ def validate(args, trainer, datasets, subsets):
         ).next_epoch_itr(shuffle=False)
 
         # reset validation loss meters
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             DLLogger.flush()
 
         subset_losses = []
@@ -326,7 +337,7 @@ def validate(args, trainer, datasets, subsets):
             DLLogger.flush()
         valid_losses.append(subset_loss)
 
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             print(f'Validation loss on subset {subset}: {subset_loss}')
 
     return valid_losses
@@ -393,7 +404,7 @@ def load_checkpoint(args, trainer, epoch_itr):
     if extra_state is not None:
         # replay train iterator to match checkpoint
         epoch_itr.load_state_dict(extra_state['train_iterator'])
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             print('| loaded checkpoint {} (epoch {} @ {} updates)'.format(
                 checkpoint_path, epoch_itr.epoch, trainer.get_num_updates()))
 
@@ -404,4 +415,4 @@ def load_checkpoint(args, trainer, epoch_itr):
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file
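
Note on the launch scheme these changes implement: instead of forking workers with mp.spawn, each shell script now starts one train_8p.py process per device and passes the topology through environment variables -- NPU_CALCULATE_DEVICE for the local device, RANK computed as local rank + 8 * server_index for the global rank, and NPU_WORLD_SIZE as 8 * node count. The snippet below is a minimal sketch of that pattern, not part of the patch; it assumes the Ascend adapter (torch_npu) provides the torch.npu namespace and registers the 'hccl' backend, and that MASTER_ADDR/MASTER_PORT are exported by the launcher as in these scripts.

    import os

    import torch
    import torch_npu  # assumption: Ascend PyTorch adapter exposing torch.npu and the 'hccl' backend

    # Bind this process to the device chosen by the launch script.
    local_device = int(os.getenv('NPU_CALCULATE_DEVICE', '0'))
    torch.npu.set_device(f'npu:{local_device}')

    # Join the collective using the globally unique rank exported per process;
    # rendezvous uses MASTER_ADDR/MASTER_PORT from the environment.
    rank = int(os.getenv('RANK', '0'))                  # local rank + 8 * server_index
    world_size = int(os.getenv('NPU_WORLD_SIZE', '1'))  # 8 devices per node * number of nodes
    torch.distributed.init_process_group('hccl', rank=rank, world_size=world_size)

Because the group is joined at import time in train_8p.py, the old in-function dist.init_process_group call is commented out rather than moved.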
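
The paired sed commands in the performance scripts depend on two guard statements inside the loops of train_8p.py: flipping "pass" to "break" before the run caps it at roughly 100 iterations and 2 epochs, and the second sed pair restores the file afterwards. A self-contained illustration of the shape of those guards (the loop bounds are hypothetical stand-ins; the real statements live in train_8p.py):

    num_epochs, steps_per_epoch = 5, 500  # stand-ins for the real loop bounds

    for m in range(num_epochs):           # epoch loop
        if m >=2:pass                     # a perf run sed-rewrites ':pass' to ':break'
        for i in range(steps_per_epoch):  # iteration loop
            if i>100:pass                 # likewise rewritten to ':break'

The guards must appear with exactly this spacing for the patterns "if i>100:pass" and "if m >=2:pass" to match.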