diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2428c49657de48394c5632ffddc1a2346c186d27
--- /dev/null
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_16p.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+
+cur_path=`pwd`
+# Collective communication parameters; do not modify
+export RANK_SIZE=16
+#export MASTER_ADDR=localhost
+export MASTER_PORT=29688
+# Dataset path; keep empty here, do not modify
+data_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+
+# Network name, same as the directory name; adjust per model review
+Network="Transformer_ID0105_for_PyTorch"
+
+export BMMV2_ENABLE=1
+# Number of training epochs
+train_epochs=30
+# Training batch_size; adjust per model review
+batch_size=128
+
+
+
+# Argument validation: data_path is required; other arguments may be added or removed per model,
+# and any argument added here must be defined and initialized above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --epochs* ]];then
+        epochs=`echo ${para#*=}`
+    elif [[ $para == --conf_path* ]];then
+        conf_path=`echo ${para#*=}`
+    elif [[ $para == --server_index* ]];then
+        server_index=`echo ${para#*=}`
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=`echo ${para#*=}`
+    fi
+done
+
+# Verify that data_path was passed in; do not modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+linux_num=`find $conf_path -name "server_*.info" |wc -l`
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+
+
+# Create the DeviceID output directory; do not modify
+if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf $cur_path/output/$ASCEND_DEVICE_ID
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+fi
+
+################# Launch the training script #################
+
+
+# Substitute required parameters into the configuration
+cd $cur_path/..
+DATA_DIR=./data/dataset/wmt14_en_de_joined_dict/
+MODELDIR="./checkpoints/"
+mkdir -p "$MODELDIR"
+LOGFILE="$MODELDIR/log"
+STAT_FILE="log.txt"
+
+
+start_time=$(date +%s)
+NPUS=($(seq 0 7))
+rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'`
+export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'`
+#export NPU_WORLD_SIZE=${#NPUS[@]}
+rank=0
+for i in ${NPUS[@]}
+do
+    export NPU_CALCULATE_DEVICE=${i}
+    mkdir -p $cur_path/output/${i}/
+    export ASCEND_DEVICE_ID=${i}
+    export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'`
+    echo run process ${rank}
+
+
+    python3 train_8p.py \
+        $data_path \
+        --arch transformer_wmt_en_de \
+        --share-all-embeddings \
+        --optimizer adam \
+        --adam-beta1 0.9 \
+        --adam-beta2 0.997 \
+        --addr ${one_node_ip} \
+        --port 29990 \
+        --adam-eps "1e-9" \
+        --clip-norm 0.0 \
+        --lr-scheduler inverse_sqrt \
+        --warmup-init-lr 0.0 \
+        --warmup-updates 4000 \
+        --lr 0.0006 \
+        --min-lr 0.0 \
+        --dropout 0.1 \
+        --weight-decay 0.0 \
+        --criterion label_smoothed_cross_entropy \
+        --label-smoothing 0.1 \
+        --max-sentences 128 \
+        --max-tokens 102400 \
+        --max-epoch $train_epochs \
+        --seed 1 \
+        --save-dir $MODELDIR \
+        --stat-file $STAT_FILE \
+        --log-interval 1 \
+        --amp \
+        --device-id ${rank} \
+        --amp-level O2 > $cur_path/output/${i}/train_${i}.log 2>&1 &
+    let rank++
+done
+wait
+
+
+
+
+################## Collect training data ##################
+# Training end time; do not modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; do not modify
+echo "------------------ Final result ------------------"
+# Report performance as FPS; adjust per model review
+FPS=`grep -rns "Time" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Time" '{print$2}' |awk -F "(" '{print$1}'|tail -n +5|awk '{sum+=$1} END {print"",16*128*NR/sum}'|sed s/[[:space:]]//g`
+
+# Report training accuracy; adjust per model review
+train_accuracy=`grep -rns "Validation" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |awk 'END {print $6}'`
+
+# Print; do not modify
+echo "Final Performance images/sec : $FPS"
+
+# Print; do not modify
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance-monitoring results
+# Training case information; do not modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc'
+
+# Collect performance data; do not modify
+# Throughput
+ActualFPS=${FPS}
+# Duration of a single training iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; adjust per model review
+grep -rns "Time" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Loss" '{print$2}' |awk -F "(" '{print$1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the final iteration; do not modify
+ActualLoss=`awk 'END {print}' ${cur_path}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log; do not modify
+echo "Network = ${Network}" > ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
index dc8f9b79a7bd68f19a9dfce631982cfbb5c440e3..f955e044b1c58d56b34ede041046618c8307f165 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_full_8p.sh
@@ -71,7 +71,7 @@ do
     echo run process ${rank}
 
 
-    python3 train_8p_new.py \
+    python3 train_8p.py \
     $data_path \
     --arch transformer_wmt_en_de \
     --share-all-embeddings \
@@ -104,7 +104,7 @@ do
 done
 wait
 
-
+
 
 
 ################## Collect training data ##################
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..72e841894480ff5a2c1ad4a18224ccd567e2a8dd
--- /dev/null
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_16p.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+
+cur_path=`pwd`
+#nmon -s3 -c 500 -f -m $cur_path
+# Collective communication parameters; do not modify
+export RANK_SIZE=16
+#export MASTER_ADDR=localhost
+export MASTER_PORT=29688
+export HCCL_WHITELIST_DISABLE=1
+export BMMV2_ENABLE=1
+# Dataset path; keep empty here, do not modify
+data_path=""
+conf_path=""
+server_index=""
+fix_node_ip=""
+
+# Network name, same as the directory name; adjust per model review
+Network="Transformer_ID0105_for_PyTorch"
+
+# Training batch_size; adjust per model review
+batch_size=128
+
+
+
+# Argument validation: data_path is required; other arguments may be added or removed per model,
+# and any argument added here must be defined and initialized above
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    elif [[ $para == --epochs* ]];then
+        epochs=`echo ${para#*=}`
+    elif [[ $para == --conf_path* ]];then
+        conf_path=`echo ${para#*=}`
+    elif [[ $para == --server_index* ]];then
+        server_index=`echo ${para#*=}`
+    elif [[ $para == --fix_node_ip* ]];then
+        fix_node_ip=`echo ${para#*=}`
+    fi
+done
+
+# Verify that data_path was passed in; do not modify
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+one_node_ip=`find $conf_path -name "server_*0.info"|awk -F "server_" '{print $2}'|awk -F "_" '{print $1}'`
+linux_num=`find $conf_path -name "server_*.info" |wc -l`
+
+export HCCL_IF_IP=$fix_node_ip
+export MASTER_ADDR=$one_node_ip
+
+
+# Create the DeviceID output directory; do not modify
+if [ -d $cur_path/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf $cur_path/output/$ASCEND_DEVICE_ID
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p $cur_path/output/$ASCEND_DEVICE_ID
+fi
+
+################# Launch the training script #################
+
+
+# Substitute required parameters into the configuration
+cd $cur_path/..
+DATA_DIR=./data/dataset/wmt14_en_de_joined_dict/
+MODELDIR="./checkpoints/"
+mkdir -p "$MODELDIR"
+LOGFILE="$MODELDIR/log"
+STAT_FILE="log.txt"
+
+sed -i "s|if i>100:pass|if i>100:break|g" train_8p.py
+sed -i "s|if m >=2:pass|if m >=2:break|g" train_8p.py
+
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3
+export PTCOPY_ENABLE=1
+export TASK_QUEUE_ENABLE=1
+export DYNAMIC_OP="ADD#MUL"
+start_time=$(date +%s)
+NPUS=($(seq 0 7))
+rank_server=`awk 'BEGIN{printf "%.0f\n",8*'${server_index}'}'`
+export NPU_WORLD_SIZE=`awk 'BEGIN{printf "%.0f\n",8*'${linux_num}'}'`
+#export NPU_WORLD_SIZE=${#NPUS[@]}
+rank=0
+for i in ${NPUS[@]}
+do
+    export NPU_CALCULATE_DEVICE=${i}
+    mkdir -p $cur_path/output/${i}/
+    export ASCEND_DEVICE_ID=${i}
+    export RANK=`awk 'BEGIN{printf "%.0f\n",'${rank}'+'${rank_server}'}'`
+    echo run process ${rank}
+
+
+    python3 train_8p.py \
+        $data_path \
+        --arch transformer_wmt_en_de \
+        --share-all-embeddings \
+        --optimizer adam \
+        --adam-beta1 0.9 \
+        --distributed-world-size ${NPU_WORLD_SIZE} \
+        --adam-beta2 0.997 \
+        --addr ${one_node_ip} \
+        --port 29990 \
+        --adam-eps "1e-9" \
+        --clip-norm 0.0 \
+        --lr-scheduler inverse_sqrt \
+        --warmup-init-lr 0.0 \
+        --warmup-updates 4000 \
+        --lr 0.0006 \
+        --min-lr 0.0 \
+        --dropout 0.1 \
+        --weight-decay 0.0 \
+        --criterion label_smoothed_cross_entropy \
+        --label-smoothing 0.1 \
+        --max-sentences 128 \
+        --max-tokens 102400 \
+        --seed 1 \
+        --save-dir $MODELDIR \
+        --stat-file $STAT_FILE \
+        --log-interval 1 \
+        --amp \
+        --device-id ${rank} \
+        --amp-level O2 > $cur_path/output/${i}/train_${i}.log 2>&1 &
+    let rank++
+done
+wait
+sed -i "s|if i>100:break|if i>100:pass|g" train_8p.py
+sed -i "s|if m >=2:break|if m >=2:pass|g" train_8p.py
+
+
+
+################## Collect training data ##################
+# Training end time; do not modify
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print results; do not modify
+echo "------------------ Final result ------------------"
+# Report performance as FPS; adjust per model review
+FPS=`grep -rns "Time" $cur_path/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Time" '{print$2}' |awk -F "(" '{print$1}'|tail -n +5|awk '{sum+=$1} END {print"",16*128*NR/sum}'|sed s/[[:space:]]//g`
+# Print; do not modify
+echo "Final Performance images/sec : $FPS"
+
+# Print; do not modify
+echo "E2E Training Duration sec : $e2e_time"
+
+# Summary of performance-monitoring results
+# Training case information; do not modify
+BatchSize=${batch_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf'
+
+# Collect performance data; do not modify
+# Throughput
+ActualFPS=${FPS}
+# Duration of a single training iteration
+TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'`
+
+# Extract Loss from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; adjust per model review
+grep -rns "Time" ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log |grep -v "all" |awk -F "Loss" '{print$2}' |awk -F "(" '{print$1}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+# Loss of the final iteration; do not modify
+ActualLoss=`awk 'END {print}' ${cur_path}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+
+# Print key information into ${CaseName}.log; do not modify
+echo "Network = ${Network}" > ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "BatchSize = ${BatchSize}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualFPS = ${ActualFPS}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${cur_path}/output/$ASCEND_DEVICE_ID/${CaseName}.log
\ No newline at end of file
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
index f4b26836cd96412edf282d783f2236ab1cdbceb4..91db273613365ae5ce413510a2cab5d36fd1e02d 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/test/train_performance_8p.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 cur_path=`pwd`
-nmon -s3 -c 500 -f -m $cur_path
+#nmon -s3 -c 500 -f -m $cur_path
 # Collective communication parameters; do not modify
 export RANK_SIZE=8
 export MASTER_ADDR=localhost
@@ -57,11 +57,11 @@ mkdir -p "$MODELDIR"
 LOGFILE="$MODELDIR/log"
 STAT_FILE="log.txt"
 
-sed -i "s|if i>100:pass|if i>100:break|g" train_8p_new.py
-sed -i "s|if m >=2:pass|if m >=2:break|g" train_8p_new.py
+sed -i "s|if i>100:pass|if i>100:break|g" train_8p.py
+sed -i "s|if m >=2:pass|if m >=2:break|g" train_8p.py
 
 export ASCEND_SLOG_PRINT_TO_STDOUT=0
-export ASCEND_GLOBAL_LOG_LEVEL_ETP=3
+export ASCEND_GLOBAL_LOG_LEVEL_ETP_ETP=3
 export PTCOPY_ENABLE=1
 export TASK_QUEUE_ENABLE=1
 export DYNAMIC_OP="ADD#MUL"
@@ -78,7 +78,7 @@ do
     echo run process ${rank}
 
 
-    python3 train_8p_new.py \
+    python3 train_8p.py \
     $data_path \
     --arch transformer_wmt_en_de \
     --share-all-embeddings \
@@ -110,9 +110,9 @@ do
     let rank++
 done
 wait
-sed -i "s|if i>100:break|if i>100:pass|g" train_8p_new.py
-sed -i "s|if m >=2:break|if m >=2:pass|g" train_8p_new.py
-
+sed -i "s|if i>100:break|if i>100:pass|g" train_8p.py
+sed -i "s|if m >=2:break|if m >=2:pass|g" train_8p.py
+
 
 
 ################## Collect training data ##################
diff --git a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p.py b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p.py
index ac8ca3a8fef7dc94c3f7f2fce4fe104e6fb50ed2..ccfd072d2403c944a4b45d598a6f597d1cb33ba1 100644
--- a/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p.py
+++ b/PyTorch/dev/nlp/Transformer_ID0105_for_PyTorch/train_8p.py
@@ -48,7 +48,16 @@ import dllogger as DLLogger
 from utils.log_helper import AggregatorBackend, setup_logger
 
 
+NPU_CALCULATE_DEVICE = 0
+if os.getenv('NPU_CALCULATE_DEVICE') and str.isdigit(os.getenv('NPU_CALCULATE_DEVICE')):
+    NPU_CALCULATE_DEVICE = int(os.getenv('NPU_CALCULATE_DEVICE'))
+if torch.npu.current_device() != NPU_CALCULATE_DEVICE:
+    torch.npu.set_device(f'npu:{NPU_CALCULATE_DEVICE}')
+NPU_WORLD_SIZE = int(os.getenv('NPU_WORLD_SIZE'))
+RANK = int(os.getenv('RANK'))
+torch.distributed.init_process_group('hccl', rank=RANK, world_size=NPU_WORLD_SIZE)
 MAX = 2147483647
+
 def _gen_seeds(shape):
     return np.random.uniform(1, MAX, size=shape).astype(np.float32)
 seed_shape = (32 * 1024 * 12, )
@@ -102,16 +111,18 @@ def main():
     print(args)
     os.environ['MASTER_ADDR'] = args.addr
     os.environ['MASTER_PORT'] = args.port
-    mp.spawn(main_worker, nprocs=args.distributed_world_size, args=(args.distributed_world_size, args))
+    device_id = args.device_id
+    #mp.spawn(main_worker, nprocs=args.distributed_world_size, args=(args.distributed_world_size, args))
+    main_worker(pid_idx=device_id, args=args)
 
 
-def main_worker(pid_idx, device_nums_per_node, args):
+def main_worker(pid_idx, args):
     setup_logger(args)
-
+    print('pid_idx:', str(pid_idx))
     args.distributed_rank = pid_idx
     args.device_id = args.distributed_rank
-    dist.init_process_group(backend=args.dist_backend, world_size=args.distributed_world_size, rank=args.distributed_rank)
+    #dist.init_process_group(backend=args.dist_backend, world_size=NPU_WORLD_SIZE, rank=args.distributed_rank)
 
     loc = 'npu:{}'.format(args.device_id)
     torch.npu.set_device(loc)
@@ -128,16 +139,16 @@ def main_worker(pid_idx, device_nums_per_node, args):
     seed = torch.from_numpy(seed)
     seed = seed.to(loc)
     model = build_model(args, seed=seed)
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         print('| num. model params: {}'.format(sum(p.numel() for p in model.parameters())))
 
     # Build trainer
     trainer = DDPTrainer(args, model)
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         print('| model {}, criterion {}'.format(args.arch, trainer.criterion.__class__.__name__))
         print('| training on {} NPUs'.format(args.distributed_world_size))
 
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         print('| max sentences per NPU = {}'.format(args.max_sentences))
 
     epoch_itr = data.EpochBatchIterator(
@@ -199,7 +210,7 @@ def main_worker(pid_idx, device_nums_per_node, args):
     train_meter.stop()
     DLLogger.log(step=[], data=run_summary, verbosity=0)
     DLLogger.log(step='RUN', data={'walltime': train_meter.sum}, verbosity=0)
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         print('| done training in {:.1f} seconds'.format(train_meter.sum))
 
 
@@ -219,7 +230,7 @@ def train(args, trainer, datasets, epoch_itr):
     batch_time = AverageMeter('Time', ':6.3f')
     sentence_s = AverageMeter('Sentence/s', ':6.3f')
     losses = AverageMeter('Loss', ':.4f')
-    progress = ProgressMeter(int(num_batches/args.distributed_world_size),
+    progress = ProgressMeter(int(num_batches/args.distributed_world_size/update_freq),
                              [batch_time, sentence_s,losses],
                              prefix = "Epoch: [{}]".format(epoch_itr.epoch))
@@ -242,15 +253,15 @@ def train(args, trainer, datasets, epoch_itr):
         if loss != None:
             losses.update(loss)
 
-        if i >= 2:
+        if i >= 4:
             t = time.time()
             batch_time.update((t - end)/update_freq)
             sentence_s.update(args.max_sentences/(t-end)*args.distributed_world_size)
             end = time.time()
-        if i < 2:
+        if i < 4:
             end = time.time()
-        if i >= 2:
-            if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if i >= 4:
+            if args.distributed_world_size > 1:
                 progress.display(int((i+1)/update_freq))
@@ -265,7 +276,7 @@ def train(args, trainer, datasets, epoch_itr):
 
         # Mid epoch checkpoint
         num_updates = trainer.get_num_updates()
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             if args.save_interval_updates > 0 and num_updates % args.save_interval_updates == 0:
                 valid_losses = validate(args, trainer, datasets, [first_valid])
                 save_checkpoint(args, trainer, epoch_itr, valid_losses[0])
@@ -276,14 +287,14 @@ def train(args, trainer, datasets, epoch_itr):
         if num_updates >= max_update:
             break
 
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             if batch_time.avg > 0:
                 print("End of epoch, batch_size:", args.max_sentences, 'Time: {:.3f}'.format(batch_time.avg),
                       ' Sentence/s@all {:.3f}'.format(
                           args.max_sentences / batch_time.avg * args.distributed_world_size))
 
     # Print epoch stats and reset training meters
-    if args.distributed_world_size > 1 and args.distributed_rank == 0:
+    if args.distributed_world_size > 1:
         DLLogger.log(step=trainer.get_num_updates(), data={'speed': trainer.get_throughput_meter().avg}, verbosity=0)
         DLLogger.flush()
 
@@ -314,7 +325,7 @@ def validate(args, trainer, datasets, subsets):
         ).next_epoch_itr(shuffle=False)
 
         # reset validation loss meters
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             DLLogger.flush()
 
         subset_losses = []
@@ -326,7 +337,7 @@ def validate(args, trainer, datasets, subsets):
             DLLogger.flush()
         valid_losses.append(subset_loss)
 
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             print(f'Validation loss on subset {subset}: {subset_loss}')
 
     return valid_losses
@@ -393,7 +404,7 @@ def load_checkpoint(args, trainer, epoch_itr):
     if extra_state is not None:
         # replay train iterator to match checkpoint
         epoch_itr.load_state_dict(extra_state['train_iterator'])
-        if args.distributed_world_size > 1 and args.distributed_rank == 0:
+        if args.distributed_world_size > 1:
             print('| loaded checkpoint {} (epoch {} @ {} updates)'.format(
                 checkpoint_path, epoch_itr.epoch, trainer.get_num_updates()))
 
@@ -404,4 +415,4 @@ def load_checkpoint(args, trainer, epoch_itr):
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file
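
Note on the launch scheme these changes implement: instead of forking workers with mp.spawn, each shell script now starts one train_8p.py process per device and passes the topology through environment variables -- NPU_CALCULATE_DEVICE for the local device, RANK computed as local rank + 8 * server_index for the global rank, and NPU_WORLD_SIZE as 8 * node count. The snippet below is a minimal sketch of that pattern, not part of the patch; it assumes the Ascend adapter (torch_npu) provides the torch.npu namespace and registers the 'hccl' backend, and that MASTER_ADDR/MASTER_PORT are exported by the launcher as in these scripts.

    import os

    import torch
    import torch_npu  # assumption: Ascend PyTorch adapter exposing torch.npu and the 'hccl' backend

    # Bind this process to the device chosen by the launch script.
    local_device = int(os.getenv('NPU_CALCULATE_DEVICE', '0'))
    torch.npu.set_device(f'npu:{local_device}')

    # Join the collective using the globally unique rank exported per process;
    # rendezvous uses MASTER_ADDR/MASTER_PORT from the environment.
    rank = int(os.getenv('RANK', '0'))                  # local rank + 8 * server_index
    world_size = int(os.getenv('NPU_WORLD_SIZE', '1'))  # 8 devices per node * number of nodes
    torch.distributed.init_process_group('hccl', rank=rank, world_size=world_size)

Because the group is joined at import time in train_8p.py, the old in-function dist.init_process_group call is commented out rather than moved.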
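
The paired sed commands in the performance scripts depend on two guard statements inside the loops of train_8p.py: flipping "pass" to "break" before the run caps it at roughly 100 iterations and 2 epochs, and the second sed pair restores the file afterwards. A self-contained illustration of the shape of those guards (the loop bounds are hypothetical stand-ins; the real statements live in train_8p.py):

    num_epochs, steps_per_epoch = 5, 500  # stand-ins for the real loop bounds

    for m in range(num_epochs):           # epoch loop
        if m >=2:pass                     # a perf run sed-rewrites ':pass' to ':break'
        for i in range(steps_per_epoch):  # iteration loop
            if i>100:pass                 # likewise rewritten to ':break'

The guards must appear with exactly this spacing for the patterns "if i>100:pass" and "if m >=2:pass" to match.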