From 8c3be1de99422fd90b73a316d2c0d9c9eb9199d9 Mon Sep 17 00:00:00 2001
From: x-ting
Date: Fri, 25 Nov 2022 18:01:38 +0800
Subject: [PATCH] =?UTF-8?q?RetinaNet=E9=80=82=E9=85=8DPT-1.8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../contrib/cv/detection/RetinaNet/README.md | 161 ++++++++++++++++++
 .../cv/detection/RetinaNet/README_NPU.md | 40 -----
 .../RetinaNet/detectron2/engine/defaults.py | 1 +
 .../RetinaNet/detectron2/utils/events.py | 10 +-
 .../cv/detection/RetinaNet/requirements.txt | 2 +-
 .../cv/detection/RetinaNet/test/env_npu.sh | 47 +++--
 .../detection/RetinaNet/test/train_eval_8p.sh | 2 +-
 .../detection/RetinaNet/test/train_full_1p.sh | 3 +-
 .../detection/RetinaNet/test/train_full_8p.sh | 6 +-
 .../RetinaNet/test/train_performance_1p.sh | 10 +-
 .../RetinaNet/test/train_performance_8p.sh | 13 +-
 .../cv/detection/RetinaNet/tools/train_net.py | 8 +
 12 files changed, 216 insertions(+), 87 deletions(-)
 create mode 100644 PyTorch/contrib/cv/detection/RetinaNet/README.md
 delete mode 100644 PyTorch/contrib/cv/detection/RetinaNet/README_NPU.md

diff --git a/PyTorch/contrib/cv/detection/RetinaNet/README.md b/PyTorch/contrib/cv/detection/RetinaNet/README.md
new file mode 100644
index 0000000000..379eaf8e2f
--- /dev/null
+++ b/PyTorch/contrib/cv/detection/RetinaNet/README.md
@@ -0,0 +1,161 @@
+# RetinaNet
+
+- [Overview](概述.md)
+- [Preparing the Training Environment](准备训练环境.md)
+- [Starting Training](开始训练.md)
+- [Training Results](训练结果展示.md)
+- [Release Notes](版本说明.md)
+
+
+# Overview
+
+## Description
+
+To address the class-imbalance problem in one-stage detectors, the authors propose a new loss function, Focal Loss, obtained by modifying the standard cross-entropy loss. By down-weighting easily classified examples, it makes the model focus on the sparse set of hard examples during training and prevents the large number of easy negatives from overwhelming the detector. To demonstrate the effectiveness of focal loss, the authors designed a dense detector, RetinaNet, and trained it with focal loss. Experiments show that RetinaNet matches the speed of one-stage detectors while surpassing the accuracy of existing two-stage detectors.
+
+
+- Reference implementation:
+
+  ```
+  url=https://github.com/facebookresearch/detectron2
+  commit_id=96c752ce821a3340e27edd51c28a00665dd32a30
+  ```
+
+- Implementation adapted to Ascend AI Processors:
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/contrib/cv/detection
+  ```
+
+- Obtain the code via Git as follows:
+
+  ```
+  git clone {url}        # clone the repository
+  cd {code_path}         # switch to the model code path; skip this if the repository only contains this model
+  ```
+
+- Alternatively, click "立即下载" (Download Now) to download the source package.
+
+
+# Preparing the Training Environment
+
+## Prepare the Environment
+
+- The firmware and driver, CANN, and PyTorch versions supported by this model are listed in the table below.
+
+  **Table 1** Version compatibility
+
+  | Component         | Version |
+  | ----------------- | ------- |
+  | Hardware          | [1.0.16](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | Firmware & driver | [5.1.RC2](https://www.hiascend.com/hardware/firmware-drivers?tag=commercial) |
+  | CANN              | [5.1.RC2](https://www.hiascend.com/software/cann/commercial?version=5.1.RC2) |
+  | PyTorch           | [1.8.1](https://gitee.com/ascend/pytorch/tree/master/) |
+
+- Environment setup guide.
+
+  See [Preparing the PyTorch Framework Training Environment](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes).
+
+- Install the dependencies.
+
+  ```
+  pip3.7 install -r requirements.txt
+  ```
+  A recent Pillow version is recommended. If the matching torchvision version cannot be installed directly, build it from source: https://github.com/pytorch/vision
+  Suggested versions: Pillow 9.1.0 and torchvision 0.6.0.
+
+
+## Prepare the Dataset
+
+  Obtain the COCO dataset yourself; it consists of the images and the annotations files, both available from the [COCO website](https://cocodataset.org/#download). The labels are also required and can be downloaded from [Google Drive](https://drive.google.com/uc?export=download&id=1cXZR_ckHki6nddOmcysCuuJFM--T-Q6L). Extract the dataset to any directory on the server (the source package root, XXX/coco/, is recommended).
+
+  The dataset directory structure is as follows:
+
+```
+   coco
+   |-- annotations
+   |-- images
+      |-- train2017
+      |-- val2017
+   |-- labels
+      |-- train2017
+      |-- val2017
+```
+
+# Starting Training
+
+## Train the Model
+
+1. Go to the root directory of the extracted source package.
+
+   ```
+   cd /${model_folder_name}
+   ```
+
+2. Run the training scripts.
+
+   The model supports single-machine single-card training and single-machine 8-card training.
+
+   - Single-machine single-card training
+
+     Start 1-card training.
+
+     ```
+     bash ./test/train_full_1p.sh --data_path=real_data_path         # 1p accuracy
+     bash ./test/train_performance_1p.sh --data_path=real_data_path  # 1p performance
+     ```
+
+   - Single-machine 8-card training
+
+     Start 8-card training.
+
+     ```
+     bash ./test/train_full_8p.sh --data_path=real_data_path         # 8p accuracy
+     bash ./test/train_performance_8p.sh --data_path=real_data_path  # 8p performance
+     ```
+
+   Set --data_path to the real dataset path.
+
+   The training script parameters are described below (a short sketch at the end of this step shows how `--batch-size` feeds the `fps` value in the training log).
+
+   ```
+   Common parameters:
+   --data_path        // dataset path
+   --config-file      // default training configuration file
+   --device_ids       // cards to use for training
+   --batch-size       // training batch size, default: 8
+   AMP                // whether to enable mixed precision
+   OPT_LEVEL          // mixed-precision level, default: O1
+   LOSS_SCALE_VALUE   // mixed-precision loss scale, default: 64
+   ```
+
+   After training completes, the weight files are saved under the current path, and the model's training accuracy and performance are printed.
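+   The `--batch-size` value doubles as the figure used for throughput logging: `tools/train_net.py` exports it through the `batch_size` environment variable, and the metric printer in `detectron2/utils/events.py` reads it back to report `batchsize` and `fps` (see the corresponding hunks later in this patch). A minimal illustrative sketch of that mechanism (the numbers below are placeholders, not measurements):
+
+   ```python
+   import os
+
+   # tools/train_net.py side: expose the global batch size to the logger.
+   os.environ['batch_size'] = str(8)          # e.g. the value passed via --batch-size
+
+   # detectron2/utils/events.py side: derive throughput from the smoothed iteration time.
+   iter_time = 0.25                           # seconds per iteration (placeholder)
+   batch_size = int(os.environ['batch_size'])
+   print("batchsize: {:.0f}  fps: {:.3f}".format(batch_size, batch_size / iter_time))
+   ```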
+
+# Training Results
+
+**Table 2** Training results
+
+| Name   | Accuracy | Performance (FPS) | Torch version |
+| ------ | -------- | ----------------- | ------------- |
+| NPU-1p | -        | 5.58              | 1.5           |
+| NPU-8p | 0.372    | 34.875            | 1.5           |
+| NPU-1p | -        | 9.586             | 1.8           |
+| NPU-8p | 0.375    | 64.0              | 1.8           |
+
+
+# Release Notes
+
+## Changes
+
+2022.11.25: Updated to PyTorch 1.8 and re-released.
+
+2022.03.18: First release.
+
+## Known Issues
+
+None.
diff --git a/PyTorch/contrib/cv/detection/RetinaNet/README_NPU.md b/PyTorch/contrib/cv/detection/RetinaNet/README_NPU.md
deleted file mode 100644
index 04255b92bd..0000000000
--- a/PyTorch/contrib/cv/detection/RetinaNet/README_NPU.md
+++ /dev/null
@@ -1,40 +0,0 @@
-# RetinaNet(Detectron2)
-
-## RetinaNet Detail
-
-As of the current date, Ascend-Pytorch is still inefficient for contiguous operations.
-Therefore, RetinaNet is re-implemented using semantics such as custom OP. For details, see detectron2/modeling/meta_arch/retinanet.py
-
-
-## Requirements
-
-- Install PyTorch ([pytorch.org](http://pytorch.org))
-- Install detectron2
-  - Download RetinaNet from https://gitee.com/ascend/modelzoo.git
-  - Then, cd contrib/PyTorch/Official/cv/image_object_detection/RetinaNet
-  - Then, pip3.7 install -e .
-- Download the ImageNet dataset from http://cocodataset.org/#home
-  - Then, and move validation images to labeled subfolders, using [the following shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh)
-- When do the demo, need to download a picture locally and name it input1.jpg
-## Training
-
-Before to train, preparing R-50.pkl and config weight in the config yaml file. 
-To train a model, run `tools/train_net.py` with the desired model architecture and the path to the ImageNet dataset: - -```bash -# 1p train 1p -bash ./test/train_full_1p.sh --data_path=数据集路径 - -# 8p train 8p -bash ./test/train_full_8p.sh --data_path=数据集路径 - -# 8p eval -bash ./test/train_eval_8p.sh --data_path=数据集路径 - -# To ONNX -python3.7.5 pthtar2onnx.py -``` - - - - diff --git a/PyTorch/contrib/cv/detection/RetinaNet/detectron2/engine/defaults.py b/PyTorch/contrib/cv/detection/RetinaNet/detectron2/engine/defaults.py index 0b33ab2e52..0307b4f69a 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/detectron2/engine/defaults.py +++ b/PyTorch/contrib/cv/detection/RetinaNet/detectron2/engine/defaults.py @@ -93,6 +93,7 @@ Run on multiple machines: ) parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") + parser.add_argument("--batch-size", type=int, default=64, help="batch_size of all gpus") parser.add_argument('--device-ids',nargs='+') parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") parser.add_argument( diff --git a/PyTorch/contrib/cv/detection/RetinaNet/detectron2/utils/events.py b/PyTorch/contrib/cv/detection/RetinaNet/detectron2/utils/events.py index a700d42bdc..5c5f008cdd 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/detectron2/utils/events.py +++ b/PyTorch/contrib/cv/detection/RetinaNet/detectron2/utils/events.py @@ -26,7 +26,7 @@ from fvcore.common.history_buffer import HistoryBuffer from detectron2.config import get_cfg _CURRENT_STORAGE_STACK = [] -cfg = get_cfg() + def get_event_storage(): """ @@ -184,11 +184,13 @@ class CommonMetricPrinter(EventWriter): self.logger = logging.getLogger(__name__) self._max_iter = max_iter self._last_write = None + self.cfg = None def write(self): storage = get_event_storage() iteration = storage.iter - + if self.cfg is None: + self.cfg = get_cfg() try: data_time = storage.history("data_time").avg(20) except KeyError: @@ -239,8 +241,8 @@ class CommonMetricPrinter(EventWriter): data_time="data_time: {:.4f} ".format(data_time) if data_time is not None else "", lr=lr, memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else "", - batchsize = "batchsize: {:.0f} " .format(cfg.SOLVER.IMS_PER_BATCH), - fps = "fps: {:.3f} ".format((cfg.SOLVER.IMS_PER_BATCH *8) / iter_time) if iter_time is not None else "", + batchsize = "batchsize: {:.0f} " .format(int(os.environ['batch_size'])), + fps = "fps: {:.3f} ".format((int(os.environ['batch_size'])) / iter_time) if iter_time is not None else "", ) ) diff --git a/PyTorch/contrib/cv/detection/RetinaNet/requirements.txt b/PyTorch/contrib/cv/detection/RetinaNet/requirements.txt index 2004eb6ec5..a494133671 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/requirements.txt +++ b/PyTorch/contrib/cv/detection/RetinaNet/requirements.txt @@ -1,4 +1,4 @@ -torchvision==0.2.2.post2 +torchvision fvcore pycocotools cloudpickle diff --git a/PyTorch/contrib/cv/detection/RetinaNet/test/env_npu.sh b/PyTorch/contrib/cv/detection/RetinaNet/test/env_npu.sh index f96f48f812..c80981db88 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/test/env_npu.sh +++ b/PyTorch/contrib/cv/detection/RetinaNet/test/env_npu.sh @@ -1,36 +1,23 @@ #!/bin/bash -export install_path=/usr/local/Ascend +CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' -if [ -d ${install_path}/toolkit ]; then - export 
LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} - export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH - export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH - export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH - export ASCEND_OPP_PATH=${install_path}/opp +if [ -f $CANN_INSTALL_PATH_CONF ]; then + CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2) else - if [ -d ${install_path}/nnae/latest ];then - export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/nnae/latest - else - export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH - export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ - export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ - export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so - export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH - export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest - fi + CANN_INSTALL_PATH="/usr/local/Ascend" fi +if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then + source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh +else + source ${CANN_INSTALL_PATH}/nnae/set_env.sh +fi #将Host日志输出到串口,0-关闭/1-开启 export ASCEND_SLOG_PRINT_TO_STDOUT=0 #设置默认日志级别,0-debug/1-info/2-warning/3-error export ASCEND_GLOBAL_LOG_LEVEL=3 -#设置Event日志开启标志,0-关闭/1-开启 +#设置Host侧Event日志开启标志,0-关闭/1-开启 export 
ASCEND_GLOBAL_EVENT_ENABLE=0 #设置是否开启taskque,0-关闭/1-开启 export TASK_QUEUE_ENABLE=1 @@ -42,8 +29,18 @@ export COMBINED_ENABLE=1 export DYNAMIC_OP="ADD#MUL" #HCCL白名单开关,1-关闭/0-开启 export HCCL_WHITELIST_DISABLE=1 -export HCCL_IF_IP=$(hostname -I |awk '{print $1}') +#设置device侧日志登记为error +msnpureport -g error -d 0 +msnpureport -g error -d 1 +msnpureport -g error -d 2 +msnpureport -g error -d 3 +msnpureport -g error -d 4 +msnpureport -g error -d 5 +msnpureport -g error -d 6 +msnpureport -g error -d 7 +#关闭Device侧Event日志 +msnpureport -e disable path_lib=$(python3.7 -c """ import sys @@ -64,4 +61,4 @@ print(result)""" echo ${path_lib} -export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/RetinaNet/test/train_eval_8p.sh b/PyTorch/contrib/cv/detection/RetinaNet/test/train_eval_8p.sh index b1b4e7599f..8a688fd2bf 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/test/train_eval_8p.sh +++ b/PyTorch/contrib/cv/detection/RetinaNet/test/train_eval_8p.sh @@ -70,7 +70,7 @@ python3.7 tools/train_net.py \ --config-file configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml \ --eval-only \ AMP 1\ - OPT_LEVEL O2 \ + OPT_LEVEL O1 \ LOSS_SCALE_VALUE 64 \ MODEL.DEVICE npu:0 \ SOLVER.IMS_PER_BATCH 16 \ diff --git a/PyTorch/contrib/cv/detection/RetinaNet/test/train_full_1p.sh b/PyTorch/contrib/cv/detection/RetinaNet/test/train_full_1p.sh index 006be2a84f..b110b86276 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/test/train_full_1p.sh +++ b/PyTorch/contrib/cv/detection/RetinaNet/test/train_full_1p.sh @@ -82,8 +82,9 @@ if [ x"${etp_flag}" != x"true" ];then fi python3.7 tools/train_net.py \ --config-file configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml \ + --batch-size ${batch_size} \ AMP 1 \ - OPT_LEVEL O2 \ + OPT_LEVEL O1 \ MODEL.DEVICE npu:${ASCEND_DEVICE_ID} \ LOSS_SCALE_VALUE 64 \ SOLVER.IMS_PER_BATCH ${batch_size} \ diff --git a/PyTorch/contrib/cv/detection/RetinaNet/test/train_full_8p.sh b/PyTorch/contrib/cv/detection/RetinaNet/test/train_full_8p.sh index 4ff3836cde..9285109f55 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/test/train_full_8p.sh +++ b/PyTorch/contrib/cv/detection/RetinaNet/test/train_full_8p.sh @@ -70,12 +70,14 @@ python3.7 -u tools/train_net.py \ --config-file configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml \ --device-ids 0 1 2 3 4 5 6 7 \ --num-gpus 8 \ + --batch-size ${batch_size} \ AMP 1\ - OPT_LEVEL O2 \ + OPT_LEVEL O1 \ LOSS_SCALE_VALUE 64 \ SOLVER.IMS_PER_BATCH ${batch_size} \ DATALOADER.NUM_WORKERS ${workers} \ - SOLVER.BASE_LR 0.04 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + SOLVER.BASE_LR 0.04 \ + SOLVER.MAX_ITER ${max_iter} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait diff --git a/PyTorch/contrib/cv/detection/RetinaNet/test/train_performance_1p.sh b/PyTorch/contrib/cv/detection/RetinaNet/test/train_performance_1p.sh index 774a2afef4..16c2d44f57 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/test/train_performance_1p.sh +++ b/PyTorch/contrib/cv/detection/RetinaNet/test/train_performance_1p.sh @@ -5,7 +5,7 @@ # 网络名称,同目录名称 Network="RetinaNet" # 训练batch_size -batch_size=16 +batch_size=8 # 训练使用的npu卡数 export RANK_SIZE=1 # 数据集路径,保持为空,不需要修改 @@ -82,13 +82,15 @@ if [ x"${etp_flag}" != x"true" ];then fi python3.7 tools/train_net.py \ --config-file configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml \ + --batch-size ${batch_size} \ 
AMP 1 \ - OPT_LEVEL O2 \ + OPT_LEVEL O1 \ MODEL.DEVICE npu:${ASCEND_DEVICE_ID} \ LOSS_SCALE_VALUE 64 \ SOLVER.IMS_PER_BATCH ${batch_size} \ DATALOADER.NUM_WORKERS ${workers} \ SOLVER.BASE_LR 0.01 \ + DATASETS.TEST '()' \ SOLVER.MAX_ITER ${max_iter} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait @@ -107,11 +109,7 @@ FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_fps.log | # 打印,不需要修改 echo "Final Performance images/sec : $FPS" -# 输出训练精度,需要模型审视修改 -train_accuracy=`grep -A 3 "Evaluation results for bbox:" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | tail -n 1 | awk '{print $2}'` - # 打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" # 性能看护结果汇总 diff --git a/PyTorch/contrib/cv/detection/RetinaNet/test/train_performance_8p.sh b/PyTorch/contrib/cv/detection/RetinaNet/test/train_performance_8p.sh index f588426261..df967b0b65 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/test/train_performance_8p.sh +++ b/PyTorch/contrib/cv/detection/RetinaNet/test/train_performance_8p.sh @@ -12,7 +12,7 @@ export RANK_SIZE=8 data_path="" # 训练最大iter数 -max_iter=5000 +max_iter=1000 # 加载数据进程数 workers=4 @@ -70,12 +70,15 @@ python3.7 -u tools/train_net.py \ --config-file configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml \ --device-ids 0 1 2 3 4 5 6 7 \ --num-gpus 8 \ + --batch-size ${batch_size} \ AMP 1\ - OPT_LEVEL O2 \ + OPT_LEVEL O1 \ LOSS_SCALE_VALUE 64 \ SOLVER.IMS_PER_BATCH ${batch_size} \ DATALOADER.NUM_WORKERS ${workers} \ - SOLVER.BASE_LR 0.04 > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + SOLVER.BASE_LR 0.04 \ + DATASETS.TEST '()' \ + SOLVER.MAX_ITER ${max_iter} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & wait @@ -93,11 +96,7 @@ FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_fps.log | # 打印,不需要修改 echo "Final Performance images/sec : $FPS" -# 输出训练精度,需要模型审视修改 -train_accuracy=`grep -A 3 "Evaluation results for bbox:" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | tail -n 1 | awk '{print $2}'` - # 打印,不需要修改 -echo "Final Train Accuracy : ${train_accuracy}" echo "E2E Training Duration sec : $e2e_time" # 性能看护结果汇总 diff --git a/PyTorch/contrib/cv/detection/RetinaNet/tools/train_net.py b/PyTorch/contrib/cv/detection/RetinaNet/tools/train_net.py index fe8a0cdf85..080f8bed87 100644 --- a/PyTorch/contrib/cv/detection/RetinaNet/tools/train_net.py +++ b/PyTorch/contrib/cv/detection/RetinaNet/tools/train_net.py @@ -33,6 +33,8 @@ import logging import os from collections import OrderedDict import torch +if torch.__version__ >= '1.8.1': + import torch_npu from apex import amp import detectron2.utils.comm as comm @@ -146,6 +148,7 @@ def setup(args): def main(args): + os.environ['batch_size'] = str(args.batch_size) cfg = setup(args) """ @@ -153,6 +156,11 @@ def main(args): consider writing your own training loop (see plain_train_net.py) or subclassing the trainer. """ + option = {} + option["ACL_OP_COMPILER_CACHE_MODE"] = "enable" # cache功能启用 + option["ACL_OP_COMPILER_CACHE_DIR"] = "./cache" # cache所在文件夹 + print("option:",option) + torch.npu.set_option(option) trainer = Trainer(cfg, args) trainer.resume_or_load(resume=args.resume) if cfg.TEST.AUG.ENABLED: -- Gitee
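
For reference, the PT-1.8 adaptation in `tools/train_net.py` comes down to three steps visible in the hunk above: import `torch_npu` when the PyTorch version is 1.8.1 or later, enable the ACL operator compile cache before building the trainer, and export the global batch size for the metric printer. A minimal standalone sketch of the same pattern (the helper name `setup_npu` is illustrative and not part of the patch; it only runs in an Ascend PyTorch environment where `torch_npu` is available):

```python
import os
import torch

if torch.__version__ >= '1.8.1':  # torch_npu ships as a separate package from PyTorch 1.8 onwards
    import torch_npu              # noqa: F401  (registers the NPU device backend)


def setup_npu(batch_size, cache_dir="./cache"):
    # Expose the global batch size so CommonMetricPrinter can log batchsize/fps.
    os.environ['batch_size'] = str(batch_size)
    # Enable the ACL operator compile cache and point it at cache_dir, as main() does.
    option = {
        "ACL_OP_COMPILER_CACHE_MODE": "enable",
        "ACL_OP_COMPILER_CACHE_DIR": cache_dir,
    }
    torch.npu.set_option(option)


if __name__ == "__main__":
    setup_npu(batch_size=64)
```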