diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_postprocess.py b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_postprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..023869fa0adbf04bb38e73feb28298d42d0f7f6c
--- /dev/null
+++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_postprocess.py
@@ -0,0 +1,34 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 3d_nested_unet_postprocess.py
+import os
+import argparse
+from nnunet.inference import predict_simple2
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-fp', '--file_path', help='output bin files path', required=True)
+    args = parser.parse_args()
+    # delegate to the patched predict_simple2; pre_mode=2 merges the output .bin files
+    python_file = predict_simple2.__file__  # e.g. /home/hyp/UNetPlusPlus/pytorch/nnunet/inference/predict_simple2.py
+    command = 'python3 ' + str(python_file) + ' --pre_mode 2 --file_path ' + str(args.file_path)
+    os.system(command)
+
+
+if __name__ == "__main__":
+    main()
+    print('main end')
+
diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_preprocess.py b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_preprocess.py
new file mode 100644
index 0000000000000000000000000000000000000000..143fb939b116e7d44c7dc316584fe84b9156a3f2
--- /dev/null
+++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_preprocess.py
@@ -0,0 +1,34 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 3d_nested_unet_preprocess.py
+import os
+import argparse
+from nnunet.inference import predict_simple2
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-fp', '--file_path', help='input bin files path', required=True)
+    args = parser.parse_args()
+    # delegate to the patched predict_simple2; pre_mode=1 slices the input images into .bin files
+    python_file = predict_simple2.__file__  # e.g. /home/hyp/UNetPlusPlus/pytorch/nnunet/inference/predict_simple2.py
+    command = 'python3 ' + str(python_file) + ' --pre_mode 1 --file_path ' + str(args.file_path)
+    os.system(command)
+
+
+if __name__ == "__main__":
+    main()
+    print('main end')
+
diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_pth2onnx.py b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_pth2onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d8e115f5bd1c2f6919069e2ed6807b4cf7819bb
--- /dev/null
+++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/3d_nested_unet_pth2onnx.py
@@ -0,0 +1,48 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# 3d_nested_unet_pth2onnx.py
+import argparse
+from batchgenerators.utilities.file_and_folder_operations import join
+from nnunet.paths import network_training_output_dir
+from nnunet.training.model_restore import load_model_and_checkpoint_files
+from nnunet.inference.predict2 import pth2onnx
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-fp', '--file_path', help='output onnx file path', required=True)
+    args = parser.parse_args()
+    fp = args.file_path
+    model = '3d_fullres'
+    task_name = 'Task003_Liver'
+    trainer_name = 'nnUNetPlusPlusTrainerV2'
+    plans_identifier = 'nnUNetPlansv2.1'
+    model_folder_name = join(network_training_output_dir, model, task_name, trainer_name + "__" + plans_identifier)
+    folds = None  # fold 0 from the tutorial is detected automatically if the files are in the right place
+    checkpoint_name = 'model_final_checkpoint'
+    trainer, params = load_model_and_checkpoint_files(model_folder_name, folds, mixed_precision=True,
+                                                      checkpoint_name=checkpoint_name)
+    # trainer is an nnUNetPlusPlusTrainerV2; the actual method lives in network_trainer
+    trainer.load_checkpoint_ram(params[0], False)
+    print('pth2onnx start')
+    pth2onnx(trainer.network, fp)
+    print('pth2onnx end')
+    print('ONNX model written to:', fp)
+
+
+if __name__ == "__main__":
+    main()
+    print('main end')
+
diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/License b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/License
new file mode 100644
index 0000000000000000000000000000000000000000..eeac88fb9dc15a1427b41173cf5f136327230c49
--- /dev/null
+++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/License
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
\ No newline at end of file
diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/README.md b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e486602d5b65f382eaa4ca05032652c462665d2
--- /dev/null
+++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/README.md
@@ -0,0 +1,409 @@
+# 3D_Nested_Unet PyTorch Offline Inference Guide
+
+**Files covered by this guide**
+```
+Inference tools
+├── benchmark.aarch64             // offline inference tool (ARM); you may have to build it yourself
+├── benchmark.x86_64              // offline inference tool (x86); you may have to build it yourself
+Scripts
+├── set_env.sh                    // NPU environment variables
+├── clear2345.sh                  // cleans up redundant files and gathers results
+├── gen_dataset_info.py           // generates the info file describing the binary dataset
+├── 3d_nested_unet_pth2onnx.py    // exports the ONNX model file
+├── 3d_nested_unet_preprocess.py  // preprocessing: generates the input bin files
+├── 3d_nested_unet_postprocess.py // postprocessing: merges the output bins into the inference result
+├── onnx_infer.py                 // measures GPU performance
+├── change_infer_path.py          // rewrites the experiment paths
+Model and weight files (the model files are large and have most likely been removed from this repo)
+├── nnunetplusplus.onnx           // ONNX model file
+├── nnunetplusplus.om             // OM model file
+Other files
+├── README.md                     // quick-start guide; largely the same content as this document
+├── new.patch                     // patch that modifies the original source code
+├── requirements.txt              // dependencies, generated with pip freeze > re.txt
+Weight folder download_models (may be packaged and hosted elsewhere; please download it in advance)
+├── Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1/fold_0/*   // weight files
+├── Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1/plans.pkl  // experiment plan file
+Backup folder backup (may be packaged and hosted elsewhere; please download it in advance)
+├── nnUNet_preprocessed/          // experiment configuration files to copy over
+├── output-gpu/                   // complete GPU inference results, including GPU accuracy
+├── output-npu/                   // complete NPU inference results, including NPU accuracy
+├── nnunetplusplus_prep_bin.info  // info file generated for image No. 11 of MSD Task03
+├── perf_vision_batchsize_1_device_0.txt // NPU performance result
+└── perf_T4gpu_batchsize_1.txt    // GPU performance result
+```
+**Key environment:**
+| Dependency | Version |
+| :------: | :------: |
+| CANN | 5.1.RC1.alpha001 |
+| CANN (only for the atc OM conversion) | 5.0.3 / 5.1.RC1.alpha001 |
+| CANN (all steps other than atc) | 5.0.3 / 5.0.4 / 5.1.RC1.alpha001 |
+| python | ==3.7.5 |
+| torch | >=1.6.0 (CPU build is sufficient) |
+| batchgenerators | ==0.21 |
+| numpy | no specific version required |
+| pandas | no specific version required |
+| pillow | no specific version required |
+| SimpleITK | no specific version required |
+| scikit-image | no specific version required |
+| other dependencies (see the steps below) | unspecified |
+
+**Related links:**
+| Name and link | Notes |
+| :------: | :------: |
+| [UNET official repo](https://github.com/MIC-DKFZ/nnUNet) | The official UNET framework. |
+| [UNET++ official repo](https://github.com/MrGiovanni/UNetPlusPlus/tree/master/pytorch) | The official UNET++ code, developed on top of the official UNET framework. |
+| [MSD dataset (Medical Segmentation Decathlon)](http://medicaldecathlon.com/) | The Medical Segmentation Decathlon contains 10 sub-tasks; this guide only evaluates task 3 (liver). All images are 3D grayscale; you can visualize them with ITK-SNAP. |
+| [ITK-SNAP](http://www.itksnap.org/pmwiki/pmwiki.php) | 3D image visualization tool. |
+| [UNET++ model weights](https://github.com/MrGiovanni/UNetPlusPlus/tree/master/pytorch) | Model weights provided by the UNET++ authors; the link is in the "How to use UNet++" section of the official repo. |
+| download_models weight folder | The weights used in this guide, containing only fold_0 and plans.pkl. If no link is available, download the weights provided by the UNET++ authors. |
+| backup folder | The experiment configuration files used in this guide. Located at: obs://ascend-pytorch-model-file/验收-推理/cv/segmentation/3D_Nested_Unet/实验配置文件、推理结果、性能参考文件/ |
+| [benchmark tool](https://gitee.com/ascend/cann-benchmark/tree/master/infer) | Executable needed for inference on the 310. The newer msame tool may also work. |
+## 1 Environment Setup
+
+### 1.1 Get the source code
+Clone the official repository and roll it back to a pinned revision so the code stays stable. The tutorial below matches the model inference guide.
+```
+cd /home/hyp/
+git clone https://github.com/MrGiovanni/UNetPlusPlus.git
+cd UNetPlusPlus
+git reset e145ba63862982bf1099cf2ec11d5466b434ae0b --hard
+```
+
+### 1.2 Install dependencies and patch the model code
+```
+cd /home/hyp/UNetPlusPlus/
+patch -p1 < ../new.patch  # apply the source-code patch
+cd pytorch
+pip install -e .
+pip install batchgenerators==0.21  # this dependency is critical; reinstall it manually with the pinned version
+
+# You can also install the dependencies from requirements.txt, but we do not recommend that method
+pip install -r requirements.txt
+```
+The last argument of the patch command must point to the new.patch file in this repo. Because the model must register its commands into the environment before the correct entry points can be found, the pip step is still required. In addition, "pip install -e ." must be rerun every time code files are added or removed on a large scale, otherwise an "import nnunet" error is very likely.
+
+We do not recommend installing the environment from requirements.txt, because that path easily skips the nnunet registration step and blocks the rest of the experiment.
+
+Note: if "pip install -e ." or a later step still fails to install or import a package or module, you will most likely have to reinstall some packages by hand. In our view the original authors did not fully specify the required dependencies, and those hidden dependencies have since moved through several releases, changing the relationships between packages, so following the authors' instructions verbatim is no longer workable. Across several servers, the packages we have seen misbehave include, but are not limited to:
+ - torch (CPU build is sufficient)
+ - decorator
+ - sympy
+ - SimpleITK
+ - matplotlib
+ - batchgenerators==0.21
+ - pandas
+ - scikit-image
+ - sklearn
+ - nibabel
+
+Each of the packages above has failed to install at least twice across different server environments. Manually reinstalling the package named in the error message, e.g. "pip install batchgenerators==0.21", or switching to another mirror, usually fixes the problem. A second option is installing from offline whl packages. If the problem persists, the underlying system is probably too old, e.g. its GLIBC.
+
+### 1.3 Prepare the dataset and environment
+This model is a secondary development on top of the [official UNET repo](https://github.com/MIC-DKFZ/nnUNet). Following the UNET documentation, the overall pipeline is "data format conversion -> data preprocessing -> training -> validation -> inference", and no stage can be skipped, because each stage depends on the output of the previous one. You could set up the dataset by following the official instructions, but they are tedious. Below we describe the core steps and caveats, and where necessary we provide intermediate result files that let us skip some steps.
+
+#### 1.3.1 Set the nnunet environment variables
+Following the UNET documentation, pick a path with ample disk space, /home/hyp/ in our example, and create a folder named environment there for the experiment data; it does not have to live next to the project. Inside environment create three subfolders: nnUNet_raw_data_base, nnUNet_preprocessed and RESULTS_FOLDER. They do not have to share a parent directory and may even sit on different disks, but for easy lookup we recommend keeping them together, e.g. under environment. Make sure that path (i.e. environment) has at least 400 GB of free space.
+```
+cd environment
+mkdir nnUNet_raw_data_base
+mkdir nnUNet_preprocessed
+mkdir RESULTS_FOLDER
+```
+Finally, append the following environment variables to /root/.bashrc. Every new session will then import them automatically, with no manual export needed.
+```
+export nnUNet_raw_data_base="/home/hyp/environment/nnUNet_raw_data_base"
+export nnUNet_preprocessed="/home/hyp/environment/nnUNet_preprocessed"
+export RESULTS_FOLDER="/home/hyp/environment/RESULTS_FOLDER"
+```
+Refresh the environment variables with source. If you would rather not touch .bashrc, you can type the three export lines above directly in your current session, but they only last for that session.
+```
+source ~/.bashrc
+```
+Note: we strongly recommend putting these folders on an SSD. On a mechanical disk we have observed the model consuming large amounts of IO and making the system sluggish. If you also want to use the GPUs available on your machine, add the following variable as well.
+```
+# make GPUs 0 through 3 visible (four-card setup)
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+```
+
+#### 1.3.2 Get the dataset
+Download the [Medical Segmentation Decathlon](http://medicaldecathlon.com/), take the third sub-task archive Task03_Liver.tar, put it under the environment directory (from here on, environment always means /home/hyp/environment/) and unpack it. Task03_Liver is cropped and unpacked during later steps and occupies about 260 GB while in use.
+```
+# check remaining disk space
+df -h
+# move and unpack the dataset
+mv ./Task03_Liver.tar /home/hyp/environment/
+cd /home/hyp/environment/
+tar xvf Task03_Liver.tar
+```
+At this point the environment folder should look as follows, matching the environment variable paths set in .bashrc in the previous section:
+```
+environment/
+├── nnUNet_preprocessed/
+├── nnUNet_raw_data_base/
+├── RESULTS_FOLDER/
+├── Task03_Liver/
+└── Task03_Liver.tar
+```
+
+#### 1.3.3 Convert the data format
+Inside the environment folder, use the nnunet command-line script to convert the data in the unpacked Task03_Liver folder. The script runs for about 5 minutes and writes its output into the nnUNet_raw_data_base subfolder.
+```
+nnUNet_convert_decathlon_task -i Task03_Liver -p 8
+```
+If your machine is slow or the command does not finish for a long time, lower the -p value; the run will take longer.
+
+Note: if you later want to reset the experiment, or the dataset runs into serious problems (e.g. EOF or other IO errors while reading), delete everything under nnUNet_preprocessed, nnUNet_raw_data_base and RESULTS_FOLDER and redo the steps from this section onwards.
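+
+Before moving on, a quick optional sanity check can confirm the conversion actually landed where later steps expect it. This is only a sketch: it assumes the nnUNet_raw_data_base variable from section 1.3.1 and the Task003_Liver naming used in the rest of this guide.
+```
+# Hypothetical sanity check: list the converted task folder.
+import os
+
+base = os.environ['nnUNet_raw_data_base']  # set in section 1.3.1
+task = os.path.join(base, 'nnUNet_raw_data', 'Task003_Liver')
+print(os.listdir(task))  # imagesTr, imagesTs and labelsTr should appear here after conversion
+```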
+
+#### 1.3.4 Experiment planning and preprocessing
+nnunet is heavily dataset-driven: this step extracts dataset properties such as image sizes and voxel spacings, and generates the configuration files for all later steps. Adding or removing dataset images changes that configuration. Use the nnunet script to collect information for task 003 under nnUNet_raw_data_base. The run lasts between half an hour and six hours depending on the machine, and the results appear in the nnUNet_preprocessed subfolder.
+```
+nnUNet_plan_and_preprocess -t 003 --verify_dataset_integrity
+```
+We have observed that this step can abort unexpectedly without telling the user; this happens at random on machines with little RAM, so make sure the run actually completes. If your machine is weak or the run does not finish after a long time, you can use the command below to lower the system load, at the cost of a markedly longer runtime. As a rule of thumb, run free -m; if the reported available Mem is at or around 30000 or below, we recommend the command below.
+```
+nnUNet_plan_and_preprocess -t 003 --verify_dataset_integrity -tl 1 -tf 1
+```
+Note: if a later step fails with something like "RuntimeError: Expected index [2, 1, 128, 128, 128] to be smaller than self [2, 3, 8, 8, 8] apart from dimension 1", delete everything under environment/nnUNet_preprocessed/Task003_Liver/ and environment/nnUNet_raw_data_base/nnUNet_cropped_data/ and redo this section.
+
+#### 1.3.5 Copy the experiment configuration
+The dataset split produced by nnunet's planning and preprocessing is random, so to keep later steps reproducible we provide supporting material in the backup folder, including a ready-made configuration that fixes the training/validation split. Copy these files over into environment.
+
+Note: check first: if the files under backup/nnUNet_preprocessed/ are in .json format, rename them to .pkl (keeping the base names) before copying.
+```
+# copy the experiment plan .pkl and the dataset split .pkl into environment
+cp -rf /home/hyp/backup/nnUNet_preprocessed /home/hyp/environment/
+```
+Inside environment create a subfolder named input for the images to be inferred, and an output folder for the model's inference output; do not put unrelated files in either folder.
+```
+cd environment
+mkdir input output
+```
+splits_final.pkl stores the dataset split; the 27 validation images are numbered as listed below. Copy these validation images (stored in nnUNet_raw_data_base/nnUNet_raw_data/Task003_Liver/imagesTr/) into the input folder as the images to infer, using create_testset.py. You can also point it at any folder you want to infer.
+```
+# raw image file names look like liver_3_0000.nii.gz, liver_128_0000.nii.gz
+# validation image numbers: 3, 5, 11, 12, 17, 19, 24, 25, 27, 38, 40, 41, 42, 44, 51, 52, 58, 64, 70, 75, 77, 82, 101, 112, 115, 120, 128
+cd /home/hyp/UNetPlusPlus/pytorch/nnunet/inference
+python create_testset.py /home/hyp/environment/input/
+```
+Note: this step differs from the official UNET tutorial, which infers the test-set images under nnUNet_raw_data_base/nnUNet_raw_data/Task003_Liver/imagesTs/; this guide uses the validation set instead.
+
+#### 1.3.6 Get the weight files
+The model uses five-fold cross-validation, so the authors' pretrained weights come as five folders, one per fold. In our measurements the folds differ in accuracy by less than about 1%, so to save compute the whole guide only uses the results of fold 0 (the first cross-validation split).
+
+Download the pretrained [model weights (download models)](https://github.com/MrGiovanni/UNetPlusPlus/tree/master/pytorch), create a subfolder download_models under environment for the archive, and unpack it to obtain five folders and one plan file: fold_0, fold_1, fold_2, fold_3, fold_4, plans.pkl.
+
+A separate archive containing only fold_0 and plans.pkl may be provided with this guide in the future; use it if available.
+
+Copy fold_0 and plans.pkl into environment/RESULTS_FOLDER/nnUNet/3d_fullres/Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1/, creating the subfolders in advance. This simulates a completed training run.
+```
+cd environment
+cp -rf download_models/* /home/hyp/environment/RESULTS_FOLDER/nnUNet/3d_fullres/Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1/
+```
+The final layout is:
+```
+environment/RESULTS_FOLDER/nnUNet/3d_fullres/Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1/
+├── fold_0/
+│   ├── ...
+│   ├── model_final_checkpoint.model
+│   ├── model_final_checkpoint.model.pkl
+│   └── ...
+└── plans.pkl
+```
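+
+To verify the checkpoint landed where the loader expects it, a small illustrative check (paths as in the layout above) can be run:
+```
+# Hypothetical check that the fold 0 checkpoint and plan file are in place.
+import os
+
+folder = os.path.join(os.environ['RESULTS_FOLDER'],
+                      'nnUNet/3d_fullres/Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1')
+for f in ('fold_0/model_final_checkpoint.model',
+          'fold_0/model_final_checkpoint.model.pkl',
+          'plans.pkl'):
+    print(f, os.path.isfile(os.path.join(folder, f)))
+```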
+
+#### 1.3.7 Set the inference paths
+The inference steps keep reusing several path arguments, which is very easy to mix up. Since the nnunet environment variables are already set, the model's paths can be considered stable, so to make later steps easier we bake the paths into the program as defaults. Use change_infer_path.py for this, passing three absolute paths (the three fp arguments must not point to the same directory).
+```
+python change_infer_path.py -fp1 INFERENCE_INPUT_FOLDER -fp2 INFERENCE_OUTPUT_FOLDER -fp3 INFERENCE_SHAPE_PATH
+# e.g.: python change_infer_path.py -fp1 /home/hyp/environment/input/ -fp2 /home/hyp/environment/output/ -fp3 /home/hyp/environment/
+```
+The three path arguments are described below; we recommend pointing them inside the environment folder for easy lookup:
+ - INFERENCE_INPUT_FOLDER: the folder holding the images to be inferred (created in section 1.3.5).
+ - INFERENCE_OUTPUT_FOLDER: the folder that receives the inference results (created in section 1.3.5).
+ - INFERENCE_SHAPE_PATH: the directory holding the file all_shape.txt. It is introduced in later steps: an all_shape.txt is generated there, storing properties of the images currently being inferred. It is an intermediate result file and you never need to inspect it.
+
+Finally, open UNetPlusPlus/pytorch/nnunet/inference/infer_path.py in the project code to check the result; after the change it should look like this:
+```
+# the next two entries are fixed legacy values left over from an earlier revision of the requirements; keep them None
+INFERENCE_BIN_INPUT_FOLDER = None
+INFERENCE_BIN_OUTPUT_FOLDER = None
+
+# the next three entries are the paths written by change_infer_path.py
+INFERENCE_INPUT_FOLDER = '/home/hyp/environment/input/'
+INFERENCE_OUTPUT_FOLDER = '/home/hyp/environment/output/'
+INFERENCE_SHAPE_PATH = '/home/hyp/environment/'
+```
+Note: this may be the first time you open the project code. If a file contains Chinese characters, your editor may show mojibake; switch the encoding to UTF-8.
+
+#### 1.3.8 Copy the inference results
+Inference covers the 27 validation images the model has never trained on; in practice a full run on the NPU takes 2-4 days. Because the process is so tedious, we additionally provide a file with the complete inference results under the fold 0 setting, including the results of the full inference pipeline on the NPU. The following sections use image No. 11 as the example for single-image inference; every other image follows the same procedure, which is how the complete results can be reproduced. The validation image numbers are listed below; copy the NPU results from backup/output-npu/ into INFERENCE_OUTPUT_FOLDER (set to /home/hyp/environment/output/ in section 1.3.7).
+```
+# result file names look like liver_5.nii.gz, liver_112.nii.gz
+# image numbers, as in section 1.3.5: 3, 5, 11, 12, 17, 19, 24, 25, 27, 38, 40, 41, 42, 44, 51, 52, 58, 64, 70, 75, 77, 82, 101, 112, 115, 120, 128
+cp -rf /home/hyp/backup/output-npu/* /home/hyp/environment/output/
+```
+Note: summary.json under output-npu and output-gpu holds the accuracy results of the whole experiment on NPU and GPU, for reference only; section 2.9 replaces it with a fresh evaluation. If you find a plans.json file, rename its extension to .pkl.
+
+### 1.4 Get the [benchmark tool](https://gitee.com/ascend/cann-benchmark/tree/master/infer)
+Put the prebuilt benchmark.x86_64 or benchmark.aarch64 in the current working directory. Use the command below to find out whether your system is x86 or aarch:
+```
+uname -a
+```
+
+## 2 Offline Inference
+
+### 2.1 Generate the OM model
+The key offline-inference steps use the programs below; each takes a user-supplied path argument --file_path:
+ - 3d_nested_unet_pth2onnx.py: conversion mode. Loads the pretrained model and converts it to ONNX; --file_path is the output ONNX file.
+ - 3d_nested_unet_preprocess.py: split mode. Preprocessing: slices the images to be inferred in INFERENCE_INPUT_FOLDER (set to /home/hyp/environment/input/ in 1.3.7) into sub-volumes and writes a batch of input .bin files into --file_path.
+ - 3d_nested_unet_postprocess.py: merge mode. Postprocessing: merges the output .bin files under --file_path into the inference result, which is written to INFERENCE_OUTPUT_FOLDER (set to /home/hyp/environment/output/ in 1.3.7).
+
+First load the pretrained weights into the model and convert it to ONNX; the output is a nnunetplusplus.onnx at the given path, placed under environment for now.
+```
+python 3d_nested_unet_pth2onnx.py --file_path /home/hyp/environment/nnunetplusplus.onnx
+```
+Note: the first run of this program takes noticeably longer than usual.
+
+Next convert the ONNX model to an OM model. Check the device state with npu-smi info first, and once the device is idle run the command below. It builds a batch-size-1 OM model: the input is nnunetplusplus.onnx, the output is named nnunetplusplus, producing nnunetplusplus.om in the current directory; --input_format and --input_shape describe the model's input layout and size.
+```
+cd environment
+atc --framework=5 --model=nnunetplusplus.onnx --output=nnunetplusplus --input_format=NCDHW --input_shape="image:1,1,128,128,128" --log=debug --soc_version=Ascend310
+```
+Note: we have seen the atc command succeed on CANN 5.0.3 but fail on CANN 5.0.4 with: RuntimeError: ({'errCode': 'E90003', 'detailed_cause': 'tuple_reduce_sum not support'}, 'Compile operator failed, cause: Template constraint, detailed information: tuple_reduce_sum not support.'). It succeeds again on CANN 5.1.RC1.alpha001.
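+
+For orientation, the export performed by 3d_nested_unet_pth2onnx.py boils down to a torch.onnx.export call along the lines of the sketch below. This is a sketch only: the real implementation is the patched pth2onnx in nnunet/inference/predict2.py, and the output name and opset here are assumptions. The 'image' input name and the 1x1x128x128x128 NCDHW shape must match the atc --input_shape above.
+```
+import torch
+
+def export_to_onnx(network, onnx_path):
+    # network is the loaded model (assumed on CPU); dummy input matches atc --input_shape
+    network.eval()
+    dummy = torch.randn(1, 1, 128, 128, 128)
+    torch.onnx.export(network, dummy, onnx_path,
+                      input_names=['image'],
+                      output_names=['output'],  # assumed name
+                      opset_version=11)         # assumed opset
+```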
+
+### 2.2 Delete the result file of the image to be inferred
+In essence, when an input image exists in INFERENCE_INPUT_FOLDER (set to /home/hyp/environment/input/ in 1.3.7) but its result is missing from INFERENCE_OUTPUT_FOLDER (set to /home/hyp/environment/output/ in 1.3.7), the difference between the two sets is what the model has to infer. The model then picks one not-yet-inferred image at random, the randomness coming from the IO rates of the competing processes.
+
+Therefore, deleting the result file of a given image number from INFERENCE_OUTPUT_FOLDER triggers one inference pass over that image. Delete result No. 11 from the output folder INFERENCE_OUTPUT_FOLDER to simulate that the other 26 images are already done and inference of image No. 11 is about to start.
+```
+# all validation image numbers: 3, 5, 11, 12, 17, 19, 24, 25, 27, 38, 40, 41, 42, 44, 51, 52, 58, 64, 70, 75, 77, 82, 101, 112, 115, 120, 128
+rm /home/hyp/environment/output/liver_11.nii.gz
+```
+To infer other images, delete their numbered result files from INFERENCE_OUTPUT_FOLDER so the difference with INFERENCE_INPUT_FOLDER is non-empty. We recommend inferring only one image at a time; otherwise you cannot tell exactly which image the model is currently working on or how far along it is, and a large difference set can occupy more disk space than expected.
+
+### 2.3 Preprocess: slice sub-volumes and generate the input bin files
+Following the UNET pipeline, one image to be inferred is sliced into 1000 to 4000 sub-volumes, which must be stored as .bin files in a chosen directory, tentatively environment/input_bins. Run 3d_nested_unet_preprocess.py with --file_path set to the directory that should receive the input bin files; create that folder yourself.
+```
+python 3d_nested_unet_preprocess.py --file_path /home/hyp/environment/input_bins/
+```
+On success this writes a large number of .bin files under --file_path, plus an all_shape.txt under INFERENCE_SHAPE_PATH (set to /home/hyp/environment/ in 1.3.7). That file stores some properties of the current input image and is used later to merge the output .bin results; you never need to read its contents.
+
+Note: make sure there is ample disk space. On a 310 device, following the UNET pipeline, inferring one image takes an estimated extra 200 GB to 800 GB of storage (usually around 300 GB; the upper bound depends on the raw image size, and 800 GB is an estimate) and between half an hour and two hours. There are 27 images to infer, so inferring them all at once is impossible; the only workable scheme is image by image: infer, merge the result right away, delete the used bin files, and repeat.
+
+### 2.4 Generate the info file
+Use UNetPlusPlus/pytorch/nnunet/inference/gen_dataset_info.py to read the paths of all the preprocessed .bin files (the input_bins folder from section 2.3) and generate the matching info file, which is the benchmark tool's input; name the result nnunetplusplus_prep_bin.info. The two 128 arguments are the model's input size.
+```
+python gen_dataset_info.py bin ./environment/input_bins nnunetplusplus_prep_bin.info 128 128
+```
+This also writes four extra files next to nnunetplusplus_prep_bin.info: sth1.info, sth2.info, sth3.info and sth4.info. They are an ordered, non-overlapping split of nnunetplusplus_prep_bin.info; with these split info files, four 310 devices can run inference in parallel and shorten the experiment.
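+
+Each line of the generated info file holds the sample index, the path to one input .bin, and the two shape values, space-separated (see gen_dataset_info.py). An illustrative excerpt, with hypothetical file names:
+```
+0 /home/hyp/environment/input_bins/<sub-volume-0>.bin 128 128
+1 /home/hyp/environment/input_bins/<sub-volume-1>.bin 128 128
+```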
+
+### 2.5 Run inference with the benchmark tool
+Make sure the device is idle, put the benchmark tool in the same directory as the .info files from the previous section, and use it to start one or four inference processes in parallel. -device_id selects the device, -om_path the OM model, -input_text_path the info file to use, and -output_binary=True saves the results as .bin.
+```
+source set_env.sh  # activate the NPU environment
+# option 1: use the combined nnunetplusplus_prep_bin.info and a single 310 device
+./benchmark.x86_64 -model_type=vision -device_id=0 -batch_size=1 -om_path=./environment/nnunetplusplus.om -input_text_path=nnunetplusplus_prep_bin.info -input_width=128 -input_height=128 -output_binary=True -useDvpp=False
+
+# option 2: use the four split infos and four 310 devices, e.g. in four sessions; after all runs finish, clear2345.sh must be run
+./benchmark.x86_64 -model_type=vision -device_id=0 -batch_size=1 -om_path=./environment/nnunetplusplus.om -input_text_path=sth1.info -input_width=128 -input_height=128 -output_binary=True -useDvpp=False
+./benchmark.x86_64 -model_type=vision -device_id=1 -batch_size=1 -om_path=./environment/nnunetplusplus.om -input_text_path=sth2.info -input_width=128 -input_height=128 -output_binary=True -useDvpp=False
+./benchmark.x86_64 -model_type=vision -device_id=2 -batch_size=1 -om_path=./environment/nnunetplusplus.om -input_text_path=sth3.info -input_width=128 -input_height=128 -output_binary=True -useDvpp=False
+./benchmark.x86_64 -model_type=vision -device_id=3 -batch_size=1 -om_path=./environment/nnunetplusplus.om -input_text_path=sth4.info -input_width=128 -input_height=128 -output_binary=True -useDvpp=False
+```
+This automatically creates a result folder in the current directory, with subfolders like dumpOutput_device0 holding the inference output .bin(s) for every input .bin listed in the info file; "device0" means the run on device 0. Files named like perf_vision_batchsize_1_device_0.txt record metrics and performance of the 310 inference run.
+
+Note: this step produces a very large number of output .bin files; keep an eye on remaining disk space with df -h. If disk space gets tight mid-experiment, see the next section.
+
+### 2.6 Clean up redundant results
+For every input .bin, the benchmark tool used in the previous section writes five output .bin files, only one of which we need. The paths in the cleanup script must be adjusted: find clear2345.sh (listed at the top of this guide). It deletes the redundant 310 output .bins whose suffix is 2, 3, 4 or 5 (keeping the .bins with suffix 1), and moves all remaining .bin files into one folder (e.g. device 0's output path) so the later merge can look up sub-volume results in one place. Replace the rm arguments in the script with the correct 310 output paths, and keep the following mv commands, which move the four devices' outputs onto device 0, correct as well. The script is only used during inference; a working example:
+```
+# delete the redundant output .bin files
+rm -rf ./result/dumpOutput_device*/*_2.bin
+rm -rf ./result/dumpOutput_device*/*_3.bin
+rm -rf ./result/dumpOutput_device*/*_4.bin
+rm -rf ./result/dumpOutput_device*/*_5.bin
+
+# move the .bin results from the other folders into one directory
+mv ./result/dumpOutput_device1/* ./result/dumpOutput_device0/
+mv ./result/dumpOutput_device2/* ./result/dumpOutput_device0/
+mv ./result/dumpOutput_device3/* ./result/dumpOutput_device0/
+```
+Normally the script only needs to be set up once. Run it to delete the redundant .bin files. Also run it once after all devices have finished, to make sure all results end up in the same folder.
+```
+bash clear2345.sh
+```
+Note: clear2345.sh can run concurrently with the previous section. Check free disk space with df -h regularly and invoke the script when needed to clear the redundant suffix-2/3/4/5 output .bins; this keeps the experiment feasible on machines with modest storage. With four devices in parallel, running the script every half hour frees roughly 150-200 GB. After the previous section completes in full, invoke the script once more to move all four devices' results into dumpOutput_device0, so that folder holds the complete set of output .bin files.
+
+### 2.7 Merge the result .bin files into the final inference result
+Run 3d_nested_unet_postprocess.py with --file_path set to the directory of .bin files produced by the 310, i.e. merge the .bins under result/dumpOutput_device0/. The merged inference result is written to INFERENCE_OUTPUT_FOLDER (set to /home/hyp/environment/output/ in 1.3.7).
+```
+python 3d_nested_unet_postprocess.py --file_path /home/hyp/result/dumpOutput_device0/
+```
+
+### 2.8 Repeat the experiment
+At this point the inference result for image No. 11 is complete. Delete the files generated by the benchmark tool, i.e. result/dumpOutput_device*/, to free disk space.
+
+To reproduce the remaining results, repeat sections 2.2 through 2.8 until every validation image has been inferred.
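+
+To script the whole cycle, the sketch below strings sections 2.2-2.7 plus the cleanup together for every validation image. It is only a sketch under the path assumptions used throughout this README (single-device variant, scripts in the current directory); adjust it to your setup before use.
+```
+import os
+
+VAL_IDS = [3, 5, 11, 12, 17, 19, 24, 25, 27, 38, 40, 41, 42, 44, 51, 52,
+           58, 64, 70, 75, 77, 82, 101, 112, 115, 120, 128]
+
+for i in VAL_IDS:
+    os.system('rm /home/hyp/environment/output/liver_%d.nii.gz' % i)                                # 2.2
+    os.system('python 3d_nested_unet_preprocess.py --file_path /home/hyp/environment/input_bins/')  # 2.3
+    os.system('python gen_dataset_info.py bin /home/hyp/environment/input_bins '
+              'nnunetplusplus_prep_bin.info 128 128')                                               # 2.4
+    os.system('./benchmark.x86_64 -model_type=vision -device_id=0 -batch_size=1 '
+              '-om_path=/home/hyp/environment/nnunetplusplus.om '
+              '-input_text_path=nnunetplusplus_prep_bin.info '
+              '-input_width=128 -input_height=128 -output_binary=True -useDvpp=False')              # 2.5
+    os.system('bash clear2345.sh')                                                                  # 2.6
+    os.system('python 3d_nested_unet_postprocess.py --file_path ./result/dumpOutput_device0/')      # 2.7
+    os.system('rm -rf ./result/dumpOutput_device* /home/hyp/environment/input_bins/*')              # free disk space
+```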
+├── "task": "Task003_Liver" +└── "timestamp" +``` +这是在第一折交叉验证下的结果,验证集图像只有27张,本文的肝脏数据是在不同的实验仪器下采集的,图像尺寸与图像质量均存在较大差异。选用不同的交叉必然会导致不同的实验结果,但对精度达标的目标来说影响不大。 + +### 2.10 性能评测 +GPU上的性能使用onnx_infer.py来计算,需要在T4服务器上执行。您也可以在从backup/perf_T4gpu_batchsize_1.txt中直接查看性能结果。 +``` +python onnx_infer.py nnunetplusplus.onnx 1,1,128,128,128 +``` +NPU上的性能使用benchmark工具来计算,需要在310服务器上执行。使用benchmark前需要激活set_env.sh环境变量。您也可以在前面benchmark的输出文件夹result/下找到perf_vision_batchsize_1_device_0.txt文件,该文件由benchmark默认生成,在backup中我们也提供了一份实测样本,该结果与以下命令得到的结果几乎相同。 +``` +source set_env.sh +./benchmark.x86_64 -round=20 -om_path=nnunetplusplus.om -device_id=0 -batch_size=1 +``` +以下是实测结果,可供参考: +``` +NPU 310性能:ave_throughputRate = 0.235349samples/s, ave_latency = 4249.14ms +GPU T4性能:Average time spent: 2.68s +``` + +**评测结果:** +| 模型 | 官网pth精度 | GPU推理精度 | 310离线推理精度 | 基准性能 | 310性能 | +| :------: | :------: | :------: | :------: | :------: | :------: | +| 3D nested_unet bs1 | [Liver 1_Dice (val):95.80, Liver 2_Dice (val):65.60](https://github.com/MrGiovanni/UNetPlusPlus/tree/master/pytorch) | Liver 1_Dice (val):96.55, Liver 2_Dice (val):71.94 | Liver 1_Dice (val):96.55, Liver 2_Dice (val):71.97 | 0.3731fps | 0.9414fps | + +备注: + +1.该模型的推理过程从设计之初便不支持batchsize 2及以上,本教程全程使用了batchsize 1。 + +2.本应使用测试集进行精度验证的。但该数据集的测试集不支持单任务的精度测试,其测试集label是不公开的。因此本文只能使用数据集的验证集进行精度测试,这也导致了本文的一些实验步骤与官方不同。 diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/change_infer_path.py b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/change_infer_path.py new file mode 100644 index 0000000000000000000000000000000000000000..a46e74d40f8dc8098a83065c29ab26a6ea59dc94 --- /dev/null +++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/change_infer_path.py @@ -0,0 +1,61 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# change_infer_path.py
+import argparse
+from nnunet.inference import infer_path
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-fp1', '--file_path1', help='INFERENCE_INPUT_FOLDER', required=True)
+    parser.add_argument('-fp2', '--file_path2', help='INFERENCE_OUTPUT_FOLDER', required=True)
+    parser.add_argument('-fp3', '--file_path3', help='INFERENCE_SHAPE_PATH', required=True)
+    args = parser.parse_args()
+    python_file = infer_path.__file__
+    fp1 = args.file_path1
+    fp2 = args.file_path2
+    fp3 = args.file_path3
+    print('reading:', python_file)
+    with open(python_file, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+    # rewrite the three path constants in place, leaving every other line untouched
+    with open(python_file, 'w', encoding='utf-8') as f:
+        for line in lines:
+            if line.startswith('INFERENCE_INPUT_FOLDER'):
+                line = "INFERENCE_INPUT_FOLDER = '" + str(fp1) + "'\n"
+            if line.startswith('INFERENCE_OUTPUT_FOLDER'):
+                line = "INFERENCE_OUTPUT_FOLDER = '" + str(fp2) + "'\n"
+            if line.startswith('INFERENCE_SHAPE_PATH'):
+                line = "INFERENCE_SHAPE_PATH = '" + str(fp3) + "'\n"
+            f.write(line)
+    print('modified:', python_file)
+    print('INFERENCE_INPUT_FOLDER =', fp1)
+    print('INFERENCE_OUTPUT_FOLDER =', fp2)
+    print('INFERENCE_SHAPE_PATH =', fp3)
+    print('done')
+
+
+if __name__ == "__main__":
+    main()
+    print('main end')
+
diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/clear2345.sh b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/clear2345.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cf512f9a2ce0a227fe95fed96dd85816ec69656f
--- /dev/null
+++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/clear2345.sh
@@ -0,0 +1,12 @@
+# delete the redundant output .bin files
+rm -rf ./result/dumpOutput_device*/*_2.bin
+rm -rf ./result/dumpOutput_device*/*_3.bin
+rm -rf ./result/dumpOutput_device*/*_4.bin
+rm -rf ./result/dumpOutput_device*/*_5.bin
+
+# move the .bin results from the other device folders into one directory
+mv ./result/dumpOutput_device1/* ./result/dumpOutput_device0/
+mv ./result/dumpOutput_device2/* ./result/dumpOutput_device0/
+mv ./result/dumpOutput_device3/* ./result/dumpOutput_device0/
+
+echo 'clear2345.sh done'
diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/gen_dataset_info.py b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/gen_dataset_info.py
new file mode 100644
index 0000000000000000000000000000000000000000..68f13f27c3ec643068e0a5662b610cc59325747b
--- /dev/null
+++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/gen_dataset_info.py
@@ -0,0 +1,61 @@
+"""
+    Copyright 2020 Huawei Technologies Co., Ltd
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+    Typical usage example:
+        python gen_dataset_info.py bin ./environment/input_bins nnunetplusplus_prep_bin.info 128 128
+"""
+import os
+import sys
+from glob import glob
+
+
+def write_info(info_name, bin_images, shape):
+    """Write one 'index path width height' line per bin file."""
+    with open(info_name, 'w') as file:
+        for index, img in enumerate(bin_images):
+            file.write(' '.join([str(index), img, shape[0], shape[1]]) + '\n')
+
+
+def get_bin_info(file_path, info_name, shape, split4=True):
+    """
+    @description: get given bin information
+    @param file_path bin file path
+    @param info_name given information name
+    @param shape image shape
+    @param split4 also split the info into four subsets, one per device
+    @return
+    """
+    bin_images = glob(os.path.join(file_path, '*.bin'))
+    write_info(info_name, bin_images, shape)
+    print('number of .bin files:', len(bin_images))
+    print('info written to:', os.path.abspath(info_name))
+    if split4:  # split into four per-device info files
+        sths = ['sth1.info', 'sth2.info', 'sth3.info', 'sth4.info']
+        step = len(bin_images) // 4
+        for i, sth in enumerate(sths):
+            start = i * step
+            end = (i + 1) * step if i < 3 else len(bin_images)
+            write_info(sth, bin_images[start:end], shape)
+        print('successfully split into four subsets:', sths)
+
+
+if __name__ == '__main__':
+    file_type = sys.argv[1]
+    file_path = sys.argv[2]
+    info_name = sys.argv[3]
+    if file_type == 'bin':
+        assert len(sys.argv) == 6, 'The number of input parameters must be equal to 5'
+        shape = [sys.argv[4], sys.argv[5]]
+        get_bin_info(file_path, info_name, shape)
diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/new.patch b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/new.patch
new file mode 100644
index 0000000000000000000000000000000000000000..5307bdd0c26a0cd03c7b0b06b901239d4c101087
--- /dev/null
+++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/new.patch
@@ -0,0 +1,4246 @@
+diff --git a/pytorch/nnunet/evaluation/model_selection/figure_out_want_to_submit2.py b/pytorch/nnunet/evaluation/model_selection/figure_out_want_to_submit2.py
+new file mode 100644
+index 0000000..2a17e8a
+--- /dev/null
++++ b/pytorch/nnunet/evaluation/model_selection/figure_out_want_to_submit2.py
+@@ -0,0 +1,200 @@
++# Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++#     http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++ ++ ++from itertools import combinations ++import nnunet ++from batchgenerators.utilities.file_and_folder_operations import * ++from nnunet.evaluation.add_mean_dice_to_json import foreground_mean ++from nnunet.evaluation.model_selection.ensemble import ensemble ++from nnunet.paths import network_training_output_dir ++import numpy as np ++from subprocess import call ++from nnunet.postprocessing.consolidate_postprocessing import consolidate_folds ++from nnunet.utilities.folder_names import get_output_folder_name ++from nnunet.paths import default_cascade_trainer, default_trainer, default_plans_identifier ++ ++ ++def find_task_name(folder, task_id): ++ candidates = subdirs(folder, prefix="Task%03.0d_" % task_id, join=False) ++ assert len(candidates) > 0, "no candidate for Task id %d found in folder %s" % (task_id, folder) ++ assert len(candidates) == 1, "more than one candidate for Task id %d found in folder %s" % (task_id, folder) ++ return candidates[0] ++ ++ ++def get_mean_foreground_dice(json_file): ++ results = load_json(json_file) ++ return get_foreground_mean(results) ++ ++ ++def get_foreground_mean(results): ++ results_mean = results['results']['mean'] ++ dice_scores = [results_mean[i]['Dice'] for i in results_mean.keys() if i != "0" and i != 'mean'] ++ return np.mean(dice_scores) ++ ++ ++def main(): ++ import argparse ++ parser = argparse.ArgumentParser(usage="This is intended to identify the best model based on the five fold " ++ "cross-validation. Running this script requires all models to have been run " ++ "already. This script will summarize the results of the five folds of all " ++ "models in one json each for easy interpretability") ++ ++ parser.add_argument("-m", '--models', nargs="+", required=False, default=['3d_fullres']) ++ parser.add_argument("-t", '--task_ids', nargs="+", required=False, default='003') ++ ++ parser.add_argument("-tr", type=str, required=False, default=default_trainer, ++ help="nnUNetTrainer class. Default: %s" % default_trainer) ++ parser.add_argument("-ctr", type=str, required=False, default=default_cascade_trainer, ++ help="nnUNetTrainer class for cascade model. Default: %s" % default_cascade_trainer) ++ parser.add_argument("-pl", type=str, required=False, default=default_plans_identifier, ++ help="plans name, Default: %s" % default_plans_identifier) ++ parser.add_argument('-f', '--folds', nargs='+', default=(0, 1, 2, 3, 4), help="use this if you have non-standard folds") ++ parser.add_argument("--strict", required=False, default=True, action="store_true", ++ help="set this flag if you want this script to crash of one of the models is missing") ++ ++ args = parser.parse_args() ++ tasks = [int(i) for i in args.task_ids] ++ ++ models = args.models ++ tr = args.tr ++ trc = args.ctr ++ strict = args.strict ++ pl = args.pl ++ folds = tuple(int(i) for i in args.folds) ++ ++ validation_folder = "validation_raw" ++ ++ # this script now acts independently from the summary jsons. 
That was unnecessary ++ id_task_mapping = {} ++ # for each task, run ensembling using all combinations of two models ++ for t in tasks: ++ # first collect pure model performance (postprocessed) ++ results = {} ++ all_results = {} ++ valid_models = [] ++ for m in models: ++ try: ++ if m == "3d_cascade_fullres": ++ trainer = trc ++ else: ++ trainer = tr ++ ++ if t not in id_task_mapping.keys(): ++ task_name = find_task_name(get_output_folder_name(m), t) ++ id_task_mapping[t] = task_name ++ ++ output_folder = get_output_folder_name(m, id_task_mapping[t], trainer, pl) ++ assert isdir(output_folder), "Output folder for model %s is missing, expected: %s" % (m, output_folder) ++ ++ # we need a postprocessing_json for inference, so that must be present ++ postprocessing_json = join(output_folder, "postprocessing.json") ++ # we need cv_niftis_postprocessed to know the single model performance ++ cv_niftis_folder = join(output_folder, "cv_niftis_raw") ++ if not isfile(postprocessing_json) or not isdir(cv_niftis_folder): ++ print("running missing postprocessing for %s and model %s" % (id_task_mapping[t], m)) ++ consolidate_folds(output_folder, folds=folds) ++ assert isfile(postprocessing_json), "Postprocessing json missing, expected: %s" % postprocessing_json ++ assert isdir(cv_niftis_folder), "Folder with niftis from CV missing, expected: %s" % cv_niftis_folder ++ ++ # obtain mean foreground dice ++ summary_file = join(cv_niftis_folder, "summary.json") ++ results[m] = get_mean_foreground_dice(summary_file) ++ foreground_mean(summary_file) ++ all_results[m] = load_json(summary_file)['results']['mean'] ++ valid_models.append(m) ++ ++ except Exception as e: ++ if strict: ++ raise e ++ else: ++ print("WARNING!") ++ print(e) ++ ++ # now run ensembling and add ensembling to results ++ print("\nFound the following valid models:\n", valid_models) ++ if len(valid_models) > 1: ++ for m1, m2 in combinations(valid_models, 2): ++ ++ trainer_m1 = trc if m1 == "3d_cascade_fullres" else tr ++ trainer_m2 = trc if m2 == "3d_cascade_fullres" else tr ++ ++ ensemble_name = "ensemble_" + m1 + "__" + trainer_m1 + "__" + pl + "--" + m2 + "__" + trainer_m2 + "__" + pl ++ output_folder_base = join(network_training_output_dir, "ensembles", id_task_mapping[t], ensemble_name) ++ maybe_mkdir_p(output_folder_base) ++ ++ network1_folder = get_output_folder_name(m1, id_task_mapping[t], trainer_m1, pl) ++ network2_folder = get_output_folder_name(m2, id_task_mapping[t], trainer_m2, pl) ++ ++ print("ensembling", network1_folder, network2_folder) ++ ensemble(network1_folder, network2_folder, output_folder_base, id_task_mapping[t], validation_folder, folds) ++ # ensembling will automatically do postprocessingget_foreground_mean ++ ++ # now get result of ensemble ++ results[ensemble_name] = get_mean_foreground_dice(join(output_folder_base, "ensembled_raw", "summary.json")) ++ summary_file = join(output_folder_base, "ensembled_raw", "summary.json") ++ foreground_mean(summary_file) ++ all_results[ensemble_name] = load_json(summary_file)['results']['mean'] ++ ++ # now print all mean foreground dice and highlight the best ++ foreground_dices = list(results.values()) ++ best = np.max(foreground_dices) ++ for k, v in results.items(): ++ print(k, v) ++ ++ predict_str = "" ++ best_model = None ++ for k, v in results.items(): ++ if v == best: ++ print("%s submit model %s" % (id_task_mapping[t], k), v) ++ best_model = k ++ print("\nHere is how you should predict test cases. 
Run in sequential order and replace all input and output folder names with your personalized ones\n") ++ if k.startswith("ensemble"): ++ tmp = k[len("ensemble_"):] ++ model1, model2 = tmp.split("--") ++ m1, t1, pl1 = model1.split("__") ++ m2, t2, pl2 = model2.split("__") ++ predict_str += "nnUNet_predict -i FOLDER_WITH_TEST_CASES -o OUTPUT_FOLDER_MODEL1 -tr " + tr + " -ctr " + trc + " -m " + m1 + " -p " + pl + " -t " + \ ++ id_task_mapping[t] + "\n" ++ predict_str += "nnUNet_predict -i FOLDER_WITH_TEST_CASES -o OUTPUT_FOLDER_MODEL2 -tr " + tr + " -ctr " + trc + " -m " + m2 + " -p " + pl + " -t " + \ ++ id_task_mapping[t] + "\n" ++ ++ predict_str += "nnUNet_ensemble -f OUTPUT_FOLDER_MODEL1 OUTPUT_FOLDER_MODEL2 -o OUTPUT_FOLDER -pp " + join(network_training_output_dir, "ensembles", id_task_mapping[t], k, "postprocessing.json") + "\n" ++ else: ++ predict_str += "nnUNet_predict -i FOLDER_WITH_TEST_CASES -o OUTPUT_FOLDER_MODEL1 -tr " + tr + " -ctr " + trc + " -m " + k + " -p " + pl + " -t " + \ ++ id_task_mapping[t] + "\n" ++ print(predict_str) ++ ++ summary_folder = join(network_training_output_dir, "ensembles", id_task_mapping[t]) ++ maybe_mkdir_p(summary_folder) ++ with open(join(summary_folder, "prediction_commands.txt"), 'w') as f: ++ f.write(predict_str) ++ ++ num_classes = len([i for i in all_results[best_model].keys() if i != 'mean']) ++ with open(join(summary_folder, "summary.csv"), 'w') as f: ++ f.write("model") ++ for c in range(1, num_classes): ++ f.write(",class%d" % c) ++ f.write(",average") ++ f.write("\n") ++ for m in all_results.keys(): ++ f.write(m) ++ for c in range(1, num_classes): ++ f.write(",%01.4f" % all_results[m][str(c)]["Dice"]) ++ f.write(",%01.4f" % all_results[m]['mean']["Dice"]) ++ f.write("\n") ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/pytorch/nnunet/experiment_planning/nnUNet_convert_decathlon_task.py b/pytorch/nnunet/experiment_planning/nnUNet_convert_decathlon_task.py +index cf5285a..a0384f0 100644 +--- a/pytorch/nnunet/experiment_planning/nnUNet_convert_decathlon_task.py ++++ b/pytorch/nnunet/experiment_planning/nnUNet_convert_decathlon_task.py +@@ -24,14 +24,14 @@ def crawl_and_remove_hidden_from_decathlon(folder): + "labelsTr and imagesTs" + subf = subfolders(folder, join=False) + assert 'imagesTr' in subf, "This does not seem to be a decathlon folder. Please give me a " \ +- "folder that starts with TaskXX and has the subfolders imagesTr, " \ +- "labelsTr and imagesTs" ++ "folder that starts with TaskXX and has the subfolders imagesTr, " \ ++ "labelsTr and imagesTs" + assert 'imagesTs' in subf, "This does not seem to be a decathlon folder. Please give me a " \ +- "folder that starts with TaskXX and has the subfolders imagesTr, " \ +- "labelsTr and imagesTs" ++ "folder that starts with TaskXX and has the subfolders imagesTr, " \ ++ "labelsTr and imagesTs" + assert 'labelsTr' in subf, "This does not seem to be a decathlon folder. Please give me a " \ +- "folder that starts with TaskXX and has the subfolders imagesTr, " \ +- "labelsTr and imagesTs" ++ "folder that starts with TaskXX and has the subfolders imagesTr, " \ ++ "labelsTr and imagesTs" + _ = [os.remove(i) for i in subfiles(folder, prefix=".")] + _ = [os.remove(i) for i in subfiles(join(folder, 'imagesTr'), prefix=".")] + _ = [os.remove(i) for i in subfiles(join(folder, 'labelsTr'), prefix=".")] +@@ -45,9 +45,10 @@ def main(): + "therefore expect 3D niftixs instead, with one file per modality. 
" + "This utility will convert 4D MSD data into the format nnU-Net " + "expects") +- parser.add_argument("-i", help="Input folder. Must point to a TaskXX_TASKNAME folder as downloaded from the MSD " +- "website", required=True) +- parser.add_argument("-p", required=False, default=default_num_threads, type=int, ++ parser.add_argument("-i", required=False, default='/data/yupeng/Task03_Liver/', ++ help="Input folder. Must point to a TaskXX_TASKNAME folder as downloaded from the MSD " ++ "website") ++ parser.add_argument("-p", required=False, default=8, type=int, + help="Use this to specify how many processes are used to run the script. " + "Default is %d" % default_num_threads) + parser.add_argument("-output_task_id", required=False, default=None, type=int, +diff --git a/pytorch/nnunet/experiment_planning/nnUNet_plan_and_preprocess.py b/pytorch/nnunet/experiment_planning/nnUNet_plan_and_preprocess.py +index bb6785b..0b0ccd9 100644 +--- a/pytorch/nnunet/experiment_planning/nnUNet_plan_and_preprocess.py ++++ b/pytorch/nnunet/experiment_planning/nnUNet_plan_and_preprocess.py +@@ -28,10 +28,11 @@ def main(): + import argparse + + parser = argparse.ArgumentParser() +- parser.add_argument("-t", "--task_ids", nargs="+", help="List of integers belonging to the task ids you wish to run" +- " experiment planning and preprocessing for. Each of these " +- "ids must, have a matching folder 'TaskXXX_' in the raw " +- "data folder") ++ parser.add_argument("-t", "--task_ids", default="3", nargs="+", ++ help="List of integers belonging to the task ids you wish to run" ++ " experiment planning and preprocessing for. Each of these " ++ "ids must, have a matching folder 'TaskXXX_' in the raw " ++ "data folder") + parser.add_argument("-pl3d", "--planner3d", type=str, default="ExperimentPlanner3D_v21", + help="Name of the ExperimentPlanner class for the full resolution 3D U-Net and U-Net cascade. " + "Default is ExperimentPlanner3D_v21. 
Can be 'None', in which case these U-Nets will not be " +diff --git a/pytorch/nnunet/hyp_getnpz.py b/pytorch/nnunet/hyp_getnpz.py +new file mode 100644 +index 0000000..5113f93 +--- /dev/null ++++ b/pytorch/nnunet/hyp_getnpz.py +@@ -0,0 +1,36 @@ ++import numpy as np ++import os ++import nibabel as nib ++import pickle ++ ++ ++raw_data = '/data/yupeng/environment_variables/nnUNet_raw_data_base/nnUNet_raw_data/Task003_Liver/imagesTr/liver_0_0000.nii.gz' ++crop_data = '/data/yupeng/environment_variables/nnUNet_raw_data_base/nnUNet_cropped_data/Task003_Liver/liver_0.npz' ++crop_data = '/data/yupeng/environment_variables/nnUNet_preprocessed/Task003_Liver/nnUNetData_plans_v2.1_stage0/liver_0.npz' ++pickle_data = '/data/yupeng/environment_variables/nnUNet_preprocessed/Task003_Liver/nnUNetPlansv2.1_plans_3D.pkl' ++ ++print('start') ++ ++p_data = pickle.load(open(pickle_data, 'rb')) ++ ++ ++ ++c_data = np.load(crop_data) ++print(c_data.files) ++ ++r_data = nib.load(raw_data).get_data() ++r_data = r_data / np.amax(r_data) ++ ++min2 = min(r_data) ++ ++for i in range(512): ++ for j in range(512): ++ for k in range(75): ++ data1 = r_data[i][j][k] ++ data2 = c_data.f.data[0][k][i][j] ++ if data1 != data2: ++ print("wrong") ++ break ++ ++ ++print('end') +\ No newline at end of file +diff --git a/pytorch/nnunet/inference/copy_val_to_test.py b/pytorch/nnunet/inference/copy_val_to_test.py +new file mode 100644 +index 0000000..405345b +--- /dev/null ++++ b/pytorch/nnunet/inference/copy_val_to_test.py +@@ -0,0 +1,19 @@ ++import os ++import shutil ++ ++# fold = 0 ++val_folder = '/root/heyupeng/environment/Task03_Liver/imagesTr/' ++test_folder = '/root/heyupeng/environment/nnUNet_raw_data_base/nnUNet_raw_data/Task003_Liver/imagesTs/' ++val_list = [101, 11, 112, 115, 12, 120, 128, 17, 19, 24, 25, 27, 3, 38, 40, 41, 42, 44, 5, 51, 52, 58, 64, 70, 75, 77, ++ 82] ++print('val_list:', val_list) ++for val in val_list: ++ source_file = 'liver_' + str(val) + '.nii.gz' ++ source_path = os.path.join(val_folder, source_file) ++ target_file = 'liver_' + str(val) + '_0000.nii.gz' ++ target_path = os.path.join(test_folder, target_file) ++ print('copy: ', source_path, '->', target_path) ++ shutil.copyfile(source_path, target_path) ++print('done') ++ ++ +diff --git a/pytorch/nnunet/inference/create_testset.py b/pytorch/nnunet/inference/create_testset.py +new file mode 100644 +index 0000000..cd13c1e +--- /dev/null ++++ b/pytorch/nnunet/inference/create_testset.py +@@ -0,0 +1,28 @@ ++import os ++import pdb ++import sys ++import shutil ++ ++ ++def main(input_path): ++ if input_path is None: ++ raise Exception('Parameter need to be filled in: input_path') ++ env_dist = os.environ ++ p1 = env_dist.get('nnUNet_raw_data_base') ++ val_list = [101, 11, 112, 115, 12, 120, 128, 17, 19, 24, 25, 27, 3, 38, 40, 41, 42, 44, 5, 51, 52, 58, 64, 70, 75, ++ 77, 82] # 数据集的验证集部分 ++ p2 = 'nnUNet_raw_data/Task003_Liver/imagesTr/' ++ target_path = os.path.join(p1, p2) ++ for v in val_list: ++ file_name = 'liver_' + str(v) + '_0000.nii.gz' ++ file_path = os.path.join(target_path, file_name) ++ # pdb.set_trace() ++ print('copy file:[', file_path, '] to folder:', input_path) ++ shutil.copy(file_path, input_path) ++ print('done') ++ ++ ++ ++if __name__ == "__main__": ++ input_path = sys.argv[1] ++ main(input_path) +diff --git a/pytorch/nnunet/inference/delete_other_data.py b/pytorch/nnunet/inference/delete_other_data.py +new file mode 100644 +index 0000000..b58367f +--- /dev/null ++++ b/pytorch/nnunet/inference/delete_other_data.py +@@ -0,0 +1,30 @@ 
++import os
++import pdb
++
++
++def listdir(path, list_name):
++    for file in os.listdir(path):
++        file_path = os.path.join(path, file)
++        if os.path.isdir(file_path):
++            listdir(file_path, list_name)
++        elif os.path.splitext(file_path)[1] == '.gz':
++            list_name.append(file_path)
++    return list_name
++
++val_list = [101, 11, 112, 115, 12, 120, 128, 17, 19, 24, 25, 27, 3, 38, 40, 41, 42, 44, 5, 51, 52, 58, 64, 70, 75, 77,
++            82]
++target_folder = ['imagesTr', 'labelsTr', 'imagesTs']
++for i in range(len(target_folder)):
++    t = target_folder[i]
++    if i == 2:
++        val_list = [132]
++    p = os.path.join('./Task03_Liver/', t)
++    files = []
++    files = listdir(p, files)
++    files = set(files)
++    for e in val_list:
++        str_e = './Task03_Liver/' + t + '/liver_' + str(e) + '.nii.gz'
++        files.remove(str_e)
++    for f in files:
++        os.remove(f)
++print('end')
+diff --git a/pytorch/nnunet/inference/gen_dataset_info.py b/pytorch/nnunet/inference/gen_dataset_info.py
+new file mode 100644
+index 0000000..d1cb265
+--- /dev/null
++++ b/pytorch/nnunet/inference/gen_dataset_info.py
+@@ -0,0 +1,83 @@
++"""
++    Copyright 2020 Huawei Technologies Co., Ltd
++
++    Licensed under the Apache License, Version 2.0 (the "License");
++    you may not use this file except in compliance with the License.
++    You may obtain a copy of the License at
++
++    http://www.apache.org/licenses/LICENSE-2.0
++
++    Unless required by applicable law or agreed to in writing, software
++    distributed under the License is distributed on an "AS IS" BASIS,
++    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++    See the License for the specific language governing permissions and
++    limitations under the License.
++    Typical usage example:
++        python3 gen_dataset_info.py bin <bin_dir> <info_name> <height> <width>
++"""
++import os
++import sys
++from glob import glob
++import pdb
++
++
++def get_bin_info(file_path, info_name, shape, split4=True):
++    """
++    @description: get given bin information
++    @param file_path bin file path
++    @param info_name given information name
++    @param shape image shape
++    @return
++    """
++    bin_images = glob(os.path.join(file_path, '*.bin'))
++    with open(info_name, 'w') as file:
++        for index, img in enumerate(bin_images):
++            content = ' '.join([str(index), img, shape[0], shape[1]])
++            file.write(content)
++            file.write('\n')
++    print('info written to:', info_name)
++    if split4:  # optionally split the info into four shards, one per device
++        sths = ['sth1.info', 'sth2.info', 'sth3.info', 'sth4.info']
++        for i in range(len(sths)):
++            s = sths[i]
++            s = os.path.join(info_name, '..', s)
++            sths[i] = s
++        length = len(bin_images)
++        step = length // 4
++        shards = [bin_images[0: step], bin_images[step: 2*step],
++                  bin_images[2*step: 3*step], bin_images[3*step:]]
++        for info_file, shard in zip(sths, shards):
++            with open(info_file, 'w') as file:
++                for index, img in enumerate(shard):
++                    content = ' '.join([str(index), img, shape[0], shape[1]])
++                    file.write(content)
++                    file.write('\n')
++        print('successfully split into four shards:', sths)
++
++
++if __name__ == '__main__':
++    file_type = sys.argv[1]
++    file_path = sys.argv[2]
++    info_name = sys.argv[3]
++    if file_type == 'bin':
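++        # expected invocation (sketch): python3 gen_dataset_info.py bin <bin_dir> <info_name> <height> <width>;
++        # sys.argv[4] and sys.argv[5] are the two spatial dimensions written into every info line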
++        assert len(sys.argv) == 6, 'The number of input parameters must be equal to 5'  # check argc before indexing
++        shape1 = sys.argv[4]
++        shape2 = sys.argv[5]
++        shape = [shape1, shape2]
++        get_bin_info(file_path, info_name, shape)
++    print('end main')
+\ No newline at end of file
+diff --git a/pytorch/nnunet/inference/infer_path.py b/pytorch/nnunet/inference/infer_path.py
+new file mode 100644
+index 0000000..03ab90e
+--- /dev/null
++++ b/pytorch/nnunet/inference/infer_path.py
+@@ -0,0 +1,27 @@
++import os
++import sys
++
++# legacy settings; more functionality may be added later
++INFERENCE_BIN_INPUT_FOLDER = None
++INFERENCE_BIN_OUTPUT_FOLDER = None
++
++# settings for server 47
++# INFERENCE_INPUT_FOLDER = '/root/heyupeng2/environment/input/'  # folder holding the images to run inference on
++# INFERENCE_OUTPUT_FOLDER = '/root/heyupeng2/environment/output'  # folder that receives the inference results
++# INFERENCE_SHAPE_PATH = '/root/heyupeng2/environment/'  # directory containing the file all_shape.txt
++# INFERENCE_BIN_INPUT_FOLDER = '/root/heyupeng2/environment/bin_files/'  # directory of the input .bin files
++# INFERENCE_BIN_OUTPUT_FOLDER = '/root/heyupeng2/result/dumpOutput_device0/'  # directory of the output .bin files; this path may need to be re-confirmed later
++
++# settings for the 3090 server
++# INFERENCE_INPUT_FOLDER = '/data/yupeng/environment_variables/nnUNet_raw_data_base/nnUNet_raw_data/Task003_Liver/imagesTs/'  # folder holding the images to run inference on
++# INFERENCE_OUTPUT_FOLDER = '/data/yupeng/environment_variables/output/'  # folder that receives the inference results
++# INFERENCE_SHAPE_PATH = '/data/yupeng/environment_variables/'  # directory containing the file all_shape.txt
++# INFERENCE_BIN_INPUT_FOLDER = '/data/yupeng/environment_variables/output/bin_file/'  # directory of the input .bin files
++# INFERENCE_BIN_OUTPUT_FOLDER = '/data/yupeng/environment_variables/output/bin_file_benchmark/real_output/'  # directory of the output .bin files; this path may need to be re-confirmed later
++
++# settings for server 241
++INFERENCE_INPUT_FOLDER = '/home/modelzoo/contrib/ACL_PyTorch/Research/cv/segmentation/3D_Nested_Unet/environment/input/'
++INFERENCE_OUTPUT_FOLDER = '/home/modelzoo/contrib/ACL_PyTorch/Research/cv/segmentation/3D_Nested_Unet/environment/output/'
++INFERENCE_SHAPE_PATH = '/home/modelzoo/contrib/ACL_PyTorch/Research/cv/segmentation/3D_Nested_Unet/environment/'
++
++
+diff --git a/pytorch/nnunet/inference/model2onnx.py b/pytorch/nnunet/inference/model2onnx.py
+new file mode 100644
+index 0000000..e69de29
+diff --git a/pytorch/nnunet/inference/predict.py b/pytorch/nnunet/inference/predict.py
+index fdb43bc..bf140cb 100644
+--- a/pytorch/nnunet/inference/predict.py
++++ b/pytorch/nnunet/inference/predict.py
+@@ -177,8 +177,15 @@ def predict_cases(model, list_of_lists, output_filenames, folds, save_npz, num_t
+ 
+     print("emptying cuda cache")
+     torch.cuda.empty_cache()
+-
+-    print("loading parameters for folds,", folds)
++    '''
++    model='/data/yupeng/environment_variables/RESULTS_FOLDER/nnUNet/3d_fullres/Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1'
++    folds=None
++    mixed_precision=True
++    checkpoint_name='model_final_checkpoint'
++    trainer=class-nnUNetPlusPlusTrainerV2
++    params=list 5 -> dict 6 -> epoch state_dict optimizer_state_dict lr_scheduler_state_dict plot_stuff amp_grad_scaler
++    '''
++    print("loading parameters for folds,", folds)  # the parameters are fetched here; they are not loaded into the model yet
+     trainer, params = load_model_and_checkpoint_files(model, folds, mixed_precision=mixed_precision, checkpoint_name=checkpoint_name)
+ 
+     if segmentation_export_kwargs is None:
+@@ -202,6 +209,7 @@ def predict_cases(model, list_of_lists, output_filenames, folds, save_npz, num_t
+     all_output_files = []
+     for preprocessed in preprocessing:
+         output_filename, (d, dct) = preprocessed
++        print('output_filename, d, dct = ', output_filename, d, dct)
+         all_output_files.append(all_output_files)
+         if
 isinstance(d, str):
+             data = np.load(d)
+@@ -211,10 +219,19 @@
+         print("predicting", output_filename)
+         softmax = []
+         for p in params:
++            print("len(p)=", len(p))
+             trainer.load_checkpoint_ram(p, False)
+             softmax.append(trainer.predict_preprocessed_data_return_seg_and_softmax(d, do_tta, trainer.data_aug_params[
+                 'mirror_axes'], True, step_size=step_size, use_gaussian=True, all_in_gpu=all_in_gpu,
+                 mixed_precision=mixed_precision)[1][None])
+ 
+         softmax = np.vstack(softmax)
+         softmax_mean = np.mean(softmax, 0)
+diff --git a/pytorch/nnunet/inference/predict2.py b/pytorch/nnunet/inference/predict2.py
+new file mode 100644
+index 0000000..263dbd2
+--- /dev/null
++++ b/pytorch/nnunet/inference/predict2.py
+@@ -0,0 +1,845 @@
++# Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++#     http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++
++
++import argparse
++from copy import deepcopy
++from typing import Tuple, Union, List
++
++import numpy as np
++from batchgenerators.augmentations.utils import resize_segmentation
++from nnunet.inference.segmentation_export import save_segmentation_nifti_from_softmax, save_segmentation_nifti
++from batchgenerators.utilities.file_and_folder_operations import *
++from multiprocessing import Process, Queue
++import torch
++import SimpleITK as sitk
++import shutil
++from multiprocessing import Pool
++from nnunet.postprocessing.connected_components import load_remove_save, load_postprocessing
++from nnunet.training.model_restore import load_model_and_checkpoint_files
++from nnunet.training.network_training.nnUNetTrainer import nnUNetTrainer
++from nnunet.utilities.one_hot_encoding import to_one_hot
++from nnunet.utilities.to_torch import maybe_to_torch, to_cuda
++import pdb
++
++
++def preprocess_save_to_queue(preprocess_fn, q, list_of_lists, output_files, segs_from_prev_stage, classes,
++                             transpose_forward):
++    # suppress output
++    # sys.stdout = open(os.devnull, 'w')
++
++    errors_in = []
++    for i, l in enumerate(list_of_lists):
++        try:
++            output_file = output_files[i]
++            print("preprocessing", output_file)
++            d, _, dct = preprocess_fn(l)
++            # print(output_file, dct)
++            if segs_from_prev_stage[i] is not None:
++                assert isfile(segs_from_prev_stage[i]) and segs_from_prev_stage[i].endswith(
++                    ".nii.gz"), "segs_from_prev_stage" \
++                                " must point to a " \
++                                "segmentation file"
++                seg_prev = sitk.GetArrayFromImage(sitk.ReadImage(segs_from_prev_stage[i]))
++                # check to see if shapes match
++                img = sitk.GetArrayFromImage(sitk.ReadImage(l[0]))
++                assert all([i == j for i, j in zip(seg_prev.shape, img.shape)]), "image and segmentation from previous " \
++                                                                                 "stage don't have the same pixel array " \
++                                                                                 "shape! 
image: %s, seg_prev: %s" % \
++                                                                                 (l[0], segs_from_prev_stage[i])
++                seg_prev = seg_prev.transpose(transpose_forward)
++                seg_reshaped = resize_segmentation(seg_prev, d.shape[1:], order=1, cval=0)
++                seg_reshaped = to_one_hot(seg_reshaped, classes)
++                d = np.vstack((d, seg_reshaped)).astype(np.float32)
++            """There is a problem with python process communication that prevents us from communicating objects
++            larger than 2 GB between processes (basically when the length of the pickle string that will be sent is
++            communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long
++            enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually
++            patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will
++            then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either
++            filename or np.ndarray and will handle this automatically"""
++            print(d.shape)
++            if np.prod(d.shape) > (2e9 / 4 * 0.85):  # *0.85 just to be safe, 4 because float32 is 4 bytes
++                print(
++                    "This output is too large for python process-process communication. "
++                    "Saving output temporarily to disk")
++                np.save(output_file[:-7] + ".npy", d)
++                d = output_file[:-7] + ".npy"
++            q.put((output_file, (d, dct)))
++        except KeyboardInterrupt:
++            raise KeyboardInterrupt
++        except Exception as e:
++            print("error in", l)
++            print(e)
++    q.put("end")
++    if len(errors_in) > 0:
++        print("There were some errors in the following cases:", errors_in)
++        print("These cases were ignored.")
++    else:
++        print("This worker has ended successfully, no errors to report")
++    # restore output
++    # sys.stdout = sys.__stdout__
++
++
++def preprocess_multithreaded(trainer, list_of_lists, output_files, num_processes=2, segs_from_prev_stage=None):
++    if segs_from_prev_stage is None:
++        segs_from_prev_stage = [None] * len(list_of_lists)
++
++    num_processes = min(len(list_of_lists), num_processes)
++
++    classes = list(range(1, trainer.num_classes))
++    assert isinstance(trainer, nnUNetTrainer)
++    q = Queue(1)
++    processes = []
++    for i in range(num_processes):
++        pr = Process(target=preprocess_save_to_queue, args=(trainer.preprocess_patient, q,
++                                                            list_of_lists[i::num_processes],
++                                                            output_files[i::num_processes],
++                                                            segs_from_prev_stage[i::num_processes],
++                                                            classes, trainer.plans['transpose_forward']))
++        pr.start()
++        processes.append(pr)
++
++    try:
++        end_ctr = 0
++        while end_ctr != num_processes:
++            item = q.get()
++            if item == "end":
++                end_ctr += 1
++                continue
++            else:
++                yield item
++
++    finally:
++        for p in processes:
++            if p.is_alive():
++                p.terminate()  # this should not happen but better safe than sorry right
++            p.join()
++
++        q.close()
++
++
++def pth2onnx(model, output_file=r'/home/yupeng/HUAWEI/UNetPlusPlus/pytorch/nnunet/run/nnunetplusplus.onnx'):
++    # put the model into eval mode
++    model.eval()
++    # input node name
++    input_names = ["image"]
++    # output node name
++    output_names = ["class"]
++    dynamic_axes = {'image': {0: '-1'}, 'class': {0: '-1'}}
++    dummy_input = torch.randn(1, 1, 128, 128, 128)
++    # dummy_input = to_cuda(dummy_input)
++    # verbose=True prints the ONNX nodes together with the PyTorch code lines they come from
++    torch.onnx.export(model, dummy_input, output_file, input_names=input_names, dynamic_axes=dynamic_axes,
++                      output_names=output_names, opset_version=11, verbose=True)
++
++
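++# Editor's note: a minimal, illustrative sanity check for the exported graph, assuming
++# the optional onnx package is available (it is not a dependency of this repo):
++#
++#     import onnx
++#     m = onnx.load('nnunetplusplus.onnx')       # or whatever path was passed as output_file
++#     onnx.checker.check_model(m)                # structural validation
++#     print([i.name for i in m.graph.input])     # expect ['image']
++#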
++def predict_cases(model, list_of_lists, output_filenames, folds, save_npz, num_threads_preprocessing,
++                  num_threads_nifti_save, segs_from_prev_stage=None, do_tta=True, mixed_precision=True, overwrite_existing=False,
++                  all_in_gpu=False, step_size=0.5, checkpoint_name="model_final_checkpoint",
++                  segmentation_export_kwargs: dict = None, pre_mode=None, fp=None):
++    """
++    :param segmentation_export_kwargs:
++    :param model: folder where the model is saved, must contain fold_x subfolders
++    :param list_of_lists: [[case0_0000.nii.gz, case0_0001.nii.gz], [case1_0000.nii.gz, case1_0001.nii.gz], ...]
++    :param output_filenames: [output_file_case0.nii.gz, output_file_case1.nii.gz, ...]
++    :param folds: default: (0, 1, 2, 3, 4) (but can also be 'all' or a subset of the five folds, for example use (0, )
++    for using only fold_0
++    :param save_npz: default: False
++    :param num_threads_preprocessing:
++    :param num_threads_nifti_save:
++    :param segs_from_prev_stage:
++    :param do_tta: default: True, can be set to False for an 8x speedup at the cost of a reduced segmentation quality
++    :param overwrite_existing: default: False
++    :param mixed_precision: if None then we take no action. If True/False we overwrite what the model has in its init
++    :return:
++    """
++    assert len(list_of_lists) == len(output_filenames)
++    if segs_from_prev_stage is not None: assert len(segs_from_prev_stage) == len(output_filenames)
++
++    pool = Pool(num_threads_nifti_save)
++    results = []
++
++    cleaned_output_files = []
++    for o in output_filenames:
++        dr, f = os.path.split(o)
++        if len(dr) > 0:
++            maybe_mkdir_p(dr)
++        if not f.endswith(".nii.gz"):
++            f, _ = os.path.splitext(f)
++            f = f + ".nii.gz"
++        cleaned_output_files.append(join(dr, f))
++
++    if not overwrite_existing:
++        print("number of cases:", len(list_of_lists))
++        not_done_idx = [i for i, j in enumerate(cleaned_output_files) if not isfile(j)]
++
++        cleaned_output_files = [cleaned_output_files[i] for i in not_done_idx]
++        list_of_lists = [list_of_lists[i] for i in not_done_idx]
++        if segs_from_prev_stage is not None:
++            segs_from_prev_stage = [segs_from_prev_stage[i] for i in not_done_idx]
++
++        print("number of cases that still need to be predicted:", len(cleaned_output_files))
++
++    print("emptying cuda cache")
++    torch.cuda.empty_cache()
++    '''
++    model='/data/yupeng/environment_variables/RESULTS_FOLDER/nnUNet/3d_fullres/Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1'
++    folds=None
++    mixed_precision=True
++    checkpoint_name='model_final_checkpoint'
++    trainer=class-nnUNetPlusPlusTrainerV2
++    params=list 5 -> dict 6 -> epoch state_dict optimizer_state_dict lr_scheduler_state_dict plot_stuff amp_grad_scaler
++    '''
++    print("loading parameters for folds,", folds)  # the parameters are fetched here; they are not loaded into the model yet
++    trainer, params = load_model_and_checkpoint_files(model, folds, mixed_precision=mixed_precision, checkpoint_name=checkpoint_name)
++
++    if segmentation_export_kwargs is None:
++        if 'segmentation_export_params' in trainer.plans.keys():
++            force_separate_z = trainer.plans['segmentation_export_params']['force_separate_z']
++            interpolation_order = trainer.plans['segmentation_export_params']['interpolation_order']
++            interpolation_order_z = trainer.plans['segmentation_export_params']['interpolation_order_z']
++        else:  # this is the branch taken here
++            force_separate_z = None
++            interpolation_order = 1
++            interpolation_order_z = 0
++    else:
++        force_separate_z = segmentation_export_kwargs['force_separate_z']
++        interpolation_order = segmentation_export_kwargs['interpolation_order']
++        interpolation_order_z = segmentation_export_kwargs['interpolation_order_z']
++
++    print("starting preprocessing generator")
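++    # Editor's note: preprocess_multithreaded is a generator; each item it yields is
++    # (output_filename, (d, dct)), where d is either the preprocessed array itself or,
++    # for cases exceeding the ~2 GB pickle limit described in preprocess_save_to_queue,
++    # the path of a temporary .npy file that is reloaded (and deleted) below.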
++    preprocessing = preprocess_multithreaded(trainer, list_of_lists, cleaned_output_files, num_threads_preprocessing,
++                                             segs_from_prev_stage)
++    # unet++V2class, [['/data/yupeng/environment_variables/nnUNet_raw_data_base/nnUNet_raw_data/Task003_Liver/imagesTs/liver_132_0000.nii.gz']]
++    # ['/data/yupeng/environment_variables/output/liver_132.nii.gz'], 6, None
++    print("starting prediction...")
++    if int(pre_mode) == -1:
++        p = params[0]
++        trainer.load_checkpoint_ram(p, False)  # nnUNetPlusPlusTrainerV2; the actual implementation lives in network_trainer
++        print('pth2onnx start')
++        pth2onnx(trainer.network, fp)
++        print('pth2onnx end')
++        print('the ONNX model has been written to:', fp)
++        import sys
++        sys.exit(0)
++    all_output_files = []
++    for preprocessed in preprocessing:
++        output_filename, (d, dct) = preprocessed
++        print('output_filename, d, dct = ', output_filename, d, dct)
++        all_output_files.append(output_filename)  # was append(all_output_files), which appended the list to itself
++        if isinstance(d, str):
++            data = np.load(d)
++            os.remove(d)
++            d = data
++        print("predicting", output_filename)
++        softmax = []
++        params = [params[0]]  # only run inference with the first model (first fold)
++        for p in params:
++            # trainer.load_checkpoint_ram(p, False)  # nnUNetPlusPlusTrainerV2; the actual implementation lives in network_trainer
++            # output_filename = '/data/yupeng/environment_variables/output/liver_132.nii.gz'
++            ttttt = trainer.predict_preprocessed_data_return_seg_and_softmax(d, do_tta, trainer.data_aug_params[
++                'mirror_axes'], True, step_size=step_size, use_gaussian=True, all_in_gpu=all_in_gpu,
++                mixed_precision=mixed_precision, img_name=output_filename, pre_mode=pre_mode, fp=fp)  # tuple(ndarray 489 500 500; 3 489 500 500)
++            softmax.append(ttttt[1][None])  # adds a leading axis -> 1 3 489 500 500
++        # softmax is a list whose elements are ndarrays of shape 1 3 489 500 500
++        softmax = np.vstack(softmax)  # 5 3 489 500 500 when all five folds are used
++        softmax_mean = np.mean(softmax, 0)  # 3 489 500 500
++
++        transpose_forward = trainer.plans.get('transpose_forward')  # [0,1,2]
++        if transpose_forward is not None:
++            transpose_backward = trainer.plans.get('transpose_backward')
++            softmax_mean = softmax_mean.transpose([0] + [i + 1 for i in transpose_backward])
++
++        if save_npz:  # False
++            npz_file = output_filename[:-7] + ".npz"
++        else:
++            npz_file = None
++
++        if hasattr(trainer, 'regions_class_order'):  # False
++            region_class_order = trainer.regions_class_order
++        else:
++            region_class_order = None
++
++        """There is a problem with python process communication that prevents us from communicating objects
++        larger than 2 GB between processes (basically when the length of the pickle string that will be sent is
++        communicated by the multiprocessing.Pipe object then the placeholder (\%i I think) does not allow for long
++        enough strings (lol). This could be fixed by changing i to l (for long) but that would require manually
++        patching system python code. We circumvent that problem here by saving softmax_pred to a npy file that will
++        then be read (and finally deleted) by the Process. save_segmentation_nifti_from_softmax can take either
++        filename or np.ndarray and will handle this automatically"""
++        bytes_per_voxel = 4
++        if all_in_gpu:
++            bytes_per_voxel = 2  # if all_in_gpu then the return value is half (float16)
++        if np.prod(softmax_mean.shape) > (2e9 / bytes_per_voxel * 0.85):  # * 0.85 just to be safe
++            print(
++                "This output is too large for python process-process communication. 
Saving output temporarily to disk") ++ np.save(output_filename[:-7] + ".npy", softmax_mean) ++ softmax_mean = output_filename[:-7] + ".npy" ++ ++ results.append(pool.starmap_async(save_segmentation_nifti_from_softmax, ++ ((softmax_mean, output_filename, dct, interpolation_order, region_class_order, ++ None, None, ++ npz_file, None, force_separate_z, interpolation_order_z),) ++ )) ++ ++ print("inference done. Now waiting for the segmentation export to finish...") ++ _ = [i.get() for i in results] ++ # now apply postprocessing ++ # first load the postprocessing properties if they are present. Else raise a well visible warning ++ results = [] ++ pp_file = join(model, "postprocessing.json") # '/data/yupeng/environment_variables/RESULTS_FOLDER/nnUNet/3d_fullres/Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1/postprocessing.json' ++ if isfile(pp_file): ++ print("postprocessing...") ++ shutil.copy(pp_file, os.path.abspath(os.path.dirname(output_filenames[0]))) ++ # for_which_classes stores for which of the classes everything but the largest connected component needs to be ++ # removed ++ for_which_classes, min_valid_obj_size = load_postprocessing(pp_file) ++ results.append(pool.starmap_async(load_remove_save, ++ zip(output_filenames, output_filenames, ++ [for_which_classes] * len(output_filenames), ++ [min_valid_obj_size] * len(output_filenames)))) ++ _ = [i.get() for i in results] ++ else: ++ print("WARNING! Cannot run postprocessing because the postprocessing file is missing. Make sure to run " ++ "consolidate_folds in the output folder of the model first!\nThe folder you need to run this in is " ++ "%s" % model) ++ ++ pool.close() ++ pool.join() ++ ++def predict_cases_fast(model, list_of_lists, output_filenames, folds, num_threads_preprocessing, ++ num_threads_nifti_save, segs_from_prev_stage=None, do_tta=True, mixed_precision=True, ++ overwrite_existing=False, ++ all_in_gpu=False, step_size=0.5, checkpoint_name="model_final_checkpoint", ++ segmentation_export_kwargs: dict = None): ++ assert len(list_of_lists) == len(output_filenames) ++ if segs_from_prev_stage is not None: assert len(segs_from_prev_stage) == len(output_filenames) ++ ++ pool = Pool(num_threads_nifti_save) ++ results = [] ++ ++ cleaned_output_files = [] ++ for o in output_filenames: ++ dr, f = os.path.split(o) ++ if len(dr) > 0: ++ maybe_mkdir_p(dr) ++ if not f.endswith(".nii.gz"): ++ f, _ = os.path.splitext(f) ++ f = f + ".nii.gz" ++ cleaned_output_files.append(join(dr, f)) ++ ++ if not overwrite_existing: ++ print("number of cases:", len(list_of_lists)) ++ not_done_idx = [i for i, j in enumerate(cleaned_output_files) if not isfile(j)] ++ ++ cleaned_output_files = [cleaned_output_files[i] for i in not_done_idx] ++ list_of_lists = [list_of_lists[i] for i in not_done_idx] ++ if segs_from_prev_stage is not None: ++ segs_from_prev_stage = [segs_from_prev_stage[i] for i in not_done_idx] ++ ++ print("number of cases that still need to be predicted:", len(cleaned_output_files)) ++ ++ print("emptying cuda cache") ++ torch.cuda.empty_cache() ++ ++ print("loading parameters for folds,", folds) ++ trainer, params = load_model_and_checkpoint_files(model, folds, mixed_precision=mixed_precision, checkpoint_name=checkpoint_name) ++ ++ if segmentation_export_kwargs is None: ++ if 'segmentation_export_params' in trainer.plans.keys(): ++ force_separate_z = trainer.plans['segmentation_export_params']['force_separate_z'] ++ interpolation_order = trainer.plans['segmentation_export_params']['interpolation_order'] ++ interpolation_order_z = 
trainer.plans['segmentation_export_params']['interpolation_order_z'] ++ else: ++ force_separate_z = None ++ interpolation_order = 1 ++ interpolation_order_z = 0 ++ else: ++ force_separate_z = segmentation_export_kwargs['force_separate_z'] ++ interpolation_order = segmentation_export_kwargs['interpolation_order'] ++ interpolation_order_z = segmentation_export_kwargs['interpolation_order_z'] ++ ++ print("starting preprocessing generator") ++ preprocessing = preprocess_multithreaded(trainer, list_of_lists, cleaned_output_files, num_threads_preprocessing, ++ segs_from_prev_stage) ++ ++ print("starting prediction...") ++ for preprocessed in preprocessing: ++ print("getting data from preprocessor") ++ output_filename, (d, dct) = preprocessed ++ print("got something") ++ if isinstance(d, str): ++ print("what I got is a string, so I need to load a file") ++ data = np.load(d) ++ os.remove(d) ++ d = data ++ ++ # preallocate the output arrays ++ # same dtype as the return value in predict_preprocessed_data_return_seg_and_softmax (saves time) ++ softmax_aggr = None # np.zeros((trainer.num_classes, *d.shape[1:]), dtype=np.float16) ++ all_seg_outputs = np.zeros((len(params), *d.shape[1:]), dtype=int) ++ print("predicting", output_filename) ++ ++ for i, p in enumerate(params): ++ trainer.load_checkpoint_ram(p, False) ++ ++ res = trainer.predict_preprocessed_data_return_seg_and_softmax(d, do_tta, ++ trainer.data_aug_params['mirror_axes'], True, ++ step_size=step_size, use_gaussian=True, ++ all_in_gpu=all_in_gpu, ++ mixed_precision=mixed_precision) ++ ++ if len(params) > 1: ++ # otherwise we dont need this and we can save ourselves the time it takes to copy that ++ print("aggregating softmax") ++ if softmax_aggr is None: ++ softmax_aggr = res[1] ++ else: ++ softmax_aggr += res[1] ++ all_seg_outputs[i] = res[0] ++ ++ print("obtaining segmentation map") ++ if len(params) > 1: ++ # we dont need to normalize the softmax by 1 / len(params) because this would not change the outcome of the argmax ++ seg = softmax_aggr.argmax(0) ++ else: ++ seg = all_seg_outputs[0] ++ ++ print("applying transpose_backward") ++ transpose_forward = trainer.plans.get('transpose_forward') ++ if transpose_forward is not None: ++ transpose_backward = trainer.plans.get('transpose_backward') ++ seg = seg.transpose([i for i in transpose_backward]) ++ ++ print("initializing segmentation export") ++ results.append(pool.starmap_async(save_segmentation_nifti, ++ ((seg, output_filename, dct, interpolation_order, force_separate_z, ++ interpolation_order_z),) ++ )) ++ print("done") ++ ++ print("inference done. Now waiting for the segmentation export to finish...") ++ _ = [i.get() for i in results] ++ # now apply postprocessing ++ # first load the postprocessing properties if they are present. Else raise a well visible warning ++ results = [] ++ pp_file = join(model, "postprocessing.json") ++ if isfile(pp_file): ++ print("postprocessing...") ++ shutil.copy(pp_file, os.path.dirname(output_filenames[0])) ++ # for_which_classes stores for which of the classes everything but the largest connected component needs to be ++ # removed ++ for_which_classes, min_valid_obj_size = load_postprocessing(pp_file) ++ results.append(pool.starmap_async(load_remove_save, ++ zip(output_filenames, output_filenames, ++ [for_which_classes] * len(output_filenames), ++ [min_valid_obj_size] * len(output_filenames)))) ++ _ = [i.get() for i in results] ++ else: ++ print("WARNING! Cannot run postprocessing because the postprocessing file is missing. 
Make sure to run " ++ "consolidate_folds in the output folder of the model first!\nThe folder you need to run this in is " ++ "%s" % model) ++ ++ pool.close() ++ pool.join() ++ ++ ++def predict_cases_fastest(model, list_of_lists, output_filenames, folds, num_threads_preprocessing, ++ num_threads_nifti_save, segs_from_prev_stage=None, do_tta=True, mixed_precision=True, ++ overwrite_existing=False, all_in_gpu=True, step_size=0.5, ++ checkpoint_name="model_final_checkpoint"): ++ assert len(list_of_lists) == len(output_filenames) ++ if segs_from_prev_stage is not None: assert len(segs_from_prev_stage) == len(output_filenames) ++ ++ pool = Pool(num_threads_nifti_save) ++ results = [] ++ ++ cleaned_output_files = [] ++ for o in output_filenames: ++ dr, f = os.path.split(o) ++ if len(dr) > 0: ++ maybe_mkdir_p(dr) ++ if not f.endswith(".nii.gz"): ++ f, _ = os.path.splitext(f) ++ f = f + ".nii.gz" ++ cleaned_output_files.append(join(dr, f)) ++ ++ if not overwrite_existing: ++ print("number of cases:", len(list_of_lists)) ++ not_done_idx = [i for i, j in enumerate(cleaned_output_files) if not isfile(j)] ++ ++ cleaned_output_files = [cleaned_output_files[i] for i in not_done_idx] ++ list_of_lists = [list_of_lists[i] for i in not_done_idx] ++ if segs_from_prev_stage is not None: ++ segs_from_prev_stage = [segs_from_prev_stage[i] for i in not_done_idx] ++ ++ print("number of cases that still need to be predicted:", len(cleaned_output_files)) ++ ++ print("emptying cuda cache") ++ torch.cuda.empty_cache() ++ ++ print("loading parameters for folds,", folds) ++ trainer, params = load_model_and_checkpoint_files(model, folds, mixed_precision=mixed_precision, checkpoint_name=checkpoint_name) ++ ++ print("starting preprocessing generator") ++ preprocessing = preprocess_multithreaded(trainer, list_of_lists, cleaned_output_files, num_threads_preprocessing, ++ segs_from_prev_stage) ++ ++ print("starting prediction...") ++ for preprocessed in preprocessing: ++ print("getting data from preprocessor") ++ output_filename, (d, dct) = preprocessed ++ print("got something") ++ if isinstance(d, str): ++ print("what I got is a string, so I need to load a file") ++ data = np.load(d) ++ os.remove(d) ++ d = data ++ ++ # preallocate the output arrays ++ # same dtype as the return value in predict_preprocessed_data_return_seg_and_softmax (saves time) ++ all_softmax_outputs = np.zeros((len(params), trainer.num_classes, *d.shape[1:]), dtype=np.float16) ++ all_seg_outputs = np.zeros((len(params), *d.shape[1:]), dtype=int) ++ print("predicting", output_filename) ++ ++ for i, p in enumerate(params): ++ trainer.load_checkpoint_ram(p, False) ++ res = trainer.predict_preprocessed_data_return_seg_and_softmax(d, do_tta, ++ trainer.data_aug_params['mirror_axes'], True, ++ step_size=step_size, use_gaussian=True, ++ all_in_gpu=all_in_gpu, ++ mixed_precision=mixed_precision) ++ if len(params) > 1: ++ # otherwise we dont need this and we can save ourselves the time it takes to copy that ++ all_softmax_outputs[i] = res[1] ++ all_seg_outputs[i] = res[0] ++ ++ print("aggregating predictions") ++ if len(params) > 1: ++ softmax_mean = np.mean(all_softmax_outputs, 0) ++ seg = softmax_mean.argmax(0) ++ else: ++ seg = all_seg_outputs[0] ++ ++ print("applying transpose_backward") ++ transpose_forward = trainer.plans.get('transpose_forward') ++ if transpose_forward is not None: ++ transpose_backward = trainer.plans.get('transpose_backward') ++ seg = seg.transpose([i for i in transpose_backward]) ++ ++ print("initializing segmentation export") ++ 
results.append(pool.starmap_async(save_segmentation_nifti, ++ ((seg, output_filename, dct, 0, None),) ++ )) ++ print("done") ++ ++ print("inference done. Now waiting for the segmentation export to finish...") ++ _ = [i.get() for i in results] ++ # now apply postprocessing ++ # first load the postprocessing properties if they are present. Else raise a well visible warning ++ results = [] ++ pp_file = join(model, "postprocessing.json") ++ if isfile(pp_file): ++ print("postprocessing...") ++ shutil.copy(pp_file, os.path.dirname(output_filenames[0])) ++ # for_which_classes stores for which of the classes everything but the largest connected component needs to be ++ # removed ++ for_which_classes, min_valid_obj_size = load_postprocessing(pp_file) ++ results.append(pool.starmap_async(load_remove_save, ++ zip(output_filenames, output_filenames, ++ [for_which_classes] * len(output_filenames), ++ [min_valid_obj_size] * len(output_filenames)))) ++ _ = [i.get() for i in results] ++ else: ++ print("WARNING! Cannot run postprocessing because the postprocessing file is missing. Make sure to run " ++ "consolidate_folds in the output folder of the model first!\nThe folder you need to run this in is " ++ "%s" % model) ++ ++ pool.close() ++ pool.join() ++ ++ ++def check_input_folder_and_return_caseIDs(input_folder, expected_num_modalities): ++ print("This model expects %d input modalities for each image" % expected_num_modalities) ++ files = subfiles(input_folder, suffix=".nii.gz", join=False, sort=True) ++ ++ maybe_case_ids = np.unique([i[:-12] for i in files]) ++ ++ remaining = deepcopy(files) ++ missing = [] ++ ++ assert len(files) > 0, "input folder did not contain any images (expected to find .nii.gz file endings)" ++ ++ # now check if all required files are present and that no unexpected files are remaining ++ for c in maybe_case_ids: ++ for n in range(expected_num_modalities): ++ expected_output_file = c + "_%04.0d.nii.gz" % n ++ if not isfile(join(input_folder, expected_output_file)): ++ missing.append(expected_output_file) ++ else: ++ remaining.remove(expected_output_file) ++ ++ print("Found %d unique case ids, here are some examples:" % len(maybe_case_ids), ++ np.random.choice(maybe_case_ids, min(len(maybe_case_ids), 10))) ++ print("If they don't look right, make sure to double check your filenames. They must end with _0000.nii.gz etc") ++ ++ if len(remaining) > 0: ++ print("found %d unexpected remaining files in the folder. 
Here are some examples:" % len(remaining), ++ np.random.choice(remaining, min(len(remaining), 10))) ++ ++ if len(missing) > 0: ++ print("Some files are missing:") ++ print(missing) ++ raise RuntimeError("missing files in input_folder") ++ ++ return maybe_case_ids ++ ++ ++def predict_from_folder(model: str, input_folder: str, output_folder: str, folds: Union[Tuple[int], List[int]], ++ save_npz: bool, num_threads_preprocessing: int, num_threads_nifti_save: int, ++ lowres_segmentations: Union[str, None], ++ part_id: int, num_parts: int, tta: bool, mixed_precision: bool = True, ++ overwrite_existing: bool = True, mode: str = 'normal', overwrite_all_in_gpu: bool = None, ++ step_size: float = 0.5, checkpoint_name: str = "model_final_checkpoint", ++ segmentation_export_kwargs: dict = None, pre_mode=None, fp=None): ++ """ ++ here we use the standard naming scheme to generate list_of_lists and output_files needed by predict_cases ++ ++ :param model: ++ :param input_folder: ++ :param output_folder: ++ :param folds: ++ :param save_npz: ++ :param num_threads_preprocessing: ++ :param num_threads_nifti_save: ++ :param lowres_segmentations: ++ :param part_id: ++ :param num_parts: ++ :param tta: ++ :param mixed_precision: ++ :param overwrite_existing: if not None then it will be overwritten with whatever is in there. None is default (no overwrite) ++ :return: ++ """ ++ maybe_mkdir_p(output_folder) ++ shutil.copy(join(model, 'plans.pkl'), output_folder) ++ ++ assert isfile(join(model, "plans.pkl")), "Folder with saved model weights must contain a plans.pkl file" ++ expected_num_modalities = load_pickle(join(model, "plans.pkl"))['num_modalities'] ++ ++ # check input folder integrity ++ case_ids = check_input_folder_and_return_caseIDs(input_folder, expected_num_modalities) ++ ++ output_files = [join(output_folder, i + ".nii.gz") for i in case_ids] ++ all_files = subfiles(input_folder, suffix=".nii.gz", join=False, sort=True) ++ list_of_lists = [[join(input_folder, i) for i in all_files if i[:len(j)].startswith(j) and ++ len(i) == (len(j) + 12)] for j in case_ids] ++ ++ if lowres_segmentations is not None: ++ assert isdir(lowres_segmentations), "if lowres_segmentations is not None then it must point to a directory" ++ lowres_segmentations = [join(lowres_segmentations, i + ".nii.gz") for i in case_ids] ++ assert all([isfile(i) for i in lowres_segmentations]), "not all lowres_segmentations files are present. 
" \ ++ "(I was searching for case_id.nii.gz in that folder)" ++ lowres_segmentations = lowres_segmentations[part_id::num_parts] ++ else: ++ lowres_segmentations = None ++ ++ if mode == "normal": # step this ++ if overwrite_all_in_gpu is None: # True ++ all_in_gpu = False ++ else: ++ all_in_gpu = overwrite_all_in_gpu ++ ++ return predict_cases(model, list_of_lists[part_id::num_parts], output_files[part_id::num_parts], folds, ++ save_npz, num_threads_preprocessing, num_threads_nifti_save, lowres_segmentations, tta, ++ mixed_precision=mixed_precision, overwrite_existing=overwrite_existing, all_in_gpu=all_in_gpu, ++ step_size=step_size, checkpoint_name=checkpoint_name, ++ segmentation_export_kwargs=segmentation_export_kwargs, pre_mode=pre_mode, fp=fp) ++ elif mode == "fast": ++ if overwrite_all_in_gpu is None: ++ all_in_gpu = True ++ else: ++ all_in_gpu = overwrite_all_in_gpu ++ ++ assert save_npz is False ++ return predict_cases_fast(model, list_of_lists[part_id::num_parts], output_files[part_id::num_parts], folds, ++ num_threads_preprocessing, num_threads_nifti_save, lowres_segmentations, ++ tta, mixed_precision=mixed_precision, overwrite_existing=overwrite_existing, all_in_gpu=all_in_gpu, ++ step_size=step_size, checkpoint_name=checkpoint_name, ++ segmentation_export_kwargs=segmentation_export_kwargs) ++ elif mode == "fastest": ++ if overwrite_all_in_gpu is None: ++ all_in_gpu = True ++ else: ++ all_in_gpu = overwrite_all_in_gpu ++ ++ assert save_npz is False ++ return predict_cases_fastest(model, list_of_lists[part_id::num_parts], output_files[part_id::num_parts], folds, ++ num_threads_preprocessing, num_threads_nifti_save, lowres_segmentations, ++ tta, mixed_precision=mixed_precision, overwrite_existing=overwrite_existing, all_in_gpu=all_in_gpu, ++ step_size=step_size, checkpoint_name=checkpoint_name) ++ else: ++ raise ValueError("unrecognized mode. Must be normal, fast or fastest") ++ ++ ++if __name__ == "__main__": ++ parser = argparse.ArgumentParser() ++ parser.add_argument("-i", '--input_folder', help="Must contain all modalities for each patient in the correct" ++ " order (same as training). Files must be named " ++ "CASENAME_XXXX.nii.gz where XXXX is the modality " ++ "identifier (0000, 0001, etc)", required=True) ++ parser.add_argument('-o', "--output_folder", required=True, help="folder for saving predictions") ++ parser.add_argument('-m', '--model_output_folder', ++ help='model output folder. Will automatically discover the folds ' ++ 'that were ' ++ 'run and use those as an ensemble', required=True) ++ parser.add_argument('-f', '--folds', nargs='+', default='None', help="folds to use for prediction. Default is None " ++ "which means that folds will be detected " ++ "automatically in the model output folder") ++ parser.add_argument('-z', '--save_npz', required=False, action='store_true', help="use this if you want to ensemble" ++ " these predictions with those of" ++ " other models. Softmax " ++ "probabilities will be saved as " ++ "compresed numpy arrays in " ++ "output_folder and can be merged " ++ "between output_folders with " ++ "merge_predictions.py") ++ parser.add_argument('-l', '--lowres_segmentations', required=False, default='None', help="if model is the highres " ++ "stage of the cascade then you need to use -l to specify where the segmentations of the " ++ "corresponding lowres unet are. 
Here they are required to do a prediction") ++ parser.add_argument("--part_id", type=int, required=False, default=0, help="Used to parallelize the prediction of " ++ "the folder over several GPUs. If you " ++ "want to use n GPUs to predict this " ++ "folder you need to run this command " ++ "n times with --part_id=0, ... n-1 and " ++ "--num_parts=n (each with a different " ++ "GPU (for example via " ++ "CUDA_VISIBLE_DEVICES=X)") ++ parser.add_argument("--num_parts", type=int, required=False, default=1, ++ help="Used to parallelize the prediction of " ++ "the folder over several GPUs. If you " ++ "want to use n GPUs to predict this " ++ "folder you need to run this command " ++ "n times with --part_id=0, ... n-1 and " ++ "--num_parts=n (each with a different " ++ "GPU (via " ++ "CUDA_VISIBLE_DEVICES=X)") ++ parser.add_argument("--num_threads_preprocessing", required=False, default=6, type=int, help= ++ "Determines many background processes will be used for data preprocessing. Reduce this if you " ++ "run into out of memory (RAM) problems. Default: 6") ++ parser.add_argument("--num_threads_nifti_save", required=False, default=2, type=int, help= ++ "Determines many background processes will be used for segmentation export. Reduce this if you " ++ "run into out of memory (RAM) problems. Default: 2") ++ parser.add_argument("--tta", required=False, type=int, default=1, help="Set to 0 to disable test time data " ++ "augmentation (speedup of factor " ++ "4(2D)/8(3D)), " ++ "lower quality segmentations") ++ parser.add_argument("--overwrite_existing", required=False, type=int, default=1, help="Set this to 0 if you need " ++ "to resume a previous " ++ "prediction. Default: 1 " ++ "(=existing segmentations " ++ "in output_folder will be " ++ "overwritten)") ++ parser.add_argument("--mode", type=str, default="normal", required=False) ++ parser.add_argument("--all_in_gpu", type=str, default="None", required=False, help="can be None, False or True") ++ parser.add_argument("--step_size", type=float, default=0.5, required=False, help="don't touch") ++ # parser.add_argument("--interp_order", required=False, default=3, type=int, ++ # help="order of interpolation for segmentations, has no effect if mode=fastest") ++ # parser.add_argument("--interp_order_z", required=False, default=0, type=int, ++ # help="order of interpolation along z is z is done differently") ++ # parser.add_argument("--force_separate_z", required=False, default="None", type=str, ++ # help="force_separate_z resampling. Can be None, True or False, has no effect if mode=fastest") ++ parser.add_argument('--disable_mixed_precision', default=False, action='store_true', required=False, ++ help='Predictions are done with mixed precision by default. This improves speed and reduces ' ++ 'the required vram. If you want to disable mixed precision you can set this flag. 
Note ' ++ 'that yhis is not recommended (mixed precision is ~2x faster!)') ++ ++ args = parser.parse_args() ++ input_folder = args.input_folder ++ output_folder = args.output_folder ++ part_id = args.part_id ++ num_parts = args.num_parts ++ model = args.model_output_folder ++ folds = args.folds ++ save_npz = args.save_npz ++ lowres_segmentations = args.lowres_segmentations ++ num_threads_preprocessing = args.num_threads_preprocessing ++ num_threads_nifti_save = args.num_threads_nifti_save ++ tta = args.tta ++ step_size = args.step_size ++ ++ # interp_order = args.interp_order ++ # interp_order_z = args.interp_order_z ++ # force_separate_z = args.force_separate_z ++ ++ # if force_separate_z == "None": ++ # force_separate_z = None ++ # elif force_separate_z == "False": ++ # force_separate_z = False ++ # elif force_separate_z == "True": ++ # force_separate_z = True ++ # else: ++ # raise ValueError("force_separate_z must be None, True or False. Given: %s" % force_separate_z) ++ ++ overwrite = args.overwrite_existing ++ mode = args.mode ++ all_in_gpu = args.all_in_gpu ++ ++ if lowres_segmentations == "None": ++ lowres_segmentations = None ++ ++ if isinstance(folds, list): ++ if folds[0] == 'all' and len(folds) == 1: ++ pass ++ else: ++ folds = [int(i) for i in folds] ++ elif folds == "None": ++ folds = None ++ else: ++ raise ValueError("Unexpected value for argument folds") ++ ++ if tta == 0: ++ tta = False ++ elif tta == 1: ++ tta = True ++ else: ++ raise ValueError("Unexpected value for tta, Use 1 or 0") ++ ++ if overwrite == 0: ++ overwrite = False ++ elif overwrite == 1: ++ overwrite = True ++ else: ++ raise ValueError("Unexpected value for overwrite, Use 1 or 0") ++ ++ assert all_in_gpu in ['None', 'False', 'True'] ++ if all_in_gpu == "None": ++ all_in_gpu = None ++ elif all_in_gpu == "True": ++ all_in_gpu = True ++ elif all_in_gpu == "False": ++ all_in_gpu = False ++ ++ predict_from_folder(model, input_folder, output_folder, folds, save_npz, num_threads_preprocessing, ++ num_threads_nifti_save, lowres_segmentations, part_id, num_parts, tta, mixed_precision=not args.disable_mixed_precision, ++ overwrite_existing=overwrite, mode=mode, overwrite_all_in_gpu=all_in_gpu, step_size=step_size) +diff --git a/pytorch/nnunet/inference/predict_simple2.py b/pytorch/nnunet/inference/predict_simple2.py +new file mode 100644 +index 0000000..2af423e +--- /dev/null ++++ b/pytorch/nnunet/inference/predict_simple2.py +@@ -0,0 +1,238 @@ ++# Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
++ ++ ++import argparse ++import torch ++ ++from nnunet.inference.predict2 import predict_from_folder ++from nnunet.paths import default_plans_identifier, network_training_output_dir, default_cascade_trainer, default_trainer ++from batchgenerators.utilities.file_and_folder_operations import join, isdir ++from nnunet.utilities.task_name_id_conversion import convert_id_to_task_name ++from nnunet.inference.infer_path import INFERENCE_INPUT_FOLDER, INFERENCE_OUTPUT_FOLDER ++ ++ ++def main(): ++ parser = argparse.ArgumentParser() ++ parser.add_argument("-i", '--input_folder', help="Must contain all modalities for each patient in the correct" ++ " order (same as training). Files must be named " ++ "CASENAME_XXXX.nii.gz where XXXX is the modality " ++ "identifier (0000, 0001, etc)", required=False, ++ default=INFERENCE_INPUT_FOLDER) ++ parser.add_argument('-o', "--output_folder", required=False, ++ default=INFERENCE_OUTPUT_FOLDER, help="folder for saving predictions") ++ parser.add_argument('-t', '--task_name', help='task name or task ID, required.', ++ default="Task003_Liver", required=False) ++ parser.add_argument('-pm', '--pre_mode', help='predict mode', required=False, default=-1) ++ parser.add_argument('-fp', '--file_path', help='input or output file path for npu bin files', required=True) ++ parser.add_argument('-tr', '--trainer_class_name', ++ help='Name of the nnUNetTrainer used for 2D U-Net, full resolution 3D U-Net and low resolution ' ++ 'U-Net. The default is %s. If you are running inference with the cascade and the folder ' ++ 'pointed to by --lowres_segmentations does not contain the segmentation maps generated by ' ++ 'the low resolution U-Net then the low resolution segmentation maps will be automatically ' ++ 'generated. For this case, make sure to set the trainer class here that matches your ' ++ '--cascade_trainer_class_name (this part can be ignored if defaults are used).' ++ % default_trainer, ++ required=False, ++ default="nnUNetPlusPlusTrainerV2") ++ parser.add_argument('-ctr', '--cascade_trainer_class_name', ++ help="Trainer class name used for predicting the 3D full resolution U-Net part of the cascade." ++ "Default is %s" % default_cascade_trainer, required=False, ++ default=default_cascade_trainer) ++ ++ parser.add_argument('-m', '--model', help="2d, 3d_lowres, 3d_fullres or 3d_cascade_fullres. Default: 3d_fullres", ++ default="3d_fullres", required=False) ++ ++ parser.add_argument('-p', '--plans_identifier', help='do not touch this unless you know what you are doing', ++ default=default_plans_identifier, required=False) ++ ++ parser.add_argument('-f', '--folds', nargs='+', default="None", ++ help="folds to use for prediction. Default is None which means that folds will be detected " ++ "automatically in the model output folder") ++ ++ parser.add_argument('-z', '--save_npz', required=False, action='store_true', ++ help="use this if you want to ensemble these predictions with those of other models. Softmax " ++ "probabilities will be saved as compressed numpy arrays in output_folder and can be " ++ "merged between output_folders with nnUNet_ensemble_predictions") ++ ++ parser.add_argument('-l', '--lowres_segmentations', required=False, default='None', ++ help="if model is the highres stage of the cascade then you can use this folder to provide " ++ "predictions from the low resolution 3D U-Net. 
If this is left at default, the " ++ "predictions will be generated automatically (provided that the 3D low resolution U-Net " ++ "network weights are present") ++ ++ parser.add_argument("--part_id", type=int, required=False, default=0, help="Used to parallelize the prediction of " ++ "the folder over several GPUs. If you " ++ "want to use n GPUs to predict this " ++ "folder you need to run this command " ++ "n times with --part_id=0, ... n-1 and " ++ "--num_parts=n (each with a different " ++ "GPU (for example via " ++ "CUDA_VISIBLE_DEVICES=X)") ++ ++ parser.add_argument("--num_parts", type=int, required=False, default=1, ++ help="Used to parallelize the prediction of " ++ "the folder over several GPUs. If you " ++ "want to use n GPUs to predict this " ++ "folder you need to run this command " ++ "n times with --part_id=0, ... n-1 and " ++ "--num_parts=n (each with a different " ++ "GPU (via " ++ "CUDA_VISIBLE_DEVICES=X)") ++ ++ parser.add_argument("--num_threads_preprocessing", required=False, default=6, type=int, help= ++ "Determines many background processes will be used for data preprocessing. Reduce this if you " ++ "run into out of memory (RAM) problems. Default: 6") ++ ++ parser.add_argument("--num_threads_nifti_save", required=False, default=2, type=int, help= ++ "Determines many background processes will be used for segmentation export. Reduce this if you " ++ "run into out of memory (RAM) problems. Default: 2") ++ ++ parser.add_argument("--disable_tta", required=False, default=False, action="store_true", ++ help="set this flag to disable test time data augmentation via mirroring. Speeds up inference " ++ "by roughly factor 4 (2D) or 8 (3D)") ++ ++ parser.add_argument("--overwrite_existing", required=False, default=False, action="store_true", ++ help="Set this flag if the target folder contains predictions that you would like to overwrite") ++ ++ parser.add_argument("--mode", type=str, default="normal", required=False, help="Hands off!") ++ parser.add_argument("--all_in_gpu", type=str, default="None", required=False, help="can be None, False or True. " ++ "Do not touch.") ++ parser.add_argument("--step_size", type=float, default=0.5, required=False, help="don't touch") ++ # parser.add_argument("--interp_order", required=False, default=3, type=int, ++ # help="order of interpolation for segmentations, has no effect if mode=fastest. Do not touch this.") ++ # parser.add_argument("--interp_order_z", required=False, default=0, type=int, ++ # help="order of interpolation along z is z is done differently. Do not touch this.") ++ # parser.add_argument("--force_separate_z", required=False, default="None", type=str, ++ # help="force_separate_z resampling. Can be None, True or False, has no effect if mode=fastest. " ++ # "Do not touch this.") ++ parser.add_argument('-chk', ++ help='checkpoint name, default: model_final_checkpoint', ++ required=False, ++ default='model_final_checkpoint') ++ parser.add_argument('--disable_mixed_precision', default=False, action='store_true', required=False, ++ help='Predictions are done with mixed precision by default. This improves speed and reduces ' ++ 'the required vram. If you want to disable mixed precision you can set this flag. 
Note ' ++ 'that yhis is not recommended (mixed precision is ~2x faster!)') ++ ++ args = parser.parse_args() ++ print(args) ++ ++ input_folder = args.input_folder ++ output_folder = args.output_folder ++ part_id = args.part_id ++ # 推理模式 ++ pre_mode = args.pre_mode ++ fp = args.file_path ++ num_parts = args.num_parts ++ folds = args.folds ++ save_npz = args.save_npz ++ lowres_segmentations = args.lowres_segmentations ++ num_threads_preprocessing = args.num_threads_preprocessing ++ num_threads_nifti_save = args.num_threads_nifti_save ++ disable_tta = args.disable_tta ++ step_size = args.step_size ++ # interp_order = args.interp_order ++ # interp_order_z = args.interp_order_z ++ # force_separate_z = args.force_separate_z ++ overwrite_existing = args.overwrite_existing ++ mode = args.mode ++ all_in_gpu = args.all_in_gpu ++ model = args.model ++ trainer_class_name = args.trainer_class_name ++ cascade_trainer_class_name = args.cascade_trainer_class_name ++ ++ task_name = args.task_name ++ ++ if not task_name.startswith("Task"): ++ task_id = int(task_name) ++ task_name = convert_id_to_task_name(task_id) ++ ++ assert model in ["2d", "3d_lowres", "3d_fullres", "3d_cascade_fullres"], "-m must be 2d, 3d_lowres, 3d_fullres or " \ ++ "3d_cascade_fullres" ++ ++ # if force_separate_z == "None": ++ # force_separate_z = None ++ # elif force_separate_z == "False": ++ # force_separate_z = False ++ # elif force_separate_z == "True": ++ # force_separate_z = True ++ # else: ++ # raise ValueError("force_separate_z must be None, True or False. Given: %s" % force_separate_z) ++ ++ if lowres_segmentations == "None": ++ lowres_segmentations = None ++ ++ if isinstance(folds, list): ++ if folds[0] == 'all' and len(folds) == 1: ++ pass ++ else: ++ folds = [int(i) for i in folds] ++ elif folds == "None": ++ folds = None ++ else: ++ raise ValueError("Unexpected value for argument folds") ++ ++ assert all_in_gpu in ['None', 'False', 'True'] ++ if all_in_gpu == "None": ++ all_in_gpu = None ++ elif all_in_gpu == "True": ++ all_in_gpu = True ++ elif all_in_gpu == "False": ++ all_in_gpu = False ++ ++ # we need to catch the case where model is 3d cascade fullres and the low resolution folder has not been set. ++ # In that case we need to try and predict with 3d low res first ++ if model == "3d_cascade_fullres" and lowres_segmentations is None: ++ print("lowres_segmentations is None. Attempting to predict 3d_lowres first...") ++ assert part_id == 0 and num_parts == 1, "if you don't specify a --lowres_segmentations folder for the " \ ++ "inference of the cascade, custom values for part_id and num_parts " \ ++ "are not supported. If you wish to have multiple parts, please " \ ++ "run the 3d_lowres inference first (separately)" ++ model_folder_name = join(network_training_output_dir, "3d_lowres", task_name, trainer_class_name + "__" + ++ args.plans_identifier) ++ assert isdir(model_folder_name), "model output folder not found. 
Expected: %s" % model_folder_name ++ lowres_output_folder = join(output_folder, "3d_lowres_predictions") ++ predict_from_folder(model_folder_name, input_folder, lowres_output_folder, folds, False, ++ num_threads_preprocessing, num_threads_nifti_save, None, part_id, num_parts, not disable_tta, ++ overwrite_existing=overwrite_existing, mode=mode, overwrite_all_in_gpu=all_in_gpu, ++ mixed_precision=not args.disable_mixed_precision, ++ step_size=step_size) ++ lowres_segmentations = lowres_output_folder ++ torch.cuda.empty_cache() ++ print("3d_lowres done") ++ ++ if model == "3d_cascade_fullres": ++ trainer = cascade_trainer_class_name ++ else: ++ trainer = trainer_class_name ++ print(network_training_output_dir) ++ print(model) ++ print(task_name) ++ print(trainer) ++ print(args.plans_identifier) ++ model_folder_name = join(network_training_output_dir, model, task_name, trainer + "__" + ++ args.plans_identifier) ++ print("using model stored in ", model_folder_name) ++ assert isdir(model_folder_name), "model output folder not found. Expected: %s" % model_folder_name ++ ++ predict_from_folder(model_folder_name, input_folder, output_folder, folds, save_npz, num_threads_preprocessing, ++ num_threads_nifti_save, lowres_segmentations, part_id, num_parts, not disable_tta, ++ overwrite_existing=overwrite_existing, mode=mode, overwrite_all_in_gpu=all_in_gpu, ++ mixed_precision=not args.disable_mixed_precision, ++ step_size=step_size, checkpoint_name=args.chk, pre_mode=pre_mode, fp=fp) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/pytorch/nnunet/inference/read_bin.py b/pytorch/nnunet/inference/read_bin.py +new file mode 100644 +index 0000000..972d940 +--- /dev/null ++++ b/pytorch/nnunet/inference/read_bin.py +@@ -0,0 +1,30 @@ ++import numpy ++import pdb ++import os ++ ++ ++def read_from_bin(file_name, folder_path='/root/heyupeng/result/dumpOutput_device0/'): ++ file = os.path.join(folder_path, file_name) ++ data = numpy.fromfile(file, dtype='float32') ++ data = data.reshape(3, 128, 128, 128) ++ return data ++ ++ ++def main(): ++ file = 'liver_132_0_128_0_128_0_128_1.bin' ++ print('ready to load:', file) ++ data = numpy.fromfile(file, dtype='float32') ++ data = data.reshape(3, 128, 128, 128) ++ pdb.set_trace() ++ print(data.shape) ++ for i in range(5): ++ print(data[0, 0, 0, i*7:(i+1)*7]) ++ print('-----') ++ for i in range(5): ++ print(data[0, 0, 0, i*7+50:(i+1)*7+50]) ++ pdb.set_trace() ++ print('end\n') ++ ++ ++if __name__ == "__main__": ++ main() +\ No newline at end of file +diff --git a/pytorch/nnunet/inference/read_pkl_file.py b/pytorch/nnunet/inference/read_pkl_file.py +new file mode 100644 +index 0000000..5dcc37b +--- /dev/null ++++ b/pytorch/nnunet/inference/read_pkl_file.py +@@ -0,0 +1,22 @@ ++import numpy ++import pdb ++import os ++import pickle ++ ++ ++def read_pkl(file_name, folder_path='/data/yupeng/environment_variables/nnUNet_preprocessed/Task003_Liver/'): ++ file = os.path.join(folder_path, file_name) ++ data = open(file, 'rb') ++ data = pickle.load(data) ++ return data ++ ++ ++def main(): ++ file = 'dataset_properties.pkl' ++ print('ready to load:', file) ++ data = read_pkl(file) ++ print('end\n') ++ ++ ++if __name__ == "__main__": ++ main() +\ No newline at end of file +diff --git a/pytorch/nnunet/inference/read_txt.py b/pytorch/nnunet/inference/read_txt.py +new file mode 100644 +index 0000000..37c94aa +--- /dev/null ++++ b/pytorch/nnunet/inference/read_txt.py +@@ -0,0 +1,29 @@ ++import numpy ++import pdb ++import os ++ ++ ++def read_from_bin(file_name, 
folder_path='/root/heyupeng/result/dumpOutput_device0/'):
++    file = os.path.join(folder_path, file_name)
++    data = numpy.loadtxt(file)
++    data = data.reshape(3, 128, 128, 128)
++    return data
++
++
++def main():
++    file = 'liver_132_0_128_0_128_0_128_1.txt'
++    print('ready to load:', file)
++    data = numpy.loadtxt(file)
++    data = data.reshape(3, 128, 128, 128)
++    pdb.set_trace()
++    print(data.shape)
++    for i in range(5):
++        print(data[0, 0, 0, i*7:(i+1)*7])
++    print('-----')
++    for i in range(5):
++        print(data[0, 0, 0, i*7+50:(i+1)*7+50])
++    pdb.set_trace()
++    print('end\n')
++
++if __name__ == "__main__":
++    main()
+diff --git a/pytorch/nnunet/network_architecture/generic_UNetPlusPlus.py b/pytorch/nnunet/network_architecture/generic_UNetPlusPlus.py
+index 5c2f816..5b831ea 100644
+--- a/pytorch/nnunet/network_architecture/generic_UNetPlusPlus.py
++++ b/pytorch/nnunet/network_architecture/generic_UNetPlusPlus.py
+@@ -21,7 +21,8 @@ import numpy as np
+ from nnunet.network_architecture.initialization import InitWeights_He
+ from nnunet.network_architecture.neural_network import SegmentationNetwork
+ import torch.nn.functional
+-
++import pdb
++# pdb.set_trace()
+ 
+ class ConvDropoutNormNonlin(nn.Module):
+     """
+@@ -393,7 +394,7 @@ class Generic_UNetPlusPlus(SegmentationNetwork):
+ 
+     def forward(self, x):
+         # skips = []
+-        seg_outputs = []
++        seg_outputs = []  # x is 5-dimensional
+         x0_0 = self.conv_blocks_context[0](x)
+         x1_0 = self.conv_blocks_context[1](x0_0)
+         x0_1 = self.loc4[0](torch.cat([x0_0, self.up4[0](x1_0)], 1))
+@@ -425,7 +426,7 @@ class Generic_UNetPlusPlus(SegmentationNetwork):
+         x0_5 = self.loc0[4](torch.cat([x0_0, x0_1, x0_2, x0_3, x0_4, self.up0[4](x1_4)], 1))
+         seg_outputs.append(self.final_nonlin(self.seg_outputs[-5](x0_5)))
+ 
+-        if self._deep_supervision and self.do_ds:
++        if self._deep_supervision and self.do_ds:  # False
+             return tuple([seg_outputs[-1]] + [i(j) for i, j in
+                                               zip(list(self.upscale_logits_ops)[::-1], seg_outputs[:-1][::-1])])
+         else:
+diff --git a/pytorch/nnunet/network_architecture/neural_network.py b/pytorch/nnunet/network_architecture/neural_network.py
+index baa8a05..9425fe9 100644
+--- a/pytorch/nnunet/network_architecture/neural_network.py
++++ b/pytorch/nnunet/network_architecture/neural_network.py
+@@ -21,8 +21,14 @@ from torch import nn
+ import torch
+ from scipy.ndimage.filters import gaussian_filter
+ from typing import Union, Tuple, List
++import os
+ 
+ from torch.cuda.amp import autocast
++import pdb
++from glob import glob
++import time
++from nnunet.inference.read_bin import read_from_bin
++from nnunet.inference.infer_path import INFERENCE_SHAPE_PATH, INFERENCE_BIN_INPUT_FOLDER, INFERENCE_BIN_OUTPUT_FOLDER
+ 
+ 
+ class NeuralNetwork(nn.Module):
+@@ -75,7 +81,8 @@ class SegmentationNetwork(NeuralNetwork):
+                    step_size: float = 0.5, patch_size: Tuple[int, ...] = None, regions_class_order: Tuple[int, ...] = None,
+                    use_gaussian: bool = False, pad_border_mode: str = "constant",
+                    pad_kwargs: dict = None, all_in_gpu: bool = False,
+-                   verbose: bool = True, mixed_precision: bool = True) -> Tuple[np.ndarray, np.ndarray]:
++                   verbose: bool = True, mixed_precision: bool = True, img_name=None,
++                   pre_mode=None, fp=None) -> Tuple[np.ndarray, np.ndarray]:
+         """
+         Use this function to predict a 3D image. It does not matter whether the network is a 2D or 3D U-Net, it will
+         detect that automatically and run the appropriate code.
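The patch exchanges tensors with the offline Ascend 310 inference step as headerless raw float32 files, so the reader must know the logical shape in advance. A minimal round-trip sketch of that convention, with a hypothetical file name (the real files encode image name, tile bounds and mirror index):

    import numpy as np

    # Write a (3, 128, 128, 128) float32 patch as a raw, headerless bin file,
    # then read it back the way read_bin.read_from_bin does.
    patch = np.random.rand(3, 128, 128, 128).astype(np.float32)
    patch.tofile('liver_example_0_128_0_128_0_128_1.bin')  # no header, raw bytes

    restored = np.fromfile('liver_example_0_128_0_128_0_128_1.bin', dtype='float32')
    restored = restored.reshape(3, 128, 128, 128)  # shape must be known a priori
    assert np.array_equal(patch, restored)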
+@@ -133,7 +140,7 @@ class SegmentationNetwork(NeuralNetwork):
+ 
+         assert len(x.shape) == 4, "data must have shape (c,x,y,z)"
+ 
+-        if mixed_precision:
++        if mixed_precision:  # True
+             context = autocast
+         else:
+             context = no_op
+@@ -141,11 +148,11 @@ class SegmentationNetwork(NeuralNetwork):
+         with context():
+             with torch.no_grad():
+                 if self.conv_op == nn.Conv3d:
+-                    if use_sliding_window:
++                    if use_sliding_window:  # this branch is taken
+                         res = self._internal_predict_3D_3Dconv_tiled(x, step_size, do_mirroring, mirror_axes, patch_size,
+                                                                      regions_class_order, use_gaussian, pad_border_mode,
+                                                                      pad_kwargs=pad_kwargs, all_in_gpu=all_in_gpu,
+-                                                                     verbose=verbose)
++                                                                     verbose=verbose, img_name=img_name, pre_mode=pre_mode, fp=fp)
+                     else:
+                         res = self._internal_predict_3D_3Dconv(x, patch_size, do_mirroring, mirror_axes, regions_class_order,
+                                                                pad_border_mode, pad_kwargs=pad_kwargs, verbose=verbose)
+@@ -284,19 +291,161 @@ class SegmentationNetwork(NeuralNetwork):
+ 
+         return steps
+ 
++    # def _internal_predict_3D_3Dconv_tiled(self, x: np.ndarray, step_size: float, do_mirroring: bool, mirror_axes: tuple,
++    #                                       patch_size: tuple, regions_class_order: tuple, use_gaussian: bool,
++    #                                       pad_border_mode: str, pad_kwargs: dict, all_in_gpu: bool,
++    #                                       verbose: bool, img_name=None) -> Tuple[np.ndarray, np.ndarray]:
++    #     # better safe than sorry
++    #     assert len(x.shape) == 4, "x must be (c, x, y, z)"
++    #     assert self.get_device() != "cpu"
++    #     if verbose: print("step_size:", step_size)  # 0.5
++    #     if verbose: print("do mirror:", do_mirroring)  # True
++    #
++    #     torch.cuda.empty_cache()
++    #
++    #     assert patch_size is not None, "patch_size cannot be None for tiled prediction"  # 128, 128, 128
++    #
++    #     # for sliding window inference the image must at least be as large as the patch size. It does not matter
++    #     # whether the shape is divisible by 2**num_pool as long as the patch size is
++    #     data, slicer = pad_nd_image(x, patch_size, pad_border_mode, pad_kwargs, True, None)
++    #     data_shape = data.shape  # still c, x, y, z
++    #
++    #     # compute the steps for sliding window
++    #     steps = self._compute_steps_for_sliding_window(patch_size, data_shape[1:], step_size)  # compute the window positions
++    #     num_tiles = len(steps[0]) * len(steps[1]) * len(steps[2])
++    #
++    #     if verbose:
++    #         print("data shape:", data_shape)
++    #         print("patch size:", patch_size)
++    #         print("steps (x, y, and z):", steps)
++    #         print("number of tiles:", num_tiles)
++    #
++    #     # we only need to compute that once. It can take a while to compute this due to the large sigma in
++    #     # gaussian_filter
++    #     if use_gaussian and num_tiles > 1:
++    #         if self._gaussian_3d is None or not all(  # this branch is taken
++    #                 [i == j for i, j in zip(patch_size, self._patch_size_for_gaussian_3d)]):
++    #             if verbose: print('computing Gaussian')
++    #             gaussian_importance_map = self._get_gaussian(patch_size, sigma_scale=1. / 8)
++    #
++    #             self._gaussian_3d = gaussian_importance_map
++    #             self._patch_size_for_gaussian_3d = patch_size
++    #         else:
++    #             if verbose: print("using precomputed Gaussian")
++    #             gaussian_importance_map = self._gaussian_3d
++    #
++    #         gaussian_importance_map = torch.from_numpy(gaussian_importance_map).cuda(self.get_device(),
++    #                                                                                  non_blocking=True)
++    #     else:
++    #         gaussian_importance_map = None
++    #     if all_in_gpu:  # False
++    #         # If we run the inference in GPU only (meaning all tensors are allocated on the GPU, this reduces
++    #         # CPU-GPU communication but required more GPU memory) we need to preallocate a few things on GPU
++    #         if use_gaussian and num_tiles > 1:
++    #             # half precision for the outputs should be good enough. If the outputs here are half, the
++    #             # gaussian_importance_map should be as well
++    #             gaussian_importance_map = gaussian_importance_map.half()
++    #
++    #             # make sure we did not round anything to 0
++    #             gaussian_importance_map[gaussian_importance_map == 0] = gaussian_importance_map[
++    #                 gaussian_importance_map != 0].min()
++    #
++    #             add_for_nb_of_preds = gaussian_importance_map
++    #         else:
++    #             add_for_nb_of_preds = torch.ones(data.shape[1:], device=self.get_device())
++    #
++    #         if verbose: print("initializing result array (on GPU)")
++    #         aggregated_results = torch.zeros([self.num_classes] + list(data.shape[1:]), dtype=torch.half,
++    #                                          device=self.get_device())
++    #
++    #         if verbose: print("moving data to GPU")
++    #         data = torch.from_numpy(data).cuda(self.get_device(), non_blocking=True)
++    #
++    #         if verbose: print("initializing result_numsamples (on GPU)")
++    #         aggregated_nb_of_predictions = torch.zeros([self.num_classes] + list(data.shape[1:]), dtype=torch.half,
++    #                                                    device=self.get_device())
++    #     else:
++    #         if use_gaussian and num_tiles > 1:  # this branch is taken
++    #             add_for_nb_of_preds = self._gaussian_3d  # 128 128 128
++    #         else:
++    #             add_for_nb_of_preds = np.ones(data.shape[1:], dtype=np.float32)
++    #         aggregated_results = np.zeros([self.num_classes] + list(data.shape[1:]), dtype=np.float32)
++    #         aggregated_nb_of_predictions = np.zeros([self.num_classes] + list(data.shape[1:]), dtype=np.float32)
++    #     for x in steps[0]:
++    #         lb_x = x
++    #         ub_x = x + patch_size[0]
++    #         for y in steps[1]:
++    #             lb_y = y
++    #             ub_y = y + patch_size[1]
++    #             for z in steps[2]:
++    #                 lb_z = z
++    #                 ub_z = z + patch_size[2]
++    #                 predicted_patch = self._internal_maybe_mirror_and_pred_3D(  # data is an ndarray
++    #                     data[None, :, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z], mirror_axes, do_mirroring,
++    #                     gaussian_importance_map)[0]  # -> tensor 3 128 128 128, dtype=float32
++    #                 if all_in_gpu:  # False
++    #                     predicted_patch = predicted_patch.half()
++    #                 else:
++    #                     predicted_patch = predicted_patch.cpu().numpy()
++    #
++    #                 aggregated_results[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z] += predicted_patch  # 3 437 309 570
++    #                 aggregated_nb_of_predictions[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z] += add_for_nb_of_preds  # 3 437 309 570
++    #
++    #     # we reverse the padding here (remember that we padded the input to be at least as large as the patch size
++    #     slicer = tuple(
++    #         [slice(0, aggregated_results.shape[i]) for i in
++    #          range(len(aggregated_results.shape) - (len(slicer) - 1))] + slicer[1:])
++    #     aggregated_results = aggregated_results[slicer]  # shape unchanged
++    #     aggregated_nb_of_predictions = aggregated_nb_of_predictions[slicer]
++    #
++    #     # computing the class_probabilities by dividing the aggregated result with result_numsamples
++    #     class_probabilities = aggregated_results / aggregated_nb_of_predictions  # same shape
++    #
++    #     if regions_class_order is None:  # None
++    #         predicted_segmentation = class_probabilities.argmax(0)
++    #     else:
++    #         if all_in_gpu:
++    #             class_probabilities_here = class_probabilities.detach().cpu().numpy()
++    #         else:
++    #             class_probabilities_here = class_probabilities
++    #         predicted_segmentation = np.zeros(class_probabilities_here.shape[1:], dtype=np.float32)
++    #         for i, c in enumerate(regions_class_order):
++    #             predicted_segmentation[class_probabilities_here[i] > 0.5] = c
++    #
++    #     if all_in_gpu:  # False
++    #         if verbose: print("copying results to CPU")
++    #
++    #         if regions_class_order is None:
++    #             predicted_segmentation = predicted_segmentation.detach().cpu().numpy()
++    #
++    #         class_probabilities = class_probabilities.detach().cpu().numpy()
++    #
++    #     if verbose: print("prediction done")  # True
++    #     return predicted_segmentation, class_probabilities
++
++    def print_mytensor(data):
++        shape = data.shape[0]
++        for s in range(shape):
++            for i in range(3):
++                print(data[s, 0, 0, i * 3:(i + 1) * 3])
++            for i in range(3):
++                print(data[s, 0, 0, i * 3 + 50:(i + 1) * 3 + 50])
++            print('-----')
++
++    # This function was modified specifically to save the cropped sub-images as bin files.
++    # After use, comment this function out again and restore the original.
+     def _internal_predict_3D_3Dconv_tiled(self, x: np.ndarray, step_size: float, do_mirroring: bool, mirror_axes: tuple,
+                                           patch_size: tuple, regions_class_order: tuple, use_gaussian: bool,
+                                           pad_border_mode: str, pad_kwargs: dict, all_in_gpu: bool,
+-                                          verbose: bool) -> Tuple[np.ndarray, np.ndarray]:
++                                          verbose: bool, img_name=None, pre_mode=None, fp=None) -> Tuple[np.ndarray, np.ndarray]:
+         # better safe than sorry
+         assert len(x.shape) == 4, "x must be (c, x, y, z)"
+         assert self.get_device() != "cpu"
+-        if verbose: print("step_size:", step_size)
+-        if verbose: print("do mirror:", do_mirroring)
++        if verbose: print("step_size:", step_size)  # 0.5
++        if verbose: print("do mirror:", do_mirroring)  # True
+ 
+         torch.cuda.empty_cache()
+ 
+-        assert patch_size is not None, "patch_size cannot be None for tiled prediction"
++        assert patch_size is not None, "patch_size cannot be None for tiled prediction"  # 128, 128, 128
+ 
+         # for sliding window inference the image must at least be as large as the patch size. It does not matter
+         # whether the shape is divisible by 2**num_pool as long as the patch size is
+@@ -304,7 +453,7 @@ class SegmentationNetwork(NeuralNetwork):
+         data_shape = data.shape  # still c, x, y, z
+ 
+         # compute the steps for sliding window
+-        steps = self._compute_steps_for_sliding_window(patch_size, data_shape[1:], step_size)
++        steps = self._compute_steps_for_sliding_window(patch_size, data_shape[1:], step_size)  # compute the window positions
+         num_tiles = len(steps[0]) * len(steps[1]) * len(steps[2])
+ 
+         if verbose:
+@@ -316,7 +465,7 @@ class SegmentationNetwork(NeuralNetwork):
+         # we only need to compute that once. It can take a while to compute this due to the large sigma in
+         # gaussian_filter
+         if use_gaussian and num_tiles > 1:
+-            if self._gaussian_3d is None or not all(
++            if self._gaussian_3d is None or not all(  # this branch is taken
+                     [i == j for i, j in zip(patch_size, self._patch_size_for_gaussian_3d)]):
+                 if verbose: print('computing Gaussian')
+                 gaussian_importance_map = self._get_gaussian(patch_size, sigma_scale=1. / 8)
+@@ -327,16 +476,16 @@ class SegmentationNetwork(NeuralNetwork):
+                 if verbose: print("using precomputed Gaussian")
+                 gaussian_importance_map = self._gaussian_3d
+ 
+-            gaussian_importance_map = torch.from_numpy(gaussian_importance_map).cuda(self.get_device(),
+-                                                                                     non_blocking=True)
+-
++            # gaussian_importance_map = torch.from_numpy(gaussian_importance_map).cuda(self.get_device(),
++            #                                                                          non_blocking=True)
++            gaussian_importance_map = torch.from_numpy(gaussian_importance_map)
+         else:
+             gaussian_importance_map = None
+-
+-        if all_in_gpu:
++        aggregated_results = torch.zeros(1)
++        aggregated_nb_of_predictions = torch.zeros(1)
++        if all_in_gpu:  # False
+             # If we run the inference in GPU only (meaning all tensors are allocated on the GPU, this reduces
+             # CPU-GPU communication but required more GPU memory) we need to preallocate a few things on GPU
+-
+             if use_gaussian and num_tiles > 1:
+                 # half precision for the outputs should be good enough. If the outputs here are half, the
+                 # gaussian_importance_map should be as well
+@@ -361,13 +510,80 @@ class SegmentationNetwork(NeuralNetwork):
+             aggregated_nb_of_predictions = torch.zeros([self.num_classes] + list(data.shape[1:]), dtype=torch.half,
+                                                        device=self.get_device())
+         else:
+-            if use_gaussian and num_tiles > 1:
+-                add_for_nb_of_preds = self._gaussian_3d
++            if use_gaussian and num_tiles > 1:  # this branch is taken
++                add_for_nb_of_preds = self._gaussian_3d  # 128 128 128
+             else:
+                 add_for_nb_of_preds = np.ones(data.shape[1:], dtype=np.float32)
+             aggregated_results = np.zeros([self.num_classes] + list(data.shape[1:]), dtype=np.float32)
+             aggregated_nb_of_predictions = np.zeros([self.num_classes] + list(data.shape[1:]), dtype=np.float32)
+-
++        # path setup
++        shape_path = INFERENCE_SHAPE_PATH  # directory that holds all_shape.txt
++        if fp is None or fp == 'None':
++            bin_save_folder = INFERENCE_BIN_INPUT_FOLDER  # directory for the sub-image bin files
++            bin_real_folder = INFERENCE_BIN_OUTPUT_FOLDER  # directory with all results of the Ascend 310 inference
++        else:
++            bin_save_folder = fp  # directory for the sub-image bin files
++            bin_real_folder = fp  # directory with all results of the Ascend 310 inference
++        def save_as_shape(filename, shape, steps, folder=None):
++            shape_txt = 'all_shape.txt'
++            file = os.path.join(folder, shape_txt)
++            with open(file, "w") as f:
++                folders = filename.split('/')
++                name = folders[-1].split('.')[0]
++                s = name + ' ' + str(int(shape[1])) + ' ' + str(int(shape[2])) + ' ' + str(int(shape[3]))
++                s1 = ','.join(str(s) for s in steps[0])
++                s2 = ','.join(str(s) for s in steps[1])
++                s3 = ','.join(str(s) for s in steps[2])
++                s = s + '-' + s1 + '-' + s2 + '-' + s3
++                s = s + '\n'
++                f.write(s)  # saved format: image name, then its x, y and z extent
++        def save_as_bin(data, steps, filename, folder=None):  # save the sub-image as bin files
++            """
++            data: sub-image, ndarray 1 128 128 128
++            steps: region the sub-image occupies within the original image
++            filename: file name of the original image, e.g. liver_132.nii.gz
++            folder: destination folder for the final bin files
++            """
++            x = maybe_to_torch(data)  # 3 128 128 128
++            folders = filename.split('/')
++            name = folders[-1]
++            name = name.split('.')[0]  # extract the base file name
++            for s in steps:
++                name = name + '_' + str(s)  # append the position info to the file name
++            for cur_i in range(8):
++                if cur_i == 0:
++                    y = x  # 1 3 128 128 128
++                if cur_i == 1:
++                    y = torch.flip(x, (3,))
++                if cur_i == 2:
++                    y = torch.flip(x, (2,))
++                if cur_i == 3:
++                    y = torch.flip(x, (3, 2))
++                if cur_i == 4:
++                    y = torch.flip(x, (1,))
++                if cur_i == 5:
++                    y = torch.flip(x, (3, 1))
++                if cur_i == 6:
++                    y = torch.flip(x, (2, 1))
++                if cur_i == 7:
++                    y = torch.flip(x, (3, 2, 1))
++                img = np.array(y).astype(np.float32)
++                file_path = os.path.join(folder, name + '_' + str(cur_i) + ".bin")
++                img.tofile(file_path)
++        # mode selection
++        if int(pre_mode) == 1:  # split the image into patches
++            save_as_shape_flag = True
++            save_as_bin_flag = True
++            consolidated_bins = False
++        elif int(pre_mode) == 2:  # merge the inference results
++            save_as_shape_flag = False
++            save_as_bin_flag = False
++            consolidated_bins = True
++        else:
++            raise Exception('An inference mode (pre_mode) must be provided to avoid mistakes!')
++        # record the global shape information; it is needed later to merge the results
++        if save_as_shape_flag:
++            save_as_shape(img_name, data_shape, steps, shape_path)
+         for x in steps[0]:
+             lb_x = x
+             ub_x = x + patch_size[0]
+@@ -377,30 +593,108 @@ class SegmentationNetwork(NeuralNetwork):
+                 for z in steps[2]:
+                     lb_z = z
+                     ub_z = z + patch_size[2]
+-
++                    if save_as_bin_flag:
++                        # data is 4-D here but becomes 5-D before entering the model; the bins are saved as 4-D
++                        cur_data = data[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z]  # the current patch, 128 128 128
++                        save_as_bin(cur_data, [lb_x, ub_x, lb_y, ub_y, lb_z, ub_z], img_name, bin_save_folder)
++                    continue
+                     predicted_patch = self._internal_maybe_mirror_and_pred_3D(
+-                        data[None, :, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z], mirror_axes, do_mirroring,
+-                        gaussian_importance_map)[0]
+-
+-                    if all_in_gpu:
++                        cur_data, mirror_axes, do_mirroring,
++                        gaussian_importance_map)[0]  # -> 3 128 128 128
++                    if all_in_gpu:  # False
+                         predicted_patch = predicted_patch.half()
+                     else:
+                         predicted_patch = predicted_patch.cpu().numpy()
+-
+-                    aggregated_results[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z] += predicted_patch
+-                    aggregated_nb_of_predictions[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z] += add_for_nb_of_preds
+-
++                    aggregated_results[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z] += predicted_patch  # 3 437 309 570
++                    aggregated_nb_of_predictions[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z] += add_for_nb_of_preds  # 3 437 309 570
++        # at this point steps, aggregated_results and aggregated_nb_of_predictions are no longer visible
++        # data_shape and num_classes could also be recovered from the file name
++        if consolidated_bins:
++            shape_path = shape_path + 'all_shape.txt'
++            with open(shape_path) as f:
++                all_shape = f.readline().replace('\n', '').replace('\r', '')
++                all_shape = all_shape.split('-')
++                all_1 = all_shape[0].split(' ')
++                bin_img_name = all_1[0]
++                bin_data_shape = (1, int(all_1[1]), int(all_1[2]), int(all_1[3]))
++                bin_steps = []
++                for bin_i in range(3):
++                    bin_list = []
++                    all_2 = all_shape[bin_i + 1].split(',')
++                    for sth in all_2:
++                        bin_list.append(int(sth))
++                    bin_steps.append(bin_list)
++            for x in bin_steps[0]:
++                lb_x = x
++                ub_x = x + patch_size[0]
++                for y in bin_steps[1]:
++                    lb_y = y
++                    ub_y = y + patch_size[1]
++                    for z in bin_steps[2]:
++                        lb_z = z
++                        ub_z = z + patch_size[2]
++                        bin_step = [lb_x, ub_x, lb_y, ub_y, lb_z, ub_z]
++                        bin_file_name = bin_img_name
++                        for bin_s in bin_step:
++                            bin_file_name = bin_file_name + '_' + str(bin_s)
++                        result_torch = torch.zeros([1, 3, 128, 128, 128], dtype=torch.float)  # 1 3 128 128 128
++                        num_results = 8
++                        mult = gaussian_importance_map.cpu()
++                        for zz in range(8):
++                            bin_file_name_zz = bin_file_name + '_' + str(zz)
++                            bin_file_name_zzz = bin_file_name_zz + '_1.bin'
++                            bin_predicted_patch = read_from_bin(bin_file_name_zzz, bin_real_folder)  # ndarray, 3 128 128 128, float32
++                            bin_predicted_patch = bin_predicted_patch[None, :, :, :, :]  # expand to 5-D
++                            bin_predicted_patch = maybe_to_torch(bin_predicted_patch)  # tensor, 3 128 128 128, float32
++                            if zz == 0:
++                                pred = self.inference_apply_nonlin(bin_predicted_patch)  # 1 3 128 128 128
++                                result_torch += 1 / num_results * pred  # 1 3 128 128 128
++                            if zz == 1 and (2 in mirror_axes):
++                                pred = self.inference_apply_nonlin(bin_predicted_patch)
++                                result_torch += 1 / num_results * torch.flip(pred, (4,))
++                            if zz == 2 and (1 in mirror_axes):
++                                pred = self.inference_apply_nonlin(bin_predicted_patch)
++                                result_torch += 1 / num_results * torch.flip(pred, (3,))
++                            if zz == 3 and (2 in mirror_axes) and (1 in mirror_axes):
++                                pred = self.inference_apply_nonlin(bin_predicted_patch)
++                                result_torch += 1 / num_results * torch.flip(pred, (4, 3))
++                            if zz == 4 and (0 in mirror_axes):
++                                pred = self.inference_apply_nonlin(bin_predicted_patch)
++                                result_torch += 1 / num_results * torch.flip(pred, (2,))
++                            if zz == 5 and (0 in mirror_axes) and (2 in mirror_axes):
++                                pred = self.inference_apply_nonlin(bin_predicted_patch)
++                                result_torch += 1 / num_results * torch.flip(pred, (4, 2))
++                            if zz == 6 and (0 in mirror_axes) and (1 in mirror_axes):
++                                pred = self.inference_apply_nonlin(bin_predicted_patch)
++                                result_torch += 1 / num_results * torch.flip(pred, (3, 2))
++                            if zz == 7 and (0 in mirror_axes) and (1 in mirror_axes) and (2 in mirror_axes):
++                                pred = self.inference_apply_nonlin(bin_predicted_patch)
++                                result_torch += 1 / num_results * torch.flip(pred, (4, 3, 2))
++                        result_torch[:, :] *= mult  # torch 1 3 128 128 128 float32
++                        result_torch = result_torch[0]  # down to 3 128 128 128
++                        bin_predicted_patch = result_torch
++                        # the eight mirrored runs of this sub-image are now consolidated. Originally only the first
++                        # tile was processed before breaking; to stop early again, re-enable the three breaks at the
++                        # end of the triple loop.
++                        bin_predicted_patch = bin_predicted_patch.cpu().numpy()
++                        aggregated_results[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z] += bin_predicted_patch  # 3 437 309 570
++                        aggregated_nb_of_predictions[:, lb_x:ub_x, lb_y:ub_y, lb_z:ub_z] += add_for_nb_of_preds  # 3 437 309 570
++                        # break
++                    # break
++                # break
++        else:
++            import sys
++            sys.exit(0)
+         # we reverse the padding here (remember that we padded the input to be at least as large as the patch size
+         slicer = tuple(
+             [slice(0, aggregated_results.shape[i]) for i in
+              range(len(aggregated_results.shape) - (len(slicer) - 1))] + slicer[1:])
+-        aggregated_results = aggregated_results[slicer]
++        aggregated_results = aggregated_results[slicer]  # shape unchanged
+         aggregated_nb_of_predictions = aggregated_nb_of_predictions[slicer]
+ 
+         # computing the class_probabilities by dividing the aggregated result with result_numsamples
+-        class_probabilities = aggregated_results / aggregated_nb_of_predictions
++        class_probabilities = aggregated_results / aggregated_nb_of_predictions  # same shape
+ 
+-        if regions_class_order is None:
++        if regions_class_order is None:  # None
+             predicted_segmentation = class_probabilities.argmax(0)
+         else:
+             if all_in_gpu:
+@@ -411,7 +705,7 @@ class SegmentationNetwork(NeuralNetwork):
+             for i, c in enumerate(regions_class_order):
+                 predicted_segmentation[class_probabilities_here[i] > 0.5] = c
+ 
+-        if all_in_gpu:
++        if all_in_gpu:  # False
+             if verbose: print("copying results to CPU")
+ 
+             if regions_class_order is None:
+@@ -419,7 +713,7 @@ class SegmentationNetwork(NeuralNetwork):
+ 
+             class_probabilities = class_probabilities.detach().cpu().numpy()
+ 
+-        if verbose: print("prediction done")
++        if verbose: print("prediction done")  # True
+         return predicted_segmentation, class_probabilities
+ 
+     def _internal_predict_2D_2Dconv(self, x: np.ndarray, min_size: Tuple[int, int], do_mirroring: bool,
+@@ -504,54 +798,69 @@ class SegmentationNetwork(NeuralNetwork):
+         assert len(x.shape) == 5, 'x must be (b, c, x, y, z)'
+         # everything in here takes place on the GPU. If x and mult are not yet on GPU this will be taken care of here
+         # we now return a cuda tensor! Not numpy array!
+-
+-        x = to_cuda(maybe_to_torch(x), gpu_id=self.get_device())
++        def print_mytensor(data):
++            shape = data.shape[0]
++            for s in range(shape):
++                for i in range(3):
++                    print(data[s, 0, 0, i * 3:(i + 1) * 3])
++                for i in range(3):
++                    print(data[s, 0, 0, i * 3 + 50:(i + 1) * 3 + 50])
++                print('-----')
++        x = to_cuda(maybe_to_torch(x), gpu_id=self.get_device())  # ndarray, 1 1 128 128 128, becomes a tensor here
+         result_torch = torch.zeros([1, self.num_classes] + list(x.shape[2:]),
+-                                   dtype=torch.float).cuda(self.get_device(), non_blocking=True)
++                                   dtype=torch.float).cuda(self.get_device(), non_blocking=True)  # 1 3 128 128 128, all zeros
+ 
+         if mult is not None:
+-            mult = to_cuda(maybe_to_torch(mult), gpu_id=self.get_device())
++            mult = to_cuda(maybe_to_torch(mult), gpu_id=self.get_device())  # tensor, 128 128 128
+ 
+-        if do_mirroring:
++        if do_mirroring:  # True
+             mirror_idx = 8
+-            num_results = 2 ** len(mirror_axes)
++            num_results = 2 ** len(mirror_axes)  # 8
+         else:
+             mirror_idx = 1
+             num_results = 1
+         for m in range(mirror_idx):
+             if m == 0:
+-                pred = self.inference_apply_nonlin(self(x))
+-                result_torch += 1 / num_results * pred
++                y = self(x)  # tensor, 1 3 128 128 128
++                pred = self.inference_apply_nonlin(y)  # 1 3 128 128 128
++                result_torch += 1 / num_results * pred  # 1 3 128 128 128
+ 
+             if m == 1 and (2 in mirror_axes):
+-                pred = self.inference_apply_nonlin(self(torch.flip(x, (4, ))))
++                y = self(torch.flip(x, (4, )))
++                pred = self.inference_apply_nonlin(y)
+                 result_torch += 1 / num_results * torch.flip(pred, (4,))
+ 
+             if m == 2 and (1 in mirror_axes):
+-                pred = self.inference_apply_nonlin(self(torch.flip(x, (3, ))))
++                y = self(torch.flip(x, (3, )))
++                pred = self.inference_apply_nonlin(y)
+                 result_torch += 1 / num_results * torch.flip(pred, (3,))
+ 
+             if m == 3 and (2 in mirror_axes) and (1 in mirror_axes):
+-                pred = self.inference_apply_nonlin(self(torch.flip(x, (4, 3))))
++                y = self(torch.flip(x, (4, 3)))
++                pred = self.inference_apply_nonlin(y)
+                 result_torch += 1 / num_results * torch.flip(pred, (4, 3))
+ 
+             if m == 4 and (0 in mirror_axes):
+-                pred = self.inference_apply_nonlin(self(torch.flip(x, (2, ))))
++                y = self(torch.flip(x, (2, )))
++                pred = self.inference_apply_nonlin(y)
+                 result_torch += 1 / num_results * torch.flip(pred, (2,))
+ 
+             if m == 5 and (0 in mirror_axes) and (2 in mirror_axes):
+-                pred = self.inference_apply_nonlin(self(torch.flip(x, (4, 2))))
++                y = self(torch.flip(x, (4, 2)))
++                pred = self.inference_apply_nonlin(y)
+                 result_torch += 1 / num_results * torch.flip(pred, (4, 2))
+ 
+             if m == 6 and (0 in mirror_axes) and (1 in mirror_axes):
+-                pred = self.inference_apply_nonlin(self(torch.flip(x, (3, 2))))
++                y = self(torch.flip(x, (3, 2)))
++                pred = self.inference_apply_nonlin(y)
+                 result_torch += 1 / num_results * torch.flip(pred, (3, 2))
+ 
+             if m == 7 and (0 in mirror_axes) and (1 in mirror_axes) and (2 in mirror_axes):
+-                pred = self.inference_apply_nonlin(self(torch.flip(x, (4, 3, 2))))
++                y = self(torch.flip(x, (4, 3, 2)))
++                pred = self.inference_apply_nonlin(y)
+                 result_torch += 1 / num_results * torch.flip(pred, (4, 3, 2))
+ 
+-        if mult is not None:
++        if mult is not None:  # True
+             result_torch[:, :] *= mult
+ 
+         return result_torch
+diff --git a/pytorch/nnunet/postprocessing/connected_components.py b/pytorch/nnunet/postprocessing/connected_components.py
+index c69471e..45ff991 100644
+--- a/pytorch/nnunet/postprocessing/connected_components.py
++++ b/pytorch/nnunet/postprocessing/connected_components.py
+@@ -175,7 +175,7 @@ def determine_postprocessing(base, gt_labels_folder, 
raw_subfolder_name="validat + pp_results['num_samples'] = len(validation_result_raw['all']) + validation_result_raw = validation_result_raw['mean'] + +- if advanced_postprocessing: ++ if advanced_postprocessing: # False + # first treat all foreground classes as one and remove all but the largest foreground connected component + results = [] + for f in fnames: +@@ -270,12 +270,12 @@ def determine_postprocessing(base, gt_labels_folder, raw_subfolder_name="validat + if len(classes) > 1: + # now depending on whether we do remove all but the largest foreground connected component we define the source dir + # for the next one to be the raw or the temp dir +- if do_fg_cc: ++ if do_fg_cc: # True + source = folder_all_classes_as_fg + else: + source = join(base, raw_subfolder_name) + +- if advanced_postprocessing: ++ if advanced_postprocessing: # False + # now run this for each class separately + results = [] + for f in fnames: +@@ -325,7 +325,7 @@ def determine_postprocessing(base, gt_labels_folder, raw_subfolder_name="validat + json_output_file=join(folder_per_class, "summary.json"), + json_author="Fabian", num_threads=processes) + +- if do_fg_cc: ++ if do_fg_cc: # True + old_res = deepcopy(validation_result_PP_test) + else: + old_res = validation_result_raw +@@ -350,7 +350,7 @@ def determine_postprocessing(base, gt_labels_folder, raw_subfolder_name="validat + else: + print("Only one class present, no need to do each class separately as this is covered in fg vs bg") + +- if not advanced_postprocessing: ++ if not advanced_postprocessing: # True + pp_results['min_valid_object_sizes'] = None + + print("done") +diff --git a/pytorch/nnunet/preprocessing/cropping.py b/pytorch/nnunet/preprocessing/cropping.py +index bb0a92a..95d07bc 100644 +--- a/pytorch/nnunet/preprocessing/cropping.py ++++ b/pytorch/nnunet/preprocessing/cropping.py +@@ -39,6 +39,7 @@ def get_bbox_from_mask(mask, outside_value=0): + maxxidx = int(np.max(mask_voxel_coords[1])) + 1 + minyidx = int(np.min(mask_voxel_coords[2])) + maxyidx = int(np.max(mask_voxel_coords[2])) + 1 ++ print(mask.shape, minzidx, maxzidx, minxidx, maxxidx, minyidx, maxyidx) + return [[minzidx, maxzidx], [minxidx, maxxidx], [minyidx, maxyidx]] + + +@@ -202,6 +203,7 @@ class ImageCropper(object): + list_of_args.append((case, case_identifier, overwrite_existing)) + + p = Pool(self.num_threads) ++ print('Pool', self.num_threads) + p.starmap(self.load_crop_save, list_of_args) + p.close() + p.join() +diff --git a/pytorch/nnunet/run/look_pkl.py b/pytorch/nnunet/run/look_pkl.py +new file mode 100644 +index 0000000..1a9d78a +--- /dev/null ++++ b/pytorch/nnunet/run/look_pkl.py +@@ -0,0 +1,18 @@ ++import numpy as np ++import pickle ++ ++inputfile = u'/data/yupeng/environment_variables/RESULTS_FOLDER/nnUNet/3d_fullres/Task003_Liver' \ ++ u'/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1/fold_0/model_final_checkpoint.model.pkl' ++# test = np.load('labels.npy', encoding = "latin1") ++# doc = open('1.txt', 'a') ++# print(test, file=doc) ++ ++ ++ ++fr = open(inputfile, 'rb') ++inf = pickle.load(fr) ++print('done') ++ ++ ++ ++print('end') +\ No newline at end of file +diff --git a/pytorch/nnunet/run/model_prof.py b/pytorch/nnunet/run/model_prof.py +new file mode 100644 +index 0000000..013df26 +--- /dev/null ++++ b/pytorch/nnunet/run/model_prof.py +@@ -0,0 +1,124 @@ ++# Copyright (c) Soumith Chintala 2016, ++# All rights reserved ++# ++# Copyright 2020 Huawei Technologies Co., Ltd ++# ++# Licensed under the BSD 3-Clause License (the "License"); ++# you may not use this file except in 
compliance with the License.
++# You may obtain a copy of the License at
++#
++# https://spdx.org/licenses/BSD-3-Clause.html
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
++
++# -*- coding: utf-8 -*-
++"""pytorch_prof.py
++"""
++
++import torch
++import torch.optim as optim
++import torch.nn as nn
++import time
++import argparse
++
++
++def build_model():
++    # define your own model here and load its pretrained weights
++    import torchvision
++    model = torchvision.models.resnet50(pretrained=True)
++    return model
++
++
++def get_raw_data():
++    input_tensor = torch.randn(2, 3, 224, 224)
++    return input_tensor
++
++
++def criterion(x):
++    base_func = nn.CrossEntropyLoss()
++    shape_list = x.shape
++    N = shape_list[0]
++    R = 1
++    if len(shape_list) > 1:
++        for r in shape_list[1:]:
++            R *= r
++    T = torch.randint(0, R, size=(N,)).to(x.device)
++    if str(T.device).startswith('npu'):
++        T = T.int()
++    return base_func(x.reshape(N, -1), T)
++
++
++if __name__ == '__main__':
++    parser = argparse.ArgumentParser(description='PyTorch Prof')
++    parser.add_argument('--device', type=str, default='cpu',
++                        help='set which type of device used. Support cuda:0(device_id), npu:0(device_id).')
++    parser.add_argument('--amp', default=False, action='store_true',
++                        help='use amp during prof')
++    parser.add_argument('--loss-scale', default=64.0, type=float,
++                        help='loss scale using in amp, default 64.0, -1 means dynamic')
++    parser.add_argument('--opt-level', default='O2', type=str,
++                        help='opt-level using in amp, default O2')
++    parser.add_argument('--FusedSGD', default=False, action='store_true',
++                        help='use FusedSGD during prof')
++
++    args = parser.parse_args()
++
++    # 1. preparation
++    if args.device.startswith('cuda'):
++        torch.cuda.set_device(args.device)
++        prof_kwargs = {'use_cuda': True}
++    elif args.device.startswith('npu'):
++        torch.npu.set_device(args.device)
++        prof_kwargs = {'use_npu': True}
++    else:
++        prof_kwargs = {}
++
++    # 2. build the model
++    model = build_model()
++    if args.FusedSGD:
++        from apex.optimizers import NpuFusedSGD
++        optimizer = NpuFusedSGD(model.parameters(), lr=0.01)
++        model = model.to(args.device)
++        if args.amp:
++            from apex import amp
++            model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
++                                              loss_scale=None if args.loss_scale == -1 else args.loss_scale,
++                                              combine_grad=True)
++    else:
++        optimizer = optim.SGD(model.parameters(), lr=0.01)
++        model = model.to(args.device)
++        if args.amp:
++            from apex import amp
++            model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level,
++                                              loss_scale=None if args.loss_scale == -1 else args.loss_scale)
++
++    # 3. generate the input
++    input_tensor = get_raw_data()
++    input_tensor = input_tensor.to(args.device)
++
++    # run once beforehand so that the profiled performance is accurate
++    def run():
++        output_tensor = model(input_tensor)
++        optimizer.zero_grad()
++        loss = criterion(output_tensor)
++        if args.amp:
++            with amp.scale_loss(loss, optimizer) as scaled_loss:
++                scaled_loss.backward()
++        else:
++            loss.backward()
++        optimizer.step()
++        return loss
++    for i in range(5):
++        start_time = time.time()
++        loss = run()
++        print('iter: %d, loss: %.2f, time: %.2f' % (i, loss, (time.time() - start_time)*1000))
++
++    # 4. run the forward pass under the profiler
++    with torch.autograd.profiler.profile(**prof_kwargs) as prof:
++        run()
++    print(prof.key_averages().table())
++    prof.export_chrome_trace("pytorch_prof_%s.prof" % args.device)
+\ No newline at end of file
+diff --git a/pytorch/nnunet/run/run_training.py b/pytorch/nnunet/run/run_training.py
+index eb7ca2f..08214d6 100644
+--- a/pytorch/nnunet/run/run_training.py
++++ b/pytorch/nnunet/run/run_training.py
+@@ -31,7 +31,7 @@ def main():
+     parser.add_argument("task", help="can be task name or task id")
+     parser.add_argument("fold", help='0, 1, ..., 5 or \'all\'')
+     parser.add_argument("-val", "--validation_only", help="use this if you want to only run the validation",
+-                        action="store_true")
++                        action="store_true", default=True)
+     parser.add_argument("-w", required=False, default=None, help="Load pre-trained Models Genesis")
+     parser.add_argument("-c", "--continue_training", help="use this if you want to continue a training",
+                         action="store_true")
+@@ -134,8 +134,8 @@ def main():
+                             fp16=run_mixed_precision)
+ 
+     trainer.initialize(not validation_only)
+-
+-    if weights != None:
++
++    if weights != None:
+         trainer.load_pretrained_encoder_weights(weights)
+     sys.stdout.flush()
+ 
+diff --git a/pytorch/nnunet/run/run_training2.py b/pytorch/nnunet/run/run_training2.py
+new file mode 100644
+index 0000000..372a4d4
+--- /dev/null
++++ b/pytorch/nnunet/run/run_training2.py
+@@ -0,0 +1,172 @@
++# Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
++#
++# Licensed under the Apache License, Version 2.0 (the "License");
++# you may not use this file except in compliance with the License.
++# You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing, software
++# distributed under the License is distributed on an "AS IS" BASIS,
++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++# See the License for the specific language governing permissions and
++# limitations under the License.
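Because every option of run_training2.py below carries a default, the script can be driven without any CLI arguments. A hedged usage sketch, assuming the package is installed editable so that nnunet.run.run_training2 is importable:

    import sys
    from nnunet.run import run_training2  # assumed import path for the new module

    # With an empty argv the defaults apply: network 3d_fullres, trainer
    # nnUNetPlusPlusTrainerV2, task 003, fold 0.
    sys.argv = ['run_training2.py']
    run_training2.main()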
++import os ++import sys ++import argparse ++from batchgenerators.utilities.file_and_folder_operations import * ++from nnunet.run.default_configuration import get_default_configuration ++from nnunet.paths import default_plans_identifier ++from nnunet.training.cascade_stuff.predict_next_stage import predict_next_stage ++from nnunet.training.network_training.nnUNetTrainer import nnUNetTrainer ++from nnunet.training.network_training.nnUNetTrainerCascadeFullRes import nnUNetTrainerCascadeFullRes ++from nnunet.training.network_training.nnUNetTrainerV2_CascadeFullRes import nnUNetTrainerV2CascadeFullRes ++from nnunet.utilities.task_name_id_conversion import convert_id_to_task_name ++ ++ ++# import pdb ++# pdb.set_trace() ++ ++def main(): ++ parser = argparse.ArgumentParser() ++ parser.add_argument("-network", default="3d_fullres") ++ parser.add_argument("-network_trainer", default="nnUNetPlusPlusTrainerV2") ++ parser.add_argument("-task", default="003", help="can be task name or task id") ++ parser.add_argument("-fold", default="0", help='0, 1, ..., 5 or \'all\'') ++ parser.add_argument("-val", "--validation_only", default=False, ++ help="use this if you want to only run the validation", ++ action="store_true") ++ parser.add_argument("-w", required=False, default=None, help="Load pre-trained Models Genesis") ++ parser.add_argument("-c", "--continue_training", default=False, help="use this if you want to continue a training", ++ action="store_true") ++ parser.add_argument("-p", help="plans identifier. Only change this if you created a custom experiment planner", ++ default=default_plans_identifier, required=False) ++ parser.add_argument("--use_compressed_data", default=False, action="store_true", ++ help="If you set use_compressed_data, the training cases will not be decompressed. Reading compressed data " ++ "is much more CPU and RAM intensive and should only be used if you know what you are " ++ "doing", required=False) ++ parser.add_argument("--deterministic", ++ help="Makes training deterministic, but reduces training speed substantially. I (Fabian) think " ++ "this is not necessary. Deterministic training will make you overfit to some random seed. " ++ "Don't use that.", ++ required=False, default=False, action="store_true") ++ parser.add_argument("--npz", required=False, default=False, action="store_true", help="if set then nnUNet will " ++ "export npz files of " ++ "predicted segmentations " ++ "in the validation as well. " ++ "This is needed to run the " ++ "ensembling step so unless " ++ "you are developing nnUNet " ++ "you should enable this") ++ parser.add_argument("--find_lr", required=False, default=False, action="store_true", ++ help="not used here, just for fun") ++ parser.add_argument("--valbest", required=False, default=False, action="store_true", ++ help="hands off. This is not intended to be used") ++ parser.add_argument("--fp32", required=False, default=False, action="store_true", ++ help="disable mixed precision training and run old school fp32") ++ parser.add_argument("--val_folder", required=False, default="validation_raw", ++ help="name of the validation folder. No need to use this for most people") ++ # parser.add_argument("--interp_order", required=False, default=3, type=int, ++ # help="order of interpolation for segmentations. Testing purpose only. Hands off") ++ # parser.add_argument("--interp_order_z", required=False, default=0, type=int, ++ # help="order of interpolation along z if z is resampled separately. Testing purpose only. 
" ++ # "Hands off") ++ # parser.add_argument("--force_separate_z", required=False, default="None", type=str, ++ # help="force_separate_z resampling. Can be None, True or False. Testing purpose only. Hands off") ++ ++ args = parser.parse_args() ++ print('------------\n', args) ++ ++ task = args.task ++ fold = args.fold ++ network = args.network ++ network_trainer = args.network_trainer ++ weights = args.w ++ validation_only = args.validation_only ++ plans_identifier = args.p ++ find_lr = args.find_lr ++ ++ use_compressed_data = args.use_compressed_data ++ decompress_data = not use_compressed_data ++ ++ deterministic = args.deterministic ++ valbest = args.valbest ++ ++ fp32 = args.fp32 ++ run_mixed_precision = not fp32 ++ ++ val_folder = args.val_folder ++ # interp_order = args.interp_order ++ # interp_order_z = args.interp_order_z ++ # force_separate_z = args.force_separate_z ++ ++ if not task.startswith("Task"): ++ task_id = int(task) ++ task = convert_id_to_task_name(task_id) ++ ++ if fold == 'all': ++ pass ++ else: ++ fold = int(fold) ++ ++ # if force_separate_z == "None": ++ # force_separate_z = None ++ # elif force_separate_z == "False": ++ # force_separate_z = False ++ # elif force_separate_z == "True": ++ # force_separate_z = True ++ # else: ++ # raise ValueError("force_separate_z must be None, True or False. Given: %s" % force_separate_z) ++ ++ plans_file, output_folder_name, dataset_directory, batch_dice, stage, \ ++ trainer_class, domain = get_default_configuration(network, task, network_trainer, plans_identifier) ++ ++ if trainer_class is None: ++ raise RuntimeError("Could not find trainer class in nnunet.training.network_training") ++ ++ if network == "3d_cascade_fullres": ++ assert issubclass(trainer_class, (nnUNetTrainerCascadeFullRes, nnUNetTrainerV2CascadeFullRes)), \ ++ "If running 3d_cascade_fullres then your " \ ++ "trainer class must be derived from " \ ++ "nnUNetTrainerCascadeFullRes" ++ else: ++ assert issubclass(trainer_class, ++ nnUNetTrainer), "network_trainer was found but is not derived from nnUNetTrainer" ++ ++ trainer = trainer_class(plans_file, fold, output_folder=output_folder_name, dataset_directory=dataset_directory, ++ batch_dice=batch_dice, stage=stage, unpack_data=decompress_data, ++ deterministic=deterministic, ++ fp16=run_mixed_precision) ++ ++ trainer.initialize(not validation_only) ++ ++ if weights != None: ++ trainer.load_pretrained_encoder_weights(weights) ++ sys.stdout.flush() ++ ++ if find_lr: ++ trainer.find_lr() ++ else: ++ if not validation_only: ++ if args.continue_training: ++ trainer.load_latest_checkpoint() ++ trainer.run_training() ++ else: ++ if valbest: ++ trainer.load_best_checkpoint(train=False) ++ else: ++ trainer.load_latest_checkpoint(train=False) ++ ++ trainer.network.eval() ++ ++ # predict validation ++ trainer.validate(save_softmax=args.npz, validation_folder_name=val_folder) ++ ++ if network == '3d_lowres': ++ trainer.load_best_checkpoint(False) ++ print("predicting segmentations for the next stage of the cascade") ++ predict_next_stage(trainer, join(dataset_directory, trainer.plans['data_identifier'] + "_stage%d" % 1)) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/pytorch/nnunet/run/run_training_DDP.py b/pytorch/nnunet/run/run_training_DDP.py +index 5ffcdcf..6ad3d5a 100644 +--- a/pytorch/nnunet/run/run_training_DDP.py ++++ b/pytorch/nnunet/run/run_training_DDP.py +@@ -27,13 +27,13 @@ from nnunet.utilities.task_name_id_conversion import convert_id_to_task_name + + def main(): + parser = argparse.ArgumentParser() +- 
parser.add_argument("network") +- parser.add_argument("network_trainer") +- parser.add_argument("task", help="can be task name or task id") +- parser.add_argument("fold", help='0, 1, ..., 5 or \'all\'') ++ parser.add_argument("network", default='3d_fullres') ++ parser.add_argument("network_trainer", default='nnUNetTrainerV2_DDP') ++ parser.add_argument("task", help="can be task name or task id", default='003') ++ parser.add_argument("fold", help='0, 1, ..., 5 or \'all\'', default='0') + parser.add_argument("-val", "--validation_only", help="use this if you want to only run the validation", +- action="store_true") +- parser.add_argument("-c", "--continue_training", help="use this if you want to continue a training", ++ action="store_true", default=False) ++ parser.add_argument("-c", "--continue_training", default=False, help="use this if you want to continue a training", + action="store_true") + parser.add_argument("-p", help="plans identifier. Only change this if you created a custom experiment planner", + default=default_plans_identifier, required=False) +@@ -78,7 +78,7 @@ def main(): + # help="force_separate_z resampling. Can be None, True or False. Testing purpose only. Hands off") + + args = parser.parse_args() +- ++ print('\n\n args=', args, '\n\n') + task = args.task + fold = args.fold + network = args.network +@@ -115,7 +115,7 @@ def main(): + # raise ValueError("force_separate_z must be None, True or False. Given: %s" % force_separate_z) + + plans_file, output_folder_name, dataset_directory, batch_dice, stage, \ +- trainer_class = get_default_configuration(network, task, network_trainer, plans_identifier) ++ trainer_class, _ = get_default_configuration(network, task, network_trainer, plans_identifier) + + if trainer_class is None: + raise RuntimeError("Could not find trainer class in meddec.model_training") +diff --git a/pytorch/nnunet/run/run_training_hypDDP.py b/pytorch/nnunet/run/run_training_hypDDP.py +new file mode 100644 +index 0000000..f50744c +--- /dev/null ++++ b/pytorch/nnunet/run/run_training_hypDDP.py +@@ -0,0 +1,164 @@ ++# Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
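One caveat about the defaults added to the positional arguments of run_training_DDP.py above: argparse only applies a default to a positional argument when nargs='?' is also given; with the plain form used in the hunk, the value is still required on the command line. A small, self-contained illustration:

    import argparse

    parser = argparse.ArgumentParser()
    # Without nargs='?' the positional argument stays mandatory even if a
    # default is supplied; nargs='?' makes the default effective.
    parser.add_argument('network', nargs='?', default='3d_fullres')
    print(parser.parse_args([]).network)  # prints: 3d_fullres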
++
++
++import argparse
++
++from batchgenerators.utilities.file_and_folder_operations import *
++from nnunet.run.default_configuration import get_default_configuration
++from nnunet.paths import default_plans_identifier
++from nnunet.training.cascade_stuff.predict_next_stage import predict_next_stage
++from nnunet.training.network_training.nnUNetTrainer import nnUNetTrainer
++from nnunet.training.network_training.nnUNetTrainerCascadeFullRes import nnUNetTrainerCascadeFullRes
++from nnunet.training.network_training.nnUNetTrainerV2_CascadeFullRes import nnUNetTrainerV2CascadeFullRes
++from nnunet.utilities.task_name_id_conversion import convert_id_to_task_name
++
++
++def main():
++    parser = argparse.ArgumentParser()
++    parser.add_argument("network")
++    parser.add_argument("network_trainer")
++    parser.add_argument("task", help="can be task name or task id")
++    parser.add_argument("fold", help='0, 1, ..., 5 or \'all\'')
++    parser.add_argument("-val", "--validation_only", help="use this if you want to only run the validation",
++                        action="store_true")
++    parser.add_argument("-c", "--continue_training", help="use this if you want to continue a training",
++                        action="store_true")
++    parser.add_argument("-p", help="plans identifier. Only change this if you created a custom experiment planner",
++                        default=default_plans_identifier, required=False)
++    parser.add_argument("--use_compressed_data", default=False, action="store_true",
++                        help="If you set use_compressed_data, the training cases will not be decompressed. Reading compressed data "
++                             "is much more CPU and RAM intensive and should only be used if you know what you are "
++                             "doing", required=False)
++    parser.add_argument("--deterministic",
++                        help="Makes training deterministic, but reduces training speed substantially. I (Fabian) think "
++                             "this is not necessary. Deterministic training will make you overfit to some random seed. "
++                             "Don't use that.",
++                        required=False, default=False, action="store_true")
++    parser.add_argument("--local_rank", default=0, type=int)
++    parser.add_argument("--fp32", required=False, default=False, action="store_true",
++                        help="disable mixed precision training and run old school fp32")
++    parser.add_argument("--dbs", required=False, default=False, action="store_true", help="distribute batch size. If "
++                                                                                          "True then whatever "
++                                                                                          "batch_size is in plans will "
++                                                                                          "be distributed over DDP "
++                                                                                          "models, if False then each "
++                                                                                          "model will have batch_size "
++                                                                                          "for a total of "
++                                                                                          "GPUs*batch_size")
++    parser.add_argument("--npz", required=False, default=False, action="store_true", help="if set then nnUNet will "
++                                                                                          "export npz files of "
++                                                                                          "predicted segmentations "
++                                                                                          "in the validation as well. "
++                                                                                          "This is needed to run the "
++                                                                                          "ensembling step so unless "
++                                                                                          "you are developing nnUNet "
++                                                                                          "you should enable this")
++    parser.add_argument("--valbest", required=False, default=False, action="store_true", help="")
++    parser.add_argument("--find_lr", required=False, default=False, action="store_true", help="")
++    parser.add_argument("--val_folder", required=False, default="validation_raw",
++                        help="name of the validation folder. No need to use this for most people")
++    # parser.add_argument("--interp_order", required=False, default=3, type=int,
++    #                     help="order of interpolation for segmentations. Testing purpose only. Hands off")
++    # parser.add_argument("--interp_order_z", required=False, default=0, type=int,
++    #                     help="order of interpolation along z if z is resampled separately. Testing purpose only. 
" ++ # "Hands off") ++ # parser.add_argument("--force_separate_z", required=False, default="None", type=str, ++ # help="force_separate_z resampling. Can be None, True or False. Testing purpose only. Hands off") ++ ++ args = parser.parse_args() ++ print('\n\n args=', args, '\n\n') ++ task = args.task ++ fold = args.fold ++ network = args.network ++ network_trainer = args.network_trainer ++ validation_only = args.validation_only ++ plans_identifier = args.p ++ use_compressed_data = args.use_compressed_data ++ decompress_data = not use_compressed_data ++ deterministic = args.deterministic ++ valbest = args.valbest ++ find_lr = args.find_lr ++ val_folder = args.val_folder ++ # interp_order = args.interp_order ++ # interp_order_z = args.interp_order_z ++ # force_separate_z = args.force_separate_z ++ fp32 = args.fp32 ++ ++ if not task.startswith("Task"): ++ task_id = int(task) ++ task = convert_id_to_task_name(task_id) ++ ++ if fold == 'all': ++ pass ++ else: ++ fold = int(fold) ++ # ++ # if force_separate_z == "None": ++ # force_separate_z = None ++ # elif force_separate_z == "False": ++ # force_separate_z = False ++ # elif force_separate_z == "True": ++ # force_separate_z = True ++ # else: ++ # raise ValueError("force_separate_z must be None, True or False. Given: %s" % force_separate_z) ++ ++ plans_file, output_folder_name, dataset_directory, batch_dice, stage, \ ++ trainer_class, _ = get_default_configuration(network, task, network_trainer, plans_identifier) ++ ++ if trainer_class is None: ++ raise RuntimeError("Could not find trainer class in meddec.model_training") ++ ++ if network == "3d_cascade_fullres": ++ assert issubclass(trainer_class, (nnUNetTrainerCascadeFullRes, nnUNetTrainerV2CascadeFullRes)), \ ++ "If running 3d_cascade_fullres then your " \ ++ "trainer class must be derived from " \ ++ "nnUNetTrainerCascadeFullRes" ++ else: ++ assert issubclass(trainer_class, ++ nnUNetTrainer), "network_trainer was found but is not derived from nnUNetTrainer" ++ ++ trainer = trainer_class(plans_file, fold, local_rank=args.local_rank, output_folder=output_folder_name, ++ dataset_directory=dataset_directory, batch_dice=batch_dice, stage=stage, ++ unpack_data=decompress_data, deterministic=deterministic, fp16=not fp32, ++ distribute_batch_size=args.dbs) ++ ++ trainer.initialize(not validation_only) ++ ++ if find_lr: ++ trainer.find_lr() ++ else: ++ if not validation_only: ++ if args.continue_training: ++ trainer.load_latest_checkpoint() ++ trainer.run_training() ++ else: ++ if valbest: ++ trainer.load_best_checkpoint(train=False) ++ else: ++ trainer.load_latest_checkpoint(train=False) ++ ++ trainer.network.eval() ++ ++ # predict validation ++ trainer.validate(save_softmax=args.npz, validation_folder_name=val_folder) ++ ++ if network == '3d_lowres': ++ trainer.load_best_checkpoint(False) ++ print("predicting segmentations for the next stage of the cascade") ++ predict_next_stage(trainer, join(dataset_directory, trainer.plans['data_identifier'] + "_stage%d" % 1)) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/pytorch/nnunet/training/loss_functions/crossentropy.py b/pytorch/nnunet/training/loss_functions/crossentropy.py +index 6195437..0c782d9 100644 +--- a/pytorch/nnunet/training/loss_functions/crossentropy.py ++++ b/pytorch/nnunet/training/loss_functions/crossentropy.py +@@ -6,6 +6,15 @@ class RobustCrossEntropyLoss(nn.CrossEntropyLoss): + this is just a compatibility layer because my target tensor is float and has an extra dimension + """ + def forward(self, input: Tensor, target: 
Tensor) -> Tensor:
++        # i = 0
++        # print('----------')
++        # print('input:', input.shape)
++        # for i in range(len(input)):
++        #     print(i, input[i].shape)
++        # print('target')
++        # for i in range(len(target)):
++        #     print(i, target[i].shape)
++        # print('\n----------')
+         if len(target.shape) == len(input.shape):
+             assert target.shape[1] == 1
+             target = target[:, 0]
+diff --git a/pytorch/nnunet/training/network_training/network_trainer.py b/pytorch/nnunet/training/network_training/network_trainer.py
+index e920158..f0031d3 100644
+--- a/pytorch/nnunet/training/network_training/network_trainer.py
++++ b/pytorch/nnunet/training/network_training/network_trainer.py
+@@ -37,6 +37,7 @@ from abc import abstractmethod
+ from datetime import datetime
+ from tqdm import trange
+ from nnunet.utilities.to_torch import maybe_to_torch, to_cuda
++import pdb
+ 
+ 
+ class NetworkTrainer(object):
+@@ -438,7 +439,8 @@ class NetworkTrainer(object):
+         self._maybe_init_amp()
+ 
+     def _maybe_init_amp(self):
+-        if self.fp16 and self.amp_grad_scaler is None and torch.cuda.is_available():
++        # if self.fp16 and self.amp_grad_scaler is None and torch.cuda.is_available():
++        if self.fp16 and self.amp_grad_scaler is None:
+             self.amp_grad_scaler = GradScaler()
+ 
+     def plot_network_architecture(self):
+diff --git a/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2.py b/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2.py
+index e9aa611..9b97e8c 100644
+--- a/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2.py
++++ b/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2.py
+@@ -13,6 +13,7 @@
+ # limitations under the License.
+ 
+ 
++import SimpleITK as sitk
+ from collections import OrderedDict
+ from typing import Tuple
+ import sys
+@@ -35,12 +36,10 @@ from torch.cuda.amp import autocast
+ from nnunet.training.learning_rate.poly_lr import poly_lr
+ from batchgenerators.utilities.file_and_folder_operations import *
+ 
+-
+ class nnUNetPlusPlusTrainerV2(nnUNetTrainer):
+     """
+     Info for Fabian: same as internal nnUNetTrainerV2_2
+     """
+-
+     def __init__(self, plans_file, fold, output_folder=None, dataset_directory=None, batch_dice=True, stage=None,
+                  unpack_data=True, deterministic=True, fp16=False):
+         super().__init__(plans_file, fold, output_folder, dataset_directory, batch_dice, stage, unpack_data,
+@@ -66,7 +65,7 @@ class nnUNetPlusPlusTrainerV2(nnUNetTrainer):
+             maybe_mkdir_p(self.output_folder)
+ 
+             if force_load_plans or (self.plans is None):
+-                self.load_plans_file()
++                self.load_plans_file()  # '/data/yupeng/environment_variables/nnUNet_preprocessed/Task003_Liver/nnUNetPlansv2.1_plans_3D.pkl'
+ 
+             self.process_plans(self.plans)
+ 
+@@ -189,6 +188,7 @@ class nnUNetPlusPlusTrainerV2(nnUNetTrainer):
+         """
+         ds = self.network.do_ds
+         self.network.do_ds = False
++        overwrite = False  # do not rerun the inference here, it would take too long
+         ret = super().validate(do_mirroring, use_sliding_window, step_size, save_softmax, use_gaussian,
+                                overwrite, validation_folder_name, debug, all_in_gpu, segmentation_export_kwargs)
+ 
+@@ -200,16 +200,18 @@ class nnUNetPlusPlusTrainerV2(nnUNetTrainer):
+                                                          use_sliding_window: bool = True, step_size: float = 0.5,
+                                                          use_gaussian: bool = True, pad_border_mode: str = 'constant',
+                                                          pad_kwargs: dict = None, all_in_gpu: bool = True,
+-                                                         verbose: bool = True, mixed_precision=True) -> Tuple[np.ndarray, np.ndarray]:
++                                                         verbose: bool = True, mixed_precision=True, img_name=None,
++                                                         pre_mode=None, fp=None) -> Tuple[np.ndarray, np.ndarray]:
+         """
+         We need to wrap this because we need to enforce 
self.network.do_ds = False for prediction + """ +- ds = self.network.do_ds ++ ds = self.network.do_ds # ds = True + self.network.do_ds = False + ret = super().predict_preprocessed_data_return_seg_and_softmax(data, do_mirroring, mirror_axes, + use_sliding_window, step_size, use_gaussian, + pad_border_mode, pad_kwargs, all_in_gpu, verbose, +- mixed_precision=mixed_precision) ++ mixed_precision=mixed_precision, img_name=img_name, ++ pre_mode=pre_mode, fp=fp) + self.network.do_ds = ds + return ret + +@@ -225,7 +227,20 @@ class nnUNetPlusPlusTrainerV2(nnUNetTrainer): + data_dict = next(data_generator) + data = data_dict['data'] + target = data_dict['target'] +- ++ # i = 0 ++ # while True: ++ # i += 1 ++ # data_dict = next(data_generator) ++ # data = data_dict['data'] ++ # target = data_dict['target'] ++ # data_numpy_output = '/home/yupeng/save_data.nii.gz' ++ # data_numpy = data[0][0].numpy() ++ # target_numpy = target[0][0][0].numpy() ++ # data_1 = data_numpy.flatten() ++ # minm = np.argmin(data_1) ++ # maxm = np.argmax(data_1) ++ # out = sitk.GetImageFromArray(data_numpy) ++ # sitk.WriteImage(out, data_numpy_output) + data = maybe_to_torch(data) + target = maybe_to_torch(target) + +@@ -234,7 +249,6 @@ class nnUNetPlusPlusTrainerV2(nnUNetTrainer): + target = to_cuda(target) + + self.optimizer.zero_grad() +- + if self.fp16: + with autocast(): + output = self.network(data) +@@ -261,7 +275,6 @@ class nnUNetPlusPlusTrainerV2(nnUNetTrainer): + self.run_online_evaluation(output, target) + + del target +- + return l.detach().cpu().numpy() + + def do_split(self): +diff --git a/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2_DDP.py b/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2_DDP.py +new file mode 100644 +index 0000000..e2ab2fa +--- /dev/null ++++ b/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2_DDP.py +@@ -0,0 +1,483 @@ ++# Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
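The DDP trainer added below seeds every worker with its local_rank, binds one device per process and joins an environment-initialised NCCL group. A condensed sketch of that setup pattern, assuming one process per GPU launched via torch.distributed:

    import numpy as np
    import torch
    import torch.distributed as dist

    def setup_ddp_worker(local_rank: int) -> None:
        # Deterministic per-rank seeding, as in nnUNetPlusPlusTrainerV2_DDP.__init__
        np.random.seed(local_rank)
        torch.manual_seed(local_rank)
        if torch.cuda.is_available():
            torch.cuda.set_device(local_rank)
            # init_method='env://' reads MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE
            dist.init_process_group(backend='nccl', init_method='env://')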
++ ++ ++from collections import OrderedDict ++from typing import Tuple ++import sys ++import time ++import numpy as np ++import torch ++import torch.distributed as dist ++from torch.cuda.amp import autocast ++from torch.nn.parallel import DistributedDataParallel as DDP ++from nnunet.training.loss_functions.deep_supervision import MultipleOutputLoss2 ++from nnunet.utilities.to_torch import maybe_to_torch, to_cuda ++from nnunet.training.data_augmentation.default_data_augmentation import get_moreDA_augmentation ++from nnunet.network_architecture.generic_UNetPlusPlus import Generic_UNetPlusPlus ++from nnunet.network_architecture.initialization import InitWeights_He ++from nnunet.network_architecture.neural_network import SegmentationNetwork ++from nnunet.training.data_augmentation.default_data_augmentation import default_2D_augmentation_params, \ ++ get_patch_size, default_3D_augmentation_params ++from nnunet.training.dataloading.dataset_loading import unpack_dataset ++from nnunet.training.network_training.nnUNetTrainer import nnUNetTrainer ++from nnunet.utilities.nd_softmax import softmax_helper ++from sklearn.model_selection import KFold ++from torch import nn ++from torch.cuda.amp import autocast ++from nnunet.training.learning_rate.poly_lr import poly_lr ++from batchgenerators.utilities.file_and_folder_operations import * ++ ++ ++class nnUNetPlusPlusTrainerV2_DDP(nnUNetTrainer): ++ """ ++ Info for Fabian: same as internal nnUNetTrainerV2_2 ++ """ ++ ++ def __init__(self, plans_file, fold, local_rank, output_folder=None, dataset_directory=None, batch_dice=True, ++ stage=None, ++ unpack_data=True, deterministic=True, fp16=False, distribute_batch_size=1): ++ super().__init__(plans_file, fold, output_folder, dataset_directory, batch_dice, stage, unpack_data, ++ deterministic, fp16) ++ self.init_args = ( ++ plans_file, fold, local_rank, output_folder, dataset_directory, batch_dice, stage, unpack_data, ++ deterministic, distribute_batch_size, fp16) ++ self.max_num_epochs = 1000 ++ self.initial_lr = 1e-2 ++ self.deep_supervision_scales = None ++ self.ds_loss_weights = None ++ self.distribute_batch_size = distribute_batch_size ++ np.random.seed(local_rank) ++ torch.manual_seed(local_rank) ++ self.local_rank = local_rank ++ if torch.cuda.is_available(): ++ torch.cuda.set_device(local_rank) ++ dist.init_process_group(backend='nccl', init_method='env://') ++ ++ self.pin_memory = True ++ ++ def initialize(self, training=True, force_load_plans=False): ++ """ ++ - replaced get_default_augmentation with get_moreDA_augmentation ++ - enforce to only run this code once ++ - loss function wrapper for deep supervision ++ ++ :param training: ++ :param force_load_plans: ++ :return: ++ """ ++ if not self.was_initialized: ++ maybe_mkdir_p(self.output_folder) ++ ++ if force_load_plans or (self.plans is None): ++ self.load_plans_file() ++ ++ self.process_plans(self.plans) ++ ++ self.setup_DA_params() ++ ++ ################# Here we wrap the loss for deep supervision ############ ++ # we need to know the number of outputs of the network ++ net_numpool = len(self.net_num_pool_op_kernel_sizes) ++ ++ # we give each output a weight which decreases exponentially (division by 2) as the resolution decreases ++ # this gives higher resolution outputs more weight in the loss ++ weights = np.array([1 / (2 ** i) for i in range(net_numpool)]) ++ ++ # we don't use the lowest 2 outputs. 
Normalize weights so that they sum to 1
++ mask = np.array([True] + [True if i < net_numpool - 1 else False for i in range(1, net_numpool)])
++ weights[~mask] = 0
++ weights = weights / weights.sum()
++ # self.ds_loss_weights = weights
++ self.ds_loss_weights = None
++ # now wrap the loss
++ self.loss = MultipleOutputLoss2(self.loss, self.ds_loss_weights)
++ ################# END ###################
++
++ self.folder_with_preprocessed_data = join(self.dataset_directory, self.plans['data_identifier'] +
++ "_stage%d" % self.stage)
++ if training:
++ self.dl_tr, self.dl_val = self.get_basic_generators()
++ if self.unpack_data:
++ if self.local_rank == 0:
++ print("unpacking dataset")
++ unpack_dataset(self.folder_with_preprocessed_data)
++ print("done")
++ else:
++ # we need to wait until worker 0 has finished unpacking
++ npz_files = subfiles(self.folder_with_preprocessed_data, suffix=".npz", join=False)
++ case_ids = [i[:-4] for i in npz_files]
++ all_present = all(
++ [isfile(join(self.folder_with_preprocessed_data, i + ".npy")) for i in case_ids])
++ while not all_present:
++ print("worker", self.local_rank, "is waiting for unpacking")
++ time.sleep(3)
++ all_present = all(
++ [isfile(join(self.folder_with_preprocessed_data, i + ".npy")) for i in case_ids])
++ # there is some slight chance that there may arise some error because dataloaders are loading a file
++ # that is still being written by worker 0. We ignore this for now and address it only if it becomes
++ # relevant
++ # (this can occur because while worker 0 writes the file is technically present so the other workers
++ # will proceed and eventually try to read it)
++ else:
++ print(
++ "INFO: Not unpacking data! Training may be slow due to that. Pray you are not using 2d or you "
++ "will wait all winter for your model to finish!")
++
++ self.tr_gen, self.val_gen = get_moreDA_augmentation(
++ self.dl_tr, self.dl_val,
++ self.data_aug_params[
++ 'patch_size_for_spatialtransform'],
++ self.data_aug_params,
++ deep_supervision_scales=self.deep_supervision_scales,
++ pin_memory=self.pin_memory
++ )
++ self.print_to_log_file("TRAINING KEYS:\n %s" % (str(self.dataset_tr.keys())),
++ also_print_to_console=False)
++ self.print_to_log_file("VALIDATION KEYS:\n %s" % (str(self.dataset_val.keys())),
++ also_print_to_console=False)
++ else:
++ pass
++
++ self.initialize_network()
++ self.initialize_optimizer_and_scheduler()
++
++ assert isinstance(self.network, (SegmentationNetwork, DDP))
++ else:
++ self.print_to_log_file('self.was_initialized is True, not running self.initialize again')
++ self.was_initialized = True
++
++ def initialize_network(self):
++ """
++ - momentum 0.99
++ - SGD instead of Adam
++ - self.lr_scheduler = None because we do poly_lr
++ - deep supervision = True
++ - I am sure I forgot something here
++
++ Known issue: forgot to set neg_slope=0 in InitWeights_He; should not make a difference though
++ :return:
++ """
++ if self.threeD:
++ conv_op = nn.Conv3d
++ dropout_op = nn.Dropout3d
++ norm_op = nn.InstanceNorm3d
++
++ else:
++ conv_op = nn.Conv2d
++ dropout_op = nn.Dropout2d
++ norm_op = nn.InstanceNorm2d
++ norm_op_kwargs = {'eps': 1e-5, 'affine': True}
++ dropout_op_kwargs = {'p': 0, 'inplace': True}
++ net_nonlin = nn.LeakyReLU
++ net_nonlin_kwargs = {'negative_slope': 1e-2, 'inplace': True}
++ self.network = Generic_UNetPlusPlus(self.num_input_channels, self.base_num_features, self.num_classes,
++ len(self.net_num_pool_op_kernel_sizes),
++ self.conv_per_stage, 2, conv_op, norm_op, norm_op_kwargs, dropout_op,
++ 
dropout_op_kwargs, ++ net_nonlin, net_nonlin_kwargs, True, False, lambda x: x, ++ InitWeights_He(1e-2), ++ self.net_num_pool_op_kernel_sizes, self.net_conv_kernel_sizes, False, True, ++ True) ++ if torch.cuda.is_available(): ++ self.network.cuda() ++ self.network.inference_apply_nonlin = softmax_helper ++ self.network = DDP(self.network, device_ids=[self.local_rank], find_unused_parameters=True) ++ ++ # self.network = DDP(self.network, device_ids=[self.local_rank], find_unused_parameters=True) ++ ++ def initialize_optimizer_and_scheduler(self): ++ assert self.network is not None, "self.initialize_network must be called first" ++ print('weight_decay: ', self.weight_decay) ++ sys.stdout.flush() ++ self.optimizer = torch.optim.SGD(self.network.parameters(), self.initial_lr, weight_decay=self.weight_decay, ++ momentum=0.99, nesterov=True) ++ self.lr_scheduler = None ++ ++ def run_online_evaluation(self, output, target): ++ """ ++ due to deep supervision the return value and the reference are now lists of tensors. We only need the full ++ resolution output because this is what we are interested in in the end. The others are ignored ++ :param output: ++ :param target: ++ :return: ++ """ ++ target = target[0] ++ output = output[0] ++ return super().run_online_evaluation(output, target) ++ ++ def validate(self, do_mirroring: bool = True, use_sliding_window: bool = True, ++ step_size: float = 0.5, save_softmax: bool = True, use_gaussian: bool = True, overwrite: bool = True, ++ validation_folder_name: str = 'validation_raw', debug: bool = False, all_in_gpu: bool = False, ++ segmentation_export_kwargs: dict = None): ++ """ ++ We need to wrap this because we need to enforce self.network.do_ds = False for prediction ++ """ ++ if self.local_rank == 0: ++ if isinstance(self.network, DDP): ++ net = self.network.module ++ else: ++ net = self.network ++ ds = self.network.do_ds ++ net.do_ds = False ++ ret = super().validate(do_mirroring, use_sliding_window, step_size, save_softmax, use_gaussian, ++ overwrite, validation_folder_name, debug, all_in_gpu, segmentation_export_kwargs) ++ ++ net.do_ds = ds ++ return ret ++ ++ def predict_preprocessed_data_return_seg_and_softmax(self, data: np.ndarray, do_mirroring: bool = True, ++ mirror_axes: Tuple[int] = None, ++ use_sliding_window: bool = True, step_size: float = 0.5, ++ use_gaussian: bool = True, pad_border_mode: str = 'constant', ++ pad_kwargs: dict = None, all_in_gpu: bool = True, ++ verbose: bool = True, mixed_precision=True) -> Tuple[ ++ np.ndarray, np.ndarray]: ++ """ ++ We need to wrap this because we need to enforce self.network.do_ds = False for prediction ++ """ ++ ds = self.network.do_ds ++ self.network.do_ds = False ++ ret = super().predict_preprocessed_data_return_seg_and_softmax(data, do_mirroring, mirror_axes, ++ use_sliding_window, step_size, use_gaussian, ++ pad_border_mode, pad_kwargs, all_in_gpu, verbose, ++ mixed_precision=mixed_precision) ++ self.network.do_ds = ds ++ return ret ++ ++ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False): ++ """ ++ gradient clipping improves training stability ++ ++ :param data_generator: ++ :param do_backprop: ++ :param run_online_evaluation: ++ :return: ++ """ ++ data_dict = next(data_generator) ++ data = data_dict['data'] ++ target = data_dict['target'] ++ ++ data = maybe_to_torch(data) ++ target = maybe_to_torch(target) ++ ++ if torch.cuda.is_available(): ++ data = to_cuda(data, gpu_id=None) ++ target = to_cuda(target, gpu_id=None) ++ ++ self.optimizer.zero_grad() ++ ++ 
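++ # mixed-precision path: autocast runs the forward pass in fp16, GradScaler
++ # scales the loss for backward and unscales before the gradient-norm clip;
++ # the else branch below is the identical fp32 version of the same update.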
if self.fp16:
++ with autocast():
++ output = self.network(data)
++ del data
++ l = self.loss(output, target)
++
++ if do_backprop:
++ self.amp_grad_scaler.scale(l).backward()
++ self.amp_grad_scaler.unscale_(self.optimizer)
++ torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
++ self.amp_grad_scaler.step(self.optimizer)
++ self.amp_grad_scaler.update()
++ else:
++ output = self.network(data)
++ del data
++ l = self.loss(output, target)
++
++ if do_backprop:
++ l.backward()
++ torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12)
++ self.optimizer.step()
++
++ if run_online_evaluation:
++ self.run_online_evaluation(output, target)
++
++ del target
++
++ return l.detach().cpu().numpy()
++
++ def do_split(self):
++ """
++ we now allow more than 5 splits. IMPORTANT: any fold > 4 will not be a real split but just another random
++ 80:20 split of the data. You cannot run X-fold cross-validation with this code. It will always be a 5-fold CV.
++ Folds > 4 will be independent from each other
++ :return:
++ """
++ if self.fold == "all":
++ # if fold==all then we use all images for training and validation
++ tr_keys = val_keys = list(self.dataset.keys())
++ else:
++ splits_file = join(self.dataset_directory, "splits_final.pkl")
++
++ # if the split file does not exist we need to create it
++ if not isfile(splits_file):
++ self.print_to_log_file("Creating new split...")
++ splits = []
++ all_keys_sorted = np.sort(list(self.dataset.keys()))
++ kfold = KFold(n_splits=5, shuffle=True, random_state=12345)
++ for i, (train_idx, test_idx) in enumerate(kfold.split(all_keys_sorted)):
++ train_keys = np.array(all_keys_sorted)[train_idx]
++ test_keys = np.array(all_keys_sorted)[test_idx]
++ splits.append(OrderedDict())
++ splits[-1]['train'] = train_keys
++ splits[-1]['val'] = test_keys
++ save_pickle(splits, splits_file)
++
++ splits = load_pickle(splits_file)
++
++ if self.fold < len(splits):
++ tr_keys = splits[self.fold]['train']
++ val_keys = splits[self.fold]['val']
++ else:
++ self.print_to_log_file("INFO: Requested fold %d but split file only has %d folds. I am now creating a "
++ "random 80:20 split!" % (self.fold, len(splits)))
++ # if we request a fold that is not in the split file, create a random 80:20 split
++ rnd = np.random.RandomState(seed=12345 + self.fold)
++ keys = np.sort(list(self.dataset.keys()))
++ idx_tr = rnd.choice(len(keys), int(len(keys) * 0.8), replace=False)
++ idx_val = [i for i in range(len(keys)) if i not in idx_tr]
++ tr_keys = [keys[i] for i in idx_tr]
++ val_keys = [keys[i] for i in idx_val]
++
++ tr_keys.sort()
++ val_keys.sort()
++ self.dataset_tr = OrderedDict()
++ for i in tr_keys:
++ self.dataset_tr[i] = self.dataset[i]
++ self.dataset_val = OrderedDict()
++ for i in val_keys:
++ self.dataset_val[i] = self.dataset[i]
++
++ def setup_DA_params(self):
++ """
++ - we increase rotation angle from [-15, 15] to [-30, 30]
++ - scale range is now (0.7, 1.4), was (0.85, 1.25)
++ - we don't do elastic deformation anymore
++
++ :return:
++ """
++
++ self.deep_supervision_scales = [[1, 1, 1]] + list(list(i) for i in 1 / np.cumprod(
++ np.vstack(self.net_num_pool_op_kernel_sizes), axis=0))[:-1]
++
++ if self.threeD:
++ self.data_aug_params = default_3D_augmentation_params
++ self.data_aug_params['rotation_x'] = (-30. / 360 * 2. * np.pi, 30. / 360 * 2. * np.pi)
++ self.data_aug_params['rotation_y'] = (-30. / 360 * 2. * np.pi, 30. / 360 * 2. * np.pi)
++ self.data_aug_params['rotation_z'] = (-30. / 360 * 2. 
* np.pi)
++ if self.do_dummy_2D_aug:
++ self.data_aug_params["dummy_2D"] = True
++ self.print_to_log_file("Using dummy2d data augmentation")
++ self.data_aug_params["elastic_deform_alpha"] = \
++ default_2D_augmentation_params["elastic_deform_alpha"]
++ self.data_aug_params["elastic_deform_sigma"] = \
++ default_2D_augmentation_params["elastic_deform_sigma"]
++ self.data_aug_params["rotation_x"] = default_2D_augmentation_params["rotation_x"]
++ else:
++ self.do_dummy_2D_aug = False
++ if max(self.patch_size) / min(self.patch_size) > 1.5:
++ default_2D_augmentation_params['rotation_x'] = (-15. / 360 * 2. * np.pi, 15. / 360 * 2. * np.pi)
++ self.data_aug_params = default_2D_augmentation_params
++ self.data_aug_params["mask_was_used_for_normalization"] = self.use_mask_for_norm
++
++ if self.do_dummy_2D_aug:
++ self.basic_generator_patch_size = get_patch_size(self.patch_size[1:],
++ self.data_aug_params['rotation_x'],
++ self.data_aug_params['rotation_y'],
++ self.data_aug_params['rotation_z'],
++ self.data_aug_params['scale_range'])
++ self.basic_generator_patch_size = np.array([self.patch_size[0]] + list(self.basic_generator_patch_size))
++ patch_size_for_spatialtransform = self.patch_size[1:]
++ else:
++ self.basic_generator_patch_size = get_patch_size(self.patch_size, self.data_aug_params['rotation_x'],
++ self.data_aug_params['rotation_y'],
++ self.data_aug_params['rotation_z'],
++ self.data_aug_params['scale_range'])
++ patch_size_for_spatialtransform = self.patch_size
++
++ self.data_aug_params["scale_range"] = (0.7, 1.4)
++ self.data_aug_params["do_elastic"] = False
++ self.data_aug_params['selected_seg_channels'] = [0]
++ self.data_aug_params['patch_size_for_spatialtransform'] = patch_size_for_spatialtransform
++
++ self.data_aug_params["num_cached_per_thread"] = 2
++
++ def maybe_update_lr(self, epoch=None):
++ """
++ if epoch is not None we overwrite epoch. Else we use epoch = self.epoch + 1
++
++ (maybe_update_lr is called in on_epoch_end which is called before epoch is incremented.
++ Therefore we need to do +1 here)
++
++ :param epoch:
++ :return:
++ """
++ if epoch is None:
++ ep = self.epoch + 1
++ else:
++ ep = epoch
++ self.optimizer.param_groups[0]['lr'] = poly_lr(ep, self.max_num_epochs, self.initial_lr, 0.9)
++ self.print_to_log_file("lr:", np.round(self.optimizer.param_groups[0]['lr'], decimals=6))
++
++ def on_epoch_end(self):
++ """
++ overwrite patient-based early stopping. Always run to 1000 epochs
++ :return:
++ """
++ super().on_epoch_end()
++ continue_training = self.epoch < self.max_num_epochs
++
++ # it can rarely happen that the momentum of nnUNetTrainerV2_plus is too high for some dataset. If at epoch 100 the
++ # estimated validation Dice is still 0 then we reduce the momentum from 0.99 to 0.95
++ if self.epoch == 100:
++ if self.all_val_eval_metrics[-1] == 0:
++ self.optimizer.param_groups[0]["momentum"] = 0.95
++ self.network.apply(InitWeights_He(1e-2))
++ self.print_to_log_file("At epoch 100, the mean foreground Dice was 0. This can be caused by a too "
++ "high momentum. High momentum (0.99) is good for datasets where it works, but "
++ "sometimes causes issues such as this one. 
Momentum has now been reduced to " ++ "0.95 and network weights have been reinitialized") ++ return continue_training ++ ++ def save_checkpoint(self, fname, save_optimizer=True): ++ if self.local_rank == 0: ++ super().save_checkpoint(fname, save_optimizer) ++ ++ def plot_progress(self): ++ if self.local_rank == 0: ++ super().plot_progress() ++ ++ def print_to_log_file(self, *args, also_print_to_console=True): ++ if self.local_rank == 0: ++ super().print_to_log_file(*args, also_print_to_console=also_print_to_console) ++ ++ def run_training(self): ++ """ ++ if we run with -c then we need to set the correct lr for the first epoch, otherwise it will run the first ++ continued epoch with self.initial_lr ++ ++ we also need to make sure deep supervision in the network is enabled for training, thus the wrapper ++ :return: ++ """ ++ self.maybe_update_lr(self.epoch) # if we dont overwrite epoch then self.epoch+1 is used which is not what we ++ # want at the start of the training ++ if isinstance(self.network, DDP): ++ net = self.network.module ++ else: ++ net = self.network ++ ds = net.do_ds ++ net.do_ds = True ++ ret = super().run_training() ++ net.do_ds = ds ++ return ret +diff --git a/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2_hypDDP.py b/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2_hypDDP.py +new file mode 100644 +index 0000000..aab27fe +--- /dev/null ++++ b/pytorch/nnunet/training/network_training/nnUNetPlusPlusTrainerV2_hypDDP.py +@@ -0,0 +1,457 @@ ++# Copyright 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany ++# ++# Licensed under the Apache License, Version 2.0 (the "License"); ++# you may not use this file except in compliance with the License. ++# You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, software ++# distributed under the License is distributed on an "AS IS" BASIS, ++# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++# See the License for the specific language governing permissions and ++# limitations under the License. 
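++# DDP trainer built on nnUNetPlusPlusTrainerV2 that additionally splits the
++# global batch across ranks (set_batch_size_and_oversample) and all-gathers
++# the tp/fp/fn dice statistics from every GPU before computing the batch-dice
++# loss and the online evaluation metrics.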
++ ++from collections import OrderedDict ++from time import sleep ++from typing import Tuple ++ ++import numpy as np ++import torch ++import torch.distributed as dist ++from torch.cuda.amp import autocast ++from torch.nn.parallel import DistributedDataParallel as DDP ++from batchgenerators.utilities.file_and_folder_operations import maybe_mkdir_p, join, subfiles, isfile ++from nnunet.network_architecture.neural_network import SegmentationNetwork ++from nnunet.training.data_augmentation.default_data_augmentation import get_moreDA_augmentation ++from nnunet.training.dataloading.dataset_loading import unpack_dataset ++from nnunet.training.loss_functions.crossentropy import RobustCrossEntropyLoss ++from nnunet.training.loss_functions.dice_loss import get_tp_fp_fn_tn ++from nnunet.training.network_training.nnUNetTrainer import nnUNetTrainer ++from nnunet.training.network_training.nnUNetPlusPlusTrainerV2 import nnUNetPlusPlusTrainerV2 ++from nnunet.utilities.distributed import awesome_allgather_function ++from nnunet.utilities.nd_softmax import softmax_helper ++from nnunet.utilities.tensor_utilities import sum_tensor ++from nnunet.utilities.to_torch import to_cuda, maybe_to_torch ++from torch import nn ++from torch.nn.utils import clip_grad_norm_ ++from torch.optim.lr_scheduler import _LRScheduler ++ ++ ++class nnUNetPlusPlusTrainerV2_hypDDP(nnUNetPlusPlusTrainerV2): ++ def __init__(self, plans_file, fold, local_rank, output_folder=None, dataset_directory=None, batch_dice=True, ++ stage=None, ++ unpack_data=True, deterministic=True, distribute_batch_size=False, fp16=False): ++ super().__init__(plans_file, fold, output_folder, dataset_directory, batch_dice, stage, ++ unpack_data, deterministic, fp16) ++ self.init_args = ( ++ plans_file, fold, local_rank, output_folder, dataset_directory, batch_dice, stage, unpack_data, ++ deterministic, distribute_batch_size, fp16) ++ self.distribute_batch_size = distribute_batch_size ++ np.random.seed(local_rank) ++ torch.manual_seed(local_rank) ++ if torch.cuda.is_available(): ++ torch.cuda.manual_seed_all(local_rank) ++ self.local_rank = local_rank ++ ++ if torch.cuda.is_available(): ++ torch.cuda.set_device(local_rank) ++ dist.init_process_group(backend='nccl', init_method='env://') ++ ++ self.val_loss_ma_alpha = 0.95 ++ self.val_loss_MA = None ++ ++ self.loss = None ++ self.ce_loss = RobustCrossEntropyLoss() ++ ++ self.global_batch_size = None # we need to know this to properly steer oversample ++ ++ def set_batch_size_and_oversample(self): ++ batch_sizes = [] ++ oversample_percents = [] ++ ++ world_size = dist.get_world_size() ++ my_rank = dist.get_rank() ++ ++ if self.distribute_batch_size: ++ self.global_batch_size = self.batch_size ++ else: ++ self.global_batch_size = self.batch_size * world_size ++ ++ batch_size_per_GPU = np.ceil(self.batch_size / world_size).astype(int) ++ ++ for rank in range(world_size): ++ if self.distribute_batch_size: ++ if (rank + 1) * batch_size_per_GPU > self.batch_size: ++ batch_size = batch_size_per_GPU - ((rank + 1) * batch_size_per_GPU - self.batch_size) ++ else: ++ batch_size = batch_size_per_GPU ++ else: ++ batch_size = self.batch_size ++ ++ batch_sizes.append(batch_size) ++ ++ sample_id_low = 0 if len(batch_sizes) == 0 else np.sum(batch_sizes[:-1]) ++ sample_id_high = np.sum(batch_sizes) ++ ++ if sample_id_high / self.global_batch_size < (1 - self.oversample_foreground_percent): ++ oversample_percents.append(0.0) ++ elif sample_id_low / self.global_batch_size > (1 - self.oversample_foreground_percent): ++ 
oversample_percents.append(1.0)
++ else:
++ percent_covered_by_this_rank = sample_id_high / self.global_batch_size - sample_id_low / self.global_batch_size
++ oversample_percent_here = 1 - (((1 - self.oversample_foreground_percent) -
++ sample_id_low / self.global_batch_size) / percent_covered_by_this_rank)
++ oversample_percents.append(oversample_percent_here)
++
++ print("worker", my_rank, "oversample", oversample_percents[my_rank])
++ print("worker", my_rank, "batch_size", batch_sizes[my_rank])
++
++ self.batch_size = batch_sizes[my_rank]
++ self.oversample_foreground_percent = oversample_percents[my_rank]
++
++ def save_checkpoint(self, fname, save_optimizer=True):
++ if self.local_rank == 0:
++ super().save_checkpoint(fname, save_optimizer)
++
++ def plot_progress(self):
++ if self.local_rank == 0:
++ super().plot_progress()
++
++ def print_to_log_file(self, *args, also_print_to_console=True):
++ if self.local_rank == 0:
++ super().print_to_log_file(*args, also_print_to_console=also_print_to_console)
++
++ def process_plans(self, plans):
++ super().process_plans(plans)
++ self.set_batch_size_and_oversample()
++
++ def initialize(self, training=True, force_load_plans=False):
++ """
++ For prediction of test cases just set training=False, this will prevent loading of training data and
++ training batchgenerator initialization
++ :param training:
++ :return:
++ """
++ if not self.was_initialized:
++ maybe_mkdir_p(self.output_folder)
++
++ if force_load_plans or (self.plans is None):
++ self.load_plans_file()
++
++ self.process_plans(self.plans)
++
++ self.setup_DA_params()
++
++ self.folder_with_preprocessed_data = join(self.dataset_directory, self.plans['data_identifier'] +
++ "_stage%d" % self.stage)
++ if training:
++ self.dl_tr, self.dl_val = self.get_basic_generators()
++ if self.unpack_data:
++ if self.local_rank == 0:
++ print("unpacking dataset")
++ unpack_dataset(self.folder_with_preprocessed_data)
++ print("done")
++ else:
++ # we need to wait until worker 0 has finished unpacking
++ npz_files = subfiles(self.folder_with_preprocessed_data, suffix=".npz", join=False)
++ case_ids = [i[:-4] for i in npz_files]
++ all_present = all(
++ [isfile(join(self.folder_with_preprocessed_data, i + ".npy")) for i in case_ids])
++ while not all_present:
++ print("worker", self.local_rank, "is waiting for unpacking")
++ sleep(3)
++ all_present = all(
++ [isfile(join(self.folder_with_preprocessed_data, i + ".npy")) for i in case_ids])
++ # there is some slight chance that there may arise some error because dataloaders are loading a file
++ # that is still being written by worker 0. We ignore this for now and address it only if it becomes
++ # relevant
++ # (this can occur because while worker 0 writes the file is technically present so the other workers
++ # will proceed and eventually try to read it)
++ else:
++ print(
++ "INFO: Not unpacking data! Training may be slow due to that. Pray you are not using 2d or you "
++ "will wait all winter for your model to finish!")
++
++ # setting weights for deep supervision losses
++ net_numpool = len(self.net_num_pool_op_kernel_sizes)
++
++ # we give each output a weight which decreases exponentially (division by 2) as the resolution decreases
++ # this gives higher resolution outputs more weight in the loss
++ weights = np.array([1 / (2 ** i) for i in range(net_numpool)])
++
++ # we don't use the lowest 2 outputs. 
Normalize weights so that they sum to 1 ++ mask = np.array([True if i < net_numpool - 1 else False for i in range(net_numpool)]) ++ weights[~mask] = 0 ++ weights = weights / weights.sum() ++ self.ds_loss_weights = weights ++ ++ seeds_train = np.random.random_integers(0, 99999, self.data_aug_params.get('num_threads')) ++ seeds_val = np.random.random_integers(0, 99999, max(self.data_aug_params.get('num_threads') // 2, 1)) ++ print("seeds train", seeds_train) ++ print("seeds_val", seeds_val) ++ self.tr_gen, self.val_gen = get_moreDA_augmentation(self.dl_tr, self.dl_val, ++ self.data_aug_params[ ++ 'patch_size_for_spatialtransform'], ++ self.data_aug_params, ++ deep_supervision_scales=self.deep_supervision_scales, ++ seeds_train=seeds_train, ++ seeds_val=seeds_val, ++ pin_memory=self.pin_memory) ++ self.print_to_log_file("TRAINING KEYS:\n %s" % (str(self.dataset_tr.keys())), ++ also_print_to_console=False) ++ self.print_to_log_file("VALIDATION KEYS:\n %s" % (str(self.dataset_val.keys())), ++ also_print_to_console=False) ++ else: ++ pass ++ ++ self.initialize_network() ++ self.initialize_optimizer_and_scheduler() ++ self.network = DDP(self.network, device_ids=[self.local_rank]) ++ ++ else: ++ self.print_to_log_file('self.was_initialized is True, not running self.initialize again') ++ self.was_initialized = True ++ ++ def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False): ++ data_dict = next(data_generator) ++ data = data_dict['data'] ++ target = data_dict['target'] ++ ++ data = maybe_to_torch(data) ++ target = maybe_to_torch(target) ++ ++ if torch.cuda.is_available(): ++ data = to_cuda(data, gpu_id=None) ++ target = to_cuda(target, gpu_id=None) ++ ++ self.optimizer.zero_grad() ++ # print('self.fp16=', self.fp16, end=' ') ++ if self.fp16: ++ with autocast(): ++ # print('if', data.shape, len(target), target[0].shape, end=' ') ++ output = self.network(data) ++ # print(len(output), output[0].shape) ++ del data ++ # print(len(output), output[0].shape, target[0].shape) ++ l = self.compute_loss(output, target) ++ ++ if do_backprop: ++ self.amp_grad_scaler.scale(l).backward() ++ self.amp_grad_scaler.unscale_(self.optimizer) ++ torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) ++ self.amp_grad_scaler.step(self.optimizer) ++ self.amp_grad_scaler.update() ++ else: ++ # print('else', data.shape, len(target), target[0].shape, end=' ') ++ output = self.network(data) ++ # print(len(output), output[0].shape) ++ del data ++ l = self.compute_loss(output, target) ++ ++ if do_backprop: ++ l.backward() ++ torch.nn.utils.clip_grad_norm_(self.network.parameters(), 12) ++ self.optimizer.step() ++ ++ if run_online_evaluation: ++ self.run_online_evaluation(output, target) ++ ++ del target ++ ++ return l.detach().cpu().numpy() ++ ++ def compute_loss(self, output, target): ++ total_loss = None ++ length = len(output) ++ # length = 1 ++ for i in range(length): ++ # Starting here it gets spicy! ++ axes = tuple(range(2, len(output[i].size()))) ++ ++ # network does not do softmax. 
We need to do softmax for dice ++ output_softmax = softmax_helper(output[i]) ++ ++ # get the tp, fp and fn terms we need ++ tp, fp, fn, _ = get_tp_fp_fn_tn(output_softmax, target[0], axes, mask=None) ++ # for dice, compute nominator and denominator so that we have to accumulate only 2 instead of 3 variables ++ # do_bg=False in nnUNetTrainer -> [:, 1:] ++ nominator = 2 * tp[:, 1:] ++ denominator = 2 * tp[:, 1:] + fp[:, 1:] + fn[:, 1:] ++ ++ if self.batch_dice: ++ # for DDP we need to gather all nominator and denominator terms from all GPUS to do proper batch dice ++ nominator = awesome_allgather_function.apply(nominator) ++ denominator = awesome_allgather_function.apply(denominator) ++ nominator = nominator.sum(0) ++ denominator = denominator.sum(0) ++ else: ++ pass ++ ++ ce_loss = self.ce_loss(output[i], target[0][:, 0].long()) ++ ++ # we smooth by 1e-5 to penalize false positives if tp is 0 ++ dice_loss = (- (nominator + 1e-5) / (denominator + 1e-5)).mean() ++ if total_loss is None: ++ total_loss = self.ds_loss_weights[i] * (ce_loss + dice_loss) ++ else: ++ total_loss += self.ds_loss_weights[i] * (ce_loss + dice_loss) ++ return total_loss ++ ++ def run_online_evaluation(self, output, target): ++ with torch.no_grad(): ++ num_classes = output[0].shape[1] ++ output_seg = output[0].argmax(1) ++ target = target[0][:, 0] ++ axes = tuple(range(1, len(target.shape))) ++ tp_hard = torch.zeros((target.shape[0], num_classes - 1)).to(output_seg.device.index) ++ fp_hard = torch.zeros((target.shape[0], num_classes - 1)).to(output_seg.device.index) ++ fn_hard = torch.zeros((target.shape[0], num_classes - 1)).to(output_seg.device.index) ++ for c in range(1, num_classes): ++ tp_hard[:, c - 1] = sum_tensor((output_seg == c).float() * (target == c).float(), axes=axes) ++ fp_hard[:, c - 1] = sum_tensor((output_seg == c).float() * (target != c).float(), axes=axes) ++ fn_hard[:, c - 1] = sum_tensor((output_seg != c).float() * (target == c).float(), axes=axes) ++ ++ # tp_hard, fp_hard, fn_hard = get_tp_fp_fn((output_softmax > (1 / num_classes)).float(), target, ++ # axes, None) ++ # print_if_rank0("before allgather", tp_hard.shape) ++ tp_hard = tp_hard.sum(0, keepdim=False)[None] ++ fp_hard = fp_hard.sum(0, keepdim=False)[None] ++ fn_hard = fn_hard.sum(0, keepdim=False)[None] ++ ++ tp_hard = awesome_allgather_function.apply(tp_hard) ++ fp_hard = awesome_allgather_function.apply(fp_hard) ++ fn_hard = awesome_allgather_function.apply(fn_hard) ++ ++ tp_hard = tp_hard.detach().cpu().numpy().sum(0) ++ fp_hard = fp_hard.detach().cpu().numpy().sum(0) ++ fn_hard = fn_hard.detach().cpu().numpy().sum(0) ++ self.online_eval_foreground_dc.append(list((2 * tp_hard) / (2 * tp_hard + fp_hard + fn_hard + 1e-8))) ++ self.online_eval_tp.append(list(tp_hard)) ++ self.online_eval_fp.append(list(fp_hard)) ++ self.online_eval_fn.append(list(fn_hard)) ++ ++ def run_training(self): ++ """ ++ if we run with -c then we need to set the correct lr for the first epoch, otherwise it will run the first ++ continued epoch with self.initial_lr ++ ++ we also need to make sure deep supervision in the network is enabled for training, thus the wrapper ++ :return: ++ """ ++ self.maybe_update_lr(self.epoch) # if we dont overwrite epoch then self.epoch+1 is used which is not what we ++ # want at the start of the training ++ if isinstance(self.network, DDP): ++ net = self.network.module ++ else: ++ net = self.network ++ ds = net.do_ds ++ net.do_ds = True ++ ret = nnUNetTrainer.run_training(self) ++ net.do_ds = ds ++ return ret ++ ++ def validate(self, 
do_mirroring: bool = True, use_sliding_window: bool = True,
++ step_size: float = 0.5, save_softmax: bool = True, use_gaussian: bool = True, overwrite: bool = True,
++ validation_folder_name: str = 'validation_raw', debug: bool = False, all_in_gpu: bool = False,
++ segmentation_export_kwargs: dict = None):
++ if self.local_rank == 0:
++ if isinstance(self.network, DDP):
++ net = self.network.module
++ else:
++ net = self.network
++ ds = net.do_ds
++ net.do_ds = False
++
++ ret = nnUNetTrainer.validate(self, do_mirroring, use_sliding_window, step_size, save_softmax,
++ use_gaussian, overwrite, validation_folder_name, debug, all_in_gpu,
++ segmentation_export_kwargs)
++ net.do_ds = ds
++ return ret
++
++ def predict_preprocessed_data_return_seg_and_softmax(self, data: np.ndarray, do_mirroring: bool = True,
++ mirror_axes: Tuple[int] = None,
++ use_sliding_window: bool = True, step_size: float = 0.5,
++ use_gaussian: bool = True, pad_border_mode: str = 'constant',
++ pad_kwargs: dict = None, all_in_gpu: bool = True,
++ verbose: bool = True, mixed_precision=True, img_name=None,
++ pre_mode=None, fp=None) -> Tuple[np.ndarray, np.ndarray]:
++ if pad_border_mode == 'constant' and pad_kwargs is None:
++ pad_kwargs = {'constant_values': 0}
++
++ if do_mirroring and mirror_axes is None:
++ mirror_axes = self.data_aug_params['mirror_axes']
++
++ if do_mirroring:
++ assert self.data_aug_params["do_mirror"], "Cannot do mirroring as test time augmentation when training " \
++ "was done without mirroring"
++
++ valid = list((SegmentationNetwork, nn.DataParallel, DDP))
++ assert isinstance(self.network, tuple(valid))
++ if isinstance(self.network, DDP):
++ net = self.network.module
++ else:
++ net = self.network
++ ds = net.do_ds
++ net.do_ds = False
++ ret = net.predict_3D(data, do_mirroring, mirror_axes, use_sliding_window, step_size, self.patch_size,
++ self.regions_class_order, use_gaussian, pad_border_mode, pad_kwargs,
++ all_in_gpu, verbose, mixed_precision=mixed_precision)
++ net.do_ds = ds
++ return ret
++
++ def load_checkpoint_ram(self, checkpoint, train=True):
++ """
++ used if the checkpoint is already in RAM
++ :param checkpoint:
++ :param train:
++ :return:
++ """
++ if not self.was_initialized:
++ self.initialize(train)
++
++ new_state_dict = OrderedDict()
++ curr_state_dict_keys = list(self.network.state_dict().keys())
++ # if state dict comes from nn.DataParallel but we use non-parallel model here then the state dict keys do not
++ # match. 
Use heuristic to make it match ++ for k, value in checkpoint['state_dict'].items(): ++ key = k ++ if key not in curr_state_dict_keys: ++ print("duh") ++ key = key[7:] ++ new_state_dict[key] = value ++ ++ if self.fp16: ++ self._maybe_init_amp() ++ if 'amp_grad_scaler' in checkpoint.keys(): ++ self.amp_grad_scaler.load_state_dict(checkpoint['amp_grad_scaler']) ++ ++ self.network.load_state_dict(new_state_dict) ++ self.epoch = checkpoint['epoch'] ++ if train: ++ optimizer_state_dict = checkpoint['optimizer_state_dict'] ++ if optimizer_state_dict is not None: ++ self.optimizer.load_state_dict(optimizer_state_dict) ++ ++ if self.lr_scheduler is not None and hasattr(self.lr_scheduler, 'load_state_dict') and checkpoint[ ++ 'lr_scheduler_state_dict'] is not None: ++ self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict']) ++ ++ if issubclass(self.lr_scheduler.__class__, _LRScheduler): ++ self.lr_scheduler.step(self.epoch) ++ ++ self.all_tr_losses, self.all_val_losses, self.all_val_losses_tr_mode, self.all_val_eval_metrics = checkpoint[ ++ 'plot_stuff'] ++ ++ # after the training is done, the epoch is incremented one more time in my old code. This results in ++ # self.epoch = 1001 for old trained models when the epoch is actually 1000. This causes issues because ++ # len(self.all_tr_losses) = 1000 and the plot function will fail. We can easily detect and correct that here ++ if self.epoch != len(self.all_tr_losses): ++ self.print_to_log_file("WARNING in loading checkpoint: self.epoch != len(self.all_tr_losses). This is " ++ "due to an old bug and should only appear when you are loading old models. New " ++ "models should have this fixed! self.epoch is now set to len(self.all_tr_losses)") ++ self.epoch = len(self.all_tr_losses) ++ self.all_tr_losses = self.all_tr_losses[:self.epoch] ++ self.all_val_losses = self.all_val_losses[:self.epoch] ++ self.all_val_losses_tr_mode = self.all_val_losses_tr_mode[:self.epoch] ++ self.all_val_eval_metrics = self.all_val_eval_metrics[:self.epoch] +diff --git a/pytorch/nnunet/training/network_training/nnUNetTrainer.py b/pytorch/nnunet/training/network_training/nnUNetTrainer.py +index 2dbf815..a20553f 100644 +--- a/pytorch/nnunet/training/network_training/nnUNetTrainer.py ++++ b/pytorch/nnunet/training/network_training/nnUNetTrainer.py +@@ -40,6 +40,7 @@ from nnunet.utilities.nd_softmax import softmax_helper + from nnunet.utilities.tensor_utilities import sum_tensor + from torch import nn + from torch.optim import lr_scheduler ++from nnunet.inference.infer_path import INFERENCE_OUTPUT_FOLDER + + + matplotlib.use("agg") +@@ -482,7 +483,8 @@ class nnUNetTrainer(NetworkTrainer): + use_sliding_window: bool = True, step_size: float = 0.5, + use_gaussian: bool = True, pad_border_mode: str = 'constant', + pad_kwargs: dict = None, all_in_gpu: bool = True, +- verbose: bool = True, mixed_precision: bool = True) -> Tuple[np.ndarray, np.ndarray]: ++ verbose: bool = True, mixed_precision: bool = True, ++ img_name=None, pre_mode=None, fp=None) -> Tuple[np.ndarray, np.ndarray]: + """ + :param data: + :param do_mirroring: +@@ -513,7 +515,7 @@ class nnUNetTrainer(NetworkTrainer): + self.network.eval() + ret = self.network.predict_3D(data, do_mirroring, mirror_axes, use_sliding_window, step_size, self.patch_size, + self.regions_class_order, use_gaussian, pad_border_mode, pad_kwargs, +- all_in_gpu, verbose, mixed_precision=mixed_precision) ++ all_in_gpu, verbose, mixed_precision=mixed_precision, img_name=img_name, pre_mode=pre_mode, fp=fp) + 
self.network.train(current_mode) + return ret + +@@ -533,8 +535,8 @@ class nnUNetTrainer(NetworkTrainer): + self.load_dataset() + self.do_split() + +- if segmentation_export_kwargs is None: +- if 'segmentation_export_params' in self.plans.keys(): ++ if segmentation_export_kwargs is None: # True ++ if 'segmentation_export_params' in self.plans.keys(): # False + force_separate_z = self.plans['segmentation_export_params']['force_separate_z'] + interpolation_order = self.plans['segmentation_export_params']['interpolation_order'] + interpolation_order_z = self.plans['segmentation_export_params']['interpolation_order_z'] +@@ -576,21 +578,21 @@ class nnUNetTrainer(NetworkTrainer): + export_pool = Pool(default_num_threads) + results = [] + +- for k in self.dataset_val.keys(): ++ for k in self.dataset_val.keys(): # k = Liver_101 + properties = load_pickle(self.dataset[k]['properties_file']) +- fname = properties['list_of_data_files'][0].split("/")[-1][:-12] ++ fname = properties['list_of_data_files'][0].split("/")[-1][:-12] # Liver_101 + if overwrite or (not isfile(join(output_folder, fname + ".nii.gz"))) or \ + (save_softmax and not isfile(join(output_folder, fname + ".npz"))): +- data = np.load(self.dataset[k]['data_file'])['data'] ++ data = np.load(self.dataset[k]['data_file'])['data'] # 2 478 470 470 + + print(k, data.shape) + data[-1][data[-1] == -1] = 0 +- ++ simple_name = INFERENCE_OUTPUT_FOLDER + str(k) + '.nii.gz' + softmax_pred = self.predict_preprocessed_data_return_seg_and_softmax(data[:-1], do_mirroring, + mirror_axes, use_sliding_window, + step_size, use_gaussian, + all_in_gpu=all_in_gpu, +- mixed_precision=self.fp16)[1] ++ mixed_precision=self.fp16, img_name=simple_name)[1] + + softmax_pred = softmax_pred.transpose([0] + [i + 1 for i in self.transpose_backward]) + +@@ -620,8 +622,8 @@ class nnUNetTrainer(NetworkTrainer): + ) + ) + +- pred_gt_tuples.append([join(output_folder, fname + ".nii.gz"), +- join(self.gt_niftis_folder, fname + ".nii.gz")]) ++ pred_gt_tuples.append([join(output_folder, fname + ".nii.gz"), # '/data/yupeng/environment_variables/RESULTS_FOLDER/nnUNet/3d_fullres/Task003_Liver/nnUNetPlusPlusTrainerV2__nnUNetPlansv2.1/fold_0/validation_raw' ++ join(self.gt_niftis_folder, fname + ".nii.gz")]) # '/data/yupeng/environment_variables/nnUNet_preprocessed/Task003_Liver/gt_segmentations' + + _ = [i.get() for i in results] + self.print_to_log_file("finished prediction") +diff --git a/pytorch/nnunet/training/network_training/nnUNetTrainerV2_DDP.py b/pytorch/nnunet/training/network_training/nnUNetTrainerV2_DDP.py +index 812183a..9f56d62 100644 +--- a/pytorch/nnunet/training/network_training/nnUNetTrainerV2_DDP.py ++++ b/pytorch/nnunet/training/network_training/nnUNetTrainerV2_DDP.py +@@ -226,10 +226,12 @@ class nnUNetTrainerV2_DDP(nnUNetTrainerV2): + target = to_cuda(target, gpu_id=None) + + self.optimizer.zero_grad() +- ++ # print('self.fp16=', self.fp16, end=' ') + if self.fp16: + with autocast(): ++ # print('if', data.shape, len(target), target[0].shape, end=' ') + output = self.network(data) ++ # print(len(output), output[0].shape) + del data + l = self.compute_loss(output, target) + +@@ -240,7 +242,9 @@ class nnUNetTrainerV2_DDP(nnUNetTrainerV2): + self.amp_grad_scaler.step(self.optimizer) + self.amp_grad_scaler.update() + else: ++ # print('else', data.shape, len(target), target[0].shape, end=' ') + output = self.network(data) ++ # print(len(output), output[0].shape) + del data + l = self.compute_loss(output, target) + +diff --git a/pytorch/run.sh b/pytorch/run.sh +new 
file mode 100644 +index 0000000..0abb8d5 +--- /dev/null ++++ b/pytorch/run.sh +@@ -0,0 +1,5 @@ ++python nnunet/run/run_training.py 3d_fullres nnUNetPlusPlusTrainerV2_DDP Task003_Liver 0 ++ ++ ++python -m torch.distributed.launch --nproc_per_node 2 nnunet/run/run_training_DDP.py 3d_fullres nnUNetPlusPlusTrainerV2_DDP Task003_Liver 0 ++ +diff --git a/pytorch/setup.py b/pytorch/setup.py +index 590a453..554f4e2 100644 +--- a/pytorch/setup.py ++++ b/pytorch/setup.py +@@ -9,13 +9,13 @@ setup(name='nnunet', + author_email='f.isensee@dkfz-heidelberg.de', + license='Apache License Version 2.0, January 2004', + install_requires=[ +- "torch>=1.6.0a", ++ "torch>=1.6.0", + "tqdm", + "dicom2nifti", + "scikit-image>=0.14", + "medpy", + "scipy", +- "batchgenerators>=0.21", ++ "batchgenerators==0.21", + "numpy", + "sklearn", + "SimpleITK", diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/onnx_infer.py b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/onnx_infer.py new file mode 100644 index 0000000000000000000000000000000000000000..9c47c62f4b186a1a90a2365ebac23a3aafdcb675 --- /dev/null +++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/onnx_infer.py @@ -0,0 +1,91 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
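+# Rough ONNX Runtime latency benchmark: runs `infer_times` forward passes on
+# random input tensors of the shape given on the command line and reports the
+# mean latency after discarding the first `ignore_times` warm-up runs, e.g.
+# (the shape below is only an illustration):
+#     python3 onnx_infer.py model.onnx 1,1,128,128,128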
+ +import sys +import time +import numpy as np +import onnxruntime +from tqdm import tqdm + + +all_time = 0 +infer_times = 200 +ignore_times = 10 +curr_time = 0 +assert infer_times > ignore_times + +def display_time(func): + def wrapper(*args): + t1 = time.time() + req = func(*args) + t2 = time.time() + spent_time = t2 - t1 + print("Single time: {:.4}s".format(spent_time)) + global all_time, curr_time + curr_time += 1 + if curr_time > ignore_times: + all_time += spent_time + return req + return wrapper + + +class ONNXModel(): + def __init__(self, onnx_path): + # providers: TensorrtExecutionProvider/CUDAExecutionProvider/CPUExecutionProvider + self.onnx_session = onnxruntime.InferenceSession(onnx_path, providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider']) + self.input_name = self.get_input_name(self.onnx_session) + self.output_name = self.get_output_name(self.onnx_session) + self.input_feed = None + + def get_output_name(self, onnx_session): + output_name = [] + for node in onnx_session.get_outputs(): + output_name.append(node.name) + return output_name + + def get_input_name(self, onnx_session): + input_name = [] + for node in onnx_session.get_inputs(): + input_name.append(node.name) + return input_name + + def get_input_feed(self, image_numpy): + input_feed = {} + for name in self.input_name: + input_feed[name] = image_numpy + self.input_feed = input_feed + + @display_time + def forward(self): + self.onnx_session.run(self.output_name, input_feed=self.input_feed) + + +def create_random_input(input_shape, dtype=np.float32): + input_data = np.random.random(input_shape).astype(dtype) + return input_data + + +if __name__ == '__main__': + model_file = sys.argv[1] + input_shape = sys.argv[2] + np.random.seed(123) + input_shape = list(map(int, input_shape.split(','))) + net = ONNXModel(model_file) + + for _ in tqdm(range(infer_times)): + input_data = create_random_input(input_shape) + net.get_input_feed(input_data) + net.forward() + + print("Average time spent: {:.4}s".format(all_time / (infer_times - ignore_times))) diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/requirements.txt b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..07bda18e71fe40802c2df7cfa6eb66fbde93a2d6 --- /dev/null +++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/requirements.txt @@ -0,0 +1,51 @@ +auto-tune @ file:///root/selfgz406448609/compiler/lib64/auto_tune-0.1.0-py3-none-any.whl +batchgenerators==0.21 +certifi==2021.10.8 +cycler==0.11.0 +decorator==5.1.1 +dicom2nifti==2.3.2 +fonttools==4.29.1 +future==0.18.2 +hccl @ file:///root/selfgz406448609/compiler/lib64/hccl-0.1.0-py3-none-any.whl +imageio==2.16.1 +joblib==1.1.0 +kiwisolver==1.3.2 +linecache2==1.0.0 +matplotlib==3.5.1 +MedPy==0.4.0 +mkl-fft==1.3.1 +mkl-random @ file:///tmp/build/80754af9/mkl_random_1626186064646/work +mkl-service==2.4.0 +mpmath==1.2.1 +networkx==2.6.3 +nibabel==3.2.2 +-e git+https://github.com/MrGiovanni/UNetPlusPlus.git@e145ba63862982bf1099cf2ec11d5466b434ae0b#egg=nnunet&subdirectory=pytorch +numpy @ file:///tmp/build/80754af9/numpy_and_numpy_base_1634095647912/work +olefile @ file:///Users/ktietz/demo/mc3/conda-bld/olefile_1629805411829/work +op-gen @ file:///usr/local/Ascend/ascend-toolkit/5.0.4/x86_64-linux/toolkit/tools/op_gen-0.1-py3-none-any.whl +op-test-frame @ file:///usr/local/Ascend/ascend-toolkit/5.0.4/x86_64-linux/toolkit/tools/op_test_frame-0.1-py3-none-any.whl +packaging==21.3 +pandas==1.4.1 +Pillow==8.4.0 
+pydicom==2.2.2 +pyparsing==3.0.7 +python-dateutil==2.8.2 +pytz==2021.3 +PyWavelets==1.2.0 +schedule-search @ file:///root/selfgz406448609/compiler/lib64/schedule_search-0.1.0-py3-none-any.whl +scikit-image==0.19.2 +scikit-learn==1.0.2 +scipy==1.8.0 +SimpleITK==2.1.1 +six @ file:///tmp/build/80754af9/six_1644875935023/work +sklearn==0.0 +sympy==1.9 +te @ file:///root/selfgz406448609/compiler/lib64/te-0.4.0-py3-none-any.whl +threadpoolctl==3.1.0 +tifffile==2022.2.9 +topi @ file:///root/selfgz406448609/compiler/lib64/topi-0.4.0-py3-none-any.whl +torch==1.6.0 +torchvision==0.7.0 +tqdm==4.63.0 +traceback2==1.4.0 +unittest2==1.1.0 diff --git a/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/set_env.sh b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/set_env.sh new file mode 100644 index 0000000000000000000000000000000000000000..b79cc18a90e8a6fd6f228ccbf1798ba96b6cc614 --- /dev/null +++ b/ACL_PyTorch/contrib/cv/segmentation/3D_Nested_Unet/set_env.sh @@ -0,0 +1,9 @@ +export LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/ascend-toolkit/latest/atc/lib64:$LD_LIBRARY_PATH +export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/toolkit/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/atc/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/pyACL/python/site-packages/acl:$PYTHONPATH +export PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/bin:/usr/local/Ascend/ascend-toolkit/latest/atc/bin:/usr/local/Ascend/ascend-toolkit/latest/atc/ccec_compiler/bin:$PATH +export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest +export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp +export TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit +export DDK_PATH=/home/usr/local/Ascend/ascend-toolkit/latest +export NPU_HOST_LIB=/usr/local/Ascend/ascend-toolkit/latest/acllib/lib64/stub