From 21b08de200d7cfc35d4a32878cf75562470eaaec Mon Sep 17 00:00:00 2001
From: Raindrop <1710404@mail.nankai.edu.cn>
Date: Mon, 21 Mar 2022 15:42:58 +0800
Subject: [PATCH] Add initial Jasper (PyTorch) implementation
---
PyTorch/contrib/audio/jasper/.keep | 0
.../Jasper_pytorch_wuqingdian01/.dockerignore | 8 +
.../Jasper_pytorch_wuqingdian01/.gitignore | 9 +
.../jasper/Jasper_pytorch_wuqingdian01/.keep | 0
.../Jasper_pytorch_wuqingdian01/Dockerfile | 30 +
.../Jasper_pytorch_wuqingdian01/LICENSE | 203 ++
.../jasper/Jasper_pytorch_wuqingdian01/NOTICE | 5 +
.../Jasper_pytorch_wuqingdian01/README.md | 49 +
.../Jasper_pytorch_wuqingdian01/README_raw.md | 843 ++++++
.../apex/.gitignore | 147 +
.../apex/.gitmodules | 7 +
.../apex/.nojekyll | 0
.../Jasper_pytorch_wuqingdian01/apex/LICENSE | 11 +
.../apex/README.md | 146 +
.../apex/apex/RNN/README.md | 1 +
.../apex/apex/RNN/RNNBackend.py | 365 +++
.../apex/apex/RNN/__init__.py | 3 +
.../apex/apex/RNN/cells.py | 84 +
.../apex/apex/RNN/models.py | 54 +
.../apex/apex/__init__.py | 20 +
.../apex/apex/_autocast_utils.py | 17 +
.../apex/apex/amp/README.md | 72 +
.../apex/apex/amp/__init__.py | 5 +
.../apex/apex/amp/__version__.py | 2 +
.../apex/apex/amp/_amp_state.py | 69 +
.../apex/apex/amp/_initialize.py | 263 ++
.../apex/apex/amp/_process_optimizer.py | 489 +++
.../apex/apex/amp/amp.py | 177 ++
.../apex/apex/amp/compat.py | 46 +
.../apex/apex/amp/frontend.py | 442 +++
.../apex/apex/amp/handle.py | 281 ++
.../apex/apex/amp/lists/__init__.py | 0
.../apex/amp/lists/functional_overrides.py | 80 +
.../apex/apex/amp/lists/tensor_overrides.py | 63 +
.../apex/apex/amp/lists/torch_overrides.py | 115 +
.../apex/apex/amp/opt.py | 103 +
.../apex/apex/amp/rnn_compat.py | 53 +
.../apex/apex/amp/scaler.py | 217 ++
.../apex/apex/amp/utils.py | 210 ++
.../apex/apex/amp/wrap.py | 276 ++
.../apex/apex/contrib/__init__.py | 0
.../apex/apex/contrib/bottleneck/__init__.py | 1 +
.../apex/contrib/bottleneck/bottleneck.py | 512 ++++
.../bottleneck/bottleneck_module_test.py | 272 ++
.../apex/apex/contrib/bottleneck/test.py | 71 +
.../contrib/csrc/bottleneck/bottleneck.cpp | 2486 +++++++++++++++
.../apex/apex/contrib/csrc/fmha/fmha_api.cpp | 432 +++
.../apex/apex/contrib/csrc/fmha/src/fmha.h | 125 +
.../apex/contrib/csrc/fmha/src/fmha/gemm.h | 317 ++
.../contrib/csrc/fmha/src/fmha/gmem_tile.h | 428 +++
.../csrc/fmha/src/fmha/kernel_traits.h | 95 +
.../apex/contrib/csrc/fmha/src/fmha/mask.h | 76 +
.../contrib/csrc/fmha/src/fmha/smem_tile.h | 1288 ++++++++
.../apex/contrib/csrc/fmha/src/fmha/softmax.h | 478 +++
.../apex/contrib/csrc/fmha/src/fmha/utils.h | 953 ++++++
.../src/fmha_dgrad_fp16_128_64_kernel.sm80.cu | 60 +
.../src/fmha_dgrad_fp16_256_64_kernel.sm80.cu | 60 +
.../src/fmha_dgrad_fp16_384_64_kernel.sm80.cu | 60 +
.../src/fmha_dgrad_fp16_512_64_kernel.sm80.cu | 105 +
.../fmha/src/fmha_dgrad_kernel_1xN_reload.h | 558 ++++
.../src/fmha_dgrad_kernel_1xN_reload_nl.h | 571 ++++
.../src/fmha_fprop_fp16_128_64_kernel.sm80.cu | 58 +
.../src/fmha_fprop_fp16_256_64_kernel.sm80.cu | 58 +
.../src/fmha_fprop_fp16_384_64_kernel.sm80.cu | 57 +
.../src/fmha_fprop_fp16_512_64_kernel.sm80.cu | 98 +
.../csrc/fmha/src/fmha_fprop_kernel_1xN.h | 336 +++
.../csrc/fmha/src/fmha_fprop_kernel_1xN_nl.h | 343 +++
.../fmha/src/fmha_fprop_kernel_1xN_reload_v.h | 322 ++
.../apex/contrib/csrc/fmha/src/fmha_kernel.h | 169 ++
.../csrc/fmha/src/fmha_noloop_reduce.cu | 177 ++
.../apex/contrib/csrc/fmha/src/fmha_utils.h | 92 +
.../apex/contrib/csrc/groupbn/batch_norm.cu | 328 ++
.../apex/contrib/csrc/groupbn/batch_norm.h | 735 +++++
.../csrc/groupbn/batch_norm_add_relu.cu | 340 +++
.../csrc/groupbn/batch_norm_add_relu.h | 682 +++++
.../apex/contrib/csrc/groupbn/cuda_utils.h | 20 +
.../apex/contrib/csrc/groupbn/interface.cpp | 175 ++
.../apex/apex/contrib/csrc/groupbn/ipc.cu | 129 +
.../csrc/groupbn/nhwc_batch_norm_kernel.h | 2685 +++++++++++++++++
.../apex/apex/contrib/csrc/layer_norm/ln.h | 200 ++
.../apex/contrib/csrc/layer_norm/ln_api.cpp | 246 ++
.../csrc/layer_norm/ln_bwd_kernels.cuh | 315 ++
.../layer_norm/ln_bwd_semi_cuda_kernel.cu | 234 ++
.../csrc/layer_norm/ln_fwd_cuda_kernel.cu | 222 ++
.../csrc/layer_norm/ln_fwd_kernels.cuh | 110 +
.../csrc/layer_norm/ln_kernel_traits.h | 159 +
.../apex/contrib/csrc/layer_norm/ln_utils.cuh | 733 +++++
.../additive_masked_softmax_dropout.cpp | 91 +
.../additive_masked_softmax_dropout_cuda.cu | 131 +
.../contrib/csrc/multihead_attn/dropout.h | 306 ++
.../multihead_attn/encdec_multihead_attn.cpp | 156 +
.../encdec_multihead_attn_cuda.cu | 556 ++++
.../encdec_multihead_attn_norm_add.cpp | 198 ++
.../encdec_multihead_attn_norm_add_cuda.cu | 640 ++++
.../contrib/csrc/multihead_attn/layer_norm.h | 740 +++++
.../multihead_attn/masked_softmax_dropout.cpp | 93 +
.../masked_softmax_dropout_cuda.cu | 147 +
.../apex/contrib/csrc/multihead_attn/philox.h | 90 +
.../multihead_attn/self_multihead_attn.cpp | 132 +
.../self_multihead_attn_bias.cpp | 139 +
...self_multihead_attn_bias_additive_mask.cpp | 143 +
..._multihead_attn_bias_additive_mask_cuda.cu | 463 +++
.../self_multihead_attn_bias_cuda.cu | 471 +++
.../self_multihead_attn_cuda.cu | 469 +++
.../self_multihead_attn_norm_add.cpp | 173 ++
.../self_multihead_attn_norm_add_cuda.cu | 556 ++++
.../contrib/csrc/multihead_attn/softmax.h | 2641 ++++++++++++++++
.../multihead_attn/strided_batched_gemm.h | 407 +++
.../csrc/optimizers/fused_adam_cuda.cpp | 86 +
.../csrc/optimizers/fused_adam_cuda_kernel.cu | 1037 +++++++
.../csrc/optimizers/fused_lamb_cuda.cpp | 21 +
.../csrc/optimizers/fused_lamb_cuda_kernel.cu | 294 ++
.../optimizers/multi_tensor_distopt_adam.cpp | 20 +
.../multi_tensor_distopt_adam_kernel.cu | 228 ++
.../optimizers/multi_tensor_distopt_lamb.cpp | 36 +
.../multi_tensor_distopt_lamb_kernel.cu | 506 ++++
.../csrc/transducer/transducer_joint.cpp | 98 +
.../transducer/transducer_joint_kernel.cu | 973 ++++++
.../csrc/transducer/transducer_loss.cpp | 109 +
.../csrc/transducer/transducer_loss_kernel.cu | 767 +++++
.../apex/contrib/csrc/xentropy/interface.cpp | 52 +
.../contrib/csrc/xentropy/xentropy_kernel.cu | 718 +++++
.../func_test_multihead_attn.py | 108 +
.../perf_test_multihead_attn.py | 115 +
.../apex/apex/contrib/fmha/__init__.py | 1 +
.../apex/apex/contrib/fmha/fmha.py | 74 +
.../apex/apex/contrib/groupbn/__init__.py | 9 +
.../apex/apex/contrib/groupbn/batch_norm.py | 225 ++
.../apex/apex/contrib/layer_norm/__init__.py | 1 +
.../apex/contrib/layer_norm/layer_norm.py | 53 +
.../apex/contrib/multihead_attn/README.md | 60 +
.../apex/contrib/multihead_attn/__init__.py | 3 +
.../multihead_attn/encdec_multihead_attn.py | 141 +
.../encdec_multihead_attn_func.py | 268 ++
.../fast_encdec_multihead_attn_func.py | 88 +
...ast_encdec_multihead_attn_norm_add_func.py | 130 +
.../fast_self_multihead_attn_func.py | 196 ++
.../fast_self_multihead_attn_norm_add_func.py | 106 +
.../mask_softmax_dropout_func.py | 81 +
.../multihead_attn/self_multihead_attn.py | 178 ++
.../self_multihead_attn_func.py | 235 ++
.../apex/apex/contrib/optimizers/__init__.py | 3 +
.../optimizers/distributed_fused_adam.py | 636 ++++
.../optimizers/distributed_fused_adam_v2.py | 615 ++++
.../optimizers/distributed_fused_adam_v3.py | 325 ++
.../optimizers/distributed_fused_lamb.py | 975 ++++++
.../apex/contrib/optimizers/fp16_optimizer.py | 243 ++
.../apex/contrib/optimizers/fused_adam.py | 206 ++
.../apex/contrib/optimizers/fused_lamb.py | 208 ++
.../apex/apex/contrib/optimizers/fused_sgd.py | 211 ++
.../apex/apex/contrib/sparsity/README.md | 78 +
.../apex/apex/contrib/sparsity/__init__.py | 2 +
.../apex/apex/contrib/sparsity/asp.py | 217 ++
.../apex/contrib/sparsity/sparse_masklib.py | 184 ++
.../sparsity/test/checkpointing_test_part1.py | 94 +
.../sparsity/test/checkpointing_test_part2.py | 79 +
.../test/checkpointing_test_reference.py | 96 +
.../apex/contrib/sparsity/test/toy_problem.py | 87 +
.../apex/apex/contrib/test/fmha/test_fmha.py | 121 +
.../test/fused_dense/test_fused_dense.py | 44 +
.../test/layer_norm/test_fast_layer_norm.py | 275 ++
.../test_encdec_multihead_attn.py | 136 +
.../test_encdec_multihead_attn_norm_add.py | 77 +
.../test_fast_self_multihead_attn_bias.py | 77 +
.../multihead_attn/test_mha_fused_softmax.py | 42 +
.../test_self_multihead_attn.py | 130 +
.../test_self_multihead_attn_norm_add.py | 72 +
.../apex/contrib/test/test_label_smoothing.py | 128 +
.../test/transducer/test_transducer_joint.py | 157 +
.../test/transducer/test_transducer_loss.py | 133 +
.../contrib/test/transducer/transducer_ref.py | 112 +
.../apex/apex/contrib/transducer/__init__.py | 2 +
.../apex/contrib/transducer/transducer.py | 195 ++
.../apex/apex/contrib/xentropy/__init__.py | 9 +
.../apex/contrib/xentropy/softmax_xentropy.py | 28 +
.../apex/apex/fp16_utils/README.md | 16 +
.../apex/apex/fp16_utils/__init__.py | 16 +
.../apex/apex/fp16_utils/fp16_optimizer.py | 554 ++++
.../apex/apex/fp16_utils/fp16util.py | 187 ++
.../apex/apex/fp16_utils/loss_scaler.py | 186 ++
.../apex/apex/fused_dense/__init__.py | 1 +
.../apex/apex/fused_dense/fused_dense.py | 85 +
.../apex/apex/mlp/__init__.py | 1 +
.../apex/apex/mlp/mlp.py | 79 +
.../apex/apex/multi_tensor_apply/__init__.py | 4 +
.../multi_tensor_apply/multi_tensor_apply.py | 30 +
.../apex/apex/normalization/__init__.py | 1 +
.../apex/normalization/fused_layer_norm.py | 218 ++
.../apex/apex/optimizers/__init__.py | 5 +
.../apex/apex/optimizers/fused_adagrad.py | 122 +
.../apex/apex/optimizers/fused_adam.py | 173 ++
.../apex/apex/optimizers/fused_lamb.py | 215 ++
.../apex/apex/optimizers/fused_novograd.py | 214 ++
.../apex/apex/optimizers/fused_sgd.py | 227 ++
.../apex/apex/parallel/LARC.py | 107 +
.../apex/apex/parallel/README.md | 66 +
.../apex/apex/parallel/__init__.py | 95 +
.../apex/apex/parallel/distributed.py | 639 ++++
.../apex/apex/parallel/multiproc.py | 35 +
.../apex/parallel/optimized_sync_batchnorm.py | 85 +
.../optimized_sync_batchnorm_kernel.py | 119 +
.../apex/apex/parallel/sync_batchnorm.py | 134 +
.../apex/parallel/sync_batchnorm_kernel.py | 87 +
.../apex/apex/pyprof/FAQs.md | 21 +
.../apex/apex/pyprof/README.md | 252 ++
.../apex/apex/pyprof/__init__.py | 3 +
.../apex/apex/pyprof/examples/.gitignore | 4 +
.../apex/apex/pyprof/examples/apex/README.md | 1 +
.../apex/pyprof/examples/apex/fused_adam.py | 20 +
.../pyprof/examples/apex/fused_layer_norm.py | 28 +
.../apex/apex/pyprof/examples/apex/test.sh | 30 +
.../examples/custom_func_module/README.md | 1 +
.../custom_func_module/custom_function.py | 33 +
.../custom_func_module/custom_module.py | 27 +
.../examples/custom_func_module/test.sh | 30 +
.../apex/pyprof/examples/imagenet/imagenet.py | 135 +
.../apex/pyprof/examples/imagenet/test.sh | 36 +
.../apex/apex/pyprof/examples/jit/README.md | 14 +
.../examples/jit/jit_script_function.py | 30 +
.../pyprof/examples/jit/jit_script_method.py | 31 +
.../pyprof/examples/jit/jit_trace_function.py | 30 +
.../pyprof/examples/jit/jit_trace_method.py | 36 +
.../apex/apex/pyprof/examples/jit/test.sh | 30 +
.../apex/apex/pyprof/examples/lenet.py | 65 +
.../apex/apex/pyprof/examples/operators.py | 145 +
.../apex/apex/pyprof/examples/simple.py | 38 +
.../pyprof/examples/user_annotation/README.md | 21 +
.../pyprof/examples/user_annotation/resnet.py | 215 ++
.../pyprof/examples/user_annotation/test.sh | 31 +
.../apex/apex/pyprof/nvtx/__init__.py | 2 +
.../apex/apex/pyprof/nvtx/nvmarker.py | 222 ++
.../apex/apex/pyprof/parse/__init__.py | 0
.../apex/apex/pyprof/parse/__main__.py | 10 +
.../apex/apex/pyprof/parse/db.py | 61 +
.../apex/apex/pyprof/parse/kernel.py | 210 ++
.../apex/apex/pyprof/parse/nvvp.py | 282 ++
.../apex/apex/pyprof/parse/parse.py | 122 +
.../apex/apex/pyprof/prof/__init__.py | 1 +
.../apex/apex/pyprof/prof/__main__.py | 10 +
.../apex/apex/pyprof/prof/activation.py | 65 +
.../apex/apex/pyprof/prof/base.py | 47 +
.../apex/apex/pyprof/prof/blas.py | 340 +++
.../apex/apex/pyprof/prof/conv.py | 236 ++
.../apex/apex/pyprof/prof/convert.py | 62 +
.../apex/apex/pyprof/prof/data.py | 54 +
.../apex/apex/pyprof/prof/dropout.py | 50 +
.../apex/apex/pyprof/prof/embedding.py | 71 +
.../pyprof/prof/index_slice_join_mutate.py | 419 +++
.../apex/apex/pyprof/prof/linear.py | 188 ++
.../apex/apex/pyprof/prof/loss.py | 84 +
.../apex/apex/pyprof/prof/misc.py | 219 ++
.../apex/apex/pyprof/prof/normalization.py | 54 +
.../apex/apex/pyprof/prof/optim.py | 65 +
.../apex/apex/pyprof/prof/output.py | 149 +
.../apex/apex/pyprof/prof/pointwise.py | 166 +
.../apex/apex/pyprof/prof/pooling.py | 59 +
.../apex/apex/pyprof/prof/prof.py | 256 ++
.../apex/apex/pyprof/prof/randomSample.py | 43 +
.../apex/apex/pyprof/prof/recurrentCell.py | 207 ++
.../apex/apex/pyprof/prof/reduction.py | 150 +
.../apex/apex/pyprof/prof/softmax.py | 115 +
.../apex/apex/pyprof/prof/usage.py | 73 +
.../apex/apex/pyprof/prof/utility.py | 60 +
.../apex/apex/reparameterization/README.md | 1 +
.../apex/apex/reparameterization/__init__.py | 127 +
.../reparameterization/reparameterization.py | 151 +
.../apex/reparameterization/weight_norm.py | 78 +
.../apex/apex/transformer/README.md | 81 +
.../apex/apex/transformer/__init__.py | 21 +
.../apex/apex/transformer/enums.py | 30 +
.../apex/transformer/functional/__init__.py | 5 +
.../transformer/functional/fused_softmax.py | 199 ++
.../apex/apex/transformer/microbatches.py | 172 ++
.../apex/apex/transformer/parallel_state.py | 382 +++
.../transformer/pipeline_parallel/__init__.py | 8 +
.../transformer/pipeline_parallel/_timers.py | 83 +
.../pipeline_parallel/p2p_communication.py | 398 +++
.../pipeline_parallel/schedules/__init__.py | 39 +
.../pipeline_parallel/schedules/common.py | 218 ++
.../schedules/fwd_bwd_no_pipelining.py | 81 +
.../fwd_bwd_pipelining_with_interleaving.py | 298 ++
...fwd_bwd_pipelining_without_interleaving.py | 175 ++
.../transformer/pipeline_parallel/utils.py | 317 ++
.../transformer/tensor_parallel/__init__.py | 74 +
.../tensor_parallel/cross_entropy.py | 103 +
.../apex/transformer/tensor_parallel/data.py | 113 +
.../transformer/tensor_parallel/layers.py | 477 +++
.../transformer/tensor_parallel/mappings.py | 159 +
.../transformer/tensor_parallel/memory.py | 136 +
.../transformer/tensor_parallel/random.py | 294 ++
.../apex/transformer/tensor_parallel/utils.py | 54 +
.../apex/apex/transformer/testing/__init__.py | 0
.../apex/transformer/testing/arguments.py | 806 +++++
.../apex/apex/transformer/testing/commons.py | 87 +
.../apex/transformer/testing/global_vars.py | 260 ++
.../transformer/testing/standalone_gpt.py | 1506 +++++++++
.../apex/apex/transformer/utils.py | 47 +
.../apex/csrc/amp_C_frontend.cpp | 145 +
.../apex/csrc/compat.h | 9 +
.../apex/csrc/flatten_unflatten.cpp | 18 +
.../apex/csrc/fused_dense.cpp | 192 ++
.../apex/csrc/fused_dense_cuda.cu | 1437 +++++++++
.../apex/csrc/layer_norm_cuda.cpp | 267 ++
.../apex/csrc/layer_norm_cuda_kernel.cu | 830 +++++
.../csrc/megatron/scaled_masked_softmax.cpp | 97 +
.../csrc/megatron/scaled_masked_softmax.h | 505 ++++
.../megatron/scaled_masked_softmax_cuda.cu | 117 +
.../scaled_upper_triang_masked_softmax.cpp | 72 +
.../scaled_upper_triang_masked_softmax.h | 513 ++++
...scaled_upper_triang_masked_softmax_cuda.cu | 98 +
.../apex/csrc/mlp.cpp | 166 +
.../apex/csrc/mlp_cuda.cu | 1678 ++++++++++
.../apex/csrc/multi_tensor_adagrad.cu | 100 +
.../apex/csrc/multi_tensor_adam.cu | 171 ++
.../apex/csrc/multi_tensor_apply.cuh | 133 +
.../apex/csrc/multi_tensor_axpby_kernel.cu | 157 +
.../apex/csrc/multi_tensor_l2norm_kernel.cu | 448 +++
.../csrc/multi_tensor_l2norm_scale_kernel.cu | 326 ++
.../apex/csrc/multi_tensor_lamb.cu | 413 +++
.../apex/csrc/multi_tensor_lamb_stage_1.cu | 151 +
.../apex/csrc/multi_tensor_lamb_stage_2.cu | 125 +
.../apex/csrc/multi_tensor_novograd.cu | 188 ++
.../apex/csrc/multi_tensor_scale_kernel.cu | 136 +
.../apex/csrc/multi_tensor_sgd_kernel.cu | 280 ++
.../apex/csrc/syncbn.cpp | 109 +
.../apex/csrc/type_shim.h | 387 +++
.../apex/csrc/welford.cu | 1510 +++++++++
.../apex/docs/Makefile | 32 +
.../docs/source/_static/css/pytorch_theme.css | 118 +
.../apex/docs/source/_templates/layout.html | 51 +
.../apex/docs/source/conf.py | 248 ++
.../apex/examples/README.md | 4 +
.../apex/examples/dcgan/README.md | 41 +
.../apex/examples/dcgan/main_amp.py | 274 ++
.../apex/examples/docker/Dockerfile | 16 +
.../apex/examples/docker/README.md | 40 +
.../apex/examples/imagenet/README.md | 183 ++
.../apex/examples/imagenet/main_amp.py | 543 ++++
.../examples/simple/distributed/README.md | 13 +
.../distributed/distributed_data_parallel.py | 65 +
.../apex/examples/simple/distributed/run.sh | 2 +
.../apex/requirements.txt | 5 +
.../apex/requirements_dev.txt | 3 +
.../Jasper_pytorch_wuqingdian01/apex/setup.py | 544 ++++
.../apex/tests/L0/run_amp/__init__.py | 0
.../tests/L0/run_amp/test_add_param_group.py | 148 +
.../apex/tests/L0/run_amp/test_basic_casts.py | 143 +
.../apex/tests/L0/run_amp/test_cache.py | 137 +
.../tests/L0/run_amp/test_checkpointing.py | 267 ++
.../apex/tests/L0/run_amp/test_fused_sgd.py | 794 +++++
.../apex/tests/L0/run_amp/test_larc.py | 53 +
.../L0/run_amp/test_multi_tensor_axpby.py | 180 ++
.../L0/run_amp/test_multi_tensor_l2norm.py | 87 +
.../L0/run_amp/test_multi_tensor_scale.py | 126 +
.../test_multiple_models_optimizers_losses.py | 762 +++++
.../apex/tests/L0/run_amp/test_promotion.py | 75 +
.../apex/tests/L0/run_amp/test_rnn.py | 116 +
.../apex/tests/L0/run_amp/utils.py | 21 +
.../apex/tests/L0/run_fp16util/__init__.py | 0
.../tests/L0/run_fp16util/test_fp16util.py | 75 +
.../test_fused_layer_norm.py | 143 +
.../apex/tests/L0/run_mlp/test_mlp.py | 218 ++
.../apex/tests/L0/run_optimizers/__init__.py | 0
.../tests/L0/run_optimizers/test_dist_adam.py | 183 ++
.../L0/run_optimizers/test_fused_novograd.py | 170 ++
.../L0/run_optimizers/test_fused_optimizer.py | 284 ++
.../apex/tests/L0/run_optimizers/test_lamb.py | 270 ++
.../apex/tests/L0/run_pyprof_data/__init__.py | 0
.../L0/run_pyprof_data/test_pyprof_data.py | 43 +
.../apex/tests/L0/run_pyprof_nvtx/__init__.py | 1 +
.../L0/run_pyprof_nvtx/test_pyprof_nvtx.py | 526 ++++
.../apex/tests/L0/run_test.py | 72 +
.../apex/tests/L0/run_transformer/__init__.py | 0
.../run_transformer/run_cross_entropy_test.py | 110 +
.../tests/L0/run_transformer/run_data_test.py | 95 +
.../L0/run_transformer/run_initialize_test.py | 104 +
.../L0/run_transformer/run_layers_test.py | 696 +++++
.../L0/run_transformer/run_mappings_test.py | 63 +
.../run_megatron_gpt_pipeline.py | 132 +
.../run_pipeline_parallel_test.py | 198 ++
.../L0/run_transformer/run_random_test.py | 213 ++
.../L0/run_transformer/run_utils_test.py | 22 +
.../L0/run_transformer/test_batch_sampler.py | 149 +
.../L0/run_transformer/test_fused_softmax.py | 168 ++
.../test_transformer_module.py | 89 +
.../apex/tests/L1/common/compare.py | 64 +
.../apex/tests/L1/common/main_amp.py | 526 ++++
.../apex/tests/L1/common/run_test.sh | 144 +
.../apex/tests/L1/cross_product/run.sh | 6 +
.../tests/L1/cross_product_distributed/run.sh | 4 +
.../DDP/ddp_race_condition_test.py | 69 +
.../tests/distributed/DDP/run_race_test.sh | 3 +
.../amp_master_params/amp_master_params.py | 70 +
.../distributed/amp_master_params/compare.py | 28 +
.../distributed/amp_master_params/run.sh | 4 +
.../python_single_gpu_unit_test.py | 111 +
.../synced_batchnorm/single_gpu_unit_test.py | 159 +
.../synced_batchnorm/test_batchnorm1d.py | 18 +
.../synced_batchnorm/test_groups.py | 185 ++
.../two_gpu_test_different_batch_size.py | 158 +
.../synced_batchnorm/two_gpu_unit_test.py | 180 ++
.../distributed/synced_batchnorm/unit_test.sh | 8 +
.../apex/tests/docker_extension_builds/run.sh | 73 +
.../common/__init__.py | 0
.../common/audio.py | 247 ++
.../common/dali/__init__.py | 13 +
.../common/dali/data_loader.py | 158 +
.../common/dali/iterator.py | 161 +
.../common/dali/pipeline.py | 366 +++
.../common/dataset.py | 237 ++
.../common/features.py | 522 ++++
.../common/helpers.py | 300 ++
.../common/metrics.py | 59 +
.../common/optimizers.py | 269 ++
.../common/tb_dllogger.py | 159 +
.../common/text/LICENSE | 19 +
.../common/text/__init__.py | 32 +
.../common/text/cleaners.py | 107 +
.../common/text/numbers.py | 99 +
.../common/text/symbols.py | 19 +
.../common/train.py | 516 ++++
.../common/utils.py | 20 +
.../configs/jasper10x5dr_speca.yaml | 139 +
.../configs/jasper10x5dr_speedp-offline.yaml | 139 +
.../jasper10x5dr_speedp-offline_speca.yaml | 139 +
...per10x5dr_speedp-offline_speca_nomask.yaml | 139 +
.../jasper10x5dr_speedp-online-discrete.yaml | 144 +
...er10x5dr_speedp-online-discrete_speca.yaml | 144 +
.../jasper10x5dr_speedp-online_speca.yaml | 144 +
.../Jasper_pytorch_wuqingdian01/inference.py | 398 +++
.../jasper/config.py | 125 +
.../jasper/model.py | 275 ++
.../notebooks/README.md | 203 ++
.../platform/DGX1-16GB_Jasper_AMP_8GPU.sh | 3 +
.../platform/DGX1-16GB_Jasper_FP32_8GPU.sh | 3 +
.../platform/DGX1-32GB_Jasper_AMP_8GPU.sh | 3 +
.../platform/DGX1-32GB_Jasper_FP32_8GPU.sh | 3 +
.../platform/DGX2_Jasper_AMP_16GPU.sh | 3 +
.../platform/DGX2_Jasper_AMP_8GPU.sh | 3 +
.../platform/DGX2_Jasper_FP32_16GPU.sh | 3 +
.../platform/DGX2_Jasper_FP32_8GPU.sh | 3 +
.../platform/DGXA100_Jasper_AMP_8GPU.sh | 3 +
.../platform/DGXA100_Jasper_TF32_8GPU.sh | 3 +
.../requirements.txt | 13 +
.../scripts/docker/build.sh | 3 +
.../scripts/docker/launch.sh | 31 +
.../scripts/download_librispeech.sh | 32 +
.../scripts/evaluation.sh | 22 +
.../scripts/inference.sh | 65 +
.../scripts/inference_benchmark.sh | 38 +
.../scripts/preprocess_librispeech.sh | 54 +
.../scripts/train.sh | 90 +
.../scripts/train_benchmark.sh | 49 +
.../test/docker/build.sh | 3 +
.../test/docker/launch.sh | 31 +
.../test/env_npu.sh | 80 +
.../test/train_eval_8p.sh | 65 +
.../test/train_full_1p.sh | 91 +
.../test/train_full_8p.sh | 90 +
.../test/train_performance_1p.sh | 95 +
.../test/train_performance_8p.sh | 95 +
.../Jasper_pytorch_wuqingdian01/train.py | 528 ++++
.../triton/Dockerfile | 10 +
.../triton/README.md | 388 +++
.../triton/converter.py | 254 ++
.../triton/jasper-client.py | 400 +++
.../triton/jasper_module.py | 174 ++
.../fp16/decoder-ts-script/config.pbtxt | 46 +
.../feature-extractor-ts-trace/config.pbtxt | 32 +
.../fp16/jasper-onnx-ensemble/config.pbtxt | 63 +
.../fp16/jasper-onnx/config.pbtxt | 58 +
.../jasper-tensorrt-ensemble/config.pbtxt | 63 +
.../fp16/jasper-tensorrt/config.pbtxt | 58 +
.../jasper-ts-trace-ensemble/config.pbtxt | 63 +
.../fp16/jasper-ts-trace/config.pbtxt | 32 +
.../fp32/decoder-ts-script/config.pbtxt | 46 +
.../feature-extractor-ts-trace/config.pbtxt | 32 +
.../fp32/jasper-onnx-ensemble/config.pbtxt | 63 +
.../fp32/jasper-onnx/config.pbtxt | 58 +
.../jasper-tensorrt-ensemble/config.pbtxt | 63 +
.../fp32/jasper-tensorrt/config.pbtxt | 58 +
.../jasper-ts-trace-ensemble/config.pbtxt | 63 +
.../fp32/jasper-ts-trace/config.pbtxt | 32 +
.../triton/pytorch/__init__.py | 0
.../triton/pytorch/utils.py | 263 ++
.../scripts/docker/build_triton_client.sh | 7 +
.../scripts/download_triton_librispeech.sh | 29 +
.../triton/scripts/execute_all_perf_runs.sh | 77 +
.../triton/scripts/export_model.sh | 78 +
.../triton/scripts/generate_perf_results.sh | 108 +
.../scripts/prepare_model_repository.sh | 54 +
.../scripts/preprocess_triton_librispeech.sh | 21 +
.../triton/scripts/run_client.sh | 49 +
.../triton/scripts/run_perf_client.sh | 86 +
.../triton/scripts/run_server.sh | 72 +
.../triton/scripts/wait_for_triton_server.sh | 36 +
.../triton/speech_utils.py | 417 +++
.../triton/tensorrt_io_props_fp16.json | 1 +
.../triton/tensorrt_io_props_fp32.json | 1 +
.../triton/triton_librispeech.csv | 2 +
.../utils/__init__.py | 0
.../utils/convert_librispeech.py | 82 +
.../utils/download_librispeech.py | 72 +
.../utils/download_utils.py | 71 +
.../utils/inference_librispeech.csv | 5 +
.../utils/librispeech.csv | 8 +
.../utils/preprocessing_utils.py | 76 +
507 files changed, 93003 insertions(+)
create mode 100644 PyTorch/contrib/audio/jasper/.keep
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.dockerignore
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.gitignore
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.keep
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/Dockerfile
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/LICENSE
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/NOTICE
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/README_raw.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.gitignore
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.gitmodules
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.nojekyll
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/LICENSE
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/RNNBackend.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/cells.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/models.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/_autocast_utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/__version__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_amp_state.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_initialize.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_process_optimizer.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/amp.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/compat.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/frontend.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/handle.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/functional_overrides.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/tensor_overrides.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/torch_overrides.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/opt.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/rnn_compat.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/scaler.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/wrap.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/bottleneck.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/bottleneck_module_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/bottleneck/bottleneck.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/fmha_api.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha/gemm.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha/gmem_tile.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha/kernel_traits.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha/mask.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha/smem_tile.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha/softmax.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha/utils.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_128_64_kernel.sm80.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_256_64_kernel.sm80.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_384_64_kernel.sm80.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_dgrad_fp16_512_64_kernel.sm80.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_dgrad_kernel_1xN_reload_nl.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_128_64_kernel.sm80.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_256_64_kernel.sm80.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_384_64_kernel.sm80.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_fprop_fp16_512_64_kernel.sm80.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN_nl.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_fprop_kernel_1xN_reload_v.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_kernel.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_noloop_reduce.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/fmha/src/fmha_utils.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/groupbn/batch_norm.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/groupbn/batch_norm.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/groupbn/batch_norm_add_relu.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/groupbn/batch_norm_add_relu.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/groupbn/cuda_utils.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/groupbn/interface.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/groupbn/ipc.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/layer_norm/ln.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/layer_norm/ln_api.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/layer_norm/ln_bwd_kernels.cuh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/layer_norm/ln_bwd_semi_cuda_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/layer_norm/ln_fwd_cuda_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/layer_norm/ln_fwd_kernels.cuh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/layer_norm/ln_kernel_traits.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/layer_norm/ln_utils.cuh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/additive_masked_softmax_dropout_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/dropout.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/encdec_multihead_attn.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/layer_norm.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/masked_softmax_dropout.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/masked_softmax_dropout_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/philox.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/self_multihead_attn.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/self_multihead_attn_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/self_multihead_attn_norm_add_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/softmax.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/multihead_attn/strided_batched_gemm.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/optimizers/fused_adam_cuda.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/optimizers/fused_adam_cuda_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/optimizers/fused_lamb_cuda.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/optimizers/fused_lamb_cuda_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_adam_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/optimizers/multi_tensor_distopt_lamb_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/transducer/transducer_joint.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/transducer/transducer_joint_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/transducer/transducer_loss.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/transducer/transducer_loss_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/xentropy/interface.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/xentropy/xentropy_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/examples/multihead_attn/func_test_multihead_attn.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/examples/multihead_attn/perf_test_multihead_attn.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/fmha/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/fmha/fmha.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/groupbn/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/groupbn/batch_norm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/layer_norm/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/layer_norm/layer_norm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/encdec_multihead_attn.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/encdec_multihead_attn_func.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/fast_encdec_multihead_attn_func.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/fast_encdec_multihead_attn_norm_add_func.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/fast_self_multihead_attn_func.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/fast_self_multihead_attn_norm_add_func.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/mask_softmax_dropout_func.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/self_multihead_attn.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/multihead_attn/self_multihead_attn_func.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/distributed_fused_adam.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/distributed_fused_adam_v2.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/distributed_fused_adam_v3.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/distributed_fused_lamb.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/fp16_optimizer.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/fused_adam.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/fused_lamb.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/optimizers/fused_sgd.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/sparsity/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/sparsity/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/sparsity/asp.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/sparsity/sparse_masklib.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/sparsity/test/checkpointing_test_part1.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/sparsity/test/checkpointing_test_part2.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/sparsity/test/checkpointing_test_reference.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/sparsity/test/toy_problem.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/fmha/test_fmha.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/fused_dense/test_fused_dense.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/layer_norm/test_fast_layer_norm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/multihead_attn/test_encdec_multihead_attn.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/multihead_attn/test_encdec_multihead_attn_norm_add.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/multihead_attn/test_fast_self_multihead_attn_bias.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/multihead_attn/test_mha_fused_softmax.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/multihead_attn/test_self_multihead_attn.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/multihead_attn/test_self_multihead_attn_norm_add.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/test_label_smoothing.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/transducer/test_transducer_joint.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/transducer/test_transducer_loss.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/test/transducer/transducer_ref.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/transducer/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/transducer/transducer.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/xentropy/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/xentropy/softmax_xentropy.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/fp16_utils/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/fp16_utils/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/fp16_utils/fp16_optimizer.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/fp16_utils/fp16util.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/fp16_utils/loss_scaler.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/fused_dense/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/fused_dense/fused_dense.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/mlp/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/mlp/mlp.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/multi_tensor_apply/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/multi_tensor_apply/multi_tensor_apply.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/normalization/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/normalization/fused_layer_norm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/optimizers/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/optimizers/fused_adagrad.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/optimizers/fused_adam.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/optimizers/fused_lamb.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/optimizers/fused_novograd.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/optimizers/fused_sgd.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/LARC.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/distributed.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/multiproc.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/optimized_sync_batchnorm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/optimized_sync_batchnorm_kernel.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/sync_batchnorm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/parallel/sync_batchnorm_kernel.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/FAQs.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/.gitignore
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/apex/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/apex/fused_adam.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/apex/fused_layer_norm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/apex/test.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/custom_func_module/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/custom_func_module/custom_function.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/custom_func_module/custom_module.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/custom_func_module/test.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/imagenet/imagenet.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/imagenet/test.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/jit/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/jit/jit_script_function.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/jit/jit_script_method.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/jit/jit_trace_function.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/jit/jit_trace_method.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/jit/test.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/lenet.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/operators.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/simple.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/user_annotation/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/user_annotation/resnet.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/examples/user_annotation/test.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/nvtx/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/nvtx/nvmarker.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/parse/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/parse/__main__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/parse/db.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/parse/kernel.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/parse/nvvp.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/parse/parse.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/__main__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/activation.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/base.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/blas.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/conv.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/convert.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/data.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/dropout.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/embedding.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/index_slice_join_mutate.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/linear.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/loss.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/misc.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/normalization.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/optim.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/output.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/pointwise.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/pooling.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/prof.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/randomSample.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/recurrentCell.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/reduction.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/softmax.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/usage.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/pyprof/prof/utility.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/reparameterization/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/reparameterization/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/reparameterization/reparameterization.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/reparameterization/weight_norm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/enums.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/functional/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/functional/fused_softmax.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/microbatches.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/parallel_state.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/_timers.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/p2p_communication.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/schedules/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/schedules/common.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/schedules/fwd_bwd_no_pipelining.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_with_interleaving.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/schedules/fwd_bwd_pipelining_without_interleaving.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/pipeline_parallel/utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/tensor_parallel/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/tensor_parallel/cross_entropy.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/tensor_parallel/data.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/tensor_parallel/layers.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/tensor_parallel/mappings.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/tensor_parallel/memory.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/tensor_parallel/random.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/tensor_parallel/utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/testing/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/testing/arguments.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/testing/commons.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/testing/global_vars.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/testing/standalone_gpt.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/transformer/utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/amp_C_frontend.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/compat.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/flatten_unflatten.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/fused_dense.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/fused_dense_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/layer_norm_cuda.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/layer_norm_cuda_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/megatron/scaled_masked_softmax.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/megatron/scaled_masked_softmax.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/megatron/scaled_masked_softmax_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/megatron/scaled_upper_triang_masked_softmax.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/megatron/scaled_upper_triang_masked_softmax.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/megatron/scaled_upper_triang_masked_softmax_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/mlp.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/mlp_cuda.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_adagrad.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_adam.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_apply.cuh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_axpby_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_l2norm_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_l2norm_scale_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_lamb.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_lamb_stage_1.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_lamb_stage_2.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_novograd.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_scale_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/multi_tensor_sgd_kernel.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/syncbn.cpp
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/type_shim.h
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/csrc/welford.cu
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/docs/Makefile
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/docs/source/_static/css/pytorch_theme.css
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/docs/source/_templates/layout.html
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/docs/source/conf.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/dcgan/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/dcgan/main_amp.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/docker/Dockerfile
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/docker/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/imagenet/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/imagenet/main_amp.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/simple/distributed/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/simple/distributed/distributed_data_parallel.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/examples/simple/distributed/run.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/requirements.txt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/requirements_dev.txt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/setup.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_add_param_group.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_basic_casts.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_cache.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_checkpointing.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_fused_sgd.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_larc.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_multi_tensor_axpby.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_multi_tensor_l2norm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_multi_tensor_scale.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_multiple_models_optimizers_losses.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_promotion.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/test_rnn.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_amp/utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_fp16util/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_fp16util/test_fp16util.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_fused_layer_norm/test_fused_layer_norm.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_mlp/test_mlp.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_optimizers/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_optimizers/test_dist_adam.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_optimizers/test_fused_novograd.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_optimizers/test_fused_optimizer.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_optimizers/test_lamb.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_pyprof_data/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_pyprof_data/test_pyprof_data.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_pyprof_nvtx/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_pyprof_nvtx/test_pyprof_nvtx.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_cross_entropy_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_data_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_initialize_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_layers_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_mappings_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_megatron_gpt_pipeline.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_pipeline_parallel_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_random_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/run_utils_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/test_batch_sampler.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/test_fused_softmax.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L0/run_transformer/test_transformer_module.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L1/common/compare.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L1/common/main_amp.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L1/common/run_test.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L1/cross_product/run.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/L1/cross_product_distributed/run.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/DDP/ddp_race_condition_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/DDP/run_race_test.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/amp_master_params/amp_master_params.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/amp_master_params/compare.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/amp_master_params/run.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/synced_batchnorm/python_single_gpu_unit_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/synced_batchnorm/single_gpu_unit_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/synced_batchnorm/test_batchnorm1d.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/synced_batchnorm/test_groups.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/synced_batchnorm/two_gpu_test_different_batch_size.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/synced_batchnorm/two_gpu_unit_test.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/distributed/synced_batchnorm/unit_test.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/tests/docker_extension_builds/run.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/audio.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/dali/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/dali/data_loader.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/dali/iterator.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/dali/pipeline.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/dataset.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/features.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/helpers.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/metrics.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/optimizers.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/tb_dllogger.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/text/LICENSE
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/text/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/text/cleaners.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/text/numbers.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/text/symbols.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/train.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/common/utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/configs/jasper10x5dr_speca.yaml
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/configs/jasper10x5dr_speedp-offline.yaml
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/configs/jasper10x5dr_speedp-offline_speca.yaml
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/configs/jasper10x5dr_speedp-offline_speca_nomask.yaml
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/configs/jasper10x5dr_speedp-online-discrete.yaml
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/configs/jasper10x5dr_speedp-online-discrete_speca.yaml
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/configs/jasper10x5dr_speedp-online_speca.yaml
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/inference.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/jasper/config.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/jasper/model.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/notebooks/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGX1-16GB_Jasper_AMP_8GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGX1-16GB_Jasper_FP32_8GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGX1-32GB_Jasper_AMP_8GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGX1-32GB_Jasper_FP32_8GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGX2_Jasper_AMP_16GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGX2_Jasper_AMP_8GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGX2_Jasper_FP32_16GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGX2_Jasper_FP32_8GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGXA100_Jasper_AMP_8GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/platform/DGXA100_Jasper_TF32_8GPU.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/requirements.txt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/docker/build.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/docker/launch.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/download_librispeech.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/evaluation.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/inference.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/inference_benchmark.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/preprocess_librispeech.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/train.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/scripts/train_benchmark.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/test/docker/build.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/test/docker/launch.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/test/env_npu.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/test/train_eval_8p.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/test/train_full_1p.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/test/train_full_8p.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/test/train_performance_1p.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/test/train_performance_8p.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/train.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/Dockerfile
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/README.md
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/converter.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/jasper-client.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/jasper_module.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp16/decoder-ts-script/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp16/feature-extractor-ts-trace/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp16/jasper-onnx-ensemble/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp16/jasper-onnx/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp16/jasper-tensorrt-ensemble/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp16/jasper-tensorrt/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp16/jasper-ts-trace-ensemble/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp16/jasper-ts-trace/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp32/decoder-ts-script/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp32/feature-extractor-ts-trace/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp32/jasper-onnx-ensemble/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp32/jasper-onnx/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp32/jasper-tensorrt-ensemble/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp32/jasper-tensorrt/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp32/jasper-ts-trace-ensemble/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/model_repo_configs/fp32/jasper-ts-trace/config.pbtxt
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/pytorch/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/pytorch/utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/docker/build_triton_client.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/download_triton_librispeech.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/execute_all_perf_runs.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/export_model.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/generate_perf_results.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/prepare_model_repository.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/preprocess_triton_librispeech.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/run_client.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/run_perf_client.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/run_server.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/scripts/wait_for_triton_server.sh
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/speech_utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/tensorrt_io_props_fp16.json
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/tensorrt_io_props_fp32.json
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/triton/triton_librispeech.csv
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/utils/__init__.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/utils/convert_librispeech.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/utils/download_librispeech.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/utils/download_utils.py
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/utils/inference_librispeech.csv
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/utils/librispeech.csv
create mode 100644 PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/utils/preprocessing_utils.py
diff --git a/PyTorch/contrib/audio/jasper/.keep b/PyTorch/contrib/audio/jasper/.keep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.dockerignore b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.dockerignore
new file mode 100644
index 0000000000..a620be2e6d
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.dockerignore
@@ -0,0 +1,8 @@
+*.pt
+results/
+*__pycache__
+checkpoints/
+.git/
+datasets/
+external/tensorrt-inference-server/
+checkpoints/
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.gitignore b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.gitignore
new file mode 100644
index 0000000000..bb051c475e
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.gitignore
@@ -0,0 +1,9 @@
+__pycache__
+*.pt
+results/
+datasets/
+checkpoints/
+
+*.swp
+*.swo
+*.swn
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.keep b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/.keep
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/Dockerfile b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/Dockerfile
new file mode 100644
index 0000000000..8ba48ec348
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/Dockerfile
@@ -0,0 +1,30 @@
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.10-py3
+FROM ${FROM_IMAGE_NAME}
+
+RUN apt update && apt install -y libsndfile1 && apt install -y sox && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace/jasper
+
+# Install requirements (do this first for better caching)
+COPY requirements.txt .
+RUN conda install -y pyyaml==5.4.1
+RUN pip install --disable-pip-version-check -U -r requirements.txt
+
+RUN pip install --force-reinstall --extra-index-url https://developer.download.nvidia.com/compute/redist nvidia-dali-cuda110==1.2.0
+
+# Copy rest of files
+COPY . .
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/LICENSE b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/LICENSE
new file mode 100644
index 0000000000..2ae5b8195c
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/LICENSE
@@ -0,0 +1,203 @@
+ Except where otherwise noted, the following license applies to all files in this repo.
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2019 NVIDIA Corporation
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
\ No newline at end of file
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/NOTICE b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/NOTICE
new file mode 100644
index 0000000000..7916839bcc
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/NOTICE
@@ -0,0 +1,5 @@
+Jasper in PyTorch
+
+This repository includes source code (in "parts/") from:
+* https://github.com/keithito/tacotron and https://github.com/ryanleary/patter licensed under MIT license.
+
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/README.md b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/README.md
new file mode 100644
index 0000000000..2bdbc35941
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/README.md
@@ -0,0 +1,49 @@
+# Jasper
+
+This implements training of Jasper on the LibriSpeech dataset.
+
+- Reference implementation: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechRecognition/Jasper
+
+
+## Requirements
+
+- Install PyTorch ([pytorch.org](http://pytorch.org))
+- `pip install -r requirements.txt`
+- Download the LibriSpeech dataset from http://www.openslr.org/12
+
+
+## Training
+
+- To run the model, first `cd` into the `test` directory, then launch one of the following scripts:
+
+```bash
+# 1p train full
+bash ./train_full_1p.sh --data_path=xxx
+
+# 1p train perf
+bash ./train_performance_1p.sh --data_path=xxx
+
+# 8p train full
+bash ./train_full_8p.sh --data_path=xxx
+
+# 8p train perf
+bash ./train_performance_8p.sh --data_path=xxx
+
+```
+
+
+## Result
+
+| Name | WER | Performance | Epochs |
+| :------: | :------: | :------: | :------: |
+| GPU-1p | - | 10 | 1 |
+| GPU-8p | 10.73 | 78 | 30 |
+| NPU-1p | - | 4 | 1 |
+| NPU-8p | 10.89 | 34 | 30 |
+
+
+
+
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/README_raw.md b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/README_raw.md
new file mode 100644
index 0000000000..ad548b22af
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/README_raw.md
@@ -0,0 +1,843 @@
+# Jasper For PyTorch
+
+This repository provides scripts to train the Jasper model to achieve near state-of-the-art accuracy and to perform high-performance inference using NVIDIA TensorRT. This repository is tested and maintained by NVIDIA.
+
+## Table Of Contents
+- [Model overview](#model-overview)
+ * [Model architecture](#model-architecture)
+ * [Default configuration](#default-configuration)
+ * [Feature support matrix](#feature-support-matrix)
+ * [Features](#features)
+ * [Mixed precision training](#mixed-precision-training)
+ * [Enabling mixed precision](#enabling-mixed-precision)
+ * [Enabling TF32](#enabling-tf32)
+ * [Glossary](#glossary)
+- [Setup](#setup)
+ * [Requirements](#requirements)
+- [Quick Start Guide](#quick-start-guide)
+- [Advanced](#advanced)
+ * [Scripts and sample code](#scripts-and-sample-code)
+ * [Parameters](#parameters)
+ * [Command-line options](#command-line-options)
+ * [Getting the data](#getting-the-data)
+ * [Dataset guidelines](#dataset-guidelines)
+ * [Training process](#training-process)
+ * [Inference process](#inference-process)
+ * [Evaluation process](#evaluation-process)
+ * [Deploying Jasper using Triton Inference Server](#deploying-jasper-using-triton-inference)
+- [Performance](#performance)
+ * [Benchmarking](#benchmarking)
+ * [Training performance benchmark](#training-performance-benchmark)
+ * [Inference performance benchmark](#inference-performance-benchmark)
+ * [Results](#results)
+ * [Training accuracy results](#training-accuracy-results)
+ * [Training accuracy: NVIDIA DGX A100 (8x A100 80GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-80gb)
+ * [Training accuracy: NVIDIA DGX-1 (8x V100 32GB)](#training-accuracy-nvidia-dgx-1-8x-v100-32gb)
+ * [Training stability test](#training-stability-test)
+ * [Training performance results](#training-performance-results)
+ * [Training performance: NVIDIA DGX A100 (8x A100 80GB)](#training-performance-nvidia-dgx-a100-8x-a100-80gb)
+ * [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
+ * [Training performance: NVIDIA DGX-1 (8x V100 32GB)](#training-performance-nvidia-dgx-1-8x-v100-32gb)
+ * [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb)
+ * [Inference performance results](#inference-performance-results)
+ * [Inference performance: NVIDIA DGX A100 (1x A100 80GB)](#inference-performance-nvidia-dgx-a100-gpu-1x-a100-80gb)
+ * [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
+ * [Inference performance: NVIDIA DGX-1 (1x V100 32GB)](#inference-performance-nvidia-dgx-1-1x-v100-32gb)
+ * [Inference performance: NVIDIA DGX-2 (1x V100 32GB)](#inference-performance-nvidia-dgx-2-1x-v100-32gb)
+ * [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4)
+- [Release notes](#release-notes)
+ * [Changelog](#changelog)
+ * [Known issues](#known-issues)
+
+## Model overview
+This repository provides an implementation of the Jasper model in PyTorch from the paper `Jasper: An End-to-End Convolutional Neural Acoustic Model` [https://arxiv.org/pdf/1904.03288.pdf](https://arxiv.org/pdf/1904.03288.pdf).
+The Jasper model is an end-to-end neural acoustic model for automatic speech recognition (ASR) that provides near state-of-the-art results on LibriSpeech among end-to-end ASR models without any external data. The Jasper architecture of convolutional layers was designed to facilitate fast GPU inference, by allowing whole sub-blocks to be fused into a single GPU kernel. This is important for meeting strict real-time requirements of ASR systems in deployment.
+
+The results of the acoustic model are combined with the results of external language models to get the top-ranked word sequences
+corresponding to a given audio segment. This post-processing step is called decoding.
+
+This repository is a PyTorch implementation of Jasper and provides scripts to train the Jasper 10x5 model with dense residuals from scratch on the [Librispeech](http://www.openslr.org/12) dataset to achieve the greedy decoding results of the original paper.
+The original reference code provides Jasper as part of a research toolkit in TensorFlow [openseq2seq](https://github.com/NVIDIA/OpenSeq2Seq).
+This repository provides a simple implementation of Jasper with scripts for training and replicating the Jasper paper results.
+This includes data preparation scripts, training and inference scripts.
+Both training and inference scripts offer the option to use Automatic Mixed Precision (AMP) to benefit from Tensor Cores for better performance.
+
+In addition to providing the hyperparameters for training a model checkpoint, we publish a thorough inference analysis across different NVIDIA GPU platforms, for example, DGX A100, DGX-1, DGX-2 and T4.
+
+This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 3x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
+
+The original paper takes the output of the Jasper acoustic model and shows results for 3 different decoding variations: greedy decoding, beam search with a 6-gram language model and beam search with further rescoring of the best ranked hypotheses with Transformer XL, which is a neural language model. Beam search and the rescoring with the neural language model scores are run on CPU and result in better word error rates compared to greedy decoding.
+This repository provides instructions to reproduce greedy decoding results. To run beam search or rescoring with TransformerXL, use the following scripts from the [openseq2seq](https://github.com/NVIDIA/OpenSeq2Seq) repository:
+https://github.com/NVIDIA/OpenSeq2Seq/blob/master/scripts/decode.py
+https://github.com/NVIDIA/OpenSeq2Seq/tree/master/external_lm_rescore
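+
+As a concrete illustration of what greedy (best-path) decoding does with the acoustic model's frame-wise outputs, the sketch below takes the argmax per frame, collapses repeated symbols, and removes CTC blanks. It is a minimal, illustrative example: the label set, blank index, and function name are assumptions, not this repository's actual decoder.
+```python
+# Illustrative only: minimal greedy CTC decoding (argmax per frame,
+# collapse repeats, drop blanks). Not this repository's decoder.
+import torch
+
+def greedy_ctc_decode(log_probs, labels, blank=0):
+    """log_probs: (time, vocab) frame-wise log-probabilities."""
+    best = log_probs.argmax(dim=-1).tolist()      # best symbol per frame
+    out, prev = [], blank
+    for idx in best:
+        if idx != blank and idx != prev:          # collapse repeats, drop blanks
+            out.append(labels[idx])
+        prev = idx
+    return "".join(out)
+
+labels = ["<blank>", "a", "c", "t", " "]
+frames = torch.log_softmax(torch.randn(20, len(labels)), dim=-1)
+print(greedy_ctc_decode(frames, labels))
+```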
+
+### Model architecture
+Details on the model architecture can be found in the paper [Jasper: An End-to-End Convolutional Neural Acoustic Model](https://arxiv.org/pdf/1904.03288.pdf).
+
+| | |
+|:---:|:---:|
+|Figure 1: Jasper BxR model: B- number of blocks, R- number of sub-blocks | Figure 2: Jasper Dense Residual |
+
+Jasper is an end-to-end neural acoustic model that is based on convolutions.
+In the audio processing stage, each frame is transformed into mel-scale spectrogram features, which the acoustic model takes as input and outputs a probability distribution over the vocabulary for each frame.
+The acoustic model has a modular block structure and can be parametrized accordingly:
+a Jasper BxR model has B blocks, each consisting of R repeating sub-blocks.
+
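+To make the audio-processing stage concrete, the following is a minimal, illustrative sketch of turning a waveform into mel-scale spectrogram frames with torchaudio (assumed to be available; the filterbank settings are example values, not the configuration used by this repository's `common/features.py`).
+```python
+# Illustrative only: waveform -> mel-spectrogram frames for the acoustic model.
+import torch
+import torchaudio
+
+waveform = torch.randn(1, 16000)                  # 1 second of synthetic 16 kHz audio
+mel = torchaudio.transforms.MelSpectrogram(
+    sample_rate=16000, n_fft=400, hop_length=160, n_mels=64)(waveform)
+print(mel.shape)                                  # (1, 64, 101): 64 mel bins x ~10 ms frames
+```
+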
+Each sub-block applies the following operations in sequence: 1D-Convolution, Batch Normalization, ReLU activation, and Dropout.
+
+Each block input is connected directly to the last subblock of all following blocks via a residual connection, which is referred to as `dense residual` in the paper.
+Every block differs in kernel size and number of filters, which are increasing in size from the bottom to the top layers.
+Irrespective of the exact block configuration parameters B and R, every Jasper model has four additional convolutional blocks:
+one immediately succeeding the input layer (Prologue) and three at the end of the B blocks (Epilogue).
+
+The Prologue decimates the audio signal in time so that a shorter sequence can be
+processed more efficiently. The Epilogue uses dilation to capture a larger context around each audio time step, which decreases the model's word error rate (WER).
+The paper achieves best results with Jasper 10x5 with dense residual connections, which is also the focus of this repository and is in the following referred to as Jasper Large.
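+
+For readers who find code easier to parse than prose, below is a minimal, illustrative sketch of a single sub-block (1D-Convolution, Batch Normalization, ReLU, Dropout). The class name, channel counts, and dropout rate are assumptions made for the example and do not come from this repository's `jasper/model.py`.
+```python
+# Illustrative only: one Jasper sub-block, not this repository's implementation.
+import torch
+import torch.nn as nn
+
+class JasperSubBlock(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, dropout=0.2):
+        super().__init__()
+        # padding=kernel_size // 2 keeps the time dimension unchanged for odd kernels
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size,
+                              padding=kernel_size // 2)
+        self.bn = nn.BatchNorm1d(out_channels)
+        self.relu = nn.ReLU(inplace=True)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):                         # x: (batch, channels, time)
+        return self.dropout(self.relu(self.bn(self.conv(x))))
+
+features = torch.randn(8, 64, 400)                # 8 utterances, 64 mel features, 400 frames
+print(JasperSubBlock(64, 256, kernel_size=11)(features).shape)  # (8, 256, 400)
+```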
+
+### Default configuration
+The following features were implemented in this model:
+
+* GPU-supported feature extraction with data augmentation options [SpecAugment](https://arxiv.org/abs/1904.08779) and [Cutout](https://arxiv.org/pdf/1708.04552.pdf)
+* offline and online [Speed Perturbation](https://www.danielpovey.com/files/2015_interspeech_augmentation.pdf)
+* data-parallel multi-GPU training and evaluation
+* AMP with dynamic loss scaling for Tensor Core training
+* FP16 inference
+
+Competitive training results and analysis are provided for the following Jasper model configuration:
+
+| **Model** | **Number of Blocks** | **Number of Subblocks** | **Max sequence length** | **Number of Parameters** |
+|--------------|----------------------|-------------------------|-------------------------|--------------------------|
+| Jasper Large | 10 | 5 | 16.7 s | 333 M |
+
+
+### Feature support matrix
+The following features are supported by this model.
+
+| **Feature** | **Jasper** |
+|---------------|---------------|
+|[Apex AMP](https://nvidia.github.io/apex/amp.html) | Yes |
+|[Apex DistributedDataParallel](https://nvidia.github.io/apex/parallel.html#apex.parallel.DistributedDataParallel) | Yes |
+
+#### Features
+[Apex AMP](https://nvidia.github.io/apex/amp.html) - a tool that enables Tensor Core-accelerated training. Refer to the [Enabling mixed precision](#enabling-mixed-precision) section for more details.
+
+[Apex
+DistributedDataParallel](https://nvidia.github.io/apex/parallel.html#apex.parallel.DistributedDataParallel) -
+a module wrapper that enables easy multiprocess distributed data parallel
+training, similar to
+[torch.nn.parallel.DistributedDataParallel](https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel).
+`DistributedDataParallel` is optimized for use with
+[NCCL](https://github.com/NVIDIA/nccl). It achieves high performance by
+overlapping communication with computation during `backward()` and bucketing
+smaller gradient transfers to reduce the total number of transfers required.
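+
+A minimal, illustrative sketch of how a model can be wrapped with Apex `DistributedDataParallel` is shown below. It assumes one process per GPU launched with `torch.distributed.launch` (so rank and world size come from environment variables) and uses a toy model; it is not this repository's `train.py`.
+```python
+# Illustrative only: wrapping a toy model with apex.parallel.DistributedDataParallel.
+import torch
+import torch.distributed as dist
+from apex.parallel import DistributedDataParallel as DDP
+
+dist.init_process_group(backend="nccl")           # reads rank/world size from the launcher's env
+torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
+
+model = torch.nn.Linear(128, 128).cuda()
+model = DDP(model)                                # gradients are all-reduced during backward()
+```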
+
+
+### Mixed precision training
+*Mixed precision* is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps:
+
+1. Porting the model to use the FP16 data type where appropriate.
+2. Adding loss scaling to preserve small gradient values.
+
+The ability to train deep learning networks with lower precision was introduced in the Pascal architecture and first supported in [CUDA 8](https://devblogs.nvidia.com/parallelforall/tag/fp16/) in the NVIDIA Deep Learning SDK.
+
+For information about:
+* How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
+* Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
+* APEX tools for mixed precision training, see the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/) blog.
+
+
+#### Enabling mixed precision
+For training, mixed precision can be enabled by passing the `--amp` flag to `train.py`. When using the bash helper scripts (`scripts/train.sh`, `scripts/inference.sh`, etc.), mixed precision can be enabled with the environment variable `AMP=true`.
+
+Mixed precision is enabled in PyTorch by using the Automatic Mixed Precision
+(AMP) library from [APEX](https://github.com/NVIDIA/apex) that casts variables
+to half-precision upon retrieval, while storing variables in single-precision
+format. Furthermore, to preserve small gradient magnitudes in backpropagation,
+a [loss
+scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling)
+step must be included when applying gradients. In PyTorch, loss scaling can be
+easily applied by using the `scale_loss()` method provided by AMP. The scaling
+value to be used can be
+[dynamic](https://nvidia.github.io/apex/amp.html#apex.amp.initialize) or fixed.
+
+For an in-depth walkthrough of AMP, check out sample usage
+[here](https://nvidia.github.io/apex/amp.html#). [APEX](https://github.com/NVIDIA/apex) is a PyTorch extension that contains
+utility libraries, such as AMP, which require minimal network code changes to
+leverage Tensor Cores performance.
+
+The following steps were needed to enable mixed precision training in Jasper:
+
+* Import AMP from APEX (file: `train.py`):
+```python
+from apex import amp
+```
+
+* Initialize AMP and wrap the model and the optimizer:
+```python
+model, optimizer = amp.initialize(
+    min_loss_scale=1.0,
+    models=model,
+    optimizers=optimizer,
+    opt_level='O1')
+```
+
+* Apply the `scale_loss` context manager:
+```python
+with amp.scale_loss(loss, optimizer) as scaled_loss:
+ scaled_loss.backward()
+```
+
+#### Enabling TF32
+TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math, also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
+
+TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
+
+For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
+
+TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
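+
+For reference, TF32 can also be toggled explicitly from Python. The flags below are standard PyTorch settings (their defaults vary between PyTorch releases); they are shown purely as an illustration rather than something this repository requires.
+```python
+# Illustrative only: explicit TF32 switches in PyTorch (effective on Ampere GPUs).
+import torch
+
+torch.backends.cuda.matmul.allow_tf32 = True      # TF32 for matrix multiplications
+torch.backends.cudnn.allow_tf32 = True            # TF32 for cuDNN convolutions
+```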
+
+
+### Glossary
+**Acoustic model**
+Assigns a probability distribution over a vocabulary of characters given an audio frame.
+
+**Language Model**
+Assigns a probability distribution over a sequence of words. Given a sequence of words, it assigns a probability to the whole sequence.
+
+**Pre-training**
+Training a model on vast amounts of data on the same (or different) task to build a general understanding.
+
+**Automatic Speech Recognition (ASR)**
+Uses both acoustic model and language model to output the transcript of an input audio signal.
+
+
+## Setup
+The following section lists the requirements in order to start training and evaluating the Jasper model.
+
+### Requirements
+This repository contains a `Dockerfile` which extends the PyTorch 20.10-py3 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components:
+
+* [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
+* [PyTorch 20.10-py3 NGC container](https://ngc.nvidia.com/catalog/containers/nvidia:pytorch)
+* Supported GPUs:
+ - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
+ - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/)
+ - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
+
+Further required Python packages are listed in `requirements.txt` and are installed automatically when the Docker container is built. To install them manually, run
+```bash
+pip install -r requirements.txt
+```
+
+For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
+
+* [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
+* [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry)
+* [Running PyTorch](https://docs.nvidia.com/deeplearning/dgx/pytorch-release-notes/running.html#running)
+
+For those unable to use the PyTorch NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html).
+
+
+## Quick Start Guide
+
+To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the Jasper model on the LibriSpeech dataset. For details concerning training and inference, see the [Advanced](#advanced) section.
+
+1. Clone the repository.
+```bash
+git clone https://github.com/NVIDIA/DeepLearningExamples
+cd DeepLearningExamples/PyTorch/SpeechRecognition/Jasper
+```
+2. Build the Jasper PyTorch container.
+
+Running the following script builds the container, which contains all the required dependencies for data download and processing, as well as for training and inference of the model.
+
+```bash
+bash scripts/docker/build.sh
+```
+
+3. Start an interactive session in the NGC container to run data download/training/inference
+
+```bash
+bash scripts/docker/launch.sh
+```
+Within the container, the contents of this repository will be copied to the `/workspace/jasper` directory. The `/datasets`, `/checkpoints`, `/results` directories are mounted as volumes
+and mapped to the corresponding directories ``, ``, `` on the host.
+
+4. Download and preprocess the dataset.
+
+No GPU is required for data download and preprocessing. Therefore, if GPU resources are limited, launch the container for this step on a CPU machine by following Steps 2 and 3.
+
+Note: Downloading and preprocessing the dataset requires 500GB of free disk space and can take several hours to complete.
+
+This repository provides scripts to download and extract the following datasets:
+
+* LibriSpeech [http://www.openslr.org/12](http://www.openslr.org/12)
+
+LibriSpeech contains 1000 hours of 16kHz read English speech derived from public domain audiobooks from the LibriVox project, carefully segmented and aligned. For more information, see the [LIBRISPEECH: AN ASR CORPUS BASED ON PUBLIC DOMAIN AUDIO BOOKS](http://www.danielpovey.com/files/2015_icassp_librispeech.pdf) paper.
+
+Inside the container, download and extract the datasets into the required format for later training and inference:
+```bash
+bash scripts/download_librispeech.sh
+```
+Once the data download is complete, the following folders should exist:
+```bash
+datasets/LibriSpeech/
+├── dev-clean
+├── dev-other
+├── test-clean
+├── test-other
+├── train-clean-100
+├── train-clean-360
+└── train-other-500
+```
+
+Since `/datasets/` is mounted to `` on the host (see Step 3), once the dataset is downloaded it will be accessible from outside of the container at `/LibriSpeech`.
+
+
+Next, convert the data into WAV files:
+```bash
+bash scripts/preprocess_librispeech.sh
+```
+Once the data is converted, the following additional files and folders should exist:
+```bash
+datasets/LibriSpeech/
+├── dev-clean-wav
+├── dev-other-wav
+├── librispeech-train-clean-100-wav.json
+├── librispeech-train-clean-360-wav.json
+├── librispeech-train-other-500-wav.json
+├── librispeech-dev-clean-wav.json
+├── librispeech-dev-other-wav.json
+├── librispeech-test-clean-wav.json
+├── librispeech-test-other-wav.json
+├── test-clean-wav
+├── test-other-wav
+├── train-clean-100-wav
+├── train-clean-360-wav
+└── train-other-500-wav
+```
+
+The DALI data pre-processing pipeline, which is enabled by default, performs speed perturbation on-line during training.
+Without DALI, on-line speed perturbation might slow down the training.
+If you wish to disable DALI, speed perturbation can be computed off-line with:
+```bash
+SPEEDS="0.9 1.1" bash scripts/preprocess_librispeech.sh
+```
+
+5. Start training.
+
+Inside the container, use the following script to start training.
+Make sure the downloaded and preprocessed dataset is located at `/LibriSpeech` on the host (see Step 3), which corresponds to `/datasets/LibriSpeech` inside the container.
+
+```bash
+[OPTION1=value1 OPTION2=value2 ...] bash scripts/train.sh
+```
+By default, automatic mixed precision is disabled, the batch size is 64 over two gradient accumulation steps, and the recipe is run on a total of 8 GPUs. The hyperparameters are tuned for a GPU with at least 32GB of memory and will require adjustment for 16GB GPUs (e.g., by lowering the batch size and using more gradient accumulation steps).
+
+Options are passed as environment variables. More details on available options can be found in [Parameters](#parameters) and [Training process](#training-process).
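+
+For example, the following illustrative call trains with mixed precision on 4 GPUs and more gradient accumulation steps; all variables correspond to options listed in [Parameters](#parameters):
+```bash
+AMP=true NUM_GPUS=4 GRAD_ACCUMULATION_STEPS=4 BATCH_SIZE=64 bash scripts/train.sh
+```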
+
+6. Start validation/evaluation.
+
+Inside the container, use the following script to run evaluation.
+ Make sure the downloaded and preprocessed dataset is located at `/LibriSpeech` on the host (see Step 3), which corresponds to `/datasets/LibriSpeech` inside the container.
+```bash
+[OPTION1=value1 OPTION2=value2 ...] bash scripts/evaluation.sh [OPTIONS]
+```
+By default, this will use full precision, a batch size of 64 and run on a single GPU.
+
+Options are passed as environment variables. More details on available options can be found in [Parameters](#parameters) and [Evaluation process](#evaluation-process).
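+
+For example, an illustrative evaluation run with mixed precision on the dev-other subset (both variables are described in [Parameters](#parameters)):
+```bash
+DATASET=dev-other AMP=true bash scripts/evaluation.sh
+```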
+
+
+7. Start inference/predictions.
+
+Inside the container, use the following script to run inference.
+ Make sure the downloaded and preprocessed dataset is located at `/LibriSpeech` on the host (see Step 3), which corresponds to `/datasets/LibriSpeech` inside the container.
+A pretrained model checkpoint can be downloaded from [NGC model repository](https://ngc.nvidia.com/catalog/models/nvidia:jasperpyt_fp16).
+
+```bash
+[OPTION1=value1 OPTION2=value2 ...] bash scripts/inference.sh
+```
+By default, this will use single precision, a batch size of 64, and run on a single GPU.
+
+Options are passed as environment variables. More details on available options can be found in [Parameters](#parameters) and [Inference process](#inference-process).
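+
+For example, an illustrative inference run with mixed precision from a downloaded checkpoint (the checkpoint filename below is only a placeholder):
+```bash
+# CHECKPOINT must point to an existing model checkpoint; the path here is a placeholder
+CHECKPOINT=/checkpoints/jasper_fp16.pt AMP=true bash scripts/inference.sh
+```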
+
+
+## Advanced
+
+The following sections provide greater details of the dataset, running training and inference, and getting training and inference results.
+
+
+### Scripts and sample code
+In the `root` directory, the most important files are:
+```
+jasper
+├── common # data pre-processing, logging, etc.
+├── configs # model configurations
+├── Dockerfile # container with the basic set of dependencies to run Jasper
+├── inference.py # entry point for inference
+├── jasper # model-specific code
+├── notebooks # jupyter notebooks and example audio files
+├── scripts # one-click scripts required for running various supported functionalities
+│  ├── docker # contains the scripts for building and launching the container
+│  ├── download_librispeech.sh # downloads LibriSpeech dataset
+│  ├── evaluation.sh # runs evaluation using the `inference.py` script
+│  ├── inference_benchmark.sh # runs the inference benchmark using the `inference_benchmark.py` script
+│  ├── inference.sh # runs inference using the `inference.py` script
+│  ├── preprocess_librispeech.sh # preprocess LibriSpeech raw data files for training and inference
+│  ├── train_benchmark.sh # runs the training performance benchmark using the `train.py` script
+│  └── train.sh # runs training using the `train.py` script
+├── train.py # entry point for training
+├── triton # example of inference using Triton Inference Server
+└── utils # data downloading and common routines
+```
+
+### Parameters
+
+Parameters can be set as environment variables or passed as positional arguments.
+
+The complete list of available parameters for the `scripts/train.sh` script contains:
+```bash
+DATA_DIR: directory of dataset. (default: '/datasets/LibriSpeech')
+MODEL_CONFIG: relative path to model configuration. (default: 'configs/jasper10x5dr_speedp-online_speca.yaml')
+OUTPUT_DIR: directory for results, logs, and created checkpoints. (default: '/results')
+CHECKPOINT: a specific model checkpoint to continue training from. To resume training from the last checkpoint, see the RESUME option.
+RESUME: resume training from the last checkpoint found in OUTPUT_DIR, or from scratch if there are no checkpoints (default: true)
+CUDNN_BENCHMARK: boolean that indicates whether to enable cudnn benchmark mode for using more optimized kernels. (default: true)
+NUM_GPUS: number of GPUs to use. (default: 8)
+AMP: if set to `true`, enables automatic mixed precision (default: false)
+BATCH_SIZE: effective data batch size. The real batch size per GPU might be lower, if gradient accumulation is enabled (default: 64)
+GRAD_ACCUMULATION_STEPS: number of gradient accumulation steps until optimizer updates weights. (default: 2)
+LEARNING_RATE: initial learning rate. (default: 0.01)
+MIN_LEARNING_RATE: minimum learning rate, despite LR scheduling (default: 1e-5)
+LR_POLICY: how to decay LR (default: exponential)
+LR_EXP_GAMMA: decay factor for the exponential LR schedule (default: 0.981)
+EMA: decay factor for exponential averages of checkpoints (default: 0.999)
+SEED: seed for random number generator and used for ensuring reproducibility. (default: 0)
+EPOCHS: number of training epochs. (default: 440)
+WARMUP_EPOCHS: number of initial epoch of linearly increasing LR. (default: 2)
+HOLD_EPOCHS: number of epochs to hold maximum LR after warmup. (default: 140)
+SAVE_FREQUENCY: number of epochs between saving the model to disk. (default: 10)
+EPOCHS_THIS_JOB: run training for this number of epochs. Does not affect LR schedule like the EPOCHS parameter. (default: 0)
+DALI_DEVICE: device to run the DALI pipeline on for calculation of filterbanks. Valid choices: cpu, gpu, none. (default: gpu)
+PAD_TO_MAX_DURATION: pad all sequences with zeros to maximum length. (default: false)
+EVAL_FREQUENCY: number of steps between evaluations on the validation set. (default: 544)
+PREDICTION_FREQUENCY: the number of steps between writing a sample prediction to stdout. (default: 544)
+TRAIN_MANIFESTS: lists of .json training set files
+VAL_MANIFESTS: lists of .json validation set files
+
+```
+
+The complete list of available parameters for the `scripts/inference.sh` script contains:
+```bash
+DATA_DIR: directory of dataset. (default: '/datasets/LibriSpeech')
+MODEL_CONFIG: model configuration. (default: 'configs/jasper10x5dr_speedp-online_speca.yaml')
+OUTPUT_DIR: directory for results and logs. (default: '/results')
+CHECKPOINT: model checkpoint path. (required)
+DATASET: name of the LibriSpeech subset to use. (default: 'dev-clean')
+LOG_FILE: path to the DLLogger .json logfile. (default: '')
+CUDNN_BENCHMARK: enable cudnn benchmark mode for using more optimized kernels. (default: false)
+MAX_DURATION: filter out recordings shorter than MAX_DURATION seconds. (default: "")
+PAD_TO_MAX_DURATION: pad all sequences with zeros to maximum length. (default: false)
+PAD_LEADING: pad every batch with leading zeros to counteract conv shifts of the field of view. (default: 16)
+NUM_GPUS: number of GPUs to use. Note that with > 1 GPUs WER results might be inaccurate due to the batching policy. (default: 1)
+NUM_STEPS: number of batches to evaluate, loop the dataset if necessary. (default: 0)
+NUM_WARMUP_STEPS: number of initial steps before measuring performance. (default: 0)
+AMP: enable FP16 inference with AMP. (default: false)
+BATCH_SIZE: data batch size. (default: 64)
+EMA: Attempt to load exponentially averaged weights from a checkpoint. (default: true)
+SEED: seed for random number generator and used for ensuring reproducibility. (default: 0)
+DALI_DEVICE: device to run the DALI pipeline on for calculation of filterbanks. Valid choices: cpu, gpu, none. (default: gpu)
+CPU: run inference on CPU. (default: false)
+LOGITS_FILE: dump logit matrices to a file. (default: "")
+PREDICTION_FILE: save predictions to a file. (default: "${OUTPUT_DIR}/${DATASET}.predictions")
+```
+
+The complete list of available parameters for `scripts/evaluation.sh` is the same as for `scripts/inference.sh`, except for a few changed defaults:
+```bash
+PREDICTION_FILE: (default: "")
+DATASET: (default: "test-other")
+```
+
+The `scripts/inference_benchmark.sh` script pads all input to a fixed duration and computes the mean and the 90th, 95th, and 99th percentiles of latency for the specified number of inference steps. Latency is measured in milliseconds per batch. The `scripts/inference_benchmark.sh` script measures latency on a single GPU and loops over a number of batch sizes and durations. It extends `scripts/inference.sh` and changes the defaults with:
+```bash
+BATCH_SIZE_SEQ: batch sizes to measure on. (default: "1 2 4 8 16")
+MAX_DURATION_SEQ: input durations (in seconds) to measure on (default: "2 7 16.7")
+CUDNN_BENCHMARK: (default: true)
+PAD_TO_MAX_DURATION: (default: true)
+PAD_LEADING: (default: 0)
+NUM_WARMUP_STEPS: (default: 10)
+NUM_STEPS: (default: 500)
+DALI_DEVICE: (default: cpu)
+```
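+
+For example, an illustrative benchmark run restricted to two batch sizes and a single duration (both variables are listed above):
+```bash
+BATCH_SIZE_SEQ="1 8" MAX_DURATION_SEQ="16.7" bash scripts/inference_benchmark.sh
+```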
+
+The `scripts/train_benchmark.sh` script pads all input to the same length according to the input argument `MAX_DURATION` and measures average training latency and throughput performance. Latency is measured in seconds per batch, throughput in sequences per second.
+Training performance is measured with on-line speed perturbation and cuDNN benchmark mode enabled.
+The script `scripts/train_benchmark.sh` loops over a number of batch sizes and GPU counts.
+It extends `scripts/train.sh`, and the complete list of available parameters for the `scripts/train_benchmark.sh` script contains:
+```bash
+BATCH_SIZE_SEQ: batch sizes to measure on. (default: "1 2 4 8 16")
+NUM_GPUS_SEQ: number of GPUs to run the training on. (default: "1 4 8")
+MODEL_CONFIG: (default: "configs/jasper10x5dr_speedp-online_train-benchmark.yaml")
+TRAIN_MANIFESTS: (default: "$DATA_DIR/librispeech-train-clean-100-wav.json")
+RESUME: (default: false)
+EPOCHS_THIS_JOB: (default: 2)
+EPOCHS: (default: 100000)
+SAVE_FREQUENCY: (default: 100000)
+EVAL_FREQUENCY: (default: 100000)
+GRAD_ACCUMULATION_STEPS: (default: 1)
+PAD_TO_MAX_DURATION: (default: true)
+EMA: (default: 0)
+```
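+
+For example, an illustrative benchmark run with a single batch size and GPU count (both variables are listed above):
+```bash
+BATCH_SIZE_SEQ="32" NUM_GPUS_SEQ="8" bash scripts/train_benchmark.sh
+```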
+
+### Command-line options
+To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option with the Python file, for example:
+
+```bash
+python train.py --help
+python inference.py --help
+```
+
+### Getting the data
+The Jasper model was trained on the LibriSpeech dataset. We use the concatenation of `train-clean-100`, `train-clean-360` and `train-other-500` for training and `dev-clean` for validation.
+
+This repository contains the `scripts/download_librispeech.sh` and `scripts/preprocess_librispeech.sh` scripts, which automatically download and preprocess the training, test and development datasets. By default, data is downloaded to the `/datasets/LibriSpeech` directory. A minimum of 250GB of free space is required for download and preprocessing, and the final preprocessed dataset is approximately 100GB. With offline speed perturbation, the dataset will be about 3x larger.
+
+#### Dataset guidelines
+The `scripts/preprocess_librispeech.sh` script converts the input audio files to WAV format with a sample rate of 16kHz; target transcripts are stripped of whitespace characters, then lower-cased. For `train-clean-100`, `train-clean-360` and `train-other-500`, it can optionally create speed-perturbed versions with rates of 0.9 and 1.1 for data augmentation. In the current version, those augmentations are applied on-line with the DALI pipeline without any impact on training time.
+
+After preprocessing, the script creates JSON files with output file paths, sample rate, target transcript and other metadata. These JSON files are used by the training script to identify training and validation datasets.
+
+The Jasper model was tuned on audio signals with a sample rate of 16kHz. If you wish to use a different sampling rate, some hyperparameters might need to be changed - specifically the window size and step size.
+
+
+### Training process
+
+Training is performed using the `train.py` script along with parameters defined in `scripts/train.sh`.
+The `scripts/train.sh` script runs a job on a single node that trains the Jasper model from scratch using LibriSpeech as training data. To make training more efficient, we discard audio samples longer than 16.7 seconds from the training dataset; the total number of these samples is less than 1%. Such filtering does not degrade accuracy, but it allows us to decrease the number of time steps in a batch, which requires less GPU memory and increases training speed.
+Apart from the default arguments as listed in the [Parameters](#parameters) section, by default the training script:
+
+* Runs on 8 GPUs with at least 32GB of memory and training/evaluation batch size 64, split over two gradient accumulation steps
+* Uses TF32 precision (A100 GPU) or FP32 (other GPUs)
+* Trains on the concatenation of all 3 LibriSpeech training datasets and evaluates on the LibriSpeech dev-clean dataset
+* Maintains an exponential moving average of parameters for evaluation
+* Has cudnn benchmark enabled
+* Runs for 440 epochs
+* Uses an initial learning rate of 0.01 and an exponential learning rate decay
+* Saves a checkpoint every 10 epochs
+* Automatically removes old checkpoints and preserves milestone checkpoints
+* Runs evaluation on the development dataset every 544 iterations and at the end of training
+* Maintains a separate checkpoint with the lowest WER on development set
+* Prints out training progress every iteration to stdout
+* Creates a DLLogger logfile and a Tensorboard log
+* Calculates speed perturbation on-line during training
+* Uses SpecAugment in data pre-processing
+* Filters out audio samples longer than 16.7 seconds
+* Pads each batch so its length is divisible by 16
+* Uses masked convolutions and dense residuals as described in the paper
+* Uses weight decay of 0.001
+* Uses [Novograd](https://arxiv.org/pdf/1905.11286.pdf) as optimizer with betas=(0.95, 0)
+
+Enabling AMP permits batch size 64 with one gradient accumulation step. In the current setup it will improve upon the greedy WER [Results](#results) of the Jasper paper on a DGX-1 with 32GB V100 GPUs.
+
+### Inference process
+Inference is performed using the `inference.py` script along with parameters defined in `scripts/inference.sh`.
+The `scripts/inference.sh` script runs the job on a single GPU, taking a pre-trained Jasper model checkpoint and running it on the specified dataset.
+Apart from the default arguments as listed in the [Parameters](#parameters) section, by default the inference script:
+
+* Evaluates on the LibriSpeech dev-clean dataset
+* Uses a batch size of 64
+* Runs for 1 epoch and prints out the final word error rate
+* Creates a log file with progress and results which will be stored in the results folder
+* Pads each batch so its length is divisible by 16
+* Does not use data augmentation
+* Does greedy decoding and saves the transcription in the results folder
+* Has the option to save the model output tensors for more complex decoding, for example, beam search
+* Has cudnn benchmark disabled
+
+### Evaluation process
+Evaluation is performed using the `inference.py` script along with parameters defined in `scripts/evaluation.sh`.
+The setup is similar to `scripts/inference.sh`, with two differences:
+
+* Evaluates the LibriSpeech test-other dataset
+* Model outputs are not saved
+
+### Deploying Jasper using Triton Inference Server
+The NVIDIA Triton Inference Server provides a datacenter and cloud inferencing solution optimized for NVIDIA GPUs. The server provides an inference service via an HTTP or gRPC endpoint, allowing remote clients to request inferencing for any number of GPU or CPU models being managed by the server.
+More information on how to perform inference using Triton Inference Server with different model backends can be found in the subfolder [./triton/README.md](triton/README.md)
+
+
+## Performance
+
+The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
+
+### Benchmarking
+The following section shows how to run benchmarks measuring the model performance in training and inference modes.
+
+#### Training performance benchmark
+To benchmark the training performance in a specific setting on the `train-clean-100` subset of LibriSpeech, run:
+
+```bash
+BATCH_SIZE_SEQ= NUM_GPUS_SEQ= bash scripts/train_benchmark.sh
+```
+
+By default, this script runs 2 epochs on the configuration `configs/jasper10x5dr_speedp-online_train-benchmark.yaml`,
+which applies gentle speed perturbation that does not change the length of the output, enabling immediate stabilization of training step times in the cuDNN benchmark mode. The script runs benchmarks with a batch size of 32 on 1, 4, and 8 GPUs, and requires an 8x 32GB GPU machine.
+
+#### Inference performance benchmark
+To benchmark the inference performance on a specific batch size and audio length, run:
+
+```bash
+BATCH_SIZE_SEQ= MAX_DURATION_SEQ= bash scripts/inference_benchmark.sh
+```
+By default, the script runs on a single GPU and evaluates on the dataset limited to utterances shorter than MAX_DURATION. It uses the model configuration `configs/jasper10x5dr_speedp-online_speca.yaml`.
+
+
+### Results
+The following sections provide details on how we achieved our performance and accuracy in training and inference.
+All results are trained on 960 hours of LibriSpeech with a maximum audio length of 16.7s. The training is evaluated
+on LibriSpeech dev-clean, dev-other, test-clean, test-other. Checkpoints for evaluation are being chosen based on their
+word error rate on dev-clean.
+
+#### Training accuracy results
+
+##### Training accuracy: NVIDIA DGX A100 (8x A100 80GB)
+Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 20.10-py3 NGC container on NVIDIA DGX A100 (8x A100 80GB) GPUs.
+The following table reports the word error rate (WER) of the acoustic model with greedy decoding on all LibriSpeech dev and test datasets for mixed precision training.
+
+| Number of GPUs | Batch size per GPU | Precision | dev-clean WER | dev-other WER | test-clean WER | test-other WER | Time to train |
+|-----|-----|-------|-------|-------|------|-------|------|
+| 8 | 64 | mixed | 3.20 | 9.78 | 3.41 | 9.71 | 70 h |
+
+##### Training accuracy: NVIDIA DGX-1 (8x V100 32GB)
+Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 20.10-py3 NGC container on NVIDIA DGX-1 (8x V100 32GB) GPUs.
+The following table reports the word error rate (WER) of the acoustic model with greedy decoding on all LibriSpeech dev and test datasets for mixed precision training.
+
+| Number of GPUs | Batch size per GPU | Precision | dev-clean WER | dev-other WER | test-clean WER | test-other WER | Time to train |
+|-----|-----|-------|-------|-------|------|-------|-------|
+| 8 | 64 | mixed | 3.26 | 10.00 | 3.54 | 9.80 | 130 h |
+
+We show the best of 5 runs (mixed precision) and 2 runs (FP32) chosen based on dev-clean WER. For FP32, two gradient accumulation steps have been used.
+
+##### Training stability test
+The following table compares greedy decoding word error rates across 8 different training runs with different seeds for mixed precision training.
+
+| DGX A100 80GB, FP16, 8x GPU | Seed #1 | Seed #2 | Seed #3 | Seed #4 | Seed #5 | Seed #6 | Seed #7 | Seed #8 | Mean | Std |
+|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-------:|------:|
+| dev-clean | 3.46 | 3.55 | 3.45 | 3.44 | 3.25 | 3.34 | 3.20 | 3.40 | 3.39 | 0.11 |
+| dev-other | 10.30 | 10.77 | 10.36 | 10.26 | 9.99 | 10.18 | 9.78 | 10.32 | 10.25 | 0.27 |
+| test-clean | 3.84 | 3.81 | 3.66 | 3.64 | 3.58 | 3.55 | 3.41 | 3.73 | 3.65 | 0.13 |
+| test-other | 10.61 | 10.52 | 10.49 | 10.47 | 9.89 | 10.09 | 9.71 | 10.26 | 10.26 | 0.31 |
+
+
+| DGX-1 32GB, FP16, 8x GPU | Seed #1 | Seed #2 | Seed #3 | Seed #4 | Seed #5 | Seed #6 | Seed #7 | Seed #8 | Mean | Std |
+|-----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|-------:|------:|
+| dev-clean | 3.31 | 3.31 | 3.26 | 3.44 | 3.40 | 3.35 | 3.36 | 3.28 | 3.34 | 0.06 |
+| dev-other | 10.02 | 10.01 | 10.00 | 10.06 | 10.05 | 10.03 | 10.10 | 10.04 | 10.04 | 0.03 |
+| test-clean | 3.49 | 3.50 | 3.54 | 3.61 | 3.57 | 3.58 | 3.48 | 3.51 | 3.54 | 0.04 |
+| test-other | 10.11 | 10.14 | 9.80 | 10.09 | 10.17 | 9.99 | 9.86 | 10.00 | 10.02 | 0.13 |
+
+#### Training performance results
+Our results were obtained by running the `scripts/train.sh` training script in the PyTorch 20.10-py3 NGC container. Performance (in sequences per second) is the steady-state throughput.
+
+##### Training performance: NVIDIA DGX A100 (8x A100 80GB)
+| Batch size / GPU | GPUs | Throughput - TF32 | Throughput - mixed precision | Throughput speedup (TF32 to mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
+|----:|----:|-------:|-------:|-----:|-----:|-----:|
+| 32 | 1 | 42.18 | 64.32 | 1.52 | 1.00 | 1.00 |
+| 32 | 4 | 157.49 | 239.23 | 1.52 | 3.73 | 3.72 |
+| 32 | 8 | 310.10 | 470.09 | 1.52 | 7.35 | 7.31 |
+| 64 | 1 | 49.64 | 75.59 | 1.52 | 1.00 | 1.00 |
+| 64 | 4 | 192.66 | 289.16 | 1.50 | 3.88 | 3.83 |
+| 64 | 8 | 371.41 | 547.91 | 1.48 | 7.48 | 7.25 |
+
+Note: Mixed precision permits higher batch sizes during training. We report the maximum batch sizes (as powers of 2), which are allowed without gradient accumulation.
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
+| Batch size / GPU | GPUs | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 to mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
+|----:|----:|------:|-------:|-----:|-----:|-----:|
+| 16 | 1 | 10.71 | 27.87 | 2.60 | 1.00 | 1.00 |
+| 16 | 4 | 40.28 | 99.80 | 2.48 | 3.76 | 3.58 |
+| 16 | 8 | 78.23 | 193.89 | 2.48 | 7.30 | 6.96 |
+
+Note: Mixed precision permits higher batch sizes during training. We report the maximum batch sizes (as powers of 2), which are allowed without gradient accumulation.
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+##### Training performance: NVIDIA DGX-1 (8x V100 32GB)
+| Batch size / GPU | GPUs | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 to mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
+|----:|----:|------:|-------:|-----:|-----:|-----:|
+| 32 | 1 | 12.22 | 34.08 | 2.79 | 1.00 | 1.00 |
+| 32 | 4 | 46.97 | 128.39 | 2.73 | 3.84 | 3.77 |
+| 32 | 8 | 92.44 | 249.00 | 2.69 | 7.57 | 7.31 |
+| 64 | 1 | N/A | 39.30 | N/A | N/A | 1.00 |
+| 64 | 4 | N/A | 150.18 | N/A | N/A | 3.82 |
+| 64 | 8 | N/A | 282.68 | N/A | N/A | 7.19 |
+
+Note: Mixed precision permits higher batch sizes during training. We report the maximum batch sizes (as powers of 2), which are allowed without gradient accumulation.
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+##### Training performance: NVIDIA DGX-2 (16x V100 32GB)
+| Batch size / GPU | GPUs | Throughput - FP32 | Throughput - mixed precision | Throughput speedup (FP32 to mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
+|----:|----:|-------:|-------:|-----:|------:|------:|
+| 32 | 1 | 13.46 | 38.94 | 2.89 | 1.00 | 1.00 |
+| 32 | 4 | 51.38 | 143.44 | 2.79 | 3.82 | 3.68 |
+| 32 | 8 | 100.54 | 280.48 | 2.79 | 7.47 | 7.20 |
+| 32 | 16 | 188.14 | 515.90 | 2.74 | 13.98 | 13.25 |
+| 64 | 1 | N/A | 43.86 | N/A | N/A | 1.00 |
+| 64 | 4 | N/A | 165.27 | N/A | N/A | 3.77 |
+| 64 | 8 | N/A | 318.10 | N/A | N/A | 7.25 |
+| 64 | 16 | N/A | 567.47 | N/A | N/A | 12.94 |
+
+Note: Mixed precision permits higher batch sizes during training. We report the maximum batch sizes (as powers of 2), which are allowed without gradient accumulation.
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+
+#### Inference performance results
+Our results were obtained by running the `scripts/inference_benchmark.sh` script in the PyTorch 20.10-py3 NGC container on NVIDIA DGX A100, DGX-1, DGX-2 and T4 on a single GPU. Performance numbers (latency in milliseconds per batch) were averaged over 500 iterations.
+
+##### Inference performance: NVIDIA DGX A100 (1x A100 80GB)
+| | | FP16 Latency (ms) Percentiles | | | | TF32 Latency (ms) Percentiles | | | | FP16/TF32 speed up |
+|-----:|---------------:|------:|------:|------:|------:|------:|------:|-------:|------:|------:|
+| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
+| 1 | 2.0 | 32.40 | 32.50 | 32.82 | 32.30 | 33.30 | 33.64 | 34.65 | 33.25 | 1.03 |
+| 2 | 2.0 | 32.90 | 33.51 | 34.35 | 32.69 | 34.48 | 34.65 | 35.66 | 34.27 | 1.05 |
+| 4 | 2.0 | 32.85 | 33.01 | 33.89 | 32.60 | 34.09 | 34.46 | 35.22 | 34.00 | 1.04 |
+| 8 | 2.0 | 35.51 | 35.89 | 37.10 | 35.33 | 34.86 | 35.36 | 36.08 | 34.45 | 0.98 |
+| 16 | 2.0 | 36.00 | 36.57 | 37.40 | 35.77 | 43.83 | 44.12 | 44.77 | 43.39 | 1.21 |
+| 1 | 7.0 | 33.50 | 33.99 | 34.91 | 33.03 | 33.83 | 34.25 | 34.95 | 33.70 | 1.02 |
+| 2 | 7.0 | 34.43 | 34.89 | 35.72 | 34.22 | 34.41 | 34.73 | 35.69 | 34.28 | 1.00 |
+| 4 | 7.0 | 34.30 | 34.59 | 35.43 | 34.07 | 37.95 | 38.18 | 38.87 | 37.55 | 1.10 |
+| 8 | 7.0 | 35.98 | 36.28 | 37.11 | 35.28 | 44.64 | 44.79 | 45.37 | 44.29 | 1.26 |
+| 16 | 7.0 | 39.86 | 40.08 | 41.16 | 39.33 | 55.17 | 55.46 | 57.24 | 54.56 | 1.39 |
+| 1 | 16.7 | 35.20 | 35.80 | 38.71 | 34.36 | 35.36 | 35.76 | 36.55 | 34.64 | 1.01 |
+| 2 | 16.7 | 35.40 | 35.81 | 36.50 | 34.76 | 36.34 | 36.53 | 37.40 | 35.87 | 1.03 |
+| 4 | 16.7 | 36.01 | 36.38 | 37.37 | 35.57 | 44.69 | 45.09 | 45.88 | 43.92 | 1.23 |
+| 8 | 16.7 | 41.48 | 41.78 | 44.22 | 40.69 | 58.57 | 58.74 | 59.62 | 58.11 | 1.43 |
+| 16 | 16.7 | 61.37 | 61.93 | 66.32 | 60.92 | 97.33 | 97.71 | 100.04 | 96.56 | 1.59 |
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
+| | | FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
+|-----:|---------------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|
+| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
+| 1 | 2.0 | 45.42 | 45.62 | 49.54 | 45.02 | 48.83 | 48.99 | 51.66 | 48.44 | 1.08 |
+| 2 | 2.0 | 50.31 | 50.53 | 53.66 | 49.10 | 49.87 | 50.04 | 52.99 | 49.41 | 1.01 |
+| 4 | 2.0 | 49.17 | 49.48 | 52.13 | 48.73 | 52.92 | 53.21 | 55.28 | 52.31 | 1.07 |
+| 8 | 2.0 | 51.20 | 51.40 | 52.32 | 49.01 | 73.02 | 73.30 | 75.00 | 71.99 | 1.47 |
+| 16 | 2.0 | 51.75 | 52.24 | 56.36 | 51.27 | 83.99 | 84.57 | 86.69 | 83.24 | 1.62 |
+| 1 | 7.0 | 48.13 | 48.53 | 50.95 | 46.78 | 48.52 | 48.75 | 50.89 | 48.01 | 1.03 |
+| 2 | 7.0 | 49.52 | 50.10 | 52.35 | 48.00 | 65.27 | 65.41 | 66.59 | 64.79 | 1.35 |
+| 4 | 7.0 | 51.75 | 52.01 | 54.39 | 50.38 | 93.75 | 94.77 | 97.04 | 92.27 | 1.83 |
+| 8 | 7.0 | 54.80 | 56.27 | 66.23 | 52.95 | 130.65 | 131.09 | 132.91 | 129.82 | 2.45 |
+| 16 | 7.0 | 73.02 | 73.42 | 75.83 | 71.96 | 157.53 | 158.20 | 160.73 | 155.51 | 2.16 |
+| 1 | 16.7 | 48.10 | 48.52 | 52.71 | 47.20 | 73.34 | 73.56 | 74.19 | 72.69 | 1.54 |
+| 2 | 16.7 | 64.21 | 64.52 | 65.56 | 56.06 | 129.48 | 129.97 | 131.78 | 126.36 | 2.25 |
+| 4 | 16.7 | 60.38 | 61.03 | 63.18 | 58.87 | 183.33 | 183.85 | 185.53 | 181.90 | 3.09 |
+| 8 | 16.7 | 85.88 | 86.34 | 87.70 | 84.46 | 227.42 | 228.21 | 229.63 | 225.71 | 2.67 |
+| 16 | 16.7 | 135.62 | 136.40 | 137.69 | 131.58 | 276.90 | 277.59 | 281.16 | 275.08 | 2.09 |
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+##### Inference performance: NVIDIA DGX-1 (1x V100 32GB)
+| | | FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
+|-----:|---------------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|
+| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
+| 1 | 2.0 | 52.74 | 53.01 | 54.40 | 51.47 | 55.97 | 56.22 | 57.93 | 54.93 | 1.07 |
+| 2 | 2.0 | 51.77 | 52.15 | 54.69 | 50.98 | 56.58 | 56.87 | 58.88 | 55.35 | 1.09 |
+| 4 | 2.0 | 51.41 | 51.76 | 53.47 | 50.55 | 61.56 | 61.87 | 63.81 | 60.74 | 1.20 |
+| 8 | 2.0 | 51.83 | 52.15 | 54.08 | 50.85 | 80.20 | 80.69 | 81.67 | 77.69 | 1.53 |
+| 16 | 2.0 | 70.48 | 70.96 | 72.11 | 62.98 | 93.00 | 93.44 | 94.17 | 89.05 | 1.41 |
+| 1 | 7.0 | 49.77 | 50.21 | 51.88 | 48.73 | 52.74 | 52.99 | 54.54 | 51.67 | 1.06 |
+| 2 | 7.0 | 51.12 | 51.47 | 52.84 | 49.98 | 65.33 | 65.63 | 67.07 | 64.64 | 1.29 |
+| 4 | 7.0 | 53.13 | 53.56 | 55.68 | 52.15 | 93.54 | 93.85 | 94.72 | 92.76 | 1.78 |
+| 8 | 7.0 | 57.67 | 58.07 | 59.89 | 56.41 | 133.93 | 134.18 | 134.88 | 133.15 | 2.36 |
+| 16 | 7.0 | 76.09 | 76.48 | 79.13 | 75.27 | 162.35 | 162.77 | 164.63 | 161.30 | 2.14 |
+| 1 | 16.7 | 54.78 | 55.29 | 56.83 | 52.51 | 75.37 | 76.27 | 78.05 | 74.32 | 1.42 |
+| 2 | 16.7 | 56.80 | 57.20 | 59.01 | 55.49 | 130.60 | 131.36 | 132.93 | 128.55 | 2.32 |
+| 4 | 16.7 | 64.19 | 64.84 | 66.47 | 62.87 | 188.09 | 188.76 | 190.07 | 185.76 | 2.95 |
+| 8 | 16.7 | 87.46 | 87.86 | 89.99 | 86.47 | 232.33 | 232.89 | 234.43 | 230.44 | 2.67 |
+| 16 | 16.7 | 136.02 | 136.52 | 139.44 | 134.78 | 283.87 | 284.59 | 286.70 | 282.01 | 2.09 |
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+##### Inference performance: NVIDIA DGX-2 (1x V100 32GB)
+| | | FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
+|-----:|---------------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|
+| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
+| 1 | 2.0 | 35.88 | 36.12 | 39.80 | 35.20 | 42.95 | 43.67 | 46.65 | 42.23 | 1.20 |
+| 2 | 2.0 | 36.36 | 36.57 | 40.97 | 35.60 | 41.83 | 42.21 | 45.60 | 40.97 | 1.15 |
+| 4 | 2.0 | 36.69 | 36.89 | 41.25 | 36.05 | 48.35 | 48.52 | 52.35 | 47.80 | 1.33 |
+| 8 | 2.0 | 37.49 | 37.70 | 41.37 | 36.88 | 65.41 | 65.64 | 66.50 | 64.96 | 1.76 |
+| 16 | 2.0 | 41.35 | 41.79 | 45.58 | 40.91 | 77.22 | 77.51 | 79.48 | 76.54 | 1.87 |
+| 1 | 7.0 | 36.07 | 36.55 | 40.31 | 35.62 | 39.52 | 39.84 | 43.07 | 38.93 | 1.09 |
+| 2 | 7.0 | 37.42 | 37.66 | 41.36 | 36.79 | 55.94 | 56.19 | 58.33 | 55.60 | 1.51 |
+| 4 | 7.0 | 38.51 | 38.95 | 42.55 | 37.98 | 86.62 | 87.08 | 87.50 | 86.20 | 2.27 |
+| 8 | 7.0 | 42.82 | 43.00 | 47.11 | 42.55 | 122.05 | 122.29 | 122.70 | 121.59 | 2.86 |
+| 16 | 7.0 | 67.74 | 67.92 | 69.05 | 65.69 | 149.92 | 150.16 | 151.03 | 149.49 | 2.28 |
+| 1 | 16.7 | 39.28 | 39.78 | 43.34 | 38.35 | 66.73 | 67.16 | 69.80 | 66.01 | 1.72 |
+| 2 | 16.7 | 43.05 | 43.42 | 47.18 | 42.43 | 120.04 | 121.12 | 123.32 | 118.14 | 2.78 |
+| 4 | 16.7 | 52.18 | 52.49 | 56.11 | 51.63 | 176.09 | 176.51 | 178.70 | 174.60 | 3.38 |
+| 8 | 16.7 | 78.55 | 78.79 | 81.66 | 78.04 | 216.19 | 216.68 | 217.63 | 214.48 | 2.75 |
+| 16 | 16.7 | 125.57 | 125.92 | 128.78 | 124.33 | 264.11 | 264.49 | 266.14 | 262.80 | 2.11 |
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+##### Inference performance: NVIDIA T4
+| | | FP16 Latency (ms) Percentiles | | | | FP32 Latency (ms) Percentiles | | | | FP16/FP32 speed up |
+|-----:|---------------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|------:|
+| BS | Duration (s) | 90% | 95% | 99% | Avg | 90% | 95% | 99% | Avg | Avg |
+| 1 | 2.0 | 43.62 | 46.95 | 50.46 | 37.23 | 51.31 | 52.37 | 56.21 | 49.77 | 1.34 |
+| 2 | 2.0 | 49.09 | 50.46 | 53.11 | 40.61 | 81.85 | 82.22 | 83.94 | 80.81 | 1.99 |
+| 4 | 2.0 | 47.71 | 51.14 | 55.09 | 41.29 | 112.56 | 115.13 | 118.56 | 111.60 | 2.70 |
+| 8 | 2.0 | 51.37 | 53.11 | 55.48 | 45.94 | 198.95 | 199.48 | 200.28 | 197.22 | 4.29 |
+| 16 | 2.0 | 63.59 | 64.30 | 66.90 | 61.77 | 221.75 | 222.07 | 223.22 | 220.09 | 3.56 |
+| 1 | 7.0 | 47.49 | 48.66 | 53.36 | 40.76 | 73.63 | 74.41 | 77.65 | 72.41 | 1.78 |
+| 2 | 7.0 | 48.63 | 50.01 | 58.35 | 43.44 | 114.66 | 115.28 | 117.63 | 112.41 | 2.59 |
+| 4 | 7.0 | 52.19 | 52.85 | 54.22 | 49.94 | 200.38 | 201.29 | 202.97 | 197.21 | 3.95 |
+| 8 | 7.0 | 84.90 | 85.56 | 87.52 | 83.41 | 404.00 | 404.72 | 405.70 | 400.25 | 4.80 |
+| 16 | 7.0 | 157.12 | 157.58 | 159.19 | 155.01 | 490.93 | 492.09 | 493.44 | 486.45 | 3.14 |
+| 1 | 16.7 | 50.57 | 51.57 | 57.58 | 46.27 | 150.39 | 151.84 | 153.54 | 147.31 | 3.18 |
+| 2 | 16.7 | 63.64 | 64.55 | 66.31 | 61.98 | 256.54 | 258.16 | 262.71 | 250.34 | 4.04 |
+| 4 | 16.7 | 140.44 | 141.06 | 142.00 | 138.14 | 519.59 | 521.41 | 523.86 | 512.74 | 3.71 |
+| 8 | 16.7 | 267.03 | 268.06 | 270.01 | 263.15 | 727.33 | 728.61 | 731.36 | 722.62 | 2.75 |
+| 16 | 16.7 | 362.40 | 364.02 | 367.80 | 358.75 | 867.92 | 869.19 | 871.46 | 860.37 | 2.40 |
+
+To achieve these same results, follow the [Quick Start Guide](#quick-start-guide) outlined above.
+
+## Release notes
+We are constantly refining and improving our performance on AI and HPC workloads, even on the same hardware, with frequent updates to our software stack. For our latest performance data, please refer to these pages for AI and HPC benchmarks.
+
+### Changelog
+February 2021
+* Added DALI data-processing pipeline for on-the-fly data processing and augmentation on CPU or GPU
+* Revised training recipe: ~10% relative improvement in Word Error Rate (WER)
+* Updated Triton scripts for compatibility with Triton V2 API, updated Triton inference results
+* Refactored codebase
+* Updated performance results for the PyTorch 20.10-py3 NGC container
+
+June 2020
+* Updated performance tables to include A100 results
+
+December 2019
+* Inference support for TRT 6 with dynamic shapes
+* Inference support for TensorRT Inference Server with acoustic model backends in ONNX, PyTorch JIT, TensorRT
+* Jupyter notebook for inference with TensorRT Inference Server
+
+November 2019
+* Google Colab notebook for inference with native TensorRT
+
+September 2019
+* Inference support for TensorRT 6 with static shapes
+* Jupyter notebook for inference
+
+August 2019
+* Initial release
+
+### Known issues
+There are no known issues in this release.
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.gitignore b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.gitignore
new file mode 100644
index 0000000000..d30f85c34f
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.gitignore
@@ -0,0 +1,147 @@
+apex.egg-info
+dist
+build
+docs/build
+*~
+__pycache__
+.vscode
+
+# Copied from https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.gitmodules b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.gitmodules
new file mode 100644
index 0000000000..6479428db0
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.gitmodules
@@ -0,0 +1,7 @@
+[submodule "apex/contrib/csrc/multihead_attn/cutlass"]
+ path = apex/contrib/csrc/multihead_attn/cutlass
+ url = https://github.com/NVIDIA/cutlass.git
+ branch = v1.2.0
+[submodule "apex/contrib/csrc/cudnn-frontend"]
+ path = apex/contrib/csrc/cudnn-frontend
+ url = https://github.com/NVIDIA/cudnn-frontend.git
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.nojekyll b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/.nojekyll
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/LICENSE b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/LICENSE
new file mode 100644
index 0000000000..3d1e9454ff
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/LICENSE
@@ -0,0 +1,11 @@
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/README.md b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/README.md
new file mode 100644
index 0000000000..a761def7b7
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/README.md
@@ -0,0 +1,146 @@
+# Introduction
+
+This repository holds NVIDIA-maintained utilities to streamline
+mixed precision and distributed training in Pytorch.
+Some of the code here will be included in upstream Pytorch eventually.
+The intention of Apex is to make up-to-date utilities available to
+users as quickly as possible.
+
+## Full API Documentation: [https://nvidia.github.io/apex](https://nvidia.github.io/apex)
+
+## [GTC 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/GTC_2019) and [Pytorch DevCon 2019](https://github.com/mcarilli/mixed_precision_references/tree/master/Pytorch_Devcon_2019) Slides
+
+# Contents
+
+## 1. Amp: Automatic Mixed Precision
+
+`apex.amp` is a tool to enable mixed precision training by changing only 3 lines of your script.
+Users can easily experiment with different pure and mixed precision training modes by supplying
+different flags to `amp.initialize`.
+
+[Webinar introducing Amp](https://info.nvidia.com/webinar-mixed-precision-with-pytorch-reg-page.html)
+(The flag `cast_batchnorm` has been renamed to `keep_batchnorm_fp32`).
+
+[API Documentation](https://nvidia.github.io/apex/amp.html)
+
+[Comprehensive Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
+
+[DCGAN example coming soon...](https://github.com/NVIDIA/apex/tree/master/examples/dcgan)
+
+[Moving to the new Amp API](https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users) (for users of the deprecated "Amp" and "FP16_Optimizer" APIs)
+
+## 2. Distributed Training
+
+`apex.parallel.DistributedDataParallel` is a module wrapper, similar to
+`torch.nn.parallel.DistributedDataParallel`. It enables convenient multiprocess distributed training,
+optimized for NVIDIA's NCCL communication library.
+
+[API Documentation](https://nvidia.github.io/apex/parallel.html)
+
+[Python Source](https://github.com/NVIDIA/apex/tree/master/apex/parallel)
+
+[Example/Walkthrough](https://github.com/NVIDIA/apex/tree/master/examples/simple/distributed)
+
+The [Imagenet example](https://github.com/NVIDIA/apex/tree/master/examples/imagenet)
+shows use of `apex.parallel.DistributedDataParallel` along with `apex.amp`.
+
+### Synchronized Batch Normalization
+
+`apex.parallel.SyncBatchNorm` extends `torch.nn.modules.batchnorm._BatchNorm` to
+support synchronized BN.
+It allreduces stats across processes during multiprocess (DistributedDataParallel) training.
+Synchronous BN has been used in cases where only a small
+local minibatch can fit on each GPU.
+Allreduced stats increase the effective batch size for the BN layer to the
+global batch size across all processes (which, technically, is the correct
+formulation).
+Synchronous BN has been observed to improve converged accuracy in some of our research models.
+
+### Checkpointing
+
+To properly save and load your `amp` training, we introduce the `amp.state_dict()`, which contains all `loss_scalers` and their corresponding unskipped steps,
+as well as `amp.load_state_dict()` to restore these attributes.
+
+In order to get bitwise accuracy, we recommend the following workflow:
+```python
+# Initialization
+opt_level = 'O1'
+model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
+
+# Train your model
+...
+with amp.scale_loss(loss, optimizer) as scaled_loss:
+ scaled_loss.backward()
+...
+
+# Save checkpoint
+checkpoint = {
+ 'model': model.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'amp': amp.state_dict()
+}
+torch.save(checkpoint, 'amp_checkpoint.pt')
+...
+
+# Restore
+model = ...
+optimizer = ...
+checkpoint = torch.load('amp_checkpoint.pt')
+
+model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
+model.load_state_dict(checkpoint['model'])
+optimizer.load_state_dict(checkpoint['optimizer'])
+amp.load_state_dict(checkpoint['amp'])
+
+# Continue training
+...
+```
+
+Note that we recommend restoring the model using the same `opt_level`. Also note that we recommend calling the `load_state_dict` methods after `amp.initialize`.
+
+# Requirements
+
+Python 3
+
+CUDA 9 or newer
+
+PyTorch 0.4 or newer. The CUDA and C++ extensions require pytorch 1.0 or newer.
+
+We recommend the latest stable release, obtainable from
+[https://pytorch.org/](https://pytorch.org/). We also test against the latest master branch, obtainable from [https://github.com/pytorch/pytorch](https://github.com/pytorch/pytorch).
+
+It's often convenient to use Apex in Docker containers. Compatible options include:
+* [NVIDIA Pytorch containers from NGC](https://ngc.nvidia.com/catalog/containers/nvidia%2Fpytorch), which come with Apex preinstalled. To use the latest Amp API, you may need to `pip uninstall apex` then reinstall Apex using the **Quick Start** commands below.
+* [official Pytorch -devel Dockerfiles](https://hub.docker.com/r/pytorch/pytorch/tags), e.g. `docker pull pytorch/pytorch:nightly-devel-cuda10.0-cudnn7`, in which you can install Apex using the **Quick Start** commands.
+
+See the [Docker example folder](https://github.com/NVIDIA/apex/tree/master/examples/docker) for details.
+
+# Quick Start
+
+### Linux
+
+For performance and full functionality, we recommend installing Apex with
+CUDA and C++ extensions via
+```
+git clone https://github.com/NVIDIA/apex
+cd apex
+pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+```
+
+Apex also supports a Python-only build (required with Pytorch 0.4) via
+```
+pip install -v --disable-pip-version-check --no-cache-dir ./
+```
+A Python-only build omits:
+- Fused kernels required to use `apex.optimizers.FusedAdam`.
+- Fused kernels required to use `apex.normalization.FusedLayerNorm`.
+- Fused kernels that improve the performance and numerical stability of `apex.parallel.SyncBatchNorm`.
+- Fused kernels that improve the performance of `apex.parallel.DistributedDataParallel` and `apex.amp`.
+`DistributedDataParallel`, `amp`, and `SyncBatchNorm` will still be usable, but they may be slower.
+
+Pyprof support has been moved to its own [dedicated repository](https://github.com/NVIDIA/PyProf).
+The codebase is deprecated in Apex and will be removed soon.
+
+### Windows support
+Windows support is experimental, and Linux is recommended. `pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .` may work if you were able to build Pytorch from source
+on your system. `pip install -v --no-cache-dir .` (without CUDA/C++ extensions) is more likely to work. If you installed Pytorch in a Conda environment, make sure to install Apex in that same environment.
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/README.md b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/README.md
new file mode 100644
index 0000000000..9e86fd8fc1
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/README.md
@@ -0,0 +1 @@
+Under construction...
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/RNNBackend.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/RNNBackend.py
new file mode 100644
index 0000000000..b9d4937efa
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/RNNBackend.py
@@ -0,0 +1,365 @@
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+
+import torch.nn.functional as F
+
+import math
+
+
+def is_iterable(maybe_iterable):
+ return isinstance(maybe_iterable, list) or isinstance(maybe_iterable, tuple)
+
+
+def flatten_list(tens_list):
+ """
+ flatten_list
+ """
+ if not is_iterable(tens_list):
+ return tens_list
+
+ return torch.cat(tens_list, dim=0).view(len(tens_list), *tens_list[0].size() )
+
+
+#These modules always assumes batch_first
+class bidirectionalRNN(nn.Module):
+ """
+ bidirectionalRNN
+ """
+ def __init__(self, inputRNN, num_layers=1, dropout = 0):
+ super(bidirectionalRNN, self).__init__()
+ self.dropout = dropout
+ self.fwd = stackedRNN(inputRNN, num_layers=num_layers, dropout = dropout)
+ self.bckwrd = stackedRNN(inputRNN.new_like(), num_layers=num_layers, dropout = dropout)
+ self.rnns = nn.ModuleList([self.fwd, self.bckwrd])
+
+ #collect hidden option will return all hidden/cell states from entire RNN
+ def forward(self, input, collect_hidden=False):
+ """
+ forward()
+ """
+ seq_len = input.size(0)
+ bsz = input.size(1)
+
+ fwd_out, fwd_hiddens = list(self.fwd(input, collect_hidden = collect_hidden))
+ bckwrd_out, bckwrd_hiddens = list(self.bckwrd(input, reverse=True, collect_hidden = collect_hidden))
+
+ output = torch.cat( [fwd_out, bckwrd_out], -1 )
+ hiddens = tuple( torch.cat(hidden, -1) for hidden in zip( fwd_hiddens, bckwrd_hiddens) )
+
+ return output, hiddens
+
+ def reset_parameters(self):
+ """
+ reset_parameters()
+ """
+ for rnn in self.rnns:
+ rnn.reset_parameters()
+
+ def init_hidden(self, bsz):
+ """
+ init_hidden()
+ """
+ for rnn in self.rnns:
+ rnn.init_hidden(bsz)
+
+ def detach_hidden(self):
+ """
+ detach_hidden()
+ """
+ for rnn in self.rnns:
+ rnn.detachHidden()
+
+ def reset_hidden(self, bsz):
+ """
+ reset_hidden()
+ """
+ for rnn in self.rnns:
+ rnn.reset_hidden(bsz)
+
+ def init_inference(self, bsz):
+ """
+ init_inference()
+ """
+ for rnn in self.rnns:
+ rnn.init_inference(bsz)
+
+
+#assumes hidden_state[0] of inputRNN is output hidden state
+#constructor either takes an RNNCell or list of RNN layers
+class stackedRNN(nn.Module):
+ """
+ stackedRNN
+ """
+ def __init__(self, inputRNN, num_layers=1, dropout=0):
+ super(stackedRNN, self).__init__()
+
+ self.dropout = dropout
+
+ if isinstance(inputRNN, RNNCell):
+ self.rnns = [inputRNN]
+ for i in range(num_layers-1):
+ self.rnns.append(inputRNN.new_like(inputRNN.output_size))
+ elif isinstance(inputRNN, list):
+ assert len(inputRNN) == num_layers, "RNN list length must be equal to num_layers"
+ self.rnns=inputRNN
+ else:
+ raise RuntimeError()
+
+ self.nLayers = len(self.rnns)
+
+ self.rnns = nn.ModuleList(self.rnns)
+
+
+ '''
+ Returns output as hidden_state[0] Tensor([sequence steps][batch size][features])
+ If collect hidden will also return Tuple(
+ [n_hidden_states][sequence steps] Tensor([layer][batch size][features])
+ )
+ If not collect hidden will also return Tuple(
+ [n_hidden_states] Tensor([layer][batch size][features])
+ '''
+ def forward(self, input, collect_hidden=False, reverse=False):
+ """
+ forward()
+ """
+ seq_len = input.size(0)
+ bsz = input.size(1)
+ inp_iter = reversed(range(seq_len)) if reverse else range(seq_len)
+
+ hidden_states = [[] for i in range(self.nLayers)]
+ outputs = []
+
+ for seq in inp_iter:
+ for layer in range(self.nLayers):
+
+ if layer == 0:
+ prev_out = input[seq]
+
+ outs = self.rnns[layer](prev_out)
+
+ if collect_hidden:
+ hidden_states[layer].append(outs)
+ elif seq == seq_len-1:
+ hidden_states[layer].append(outs)
+
+ prev_out = outs[0]
+
+ outputs.append(prev_out)
+
+ if reverse:
+ outputs = list(reversed(outputs))
+ '''
+ At this point outputs is in format:
+ list( [seq_length] x Tensor([bsz][features]) )
+ need to convert it to:
+ list( Tensor([seq_length][bsz][features]) )
+ '''
+ output = flatten_list(outputs)
+
+ '''
+ hidden_states at this point is in format:
+ list( [layer][seq_length][hidden_states] x Tensor([bsz][features]) )
+ need to convert it to:
+ For not collect hidden:
+ list( [hidden_states] x Tensor([layer][bsz][features]) )
+ For collect hidden:
+ list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
+ '''
+ if not collect_hidden:
+ seq_len = 1
+ n_hid = self.rnns[0].n_hidden_states
+ new_hidden = [ [ [ None for k in range(self.nLayers)] for j in range(seq_len) ] for i in range(n_hid) ]
+
+
+ for i in range(n_hid):
+ for j in range(seq_len):
+ for k in range(self.nLayers):
+ new_hidden[i][j][k] = hidden_states[k][j][i]
+
+ hidden_states = new_hidden
+ #Now in format list( [hidden_states][seq_length][layer] x Tensor([bsz][features]) )
+ #Reverse seq_length if reverse
+ if reverse:
+ hidden_states = list( list(reversed(list(entry))) for entry in hidden_states)
+
+ #flatten layer dimension into tensor
+        hiddens = list( list(
+            flatten_list(seq) for seq in hidden )
+                for hidden in hidden_states )
+
+        #Now in format list( [hidden_states][seq_length] x Tensor([layer][bsz][features]) )
+        #Remove seq_length dimension if not collect_hidden
+        if not collect_hidden:
+            hiddens = list( entry[0] for entry in hiddens )
+        return output, hiddens
+
+ def reset_parameters(self):
+ """
+ reset_parameters()
+ """
+ for rnn in self.rnns:
+ rnn.reset_parameters()
+
+ def init_hidden(self, bsz):
+ """
+ init_hidden()
+ """
+ for rnn in self.rnns:
+ rnn.init_hidden(bsz)
+
+ def detach_hidden(self):
+ """
+ detach_hidden()
+ """
+ for rnn in self.rnns:
+ rnn.detach_hidden()
+
+ def reset_hidden(self, bsz):
+ """
+ reset_hidden()
+ """
+ for rnn in self.rnns:
+ rnn.reset_hidden(bsz)
+
+ def init_inference(self, bsz):
+ """
+ init_inference()
+ """
+ for rnn in self.rnns:
+ rnn.init_inference(bsz)
+
+class RNNCell(nn.Module):
+    """
+    RNNCell
+    gate_multiplier depends on the architecture:
+    4 for LSTM-like cells, 3 for GRU-like cells.
+    Always assumes the input is NOT batch_first.
+    If output_size differs from hidden_size, an output projection (w_ho) is used.
+    n_hidden_states is the number of hidden states the cell needs:
+    if one, it is passed to the cell directly as a tensor; if more, as a list.
+    """
+ def __init__(self, gate_multiplier, input_size, hidden_size, cell, n_hidden_states = 2, bias = False, output_size = None):
+ super(RNNCell, self).__init__()
+
+ self.gate_multiplier = gate_multiplier
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.cell = cell
+ self.bias = bias
+ self.output_size = output_size
+ if output_size is None:
+ self.output_size = hidden_size
+
+ self.gate_size = gate_multiplier * self.hidden_size
+ self.n_hidden_states = n_hidden_states
+
+ self.w_ih = nn.Parameter(torch.Tensor(self.gate_size, self.input_size))
+ self.w_hh = nn.Parameter(torch.Tensor(self.gate_size, self.output_size))
+
+ #Check if there's recurrent projection
+ if(self.output_size != self.hidden_size):
+ self.w_ho = nn.Parameter(torch.Tensor(self.output_size, self.hidden_size))
+
+ self.b_ih = self.b_hh = None
+ if self.bias:
+ self.b_ih = nn.Parameter(torch.Tensor(self.gate_size))
+ self.b_hh = nn.Parameter(torch.Tensor(self.gate_size))
+
+ #hidden states for forward
+ self.hidden = [ None for states in range(self.n_hidden_states)]
+
+ self.reset_parameters()
+
+ def new_like(self, new_input_size=None):
+ """
+ new_like()
+ """
+ if new_input_size is None:
+ new_input_size = self.input_size
+
+ return type(self)(self.gate_multiplier,
+ new_input_size,
+ self.hidden_size,
+ self.cell,
+ self.n_hidden_states,
+ self.bias,
+ self.output_size)
+
+
+    #Currently initializes all parameters uniformly; a Xavier-based variant is kept commented out below.
+ def reset_parameters(self, gain=1):
+ """
+ reset_parameters()
+ """
+ stdev = 1.0 / math.sqrt(self.hidden_size)
+ for param in self.parameters():
+ param.data.uniform_(-stdev, stdev)
+ '''
+ Xavier reset:
+ def reset_parameters(self, gain=1):
+ stdv = 1.0 / math.sqrt(self.gate_size)
+
+ for param in self.parameters():
+ if (param.dim() > 1):
+ torch.nn.init.xavier_normal(param, gain)
+ else:
+ param.data.uniform_(-stdv, stdv)
+ '''
+ def init_hidden(self, bsz):
+ """
+ init_hidden()
+ """
+ for param in self.parameters():
+ if param is not None:
+ a_param = param
+ break
+
+ for i, _ in enumerate(self.hidden):
+ if(self.hidden[i] is None or self.hidden[i].data.size()[0] != bsz):
+
+ if i==0:
+ hidden_size = self.output_size
+ else:
+ hidden_size = self.hidden_size
+
+ tens = a_param.data.new(bsz, hidden_size).zero_()
+ self.hidden[i] = Variable(tens, requires_grad=False)
+
+
+ def reset_hidden(self, bsz):
+ """
+ reset_hidden()
+ """
+ for i, _ in enumerate(self.hidden):
+ self.hidden[i] = None
+ self.init_hidden(bsz)
+
+ def detach_hidden(self):
+ """
+ detach_hidden()
+ """
+ for i, _ in enumerate(self.hidden):
+ if self.hidden[i] is None:
+ raise RuntimeError("Must initialize hidden state before you can detach it")
+ for i, _ in enumerate(self.hidden):
+ self.hidden[i] = self.hidden[i].detach()
+
+ def forward(self, input):
+ """
+ forward()
+ if not inited or bsz has changed this will create hidden states
+ """
+ self.init_hidden(input.size()[0])
+
+ hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
+ self.hidden = self.cell(input, hidden_state, self.w_ih, self.w_hh, b_ih=self.b_ih, b_hh=self.b_hh)
+ if(self.n_hidden_states > 1):
+ self.hidden = list(self.hidden)
+ else:
+ self.hidden=[self.hidden]
+
+ if self.output_size != self.hidden_size:
+ self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
+
+ return tuple(self.hidden)
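The wrappers in this file expect sequence-first input and a cell callable with the signature `cell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None)`. A minimal usage sketch, assuming this checkout imports cleanly as `apex.RNN.RNNBackend` and using a made-up `tanh_cell` purely for illustration:

```python
import torch
import torch.nn.functional as F

# Assumption: this file is importable as apex.RNN.RNNBackend in your environment.
from apex.RNN.RNNBackend import RNNCell, stackedRNN

def tanh_cell(inp, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
    # Toy vanilla-RNN cell matching the expected cell signature.
    return torch.tanh(F.linear(inp, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

cell = RNNCell(gate_multiplier=1, input_size=8, hidden_size=16,
               cell=tanh_cell, n_hidden_states=1)
rnn = stackedRNN(cell, num_layers=2)

x = torch.randn(5, 3, 8)      # [seq_len, batch, features], sequence-first
output, hidden = rnn(x)
print(output.shape)           # torch.Size([5, 3, 16])
```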
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/__init__.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/__init__.py
new file mode 100644
index 0000000000..d706746669
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/__init__.py
@@ -0,0 +1,3 @@
+from .models import LSTM, GRU, ReLU, Tanh, mLSTM
+
+__all__ = ['models']
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/cells.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/cells.py
new file mode 100644
index 0000000000..32b61a1be1
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/cells.py
@@ -0,0 +1,84 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .RNNBackend import RNNCell
+
+from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
+
+import math
+
+
+class mLSTMRNNCell(RNNCell):
+ """
+ mLSTMRNNCell
+ """
+
+ def __init__(self, input_size, hidden_size, bias = False, output_size = None):
+ gate_multiplier = 4
+ super(mLSTMRNNCell, self).__init__(gate_multiplier, input_size, hidden_size, mLSTMCell, n_hidden_states = 2, bias = bias, output_size = output_size)
+
+ self.w_mih = nn.Parameter(torch.Tensor(self.output_size, self.input_size))
+ self.w_mhh = nn.Parameter(torch.Tensor(self.output_size, self.output_size))
+
+ self.reset_parameters()
+
+ def forward(self, input):
+ """
+ mLSTMRNNCell.forward()
+ """
+ #if not inited or bsz has changed this will create hidden states
+ self.init_hidden(input.size()[0])
+
+ hidden_state = self.hidden[0] if self.n_hidden_states == 1 else self.hidden
+
+ self.hidden = list(
+ self.cell(input, hidden_state, self.w_ih, self.w_hh, self.w_mih, self.w_mhh,
+ b_ih=self.b_ih, b_hh=self.b_hh)
+ )
+
+ if self.output_size != self.hidden_size:
+ self.hidden[0] = F.linear(self.hidden[0], self.w_ho)
+ return tuple(self.hidden)
+
+
+ def new_like(self, new_input_size=None):
+ if new_input_size is None:
+ new_input_size = self.input_size
+
+ return type(self)(
+ new_input_size,
+ self.hidden_size,
+ self.bias,
+ self.output_size)
+
+def mLSTMCell(input, hidden, w_ih, w_hh, w_mih, w_mhh, b_ih=None, b_hh=None):
+ """
+ mLSTMCell
+ """
+
+ if input.is_cuda:
+ igates = F.linear(input, w_ih)
+ m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
+ hgates = F.linear(m, w_hh)
+
+ state = fusedBackend.LSTMFused.apply
+ return state(igates, hgates, hidden[1], b_ih, b_hh)
+
+ hx, cx = hidden
+
+ m = F.linear(input, w_mih) * F.linear(hidden[0], w_mhh)
+ gates = F.linear(input, w_ih, b_ih) + F.linear(m, w_hh, b_hh)
+
+ ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)
+
+ ingate = F.sigmoid(ingate)
+ forgetgate = F.sigmoid(forgetgate)
+ cellgate = F.tanh(cellgate)
+ outgate = F.sigmoid(outgate)
+
+ cy = (forgetgate * cx) + (ingate * cellgate)
+ hy = outgate * F.tanh(cy)
+
+ return hy, cy
+
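The unfused branch above is the multiplicative-LSTM update from Krause et al. (2016): an intermediate state m = (W_mx x) * (W_mh h) modulates the recurrent contribution before the usual LSTM gating. A standalone sketch of the same math, with made-up tensor shapes for illustration only:

```python
import torch
import torch.nn.functional as F

bsz, in_dim, hid = 2, 8, 16
x  = torch.randn(bsz, in_dim)
hx = torch.randn(bsz, hid)
cx = torch.randn(bsz, hid)

w_ih, w_hh   = torch.randn(4 * hid, in_dim), torch.randn(4 * hid, hid)
w_mih, w_mhh = torch.randn(hid, in_dim), torch.randn(hid, hid)

m = F.linear(x, w_mih) * F.linear(hx, w_mhh)        # multiplicative state
gates = F.linear(x, w_ih) + F.linear(m, w_hh)
i, f, g, o = gates.chunk(4, 1)
cy = torch.sigmoid(f) * cx + torch.sigmoid(i) * torch.tanh(g)
hy = torch.sigmoid(o) * torch.tanh(cy)
print(hy.shape, cy.shape)                           # both torch.Size([2, 16])
```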
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/models.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/models.py
new file mode 100644
index 0000000000..dd7adce047
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/RNN/models.py
@@ -0,0 +1,54 @@
+import torch
+
+from torch.nn._functions.rnn import LSTMCell, RNNReLUCell, RNNTanhCell, GRUCell
+
+from .RNNBackend import bidirectionalRNN, stackedRNN, RNNCell
+from .cells import mLSTMRNNCell, mLSTMCell
+
+def toRNNBackend(inputRNN, num_layers, bidirectional=False, dropout = 0):
+ """
+ :class:`toRNNBackend`
+ """
+
+ if bidirectional:
+ return bidirectionalRNN(inputRNN, num_layers, dropout = dropout)
+ else:
+ return stackedRNN(inputRNN, num_layers, dropout = dropout)
+
+
+def LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+ """
+ :class:`LSTM`
+ """
+ inputRNN = RNNCell(4, input_size, hidden_size, LSTMCell, 2, bias, output_size)
+ return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
+
+def GRU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+ """
+ :class:`GRU`
+ """
+ inputRNN = RNNCell(3, input_size, hidden_size, GRUCell, 1, bias, output_size)
+ return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
+
+def ReLU(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+ """
+ :class:`ReLU`
+ """
+ inputRNN = RNNCell(1, input_size, hidden_size, RNNReLUCell, 1, bias, output_size)
+ return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
+
+def Tanh(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+ """
+ :class:`Tanh`
+ """
+ inputRNN = RNNCell(1, input_size, hidden_size, RNNTanhCell, 1, bias, output_size)
+ return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
+
+def mLSTM(input_size, hidden_size, num_layers, bias=True, batch_first=False, dropout=0, bidirectional=False, output_size = None):
+ """
+ :class:`mLSTM`
+ """
+ inputRNN = mLSTMRNNCell(input_size, hidden_size, bias=bias, output_size=output_size)
+ return toRNNBackend(inputRNN, num_layers, bidirectional, dropout=dropout)
+
+
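A minimal sketch of the factories above. Note that the imports at the top of this file (e.g. `torch.nn._functions.rnn`) only exist in older PyTorch releases, and that the `batch_first` argument is accepted but not honored, so inputs must be sequence-first:

```python
import torch
from apex.RNN import LSTM   # assumption: this package imports in your (legacy-PyTorch) environment

rnn = LSTM(input_size=32, hidden_size=64, num_layers=2, bias=True)

x = torch.randn(10, 4, 32)   # [seq_len, batch, features]
output, hidden = rnn(x)
print(output.shape)          # torch.Size([10, 4, 64])
```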
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/__init__.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/__init__.py
new file mode 100644
index 0000000000..42f8898f56
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/__init__.py
@@ -0,0 +1,20 @@
+# May help avoid undefined symbol errors https://pytorch.org/cppdocs/notes/faq.html#undefined-symbol-errors-from-pytorch-aten
+import torch
+
+if torch.distributed.is_available():
+ from . import parallel
+
+from . import amp
+from . import fp16_utils
+
+# For optimizers and normalization there is no Python fallback.
+# Absence of cuda backend is a hard error.
+# I would like the errors from importing fused_adam_cuda or fused_layer_norm_cuda
+# to be triggered lazily, because if someone has installed with --cpp_ext and --cuda_ext
+# (so they expect those backends to be available) but for some reason they actually aren't
+# (for example because they built improperly in a way that isn't revealed until
+# load time), the error message is timely and visible.
+from . import optimizers
+from . import normalization
+from . import pyprof
+from . import transformer
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/_autocast_utils.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/_autocast_utils.py
new file mode 100644
index 0000000000..94f8f91156
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/_autocast_utils.py
@@ -0,0 +1,17 @@
+from typing import Optional
+
+import torch
+
+
+def _get_current_dtype(dtype: Optional[torch.dtype] = None) -> torch.dtype:
+    if not torch.is_autocast_enabled():
+        # Fall back to the explicitly requested dtype, defaulting to FP32.
+        return dtype or torch.float
+    else:
+        return torch.get_autocast_gpu_dtype()
+
+
+def _cast_if_autocast_enabled(*args):
+ if not torch.is_autocast_enabled():
+ return args
+ else:
+ return torch.cuda.amp.autocast_mode._cast(args, torch.get_autocast_gpu_dtype())
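These helpers are typically used to make a custom (e.g. fused) op behave well under `torch.cuda.amp.autocast`: cast the incoming tensors to the current autocast dtype once, then run the op with autocast disabled so no further casts happen inside it. A hedged sketch of that pattern; `my_op` below is a stand-in, not a function from this patch:

```python
import torch
from apex._autocast_utils import _cast_if_autocast_enabled

def my_op(x, weight):
    # Stand-in for a custom/fused kernel that expects uniformly typed inputs.
    return x @ weight.t()

def my_op_autocast_safe(x, weight):
    # Cast once to the autocast dtype (if autocast is active), then run without autocast.
    args = _cast_if_autocast_enabled(x, weight)
    with torch.cuda.amp.autocast(enabled=False):
        return my_op(*args)
```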
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/README.md b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/README.md
new file mode 100644
index 0000000000..a87b5010e3
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/README.md
@@ -0,0 +1,72 @@
+# amp: Automatic Mixed Precision
+
+## Annotating User Functions
+
+Nearly all PyTorch user code needs nothing more than calling
+`amp.initialize` and wrapping the backward pass with `amp.scale_loss`
+to use amp. After all, custom layers are built out of simpler
+PyTorch components, and amp can already see those.
+
+However, any custom C++ or CUDA code is outside of amp's (default)
+view of things. For example, suppose I implemented a new recurrent
+cell called a "forgetful recurrent unit" that calls directly into a
+CUDA backend:
+
+```python
+from backend import FRUBackend
+
+def fru(input, hidden, weight, bias):
+ # call to CUDA code
+ FRUBackend(input, hidden, weight, bias)
+```
+
+In this case, it is possible to get a runtime type mismatch. For
+example, you might have `input` in fp16, and `weight` in fp32, and amp
+doesn't have the visibility to insert an appropriate cast.
+
+amp exposes two ways to handle "invisible" backend code: function
+annotations and explicit registration.
+
+#### Function annotation
+
+The first way to handle backend code is a set of function annotations:
+
+- `@amp.half_function`
+- `@amp.float_function`
+- `@amp.promote_function`
+
+These correspond to:
+
+- Cast all arguments to fp16
+- Cast all arguments to fp32
+- If there are any type mismatches, cast everything to the widest type
+
+In our example, we believe that the FRU unit is fp16-safe and will get
+performance gains from casting its arguments to fp16, so we write:
+
+```python
+@amp.half_function
+def fru(input, hidden, weight, bias):
+ #...
+```
+
+#### Explicit registration
+
+The other way to handle backend code is with explicit function
+registration:
+
+- `amp.register_half_function(module, function_name)`
+- `amp.register_float_function(module, function_name)`
+- `amp.register_promote_function(module, function_name)`
+
+When using this API, `module` is the containing class or module for
+the function, and `function_name` is the _string_ name of the
+function. Note that the function must be registered before the call to
+`amp.initialize()`.
+
+For our FRU unit, we can register the backend function directly:
+
+```python
+import backend
+
+amp.register_half_function(backend, 'FRUBackend')
+```
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/__init__.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/__init__.py
new file mode 100644
index 0000000000..34d080a69e
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/__init__.py
@@ -0,0 +1,5 @@
+from .amp import init, half_function, float_function, promote_function,\
+ register_half_function, register_float_function, register_promote_function
+from .handle import scale_loss, disable_casts
+from .frontend import initialize, state_dict, load_state_dict
+from ._amp_state import master_params, _amp_state
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/__version__.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/__version__.py
new file mode 100644
index 0000000000..3a83701b29
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/__version__.py
@@ -0,0 +1,2 @@
+VERSION = (0, 1, 0)
+__version__ = '.'.join(map(str, VERSION))
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_amp_state.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_amp_state.py
new file mode 100644
index 0000000000..1ac9d31160
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_amp_state.py
@@ -0,0 +1,69 @@
+# This is a "header object" that allows different amp modules to communicate.
+# I'm a C++ guy, not a python guy. I decided this approach because it seemed most C++-like.
+# But apparently it's ok:
+# http://effbot.org/pyfaq/how-do-i-share-global-variables-across-modules.htm
+import os
+import torch
+
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+
+if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
+ from torch._six import container_abcs
+else:
+ import collections.abc as container_abcs
+
+
+class AmpState(object):
+ def __init__(self):
+ self.hard_override=False
+ self.allow_incoming_model_not_fp32 = False
+ self.verbosity=1
+
+
+# Attribute stash. Could also just stash things as global module attributes.
+_amp_state = AmpState()
+
+
+def warn_or_err(msg):
+ if _amp_state.hard_override:
+ print("Warning: " + msg)
+ else:
+ raise RuntimeError(msg)
+ # I'm not sure if allowing hard_override is a good idea.
+ # + " If you're sure you know what you're doing, supply " +
+ # "hard_override=True to amp.initialize.")
+
+
+def maybe_print(msg, rank0=False):
+ distributed = torch.distributed.is_available() and \
+ torch.distributed.is_initialized() and \
+ torch.distributed.get_world_size() > 1
+ if _amp_state.verbosity > 0:
+ if rank0:
+ if distributed:
+ if torch.distributed.get_rank() == 0:
+ print(msg)
+ else:
+ print(msg)
+ else:
+ print(msg)
+
+
+# def iter_params(param_groups):
+# for group in param_groups:
+# for p in group['params']:
+# yield p
+
+
+def master_params(optimizer):
+ """
+ Generator expression that iterates over the params owned by ``optimizer``.
+
+ Args:
+ optimizer: An optimizer previously returned from ``amp.initialize``.
+ """
+ for group in optimizer.param_groups:
+ for p in group['params']:
+ yield p
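`master_params` is the intended way to reach the FP32 master copies when gradients need to be touched between backward and step, gradient clipping being the usual case. A sketch of that pattern; `model`, `optimizer`, and `loss` are placeholders and are assumed to have already gone through `amp.initialize`:

```python
import torch
from apex import amp

# model/optimizer/loss are placeholders; amp.initialize(...) has already been called.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()

# Clip the FP32 master gradients, not the (possibly FP16) model gradients.
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm=1.0)
optimizer.step()
```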
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_initialize.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_initialize.py
new file mode 100644
index 0000000000..28c5bbbdfe
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_initialize.py
@@ -0,0 +1,263 @@
+import torch
+from torch._six import string_classes
+import functools
+import numpy as np
+import sys
+from types import MethodType
+import warnings
+from ._amp_state import _amp_state, warn_or_err, container_abcs
+from .handle import disable_casts
+from .scaler import LossScaler
+from ._process_optimizer import _process_optimizer
+from apex.fp16_utils import convert_network
+from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
+from ..contrib.optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
+
+if torch.distributed.is_available():
+ from ..parallel import DistributedDataParallel as apex_DDP
+ from ..parallel.LARC import LARC
+
+
+def to_type(dtype, t):
+ if isinstance(t, torch.Tensor):
+ if not t.is_cuda:
+ # This should not be a hard error, since it may be legitimate.
+ warnings.warn("An input tensor was not cuda.")
+ # GANs require this.
+ # if t.requires_grad:
+ # warn_or_err("input data requires grad. Since input data is not a model parameter,\n"
+ # "its gradients will not be properly allreduced by DDP.")
+ if t.is_floating_point():
+ return t.to(dtype)
+ return t
+ else:
+ # Trust the user's custom batch type, that's all I can do here.
+ return t.to(dtype)
+
+
+# Modified from torch.optim.optimizer.py. This is a bit more general than casted_args in utils.py.
+def applier(value, fn):
+ if isinstance(value, torch.Tensor):
+ return fn(value)
+ elif isinstance(value, string_classes):
+ return value
+ elif isinstance(value, np.ndarray):
+ return value
+ elif hasattr(value, "to"): # Allow handling of custom batch classes
+ return fn(value)
+ elif isinstance(value, container_abcs.Mapping):
+ return {applier(k, fn) : applier(v, fn) for k, v in value.items()}
+ elif isinstance(value, container_abcs.Iterable):
+ return type(value)(applier(v, fn) for v in value)
+ else:
+ # Do I want this to fire off even if someone chooses to pass something ordinary like
+ # an int or float? May be more annoying than it's worth.
+ # print("Warning: unrecognized type in applier. If your input data is a custom class, "
+ # "provide it with a .to(dtype) method which converts its floating-point Tensors to dtype. "
+ # "Amp will check for your custom to() and invoke it to cast the batch's "
+ # "floating-point Tensors to the appropriate type. "
+ # "Also, if your data is a custom class, it is your responsibility to ensure that "
+ # "any Tensors you want to be cuda are already cuda."
+ return value
+
+
+def check_models(models):
+ for model in models:
+ parallel_type = None
+ if isinstance(model, torch.nn.parallel.DistributedDataParallel):
+ parallel_type = "torch.nn.parallel.DistributedDataParallel"
+ if ('apex_DDP' in sys.modules) and isinstance(model, apex_DDP):
+ parallel_type = "apex.parallel.DistributedDataParallel"
+ if isinstance(model, torch.nn.parallel.DataParallel):
+ parallel_type = "torch.nn.parallel.DataParallel"
+ if parallel_type is not None:
+ raise RuntimeError("Incoming model is an instance of {}. ".format(parallel_type) +
+ "Parallel wrappers should only be applied to the model(s) AFTER \n"
+ "the model(s) have been returned from amp.initialize.")
+
+
+def check_params_fp32(models):
+ for model in models:
+ for name, param in model.named_parameters():
+ if param.is_floating_point():
+ if 'Half' in param.type():
+ warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
+ "When using amp.initialize, you do not need to call .half() on your model\n"
+ "before passing it, no matter what optimization level you choose.".format(
+ name, param.type()))
+ elif not param.is_cuda:
+ warn_or_err("Found param {} with type {}, expected torch.cuda.FloatTensor.\n"
+ "When using amp.initialize, you need to provide a model with parameters\n"
+ "located on a CUDA device before passing it no matter what optimization level\n"
+ "you chose. Use model.to('cuda') to use the default device.".format(
+ name, param.type()))
+
+ # Backward compatibility for PyTorch 0.4
+ if hasattr(model, 'named_buffers'):
+ buf_iter = model.named_buffers()
+ else:
+ buf_iter = model._buffers
+ for obj in buf_iter:
+ if type(obj)==tuple:
+ name, buf = obj
+ else:
+ name, buf = obj, buf_iter[obj]
+ if buf.is_floating_point():
+ if 'Half' in buf.type():
+ warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
+ "When using amp.initialize, you do not need to call .half() on your model\n"
+ "before passing it, no matter what optimization level you choose.".format(
+ name, buf.type()))
+ elif not buf.is_cuda:
+ warn_or_err("Found buffer {} with type {}, expected torch.cuda.FloatTensor.\n"
+ "When using amp.initialize, you need to provide a model with buffers\n"
+ "located on a CUDA device before passing it no matter what optimization level\n"
+ "you chose. Use model.to('cuda') to use the default device.".format(
+ name, buf.type()))
+
+
+def check_optimizers(optimizers):
+ for optim in optimizers:
+ bad_optim_type = None
+ if isinstance(optim, FP16_Optimizer_general):
+ bad_optim_type = "apex.fp16_utils.FP16_Optimizer"
+ if isinstance(optim, FP16_Optimizer_for_fused):
+ bad_optim_type = "apex.optimizers.FP16_Optimizer"
+ if bad_optim_type is not None:
+ raise RuntimeError("An incoming optimizer is an instance of {}. ".format(bad_optim_type) +
+ "The optimizer(s) passed to amp.initialize() must be bare \n"
+ "instances of either ordinary Pytorch optimizers, or Apex fused \n"
+ "optimizers.\n")
+
+
+class O2StateDictHook(object):
+ def __init__(self, fn):
+ self.fn = fn
+
+ def __call__(self, module, state_dict, prefix, local_metadata):
+ for key in state_dict:
+ param = state_dict[key]
+ if 'Half' in param.type():
+ param = param.to(torch.float32)
+ state_dict[key] = param
+
+
+def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
+ from .amp import init as amp_init
+
+ optimizers_was_list = False
+ if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
+ optimizers = [optimizers]
+ elif optimizers is None:
+ optimizers = []
+ elif isinstance(optimizers, list):
+ optimizers_was_list = True
+ check_optimizers(optimizers)
+ else:
+ check_optimizers([optimizers])
+ raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")
+
+ if isinstance(models, torch.nn.Module):
+ models_was_list = False
+ models = [models]
+ elif isinstance(models, list):
+ models_was_list = True
+ else:
+ raise TypeError("models must be either a single model or a list of models.")
+
+ check_models(models)
+
+ if not _amp_state.allow_incoming_model_not_fp32:
+ check_params_fp32(models)
+
+ # In the future, when FP16_Optimizer can be deprecated and master weights can
+ # become an attribute, remember to stash master weights before casting the model.
+
+ if properties.cast_model_type:
+ if properties.keep_batchnorm_fp32:
+ for model in models:
+ convert_network(model, properties.cast_model_type)
+ else:
+ for model in models:
+ model.to(properties.cast_model_type)
+
+ input_caster = functools.partial(to_type, properties.cast_model_type)
+ if cast_model_outputs is not None:
+ output_caster = functools.partial(to_type, cast_model_outputs)
+ else:
+ output_caster = functools.partial(to_type, torch.float32)
+
+ for model in models:
+ # Patch the forward method to cast incoming data to the correct type, and
+ # outgoing data to float32, so "the user never needs to call .half()."
+ # I like writing things explicitly more than decorators.
+ def patch_forward(old_fwd):
+ def new_fwd(*args, **kwargs):
+ output = old_fwd(*applier(args, input_caster),
+ **applier(kwargs, input_caster))
+ return applier(output, output_caster)
+ return new_fwd
+
+ model.forward = patch_forward(model.forward)
+
+ # State dict trick to recast any preexisting per-param state tensors
+ for optimizer in optimizers:
+ optimizer.load_state_dict(optimizer.state_dict())
+
+ # patch model.state_dict() to return float32 params
+ for model in models:
+ for module in model.modules():
+ module._register_state_dict_hook(O2StateDictHook(functools.partial(to_type, torch.float32)))
+
+ elif cast_model_outputs is not None:
+ output_caster = functools.partial(to_type, cast_model_outputs)
+
+ for model in models:
+ def patch_forward(old_fwd):
+ def new_fwd(*args, **kwargs):
+ output = old_fwd(*args, **kwargs)
+ return applier(output, output_caster)
+ return new_fwd
+
+ model.forward = patch_forward(model.forward)
+
+ for i, optimizer in enumerate(optimizers):
+ optimizers[i] = _process_optimizer(optimizer, properties)
+
+ _amp_state.loss_scalers = []
+ for _ in range(num_losses):
+ _amp_state.loss_scalers.append(LossScaler(properties.loss_scale,
+ min_loss_scale=_amp_state.min_loss_scale,
+ max_loss_scale=_amp_state.max_loss_scale))
+
+ if properties.patch_torch_functions:
+ # handle is unused here. It's accessible later through a global value anyway.
+ handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
+ for optimizer in optimizers:
+ # Disable Amp casting for the optimizer step, because it should only be
+ # applied to FP32 master params anyway.
+ def patch_step(old_step):
+ def new_step(self, *args, **kwargs):
+ with disable_casts():
+ output = old_step(*args, **kwargs)
+ return output
+ return new_step
+
+ optimizer.step = MethodType(patch_step(optimizer.step), optimizer)
+
+ if optimizers_was_list:
+ if models_was_list:
+ return models, optimizers
+ else:
+ return models[0], optimizers
+ else:
+ if models_was_list:
+ if len(optimizers) == 0:
+ return models
+ else:
+ return models, optimizers[0]
+ else:
+ if len(optimizers) == 0:
+ return models[0]
+ else:
+ return models[0], optimizers[0]
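As the return logic at the end of `_initialize` shows, whatever shape you pass in (single objects or lists) is what you get back, and `num_losses` allocates one loss scaler per loss. A sketch with two models and two optimizers; all names below are placeholders:

```python
from apex import amp

# Lists in, lists out; num_losses=2 creates two independent LossScalers.
[gen, disc], [opt_g, opt_d] = amp.initialize(
    [gen, disc], [opt_g, opt_d], opt_level="O1", num_losses=2)

with amp.scale_loss(loss_g, opt_g, loss_id=0) as scaled:
    scaled.backward()
with amp.scale_loss(loss_d, opt_d, loss_id=1) as scaled:
    scaled.backward()
```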
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_process_optimizer.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_process_optimizer.py
new file mode 100644
index 0000000000..471289bba6
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/_process_optimizer.py
@@ -0,0 +1,489 @@
+import types
+from ..fp16_utils import master_params_to_model_params
+from ..multi_tensor_apply import multi_tensor_applier
+from ._amp_state import maybe_print
+import torch
+from ..optimizers import FusedSGD
+
+
+class AmpOptimizerState(object):
+ def __init__(self):
+ pass
+
+
+def _master_params_to_model_params(self):
+ stash = self._amp_stash
+ if multi_tensor_applier.available:
+ if len(stash.all_fp16_params) > 0:
+ multi_tensor_applier(
+ stash.multi_tensor_scale,
+ stash.dummy_overflow_buf,
+ [stash.all_fp32_from_fp16_params, stash.all_fp16_params],
+ 1.0)
+ else:
+ for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
+ master_params_to_model_params(fp16_group, fp32_from_fp16_group)
+
+
+def lazy_init_with_master_weights(self):
+ stash = self._amp_stash
+ stash.fp16_groups = []
+ stash.fp32_from_fp16_groups = []
+ stash.fp32_from_fp32_groups = []
+ for i, param_group in enumerate(self.param_groups):
+ # maybe_print("FP16_Optimizer processing param group {}:".format(i))
+ fp16_params_this_group = []
+ fp32_params_this_group = []
+ fp32_from_fp16_params_this_group = []
+ for i, param in enumerate(param_group['params']):
+ if param.requires_grad:
+ if param.type() == 'torch.cuda.HalfTensor':
+ # maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
+ # .format(param.size()))
+ fp16_params_this_group.append(param)
+ master_param = param.detach().clone().float()
+ master_param.requires_grad = True
+ param_group['params'][i] = master_param
+ fp32_from_fp16_params_this_group.append(master_param)
+ # Reset existing state dict key to the new master param.
+ # We still need to recast per-param state tensors, if any, to FP32.
+ if param in self.state:
+ self.state[master_param] = self.state.pop(param)
+ elif param.type() == 'torch.cuda.FloatTensor':
+ # maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
+ # .format(param.size()))
+ fp32_params_this_group.append(param)
+ param_group['params'][i] = param
+ else:
+ raise TypeError("Optimizer's parameters must be either "
+ "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+ "Received {}".format(param.type()))
+
+ stash.fp16_groups.append(fp16_params_this_group)
+ stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
+ stash.fp32_from_fp32_groups.append(fp32_params_this_group)
+
+ stash.all_fp16_params = []
+ for group in stash.fp16_groups:
+ stash.all_fp16_params += group
+
+ stash.all_fp32_from_fp16_params = []
+ for group in stash.fp32_from_fp16_groups:
+ stash.all_fp32_from_fp16_params += group
+
+ stash.all_fp32_from_fp32_params = []
+ for group in stash.fp32_from_fp32_groups:
+ stash.all_fp32_from_fp32_params += group
+
+ # all_fp16_grad_stash is only needed for fused optimizers.
+ stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
+ # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
+ stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
+
+ for param in stash.all_fp32_from_fp16_params:
+ param.grad = None
+
+ for param in stash.all_fp32_from_fp32_params:
+ param.grad = None
+
+ # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
+ self.load_state_dict(self.state_dict())
+
+
+def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None):
+ grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0
+
+ # not much to do if scale == 1.0 and static scaling
+ if scaler.loss_scale() == 1.0 and not scaler.dynamic:
+ # Clear the stash.
+ for i in range(len(stashed_grads)):
+ stashed_grads[i] = None
+ return
+
+ if scale_override is not None:
+ grads_have_scale, stashed_have_scale, out_scale = scale_override
+
+ # This is a lot of python overhead...
+ grads_needing_unscale = []
+ grads_needing_unscale_with_stash = []
+ stashed = []
+ for param, stashed_grad in zip(params, stashed_grads):
+ if param.grad is None and stashed_grad is not None:
+ param.grad = stashed_grad
+ elif param.grad is not None and stashed_grad is None:
+ grads_needing_unscale.append(param.grad)
+ elif param.grad is not None and stashed_grad is not None:
+ grads_needing_unscale_with_stash.append(param.grad)
+ stashed.append(stashed_grad)
+ else: # param.grad is None and stashed_grad is None
+ continue
+
+ # unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale.
+ if len(grads_needing_unscale) > 0:
+ scaler.unscale(
+ grads_needing_unscale,
+ grads_needing_unscale,
+ None, # unused_scale, currently present to avoid API breakage elsewhere
+ models_are_masters=True,
+ scale_override=grads_have_scale/out_scale)
+
+ if len(grads_needing_unscale_with_stash) > 0:
+ scaler.unscale_with_stashed(
+ grads_needing_unscale_with_stash,
+ stashed,
+ grads_needing_unscale_with_stash,
+ scale_override=(grads_have_scale, stashed_have_scale, out_scale))
+
+ # Clear the stash.
+ for i in range(len(stashed_grads)):
+ stashed_grads[i] = None
+
+
+def prepare_backward_with_master_weights(self):
+ stash = self._amp_stash
+
+ self._amp_lazy_init()
+
+ for i, param in enumerate(stash.all_fp16_params):
+ # Set up to leverage grad copy elision.
+ # This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused.
+ param.grad = None
+
+ # for i, param in enumerate(stash.all_fp32_from_fp16_params):
+ # stash.all_fp32_from_fp16_grad_stash[i] = param.grad
+
+ for i, param in enumerate(stash.all_fp32_from_fp32_params):
+ stash.all_fp32_from_fp32_grad_stash[i] = param.grad
+ # Set up to leverage grad copy elision:
+ param.grad = None
+
+
+def post_backward_with_master_weights(self, scaler):
+ stash = self._amp_stash
+
+ self._amp_lazy_init()
+
+ # This is a lot of python overhead...
+ fp16_grads_needing_unscale = []
+ new_fp32_grads = []
+ fp16_grads_needing_unscale_with_stash = []
+ preexisting_fp32_grads = []
+ for fp16_param, fp32_param in zip(stash.all_fp16_params,
+ stash.all_fp32_from_fp16_params):
+ if fp16_param.grad is None and fp32_param.grad is not None:
+ continue
+ elif fp16_param.grad is not None and fp32_param.grad is None:
+ fp32_param.grad = torch.empty_like(fp32_param)
+ fp16_grads_needing_unscale.append(fp16_param.grad)
+ new_fp32_grads.append(fp32_param.grad)
+ elif fp16_param.grad is not None and fp32_param.grad is not None:
+ fp16_grads_needing_unscale_with_stash.append(fp16_param.grad)
+ preexisting_fp32_grads.append(fp32_param.grad)
+ else: # fp16_param.grad is None and fp32_param.grad is None:
+ continue
+
+ if len(fp16_grads_needing_unscale) > 0:
+ scaler.unscale(
+ fp16_grads_needing_unscale,
+ new_fp32_grads,
+ scaler.loss_scale(),
+ models_are_masters=False)
+
+ if len(fp16_grads_needing_unscale_with_stash) > 0:
+ scaler.unscale_with_stashed(
+ fp16_grads_needing_unscale_with_stash,
+ preexisting_fp32_grads,
+ preexisting_fp32_grads)
+
+ # fp32 params can be treated as they would be in the "no_master_weights" case.
+ post_backward_models_are_masters(
+ scaler,
+ stash.all_fp32_from_fp32_params,
+ stash.all_fp32_from_fp32_grad_stash)
+
+
+def lazy_init_no_master_weights(self):
+ stash = self._amp_stash
+ stash.all_fp16_params = []
+ stash.all_fp32_params = []
+ for i, param_group in enumerate(self.param_groups):
+ for i, param in enumerate(param_group['params']):
+ if param.type() == 'torch.cuda.HalfTensor':
+ stash.all_fp16_params.append(param)
+ elif param.type() == 'torch.cuda.FloatTensor':
+ stash.all_fp32_params.append(param)
+ else:
+ raise TypeError("Optimizer's parameters must be either "
+ "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+ "Received {}".format(param.type()))
+
+ stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
+ stash.all_fp32_grad_stash = [None for _ in stash.all_fp32_params]
+
+
+def prepare_backward_no_master_weights(self):
+ stash = self._amp_stash
+
+ self._amp_lazy_init()
+
+ for i, param in enumerate(stash.all_fp16_params):
+ stash.all_fp16_grad_stash[i] = param.grad
+ # Set up to leverage grad copy elision:
+ param.grad = None
+
+ for i, param in enumerate(stash.all_fp32_params):
+ stash.all_fp32_grad_stash[i] = param.grad
+ # Set up to leverage grad copy elision:
+ param.grad = None
+
+
+def post_backward_no_master_weights(self, scaler):
+ stash = self._amp_stash
+
+ self._amp_lazy_init()
+
+ split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
+ (stash.all_fp32_params, stash.all_fp32_grad_stash))
+
+ for params, stashed_grads in split_types:
+ post_backward_models_are_masters(scaler, params, stashed_grads)
+
+
+#####################################################################################
+# FusedSGD versions
+#####################################################################################
+
+# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
+# outside the kernel, so we must accumulate directly into the model grads.
+def prepare_backward_with_master_weights_FusedSGD(self):
+ if self.materialize_master_grads:
+ prepare_backward_with_master_weights(self)
+ else:
+ stash = self._amp_stash
+
+ self._amp_lazy_init()
+
+ for i, param in enumerate(stash.all_fp16_params):
+ stash.all_fp16_grad_stash[i] = param.grad
+ # Set up to leverage grad copy elision:
+ param.grad = None
+
+ for i, param in enumerate(stash.all_fp32_from_fp32_params):
+ stash.all_fp32_from_fp32_grad_stash[i] = param.grad
+ # Set up to leverage grad copy elision:
+ param.grad = None
+
+
+def post_backward_with_master_weights_FusedSGD(self, scaler):
+ if self.materialize_master_grads:
+ post_backward_with_master_weights(self, scaler)
+ else:
+ stash = self._amp_stash
+
+ self._amp_lazy_init()
+
+ grads_have_scale = scaler.loss_scale()
+ stashed_have_scale = self.most_recent_scale
+ out_scale = grads_have_scale
+ if self.scale_set_by_backward:
+ out_scale = min(grads_have_scale, self.most_recent_scale)
+
+ split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
+ (stash.all_fp32_from_fp32_params, stash.all_fp32_from_fp32_grad_stash))
+
+
+ # unscale_with_stashed() implements grads*1/scale + stashed_grads*1.
+ # stashed_grads are scaled by self.most_recent_scale.
+ for params, stashed_grads in split_types:
+ post_backward_models_are_masters(scaler, params, stashed_grads,
+ (grads_have_scale, stashed_have_scale, out_scale))
+
+ self.most_recent_scale = out_scale
+ self.scale_set_by_backward = True
+
+
+def prepare_backward_no_master_weights_FusedSGD(self):
+ prepare_backward_no_master_weights(self)
+
+
+def post_backward_no_master_weights_FusedSGD(self, scaler):
+ post_backward_no_master_weights(self, scaler)
+
+
+def _amp_lazy_init(self):
+ stash = self._amp_stash
+
+ if not stash.lazy_init_called:
+ self._lazy_init_maybe_master_weights()
+ stash.lazy_init_called = True
+
+
+def _process_optimizer(optimizer, properties):
+ if hasattr(optimizer, "_amp_stash"):
+ raise RuntimeError("A given optimizer should only be passed through amp.initialize once.")
+ else:
+ optimizer._amp_stash = AmpOptimizerState()
+
+ optimizer._amp_stash.lazy_init_called = False
+ optimizer._amp_stash.already_patched = False
+ optimizer._amp_stash.params_have_scaled_gradients = False
+
+ for name in ("_lazy_init_maybe_master_weights",
+ "_master_params_to_model_params",
+ "_prepare_amp_backward",
+ "_post_amp_backward",
+ "_amp_lazy_init"):
+ if hasattr(optimizer, name):
+ raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
+
+ # TODO: Centralize exposure and import error checking for the C backend.
+ if multi_tensor_applier.available:
+ import amp_C
+ optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
+ optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
+ optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);
+
+ if properties.master_weights:
+ optimizer._lazy_init_maybe_master_weights = types.MethodType(
+ lazy_init_with_master_weights, optimizer)
+
+ optimizer._master_params_to_model_params = types.MethodType(
+ _master_params_to_model_params, optimizer)
+
+ old_step = optimizer.step
+ def new_step(self, closure=None):
+ if closure is not None:
+ raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
+ retval = old_step()
+ if not isinstance(self, FusedSGD):
+ self._master_params_to_model_params()
+ # Clear the master grads that wouldn't be zeroed by model.zero_grad()
+ for param in self._amp_stash.all_fp32_from_fp16_params:
+ param.grad = None
+ return retval
+ optimizer.step = types.MethodType(new_step, optimizer)
+
+ old_zero_grad = optimizer.zero_grad
+ def new_zero_grad(self):
+ stash = self._amp_stash
+ self._amp_lazy_init()
+ # Zero the model grads.
+ for param in stash.all_fp16_params:
+ if param.grad is not None:
+ param.grad.detach_()
+ param.grad.zero_()
+ for param in stash.all_fp32_from_fp32_params:
+ if param.grad is not None:
+ param.grad.detach_()
+ param.grad.zero_()
+ # Clear the master grads that are independent of model grads
+ for param in self._amp_stash.all_fp32_from_fp16_params:
+ param.grad = None
+ optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
+
+ if isinstance(optimizer, FusedSGD):
+ optimizer._prepare_amp_backward = types.MethodType(
+ prepare_backward_with_master_weights_FusedSGD, optimizer)
+ optimizer._post_amp_backward = types.MethodType(
+ post_backward_with_master_weights_FusedSGD, optimizer)
+ else:
+ optimizer._prepare_amp_backward = types.MethodType(
+ prepare_backward_with_master_weights, optimizer)
+ optimizer._post_amp_backward = types.MethodType(
+ post_backward_with_master_weights, optimizer)
+ else:
+ optimizer._lazy_init_maybe_master_weights = types.MethodType(
+ lazy_init_no_master_weights, optimizer)
+
+ if isinstance(optimizer, FusedSGD):
+ optimizer._prepare_amp_backward = types.MethodType(
+ prepare_backward_no_master_weights_FusedSGD, optimizer)
+ optimizer._post_amp_backward = types.MethodType(
+ post_backward_no_master_weights_FusedSGD, optimizer)
+ else:
+ optimizer._prepare_amp_backward = types.MethodType(
+ prepare_backward_no_master_weights, optimizer)
+ optimizer._post_amp_backward = types.MethodType(
+ post_backward_no_master_weights, optimizer)
+
+ optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)
+
+ old_add_param_group = optimizer.add_param_group
+
+ def new_add_param_group(self, new_group):
+ stash = self._amp_stash
+
+ if not stash.lazy_init_called:
+ self._lazy_init_maybe_master_weights()
+ stash.lazy_init_called = True
+
+ assert isinstance(new_group, dict), "param group must be a dict"
+
+ new_params = new_group['params']
+ if isinstance(new_params, torch.Tensor):
+ new_group['params'] = [new_params]
+ elif isinstance(new_params, set):
+ raise TypeError('optimizer parameters need to be organized in ordered collections, but '
+ 'the ordering of tensors in sets will change between runs. Please use a list instead.')
+ else:
+ new_group['params'] = list(new_params)
+
+ if properties.master_weights:
+ # Mutate new_group in-place to use FP32 master params
+ fp16_params_this_group = []
+ fp32_params_this_group = []
+ fp32_from_fp16_params_this_group = []
+ for i, param in enumerate(new_group['params']):
+ if param.requires_grad:
+ if param.type() == 'torch.cuda.HalfTensor':
+ fp16_params_this_group.append(param)
+ master_param = param.detach().clone().float()
+ master_param.requires_grad = True
+ new_group['params'][i] = master_param
+ fp32_from_fp16_params_this_group.append(master_param)
+ elif param.type() == 'torch.cuda.FloatTensor':
+ fp32_params_this_group.append(param)
+ new_group['params'][i] = param
+ else:
+ raise TypeError("Optimizer's parameters must be either "
+ "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+ "Received {}".format(param.type()))
+
+ stash.fp16_groups.append(fp16_params_this_group)
+ stash.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
+ stash.fp32_from_fp32_groups.append(fp32_params_this_group)
+
+ stash.all_fp16_params += fp16_params_this_group
+ stash.all_fp32_from_fp16_params += fp32_from_fp16_params_this_group
+ stash.all_fp32_from_fp32_params += fp32_params_this_group
+
+ # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
+ stash.all_fp32_from_fp32_grad_stash += [None for _ in fp32_params_this_group]
+
+ # It should be ok to let params be added with existing .grad attributes.
+ # for param in fp16_params_this_group:
+ # param.grad = None
+
+ # for param in fp32_from_fp16_params_this_group:
+ # param.grad = None
+
+ # for param in stash.fp32_params_this_group:
+ # param.grad = None
+ else:
+ for param in new_group['params']:
+ if param.type() == 'torch.cuda.HalfTensor':
+ stash.all_fp16_params.append(param)
+ stash.all_fp16_grad_stash.append(None)
+ elif param.type() == 'torch.cuda.FloatTensor':
+ stash.all_fp32_params.append(param)
+ stash.all_fp32_grad_stash.append(None)
+ else:
+ raise TypeError("Optimizer's parameters must be either "
+ "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
+ "Received {}".format(param.type()))
+
+ old_add_param_group(new_group)
+
+ optimizer.add_param_group = types.MethodType(new_add_param_group, optimizer)
+
+ return optimizer
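Because `add_param_group` is patched above, parameter groups added after `amp.initialize` are treated the same way as the original ones: with master weights enabled, any new FP16 params get FP32 master copies and are tracked by the stash. A short sketch; `model`, `optimizer`, `new_head`, and the learning rate are placeholders:

```python
from apex import amp

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

# Later, e.g. after unfreezing an extra module, the patched add_param_group
# builds FP32 master copies for the new FP16 parameters automatically.
optimizer.add_param_group({"params": new_head.parameters(), "lr": 1e-4})
```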
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/amp.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/amp.py
new file mode 100644
index 0000000000..1eed72d07b
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/amp.py
@@ -0,0 +1,177 @@
+from . import compat, rnn_compat, utils, wrap
+from .handle import AmpHandle, NoOpHandle
+from .lists import functional_overrides, torch_overrides, tensor_overrides
+from ._amp_state import _amp_state
+from .frontend import *
+
+import functools
+import itertools
+
+import torch
+
+
+_DECORATOR_HANDLE = None
+_USER_CAST_REGISTRY = set()
+_USER_PROMOTE_REGISTRY = set()
+
+
+def _decorator_helper(orig_fn, cast_fn, wrap_fn):
+ def wrapper(*args, **kwargs):
+ handle = _DECORATOR_HANDLE
+ if handle is None or not handle.is_active():
+ return orig_fn(*args, **kwargs)
+ inner_cast_fn = utils.verbosify(cast_fn, orig_fn.__name__,
+ handle.verbose)
+ return wrap_fn(orig_fn, inner_cast_fn, handle)(*args, **kwargs)
+ return wrapper
+
+
+# Decorator form
+def half_function(fn):
+ wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=True)
+ return _decorator_helper(fn, utils.maybe_half, wrap_fn)
+
+
+def float_function(fn):
+ wrap_fn = functools.partial(wrap.make_cast_wrapper, try_caching=False)
+ return _decorator_helper(fn, utils.maybe_float, wrap_fn)
+
+
+def promote_function(fn):
+ wrap_fn = functools.partial(wrap.make_promote_wrapper)
+ return _decorator_helper(fn, utils.maybe_float, wrap_fn)
+
+
+# Registry form
+def register_half_function(module, name):
+ if not hasattr(module, name):
+ raise ValueError('No function named {} in module {}.'.format(
+ name, module))
+ _USER_CAST_REGISTRY.add((module, name, utils.maybe_half))
+
+
+def register_float_function(module, name):
+ if not hasattr(module, name):
+ raise ValueError('No function named {} in module {}.'.format(
+ name, module))
+ _USER_CAST_REGISTRY.add((module, name, utils.maybe_float))
+
+
+def register_promote_function(module, name):
+ if not hasattr(module, name):
+ raise ValueError('No function named {} in module {}.'.format(
+ name, module))
+ _USER_PROMOTE_REGISTRY.add((module, name))
+
+
+# Top-level function to insert _all_ the hooks.
+def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False):
+ global _DECORATOR_HANDLE
+
+ if not enabled:
+ handle = NoOpHandle()
+ _DECORATOR_HANDLE = handle
+ return handle
+
+ handle = AmpHandle(loss_scale, enable_caching, verbose)
+
+ # 0) Force-{fp16, fp32} for user-annotated functions
+ for mod, fn, cast_fn in _USER_CAST_REGISTRY:
+ try_caching = (cast_fn == utils.maybe_half)
+ wrap.cached_cast(mod, fn, cast_fn, handle,
+ try_caching, verbose)
+ _USER_CAST_REGISTRY.clear()
+
+ # 0.5) Force-promote for user-annotated functions
+ for mod, fn in _USER_PROMOTE_REGISTRY:
+ wrap.promote(mod, fn, handle, verbose)
+ _USER_PROMOTE_REGISTRY.clear()
+
+ # 1) Force-{fp16, fp32} on white- / black-list functions
+ override_modules = [functional_overrides,
+ torch_overrides,
+ tensor_overrides]
+ cast_table = [('FP16_FUNCS', utils.maybe_half),
+ ('FP32_FUNCS', utils.maybe_float)]
+ for module, (list_name, cast_fn) in itertools.product(override_modules,
+ cast_table):
+ for fn in getattr(module, list_name):
+ try_caching = (cast_fn == utils.maybe_half)
+ wrap.cached_cast(module.MODULE, fn, cast_fn, handle,
+ try_caching, verbose)
+
+ # 1.5) Pre-0.4, put the blacklist methods on HalfTensor and whitelist
+ # methods on FloatTensor, since they're distinct types.
+ if compat.tensor_is_float_tensor():
+ for fn in tensor_overrides.FP16_FUNCS:
+ wrap.cached_cast(torch.cuda.FloatTensor, fn, utils.maybe_half,
+ handle, try_caching=True, verbose=verbose)
+ for fn in tensor_overrides.FP32_FUNCS:
+ wrap.cached_cast(torch.cuda.HalfTensor, fn, utils.maybe_float,
+ handle, try_caching=False, verbose=verbose)
+
+ # 2) Enable type-promotion on multi-arg functions and methods.
+ # NB: special handling for sequence fns (e.g. `torch.cat`).
+ promote_modules = [torch_overrides, tensor_overrides]
+ promote_table = [('CASTS', wrap.promote),
+ ('SEQUENCE_CASTS', wrap.sequence_promote)]
+ for promote_mod, (list_name, promote_fn) in itertools.product(promote_modules,
+ promote_table):
+ for fn in getattr(promote_mod, list_name):
+ promote_fn(promote_mod.MODULE, fn, handle, verbose)
+
+ # 2.5) Pre-0.4, add blacklist methods directly to HalfTensor and FloatTensor types
+ if compat.tensor_is_float_tensor():
+ for cls, (list_name, promote_fn) in itertools.product([torch.cuda.FloatTensor,
+ torch.cuda.HalfTensor],
+ promote_table):
+ for fn in getattr(tensor_overrides, list_name):
+ promote_fn(cls, fn, handle, verbose)
+
+ # 3) For any in-place version of a blacklist function, error if any input is fp16.
+ # NB: this is overly conservative.
+ for fn in utils.as_inplace(torch_overrides.FP32_FUNCS):
+ wrap.err_if_any_half(torch_overrides.MODULE, fn, handle)
+
+ # 3.5) For any in-place blacklist method, error if called on fp16 tensor
+ for fn in utils.as_inplace(tensor_overrides.FP32_FUNCS):
+ wrap.err_if_arg0_half(tensor_overrides.MODULE, fn, handle, verbose)
+ if compat.tensor_is_float_tensor():
+ wrap.err_if_arg0_half(torch.cuda.HalfTensor, fn, handle, verbose)
+
+ # 4) For other in-place methods, match the type of self tensor
+ for fn in utils.as_inplace(itertools.chain(
+ tensor_overrides.FP16_FUNCS,
+ tensor_overrides.CASTS)):
+ wrap.promote_match_arg0(tensor_overrides.MODULE, fn, handle, verbose)
+ if compat.tensor_is_float_tensor():
+ wrap.promote_match_arg0(torch.cuda.HalfTensor, fn, handle, verbose)
+ wrap.promote_match_arg0(torch.cuda.FloatTensor, fn, handle, verbose)
+
+ # 5) RNNs + RNN cells are whitelisted specially
+ if rnn_compat.has_old_rnns():
+ wrap.rnn_cast(torch.nn.backends.thnn.backend, 'RNN', handle, verbose)
+ if not rnn_compat.has_old_rnns():
+ # Patch in our own indirection of `_VF` in modules/rnn s.t. it is mutable.
+ torch.nn.modules.rnn._VF = rnn_compat.VariableFunctionsShim()
+ # Wrap all the rnns
+ for x in rnn_compat.RNN_NAMES:
+ wrap.new_rnn_cast(x.upper(), handle, verbose)
+
+ # Wrap all the RNN cells
+ rnn_compat.whitelist_rnn_cells(handle, verbose)
+
+ # 6) Place error+print message on banned functions.
+ # Or, if allow_banned, then cast to FP32.
+ for fn, err_msg in functional_overrides.BANNED_FUNCS:
+ if allow_banned:
+ wrap.cached_cast(functional_overrides.MODULE, fn, utils.maybe_float,
+ handle, try_caching=True, verbose=verbose)
+ else:
+ wrap.err_if_any_half(functional_overrides.MODULE, fn, handle, err_msg)
+
+ _DECORATOR_HANDLE = handle
+
+ _amp_state.handle = handle
+
+ return handle
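`init()` above is the older, handle-based entry point (the `amp.initialize` frontend appears elsewhere in this patch). A sketch of how a training loop used the returned handle, assuming the usual `model`/`optimizer`/`criterion`/`loader` names exist:

```python
from apex import amp

amp_handle = amp.init(enabled=True, loss_scale="dynamic")

for x, y in loader:
    loss = criterion(model(x), y)
    with amp_handle.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```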
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/compat.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/compat.py
new file mode 100644
index 0000000000..22276bd47d
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/compat.py
@@ -0,0 +1,46 @@
+import torch
+
+# True for post-0.4, when Variables/Tensors merged.
+def variable_is_tensor():
+ v = torch.autograd.Variable()
+ return isinstance(v, torch.Tensor)
+
+def tensor_is_variable():
+ x = torch.Tensor()
+ return type(x) == torch.autograd.Variable
+
+# False for post-0.4
+def tensor_is_float_tensor():
+ x = torch.Tensor()
+ return type(x) == torch.FloatTensor
+
+# Akin to `torch.is_tensor`, but returns True for Variable
+# objects in pre-0.4.
+def is_tensor_like(x):
+ return torch.is_tensor(x) or isinstance(x, torch.autograd.Variable)
+
+# Wraps `torch.is_floating_point` if present, otherwise checks
+# the suffix of `x.type()`.
+def is_floating_point(x):
+ if hasattr(torch, 'is_floating_point'):
+ return torch.is_floating_point(x)
+ try:
+ torch_type = x.type()
+ return torch_type.endswith('FloatTensor') or \
+ torch_type.endswith('HalfTensor') or \
+ torch_type.endswith('DoubleTensor')
+ except AttributeError:
+ return False
+
+def scalar_python_val(x):
+ if hasattr(x, 'item'):
+ return x.item()
+ else:
+ if isinstance(x, torch.autograd.Variable):
+ return x.data[0]
+ else:
+ return x[0]
+
+# Accounts for the possibility that some ops may be removed from a namespace.
+def filter_attrs(module, attrs):
+ return list(attrname for attrname in attrs if hasattr(module, attrname))
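These probes are how amp detects whether it is running on a pre-0.4 PyTorch (separate Variable/Tensor types) or a merged-Tensor release. On any modern PyTorch the sketch below should report the post-0.4 behavior, assuming the package imports in your environment:

```python
from apex.amp import compat

print(compat.variable_is_tensor())       # True on post-0.4 PyTorch
print(compat.tensor_is_float_tensor())   # False on post-0.4 PyTorch
```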
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/frontend.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/frontend.py
new file mode 100644
index 0000000000..da0f05dc99
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/frontend.py
@@ -0,0 +1,442 @@
+import torch
+from ._initialize import _initialize
+from ._amp_state import _amp_state, warn_or_err, maybe_print
+from collections import OrderedDict
+
+
+class Properties(object):
+ """
+ This class has two purposes: to establish a set of default properties,
+ and to route setting of these attributes through __setattr__ so that (in theory)
+ they can be checked for consistency with other existing args.
+ """
+ def __init__(self):
+ self.options = {
+ "enabled" : False,
+ "opt_level" : None,
+ "cast_model_type" : None,
+ "patch_torch_functions" : False,
+ "keep_batchnorm_fp32" : None,
+ "master_weights" : None,
+ "loss_scale" : 1.0,
+ # Reserved for future functionality
+ # "fused_optimizer" : False,
+ # "enable_ddp_interop" : False,
+ }
+
+ """
+ This function allows updating several options at a time without routing through
+ __setattr__ checks, to avoid "you can't get there from here" scenarios.
+ Currently not intended to be exposed; users are expected to select an opt_level
+ and apply consistent modifications.
+ """
+ def _update_options_dict(self, new_options):
+ for k, v in new_options:
+ if k in self.options:
+ self.options[k] = v
+ else:
+ raise ValueError("Tried to set unexpected option {}".format(k))
+ """
+ The members of "options" are not direct attributes of self, so access attempts
+ will roll down to __getattr__. This borrows from the logic in torch.nn.Module.
+ """
+ def __getattr__(self, name):
+ if "options" in self.__dict__:
+ options = self.__dict__["options"]
+ if name in options:
+ return options[name]
+ raise AttributeError("'{}' object has no attribute '{}'".format(
+ type(self).__name__, name))
+
+ def __setattr__(self, name, value):
+ if "options" in self.__dict__:
+ if name in self.options:
+ # print("setting {} {}".format(name, value))
+ if name == "cast_model_type":
+ if self.opt_level == "O1" and value is not None:
+ if value is not False:
+ if value is not torch.float32:
+ warn_or_err("O1 inserts casts around Torch functions rather than "
+ "model weights, so with O1, the model weights themselves "
+ "should remain FP32. If you wish to cast the model to a "
+ "different type, use opt_level='O2' or 'O3'. " +
+ "cast_model_type was {}".format(value))
+ self.options[name] = value
+ elif name == "patch_torch_functions":
+ if self.opt_level != "O1" and value:
+ warn_or_err("Currently, patch_torch_functions=True should only be set by "
+ "selecting opt_level='O1'.")
+ self.options[name] = value
+ elif name == "keep_batchnorm_fp32":
+ if self.opt_level == "O1" and value is not None:
+ warn_or_err("With opt_level O1, batchnorm functions are automatically patched "
+ "to run in FP32, so keep_batchnorm_fp32 should be None." +
+ " keep_batchnorm_fp32 was {}".format(value))
+ if value == "False":
+ self.options[name] = False
+ elif value == "True":
+ self.options[name] = True
+ else:
+ assert (value is True or value is False or value is None),\
+ "keep_batchnorm_fp32 must be a boolean, the string 'True' or 'False', "\
+ "or None, found keep_batchnorm_fp32={}".format(value)
+ self.options[name] = value
+ elif name == "master_weights":
+ if self.opt_level == "O1" and value is not None:
+ warn_or_err("It doesn't make sense to use master_weights with O1. "
+ "With O1, your model weights themselves should be FP32.")
+ self.options[name] = value
+ elif name == "loss_scale":
+ if value == "dynamic":
+ self.options[name] = value
+ else:
+ self.options[name] = float(value)
+ else:
+ self.options[name] = value
+ else:
+ super(Properties, self).__setattr__(name, value)
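+
+# Hedged sketch (illustration only, not upstream code): attribute reads/writes
+# on a Properties instance are routed through the `options` dict above, so the
+# defaults can be inspected or overridden the same way `initialize` does
+# further down, e.g.
+#
+#     props = Properties()
+#     props.loss_scale = 128.0   # routed through the __setattr__ checks above
+#     print(props.loss_scale)    # falls through __getattr__ to options["loss_scale"]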
+
+
+""" O0-O3 are convenience wrappers to establish defaults for typically used mixed precision options. """
+
+class O3:
+ brief = "O3: Pure FP16 training."
+ more = "Calls .half() on your model, converting the entire model to FP16.\n"\
+ "A casting operation is also inserted to cast incoming Tensors to FP16,\n"\
+ "so you don't need to change your data pipeline.\n"\
+ "This mode is useful for establishing a performance ceiling.\n"\
+ "It's also possible training may 'just work' in this mode.\n"\
+ "If not, try other optimization levels."
+
+ def __call__(self, properties):
+ properties.enabled = True
+ properties.opt_level = "O3"
+ properties.cast_model_type = torch.float16
+ properties.patch_torch_functions = False
+ properties.keep_batchnorm_fp32 = False
+ properties.master_weights = False
+ properties.loss_scale = 1.0
+ # properties.fused_optimizer = False
+ # properties.enable_ddp_interop = False
+ return properties # modified in place so this isn't really necessary
+
+
+class O2:
+ brief = "O2: FP16 training with FP32 batchnorm and FP32 master weights.\n"
+ more = "Calls .half() on your model, converting the entire model (except for batchnorms)\n"\
+ "to FP16. Batchnorms are retained in FP32 for additional stability.\n"\
+ "The forward pass is patched to cast incoming Tensors to FP16, so you don't need to change\n"\
+ "your data pipeline.\n"\
+ "O2 creates FP32 master weights outside the model and patches any optimizers to update\n"\
+ "these master weights, then copy the master weights into the FP16 model weights.\n"\
+ "Master weights can also improve convergence and stability."
+
+ def __call__(self, properties):
+ properties.enabled = True
+ properties.opt_level = "O2"
+ properties.cast_model_type = torch.float16
+ properties.patch_torch_functions = False
+ properties.keep_batchnorm_fp32 = True
+ properties.master_weights = True
+ properties.loss_scale = "dynamic"
+ # properties.fused_optimizer = False
+ # properties.enable_ddp_interop = False
+ return properties # modified in place so this isn't really necessary
+
+
+class O1:
+ brief = "O1: Insert automatic casts around Pytorch functions and Tensor methods.\n"
+ more = "The type of your model's weights is not altered. However, internally,\n"\
+ "Pytorch functions are patched to cast any Tensor Core-friendly ops to FP16 for speed,\n"\
+ "while operations that might benefit from the additional stability of FP32 are patched\n"\
+ "to cast their inputs to fp32.\n"\
+ "O1 is the safest way to try mixed precision training, and is recommended when\n"\
+ "trying mixed precision training for the first time."
+
+ def __call__(self, properties):
+ properties.enabled = True
+ properties.opt_level = "O1"
+ properties.cast_model_type = None
+ properties.patch_torch_functions = True
+ properties.keep_batchnorm_fp32 = None
+ properties.master_weights = None
+ properties.loss_scale = "dynamic"
+ # properties.fused_optimizer = False
+ # properties.enable_ddp_interop = False
+ return properties # modified in place so this isn't really necessary
+
+
+class O0:
+ brief = "O0: Pure FP32 training.\n"
+ more = "Your models are checked to make sure parameters are FP32, but otherwise the\n"\
+ "types of weights and internal Pytorch operations are not altered. This mode disables any\n"\
+ "FP16 arithmetic, although other optimizations like DDP interop may still be requested.\n"
+
+ def __call__(self, properties):
+ properties.enabled = True
+ properties.opt_level = "O0"
+ properties.cast_model_type = torch.float32
+ properties.patch_torch_functions = False
+ properties.keep_batchnorm_fp32 = None
+ properties.master_weights = False
+ properties.loss_scale = 1.0
+ # properties.fused_optimizer = False
+ # properties.enable_ddp_interop = False
+ return properties # modified in place so this isn't really necessary
+
+
+opt_levels = {"O3": O3(),
+ "O2": O2(),
+ "O1": O1(),
+ "O0": O0()}
+
+
+# allow user to directly pass Properties struct as well?
+def initialize(
+ models,
+ optimizers=None,
+ enabled=True,
+ opt_level="O1",
+ cast_model_type=None,
+ patch_torch_functions=None,
+ keep_batchnorm_fp32=None,
+ master_weights=None,
+ loss_scale=None,
+ cast_model_outputs=None,
+ num_losses=1,
+ verbosity=1,
+ min_loss_scale=None,
+ max_loss_scale=2.**24
+ ):
+ """
+ Initialize your models, optimizers, and the Torch tensor and functional namespace according to the
+ chosen ``opt_level`` and overridden properties, if any.
+
+ ``amp.initialize`` should be called **after** you have finished
+ constructing your model(s) and
+ optimizer(s), but **before** you send your model through any DistributedDataParallel wrapper.
+ See `Distributed training`_ in the Imagenet example.
+
+ Currently, ``amp.initialize`` should only be called **once**,
+ although it can process an arbitrary number of
+ models and optimizers (see the corresponding `Advanced Amp Usage topic`_).
+ If you think your use case requires ``amp.initialize`` to be called more than once,
+ `let us know`_.
+
+ Any property keyword argument that is not ``None`` will be interpreted as a manual override.
+
+ To prevent having to rewrite anything else in your script, name the returned models/optimizers
+ to replace the passed models/optimizers, as in the code sample below.
+
+ Args:
+ models (torch.nn.Module or list of torch.nn.Modules): Models to modify/cast.
+ optimizers (optional, torch.optim.Optimizer or list of torch.optim.Optimizers): Optimizers to modify/cast.
+ REQUIRED for training, optional for inference.
+ enabled (bool, optional, default=True): If False, renders all Amp calls no-ops, so your script
+ should run as if Amp were not present.
+ opt_level (str, optional, default="O1"): Pure or mixed precision optimization level. Accepted values are
+ "O0", "O1", "O2", and "O3", explained in detail above.
+ cast_model_type (``torch.dtype``, optional, default=None): Optional property override, see
+ above.
+ patch_torch_functions (bool, optional, default=None): Optional property override.
+ keep_batchnorm_fp32 (bool or str, optional, default=None): Optional property override. If
+ passed as a string, must be the string "True" or "False".
+ master_weights (bool, optional, default=None): Optional property override.
+ loss_scale (float or str, optional, default=None): Optional property override. If passed as a string,
+ must be a string representing a number, e.g., "128.0", or the string "dynamic".
+ cast_model_outputs (torch.dtype, optional, default=None): Option to ensure that the outputs
+ of your model(s) are always cast to a particular type regardless of ``opt_level``.
+ num_losses (int, optional, default=1): Option to tell Amp in advance how many losses/backward
+ passes you plan to use. When used in conjunction with the ``loss_id`` argument to
+ ``amp.scale_loss``, enables Amp to use a different loss scale per loss/backward pass,
+ which can improve stability. See "Multiple models/optimizers/losses"
+ under `Advanced Amp Usage`_ for examples. If ``num_losses`` is left at 1, Amp will still
+ support multiple losses/backward passes, but use a single global loss scale
+ for all of them.
+ verbosity (int, default=1): Set to 0 to suppress Amp-related output.
+ min_loss_scale (float, default=None): Sets a floor for the loss scale values that can be chosen by dynamic
+ loss scaling. The default value of None means that no floor is imposed.
+ If dynamic loss scaling is not used, `min_loss_scale` is ignored.
+ max_loss_scale (float, default=2.**24): Sets a ceiling for the loss scale values that can be chosen by
+ dynamic loss scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored.
+
+ Returns:
+ Model(s) and optimizer(s) modified according to the ``opt_level``.
+ If either the ``models`` or ``optimizers`` args were lists, the corresponding return value will
+ also be a list.
+
+ Permissible invocations::
+
+ model, optim = amp.initialize(model, optim,...)
+ model, [optim1, optim2] = amp.initialize(model, [optim1, optim2],...)
+ [model1, model2], optim = amp.initialize([model1, model2], optim,...)
+ [model1, model2], [optim1, optim2] = amp.initialize([model1, model2], [optim1, optim2],...)
+
+ # This is not an exhaustive list of the cross product of options that are possible,
+ # just a set of examples.
+ model, optim = amp.initialize(model, optim, opt_level="O0")
+ model, optim = amp.initialize(model, optim, opt_level="O0", loss_scale="dynamic"|128.0|"128.0")
+
+ model, optim = amp.initialize(model, optim, opt_level="O1") # uses "loss_scale="dynamic" default
+ model, optim = amp.initialize(model, optim, opt_level="O1", loss_scale=128.0|"128.0")
+
+ model, optim = amp.initialize(model, optim, opt_level="O2") # uses "loss_scale="dynamic" default
+ model, optim = amp.initialize(model, optim, opt_level="O2", loss_scale=128.0|"128.0")
+ model, optim = amp.initialize(model, optim, opt_level="O2", keep_batchnorm_fp32=True|False|"True"|"False")
+
+ model, optim = amp.initialize(model, optim, opt_level="O3") # uses loss_scale=1.0 default
+ model, optim = amp.initialize(model, optim, opt_level="O3", loss_scale="dynamic"|128.0|"128.0")
+ model, optim = amp.initialize(model, optim, opt_level="O3", keep_batchnorm_fp32=True|False|"True"|"False")
+
+ The `Imagenet example`_ demonstrates live use of various opt_levels and overrides.
+
+ .. _`Distributed training`:
+ https://github.com/NVIDIA/apex/tree/master/examples/imagenet#distributed-training
+
+ .. _`Imagenet example`:
+ https://github.com/NVIDIA/apex/tree/master/examples/imagenet
+
+ .. _`Advanced Amp Usage`:
+ https://nvidia.github.io/apex/advanced.html
+
+ .. _`Advanced Amp Usage topic`:
+ https://nvidia.github.io/apex/advanced.html#multiple-models-optimizers-losses
+
+ .. _`let us know`:
+ https://github.com/NVIDIA/apex/issues
+ """
+ _amp_state.opt_properties = Properties()
+ _amp_state.verbosity = verbosity
+
+ if not enabled:
+ if optimizers is None:
+ return models
+ else:
+ return models, optimizers
+
+ if not torch.backends.cudnn.enabled:
+ raise RuntimeError(
+ "Amp requires torch.backends.cudnn.enabled = True")
+
+ if opt_level not in opt_levels:
+ raise RuntimeError(
+ "Unexpected optimization level {}. ".format(opt_level) +
+ "Options are 'O0', 'O1', 'O2', 'O3'. Note that in `O0`, `O1`, etc., the prefix O is the letter O, " +
+ "not the number zero.")
+ else:
+ _amp_state.opt_properties = opt_levels[opt_level](_amp_state.opt_properties)
+ maybe_print("Selected optimization level {}".format(opt_levels[opt_level].brief), True)
+ maybe_print("Defaults for this optimization level are:", True)
+ for k, v in _amp_state.opt_properties.options.items():
+ maybe_print("{:22} : {}".format(k, v), True)
+
+ _amp_state.min_loss_scale = min_loss_scale
+ _amp_state.max_loss_scale = max_loss_scale
+
+ maybe_print("Processing user overrides (additional kwargs that are not None)...", True)
+ # I chose to have the keyword arguments listed directly in the argument list,
+ # instead of **kwargs, so I can't use kwargs.items() here.
+ if enabled is not None:
+ _amp_state.opt_properties.enabled = enabled
+ if opt_level is not None:
+ _amp_state.opt_properties.opt_level = opt_level
+ if cast_model_type is not None:
+ _amp_state.opt_properties.cast_model_type = cast_model_type
+ if patch_torch_functions is not None:
+ _amp_state.opt_properties.patch_torch_functions = patch_torch_functions
+ if keep_batchnorm_fp32 is not None:
+ _amp_state.opt_properties.keep_batchnorm_fp32 = keep_batchnorm_fp32
+ if master_weights is not None:
+ _amp_state.opt_properties.master_weights = master_weights
+ if loss_scale is not None:
+ _amp_state.opt_properties.loss_scale = loss_scale
+
+ maybe_print("After processing overrides, optimization options are:", True)
+ for k, v in _amp_state.opt_properties.options.items():
+ maybe_print("{:22} : {}".format(k, v), True)
+
+ return _initialize(models, optimizers, _amp_state.opt_properties, num_losses, cast_model_outputs)
+
+
+def state_dict(destination=None):
+ if destination is None:
+ destination = OrderedDict()
+
+ for idx, loss_scaler in enumerate(_amp_state.loss_scalers):
+ destination['loss_scaler%d' % idx] = {
+ 'loss_scale': loss_scaler.loss_scale(),
+ 'unskipped': loss_scaler._unskipped,
+ }
+ return destination
+
+
+def load_state_dict(state_dict):
+ # Check whether state_dict contains the same number of loss_scalers as the current setup
+ if len(state_dict) != len(_amp_state.loss_scalers):
+ print('Warning: state_dict contains {} entries, while {} loss_scalers are used'.format(
+ len(state_dict), len(_amp_state.loss_scalers)))
+
+ state_dict = state_dict.copy()
+
+ nb_loss_scalers = len(_amp_state.loss_scalers)
+ unexpected_keys = []
+ # Track idx manually: with enumerate, unexpected keys would also advance the index
+ idx = 0
+ for key in state_dict:
+ if 'loss_scaler' not in key:
+ unexpected_keys.append(key)
+ else:
+ if idx > (nb_loss_scalers - 1):
+ print('Skipping loss_scaler[{}], since num_losses was set to {}'.format(
+ idx, nb_loss_scalers))
+ break
+ _amp_state.loss_scalers[idx]._loss_scale = state_dict[key]['loss_scale']
+ _amp_state.loss_scalers[idx]._unskipped = state_dict[key]['unskipped']
+ idx += 1
+
+ if len(unexpected_keys) > 0:
+ raise RuntimeError(
+ 'Error(s) in loading state_dict. Unexpected key(s) in state_dict: {}. '.format(
+ ', '.join('"{}"'.format(k) for k in unexpected_keys)))
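+
+# Hedged checkpointing sketch (illustrative only; `model`, `optimizer`, and the
+# file name are assumptions, not part of this file). The two functions above
+# are typically re-exported at the package level as ``amp.state_dict`` /
+# ``amp.load_state_dict``:
+#
+#     checkpoint = {
+#         'model': model.state_dict(),
+#         'optimizer': optimizer.state_dict(),
+#         'amp': amp.state_dict(),
+#     }
+#     torch.save(checkpoint, 'checkpoint.pt')
+#     ...
+#     amp.load_state_dict(torch.load('checkpoint.pt')['amp'])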
+
+
+# TODO: is this necessary/useful?
+# def check_option_consistency(enabled=True,
+# opt_level=None,
+# cast_model_type=None,
+# patch_torch_functions=None,
+# keep_batchnorm_fp32=None,
+# master_weights=None,
+# loss_scale=None,
+# enable_ddp_interop=None,
+# hard_override=False):
+# """
+# Utility function that enables users to quickly check if the option combination they intend
+# to use is permitted. ``check_option_consistency`` does not require models or optimizers
+# to be constructed, and can be called at any point in the script. ``check_option_consistency``
+# is totally self-contained; it does not set any amp global state or affect anything outside
+# of itself.
+# """
+#
+# if not enabled:
+# return
+#
+# if opt_level not in opt_levels:
+# raise RuntimeError("Unexpected optimization level. Options are 'O0', 'O1', 'O2', 'O3'.")
+# else:
+# opt_properties = opt_levels[opt_level](Properties())
+# print("Selected optimization level {}", opt_levels[opt_level].brief)
+# print("Defaults for this optimization level are:")
+# for k, v in opt_properties.options:
+# print("{:22} : {}".format(k, v))
+#
+# print("Processing user overrides (additional kwargs that are not None)...")
+# for k, v in kwargs:
+# if k not in _amp_state.opt_properties.options:
+# raise RuntimeError("Unexpected kwarg {}".format(k))
+# if v is not None:
+# setattr(opt_properties, k, v)
+#
+# print("After processing overrides, optimization options are:")
+# for k, v in opt_properties.options:
+# print("{:22} : {}".format(k, v))
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/handle.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/handle.py
new file mode 100644
index 0000000000..0be567ca48
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/handle.py
@@ -0,0 +1,281 @@
+import contextlib
+import warnings
+import sys
+import torch
+
+from . import utils
+from .opt import OptimWrapper
+from .scaler import LossScaler
+from ._amp_state import _amp_state, master_params, maybe_print
+
+if torch.distributed.is_available():
+ from ..parallel.LARC import LARC
+
+
+# There's no reason to expose the notion of a "handle". Everything can happen through amp.* calls.
+@contextlib.contextmanager
+def scale_loss(loss,
+ optimizers,
+ loss_id=0,
+ model=None,
+ delay_unscale=False,
+ delay_overflow_check=False):
+ """
+ On context manager entrance, creates ``scaled_loss = (loss.float())*current loss scale``.
+ ``scaled_loss`` is yielded so that the user can call ``scaled_loss.backward()``::
+
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
+ scaled_loss.backward()
+
+ On context manager exit (if ``delay_unscale=False``), the gradients are checked for infs/NaNs
+ and unscaled, so that ``optimizer.step()`` can be called.
+
+ .. note::
+ If Amp is using explicit FP32 master params (which is the default for ``opt_level=O2``, and
+ can also be manually enabled by supplying ``master_weights=True`` to ``amp.initialize``)
+ any FP16 gradients are copied to FP32 master gradients before being unscaled.
+ ``optimizer.step()`` will then apply the unscaled master gradients to the master params.
+
+ .. warning::
+ If Amp is using explicit FP32 master params, only the FP32 master gradients will be
+ unscaled. The direct ``.grad`` attributes of any FP16
+ model params will remain scaled after context manager exit.
+ This subtlety affects gradient clipping. See "Gradient clipping" under
+ `Advanced Amp Usage`_ for best practices.
+
+ Args:
+ loss(Tensor): Typically a scalar Tensor. The ``scaled_loss`` that the context
+ manager yields is simply ``loss.float()*loss_scale``, so in principle
+ ``loss`` could have more than one element, as long as you call
+ ``backward()`` on ``scaled_loss`` appropriately within the context manager body.
+ optimizers: All optimizer(s) for which the current backward pass is creating gradients.
+ Must be an optimizer or list of optimizers returned from an earlier call
+ to ``amp.initialize``. For example use with multiple optimizers, see
+ "Multiple models/optimizers/losses" under `Advanced Amp Usage`_.
+ loss_id(int, optional, default=0): When used in conjunction with the ``num_losses`` argument
+ to ``amp.initialize``, enables Amp to use a different loss scale per loss. ``loss_id``
+ must be an integer between 0 and ``num_losses - 1`` that tells Amp which loss is
+ being used for the current backward pass. See "Multiple models/optimizers/losses"
+ under `Advanced Amp Usage`_ for examples. If ``loss_id`` is left unspecified, Amp
+ will use the default global loss scaler for this backward pass.
+ model(torch.nn.Module, optional, default=None): Currently unused, reserved to enable future
+ optimizations.
+ delay_unscale(bool, optional, default=False): ``delay_unscale`` is never necessary, and
+ the default value of ``False`` is strongly recommended.
+ If ``True``, Amp will not unscale the gradients or perform model->master
+ gradient copies on context manager exit.
+ ``delay_unscale=True`` is a minor ninja performance optimization and can result
+ in weird gotchas (especially with multiple models/optimizers/losses),
+ so only use it if you know what you're doing.
+ "Gradient accumulation across iterations" under `Advanced Amp Usage`_
+ illustrates a situation where this CAN (but does not need to) be used.
+
+ .. warning::
+ If ``delay_unscale`` is ``True`` for a given backward pass, ``optimizer.step()`` cannot be
+ called yet after context manager exit, and must wait for another, later backward context
+ manager invocation with ``delay_unscale`` left to False.
+
+ .. _`Advanced Amp Usage`:
+ https://nvidia.github.io/apex/advanced.html
+ """
+ if not hasattr(_amp_state, "opt_properties"):
+ raise RuntimeError("Invoked 'with amp.scale_loss`, but internal Amp state has not been initialized. "
+ "model, optimizer = amp.initialize(model, optimizer, opt_level=...) must be called "
+ "before `with amp.scale_loss`.")
+
+ if not _amp_state.opt_properties.enabled:
+ yield loss
+ return
+
+ if isinstance(optimizers, torch.optim.Optimizer) or ('LARC' in globals() and isinstance(optimizers, LARC)):
+ optimizers = [optimizers]
+
+ loss_scaler = _amp_state.loss_scalers[loss_id]
+ loss_scale = loss_scaler.loss_scale()
+
+ if ((not _amp_state.opt_properties.master_weights)
+ and (not loss_scaler.dynamic)
+ and loss_scale == 1.0):
+ yield loss.float()
+ # Needing to drop the cache here as well is an ugly gotcha.
+ # But for now I think it's necessary to short-circuit.
+ # Probably ok to skip this if not delay_unscale
+ if _amp_state.opt_properties.patch_torch_functions:
+ _amp_state.handle._clear_cache()
+ return
+
+ if not delay_unscale:
+ if isinstance(optimizers, list):
+ for optimizer in optimizers:
+ if not optimizer._amp_stash.params_have_scaled_gradients:
+ optimizer._prepare_amp_backward()
+
+ yield (loss.float())*loss_scale
+
+ if delay_unscale:
+ for optimizer in optimizers:
+ optimizer._amp_stash.params_have_scaled_gradients = True
+ else:
+ # FusedSGD may take care of unscaling as part of its step() method.
+ # if not isinstance(optimizers, FP16_Optimizer_for_fused):
+ loss_scaler.clear_overflow_state()
+ for optimizer in optimizers:
+ optimizer._post_amp_backward(loss_scaler)
+ optimizer._amp_stash.params_have_scaled_gradients = False
+ # For future fused optimizers that enable sync-free dynamic loss scaling,
+ # should_skip will always be False.
+ should_skip = False if delay_overflow_check else loss_scaler.update_scale()
+ if should_skip:
+ for optimizer in optimizers:
+ if not optimizer._amp_stash.already_patched:
+ # Close on loss_scaler and loss_id as well, to be safe. Probably not
+ # necessary because amp.scale_loss is already creating a temporary scope.
+ def patch_step(opt, loss_scaler, loss_id):
+ opt_step = opt.step
+ def skip_step(closure=None):
+ if closure is not None:
+ raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
+ maybe_print(("Gradient overflow. Skipping step, loss scaler " +
+ "{} reducing loss scale to {}").format(loss_id,
+ loss_scaler.loss_scale()))
+ # TODO: I don't like the special casing for different optimizer implementations.
+ # Maybe skip should delegate to a method owned by the optimizers themselves.
+ if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
+ # Clear the master grads that wouldn't be zeroed by model.zero_grad()
+ for param in opt._amp_stash.all_fp32_from_fp16_params:
+ param.grad = None
+ if hasattr(opt, "most_recent_scale"):
+ opt.most_recent_scale = 1.0
+ opt.scale_set_by_backward = False
+ opt.step = opt_step
+ opt._amp_stash.already_patched = False
+ return skip_step
+ optimizer.step = patch_step(optimizer, loss_scaler, loss_id)
+ optimizer._amp_stash.already_patched = True
+
+ # Probably ok to skip this if not delay_unscale
+ if _amp_state.opt_properties.patch_torch_functions:
+ _amp_state.handle._clear_cache()
+
+
+# Free function version of AmpHandle.disable_casts, another step on the
+# path to removing the concept of "AmpHandle"
+@contextlib.contextmanager
+def disable_casts():
+ _amp_state.handle._is_active = False
+ yield
+ _amp_state.handle._is_active = True
+
+
+class AmpHandle(object):
+ def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False):
+ self._enable_caching = enable_caching
+ self._verbose = verbose
+ self._cache = dict()
+ self._default_scaler = LossScaler(loss_scale)
+ self._is_active = True
+ self._all_wrappers = []
+
+ def is_active(self):
+ return self._is_active
+
+ @contextlib.contextmanager
+ def _disable_casts(self):
+ self._is_active = False
+ yield
+ self._is_active = True
+
+ def wrap_optimizer(self, optimizer, num_loss=1):
+ self._default_scaler = None
+ return OptimWrapper(optimizer, self, num_loss)
+
+ @contextlib.contextmanager
+ def scale_loss(self, loss, optimizer):
+ raise RuntimeError("The old Amp API is no longer supported. Please move to the new API, "
+ "documented here: https://nvidia.github.io/apex/amp.html. Transition guide: "
+ "https://nvidia.github.io/apex/amp.html#transition-guide-for-old-api-users")
+
+ if not self.is_active():
+ yield loss
+ return
+
+ if self._default_scaler is None:
+ raise RuntimeError(
+ 'After calling `handle.wrap_optimizer()`, you must explicitly ' +
+ 'use `optimizer.scale_loss(loss)`.')
+
+ # TODO: this code block is duplicated here and `opt.py`. Unify.
+ loss_scale = self._default_scaler.loss_scale()
+ yield loss * loss_scale
+
+ self._default_scaler.clear_overflow_state()
+ self._default_scaler.unscale(
+ master_params(optimizer),
+ master_params(optimizer),
+ loss_scale)
+ should_skip = self._default_scaler.update_scale()
+ if should_skip:
+ optimizer_step = optimizer.step
+ def skip_step():
+ maybe_print('Gradient overflow, skipping update')
+ optimizer.step = optimizer_step
+ optimizer.step = skip_step
+
+ self._clear_cache()
+
+ def _clear_cache(self):
+ self._cache.clear()
+
+ # Experimental support for saving / restoring uncasted versions of functions
+ def _save_func(self, mod, fn, func):
+ self._all_wrappers.append((mod, fn, func))
+
+ def _deactivate(self):
+ for mod, fn, func in self._all_wrappers:
+ utils.set_func(mod, fn, func)
+ self._all_wrappers = []
+
+ @property
+ def has_cache(self):
+ return self._enable_caching
+
+ @property
+ def cache(self):
+ return self._cache
+
+ def remove_cache(self, param):
+ if self.has_cache and param in self.cache:
+ del self.cache[param]
+
+ @property
+ def verbose(self):
+ return self._verbose
+
+class NoOpHandle(object):
+ def is_active(self):
+ return False
+
+ @contextlib.contextmanager
+ def _disable_casts(self):
+ yield
+
+ def wrap_optimizer(self, optimizer, num_loss=1):
+ return OptimWrapper(optimizer, self, num_loss)
+
+ @contextlib.contextmanager
+ def scale_loss(self, loss, optimizer):
+ yield loss
+
+ @property
+ def has_cache(self):
+ return False
+
+ @property
+ def verbose(self):
+ return False
+
+ def _clear_cache(self):
+ pass
+
+ def _deactivate(self):
+ pass
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/__init__.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/functional_overrides.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/functional_overrides.py
new file mode 100644
index 0000000000..dd009cec6e
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/functional_overrides.py
@@ -0,0 +1,80 @@
+
+# TODO: think about the following two. They do weird things.
+# - torch.nn.utils.clip_grad (but it should always be fp32 anyway)
+# - torch.nn.utils.weight_norm
+
+# Notes:
+# F.instance_norm uses batch_norm internally, which correctly handles
+# fp16 in/out with fp32 weights, so we shouldn't do anything for
+# either of these.
+# F.normalize calls `input.norm()` internally, so it's redundant, but
+# kept here in case the implementation changes.
+# F.cosine_similarity is the same: it calls `x.norm()` internally.
+
+import torch.nn.functional
+
+MODULE = torch.nn.functional
+
+FP16_FUNCS = [
+ 'conv1d',
+ 'conv2d',
+ 'conv3d',
+ 'conv_transpose1d',
+ 'conv_transpose2d',
+ 'conv_transpose3d',
+ 'conv_tbc', # Undocumented / maybe new?
+ 'linear',
+]
+
+FP32_FUNCS = [
+
+ # Interpolation/Upsampling TODO: Remove for 1.2
+ 'interpolate',
+ 'grid_sample',
+
+ # Pointwise
+ 'softplus',
+ 'softmin',
+ 'log_softmax',
+ 'softmax',
+ 'gelu',
+
+ # Normalization
+ 'layer_norm',
+ 'group_norm',
+ 'local_response_norm',
+ 'normalize',
+ 'cosine_similarity',
+
+ # Loss functions
+ # TODO: which of these can be fp16?
+ 'poisson_nll_loss',
+ 'cosine_embedding_loss',
+ 'cross_entropy',
+ 'hinge_embedding_loss',
+ 'kl_div',
+ 'l1_loss',
+ 'mse_loss',
+ 'margin_ranking_loss',
+ 'multilabel_margin_loss',
+ 'multilabel_soft_margin_loss',
+ 'multi_margin_loss',
+ 'nll_loss',
+ 'binary_cross_entropy_with_logits',
+ 'smooth_l1_loss',
+ 'soft_margin_loss',
+ 'triplet_margin_loss',
+ 'ctc_loss'
+]
+
+BANNED_FUNCS = [
+ ('binary_cross_entropy',
+ ("\namp does not work out-of-the-box with `F.binary_cross_entropy` or `torch.nn.BCELoss.` "
+ "It requires that the output of the previous function be already a FloatTensor. \n\n"
+ "Most models have a Sigmoid right before BCELoss. In that case, you can use\n"
+ " torch.nn.BCEWithLogitsLoss\nto combine Sigmoid+BCELoss into a single layer "
+ "that is compatible with amp.\nAnother option is to add\n"
+ " amp.register_float_function(torch, 'sigmoid')\nbefore calling `amp.init()`.\n"
+ "If you _really_ know what you are doing, you can disable this warning by passing "
+ "allow_banned=True to `amp.init()`."))
+]
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/tensor_overrides.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/tensor_overrides.py
new file mode 100644
index 0000000000..18f3e5dcf2
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/tensor_overrides.py
@@ -0,0 +1,63 @@
+from .. import compat
+from . import torch_overrides
+
+import importlib
+
+import torch
+
+# if compat.variable_is_tensor() and not compat.tensor_is_variable():
+MODULE = torch.Tensor
+# else:
+# MODULE = torch.autograd.Variable
+
+
+FP16_FUNCS = compat.filter_attrs(MODULE, [
+ '__matmul__',
+])
+
+FP32_FUNCS = compat.filter_attrs(MODULE, [
+ '__ipow__',
+ '__pow__',
+ '__rpow__',
+
+ # Cast to fp32 before transfer to CPU
+ 'cpu',
+])
+
+CASTS = compat.filter_attrs(MODULE, [
+ '__add__',
+ '__div__',
+ '__eq__',
+ '__ge__',
+ '__gt__',
+ '__iadd__',
+ '__idiv__',
+ '__imul__',
+ '__isub__',
+ '__itruediv__',
+ '__le__',
+ '__lt__',
+ '__mul__',
+ '__ne__',
+ '__radd__',
+ '__rdiv__',
+ '__rmul__',
+ '__rsub__',
+ '__rtruediv__',
+ '__sub__',
+ '__truediv__',
+])
+
+# None of these, but here to make code cleaner.
+SEQUENCE_CASTS = []
+
+# We need to grab all the methods from torch_overrides and add them to
+# the Tensor lists as well, as almost all methods are duplicated
+# between `torch` and `torch.Tensor` (and check with `hasattr`,
+# because a few random ones aren't defined on Tensor)
+_self_mod = importlib.import_module(__name__)
+for attrname in ['FP16_FUNCS', 'FP32_FUNCS', 'CASTS', 'SEQUENCE_CASTS']:
+ lst = getattr(_self_mod, attrname)
+ for fn in getattr(torch_overrides, attrname):
+ if hasattr(MODULE, fn):
+ lst.append(fn)
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/torch_overrides.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/torch_overrides.py
new file mode 100644
index 0000000000..7dedb05a83
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/lists/torch_overrides.py
@@ -0,0 +1,115 @@
+import torch
+
+from .. import utils
+
+MODULE = torch
+
+FP16_FUNCS = [
+ # Low level functions wrapped by torch.nn layers.
+ # The wrapper layers contain the weights which are then passed in as a parameter
+ # to these functions.
+ 'conv1d',
+ 'conv2d',
+ 'conv3d',
+ 'conv_transpose1d',
+ 'conv_transpose2d',
+ 'conv_transpose3d',
+ 'conv_tbc',
+ 'prelu',
+
+ # BLAS
+ 'addmm',
+ 'addmv',
+ 'addr',
+ 'matmul',
+ 'mm',
+ 'mv',
+]
+
+FP32_FUNCS = [
+ # Pointwise
+ 'acos',
+ 'asin',
+ 'cosh',
+ 'erfinv',
+ 'exp',
+ 'expm1',
+ 'log',
+ 'log10',
+ 'log2',
+ 'reciprocal',
+ 'rsqrt',
+ 'sinh',
+ 'tan',
+
+ # Other math
+ 'pow',
+
+ # Reduction
+ 'cumprod',
+ 'cumsum',
+ 'dist',
+ # 'mean',
+ 'norm',
+ 'prod',
+ 'std',
+ 'sum',
+ 'var',
+
+ # Misc
+ 'renorm'
+]
+
+version_strings = torch.__version__.split('.')
+version_major = version_strings[0]
+version_minor = version_strings[1]
+version_num = float(version_major + "." + version_minor)
+# Before torch 1.1, mean must be blacklisted.
+if version_num < 1.1:
+ FP32_FUNCS.append('mean')
+
+# Before CUDA 9.1, batched matmul was missing fast FP16 kernels. We
+# check the CUDA version -- if at least 9.1, then put the bmm
+# functions on the fp16 list. Otherwise, put them on the fp32 list.
+_bmms = ['addbmm',
+ 'baddbmm',
+ 'bmm']
+
+if utils.is_cuda_enabled():
+ # workaround https://github.com/facebookresearch/maskrcnn-benchmark/issues/802
+ if utils.get_cuda_version() >= (9, 1, 0):
+ FP16_FUNCS.extend(_bmms)
+ else:
+ FP32_FUNCS.extend(_bmms)
+
+# Multi-tensor fns that may need type promotion
+CASTS = [
+ # Multi-tensor math
+ 'addcdiv',
+ 'addcmul',
+ 'atan2',
+ 'cross',
+ 'bilinear',
+ 'dot',
+
+ # Element-wise _or_ tensor-wise math
+ 'add',
+ 'div',
+ 'mul',
+
+ # Comparison
+ 'eq',
+ 'equal',
+ 'ge',
+ 'gt',
+ 'le',
+ 'lt',
+ 'ne'
+]
+
+# Functions that take sequence arguments. We need to inspect the whole
+# sequence and cast to the widest type.
+SEQUENCE_CASTS = [
+ 'cat',
+ 'stack'
+]
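+
+# Hedged illustration (not upstream code) of what the lists above mean once
+# O1 patching is applied: torch.mm(a_fp16, b_fp16) stays in FP16,
+# torch.sum(x_fp16) has its input cast to FP32 first,
+# torch.add(x_fp16, y_fp32) promotes both operands to a common type, and
+# torch.cat([x_fp16, y_fp32]) inspects the whole sequence and casts it to the
+# widest type before dispatch.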
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/opt.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/opt.py
new file mode 100644
index 0000000000..baf311684d
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/opt.py
@@ -0,0 +1,103 @@
+import contextlib
+import warnings
+
+from .scaler import LossScaler, master_params
+from ._amp_state import maybe_print
+
+import numpy as np
+
+class OptimWrapper(object):
+ def __init__(self, optimizer, amp_handle, num_loss):
+ self._optimizer = optimizer
+ self._amp_handle = amp_handle
+ self._num_loss = num_loss
+ self._loss_idx = 0
+ self._skip_next = [False] * num_loss
+ self._loss_scaler = [LossScaler('dynamic') for _ in range(num_loss)]
+
+ @contextlib.contextmanager
+ def scale_loss(self, loss):
+ if not self._amp_handle.is_active():
+ yield loss
+ return
+
+ # When there are multiple losses per optimizer, we need
+ # to save the current grad accumulation, since we won't be
+ # able to unscale this particular loss once the grads are
+ # all mixed together.
+ cached_grads = []
+ if self._loss_idx > 0:
+ for p in master_params(self._optimizer):
+ if p.grad is not None:
+ cached_grads.append(p.grad.data.detach().clone())
+ else:
+ cached_grads.append(None)
+ self._optimizer.zero_grad()
+
+ loss_scale = self._cur_loss_scaler().loss_scale()
+ yield loss * loss_scale
+
+ self._cur_loss_scaler().clear_overflow_state()
+ self._cur_loss_scaler().unscale(
+ master_params(self._optimizer),
+ master_params(self._optimizer),
+ loss_scale)
+ self._skip_next[self._loss_idx] = self._cur_loss_scaler().update_scale()
+ self._loss_idx += 1
+
+ if len(cached_grads) > 0:
+ for p, cached_grad in zip(master_params(self._optimizer),
+ cached_grads):
+ if cached_grad is not None:
+ p.grad.data.add_(cached_grad)
+ cached_grads = []
+
+ def _cur_loss_scaler(self):
+ assert 0 <= self._loss_idx < self._num_loss
+ return self._loss_scaler[self._loss_idx]
+
+ def step(self, closure=None):
+ if not self._amp_handle.is_active():
+ return self._optimizer.step(closure=closure)
+
+ self._loss_idx = 0
+
+ for group in self._optimizer.param_groups:
+ for p in group['params']:
+ self._amp_handle.remove_cache(p)
+
+ if closure is not None:
+ raise NotImplementedError(
+ 'The `closure` argument is unsupported by the amp ' +
+ 'optimizer wrapper.')
+ if any(self._skip_next):
+ maybe_print('Gradient overflow, skipping update')
+ self._skip_next = [False] * self._num_loss
+ else:
+ return self._optimizer.step(closure=closure)
+
+ # Forward any attribute lookups
+ def __getattr__(self, attr):
+ return getattr(self._optimizer, attr)
+
+ # Forward all torch.optim.Optimizer methods
+ def __getstate__(self):
+ return self._optimizer.__getstate__()
+
+ def __setstate__(self):
+ return self._optimizer.__setstate__()
+
+ def __repr__(self):
+ return self._optimizer.__repr__()
+
+ def state_dict(self):
+ return self._optimizer.state_dict()
+
+ def load_state_dict(self, state_dict):
+ return self._optimizer.load_state_dict(state_dict)
+
+ def zero_grad(self):
+ return self._optimizer.zero_grad()
+
+ def add_param_group(self, param_group):
+ return self._optimizer.add_param_group(param_group)
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/rnn_compat.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/rnn_compat.py
new file mode 100644
index 0000000000..d062ae2658
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/rnn_compat.py
@@ -0,0 +1,53 @@
+from . import utils, wrap
+
+import torch
+_VF = torch._C._VariableFunctions
+RNN_NAMES = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
+
+def _gen_VF_wrapper(name):
+ def wrapper(*args, **kwargs):
+ return getattr(_VF, name)(*args, **kwargs)
+ return wrapper
+
+# Some Python magic to generate an object that has the RNN cell functions
+# defined on it, all of which call into the corresponding _VF function.
+# Intended to patch torch.nn.modules.rnn._VF (aka, the ref named "_VF"
+# imported at module scope within torch.nn.modules.rnn). This should
+# not affect third-party importers of _VF.py.
+class VariableFunctionsShim(object):
+ def __init__(self):
+ for name in RNN_NAMES:
+ for suffix in ['', '_cell']:
+ fn_name = name + suffix
+ setattr(self, fn_name, _gen_VF_wrapper(fn_name))
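+
+# Hedged sketch (illustration only): during amp initialization the shim is
+# swapped in for the `_VF` reference held by torch.nn.modules.rnn, roughly
+#
+#     torch.nn.modules.rnn._VF = VariableFunctionsShim()
+#
+# after which `whitelist_rnn_cells` below can wrap the *_cell entry points.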
+
+def has_old_rnns():
+ try:
+ torch.nn.backends.thnn.backend.LSTMCell
+ return True
+ except:
+ return False
+
+def whitelist_rnn_cells(handle, verbose):
+ # Different module + function names in old/new RNN cases
+ if has_old_rnns():
+ fn_names = ['RNNReLUCell', 'RNNTanhCell', 'LSTMCell', 'GRUCell']
+ mod = torch.nn.backends.thnn.backend
+ else:
+ fn_names = [x + '_cell' for x in RNN_NAMES]
+ mod = torch.nn.modules.rnn._VF
+ assert isinstance(mod, VariableFunctionsShim)
+
+ # Insert casts on cell functions
+ for fn in fn_names:
+ wrap.cached_cast(mod, fn, utils.maybe_half, handle,
+ try_caching=True, verbose=verbose)
+
+ if has_old_rnns():
+ # Special handling of `backward` for fused gru / lstm:
+ # The `backward` method calls Tensor.sum() (blacklist) internally,
+ # and then the resulting grad_input has the wrong type.
+ # TODO: where else is this a problem?
+ for rnn_type in ['GRUFused', 'LSTMFused']:
+ mod = getattr(torch.nn._functions.thnn.rnnFusedPointwise, rnn_type)
+ wrap.disable_casts(mod, 'backward', handle)
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/scaler.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/scaler.py
new file mode 100644
index 0000000000..99888bc6fc
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/scaler.py
@@ -0,0 +1,217 @@
+import torch
+from ..multi_tensor_apply import multi_tensor_applier
+from ._amp_state import _amp_state, master_params, maybe_print
+from itertools import product
+
+def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=False):
+ # Exception handling for 18.04 compatibility
+ if check_overflow:
+ cpu_sum = float(model_grad.float().sum())
+ if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
+ return True
+
+ if master_grad is not model_grad: # copy_ probably internally short-circuits this
+ master_grad.copy_(model_grad)
+ if scale != 1.0:
+ master_grad.mul_(scale)
+ return False
+
+def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
+ # Exception handling for 18.04 compatibility
+ if check_overflow:
+ cpu_sum = float(model_grad.float().sum())
+ if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
+ return True
+
+ # if master_grad is not model_grad: # copy_ probably internally short-circuits this
+ # master_grad.copy_(model_grad)
+ assert stashed_grad.dtype == master_grad.dtype
+ converted_model_grad = model_grad.data.to(master_grad.dtype)
+ master_grad.data = a*converted_model_grad.data + b*stashed_grad.data
+ return False
+
+class LossScaler(object):
+ warned_no_fused_kernel = False
+ warned_unscaling_non_fp32_grad = False
+ has_fused_kernel = False
+
+ def __init__(self,
+ loss_scale,
+ init_scale=2.**16,
+ scale_factor=2.,
+ scale_window=2000,
+ min_loss_scale=None,
+ max_loss_scale=2.**24):
+ if loss_scale == "dynamic":
+ self.dynamic = True
+ self._loss_scale = min(max_loss_scale, init_scale)
+ else:
+ self.dynamic = False
+ self._loss_scale = loss_scale
+ self._max_loss_scale = max_loss_scale
+ self._min_loss_scale = min_loss_scale
+ self._scale_seq_len = scale_window
+ self._unskipped = 0
+ self._has_overflow = False
+ self._overflow_buf = torch.cuda.IntTensor([0])
+ if multi_tensor_applier.available:
+ import amp_C
+ LossScaler.has_fused_kernel = multi_tensor_applier.available
+ LossScaler.multi_tensor_scale_cuda = amp_C.multi_tensor_scale
+ LossScaler.multi_tensor_axpby_cuda = amp_C.multi_tensor_axpby
+ else:
+ if not LossScaler.warned_no_fused_kernel:
+ maybe_print(
+ "Warning: multi_tensor_applier fused unscale kernel is unavailable, "
+ "possibly because apex was installed without --cuda_ext --cpp_ext. "
+ "Using Python fallback. Original ImportError was: " +
+ repr(multi_tensor_applier.import_err),
+ True)
+ LossScaler.has_fused_kernel = False
+ LossScaler.warned_no_fused_kernel = True
+
+ def loss_scale(self):
+ return self._loss_scale
+
+ def unscale_python(self, model_grads, master_grads, scale):
+ for model, master in zip(model_grads, master_grads):
+ if model is not None:
+ if not LossScaler.warned_unscaling_non_fp32_grad:
+ if master.dtype != torch.float32:
+ maybe_print(
+ "Attempting to unscale a grad with type {} ".format(master.type()) +
+ "Unscaling non-fp32 grads may indicate an error. "
+ "When using Amp, you don't need to call .half() on your model.")
+ LossScaler.warned_unscaling_non_fp32_grad = True
+ self._has_overflow = scale_check_overflow_python(model,
+ master,
+ 1./scale,
+ self.dynamic)
+ if self._has_overflow and self.dynamic:
+ break
+
+ # unused_scale keeps some of the old API alive for hopefully a short time.
+ def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None):
+ if self._has_overflow:
+ return
+
+ scale = self._loss_scale
+ if scale_override is not None:
+ scale = scale_override
+
+ if scale == 1.0 and models_are_masters and not self.dynamic:
+ return
+
+ if LossScaler.has_fused_kernel:
+ # if (not LossScaler.warned_unscaling_non_fp32_grad
+ # and master_grads[0].dtype == torch.float16):
+ # print("Warning: unscaling grads that are not FP32. "
+ # "Unscaling non-fp32 grads may indicate an error. "
+ # "When using Amp, you don't need to call .half() on your model.")
+ # # Setting this to True unconditionally allows the possibility of an escape
+ # # if never-before-seen non-fp32 grads are created in some later iteration.
+ # LossScaler.warned_unscaling_non_fp32_grad = True
+ multi_tensor_applier(LossScaler.multi_tensor_scale_cuda,
+ self._overflow_buf,
+ [model_grads, master_grads],
+ 1./scale)
+ else:
+ self.unscale_python(model_grads, master_grads, scale)
+
+ # Defer to update_scale
+ # If the fused kernel is available, we only need one D2H memcopy and sync.
+ # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
+ # self._has_overflow = self._overflow_buf.item()
+
+ def unscale_with_stashed_python(self,
+ model_grads,
+ stashed_master_grads,
+ master_grads,
+ a,
+ b):
+ for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
+ if model is None and stashed is None:
+ continue
+ else:
+ if not LossScaler.warned_unscaling_non_fp32_grad:
+ if master.dtype != torch.float32:
+ maybe_print(
+ "Attempting to unscale a grad with type {} ".format(master.type()) +
+ "Unscaling non-fp32 grads may indicate an error. "
+ "When using Amp, you don't need to call .half() on your model.")
+ LossScaler.warned_unscaling_non_fp32_grad = True
+ self._has_overflow = axpby_check_overflow_python(model,
+ stashed,
+ master,
+ a,
+ b,
+ self.dynamic)
+ if self._has_overflow and self.dynamic:
+ break
+
+ def unscale_with_stashed(self,
+ model_grads,
+ stashed_master_grads,
+ master_grads,
+ scale_override=None):
+ if self._has_overflow:
+ return
+
+ grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0
+ if scale_override is not None:
+ grads_have_scale, stashed_have_scale, out_scale = scale_override
+
+ if LossScaler.has_fused_kernel:
+ if (not LossScaler.warned_unscaling_non_fp32_grad
+ and master_grads[0].dtype == torch.float16):
+ print("Warning: unscaling grads that are not FP32. "
+ "Unscaling non-fp32 grads may indicate an error. "
+ "When using Amp, you don't need to call .half() on your model.")
+ # Setting this to True unconditionally allows the possibility of an escape
+ # if never-before-seen non-fp32 grads are created in some later iteration.
+ LossScaler.warned_unscaling_non_fp32_grad = True
+ multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
+ self._overflow_buf,
+ [model_grads, stashed_master_grads, master_grads],
+ out_scale/grads_have_scale, # 1./scale,
+ out_scale/stashed_have_scale, # 1.0,
+ 0) # check only arg 0, aka the incoming model grads, for infs
+ else:
+ self.unscale_with_stashed_python(model_grads,
+ stashed_master_grads,
+ master_grads,
+ out_scale/grads_have_scale,
+ out_scale/stashed_have_scale)
+
+ # Defer to update_scale
+ # If the fused kernel is available, we only need one D2H memcopy and sync.
+ # if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
+ # self._has_overflow = self._overflow_buf.item()
+
+ def clear_overflow_state(self):
+ self._has_overflow = False
+ if self.has_fused_kernel:
+ self._overflow_buf.zero_()
+
+ # Separate so unscale() can be called more than once before updating.
+ def update_scale(self):
+ # If the fused kernel is available, we only need one D2H memcopy and sync.
+ if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
+ self._has_overflow = self._overflow_buf.item()
+
+ if self._has_overflow and self.dynamic:
+ should_skip = True
+ if self._min_loss_scale:
+ self._loss_scale = max(self._min_loss_scale, self._loss_scale/2.)
+ else:
+ self._loss_scale = self._loss_scale/2.
+ self._unskipped = 0
+ else:
+ should_skip = False
+ self._unskipped += 1
+
+ if self._unskipped == self._scale_seq_len and self.dynamic:
+ self._loss_scale = min(self._max_loss_scale, self._loss_scale*2.)
+ self._unskipped = 0
+
+ return should_skip
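+
+# Hedged illustration of the dynamic schedule implemented above (not part of
+# the original file; constructing a LossScaler requires a CUDA-enabled torch
+# because of the overflow buffer). An overflow halves the scale immediately,
+# while `scale_window` consecutive finite updates double it, capped at
+# `max_loss_scale`:
+#
+#     scaler = LossScaler("dynamic")   # starts at min(max_loss_scale, 2.**16)
+#     scaler.clear_overflow_state()
+#     # ... unscale gradients for one iteration ...
+#     if scaler.update_scale():        # True -> overflow seen, step should be skipped
+#         pass                         # caller skips optimizer.step()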
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/utils.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/utils.py
new file mode 100644
index 0000000000..0590cd70a1
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/utils.py
@@ -0,0 +1,210 @@
+from . import compat
+
+import functools
+import itertools
+
+import torch
+
+def is_cuda_enabled():
+ return torch.version.cuda is not None
+
+def get_cuda_version():
+ return tuple(int(x) for x in torch.version.cuda.split('.'))
+
+def is_fp_tensor(x):
+ if is_nested(x):
+ # Fast-fail version of all(is_fp_tensor)
+ for y in x:
+ if not is_fp_tensor(y):
+ return False
+ return True
+ return compat.is_tensor_like(x) and compat.is_floating_point(x)
+
+def is_nested(x):
+ return isinstance(x, tuple) or isinstance(x, list)
+
+def should_cache(x):
+ if is_nested(x):
+ # Fast-fail version of all(should_cache)
+ for y in x:
+ if not should_cache(y):
+ return False
+ return True
+ return isinstance(x, torch.nn.parameter.Parameter) and \
+ type_string(x) == 'FloatTensor'
+
+def collect_fp_tensor_types(args, kwargs):
+ def collect_types(x, types):
+ if is_nested(x):
+ for y in x:
+ collect_types(y, types)
+ else:
+ types.add(type_string(x))
+
+ all_args = itertools.chain(args, kwargs.values())
+ types = set()
+ for x in all_args:
+ if is_fp_tensor(x):
+ collect_types(x, types)
+ return types
+
+def type_string(x):
+ return x.type().split('.')[-1]
+
+def maybe_half(x, name='', verbose=False):
+ if is_nested(x):
+ return type(x)([maybe_half(y) for y in x])
+
+ if not x.is_cuda or type_string(x) == 'HalfTensor':
+ return x
+ else:
+ if verbose:
+ print('Float->Half ({})'.format(name))
+ return x.half()
+
+def maybe_float(x, name='', verbose=False):
+ if is_nested(x):
+ return type(x)([maybe_float(y) for y in x])
+
+ if not x.is_cuda or type_string(x) == 'FloatTensor':
+ return x
+ else:
+ if verbose:
+ print('Half->Float ({})'.format(name))
+ return x.float()
+
+# NB: returns a casted copy of `args`, mutates `kwargs` in-place
+def casted_args(cast_fn, args, kwargs):
+ new_args = []
+ for x in args:
+ if is_fp_tensor(x):
+ new_args.append(cast_fn(x))
+ else:
+ new_args.append(x)
+ for k in kwargs:
+ val = kwargs[k]
+ if is_fp_tensor(val):
+ kwargs[k] = cast_fn(val)
+ return new_args
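+
+# Hedged example (illustration only) of the contract noted above: positional
+# args come back as a new list, keyword args are cast in place
+# (`x_fp32` and `b_fp32` are assumed CUDA float tensors):
+#
+#     args, kwargs = (x_fp32, 3), {'bias': b_fp32}
+#     new_args = casted_args(maybe_half, args, kwargs)
+#     # new_args[0] and kwargs['bias'] are now half; the int 3 is untouched.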
+
+def cached_cast(cast_fn, x, cache):
+ if is_nested(x):
+ return type(x)([cached_cast(cast_fn, y, cache) for y in x])
+ if x in cache:
+ cached_x = cache[x]
+ if x.requires_grad and cached_x.requires_grad:
+ # Make sure x is actually cached_x's autograd parent.
+ if cached_x.grad_fn.next_functions[1][0].variable is not x:
+ raise RuntimeError("x and cache[x] both require grad, but x is not "
+ "cache[x]'s parent. This is likely an error.")
+ # During eval, it's possible to end up caching casted weights with
+ # requires_grad=False. On the next training iter, if cached_x is found
+ # and reused from the cache, it will not actually have x as its parent.
+ # Therefore, we choose to invalidate the cache (and force refreshing the cast)
+ # if x.requires_grad and cached_x.requires_grad do not match.
+ #
+ # During eval (i.e. running under with torch.no_grad()) the invalidation
+ # check would cause the cached value to be dropped every time, because
+ # cached_x would always be created with requires_grad=False, while x would
+ # still have requires_grad=True. This would render the cache effectively
+ # useless during eval. Therefore, if we are running under the no_grad()
+ # context manager (torch.is_grad_enabled=False) we elide the invalidation
+ # check, and use the cached value even though its requires_grad flag doesn't
+ # match. During eval, we don't care that there's no autograd-graph
+ # connection between x and cached_x.
+ if torch.is_grad_enabled() and x.requires_grad != cached_x.requires_grad:
+ del cache[x]
+ else:
+ return cached_x
+
+ casted_x = cast_fn(x)
+ cache[x] = casted_x
+ return casted_x
+
+def verbosify(cast_fn, fn_name, verbose):
+ if verbose:
+ return functools.partial(cast_fn, name=fn_name, verbose=verbose)
+ else:
+ return cast_fn
+
+def as_inplace(fns):
+ for x in fns:
+ yield x + '_'
+
+def has_func(mod, fn):
+ if isinstance(mod, dict):
+ return fn in mod
+ else:
+ return hasattr(mod, fn)
+
+def get_func(mod, fn):
+ if isinstance(mod, dict):
+ return mod[fn]
+ else:
+ return getattr(mod, fn)
+
+def set_func(mod, fn, new_fn):
+ if isinstance(mod, dict):
+ mod[fn] = new_fn
+ else:
+ setattr(mod, fn, new_fn)
+
+def set_func_save(handle, mod, fn, new_fn):
+ cur_fn = get_func(mod, fn)
+ handle._save_func(mod, fn, cur_fn)
+ set_func(mod, fn, new_fn)
+
+# A couple problems get solved here:
+# - The flat_weight buffer is disconnected from autograd graph,
+# so the fp16 weights need to be derived from the input weights
+# to this forward call, not the flat buffer.
+# - The ordering of weights in the flat buffer is...idiosyncratic.
+# The first problem is solved with a combination of set_ (to set up the
+# correct storage) and copy_ (so the fp16 weight derives from the
+# fp32 one in autograd).
+# The second is solved by doing pointer arithmetic on the fp32 weights
+# to derive the correct offset.
+#
+# TODO: maybe this should actually use
+# `torch._cudnn_rnn_flatten_weight`? But then I need to call
+# on first iter and cache the right offsets. Ugh.
+def synthesize_flattened_rnn_weights(fp32_weights,
+ fp16_flat_tensor,
+ rnn_fn='',
+ verbose=False):
+ fp16_weights = []
+ fp32_base_ptr = fp32_weights[0][0].data_ptr()
+ for layer_weights in fp32_weights:
+ fp16_layer_weights = []
+ for w_fp32 in layer_weights:
+ w_fp16 = w_fp32.new().half()
+ offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
+ w_fp16.set_(fp16_flat_tensor.storage(),
+ offset,
+ w_fp32.shape)
+ w_fp16.copy_(w_fp32)
+ if verbose:
+ print('Float->Half ({})'.format(rnn_fn))
+ fp16_layer_weights.append(w_fp16)
+ fp16_weights.append(fp16_layer_weights)
+ return fp16_weights
+
+# Roughly the same as above, except the `fp32_weights` aren't nested.
+# Code kept separate for readability.
+def new_synthesize_flattened_rnn_weights(fp32_weights,
+ fp16_flat_tensor,
+ rnn_fn='',
+ verbose=False):
+ fp16_weights = []
+ fp32_base_ptr = fp32_weights[0].data_ptr()
+ for w_fp32 in fp32_weights:
+ w_fp16 = w_fp32.new().half()
+ offset = (w_fp32.data_ptr() - fp32_base_ptr) // w_fp32.element_size()
+ w_fp16.set_(fp16_flat_tensor.storage(),
+ offset,
+ w_fp32.shape)
+ w_fp16.copy_(w_fp32)
+ if verbose:
+ print('Float->Half ({})'.format(rnn_fn))
+ fp16_weights.append(w_fp16)
+ return fp16_weights
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/wrap.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/wrap.py
new file mode 100644
index 0000000000..559d0558d9
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/amp/wrap.py
@@ -0,0 +1,276 @@
+from . import compat
+from . import utils
+from ._amp_state import _amp_state
+from . import rnn_compat
+
+import functools
+
+import torch
+
+def make_cast_wrapper(orig_fn, cast_fn, handle,
+ try_caching=False):
+ @functools.wraps(orig_fn)
+ def wrapper(*args, **kwargs):
+ if not handle.is_active():
+ return orig_fn(*args, **kwargs)
+
+ if try_caching and handle.has_cache:
+ args = list(args)
+ for i in range(len(args)):
+ if utils.should_cache(args[i]):
+ args[i] = utils.cached_cast(cast_fn, args[i], handle.cache)
+ for k in kwargs:
+ if utils.should_cache(kwargs[k]):
+ kwargs[k] = utils.cached_cast(cast_fn, kwargs[k], handle.cache)
+ new_args = utils.casted_args(cast_fn,
+ args,
+ kwargs)
+ return orig_fn(*new_args, **kwargs)
+ return wrapper
+
+def cached_cast(mod, fn, cast_fn, handle,
+ try_caching=False, verbose=False):
+ if not utils.has_func(mod, fn):
+ return
+
+ orig_fn = utils.get_func(mod, fn)
+ cast_fn = utils.verbosify(cast_fn, fn, verbose)
+ wrapper = make_cast_wrapper(orig_fn, cast_fn, handle, try_caching)
+ utils.set_func_save(handle, mod, fn, wrapper)
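+
+# Hedged usage sketch (illustration only): this is roughly how whitelist casts
+# get installed during initialization, e.g. for torch.nn.functional.linear:
+#
+#     cached_cast(torch.nn.functional, 'linear', utils.maybe_half, handle,
+#                 try_caching=True, verbose=False)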
+
+# The `handle` arg is unused, but keeping it makes this API consistent with `make_cast_wrapper`.
+# Annoyingly, make_promote_wrapper still uses the global handle. Once everyone
+# is on the new API and I am free to get rid of handle, I can clean this up.
+def make_promote_wrapper(orig_fn, cast_fn, handle=None):
+ @functools.wraps(orig_fn)
+ def wrapper(*args, **kwargs):
+ if not _amp_state.handle.is_active():
+ return orig_fn(*args, **kwargs)
+
+ types = utils.collect_fp_tensor_types(args, kwargs)
+
+ if len(types) <= 1:
+ return orig_fn(*args, **kwargs)
+ elif len(types) == 2 and types == set(['HalfTensor', 'FloatTensor']):
+ new_args = utils.casted_args(cast_fn,
+ args,
+ kwargs)
+ return orig_fn(*new_args, **kwargs)
+ else:
+ raise NotImplementedError('Do not know how to handle ' +
+ 'these types to promote: {}'
+ .format(types))
+ return wrapper
+
+def promote(mod, fn, handle, verbose=False):
+ orig_fn = utils.get_func(mod, fn)
+ maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)
+ wrapper = make_promote_wrapper(orig_fn, maybe_float)
+ utils.set_func_save(handle, mod, fn, wrapper)
+
+def sequence_promote(mod, fn, handle, verbose=False):
+ orig_fn = utils.get_func(mod, fn)
+ maybe_float = utils.verbosify(utils.maybe_float, fn, verbose)
+ @functools.wraps(orig_fn)
+ def wrapper(seq, *args, **kwargs):
+ if not _amp_state.handle.is_active():
+ return orig_fn(seq, *args, **kwargs)
+
+ types = set([utils.type_string(x) for x in seq])
+ if len(types) <= 1:
+ return orig_fn(seq, *args, **kwargs)
+ elif types == set(['HalfTensor', 'FloatTensor']):
+ cast_seq = utils.casted_args(maybe_float,
+ seq, {})
+ return orig_fn(cast_seq, *args, **kwargs)
+ else:
+ # TODO: other mixed-type cases aren't due to amp.
+ # Just pass through?
+ return orig_fn(seq, *args, **kwargs)
+ utils.set_func_save(handle, mod, fn, wrapper)
+
+def promote_match_arg0(mod, fn, handle, verbose=False):
+ if not utils.has_func(mod, fn):
+ return
+
+ orig_fn = utils.get_func(mod, fn)
+ @functools.wraps(orig_fn)
+ def wrapper(arg0, *args, **kwargs):
+ assert compat.is_tensor_like(arg0)
+ if not _amp_state.handle.is_active():
+ return orig_fn(arg0, *args, **kwargs)
+
+ if utils.type_string(arg0) == 'HalfTensor':
+ cast_fn = utils.maybe_half
+ elif utils.type_string(arg0) == 'FloatTensor':
+ cast_fn = utils.maybe_float
+ else:
+ return orig_fn(arg0, *args, **kwargs)
+ cast_fn = utils.verbosify(cast_fn, fn, verbose)
+ new_args = utils.casted_args(cast_fn, args, kwargs)
+ return orig_fn(arg0, *new_args, **kwargs)
+ utils.set_func_save(handle, mod, fn, wrapper)
+
+def err_if_any_half(mod, fn, handle, custom_err_msg=None):
+ if not utils.has_func(mod, fn):
+ return
+
+ orig_fn = utils.get_func(mod, fn)
+ @functools.wraps(orig_fn)
+ def wrapper(*args, **kwargs):
+ types = utils.collect_fp_tensor_types(args, kwargs)
+ if 'HalfTensor' in types:
+ if custom_err_msg:
+ raise NotImplementedError(custom_err_msg)
+ else:
+ raise NotImplementedError('Cannot call in-place function ' +
+ '{} with fp16 arguments.'.format(fn))
+ else:
+ return orig_fn(*args, **kwargs)
+ utils.set_func_save(handle, mod, fn, wrapper)
+
+def err_if_arg0_half(mod, fn, handle, verbose=False):
+ if not utils.has_func(mod, fn):
+ return
+
+ orig_fn = utils.get_func(mod, fn)
+ @functools.wraps(orig_fn)
+ def wrapper(arg0, *args, **kwargs):
+ assert compat.is_tensor_like(arg0)
+ if utils.type_string(arg0) == 'HalfTensor':
+ raise NotImplementedError('Cannot call in-place method ' +
+ '{} on fp16 Tensors.'.format(fn))
+ else:
+ cast_fn = utils.verbosify(utils.maybe_float, fn, verbose)
+ new_args = utils.casted_args(cast_fn, args, kwargs)
+ return orig_fn(arg0, *new_args, **kwargs)
+ utils.set_func_save(handle, mod, fn, wrapper)
+
+# Current RNN approach:
+# - Wrap top-level `RNN` function in thnn backend
+# - Will call into either CudnnRNN or AutogradRNN
+# - Each of these is a factory function that returns a per-iter
+# `forward` function
+# - We interpose on the factory function to:
+# 1) Interpose on the actual forward function and put in casts
+# 2) Insert an fp16 `flat_weight` if necessary
+def rnn_cast(backend, fn, handle, verbose=False):
+ orig_rnn = utils.get_func(backend, fn)
+ @functools.wraps(orig_rnn)
+ def rnn_wrapper(*args, **kwargs):
+ flat_weight = kwargs.get('flat_weight')
+ if flat_weight is not None:
+ # We replace `flat_weight` with an uninitialized fp16
+ # Tensor. The "actual" weight tensors (provided in `forward`),
+ # will then be set up as ptrs into the buffer and have the
+ # corresponding fp32 values copied in.
+ # We need to call `copy` on the "actual" weights so that the
+ # autograd graph correctly backprops from the wgrads computed
+ # inside cuDNN (on fp16 weights) into the fp32 weights.
+ assert utils.type_string(flat_weight) == 'FloatTensor'
+ if compat.tensor_is_float_tensor() or compat.tensor_is_variable():
+ # Pre-0.4. A little slower, since it zeros out memory.
+ flat_weight_fp16 = flat_weight.new().half().resize_(flat_weight.shape)
+ else:
+ flat_weight_fp16 = torch.empty_like(flat_weight,
+ dtype=torch.float16)
+ kwargs['flat_weight'] = flat_weight_fp16
+ else:
+ flat_weight_fp16 = None
+
+ forward = orig_rnn(*args, **kwargs)
+ @functools.wraps(forward)
+ def fwd_wrapper(*fargs, **fkwargs):
+ assert len(fargs) == 3 or len(fargs) == 4
+ inputs, weights, hiddens = fargs[:3]
+ assert utils.is_fp_tensor(inputs)
+ assert isinstance(weights, list)
+ cast_fn = utils.verbosify(utils.maybe_half,
+ fn,
+ verbose)
+ new_args = []
+
+ # 0) Inputs
+ new_args.append(cast_fn(inputs))
+
+ # 1) Weights
+ if flat_weight_fp16 is not None:
+ fp16_weights = utils.synthesize_flattened_rnn_weights(
+ weights, flat_weight_fp16, fn, verbose)
+ else:
+ fp16_weights = [[cast_fn(w) for w in layer]
+ for layer in weights]
+ new_args.append(fp16_weights)
+
+            # 2) Hiddens: either a tuple (for LSTM) or a single tensor
+ if isinstance(hiddens, tuple):
+ new_args.append(tuple(cast_fn(x) for x in hiddens))
+ elif utils.is_fp_tensor(hiddens):
+ new_args.append(cast_fn(hiddens))
+ else:
+ # Hiddens can, in principle, be `None` -- pass through
+ new_args.append(hiddens)
+
+ # 3) Batch sizes (0.4 or later only)
+ if len(fargs) == 4:
+ new_args.append(fargs[3])
+
+ return forward(*new_args, **fkwargs)
+ return fwd_wrapper
+ utils.set_func_save(handle, backend, fn, rnn_wrapper)
+
+def new_rnn_cast(fn, handle, verbose=False):
+ # Forward+backward compatibility around https://github.com/pytorch/pytorch/pull/15744
+ # For rnn backend calls that route through _rnn_impls, we must patch the ref
+ # that _rnn_impls stashed. For rnn backend calls that directly invoke
+    # _VF.<method>, e.g. _VF.lstm, we can patch onto VariableFunctionsShim,
+ # which in turn has patched the ref named "_VF" in torch.nn.modules.rnn.
+ if utils.has_func(torch.nn.modules.rnn._rnn_impls, fn):
+ mod = torch.nn.modules.rnn._rnn_impls
+ else:
+ mod = torch.nn.modules.rnn._VF
+ assert isinstance(mod, rnn_compat.VariableFunctionsShim)
+ fn = fn.lower()
+ orig_fn = utils.get_func(mod, fn)
+ cast_fn = utils.verbosify(utils.maybe_half, fn, verbose)
+ @functools.wraps(orig_fn)
+ def wrapper(*args, **kwargs):
+ # Exact call signature from modules/rnn.py
+ assert len(args) == 9
+ assert len(kwargs) == 0
+
+ if not _amp_state.handle.is_active():
+ return orig_fn(*args, **kwargs)
+
+ if isinstance(args[6], bool):
+ params_idx = 2 # Not PackedSequence case
+ else:
+ params_idx = 3 # PackedSequence case
+
+ new_args = []
+ for i, arg in enumerate(args):
+ if i == params_idx:
+ num_params = sum([x.numel() for x in arg])
+ fp16_weight_buf = args[0].new_empty((num_params,),
+ dtype=torch.half)
+ casted_weights = utils.new_synthesize_flattened_rnn_weights(
+ arg, fp16_weight_buf, fn, verbose)
+ new_args.append(casted_weights)
+ elif utils.is_fp_tensor(arg):
+ new_args.append(cast_fn(arg))
+ else:
+ new_args.append(arg)
+
+ return orig_fn(*new_args)
+ utils.set_func_save(handle, mod, fn, wrapper)
+
+def disable_casts(mod, fn, handle):
+ if not utils.has_func(mod, fn):
+ return
+
+ orig_fn = utils.get_func(mod, fn)
+ @functools.wraps(orig_fn)
+ def wrapper(*args, **kwargs):
+ with handle._disable_casts():
+ return orig_fn(*args, **kwargs)
+ utils.set_func_save(handle, mod, fn, wrapper)
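+
+
+# Registration sketch (illustrative, hedged): during amp initialization the helpers above are
+# applied to function names taken from apex.amp.lists. The calls below only show the pattern;
+# the authoritative names and lists live in those modules, not in this comment.
+#
+#   cached_cast(torch.nn.functional, 'linear', utils.maybe_half, handle,
+#               try_caching=True, verbose=False)       # cast args to fp16 for an fp16-friendly op
+#   promote(torch, 'addcmul', handle)                  # promote mixed fp16/fp32 args to fp32
+#   sequence_promote(torch, 'cat', handle)             # promote a sequence of tensors
+#   err_if_any_half(torch.Tensor, 'addcdiv_', handle)  # refuse an in-place op on fp16 args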
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/__init__.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/__init__.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/__init__.py
new file mode 100644
index 0000000000..996dbf148f
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/__init__.py
@@ -0,0 +1 @@
+from .bottleneck import Bottleneck, SpatialBottleneck
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/bottleneck.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/bottleneck.py
new file mode 100644
index 0000000000..4f18692320
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/bottleneck.py
@@ -0,0 +1,512 @@
+import torch
+import torch.distributed as dist
+from torch import nn
+import fast_bottleneck
+
+def kaiming_uniform_(tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
+ weight_tensor_nchw = tensor
+ nn.init.kaiming_uniform_(weight_tensor_nchw, a=a, mode=mode, nonlinearity=nonlinearity)
+
+class FrozenBatchNorm2d(torch.nn.Module):
+ """
+ BatchNorm2d where the batch statistics and the affine parameters are fixed
+ """
+ def __init__(self, n):
+ super(FrozenBatchNorm2d, self).__init__()
+ self.register_buffer("weight", torch.ones(n))
+ self.register_buffer("bias", torch.zeros(n))
+ self.register_buffer("running_mean", torch.zeros(n))
+ self.register_buffer("running_var", torch.ones(n))
+
+ def get_scale_bias(self, nhwc=False):
+ scale = self.weight * self.running_var.rsqrt()
+ bias = self.bias - self.running_mean * scale
+ if nhwc:
+ scale = scale.reshape(1, 1, 1, -1)
+ bias = bias.reshape(1, 1, 1, -1)
+ else:
+ scale = scale.reshape(1, -1, 1, 1)
+ bias = bias.reshape(1, -1, 1, 1)
+ return scale, bias
+
+ def forward(self, x):
+ scale, bias = self.get_scale_bias()
+ return x * scale + bias
+
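+# Sanity-check sketch (not used by the module): FrozenBatchNorm2d folds its frozen statistics
+# into a single affine map, y = x * scale + bias, with scale = weight * rsqrt(running_var) and
+# bias = bias - running_mean * scale. There is no eps term here, so the fold matches the
+# unfused expression exactly.
+def _frozen_bn_fold_demo(n=4):
+    fbn = FrozenBatchNorm2d(n)
+    fbn.weight.uniform_(0.5, 1.5)
+    fbn.bias.normal_()
+    fbn.running_mean.normal_()
+    fbn.running_var.uniform_(0.5, 2.0)
+    x = torch.randn(2, n, 8, 8)
+    unfused = (x - fbn.running_mean.reshape(1, -1, 1, 1)) \
+              * fbn.running_var.reshape(1, -1, 1, 1).rsqrt() \
+              * fbn.weight.reshape(1, -1, 1, 1) + fbn.bias.reshape(1, -1, 1, 1)
+    return torch.allclose(fbn(x), unfused, atol=1e-5)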
+
+@torch.jit.script
+def drelu_dscale1(grad_o, output, scale1):
+ relu_mask = (output>0).half()
+ dx_relu = relu_mask * grad_o
+ g1 = dx_relu * scale1
+ return g1, dx_relu
+
+@torch.jit.script
+def drelu_dscale2(grad_o, output, scale1, scale2):
+ relu_mask = (output>0).half()
+ dx_relu = relu_mask * grad_o
+ g1 = dx_relu * scale1
+ g2 = dx_relu * scale2
+ return g1, g2
+
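+# Quick check sketch (illustrative): the relu_mask multiply above reproduces ReLU's backward,
+# since d relu(x)/dx is 1 where the output is positive and 0 elsewhere. The jit kernels use
+# .half() because they run on fp16 activations; this CPU demo uses fp32 for simplicity.
+def _drelu_mask_demo():
+    x = torch.randn(16, requires_grad=True)
+    y = torch.relu(x)
+    grad_o = torch.randn_like(y)
+    y.backward(grad_o)
+    manual = (y > 0).to(grad_o.dtype) * grad_o  # same idea as relu_mask * grad_o above
+    return torch.allclose(x.grad, manual)
+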
+class BottleneckFunction(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, nhwc, stride_1x1, scale, bias, x, *conv):
+ # TODO: clean up order of tensors
+ args = [x, *conv[0:3], *scale[0:3], *bias[0:3]]
+ ctx.downsample = len(conv) > 3
+ if ctx.downsample:
+ args.append(conv[3])
+ args.append(scale[3])
+ args.append(bias[3])
+
+        # Weight buffers are always NHWC, while the activation shape can be explicit NHWC or
+        # channels_last; we pass in a flag and let the C++ side handle it. Alternatively, we
+        # could put all sizes into a fixed format and pass that in.
+ outputs = fast_bottleneck.forward(nhwc, stride_1x1, args)
+ ctx.save_for_backward(*(args+outputs))
+ # save relu outputs for drelu
+ ctx.nhwc = nhwc
+ ctx.stride_1x1 = stride_1x1
+ return outputs[2]
+
+ # backward relu is not exposed, MUL with mask used now
+ # only support dgrad
+ @staticmethod
+ def backward(ctx, grad_o):
+ outputs = ctx.saved_tensors[-3:]
+
+ if ctx.downsample:
+ grad_conv3, grad_conv4 = drelu_dscale2(grad_o, outputs[2], ctx.saved_tensors[6], ctx.saved_tensors[11])
+ else:
+ grad_conv3, grad_conv4 = drelu_dscale1(grad_o, outputs[2], ctx.saved_tensors[6])
+
+ # create input vector for backward
+ t_list = [*ctx.saved_tensors[0:10]]
+ t_list.append(grad_conv3)
+ t_list.append(grad_conv4)
+
+ # outputs used for wgrad and generating drelu mask
+ t_list.append(outputs[0])
+ t_list.append(outputs[1])
+
+ # in case there is downsample
+ if ctx.downsample:
+ t_list.append(ctx.saved_tensors[10])
+
+ grads = fast_bottleneck.backward(ctx.nhwc, ctx.stride_1x1, t_list)
+
+ return (None, None, None, None, *grads)
+
+bottleneck_function = BottleneckFunction.apply
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+ """3x3 convolution with padding"""
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+ padding=dilation, groups=groups, bias=False, dilation=dilation)
+
+def conv1x1(in_planes, out_planes, stride=1):
+ """1x1 convolution"""
+ return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+class Bottleneck(torch.nn.Module):
+    # torchvision's Bottleneck places the stride for downsampling at the 3x3 convolution (self.conv2),
+    # while the original implementation places it at the first 1x1 convolution (self.conv1); see
+    # "Deep Residual Learning for Image Recognition", https://arxiv.org/abs/1512.03385.
+    # The 3x3 variant is also known as ResNet V1.5 and improves accuracy according to
+    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+    # Here the stride is placed at the 1x1 convolution.
+
+ def __init__(self, in_channels, bottleneck_channels, out_channels, stride=1, groups=1,
+ dilation=1, norm_func=None, use_cudnn=False, explicit_nhwc=False):
+ super(Bottleneck, self).__init__()
+ if groups != 1:
+ raise RuntimeError('Only support groups == 1')
+ if dilation != 1:
+ raise RuntimeError('Only support dilation == 1')
+ if norm_func == None:
+ norm_func = FrozenBatchNorm2d
+ else:
+ raise RuntimeError('Only support frozen BN now.')
+
+ if stride != 1 or in_channels != out_channels:
+ self.downsample = nn.Sequential(
+ conv1x1(in_channels, out_channels, stride),
+ norm_func(out_channels),
+ )
+ else:
+ self.downsample = None
+
+ # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+ self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
+ self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
+ self.conv3 = conv1x1(bottleneck_channels, out_channels)
+ self.relu = nn.ReLU(inplace=True)
+ self.stride = stride
+
+ self.bn1 = norm_func(bottleneck_channels)
+ self.bn2 = norm_func(bottleneck_channels)
+ self.bn3 = norm_func(out_channels)
+
+ self.use_cudnn = use_cudnn
+
+ # setup conv weights
+ self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
+ if self.downsample is not None:
+ self.w_conv.append(self.downsample[0].weight)
+
+ # init weight in nchw format before possible transpose
+ for w in self.w_conv:
+ kaiming_uniform_(w, a=1)
+
+ # TODO: prevent unsupported case usage
+ # support cases
+ # native cudnn
+ # normal yes no
+ # channel_last yes yes
+ # explicit_nhwc no yes
+ self.explicit_nhwc = explicit_nhwc
+ if self.explicit_nhwc:
+ for p in self.parameters():
+ with torch.no_grad():
+ p.data = p.data.permute(0,2,3,1).contiguous()
+ return
+
+ def forward(self, x):
+ if self.use_cudnn:
+ # calculate scale/bias from registered buffers
+ # TODO: make this better
+ s1, b1 = self.bn1.get_scale_bias(self.explicit_nhwc)
+ s2, b2 = self.bn2.get_scale_bias(self.explicit_nhwc)
+ s3, b3 = self.bn3.get_scale_bias(self.explicit_nhwc)
+ w_scale = [s1, s2, s3]
+ w_bias = [b1, b2, b3]
+ if self.downsample is not None:
+ s4, b4 = self.downsample[1].get_scale_bias(self.explicit_nhwc)
+ w_scale.append(s4)
+ w_bias.append(b4)
+
+ out = bottleneck_function(self.explicit_nhwc, self.stride, w_scale, w_bias, x, *self.w_conv)
+ return out
+
+ if self.explicit_nhwc:
+ raise RuntimeError('explicit nhwc with native ops is not supported.')
+
+ # fallback to native ops
+ identity = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
+
+
+class SpatialBottleneckFunction(torch.autograd.Function):
+ @staticmethod
+ def forward(ctx, spatial_group_size, local_rank, comm, stream1, nhwc, stride_1x1, scale, bias, x, *conv):
+ # TODO: clean up order of tensors
+ args = [x, *conv[0:3], *scale[0:3], *bias[0:3]]
+ ctx.downsample = len(conv) > 3
+ if ctx.downsample:
+ args.append(conv[3])
+ args.append(scale[3])
+ args.append(bias[3])
+
+        # Weight buffers are always NHWC, while the activation shape can be explicit NHWC or
+        # channels_last; we pass in a flag and let the C++ side handle it. Alternatively, we
+        # could put all sizes into a fixed format and pass that in.
+ outputs = fast_bottleneck.forward_init(nhwc, stride_1x1, args)
+ fast_bottleneck.forward_out1(nhwc, stride_1x1, args, outputs)
+
+ fast_bottleneck.forward_out2(nhwc, stride_1x1, args, outputs)
+
+ # do halo exchange for outputs[0] (out1)
+ # compute halo cells for outputs[1]
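+        # Layout notes (derived from the code below): the activation is split along H, so each rank
+        # holds Hs = H / spatial_group_size rows. Every rank all-gathers its first and last rows,
+        # then builds a 3-row "fat halo" (one neighbor boundary row plus its own two edge rows) so
+        # conv2's 3x3 window can recompute the out2 rows that sit on the split boundaries.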
+ if spatial_group_size > 1:
+ out1 = outputs[0]
+ N,Hs,W,C = list(out1.shape)
+ stream1.wait_stream(torch.cuda.current_stream())
+ with torch.cuda.stream(stream1):
+ # copy halos to send buffer
+ send_halos = torch.empty((N,2,W,C),dtype=out1.dtype,device=out1.device)
+ send_halos[:,:1,:,:].copy_(out1[:,:1,:,:])
+ send_halos[:,1:,:,:].copy_(out1[:,Hs-1:,:,:])
+ all_halos = torch.empty((N,2*spatial_group_size,W,C),dtype=out1.dtype,device=out1.device)
+ all_halos = [all_halos[:,i*2:(i+1)*2,:,:] for i in range(spatial_group_size)]
+ dist.all_gather(all_halos,send_halos,group=comm)
+ fat_halo = torch.empty((N,3,W,C),dtype=out1.dtype,device=out1.device)
+ top_out1_halo = all_halos[(spatial_group_size+local_rank-1)%spatial_group_size][:,1:,:,:]
+ if local_rank > 0:
+ fat_halo[:,:1,:,:].copy_(top_out1_halo)
+ fat_halo[:,1:3,:,:].copy_(out1[:,:2,:,:])
+ top_out2 = fast_bottleneck.forward_out2_halo(nhwc, fat_halo, args)
+ btm_out1_halo = all_halos[(local_rank+1)%spatial_group_size][:,:1,:,:]
+ if local_rank < spatial_group_size-1:
+ fat_halo[:,0:2,:,:].copy_(out1[:,Hs-2:,:,:])
+ fat_halo[:,2:,:,:].copy_(btm_out1_halo)
+ btm_out2 = fast_bottleneck.forward_out2_halo(nhwc, fat_halo, args)
+ torch.cuda.current_stream().wait_stream(stream1)
+ out2 = outputs[1]
+ if local_rank > 0:
+ out2[:,:1,:,:].copy_(top_out2)
+ if local_rank < spatial_group_size-1:
+ out2[:,Hs-1:,:,:].copy_(btm_out2)
+
+ fast_bottleneck.forward_rest(nhwc, stride_1x1, args, outputs)
+ # save halos for backward pass
+ if spatial_group_size > 1:
+ ctx.save_for_backward(*(args+outputs+[top_out1_halo,btm_out1_halo]))
+ else:
+ ctx.save_for_backward(*(args+outputs))
+ # save relu outputs for drelu
+ ctx.nhwc = nhwc
+ ctx.stride_1x1 = stride_1x1
+ ctx.spatial_group_size = spatial_group_size
+ ctx.local_rank = local_rank
+ ctx.comm = comm
+ ctx.stream1 = stream1
+ return outputs[2]
+
+ # backward relu is not exposed, MUL with mask used now
+ # only support dgrad
+ @staticmethod
+ def backward(ctx, grad_o):
+ if ctx.spatial_group_size > 1:
+ top_out1_halo = ctx.saved_tensors[-2]
+ btm_out1_halo = ctx.saved_tensors[-1]
+ outputs = ctx.saved_tensors[-5:-2]
+ else:
+ outputs = ctx.saved_tensors[-3:]
+
+ if ctx.downsample:
+ grad_conv3, grad_conv4 = drelu_dscale2(grad_o, outputs[2], ctx.saved_tensors[6], ctx.saved_tensors[11])
+ else:
+ grad_conv3, grad_conv4 = drelu_dscale1(grad_o, outputs[2], ctx.saved_tensors[6])
+
+ # create input vector for backward
+ t_list = [*ctx.saved_tensors[0:10]]
+ t_list.append(grad_conv3)
+ t_list.append(grad_conv4)
+
+ # outputs used for wgrad and generating drelu mask
+ t_list.append(outputs[0])
+ t_list.append(outputs[1])
+
+ # in case there is downsample
+ if ctx.downsample:
+ t_list.append(ctx.saved_tensors[10])
+
+ grads = fast_bottleneck.backward_init(ctx.nhwc, ctx.stride_1x1, t_list)
+ grad_out2 = fast_bottleneck.backward_grad_out2(ctx.nhwc, ctx.stride_1x1, t_list, grads)
+
+ # compute wgrad2 for internal cells
+ wgrad2 = fast_bottleneck.backward_wgrad2(ctx.nhwc, ctx.stride_1x1, t_list, grads, grad_out2)
+
+ # apply wgrad2 halos
+ if ctx.spatial_group_size > 1:
+ if ctx.local_rank > 0:
+ top_grad2_halo = grad_out2[:,:1,:,:]
+ top_wgrad2_halo = fast_bottleneck.backward_wgrad2_halo(ctx.nhwc, ctx.stride_1x1, t_list, grads, top_out1_halo, top_grad2_halo)
+ wgrad2[:,:1,:,:].add_(top_wgrad2_halo)
+ if ctx.local_rank < ctx.spatial_group_size-1:
+ btm_grad2_halo = grad_out2[:,-1:,:,:]
+ btm_wgrad2_halo = fast_bottleneck.backward_wgrad2_halo(ctx.nhwc, ctx.stride_1x1, t_list, grads, btm_out1_halo, btm_grad2_halo)
+ wgrad2[:,-1:,:,:].add_(btm_wgrad2_halo)
+
+ # do halo exchange of grad_out2 here
+ # compute halo cells for grad_out1
+ if ctx.spatial_group_size > 1:
+ N,Hs,W,C = list(grad_out2.shape)
+ ctx.stream1.wait_stream(torch.cuda.current_stream())
+ with torch.cuda.stream(ctx.stream1):
+ # copy halos to send buffer
+ send_halos = torch.empty((N,2,W,C),dtype=grad_out2.dtype,device=grad_out2.device)
+ send_halos[:,:1,:,:].copy_(grad_out2[:,:1,:,:])
+ send_halos[:,1:,:,:].copy_(grad_out2[:,Hs-1:,:,:])
+ all_halos = torch.empty((N,2*ctx.spatial_group_size,W,C),dtype=grad_out2.dtype,device=grad_out2.device)
+ all_halos = [all_halos[:,i*2:(i+1)*2,:,:] for i in range(ctx.spatial_group_size)]
+ dist.all_gather(all_halos,send_halos,group=ctx.comm)
+ relu1 = t_list[12]
+ fat_halo = torch.empty((N,3,W,C),dtype=grad_out2.dtype,device=grad_out2.device)
+ relu_halo = torch.empty((N,3,W,C),dtype=grad_out2.dtype,device=grad_out2.device)
+ if ctx.local_rank > 0:
+ top_halo = all_halos[ctx.local_rank-1][:,1:,:,:]
+ fat_halo[:,:1,:,:].copy_(top_halo)
+ fat_halo[:,1:,:,:].copy_(grad_out2[:,:2,:,:])
+ relu_halo[:,:1,:,:].zero_()
+ relu_halo[:,1:,:,:].copy_(relu1[:,:2,:,:])
+ top_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo(ctx.nhwc, ctx.stride_1x1, t_list, grads, fat_halo, relu_halo)
+ top_grad_out1_halo = top_grad_out1_halo[:,1:2,:,:]
+ if ctx.local_rank < ctx.spatial_group_size-1:
+ btm_halo = all_halos[ctx.local_rank+1][:,:1,:,:]
+ fat_halo[:,:2,:,:].copy_(grad_out2[:,Hs-2:,:,:])
+ fat_halo[:,2:,:,:].copy_(btm_halo)
+ relu_halo[:,:2,:,:].copy_(relu1[:,Hs-2:,:,:])
+ relu_halo[:,2:,:,:].zero_()
+ btm_grad_out1_halo = fast_bottleneck.backward_grad_out1_halo(ctx.nhwc, ctx.stride_1x1, t_list, grads, fat_halo, relu_halo)
+ btm_grad_out1_halo = btm_grad_out1_halo[:,1:2,:,:]
+
+ # compute grad_out1 for internal cells
+ grad_out1 = fast_bottleneck.backward_grad_out1(ctx.nhwc, ctx.stride_1x1, t_list, grads, grad_out2)
+
+ # apply halo cells to grad_out1
+ if ctx.spatial_group_size > 1:
+ w = t_list[2]
+ z = t_list[4]
+ relu1 = t_list[12]
+ #print("w.shape = %s, z.shape = %s, relu1.shape = %s" % (str(list(w.shape)), str(list(z.shape)), str(list(relu1.shape))))
+ torch.cuda.current_stream().wait_stream(ctx.stream1)
+ if ctx.local_rank > 0:
+ grad_out1[:,:1,:,:].copy_(top_grad_out1_halo)
+ #print("ctx.local_rank = %d, apply grad_out1 top halo (grad_out1.shape = %s)" % (ctx.local_rank, str(list(grad_out1.shape))))
+ if ctx.local_rank < ctx.spatial_group_size-1:
+ grad_out1[:,Hs-1:,:,:].copy_(btm_grad_out1_halo)
+ #print("ctx.local_rank = %d, apply grad_out1 btm halo (grad_out1.shape = %s)" % (ctx.local_rank, str(list(grad_out1.shape))))
+
+ fast_bottleneck.backward_rest(ctx.nhwc, ctx.stride_1x1, t_list, grads, grad_out2, grad_out1, wgrad2)
+
+ return (None, None, None, None, None, None, None, None, *grads)
+
+spatial_bottleneck_function = SpatialBottleneckFunction.apply
+
+class SpatialBottleneck(torch.nn.Module):
+    # torchvision's Bottleneck places the stride for downsampling at the 3x3 convolution (self.conv2),
+    # while the original implementation places it at the first 1x1 convolution (self.conv1); see
+    # "Deep Residual Learning for Image Recognition", https://arxiv.org/abs/1512.03385.
+    # The 3x3 variant is also known as ResNet V1.5 and improves accuracy according to
+    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+    # Here the stride is placed at the 1x1 convolution.
+
+ def __init__(self, in_channels, bottleneck_channels, out_channels, stride=1, groups=1,
+ dilation=1, norm_func=None, use_cudnn=False, explicit_nhwc=False,
+ spatial_group_size=1, communicator=None):
+ super(SpatialBottleneck, self).__init__()
+ if groups != 1:
+ raise RuntimeError('Only support groups == 1')
+ if dilation != 1:
+ raise RuntimeError('Only support dilation == 1')
+ if norm_func == None:
+ norm_func = FrozenBatchNorm2d
+ else:
+ raise RuntimeError('Only support frozen BN now.')
+
+ if stride != 1 or in_channels != out_channels:
+ self.downsample = nn.Sequential(
+ conv1x1(in_channels, out_channels, stride),
+ norm_func(out_channels),
+ )
+ else:
+ self.downsample = None
+
+ # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+ self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
+ self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
+ self.conv3 = conv1x1(bottleneck_channels, out_channels)
+ self.relu = nn.ReLU(inplace=True)
+ self.stride = stride
+
+ self.bn1 = norm_func(bottleneck_channels)
+ self.bn2 = norm_func(bottleneck_channels)
+ self.bn3 = norm_func(out_channels)
+
+ self.use_cudnn = use_cudnn
+
+ # setup conv weights
+ self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
+ if self.downsample is not None:
+ self.w_conv.append(self.downsample[0].weight)
+
+ # init weight in nchw format before possible transpose
+ for w in self.w_conv:
+ kaiming_uniform_(w, a=1)
+
+ # TODO: prevent unsupported case usage
+ # support cases
+ # native cudnn
+ # normal yes no
+ # channel_last yes yes
+ # explicit_nhwc no yes
+ self.explicit_nhwc = explicit_nhwc
+ if self.explicit_nhwc:
+ for p in self.parameters():
+ with torch.no_grad():
+ p.data = p.data.permute(0,2,3,1).contiguous()
+
+ # spatial communicator
+ self.spatial_group_size = spatial_group_size
+ if spatial_group_size > 1:
+ world_size = dist.get_world_size()
+ num_groups = world_size // spatial_group_size
+ assert(num_groups*spatial_group_size == world_size), "torch.distributed.get_world_size() must be multiple of group_size"
+ rank = dist.get_rank()
+ self.local_rank = rank % spatial_group_size
+ if communicator is None:
+ for group in range(num_groups):
+ ranks = list(range(group*spatial_group_size,(group+1)*spatial_group_size))
+ comm = torch.distributed.new_group(ranks=ranks)
+ if rank in ranks:
+ self.communicator = comm
+ else:
+ self.communicator = communicator
+ self.stream1 = torch.cuda.Stream()
+ self.spatial_args = self.spatial_group_size, self.local_rank, self.communicator, self.stream1
+ else:
+ self.spatial_args = 1, 0, None, None
+
+ return
+
+ def forward(self, x):
+ if self.use_cudnn:
+ # calculate scale/bias from registered buffers
+ # TODO: make this better
+ s1, b1 = self.bn1.get_scale_bias(self.explicit_nhwc)
+ s2, b2 = self.bn2.get_scale_bias(self.explicit_nhwc)
+ s3, b3 = self.bn3.get_scale_bias(self.explicit_nhwc)
+ w_scale = [s1, s2, s3]
+ w_bias = [b1, b2, b3]
+ if self.downsample is not None:
+ s4, b4 = self.downsample[1].get_scale_bias(self.explicit_nhwc)
+ w_scale.append(s4)
+ w_bias.append(b4)
+
+ out = spatial_bottleneck_function(*self.spatial_args, self.explicit_nhwc, self.stride, w_scale, w_bias, x, *self.w_conv)
+ return out
+
+ if self.explicit_nhwc:
+ raise RuntimeError('explicit nhwc with native ops is not supported.')
+
+ # fallback to native ops
+ identity = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+ out = self.relu(out)
+
+ return out
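+
+
+# Usage sketch (illustrative; assumes a CUDA build with the fast_bottleneck extension available):
+#
+#   block = Bottleneck(in_channels=64, bottleneck_channels=64, out_channels=256,
+#                      stride=1, use_cudnn=True, explicit_nhwc=True).cuda().half()
+#   x = torch.randn(8, 56, 56, 64, dtype=torch.half, device='cuda')  # explicit NHWC input
+#   y = block(x)  # (8, 56, 56, 256), still NHWC
+#
+# With use_cudnn=False (and explicit_nhwc=False) the module runs the native conv/BN/ReLU fallback
+# path defined in forward() above. SpatialBottleneck additionally takes spatial_group_size and an
+# optional communicator to split the H dimension across ranks.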
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/bottleneck_module_test.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/bottleneck_module_test.py
new file mode 100644
index 0000000000..38aa228426
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/bottleneck_module_test.py
@@ -0,0 +1,272 @@
+import os
+import torch
+from maskrcnn_benchmark.modeling.backbone.resnet import Bottleneck
+from maskrcnn_benchmark.layers.nhwc import nhwc_to_nchw_transform, nchw_to_nhwc_transform
+from maskrcnn_benchmark.layers.nhwc.batch_norm import FrozenBatchNorm2d_NHWC
+from apex.contrib.bottleneck import Bottleneck as FastBottleneck
+from apex.contrib.bottleneck import SpatialBottleneck
+
+
+def single_module_test(ref, rank, world_size, numtype, device, shape, fast, spatial_group_size, in_channels, bottleneck_channels, out_channels, num_groups, stride_in_1x1, stride, dilation, norm_func, nhwc):
+ # inputs + modules
+ with torch.no_grad():
+ input_shape = [1, in_channels] + list(shape)
+ x = torch.randn(input_shape, dtype=numtype, device=device)
+ if nhwc:
+ x = nchw_to_nhwc_transform(x).contiguous()
+ x.requires_grad = True
+ print(x.shape, x.stride())
+
+ #if spatial_group_size > 1:
+ # fast = False # hack so fast bottleneck can be run against distributed bottleneck
+ #if spatial_group_size == 1:
+ # fast = False
+
+ if fast:
+ if spatial_group_size == 1:
+ bottleneck = FastBottleneck(
+ in_channels=in_channels,
+ bottleneck_channels=bottleneck_channels,
+ out_channels=out_channels,
+ stride=stride,
+ dilation=dilation,
+ explicit_nhwc=nhwc,
+ use_cudnn=True)
+ else:
+ bottleneck = SpatialBottleneck(
+ in_channels=in_channels,
+ bottleneck_channels=bottleneck_channels,
+ out_channels=out_channels,
+ stride=stride,
+ dilation=dilation,
+ explicit_nhwc=nhwc,
+ use_cudnn=True,
+ spatial_group_size=spatial_group_size)
+ else:
+ bottleneck = Bottleneck(
+ in_channels,
+ bottleneck_channels,
+ out_channels,
+ num_groups,
+ stride_in_1x1,
+ stride,
+ dilation,
+ norm_func,
+ nhwc,
+ spatial_group_size)
+ bottleneck = bottleneck.to(dtype=numtype,device=device)
+ weights = dict(bottleneck.named_parameters())
+
+ if ref is not None:
+ ref_x, _, ref_weights = ref
+ Hs,H = x.shape[1], ref_x.shape[1]
+            assert(Hs*spatial_group_size == H), "global H must equal local Hs * spatial_group_size"
+ ref_x = ref_x[:,rank*Hs:(rank+1)*Hs,:,:]
+ x.copy_(ref_x)
+ assert(len(weights) == len(ref_weights)), "Reference weights and weights don't match"
+ for k in weights.keys():
+ weights[k].copy_(ref_weights[k])
+
+ # forward
+ out = bottleneck(x)
+
+ # gradient output
+ with torch.no_grad():
+ grad_out = torch.randn_like(out)
+ if ref is not None:
+ _, ref_grad_out, _ = ref
+ Hs,H = grad_out.shape[1], ref_grad_out.shape[1]
+            assert(Hs*spatial_group_size == H), "global H must equal local Hs * spatial_group_size"
+ ref_grad_out = ref_grad_out[:,rank*Hs:(rank+1)*Hs,:,:]
+ grad_out.copy_(ref_grad_out)
+
+ # backward
+ out.backward(grad_out)
+
+ with torch.no_grad():
+ dgrad = x.grad.detach()
+
+ wgrad = {}
+ for n,p in bottleneck.named_parameters():
+ wgrad[n] = p.grad.detach()
+
+ if world_size > 1:
+ if spatial_group_size == 1:
+ # broadcast x, grad_out and weights from rank 0
+ with torch.no_grad():
+ torch.distributed.broadcast(x,0)
+ torch.distributed.broadcast(grad_out,0)
+ for k in weights.keys():
+ torch.distributed.broadcast(weights[k],0)
+ else:
+ # gather dgrad (x.grad), sum wgrad (weights) and out
+ N,Hs,W,C = dgrad.shape
+ H = Hs * spatial_group_size
+ dgrad_gathered = torch.empty((N,H,W,C),dtype=dgrad.dtype,device=dgrad.device)
+ dgrad_tensors = [dgrad_gathered[:,i*Hs:(i+1)*Hs,:,:] for i in range(spatial_group_size)]
+ torch.distributed.all_gather(dgrad_tensors, dgrad)
+ dgrad = dgrad_gathered
+ N,Hs,W,C = list(out.shape)
+ H = Hs * spatial_group_size
+ out_gathered = torch.empty((N,H,W,C),dtype=dgrad.dtype,device=dgrad.device)
+ out_tensors= [out_gathered[:,i*Hs:(i+1)*Hs,:,:] for i in range(spatial_group_size)]
+ torch.distributed.all_gather(out_tensors, out)
+ out = out_gathered
+ for k in wgrad.keys():
+ w = wgrad[k].to(dtype=torch.float64)
+ torch.distributed.all_reduce(w)
+ wgrad[k].copy_(w.to(dtype=wgrad[k].dtype))
+ #torch.distributed.all_reduce(wgrad[k])
+
+ return x, out, grad_out, weights, dgrad, wgrad
+
+
+def module_tests(rank, world_size, numtype, device, fast, spatial_group_sizes, init_args):
+ r = []
+ for ia in init_args:
+ shape = ia[0:4]
+ args = ia[4:]
+ rr = []
+ ref = None
+ for spatial_group_size in spatial_group_sizes:
+ N,H,W,C = shape
+ H = H//spatial_group_size
+ x, out, grad_out, weights, dgrad, wgrad = single_module_test(ref, rank, world_size, numtype, device, [H,W], fast, spatial_group_size, *args)
+ if ref is None:
+ assert(spatial_group_size == 1), "Wrong reference weights"
+ ref = x, grad_out, weights
+ if rank == 0:
+ rr.append( (out, dgrad, wgrad) )
+ if world_size > 1: torch.distributed.barrier()
+ r.append(rr)
+ return r
+
+
+def main():
+ total_num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
+ distributed = total_num_gpus > 1
+ ngpus = torch.cuda.device_count()
+
+ if distributed:
+ torch.distributed.init_process_group("nccl")
+ rank, world_size = torch.distributed.get_rank(), torch.distributed.get_world_size()
+ is_master = True if rank == 0 else False
+ local_rank = rank % ngpus
+ torch.cuda.set_device(local_rank)
+ spatial_group_size = total_num_gpus
+ else:
+ rank, local_rank, is_master, world_size, spatial_group_size = 0, 0, True, 1, 1
+
+ torch.use_deterministic_algorithms(True)
+ torch.backends.cudnn.benchmark = False
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cuda.matmul.allow_tf32 = False
+ torch.backends.cudnn.allow_tf32 = False
+
+ norm_func = FrozenBatchNorm2d_NHWC
+
+ init_args = [
+ (1, 200, 336, 64, 64, 64, 256, 1, True, 1, 1, norm_func, True),
+ (1, 200, 336, 256, 256, 64, 256, 1, True, 1, 1, norm_func, True),
+ (1, 200, 336, 256, 256, 128, 512, 1, True, 2, 1, norm_func, True),
+ (1, 100, 168, 512, 512, 128, 512, 1, True, 1, 1, norm_func, True),
+ (1, 100, 168, 512, 512, 256, 1024, 1, True, 2, 1, norm_func, True),
+ (1, 50, 84, 1024, 1024, 256, 1024, 1, True, 1, 1, norm_func, True),
+ (1, 50, 84, 1024, 1024, 512, 2048, 1, True, 2, 1, norm_func, True),
+ (1, 25, 42, 2048, 2048, 512, 2048, 1, True, 1, 1, norm_func, True),
+ (1, 336, 200, 64, 64, 64, 256, 1, True, 1, 1, norm_func, True),
+ (1, 336, 200, 256, 256, 64, 256, 1, True, 1, 1, norm_func, True),
+ (1, 336, 200, 256, 256, 128, 512, 1, True, 2, 1, norm_func, True),
+ (1, 168, 100, 512, 512, 128, 512, 1, True, 1, 1, norm_func, True),
+ (1, 168, 100, 512, 512, 256, 1024, 1, True, 2, 1, norm_func, True),
+ (1, 84, 50, 1024, 1024, 256, 1024, 1, True, 1, 1, norm_func, True),
+ (1, 84, 50, 1024, 1024, 512, 2048, 1, True, 2, 1, norm_func, True),
+ (1, 42, 25, 2048, 2048, 512, 2048, 1, True, 1, 1, norm_func, True),
+ ]
+ init_args = init_args[0:1]
+
+ # pad H to account for spatial distribution
+ padded_init_args = []
+ for ia in init_args:
+ N,H,W,C = ia[0:4]
+ m = spatial_group_size * H // (25 if H < W else 42)
+ H = ((H + m - 1) // m) * m
+ args = tuple( [N,H,W,C] + list(ia[4:]) )
+ padded_init_args.append(args)
+ init_args = padded_init_args
+ if rank == 0:
+ for ia in init_args:
+ print(ia)
+
+ spatial_group_sizes = [1]
+ if spatial_group_size > 1:
+ spatial_group_sizes.append(spatial_group_size)
+
+ numtype, device, fast = torch.float16, 'cuda', True
+ r = module_tests(rank, world_size, numtype, device, fast, spatial_group_sizes, init_args)
+ if world_size > 1: torch.distributed.barrier()
+ if rank == 0:
+ for rr in r:
+ print("***")
+ for out, dgrad, wgrad in rr:
+ gr = [("out",out.norm(p=2,dtype=torch.float64).item())]
+ gr = gr + [("dgrad",dgrad.norm(p=2,dtype=torch.float64).item())]
+ gr = gr + [(k+".wgrad",wgrad[k].norm(p=2,dtype=torch.float64).item()) for k in wgrad.keys()]
+ print(gr)
+ if len(rr) == 2:
+ out1, dgrad1, wgrad1 = rr[0]
+ out2, dgrad2, wgrad2 = rr[1]
+
+ rtol = 1e-1
+ out_atol = out1.abs().max().item() * rtol
+ dgrad_atol = dgrad1.abs().max().item() * rtol
+ wgrad_atol = {}
+ for k in wgrad1.keys():
+ wgrad_atol[k] = wgrad1[k].abs().max().item() * rtol
+
+ gr = [("out",torch.allclose(out1,out2,rtol,out_atol,equal_nan=True))]
+ gr = gr + [("dgrad",torch.allclose(dgrad1,dgrad2,rtol,dgrad_atol,equal_nan=True))]
+ gr = gr + [(k+".wgrad",torch.allclose(wgrad1[k],wgrad2[k],rtol,wgrad_atol[k],equal_nan=True)) for k in wgrad1.keys()]
+ print(gr)
+
+ gr = [("out",(out1-out2).norm(p=2,dtype=torch.float64).item())]
+ gr = gr + [("dgrad",(dgrad1-dgrad2).norm(p=2,dtype=torch.float64).item())]
+ gr = gr + [(k+".wgrad",(wgrad1[k]-wgrad2[k]).norm(p=2,dtype=torch.float64).item()) for k in wgrad1.keys()]
+ print(gr)
+
+ N,H,W,C = out1.shape
+ Hs = H // spatial_group_size
+ Ht = Hs-2
+ print("out1@%d:%d=%s" % (Ht,H,str(out1[0,Ht,:8,:5])))
+ print("out2@%d:%d=%s" % (Ht,H,str(out2[0,Ht,:8,:5])))
+ Ht = Hs-1
+ print("out1@%d:%d=%s" % (Ht,H,str(out1[0,Ht,:8,:5])))
+ print("out2@%d:%d=%s" % (Ht,H,str(out2[0,Ht,:8,:5])))
+ Ht = Hs
+ print("out1@%d:%d=%s" % (Ht,H,str(out1[0,Ht,:8,:5])))
+ print("out2@%d:%d=%s" % (Ht,H,str(out2[0,Ht,:8,:5])))
+ Ht = Hs+1
+ print("out1@%d:%d=%s" % (Ht,H,str(out1[0,Ht,:8,:5])))
+ print("out2@%d:%d=%s" % (Ht,H,str(out2[0,Ht,:8,:5])))
+
+ N,H,W,C = dgrad1.shape
+ Hs = H // spatial_group_size
+ Ht = Hs-2
+ print("dgrad1@%d:%d=%s" % (Ht,H,str(dgrad1[0,Ht,:8,:5])))
+ print("dgrad2@%d:%d=%s" % (Ht,H,str(dgrad2[0,Ht,:8,:5])))
+ Ht = Hs-1
+ print("dgrad1@%d:%d=%s" % (Ht,H,str(dgrad1[0,Ht,:8,:5])))
+ print("dgrad2@%d:%d=%s" % (Ht,H,str(dgrad2[0,Ht,:8,:5])))
+ Ht = Hs
+ print("dgrad1@%d:%d=%s" % (Ht,H,str(dgrad1[0,Ht,:8,:5])))
+ print("dgrad2@%d:%d=%s" % (Ht,H,str(dgrad2[0,Ht,:8,:5])))
+ Ht = Hs+1
+ print("dgrad1@%d:%d=%s" % (Ht,H,str(dgrad1[0,Ht,:8,:5])))
+ print("dgrad2@%d:%d=%s" % (Ht,H,str(dgrad2[0,Ht,:8,:5])))
+
+
+ if world_size > 1: torch.distributed.barrier()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/test.py b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/test.py
new file mode 100644
index 0000000000..2c3c621302
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/bottleneck/test.py
@@ -0,0 +1,71 @@
+import torch
+from bottleneck import Bottleneck
+torch.manual_seed(23337)
+
+# use True to print layerwise sum for all outputs in reference code path
+DEBUG = False  # True
+
+for stride, o_channel in [(1,32), (1,128), (2,32)]:
+ print("testing stride ==", stride, ", in_channel == 32 , out_channel ==", o_channel)
+ a_ = torch.randn(17,32,28,28)
+
+ a = a_.cuda().half().to(memory_format=torch.channels_last).requires_grad_()
+ model = Bottleneck(32,8,o_channel,stride=stride).cuda().half().to(memory_format=torch.channels_last)
+
+ # test model
+ b = model(a)
+ b.mean().backward()
+ d_grad = a.grad.float()
+ a.grad = None
+ torch.cuda.synchronize()
+
+ if DEBUG:
+ print("[DEBUG] ref dx :", d_grad.sum().item())
+        # print wgrad; no need to reset grads here since the later C++ print happens before accumulation
+ for i, w in enumerate(model.w_conv):
+ print("[DEBUG] ref wgrad{} :".format(i+1), w.grad.sum().item())
+
+ wgrads = []
+ for w in model.w_conv:
+ wgrads.append(w.grad.float())
+
+ model.use_cudnn = True
+ model.zero_grad()
+ c = model(a)
+ c.mean().backward()
+
+ torch.cuda.synchronize()
+ print("comparing native and channels_last:")
+ print("max error fprop:", (b-c).abs().max().item(), "max elem:", b.abs().max().item())
+ print("max error dgrad:", (d_grad-a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
+ for i, (w, wgrad) in enumerate(zip(model.w_conv, wgrads)):
+ print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
+
+ nhwc_a = a_.permute(0,2,3,1).contiguous().cuda().half().requires_grad_()
+ nhwc_model = Bottleneck(32,8,o_channel,stride=stride,explicit_nhwc=True, use_cudnn=True).cuda().half()
+ for p,q in zip(model.parameters(), nhwc_model.parameters()):
+ # model's storage is already in nhwc, we clone and assign to explicit nhwc model
+ q.data.copy_(p.data.permute(0,2,3,1).contiguous())
+ for p,q in zip(model.buffers(), nhwc_model.buffers()):
+ q.data.copy_(p.data)
+
+ d = nhwc_model(nhwc_a)
+ d.mean().backward()
+ torch.cuda.synchronize()
+
+ # reset reference to cudnn channels_last permute
+ #c_s = c.storage().tolist()
+ #d_s = d.storage().tolist()
+ #print(max([x-y for x,y in zip(c_s,d_s)]))
+ c = c.contiguous(memory_format=torch.contiguous_format).permute(0,2,3,1).contiguous()
+ d_grad = a.grad.float().permute(0,2,3,1).contiguous()
+ wgrads = []
+ for w in model.w_conv:
+ wgrads.append(w.grad.float().permute(0,2,3,1).contiguous())
+
+ torch.cuda.synchronize()
+ print("comparing nhwc and channels_last:")
+ print("max error fprop:", (d-c).abs().max().item(), "max elem:", c.abs().max().item())
+ print("max error dgrad:", (d_grad-nhwc_a.grad.float()).abs().max().item(), "max elem:", d_grad.abs().max().item())
+ for i, (w, wgrad) in enumerate(zip(nhwc_model.w_conv, wgrads)):
+ print("max error wgrad{}:".format(i+1), (wgrad - w.grad.float()).abs().max().item(), "max elem:", wgrad.abs().max().item())
diff --git a/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/bottleneck/bottleneck.cpp b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/bottleneck/bottleneck.cpp
new file mode 100644
index 0000000000..4f9475c5cd
--- /dev/null
+++ b/PyTorch/contrib/audio/jasper/Jasper_pytorch_wuqingdian01/apex/apex/contrib/csrc/bottleneck/bottleneck.cpp
@@ -0,0 +1,2486 @@
+#include <ATen/ATen.h>
+#include <ATen/cudnn/Handle.h>  // for getcudnnhandle
+#include <torch/extension.h>
+#include <torch/torch.h>
+#include <vector>
+#include <cudnn_frontend.h>
+
+#include <iostream>
+
+#ifdef DEBUG
+#define DEBUG_MSG(str) do { std::cout << str << std::endl; } while( false )
+#else
+#define DEBUG_MSG(str) do { } while ( false )
+#endif
+
+#ifdef DEBUG_CUDNN
+#define DEBUG_CUDNN_MSG(buf, str) do { buf << str << std::endl; } while( false )
+#else
+#define DEBUG_CUDNN_MSG(buf, str) do { } while ( false )
+#endif
+
+#define checkCudnnErr(...) \
+ do { \
+ int err = checkCudnnError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__); \
+ if (err) { \
+ return; \
+ } \
+ } while (0)
+
+
+int checkCudnnError(cudnnStatus_t code, const char* expr, const char* file, int line) {
+ if (code) {
+ printf("CUDNN error at %s:%d, code=%d (%s) in '%s'\n", file, line, (int)code, cudnnGetErrorString(code), expr);
+ return 1;
+ }
+ return 0;
+}
+
+void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort = true);
+#define checkCUDAError(val) { checkError((val), #val, __FILE__, __LINE__); } // in-line regular function
+
+void checkError(cudaError_t code, char const * func, const char *file, const int line, bool abort)
+{
+ if (code != cudaSuccess)
+ {
+ const char * errorMessage = cudaGetErrorString(code);
+ fprintf(stderr, "CUDA error returned from \"%s\" at %s:%d, Error code: %d (%s)\n", func, file, line, code, errorMessage);
+ if (abort){
+ cudaDeviceReset();
+ exit(code);
+ }
+ }
+}
+
+void generateStrides(const int64_t* dimA, int64_t* strideA, int nbDims, cudnnTensorFormat_t filterFormat) {
+ // For INT8x4 and INT8x32 we still compute standard strides here to input
+ // into the cuDNN functions. We will manually scale by resizeFactor in the cpu ref.
+ if (filterFormat == CUDNN_TENSOR_NCHW) {
+ strideA[nbDims - 1] = 1;
+ for (int64_t d = nbDims - 2; d >= 0; d--) {
+ strideA[d] = strideA[d + 1] * dimA[d + 1];
+ }
+ } else {
+ // Here we assume that the format is CUDNN_TENSOR_NHWC
+ strideA[1] = 1;
+ strideA[nbDims - 1] = strideA[1] * dimA[1];
+ for (int64_t d = nbDims - 2; d >= 2; d--) {
+ strideA[d] = strideA[d + 1] * dimA[d + 1];
+ }
+ strideA[0] = strideA[2] * dimA[2];
+ }
+}
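+
+// Worked example (illustrative): for dimA = {N, C, H, W} = {1, 64, 56, 56} with CUDNN_TENSOR_NHWC,
+// the branch above yields strideA = {56*56*64, 1, 56*64, 64} = {200704, 1, 3584, 64}: channels are
+// innermost, then W, then H, then N.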
+
+
+int getFwdConvDilatedFilterDim(int filterDim, int dilation) {
+ return ((filterDim - 1) * dilation) + 1;
+}
+
+int getFwdConvPaddedImageDim(int tensorDim, int pad) {
+ return tensorDim + (2 * pad);
+}
+
+int getFwdConvOutputDim(
+ int tensorDim,
+ int pad,
+ int filterDim,
+ int stride,
+ int dilation)
+{
+ int p = (getFwdConvPaddedImageDim(tensorDim, pad) - getFwdConvDilatedFilterDim(filterDim, dilation)) / stride + 1;
+ return (p);
+}
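+
+// Worked example (illustrative): a 3x3 filter with pad = 1, stride = 2, dilation = 1 on a
+// 56-element dimension gives ((56 + 2*1) - ((3 - 1)*1 + 1)) / 2 + 1 = 55 / 2 + 1 = 28.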
+
+enum {
+ X_TENSOR,
+ Y_TENSOR,
+ W_TENSOR,
+ Z_TENSOR,
+ B_TENSOR,
+ AFTERADD_TENSOR,
+ AFTERBIAS_TENSOR,
+ AFTERCONV_TENSOR,
+ OPTIONAL,
+ AFTEROPT_TENSOR,
+};
+
+using common_conv_descriptors =
+    std::tuple<cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::Tensor, cudnn_frontend::ConvDesc>;
+
+
+common_conv_descriptors
+create_common_descriptors(int64_t* x_dim_padded,
+ int64_t* padA,
+ int64_t* convstrideA,
+ int64_t* dilationA,
+ int64_t* w_dim_padded,
+ int64_t* y_dim_padded,
+ cudnnDataType_t dataType,
+ cudnnConvolutionMode_t mode) {
+ const int convDim = 2;
+
+ int64_t strideA_padded[4];
+ int64_t outstrideA_padded[4];
+ int64_t filterstrideA_padded[4];
+
+ generateStrides(w_dim_padded, filterstrideA_padded, 4, CUDNN_TENSOR_NHWC);
+ generateStrides(x_dim_padded, strideA_padded, 4, CUDNN_TENSOR_NHWC);
+ generateStrides(y_dim_padded, outstrideA_padded, 4, CUDNN_TENSOR_NHWC);
+
+ return common_conv_descriptors(cudnn_frontend::TensorBuilder()
+ .setDim(4, x_dim_padded)
+ .setStrides(4, strideA_padded)
+ .setId('x')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, y_dim_padded)
+ .setStrides(4, outstrideA_padded)
+ .setId('y')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, w_dim_padded)
+ .setStrides(4, filterstrideA_padded)
+ .setId('w')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::ConvDescBuilder()
+ .setDataType(CUDNN_DATA_FLOAT)
+ .setMathMode(mode)
+ .setNDims(convDim)
+ .setStrides(convDim, convstrideA)
+ .setPrePadding(convDim, padA)
+ .setPostPadding(convDim, padA)
+ .setDilation(convDim, dilationA)
+ .build());
+}
+
+using common_convbias_descriptors = std::tuple<cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor,
+                                                cudnn_frontend::Tensor>;
+
+common_convbias_descriptors
+create_conv_bias_add_act_descriptors(int64_t* x_dim_padded,
+ int64_t* padA,
+ int64_t* convstrideA,
+ int64_t* dilationA,
+ int64_t* w_dim_padded,
+ int64_t* y_dim_padded,
+ cudnnDataType_t dataType) {
+ const int convDim = 2;
+
+ int64_t b_dim_padded[4];
+ b_dim_padded[0] = 1;
+ b_dim_padded[1] = y_dim_padded[1];
+ b_dim_padded[2] = 1;
+ b_dim_padded[3] = 1;
+
+ int64_t x_stride_padded[4];
+ int64_t y_stride_padded[4];
+ int64_t w_stride_padded[4];
+ int64_t b_stride_padded[4];
+
+ generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
+ generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
+ generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
+ generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
+
+ return common_convbias_descriptors(cudnn_frontend::TensorBuilder()
+ .setDim(4, x_dim_padded)
+ .setStrides(4, x_stride_padded)
+ .setId('x')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, y_dim_padded)
+ .setStrides(4, y_stride_padded)
+ .setId('y')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, w_dim_padded)
+ .setStrides(4, w_stride_padded)
+ .setId('w')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, b_dim_padded)
+ .setStrides(4, b_stride_padded)
+ .setId('z')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, b_dim_padded)
+ .setStrides(4, b_stride_padded)
+ .setId('b')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, y_dim_padded)
+ .setStrides(4, y_stride_padded)
+ .setVirtual()
+ .setId('A') // after add
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, y_dim_padded)
+ .setStrides(4, y_stride_padded)
+ .setVirtual()
+ .setId('B') // after bias
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, y_dim_padded)
+ .setStrides(4, y_stride_padded)
+ .setId('C') // after conv
+ .setAlignment(16)
+ .setVirtual()
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, y_dim_padded)
+ .setStrides(4, y_stride_padded)
+ .setId('i')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, y_dim_padded)
+ .setStrides(4, y_stride_padded)
+ .setId('D') // after optional add
+ .setAlignment(16)
+ .setVirtual()
+ .setDataType(dataType)
+ .build());
+}
+
+// tensor descriptors used for dgrad
+enum {
+ X_OR_DX_TENSOR,
+ DY_TENSOR,
+ W_OR_DW_TENSOR,
+ SCALE_TENSOR,
+ RELU_TENSOR,
+ AFTER_DCONV_TENSOR,
+ AFTER_DRELU_TENSOR,
+};
+
+using dconv_descriptors = std::tuple<cudnn_frontend::Tensor,
+                                     cudnn_frontend::Tensor,
+                                     cudnn_frontend::Tensor,
+                                     cudnn_frontend::Tensor,
+                                     cudnn_frontend::Tensor,
+                                     cudnn_frontend::Tensor,
+                                     cudnn_frontend::Tensor>;
+
+dconv_descriptors
+create_dconv_descriptors(int64_t* x_dim_padded,
+ int64_t* padA,
+ int64_t* convstrideA,
+ int64_t* dilationA,
+ int64_t* w_dim_padded,
+ int64_t* y_dim_padded,
+ cudnnDataType_t dataType) {
+ const int convDim = 2;
+
+ int64_t b_dim_padded[4];
+ b_dim_padded[0] = 1;
+ b_dim_padded[1] = x_dim_padded[1];
+ b_dim_padded[2] = 1;
+ b_dim_padded[3] = 1;
+
+ int64_t x_stride_padded[4];
+ int64_t y_stride_padded[4];
+ int64_t w_stride_padded[4];
+ int64_t b_stride_padded[4];
+
+ generateStrides(w_dim_padded, w_stride_padded, 4, CUDNN_TENSOR_NHWC);
+ generateStrides(x_dim_padded, x_stride_padded, 4, CUDNN_TENSOR_NHWC);
+ generateStrides(y_dim_padded, y_stride_padded, 4, CUDNN_TENSOR_NHWC);
+ generateStrides(b_dim_padded, b_stride_padded, 4, CUDNN_TENSOR_NHWC);
+
+ return dconv_descriptors(cudnn_frontend::TensorBuilder()
+ .setDim(4, x_dim_padded)
+ .setStrides(4, x_stride_padded)
+ .setId('x')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, y_dim_padded)
+ .setStrides(4, y_stride_padded)
+ .setId('y')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, w_dim_padded)
+ .setStrides(4, w_stride_padded)
+ .setId('w')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, b_dim_padded)
+ .setStrides(4, b_stride_padded)
+ .setId('s')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, x_dim_padded)
+ .setStrides(4, x_stride_padded)
+ .setId('r')
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, x_dim_padded)
+ .setStrides(4, x_stride_padded)
+ .setVirtual()
+ .setId('A') // after dconv
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build(),
+ cudnn_frontend::TensorBuilder()
+ .setDim(4, x_dim_padded)
+ .setStrides(4, x_stride_padded)
+ .setVirtual()
+ .setId('B') // after drelu
+ .setAlignment(16)
+ .setDataType(dataType)
+ .build());
+}
+
+// create a cache for plan
+std::unordered_map<std::string, cudnn_frontend::ExecutionPlan> plan_cache;
+
+// TODO: better name
+std::string getConvFusionString(int64_t* x_dim_padded,
+ int64_t* padA,
+ int64_t* convstrideA,
+ int64_t* dilationA,
+ int64_t* w_dim_padded,
+ cudnnDataType_t dataType,
+ std::string fusion_string) {
+
+ for(int i=0;i<4;i++) {
+ fusion_string += 'X';
+ fusion_string += std::to_string(x_dim_padded[i]);
+ }
+ for(int i=0;i<4;i++) {
+ fusion_string += 'W';
+ fusion_string += std::to_string(w_dim_padded[i]);
+ }
+ for(int i=0;i<2;i++) {
+ fusion_string += 'P';
+ fusion_string += std::to_string(padA[i]);
+ }
+ for(int i=0;i<2;i++) {
+ fusion_string += 'S';
+ fusion_string += std::to_string(convstrideA[i]);
+ }
+ for(int i=0;i<2;i++) {
+ fusion_string += 'D';
+ fusion_string += std::to_string(dilationA[i]);
+ }
+ fusion_string += 'T';
+ fusion_string += std::to_string(dataType);
+ return fusion_string;
+}
+
+cudnn_frontend::ExecutionPlan& getOrCreatePlan(cudnnHandle_t handle_,
+ std::stringstream& log_buf,
+ cudnn_frontend::OperationGraph& opGraph,
+ std::string cache_string,
+ bool use_heuristic = true){
+ auto it = plan_cache.find(cache_string);
+ if (it != plan_cache.end()) {
+ DEBUG_CUDNN_MSG(log_buf, "Found plan in cache");
+ return it->second;
+ } else {
+ if (use_heuristic){
+ // TODO: confirm which mode to use
+ auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
+ .setOperationGraph(opGraph)
+ .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
+ .build();
+ // try 3 times for now as WAR for no heuristic training
+ int max_tries = 3, count = 0;
+ auto& engine_configs = heuristics.getEngineConfig(max_tries);
+ while(true) {
+ try {
+ plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder()
+ .setHandle(handle_)
+ .setEngineConfig(engine_configs[count], opGraph.getTag())
+ .build()));
+ break;
+ } catch (cudnn_frontend::cudnnException e) {
+ if (++count == max_tries) throw e;
+ }
+ }
+ }else{
+ DEBUG_CUDNN_MSG(log_buf, "No plan in cache");
+ // How many engines support this operation graph ?
+ auto total_engines = opGraph.getEngineCount();
+ DEBUG_CUDNN_MSG(log_buf, opGraph.describe() << " has " << total_engines << " engines.");
+ // We have to randomly pick one engine from [0, total_engines)
+ // Selecting "0" by default
+ auto engine = cudnn_frontend::EngineBuilder().setGlobalEngineIdx(0).setOperationGraph(opGraph).build();
+ DEBUG_CUDNN_MSG(log_buf, engine.describe());
+ auto& knobs = engine.getSupportedKnobs();
+ for (auto it = std::begin(knobs); it != std::end(knobs); ++it) {
+ DEBUG_CUDNN_MSG(log_buf, it->describe());
+ }
+ if (knobs.begin() != knobs.end()) {
+ DEBUG_CUDNN_MSG(log_buf, "Updated knob choice");
+ knobs.begin()->setChoice(knobs.begin()->getMinValue() + 1);
+ DEBUG_CUDNN_MSG(log_buf, knobs.begin()->describe());
+ }
+
+      // Create and emplace the requisite engine config
+ auto engine_config = cudnn_frontend::EngineConfigBuilder().setEngine(engine).build();
+ DEBUG_CUDNN_MSG(log_buf, engine_config.describe());
+ plan_cache.emplace(cache_string, std::move(cudnn_frontend::ExecutionPlanBuilder().setHandle(handle_).setEngineConfig(engine_config).build()));
+ }
+
+ return plan_cache.find(cache_string)->second;
+ }
+}
+
+void
+run_conv_scale_bias_add_activation(int64_t* x_dim_padded,
+ int64_t* pad,
+ int64_t* convstride,
+ int64_t* dilation,
+ int64_t* w_dim_padded,
+ int64_t* y_dim_padded,
+ cudnnDataType_t dataType,
+ at::Half* devPtrX,
+ at::Half* devPtrW,
+ at::Half* devPtrY,
+ at::Half* devPtrZ,
+ at::Half* devPtrB,
+ at::Half* devPtrI) {
+ cudnnHandle_t handle_ = torch::native::getCudnnHandle();
+ std::stringstream log_buf;
+ try {
+ int convDim = 2;
+
+ // Creates the necessary tensor descriptors
+ common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
+ x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
+    DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
+    DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
+
+ // Define the add operation
+ auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_MUL)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
+
+ // Define the bias operation
+ auto biasDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_ADD)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, biasDesc.describe());
+
+ // optional add
+ auto addDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_ADD)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
+
+ // Define the activation operation
+ auto actDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_RELU_FWD)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
+
+ // Define the convolution problem
+ auto convDesc = cudnn_frontend::ConvDescBuilder()
+ .setDataType(CUDNN_DATA_FLOAT)
+ .setMathMode(CUDNN_CROSS_CORRELATION)
+ .setNDims(convDim)
+ .setStrides(convDim, convstride)
+ .setPrePadding(convDim, pad)
+ .setPostPadding(convDim, pad)
+ .setDilation(convDim, dilation)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
+
+ float alpha = 1.0f;
+ float beta = 0.0f;
+
+ // Create a convolution Node
+ auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
+                       .setxDesc(std::get<X_TENSOR>(tensors))
+                       .setwDesc(std::get<W_TENSOR>(tensors))
+                       .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
+ .setcDesc(convDesc)
+ .setAlpha(alpha)
+ .setBeta(beta)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
+
+ // Create a Add Node with scaling parameters.
+ auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setxDesc(conv_op.getOutputTensor())
+                       .setbDesc(std::get<Z_TENSOR>(tensors))
+                       .setyDesc(std::get<AFTERADD_TENSOR>(tensors))
+ .setpwDesc(scaleDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
+
+ // Create a Bias Node.
+ auto bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setxDesc(scale_op.getOutputTensor())
+                       .setbDesc(std::get<B_TENSOR>(tensors))
+                       .setyDesc(std::get<AFTERBIAS_TENSOR>(tensors))
+ .setpwDesc(biasDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, bias_op.describe());
+
+ // Create a optional add Node.
+ auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setxDesc(bias_op.getOutputTensor())
+                       .setbDesc(std::get<OPTIONAL>(tensors))
+                       .setyDesc(std::get<AFTEROPT_TENSOR>(tensors))
+ .setpwDesc(addDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, add_op.describe());
+
+
+ // Create an Activation Node.
+ auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setxDesc(devPtrI ? add_op.getOutputTensor() : bias_op.getOutputTensor())
+                       .setyDesc(std::get<Y_TENSOR>(tensors))
+ .setpwDesc(actDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, act_op.describe());
+
+ // Create an Operation Graph. In this case it is convolution add bias activation
+    std::array<cudnn_frontend::Operation const*, 5> ops = {&conv_op, &scale_op, &bias_op, devPtrI ? &add_op : &act_op, &act_op};
+
+ auto opGraph = cudnn_frontend::OperationGraphBuilder()
+ .setHandle(handle_)
+ .setOperationGraph(devPtrI ? ops.size() : 4, ops.data())
+ .build();
+
+ // Create string encoding for plan caching
+ auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
+ DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
+
+ auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
+ DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
+
+ auto workspace_size = plan.getWorkspaceSize();
+ DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
+
+ void* workspace_ptr = nullptr;
+ auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
+ if (workspace_size > 0) {
+ workspace_ptr = workspace_tensor.data_ptr();
+ }
+ void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB, devPtrI};
+ int64_t uids[] = {'x', 'y', 'w', 'z', 'b', 'i'};
+ auto variantPack = cudnn_frontend::VariantPackBuilder()
+ .setWorkspacePointer(workspace_ptr)
+ .setDataPointers(devPtrI ? 6 : 5, data_ptrs)
+ .setUids(devPtrI ? 6 : 5, uids)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
+ cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
+ checkCudnnErr(status);
+ cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
+ } catch (cudnn_frontend::cudnnException e) {
+ std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
+ }
+}
+
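+// Fused conv + per-channel scale + bias (no activation and no residual add).
+// In bottleneck_forward below this is used for the 1x1 downsample convolution on the identity path.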
+void
+run_conv_scale_bias(int64_t* x_dim_padded,
+ int64_t* pad,
+ int64_t* convstride,
+ int64_t* dilation,
+ int64_t* w_dim_padded,
+ int64_t* y_dim_padded,
+ cudnnDataType_t dataType,
+ at::Half* devPtrX,
+ at::Half* devPtrW,
+ at::Half* devPtrY,
+ at::Half* devPtrZ,
+ at::Half* devPtrB) {
+ cudnnHandle_t handle_ = torch::native::getCudnnHandle();
+ std::stringstream log_buf;
+ try {
+ int convDim = 2;
+
+ // Creates the necessary tensor descriptors
+ common_convbias_descriptors tensors = create_conv_bias_add_act_descriptors(
+ x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
+ DEBUG_CUDNN_MSG(log_buf, std::get<X_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<Y_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<W_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<Z_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<B_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTERADD_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTERBIAS_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTERCONV_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<OPTIONAL>(tensors).describe());
+
+ // Define the scale (pointwise multiply) operation
+ auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_MUL)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
+
+ // Define the bias operation
+ auto addDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_ADD)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
+
+ // Define the convolution problem
+ auto convDesc = cudnn_frontend::ConvDescBuilder()
+ .setDataType(CUDNN_DATA_FLOAT)
+ .setMathMode(CUDNN_CROSS_CORRELATION)
+ .setNDims(convDim)
+ .setStrides(convDim, convstride)
+ .setPrePadding(convDim, pad)
+ .setPostPadding(convDim, pad)
+ .setDilation(convDim, dilation)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
+
+ float alpha = 1.0f;
+ float beta = 0.0f;
+
+ // Create a convolution Node
+ auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR)
+ .setxDesc(std::get<X_TENSOR>(tensors))
+ .setwDesc(std::get<W_TENSOR>(tensors))
+ .setyDesc(std::get<AFTERCONV_TENSOR>(tensors))
+ .setcDesc(convDesc)
+ .setAlpha(alpha)
+ .setBeta(beta)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
+
+ // Create a Scale (multiply) Node.
+ auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setxDesc(conv_op.getOutputTensor())
+ .setbDesc(std::get<Z_TENSOR>(tensors))
+ .setyDesc(std::get<AFTERADD_TENSOR>(tensors)) // TODO: change enum to aftermul
+ .setpwDesc(scaleDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
+
+ // Create a Bias Node.
+ auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setxDesc(scale_op.getOutputTensor())
+ .setbDesc(std::get<B_TENSOR>(tensors))
+ .setyDesc(std::get<Y_TENSOR>(tensors))
+ .setpwDesc(addDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, add_op.describe());
+
+ // Create an Operation Graph. In this case it is convolution, scale, bias
+ std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &scale_op, &add_op};
+
+ auto opGraph = cudnn_frontend::OperationGraphBuilder()
+ .setHandle(handle_)
+ .setOperationGraph(ops.size(), ops.data())
+ .build();
+
+ // Create string encoding for plan caching
+ auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
+ DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
+
+ auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
+ DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
+
+ auto workspace_size = plan.getWorkspaceSize();
+ DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
+
+ void* workspace_ptr = nullptr;
+ auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
+ if (workspace_size > 0) {
+ workspace_ptr = workspace_tensor.data_ptr();
+ }
+ void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrB};
+ int64_t uids[] = {'x', 'y', 'w', 'z', 'b'};
+ auto variantPack = cudnn_frontend::VariantPackBuilder()
+ .setWorkspacePointer(workspace_ptr)
+ .setDataPointers(5, data_ptrs)
+ .setUids(5, uids)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
+ cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
+ checkCudnnErr(status);
+ cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
+ } catch (cudnn_frontend::cudnnException e) {
+ std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
+ }
+}
+
+
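+// Fused backward-data convolution followed by ReLU backward and a per-channel scale,
+// producing the gradient that flows into the previous stage of the bottleneck block.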
+void
+run_dconv_drelu_dscale(int64_t* x_dim_padded,
+ int64_t* pad,
+ int64_t* convstride,
+ int64_t* dilation,
+ int64_t* w_dim_padded,
+ int64_t* y_dim_padded,
+ cudnnDataType_t dataType,
+ at::Half* devPtrX,
+ at::Half* devPtrW,
+ at::Half* devPtrY,
+ at::Half* devPtrZ,
+ at::Half* devPtrR) {
+ cudnnHandle_t handle_ = torch::native::getCudnnHandle();
+ std::stringstream log_buf;
+ try {
+ int convDim = 2;
+
+ // Creates the necessary tensor descriptors
+ dconv_descriptors tensors = create_dconv_descriptors(
+ x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
+ DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
+
+ // Define the convolution problem
+ auto convDesc = cudnn_frontend::ConvDescBuilder()
+ .setDataType(CUDNN_DATA_FLOAT)
+ .setMathMode(CUDNN_CROSS_CORRELATION)
+ .setNDims(convDim)
+ .setStrides(convDim, convstride)
+ .setPrePadding(convDim, pad)
+ .setPostPadding(convDim, pad)
+ .setDilation(convDim, dilation)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
+
+ // Define the activation backward operation
+ auto actDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_RELU_BWD)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, actDesc.describe());
+
+ // Define the scale backward operation
+ auto scaleDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_MUL)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, scaleDesc.describe());
+
+ float alpha = 1.0f;
+ float beta = 0.0f;
+
+ // Create a convolution Node
+ auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
+ .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+ .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+ .setdyDesc(std::get<DY_TENSOR>(tensors))
+ .setcDesc(convDesc)
+ .setAlpha(alpha)
+ .setBeta(beta)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
+
+ // TODO: do we need getOutputTensor(), and what it returns in backward case?
+ // Create an relu backward Node.
+ auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setdyDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+ .setxDesc(std::get<RELU_TENSOR>(tensors))
+ .setdxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
+ .setpwDesc(actDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, act_op.describe());
+
+ // Create a Scale Node.
+ auto scale_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setxDesc(std::get<AFTER_DRELU_TENSOR>(tensors))
+ .setbDesc(std::get<SCALE_TENSOR>(tensors))
+ .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
+ .setpwDesc(scaleDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, scale_op.describe());
+
+ // Create an Operation Graph. In this case it is dgrad, drelu, dscale
+ std::array<cudnn_frontend::Operation const*, 3> ops = {&conv_op, &act_op, &scale_op};
+
+ auto opGraph = cudnn_frontend::OperationGraphBuilder()
+ .setHandle(handle_)
+ .setOperationGraph(ops.size(), ops.data())
+ .build();
+
+ // Create string encoding for plan caching
+ auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
+ DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
+
+ auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
+ DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
+
+ auto workspace_size = plan.getWorkspaceSize();
+ DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
+
+ void* workspace_ptr = nullptr;
+ auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
+ if (workspace_size > 0) {
+ workspace_ptr = workspace_tensor.data_ptr();
+ }
+ void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrZ, devPtrR};
+ int64_t uids[] = {'x', 'y', 'w', 's', 'r'};
+ auto variantPack = cudnn_frontend::VariantPackBuilder()
+ .setWorkspacePointer(workspace_ptr)
+ .setDataPointers(5, data_ptrs)
+ .setUids(5, uids)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
+ cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
+ checkCudnnErr(status);
+ cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
+ } catch (cudnn_frontend::cudnnException e) {
+ std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
+ }
+}
+
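+// Plain (unfused) backward convolution; `mode` selects between backward-data (dgrad)
+// and backward-filter (wgrad), as noted at the OperationBuilder below.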
+void
+run_dconv(int64_t* x_dim_padded,
+ int64_t* pad,
+ int64_t* convstride,
+ int64_t* dilation,
+ int64_t* w_dim_padded,
+ int64_t* y_dim_padded,
+ cudnnDataType_t dataType,
+ at::Half* devPtrX,
+ at::Half* devPtrW,
+ at::Half* devPtrY,
+ cudnnBackendDescriptorType_t mode) {
+ cudnnHandle_t handle_ = torch::native::getCudnnHandle();
+ std::stringstream log_buf;
+ try {
+ int convDim = 2;
+
+ // Creates the necessary tensor descriptors
+ dconv_descriptors tensors = create_dconv_descriptors(
+ x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
+ DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
+
+ // Define the convolution problem
+ auto convDesc = cudnn_frontend::ConvDescBuilder()
+ .setDataType(CUDNN_DATA_FLOAT)
+ .setMathMode(CUDNN_CROSS_CORRELATION)
+ .setNDims(convDim)
+ .setStrides(convDim, convstride)
+ .setPrePadding(convDim, pad)
+ .setPostPadding(convDim, pad)
+ .setDilation(convDim, dilation)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
+
+ float alpha = 1.0f;
+ float beta = 0.0f;
+
+ // Create a convolution Node
+ // mode should be one of following
+ // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
+ // CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
+ auto conv_op_builder = cudnn_frontend::OperationBuilder(mode);
+ if (mode == CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) {
+ conv_op_builder.setdxDesc(std::get<X_OR_DX_TENSOR>(tensors))
+ .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+ .setdyDesc(std::get<DY_TENSOR>(tensors))
+ .setcDesc(convDesc)
+ .setAlpha(alpha)
+ .setBeta(beta);
+ }
+ else {
+ conv_op_builder.setxDesc(std::get<X_OR_DX_TENSOR>(tensors))
+ .setdwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+ .setdyDesc(std::get<DY_TENSOR>(tensors))
+ .setcDesc(convDesc)
+ .setAlpha(alpha)
+ .setBeta(beta);
+ }
+ auto conv_op = conv_op_builder.build();
+ DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
+
+ // Create an Operation Graph. In this case it is just the single backward convolution
+ std::array<cudnn_frontend::Operation const*, 1> ops = {&conv_op};
+
+ auto opGraph = cudnn_frontend::OperationGraphBuilder()
+ .setHandle(handle_)
+ .setOperationGraph(ops.size(), ops.data())
+ .build();
+
+ // Create string encoding for plan caching
+ auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
+ DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
+
+ auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
+ DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
+
+ auto workspace_size = plan.getWorkspaceSize();
+ DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
+
+ void* workspace_ptr = nullptr;
+ auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
+ if (workspace_size > 0) {
+ workspace_ptr = workspace_tensor.data_ptr();
+ }
+ void* data_ptrs[] = {devPtrX, devPtrY, devPtrW};
+ int64_t uids[] = {'x', 'y', 'w'};
+ auto variantPack = cudnn_frontend::VariantPackBuilder()
+ .setWorkspacePointer(workspace_ptr)
+ .setDataPointers(3, data_ptrs)
+ .setUids(3, uids)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
+ cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
+ checkCudnnErr(status);
+ cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
+ } catch (cudnn_frontend::cudnnException e) {
+ std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
+ }
+}
+
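+// Backward-data convolution fused with an elementwise add; in bottleneck_backward below this
+// accumulates the residual-path gradient into the dgrad of the first 1x1 convolution.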
+void
+run_dconv_add(int64_t* x_dim_padded,
+ int64_t* pad,
+ int64_t* convstride,
+ int64_t* dilation,
+ int64_t* w_dim_padded,
+ int64_t* y_dim_padded,
+ cudnnDataType_t dataType,
+ at::Half* devPtrX,
+ at::Half* devPtrW,
+ at::Half* devPtrY,
+ at::Half* devPtrR) {
+ cudnnHandle_t handle_ = torch::native::getCudnnHandle();
+ std::stringstream log_buf;
+ try {
+ int convDim = 2;
+
+ // Creates the necessary tensor descriptors
+ dconv_descriptors tensors = create_dconv_descriptors(
+ x_dim_padded, pad, convstride, dilation, w_dim_padded, y_dim_padded, dataType);
+ DEBUG_CUDNN_MSG(log_buf, std::get<X_OR_DX_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<DY_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<W_OR_DW_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<SCALE_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<RELU_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DCONV_TENSOR>(tensors).describe());
+ DEBUG_CUDNN_MSG(log_buf, std::get<AFTER_DRELU_TENSOR>(tensors).describe());
+
+ // Define the convolution problem
+ auto convDesc = cudnn_frontend::ConvDescBuilder()
+ .setDataType(CUDNN_DATA_FLOAT)
+ .setMathMode(CUDNN_CROSS_CORRELATION)
+ .setNDims(convDim)
+ .setStrides(convDim, convstride)
+ .setPrePadding(convDim, pad)
+ .setPostPadding(convDim, pad)
+ .setDilation(convDim, dilation)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, convDesc.describe());
+
+ // Define the add backward operation
+ auto addDesc = cudnn_frontend::PointWiseDescBuilder()
+ .setMode(CUDNN_POINTWISE_ADD)
+ .setMathPrecision(CUDNN_DATA_FLOAT)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, addDesc.describe());
+
+ float alpha = 1.0f;
+ float beta = 0.0f;
+
+ // Create a convolution Node
+ auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR)
+ .setdxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+ .setwDesc(std::get<W_OR_DW_TENSOR>(tensors))
+ .setdyDesc(std::get<DY_TENSOR>(tensors))
+ .setcDesc(convDesc)
+ .setAlpha(alpha)
+ .setBeta(beta)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, conv_op.describe());
+
+ // TODO: do we need getOutputTensor(), and what it returns in backward case?
+ // Create add Node.
+ auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+ .setxDesc(std::get<AFTER_DCONV_TENSOR>(tensors))
+ .setbDesc(std::get<RELU_TENSOR>(tensors))
+ .setyDesc(std::get<X_OR_DX_TENSOR>(tensors))
+ .setpwDesc(addDesc)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, add_op.describe());
+
+ // Create an Operation Graph. In this case it is dgrad and add
+ std::array<cudnn_frontend::Operation const*, 2> ops = {&conv_op, &add_op};
+
+ auto opGraph = cudnn_frontend::OperationGraphBuilder()
+ .setHandle(handle_)
+ .setOperationGraph(ops.size(), ops.data())
+ .build();
+
+ // Create string encoding for plan caching
+ auto cache_string = getConvFusionString(x_dim_padded, pad, convstride, dilation, w_dim_padded, dataType, opGraph.getTag());
+ DEBUG_CUDNN_MSG(log_buf, "[convstring] " << cache_string);
+
+ auto& plan = getOrCreatePlan(handle_, log_buf, opGraph, cache_string);
+ DEBUG_CUDNN_MSG(log_buf, "Plan tag: " << plan.getTag());
+
+ auto workspace_size = plan.getWorkspaceSize();
+ DEBUG_CUDNN_MSG(log_buf, plan.describe() << " requires workspace " << workspace_size);
+
+ void* workspace_ptr = nullptr;
+ auto workspace_tensor = at::empty({(workspace_size+3)/4}, at::TensorOptions(at::kCUDA).dtype(at::kFloat));
+ if (workspace_size > 0) {
+ workspace_ptr = workspace_tensor.data_ptr();
+ }
+ void* data_ptrs[] = {devPtrX, devPtrY, devPtrW, devPtrR};
+ int64_t uids[] = {'x', 'y', 'w', 'r'};
+ auto variantPack = cudnn_frontend::VariantPackBuilder()
+ .setWorkspacePointer(workspace_ptr)
+ .setDataPointers(4, data_ptrs)
+ .setUids(4, uids)
+ .build();
+ DEBUG_CUDNN_MSG(log_buf, "variantPack " << variantPack.describe());
+ cudnnStatus_t status = cudnnBackendExecute(handle_, plan.get_raw_desc(), variantPack.get_raw_desc());
+ checkCudnnErr(status);
+ cudnn_frontend::throw_if([status]() { return (status != CUDNN_STATUS_SUCCESS); }, "Plan execute error");
+ } catch (cudnn_frontend::cudnnException e) {
+ std::cout << log_buf.str() << "[ERROR] Exception " << e.what() << std::endl;
+ }
+}
+
+
+// inputs contains x,w,z,b,(i)
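+// As used below: inputs[0] = x, inputs[1..3] = conv1/2/3 weights, inputs[4..6] = scales (z),
+// inputs[7..9] = biases (b), inputs[10..12] = downsample conv weight/scale/bias (only read when
+// the block has a downsample path).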
+std::vector<at::Tensor> bottleneck_forward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
+
+ std::cout << std::fixed;
+ // create output vector
+ std::vector<at::Tensor> outputs;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ // setup dimensions
+ int64_t dimA[] = {0, 0, 0, 0};
+ int64_t filterdimA1[] = {0, 0, 0, 0};
+ int64_t filterdimA2[] = {0, 0, 0, 0};
+ int64_t filterdimA3[] = {0, 0, 0, 0};
+ int64_t filterdimA4[] = {0, 0, 0, 0};
+
+ // All dim calculation after this order of n,c,h,w
+ int axis[] {0,1,2,3};
+ if (explicit_nhwc) {
+ axis[0] = 0;
+ axis[1] = 3;
+ axis[2] = 1;
+ axis[3] = 2;
+ }
+ for (int dim=0;dim<4;dim++) {
+ dimA[dim] = inputs[0].size(axis[dim]);
+ filterdimA1[dim] = inputs[1].size(axis[dim]);
+ filterdimA2[dim] = inputs[2].size(axis[dim]);
+ filterdimA3[dim] = inputs[3].size(axis[dim]);
+ }
+ if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
+ for (int dim=0;dim<4;dim++) {
+ filterdimA4[dim] = inputs[10].size(axis[dim]);
+ }
+ }
+
+ // output dim in n,c,h,w used by backend
+ int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
+ int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
+ int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below
+
+ // use these fixed value for test run
+ int64_t padA[] = {0, 0};
+ int64_t padA1[] = {1, 1};
+ int64_t dilationA[] = {1, 1};
+ int64_t convstrideA[] = {1, 1};
+ int64_t convstride1X1[] = {stride_1X1, stride_1X1};
+
+ // compute output from pad/stride/dilation
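+ // (getFwdConvOutputDim presumably computes the standard output size, as in the cudnn_frontend
+ //  samples: out = 1 + (in + 2*pad - (dilation*(filter-1) + 1)) / stride.)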
+ outdimA1[0] = dimA[0];
+ outdimA1[1] = filterdimA1[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
+ }
+
+ outdimA2[0] = outdimA1[0];
+ outdimA2[1] = filterdimA2[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
+ }
+
+ outdimA3[0] = outdimA2[0];
+ outdimA3[1] = filterdimA3[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
+ }
+
+ // Create output tensor in the correct shape in pytorch's view
+ int64_t outdim1[] = {0, 0, 0, 0};
+ int64_t outdim2[] = {0, 0, 0, 0};
+ int64_t outdim3[] = {0, 0, 0, 0};
+ if (explicit_nhwc) {
+ axis[0] = 0;
+ axis[1] = 2;
+ axis[2] = 3;
+ axis[3] = 1;
+ }
+ for (int dim=0;dim<4;dim++) {
+ outdim1[dim] = outdimA1[axis[dim]];
+ outdim2[dim] = outdimA2[axis[dim]];
+ outdim3[dim] = outdimA3[axis[dim]];
+ }
+
+ // run
+ at::Half* x = inputs[0].data_ptr<at::Half>();
+ at::Half* w = inputs[1].data_ptr<at::Half>();
+ at::Half* z = inputs[4].data_ptr<at::Half>();
+ at::Half* b = inputs[7].data_ptr<at::Half>();
+ auto out1 = at::empty(outdim1, inputs[0].type(), output_format);
+ at::Half* y1 = out1.data_ptr<at::Half>();
+
+ run_conv_scale_bias_add_activation(dimA,
+ padA,
+ convstride1X1,
+ dilationA,
+ filterdimA1,
+ outdimA1,
+ CUDNN_DATA_HALF,
+ x,
+ w,
+ y1,
+ z,
+ b,
+ nullptr);
+
+ DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item());
+
+ w = inputs[2].data_ptr();
+ z = inputs[5].data_ptr();
+ b = inputs[8].data_ptr();
+ auto out2 = at::empty(outdim2, inputs[0].type(), output_format);
+ at::Half* y2 = out2.data_ptr();
+
+ run_conv_scale_bias_add_activation(outdimA1,
+ padA1,
+ convstrideA,
+ dilationA,
+ filterdimA2,
+ outdimA2,
+ CUDNN_DATA_HALF,
+ y1,
+ w,
+ y2,
+ z,
+ b,
+ nullptr);
+ DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item());
+
+ // create output of conv3
+ auto out3 = at::empty(outdim3, inputs[0].type(), output_format);
+ at::Half* y3 = out3.data_ptr();
+
+ // create output of conv4 that may exist
+ auto identity = at::empty_like(out3);
+ at::Half* yi = identity.data_ptr();
+
+ if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]){
+
+ w = inputs[10].data_ptr<at::Half>();
+ z = inputs[11].data_ptr<at::Half>();
+ b = inputs[12].data_ptr<at::Half>();
+ run_conv_scale_bias(dimA,
+ padA,
+ convstride1X1,
+ dilationA,
+ filterdimA4,
+ outdimA3,
+ CUDNN_DATA_HALF,
+ x,
+ w,
+ yi,
+ z,
+ b);
+ DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item());
+ }
+ else {
+ yi = x;
+ }
+
+ w = inputs[3].data_ptr<at::Half>();
+ z = inputs[6].data_ptr<at::Half>();
+ b = inputs[9].data_ptr<at::Half>();
+
+ run_conv_scale_bias_add_activation(outdimA2,
+ padA,
+ convstrideA,
+ dilationA,
+ filterdimA3,
+ outdimA3,
+ CUDNN_DATA_HALF,
+ y2,
+ w,
+ y3,
+ z,
+ b,
+ yi);
+ DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item());
+
+ outputs.push_back(out1);
+ outputs.push_back(out2);
+ outputs.push_back(out3);
+
+ return outputs;
+}
+
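+// As used below: inputs[0..3] are x and the three conv weights, inputs[4..6] the scales,
+// inputs[10] is the incoming gradient dy3, inputs[11] the gradient on the identity path,
+// inputs[12]/inputs[13] the saved relu1/relu2 activations, and inputs[14] the downsample weight.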
+std::vector<at::Tensor> bottleneck_backward(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
+
+ bool requires_grad = inputs[0].requires_grad();
+
+ std::cout << std::fixed;
+ // create output vector
+ std::vector<at::Tensor> outputs;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ // setup dimensions
+ int64_t dimA[] = {0, 0, 0, 0};
+ int64_t filterdimA1[] = {0, 0, 0, 0};
+ int64_t filterdimA2[] = {0, 0, 0, 0};
+ int64_t filterdimA3[] = {0, 0, 0, 0};
+ int64_t filterdimA4[] = {0, 0, 0, 0};
+
+ // All dim calculation after this order of n,c,h,w
+ int axis[] {0,1,2,3};
+ if (explicit_nhwc) {
+ axis[0] = 0;
+ axis[1] = 3;
+ axis[2] = 1;
+ axis[3] = 2;
+ }
+ for (int dim=0;dim<4;dim++) {
+ dimA[dim] = inputs[0].size(axis[dim]);
+ filterdimA1[dim] = inputs[1].size(axis[dim]);
+ filterdimA2[dim] = inputs[2].size(axis[dim]);
+ filterdimA3[dim] = inputs[3].size(axis[dim]);
+ }
+ if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
+ for (int dim=0;dim<4;dim++) {
+ filterdimA4[dim] = inputs[14].size(axis[dim]);
+ }
+ }
+
+ // output dim in n,c,h,w used by backend
+ int64_t outdimA1[] = {0, 0, 0, 0}; // Computed Below
+ int64_t outdimA2[] = {0, 0, 0, 0}; // Computed Below
+ int64_t outdimA3[] = {0, 0, 0, 0}; // Computed Below
+
+ // use these fixed value for test run
+ int64_t padA[] = {0, 0};
+ int64_t padA1[] = {1, 1};
+ int64_t dilationA[] = {1, 1};
+ int64_t convstrideA[] = {1, 1};
+ int64_t convstride1X1[] = {stride_1X1, stride_1X1};
+
+ // compute output from pad/stride/dilation
+ outdimA1[0] = dimA[0];
+ outdimA1[1] = filterdimA1[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
+ }
+
+ outdimA2[0] = outdimA1[0];
+ outdimA2[1] = filterdimA2[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
+ }
+
+ outdimA3[0] = outdimA2[0];
+ outdimA3[1] = filterdimA3[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
+ }
+
+ // Create output tensor in the correct shape in pytorch's view
+ int64_t outdim1[] = {0, 0, 0, 0};
+ int64_t outdim2[] = {0, 0, 0, 0};
+ int64_t outdim3[] = {0, 0, 0, 0};
+ if (explicit_nhwc) {
+ axis[0] = 0;
+ axis[1] = 2;
+ axis[2] = 3;
+ axis[3] = 1;
+ }
+ for (int dim=0;dim<4;dim++) {
+ outdim1[dim] = outdimA1[axis[dim]];
+ outdim2[dim] = outdimA2[axis[dim]];
+ outdim3[dim] = outdimA3[axis[dim]];
+ }
+
+ // dconv3+drelu2+dscale2
+ at::Half* conv_in = inputs[13].data_ptr<at::Half>();
+ at::Half* dy3 = inputs[10].data_ptr<at::Half>();
+
+ DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());
+
+ // wgrad
+ auto wgrad3 = at::empty_like(inputs[3]);
+ at::Half* dw3 = wgrad3.data_ptr<at::Half>();
+ run_dconv(outdimA2,
+ padA,
+ convstrideA,
+ dilationA,
+ filterdimA3,
+ outdimA3,
+ CUDNN_DATA_HALF,
+ conv_in,
+ dw3,
+ dy3,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
+
+ // dgrad
+ auto grad_out2 = at::empty(outdim2, inputs[0].type(), output_format);
+ at::Half* dy2 = grad_out2.data_ptr<at::Half>();
+ at::Half* w = inputs[3].data_ptr<at::Half>();
+ at::Half* z = inputs[5].data_ptr<at::Half>();
+
+ at::Half* relu2 = inputs[13].data_ptr<at::Half>();
+
+ run_dconv_drelu_dscale(outdimA2,
+ padA,
+ convstrideA,
+ dilationA,
+ filterdimA3,
+ outdimA3,
+ CUDNN_DATA_HALF,
+ dy2,
+ w,
+ dy3,
+ z,
+ relu2);
+
+ DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item());
+
+ // dconv2+drelu1+dscale1
+ conv_in = inputs[12].data_ptr();
+
+ // wgrad
+ auto wgrad2 = at::empty_like(inputs[2]);
+ at::Half* dw2 = wgrad2.data_ptr();
+ run_dconv(outdimA1,
+ padA1,
+ convstrideA,
+ dilationA,
+ filterdimA2,
+ outdimA2,
+ CUDNN_DATA_HALF,
+ conv_in,
+ dw2,
+ dy2,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
+
+ // dgrad
+ auto grad_out1 = at::empty(outdim1, inputs[0].type(), output_format);
+ at::Half* dy1 = grad_out1.data_ptr<at::Half>();
+ w = inputs[2].data_ptr<at::Half>();
+ z = inputs[4].data_ptr<at::Half>();
+
+ at::Half* relu1 = inputs[12].data_ptr<at::Half>();
+ // fused dgrad
+ run_dconv_drelu_dscale(outdimA1,
+ padA1,
+ convstrideA,
+ dilationA,
+ filterdimA2,
+ outdimA2,
+ CUDNN_DATA_HALF,
+ dy1,
+ w,
+ dy2,
+ z,
+ relu1);
+
+/*
+ // backward strided conv cannot be fused
+ // if stride == 1 but channel changes, we can fuse here
+ if (stride_1X1 != 1){
+ // dgrad
+ run_dconv(outdimA1,
+ padA1,
+ convstride1X1,
+ dilationA,
+ filterdimA2,
+ outdimA2,
+ CUDNN_DATA_HALF,
+ dy1,
+ w,
+ dy2,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
+
+ // mul fused mask
+ grad_out1.mul_(inputs[15]);
+ }
+ else {
+ at::Half* relu1 = inputs[12].data_ptr();
+ // fused dgrad
+ run_dconv_drelu_dscale(outdimA1,
+ padA1,
+ convstride1X1,
+ dilationA,
+ filterdimA2,
+ outdimA2,
+ CUDNN_DATA_HALF,
+ dy1,
+ w,
+ dy2,
+ z,
+ relu1);
+ }
+*/
+ DEBUG_MSG("[DEBUG] new dconv1 : " << grad_out1.to(at::kFloat).sum().item());
+
+ // create grads of conv4 that may exist
+ auto grad_x_conv4 = at::empty_like(inputs[0]);
+ at::Half* dx_conv4 = grad_x_conv4.data_ptr();
+ at::Tensor wgrad4;
+
+ // x used for dconv1 and dconv4 wgrad
+ at::Half* x = inputs[0].data_ptr();
+
+ if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]){
+ w = inputs[14].data_ptr();
+ at::Half* dy_conv4 = inputs[11].data_ptr();
+ if (requires_grad) {
+ run_dconv(dimA,
+ padA,
+ convstride1X1,
+ dilationA,
+ filterdimA4,
+ outdimA3,
+ CUDNN_DATA_HALF,
+ dx_conv4,
+ w,
+ dy_conv4,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
+ // we don't print here since we can't hook out this grad in pytorch alone to compare, due to addition with dx
+ // DEBUG_MSG("[DEBUG] new dx_identity : " << grad_x_conv4.to(at::kFloat).sum().item());
+ }
+ // wgrad
+ wgrad4 = at::empty_like(inputs[14]);
+ at::Half* dw4 = wgrad4.data_ptr<at::Half>();
+ run_dconv(dimA,
+ padA,
+ convstride1X1,
+ dilationA,
+ filterdimA4,
+ outdimA3,
+ CUDNN_DATA_HALF,
+ x,
+ dw4,
+ dy_conv4,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
+ }
+ else {
+ // if there is no downsample, dx_conv4 is fork of drelu3
+ dx_conv4 = inputs[11].data_ptr<at::Half>();
+ }
+
+ // dconv1+add
+ // wgrad
+ auto wgrad1 = at::empty_like(inputs[1]);
+ at::Half* dw1 = wgrad1.data_ptr<at::Half>();
+ run_dconv(dimA,
+ padA,
+ convstride1X1,
+ dilationA,
+ filterdimA1,
+ outdimA1,
+ CUDNN_DATA_HALF,
+ x,
+ dw1,
+ dy1,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
+
+ // dgrad
+ w = inputs[1].data_ptr<at::Half>();
+ auto grad_x = at::empty_like(inputs[0]);
+ at::Half* dx = grad_x.data_ptr<at::Half>();
+
+ // backward strided conv cannot be fused
+ // if stride == 1 but channel changes, we can fuse here
+ if (requires_grad){
+ if (stride_1X1 != 1){
+ run_dconv(dimA,
+ padA,
+ convstride1X1,
+ dilationA,
+ filterdimA1,
+ outdimA1,
+ CUDNN_DATA_HALF,
+ dx,
+ w,
+ dy1,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR);
+ // add 2 together
+ grad_x.add_(grad_x_conv4);
+ }
+ else {
+ run_dconv_add(dimA,
+ padA,
+ convstride1X1,
+ dilationA,
+ filterdimA1,
+ outdimA1,
+ CUDNN_DATA_HALF,
+ dx,
+ w,
+ dy1,
+ dx_conv4);
+ }
+ }
+
+ DEBUG_MSG("[DEBUG] new dx : " << grad_x.to(at::kFloat).sum().item());
+ DEBUG_MSG("[DEBUG] new wgrad1 : " << wgrad1.to(at::kFloat).sum().item());
+ DEBUG_MSG("[DEBUG] new wgrad2 : " << wgrad2.to(at::kFloat).sum().item());
+ DEBUG_MSG("[DEBUG] new wgrad3 : " << wgrad3.to(at::kFloat).sum().item());
+ outputs.push_back(grad_x);
+ outputs.push_back(wgrad1);
+ outputs.push_back(wgrad2);
+ outputs.push_back(wgrad3);
+
+ if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
+ DEBUG_MSG("[DEBUG] new wgrad4 : " << wgrad4.to(at::kFloat).sum().item());
+ outputs.push_back(wgrad4);
+ }
+
+ return outputs;
+}
+
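+// The *_init / *_out1 / *_out2 / *_halo / *_rest entry points below split the bottleneck forward
+// (and, further down, the backward) into stages. The extra halo shapes (outdimA0/outdimA4 with
+// H == 3 or H == 1) suggest these are intended for spatially split inputs, where the caller
+// exchanges halo rows around the 3x3 convolution between stages.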
+namespace {
+
+struct bottleneck_forward_status {
+
+ int64_t dimA[4];
+ int64_t filterdimA1[4];
+ int64_t filterdimA2[4];
+ int64_t filterdimA3[4];
+ int64_t filterdimA4[4];
+
+ int axis[4];
+
+ int64_t outdimA0[4];
+ int64_t outdimA1[4];
+ int64_t outdimA2[4];
+ int64_t outdimA3[4];
+ int64_t outdimA4[4];
+
+ int64_t padA[2];
+ int64_t padA1[2];
+ int64_t padA2[2]; // halo padding
+ int64_t dilationA[2];
+ int64_t convstrideA[2];
+ int64_t convstride1X1[2];
+
+ int64_t outdim0[4]; // halo input shape
+ int64_t outdim1[4];
+ int64_t outdim2[4];
+ int64_t outdim3[4];
+ int64_t outdim4[4]; // halo output shape
+
+ void init(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
+ dimA[0] = dimA[1] = dimA[2] = dimA[3] = 0;
+ filterdimA1[0] = filterdimA1[1] = filterdimA1[2] = filterdimA1[3] = 0;
+ filterdimA2[0] = filterdimA2[1] = filterdimA2[2] = filterdimA2[3] = 0;
+ filterdimA3[0] = filterdimA3[1] = filterdimA3[2] = filterdimA3[3] = 0;
+ filterdimA4[0] = filterdimA4[1] = filterdimA4[2] = filterdimA4[3] = 0;
+
+ // All dim calculation after this order of n,c,h,w
+ if (explicit_nhwc) {
+ axis[0] = 0;
+ axis[1] = 3;
+ axis[2] = 1;
+ axis[3] = 2;
+ } else {
+ axis[0] = 0;
+ axis[1] = 1;
+ axis[2] = 2;
+ axis[3] = 3;
+ }
+
+ for (int dim=0;dim<4;dim++) {
+ dimA[dim] = inputs[0].size(axis[dim]);
+ filterdimA1[dim] = inputs[1].size(axis[dim]);
+ filterdimA2[dim] = inputs[2].size(axis[dim]);
+ filterdimA3[dim] = inputs[3].size(axis[dim]);
+ }
+ if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
+ for (int dim=0;dim<4;dim++) {
+ filterdimA4[dim] = inputs[10].size(axis[dim]);
+ }
+ }
+
+ // output dim in n,c,h,w used by backend
+ outdimA0[0] = outdimA0[1] = outdimA0[2] = outdimA0[3] = 0;
+ outdimA1[0] = outdimA1[1] = outdimA1[2] = outdimA1[3] = 0;
+ outdimA2[0] = outdimA2[1] = outdimA2[2] = outdimA2[3] = 0;
+ outdimA3[0] = outdimA3[1] = outdimA3[2] = outdimA3[3] = 0;
+ outdimA4[0] = outdimA4[1] = outdimA4[2] = outdimA4[3] = 0;
+
+ // use these fixed value for test run
+ padA[0] = 0; padA[1] = 0;
+ padA1[0] = 1; padA1[1] = 1;
+ padA2[0] = 0; padA2[1] = 1;
+ dilationA[0] = 1; dilationA[1] = 1;
+ convstrideA[0] = 1; convstrideA[1] = 1;
+ convstride1X1[0] = stride_1X1; convstride1X1[1] = stride_1X1;
+
+ // compute output from pad/stride/dilation
+ outdimA1[0] = dimA[0];
+ outdimA1[1] = filterdimA1[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
+ }
+
+ outdimA2[0] = outdimA1[0];
+ outdimA2[1] = filterdimA2[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
+ }
+
+ for (int dim = 0; dim < 4; dim++) {
+ if (dim == 2) {
+ outdimA0[dim] = 3;
+ outdimA4[dim] = 1;
+ } else {
+ outdimA0[dim] = outdimA1[dim];
+ outdimA4[dim] = outdimA2[dim];
+ }
+ }
+
+ outdimA3[0] = outdimA2[0];
+ outdimA3[1] = filterdimA3[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
+ }
+
+ // Create output tensor in the correct shape in pytorch's view
+ outdim1[0] = outdim1[1] = outdim1[2] = outdim1[3] = 0;
+ outdim2[0] = outdim2[1] = outdim2[2] = outdim2[3] = 0;
+ outdim3[0] = outdim3[1] = outdim3[2] = outdim3[3] = 0;
+ if (explicit_nhwc) {
+ axis[0] = 0;
+ axis[1] = 2;
+ axis[2] = 3;
+ axis[3] = 1;
+ }
+ for (int dim=0;dim<4;dim++) {
+ outdim0[dim] = outdimA0[axis[dim]];
+ outdim1[dim] = outdimA1[axis[dim]];
+ outdim2[dim] = outdimA2[axis[dim]];
+ outdim3[dim] = outdimA3[axis[dim]];
+ outdim4[dim] = outdimA4[axis[dim]];
+ }
+ }
+};
+
+bottleneck_forward_status forward_state;
+
+} // end of anonymous namespace
+
+std::vector<at::Tensor> bottleneck_forward_init(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
+ // NB! bottleneck_forward and bottleneck_backward are NOT thread-safe methods.
+ // NB! We use a global object to store state.
+ forward_state.init(explicit_nhwc, stride_1X1, inputs);
+
+ // create output vector
+ std::vector<at::Tensor> outputs;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ //printf("outdim1 = (%d,%d,%d,%d)\n",forward_state.outdim1[0],forward_state.outdim1[1],forward_state.outdim1[2],forward_state.outdim1[3]);
+ auto out1 = at::empty(forward_state.outdim1, inputs[0].type(), output_format);
+ auto out2 = at::empty(forward_state.outdim2, inputs[0].type(), output_format);
+ auto out3 = at::empty(forward_state.outdim3, inputs[0].type(), output_format);
+
+ outputs.push_back(out1);
+ outputs.push_back(out2);
+ outputs.push_back(out3);
+
+ return outputs;
+}
+
+// inputs contains x,w,z,b,(i)
+void bottleneck_forward_out1(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs) {
+
+ std::cout << std::fixed;
+
+ // run
+ at::Half* x = inputs[0].data_ptr<at::Half>();
+ at::Half* w = inputs[1].data_ptr<at::Half>();
+ at::Half* z = inputs[4].data_ptr<at::Half>();
+ at::Half* b = inputs[7].data_ptr<at::Half>();
+ auto out1 = outputs[0];
+ at::Half* y1 = out1.data_ptr<at::Half>();
+
+ run_conv_scale_bias_add_activation(forward_state.dimA,
+ forward_state.padA,
+ forward_state.convstride1X1,
+ forward_state.dilationA,
+ forward_state.filterdimA1,
+ forward_state.outdimA1,
+ CUDNN_DATA_HALF,
+ x,
+ w,
+ y1,
+ z,
+ b,
+ nullptr);
+
+ DEBUG_MSG("[DEBUG] new relu1 : " << out1.to(at::kFloat).sum().item());
+}
+
+// computes halo (top or bottom) from fat halo input.
+// fat halo input is 3 pixels wide in H.
+at::Tensor bottleneck_forward_out2_halo(bool explicit_nhwc, at::Tensor fat_halo_y1, std::vector<at::Tensor> inputs) {
+
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ // run
+ at::Half* w = inputs[2].data_ptr<at::Half>();
+ at::Half* z = inputs[5].data_ptr<at::Half>();
+ at::Half* b = inputs[8].data_ptr<at::Half>();
+
+ at::Half* y1 = fat_halo_y1.data_ptr<at::Half>();
+
+ auto halo_y2 = at::empty(forward_state.outdim4, inputs[0].type(), output_format);
+ at::Half* y2 = halo_y2.data_ptr<at::Half>();
+
+ run_conv_scale_bias_add_activation(forward_state.outdimA0,
+ forward_state.padA2,
+ forward_state.convstrideA,
+ forward_state.dilationA,
+ forward_state.filterdimA2,
+ forward_state.outdimA4,
+ CUDNN_DATA_HALF,
+ y1,
+ w,
+ y2,
+ z,
+ b,
+ nullptr);
+
+ return halo_y2;
+}
+
+void bottleneck_forward_out2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs) {
+
+ std::cout << std::fixed;
+
+ // from _out1 method
+ at::Half* x = inputs[0].data_ptr<at::Half>();
+ auto out1 = outputs[0];
+ at::Half* y1 = out1.data_ptr<at::Half>();
+
+ // run
+ at::Half* w = inputs[2].data_ptr<at::Half>();
+ at::Half* z = inputs[5].data_ptr<at::Half>();
+ at::Half* b = inputs[8].data_ptr<at::Half>();
+ auto out2 = outputs[1];
+ at::Half* y2 = out2.data_ptr<at::Half>();
+
+ //printf("forward_state.outdimA1 = {%d,%d,%d,%d}\n",forward_state.outdimA1[0],forward_state.outdimA1[1],forward_state.outdimA1[2],forward_state.outdimA1[3]);
+ //printf("forward_state.padA1 = {%d,%d}\n",forward_state.padA1[0],forward_state.padA1[1]);
+ //printf("forward_state.convstrideA = {%d,%d}\n",forward_state.convstrideA[0],forward_state.convstrideA[1]);
+ //printf("forward_state.dilationA = {%d,%d}\n",forward_state.dilationA[0],forward_state.dilationA[1]);
+ //printf("forward_state.filterdimA2 = {%d,%d,%d,%d}\n",forward_state.filterdimA2[0],forward_state.filterdimA2[1],forward_state.filterdimA2[2],forward_state.filterdimA2[3]);
+ //printf("forward_state.outdimA2 = {%d,%d,%d,%d}\n",forward_state.outdimA2[0],forward_state.outdimA2[1],forward_state.outdimA2[2],forward_state.outdimA2[3]);
+ run_conv_scale_bias_add_activation(forward_state.outdimA1,
+ forward_state.padA1,
+ forward_state.convstrideA,
+ forward_state.dilationA,
+ forward_state.filterdimA2,
+ forward_state.outdimA2,
+ CUDNN_DATA_HALF,
+ y1,
+ w,
+ y2,
+ z,
+ b,
+ nullptr);
+ DEBUG_MSG("[DEBUG] new relu2 : " << out2.to(at::kFloat).sum().item());
+}
+
+void bottleneck_forward_rest(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs) {
+
+ std::cout << std::fixed;
+
+ // from _out1 method
+ at::Half* x = inputs[0].data_ptr<at::Half>();
+
+ // create output of conv3
+ auto out3 = outputs[2];
+ at::Half* y3 = out3.data_ptr<at::Half>();
+
+ // create output of conv4 that may exist
+ auto identity = at::empty_like(out3);
+ at::Half* yi = identity.data_ptr<at::Half>();
+
+ at::Half *w, *z, *b;
+
+ if (stride_1X1 != 1 || forward_state.filterdimA3[0] != forward_state.dimA[1]){
+
+ w = inputs[10].data_ptr<at::Half>();
+ z = inputs[11].data_ptr<at::Half>();
+ b = inputs[12].data_ptr<at::Half>();
+ run_conv_scale_bias(forward_state.dimA,
+ forward_state.padA,
+ forward_state.convstride1X1,
+ forward_state.dilationA,
+ forward_state.filterdimA4,
+ forward_state.outdimA3,
+ CUDNN_DATA_HALF,
+ x,
+ w,
+ yi,
+ z,
+ b);
+ DEBUG_MSG("[DEBUG] new downsample : " << identity.to(at::kFloat).sum().item());
+ }
+ else {
+ yi = x;
+ }
+
+ auto out2 = outputs[1];
+ at::Half* y2 = out2.data_ptr<at::Half>();
+
+ w = inputs[3].data_ptr<at::Half>();
+ z = inputs[6].data_ptr<at::Half>();
+ b = inputs[9].data_ptr<at::Half>();
+
+ run_conv_scale_bias_add_activation(forward_state.outdimA2,
+ forward_state.padA,
+ forward_state.convstrideA,
+ forward_state.dilationA,
+ forward_state.filterdimA3,
+ forward_state.outdimA3,
+ CUDNN_DATA_HALF,
+ y2,
+ w,
+ y3,
+ z,
+ b,
+ yi);
+ DEBUG_MSG("[DEBUG] new relu3 : " << out3.to(at::kFloat).sum().item());
+}
+
+namespace {
+
+struct bottleneck_backward_state {
+
+ int64_t dimA[4];
+ int64_t filterdimA1[4];
+ int64_t filterdimA2[4];
+ int64_t filterdimA3[4];
+ int64_t filterdimA4[4];
+ int64_t filterdimA2hh[4]; // Cin,Cout,1,3
+
+ int axis[4];
+
+ int64_t outdimA1[4]; // grad_out1
+ int64_t outdimA2[4]; // grad_out2
+ int64_t outdimA3[4];
+ int64_t outdimA1h[4]; // output: grad_out1 halo (H=3)
+ int64_t outdimA2h[4]; // input : grad_out2 halo cells (H=3)
+ int64_t outdimA1hh[4]; // input: grad_out2 halo (H=1)
+ int64_t outdimA2hh[4]; // input: out1 halo (H=1)
+
+ int64_t padA[2];
+ int64_t padA1[2];
+ int64_t padA2[2];
+ int64_t dilationA[2];
+ int64_t convstrideA[2];
+ int64_t convstride1X1[2];
+
+ int64_t filterdim2hh[4]; // Cin,1,3,Cout
+
+ int64_t outdim1[4];
+ int64_t outdim2[4];
+ int64_t outdim3[4];
+ int64_t outdim1h[4];
+
+ void init(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
+ // setup dimensions
+ dimA[0] = dimA[1] = dimA[2] = dimA[3] = 0;
+ filterdimA1[0] = filterdimA1[1] = filterdimA1[2] = filterdimA1[3] = 0;
+ filterdimA2[0] = filterdimA2[1] = filterdimA2[2] = filterdimA2[3] = 0;
+ filterdimA3[0] = filterdimA3[1] = filterdimA3[2] = filterdimA3[3] = 0;
+ filterdimA4[0] = filterdimA4[1] = filterdimA4[2] = filterdimA4[3] = 0;
+ filterdimA2hh[0] = filterdimA2hh[1] = filterdimA2hh[2] = filterdimA2hh[3] = 0;
+
+ // All dim calculation after this order of n,c,h,w
+ if (explicit_nhwc) {
+ axis[0] = 0;
+ axis[1] = 3;
+ axis[2] = 1;
+ axis[3] = 2;
+ } else {
+ axis[0] = 0;
+ axis[1] = 1;
+ axis[2] = 2;
+ axis[3] = 3;
+ }
+
+ for (int dim=0;dim<4;dim++) {
+ dimA[dim] = inputs[0].size(axis[dim]);
+ filterdimA1[dim] = inputs[1].size(axis[dim]);
+ filterdimA2[dim] = inputs[2].size(axis[dim]);
+ filterdimA3[dim] = inputs[3].size(axis[dim]);
+ }
+ if (stride_1X1 != 1 || filterdimA3[0] != dimA[1]) {
+ for (int dim=0;dim<4;dim++) {
+ filterdimA4[dim] = inputs[14].size(axis[dim]);
+ }
+ }
+
+ for (int dim=0;dim<4;dim++) {
+ if (dim == 2) {
+ filterdimA2hh[dim] = 1;
+ } else {
+ filterdimA2hh[dim] = filterdimA2[dim];
+ }
+ }
+
+ // output dim in n,c,h,w used by backend
+ outdimA1[0] = outdimA1[1] = outdimA1[2] = outdimA1[3] = 0;
+ outdimA2[0] = outdimA2[1] = outdimA2[2] = outdimA2[3] = 0;
+ outdimA3[0] = outdimA3[1] = outdimA3[2] = outdimA3[3] = 0;
+ outdimA1h[0] = outdimA1h[1] = outdimA1h[2] = outdimA1h[3] = 0;
+ outdimA2h[0] = outdimA2h[1] = outdimA2h[2] = outdimA2h[3] = 0;
+ outdimA1hh[0] = outdimA1hh[1] = outdimA1hh[2] = outdimA1hh[3] = 0;
+ outdimA2hh[0] = outdimA2hh[1] = outdimA2hh[2] = outdimA2hh[3] = 0;
+
+ // use these fixed value for test run
+ padA[0] = 0; padA[1] = 0;
+ padA1[0] = 1; padA1[1] = 1;
+ padA2[0] = 0; padA2[1] = 1;
+ dilationA[0] = 1; dilationA[1] = 1;
+ convstrideA[0] = 1; convstrideA[1] = 1;
+ convstride1X1[0] = stride_1X1; convstride1X1[1] = stride_1X1;
+
+ // compute output from pad/stride/dilation
+ outdimA1[0] = dimA[0];
+ outdimA1[1] = filterdimA1[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA1[dim + 2] = getFwdConvOutputDim(dimA[dim + 2], padA[dim], filterdimA1[dim + 2], convstride1X1[dim], dilationA[dim]);
+ }
+
+ outdimA2[0] = outdimA1[0];
+ outdimA2[1] = filterdimA2[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA2[dim + 2] = getFwdConvOutputDim(outdimA1[dim + 2], padA1[dim], filterdimA2[dim + 2], convstrideA[dim], dilationA[dim]);
+ }
+
+ outdimA3[0] = outdimA2[0];
+ outdimA3[1] = filterdimA3[0];
+ for (int dim = 0; dim < 2; dim++) {
+ outdimA3[dim + 2] = getFwdConvOutputDim(outdimA2[dim + 2], padA[dim], filterdimA3[dim + 2], convstrideA[dim], dilationA[dim]);
+ }
+
+ for (int dim = 0; dim < 4; dim++) {
+ if (dim == 2) {
+ outdimA1h[dim] = 3;
+ outdimA2h[dim] = 3;
+ outdimA1hh[dim] = 1;
+ outdimA2hh[dim] = 1;
+ } else {
+ outdimA1h[dim] = outdimA1[dim];
+ outdimA2h[dim] = outdimA2[dim];
+ outdimA1hh[dim] = outdimA1[dim];
+ outdimA2hh[dim] = outdimA2[dim];
+ }
+ }
+
+ // Create output tensor in the correct shape in pytorch's view
+ outdim1[0] = outdim1[1] = outdim1[2] = outdim1[3] = 0;
+ outdim2[0] = outdim2[1] = outdim2[2] = outdim2[3] = 0;
+ outdim3[0] = outdim3[1] = outdim3[2] = outdim3[3] = 0;
+ outdim1h[0] = outdim1h[1] = outdim1h[2] = outdim1h[3] = 0;
+ filterdim2hh[0] = filterdim2hh[1] = filterdim2hh[2] = filterdim2hh[3] = 0;
+ if (explicit_nhwc) {
+ axis[0] = 0;
+ axis[1] = 2;
+ axis[2] = 3;
+ axis[3] = 1;
+ }
+ for (int dim=0;dim<4;dim++) {
+ outdim1[dim] = outdimA1[axis[dim]];
+ outdim2[dim] = outdimA2[axis[dim]];
+ outdim3[dim] = outdimA3[axis[dim]];
+ outdim1h[dim] = outdimA1h[axis[dim]];
+ filterdim2hh[dim] = filterdimA2hh[axis[dim]];
+ }
+ }
+};
+
+bottleneck_backward_state backward_state;
+
+}
+
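+// The functions below split bottleneck_backward into stages (grad_out2, grad_out1, wgrad2, their
+// *_halo variants, and _rest) so the caller can interleave halo exchanges between them; they rely
+// on the global backward_state initialized in bottleneck_backward_init.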
+std::vector<at::Tensor> bottleneck_backward_init(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs) {
+
+ std::cout << std::fixed;
+
+ backward_state.init(explicit_nhwc, stride_1X1, inputs);
+
+ // create output vector
+ std::vector<at::Tensor> outputs;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ auto grad_x = at::empty_like(inputs[0]);
+ auto wgrad1 = at::empty_like(inputs[1]);
+ auto wgrad2 = at::empty_like(inputs[2]);
+ auto wgrad3 = at::empty_like(inputs[3]);
+
+ outputs.push_back(grad_x);
+ outputs.push_back(wgrad1);
+ outputs.push_back(wgrad2);
+ outputs.push_back(wgrad3);
+ if (stride_1X1 != 1 || backward_state.filterdimA3[0] != backward_state.dimA[1]) {
+ auto wgrad4 = at::empty_like(inputs[14]);
+ outputs.push_back(wgrad4);
+ }
+
+ return outputs;
+}
+
+at::Tensor bottleneck_backward_grad_out2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs) {
+
+ bool requires_grad = inputs[0].requires_grad();
+
+ std::cout << std::fixed;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ // dconv3+drelu2+dscale2
+ at::Half* conv_in = inputs[13].data_ptr<at::Half>();
+ at::Half* dy3 = inputs[10].data_ptr<at::Half>();
+
+ DEBUG_MSG("[DEBUG] new dconv3 : " << inputs[10].to(at::kFloat).sum().item<float>());
+
+ // wgrad
+ auto wgrad3 = outputs[3];
+ at::Half* dw3 = wgrad3.data_ptr<at::Half>();
+ run_dconv(backward_state.outdimA2,
+ backward_state.padA,
+ backward_state.convstrideA,
+ backward_state.dilationA,
+ backward_state.filterdimA3,
+ backward_state.outdimA3,
+ CUDNN_DATA_HALF,
+ conv_in,
+ dw3,
+ dy3,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
+
+ // dgrad
+ auto grad_out2 = at::empty(backward_state.outdim2, inputs[0].type(), output_format);
+ at::Half* dy2 = grad_out2.data_ptr<at::Half>();
+ at::Half* w = inputs[3].data_ptr<at::Half>();
+ at::Half* z = inputs[5].data_ptr<at::Half>();
+
+ at::Half* relu2 = inputs[13].data_ptr<at::Half>();
+
+ run_dconv_drelu_dscale(backward_state.outdimA2,
+ backward_state.padA,
+ backward_state.convstrideA,
+ backward_state.dilationA,
+ backward_state.filterdimA3,
+ backward_state.outdimA3,
+ CUDNN_DATA_HALF,
+ dy2,
+ w,
+ dy3,
+ z,
+ relu2);
+
+ // do halo exchange of dy2 here
+
+ DEBUG_MSG("[DEBUG] new dconv2 : " << grad_out2.to(at::kFloat).sum().item());
+
+ return grad_out2;
+}
+
+at::Tensor bottleneck_backward_grad_out1(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor grad_out2) {
+
+ bool requires_grad = inputs[0].requires_grad();
+
+ std::cout << std::fixed;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ // dgrad
+ at::Half* dy2 = grad_out2.data_ptr<at::Half>();
+
+ // dgrad
+ auto grad_out1 = at::empty(backward_state.outdim1, inputs[0].type(), output_format);
+ at::Half* dy1 = grad_out1.data_ptr<at::Half>();
+ at::Half* w = inputs[2].data_ptr<at::Half>();
+ at::Half* z = inputs[4].data_ptr<at::Half>();
+
+ at::Half* relu1 = inputs[12].data_ptr<at::Half>();
+ //printf("relu.shape = [%d,%d,%d,%d]\n",inputs[12].size(0),inputs[12].size(1),inputs[12].size(2),inputs[12].size(3));
+
+ // fused dgrad
+ run_dconv_drelu_dscale(backward_state.outdimA1,
+ backward_state.padA1,
+ backward_state.convstrideA,
+ backward_state.dilationA,
+ backward_state.filterdimA2,
+ backward_state.outdimA2,
+ CUDNN_DATA_HALF,
+ dy1,
+ w,
+ dy2,
+ z,
+ relu1);
+
+ return grad_out1;
+}
+
+// perform backward data 3x3 convolution (grad_out * w_rot180) on grad_out2 input of shape [N,3,W,C] with padding=(1,1) to produce output of shape [N,3,W,C]
+at::Tensor bottleneck_backward_grad_out1_halo(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor grad_out2_halo, at::Tensor relu1_halo) {
+
+ bool requires_grad = inputs[0].requires_grad();
+
+ std::cout << std::fixed;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ // dgrad
+ at::Half* dy2h = grad_out2_halo.data_ptr<at::Half>();
+
+ // dgrad
+ auto grad_out1_halo = at::empty(backward_state.outdim1h, inputs[0].type(), output_format);
+ at::Half* dy1h = grad_out1_halo.data_ptr<at::Half>();
+ at::Half* w = inputs[2].data_ptr<at::Half>();
+ at::Half* z = inputs[4].data_ptr<at::Half>();
+
+ at::Half* relu1h = relu1_halo.data_ptr<at::Half>();
+ //printf("relu.shape = [%d,%d,%d,%d]\n",relu1_halo.size(0),relu1_halo.size(1),relu1_halo.size(2),relu1_halo.size(3));
+ // fused dgrad
+ //printf("backward_state.outdimA1h = {%d,%d,%d,%d}\n",backward_state.outdimA1h[0],backward_state.outdimA1h[1],backward_state.outdimA1h[2],backward_state.outdimA1h[3]);
+ //printf("backward_state.outdimA2h = {%d,%d,%d,%d}\n",backward_state.outdimA2h[0],backward_state.outdimA2h[1],backward_state.outdimA2h[2],backward_state.outdimA2h[3]);
+ //printf("backward_state.filterdimA2 = {%d,%d,%d,%d}\n",backward_state.filterdimA2[0],backward_state.filterdimA2[1],backward_state.filterdimA2[2],backward_state.filterdimA2[3]);
+ run_dconv_drelu_dscale(backward_state.outdimA1h,
+ backward_state.padA1,
+ backward_state.convstrideA,
+ backward_state.dilationA,
+ backward_state.filterdimA2,
+ backward_state.outdimA2h,
+ CUDNN_DATA_HALF,
+ dy1h,
+ w,
+ dy2h,
+ z,
+ relu1h);
+
+ return grad_out1_halo;
+}
+
+at::Tensor bottleneck_backward_wgrad2(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor grad_out2) {
+
+ bool requires_grad = inputs[0].requires_grad();
+
+ std::cout << std::fixed;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ // dgrad
+ at::Half* dy2 = grad_out2.data_ptr<at::Half>();
+
+ // dconv2+drelu1+dscale1
+ at::Half* conv_in = inputs[12].data_ptr<at::Half>();
+
+ // wgrad
+ auto wgrad2 = outputs[2];
+ at::Half* dw2 = wgrad2.data_ptr<at::Half>();
+
+ //printf("outdimA1 = (%d,%d,%d,%d)\n",backward_state.outdimA1[0],backward_state.outdimA1[1],backward_state.outdimA1[2],backward_state.outdimA1[3]);
+ run_dconv(backward_state.outdimA1,
+ backward_state.padA1,
+ backward_state.convstrideA,
+ backward_state.dilationA,
+ backward_state.filterdimA2,
+ backward_state.outdimA2,
+ CUDNN_DATA_HALF,
+ conv_in,
+ dw2,
+ dy2,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
+
+ return wgrad2;
+}
+
+// compute halo cells for input volume of dimension [N,1,W,C] with padding=(0,1) to produce output volume of dimension [N,1,W,C]
+// input and grad_out2_halo tensors are all of same shape
+// output tensor is of shape [Cin,1,3,Cout] (regular filter dims are [Cin,3,3,Cout])
+at::Tensor bottleneck_backward_wgrad2_halo(bool explicit_nhwc, int stride_1X1, std::vector<at::Tensor> inputs, std::vector<at::Tensor> outputs, at::Tensor input, at::Tensor grad_out2_halo) {
+
+ bool requires_grad = inputs[0].requires_grad();
+
+ std::cout << std::fixed;
+ auto output_format = explicit_nhwc ? at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+
+ // dgrad
+ at::Half* dy2 = grad_out2_halo.data_ptr<at::Half>();
+
+ // dconv2+drelu1+dscale1
+ at::Half* conv_in = input.data_ptr<at::Half>();
+
+ // wgrad
+ auto wgrad2_halo = at::empty(backward_state.filterdim2hh, input.type(), output_format);
+ at::Half* dw2 = wgrad2_halo.data_ptr<at::Half>();
+
+ //printf("backward_state.outdimA1hh = {%d,%d,%d,%d}\n",backward_state.outdimA1hh[0],backward_state.outdimA1hh[1],backward_state.outdimA1hh[2],backward_state.outdimA1hh[3]);
+ //printf("backward_state.outdimA2hh = {%d,%d,%d,%d}\n",backward_state.outdimA2hh[0],backward_state.outdimA2hh[1],backward_state.outdimA2hh[2],backward_state.outdimA2hh[3]);
+ //printf("backward_state.filterdim2hh = {%d,%d,%d,%d}\n",backward_state.filterdim2hh[0],backward_state.filterdim2hh[1],backward_state.filterdim2hh[2],backward_state.filterdim2hh[3]);
+ //printf("backward_state.filterdimA2hh = {%d,%d,%d,%d}\n",backward_state.filterdimA2hh[0],backward_state.filterdimA2hh[1],backward_state.filterdimA2hh[2],backward_state.filterdimA2hh[3]);
+ //printf("backward_state.padA2 = {%d,%d}\n",backward_state.padA2[0],backward_state.padA2[1]);
+ run_dconv(backward_state.outdimA1hh, // N,C,1,W
+ backward_state.padA2, // 0, 1
+ backward_state.convstrideA,
+ backward_state.dilationA,
+ backward_state.filterdimA2hh, // Cin,Cout,1,3
+ backward_state.outdimA2hh, // N,C,1,W
+ CUDNN_DATA_HALF,
+ conv_in,
+ dw2,
+ dy2,
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR);
+
+ return wgrad2_halo;
+}
+
+void bottleneck_backward_rest(bool explicit_nhwc, int stride_1X1, std::vector inputs, std::vector