From 4d789a4181849490abbcafc9d54229c78d96fc13 Mon Sep 17 00:00:00 2001
From: commc
Date: Wed, 4 Sep 2024 15:25:08 +0800
Subject: [PATCH 1/4] Customer requirement: CLIP model performance and precision tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../built-in/multimodal/perf_test_aie.py      |  99 +++++++++++++
 .../built-in/multimodal/perf_test_onnx.py     |  97 +++++++++++++
 .../built-in/multimodal/precision_test.py     | 134 ++++++++++++++++++
 3 files changed, 330 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
 create mode 100644 MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
 create mode 100644 MindIE/MindIE-Torch/built-in/multimodal/precision_test.py

diff --git a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
new file mode 100644
index 0000000000..f268c4680d
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
@@ -0,0 +1,99 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+import logging
+import argparse
+import time
+import torch
+import mindietorch
+
+logging.basicConfig(level=logging.INFO)
+
+
+def test(inputs, model, stream, meta=""):
+    # warmup
+    for _ in range(10):
+        with mindietorch.npu.stream(stream):
+            model(*inputs)
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+    start = time.time()
+    for _ in range(num_infer):
+        with mindietorch.npu.stream(stream):
+            model(*inputs)
+        stream.synchronize()
+    end = time.time()
+
+    logging.info("%s latency: %.2f ms", meta, (end - start) / num_infer * 1000)
+    logging.info("%s throughput: %.2f fps", meta, num_infer / (end - start))
+
+
+def test_clip(args):
+    device = f'npu:{args.device_id}'
+    stream = mindietorch.npu.Stream(device)
+    if args.clip_aie_path.endswith(".ts"):
+        model = torch.jit.load(args.clip_aie_path)
+    else:
+        model = torch.load(args.clip_aie_path)
+    model.eval().to(device)
+
+    hf_config_path = os.path.join(args.hf_model_path, "config.json")
+    if not os.path.exists(hf_config_path):
+        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
+    with open(hf_config_path, "r") as f:
+        config_dict = json.load(f)
+
+    image_width = config_dict["vision_config"]["image_size"]
+    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
+    text_input_shape = (args.text_batchsize, args.token_len)
+    input_img = torch.randn(img_input_shape, dtype=torch.float32).to(device)
+    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).to(device)
+    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).to(device)
+    inputs = [input_ids, input_img, attention_mask]
+
+    test(inputs, model, stream, "CLIP")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device-id", type=int, help="NPU device id", default=0)
+    parser.add_argument(
+        "--clip-aie-path",
+        type=str,
+        default="/Path/to/compiled/aie_or_ts_model"
+    )
+    parser.add_argument(
+        "--hf-model-path",
+        default="/Path/to/Huggingface_model_path",
+        type=str,
+        help="Huggingface CLIP Model Path."
+    )
+    parser.add_argument("--text-batchsize", type=int, default=80)
+    parser.add_argument("--image-batchsize", type=int, default=1)
+    parser.add_argument("--token-len", type=int, default=52)
+
+    return parser.parse_args()
+
+
+def main():
+    perf_args = parse_args()
+    mindietorch.set_device(perf_args.device_id)
+    test_clip(perf_args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
new file mode 100644
index 0000000000..106b7b87d2
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
@@ -0,0 +1,97 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+import logging
+import argparse
+import time
+import torch
+import onnxruntime as ort
+
+logging.basicConfig(level=logging.INFO)
+
+
+def test(encoder_path, provider, output_names, onnx_inputs, meta=""):
+    onnx_model = ort.InferenceSession(
+        encoder_path,
+        providers=[provider]
+    )
+
+    # warmup
+    for _ in range(10):
+        onnx_model.run(output_names, onnx_inputs)
+    # performance test
+    num_infer = 100
+    start = time.time()
+    for _ in range(num_infer):
+        onnx_model.run(output_names, onnx_inputs)
+    end = time.time()
+
+    logging.info("%s latency: %.2f ms", meta, (end - start) / num_infer * 1000)
+    logging.info("%s throughput: %.2f fps", meta, num_infer / (end - start))
+
+
+def test_clip(args, provider):
+    hf_config_path = os.path.join(args.hf_model_path, "config.json")
+    if not os.path.exists(hf_config_path):
+        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
+    with open(hf_config_path, "r") as f:
+        config_dict = json.load(f)
+
+    image_width = config_dict["vision_config"]["image_size"]
+    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
+    text_input_shape = (args.text_batchsize, args.token_len)
+    input_img = torch.randn(img_input_shape, dtype=torch.float32).detach().numpy()
+    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).detach().numpy()
+    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).detach().numpy()
+
+    onnx_inputs = {"input_ids": input_ids, "pixel_values": input_img, "attention_mask": attention_mask}
+    output_names = ["image_embeds", "text_embeds", "logits_per_text", "logits_per_image"]
+
+    test(args.onnx_path, provider, output_names, onnx_inputs, "CLIP")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--onnx-path",
+        type=str,
+        default="/Path/to/onnx_model"
+    )
+    parser.add_argument(
+        "--hf-model-path",
+        default="/Path/to/Huggingface_model_path",
+        type=str,
+        help="Huggingface CLIP Model Path."
+    )
+    parser.add_argument("--text-batchsize", type=int, default=80)
+    parser.add_argument("--image-batchsize", type=int, default=1)
+    parser.add_argument("--token-len", type=int, default=52)
+    parser.add_argument("--use-gpu", action="store_true")
+
+    return parser.parse_args()
+
+
+def main():
+    perf_args = parse_args()
+    if perf_args.use_gpu:
+        provider = "CUDAExecutionProvider"
+    else:
+        provider = "CPUExecutionProvider"
+
+    test_clip(perf_args, provider)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py b/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py
new file mode 100644
index 0000000000..8a46d0e965
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py
@@ -0,0 +1,134 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+import logging
+import argparse
+import torch
+import mindietorch
+import torch
+import onnxruntime as ort
+import numpy as np
+import torch.nn.functional as F
+
+logging.basicConfig(level=logging.INFO)
+
+
+def compare_onnx_aie_output(onnx_out, aie_out, sim_threshold=0.99):
+    num_sim = 0
+    for i, (a, b) in enumerate(zip(onnx_out, aie_out)):
+        a = a.reshape(1, -1).astype(np.float32)
+        b = b.reshape(1, -1)
+        sim = F.cosine_similarity(torch.from_numpy(a), b, dim=1)
+        if sim > sim_threshold:
+            num_sim += 1
+        else:
+            logging.info('Output %d similarity: %f', i, sim)
+
+    logging.info('Number of outputs to compare: %d', len(onnx_out))
+    logging.info('Number of outputs with cosine similarity > %.2f: %d', sim_threshold, num_sim)
+
+
+def compare(args):
+    # MindIETorch
+    device = f'npu:{args.device_id}'
+    stream = mindietorch.npu.Stream(device)
+
+    if args.clip_aie_path.endswith(".ts"):
+        aie_model = torch.jit.load(args.clip_aie_path)
+    else:
+        aie_model = torch.load(args.clip_aie_path)
+    aie_model.eval().to(device)
+
+    hf_config_path = os.path.join(args.hf_model_path, "config.json")
+    if not os.path.exists(hf_config_path):
+        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
+    with open(hf_config_path, "r") as f:
+        config_dict = json.load(f)
+
+    image_width = config_dict["vision_config"]["image_size"]
+    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
+    text_input_shape = (args.text_batchsize, args.token_len)
+    input_img = torch.randn(img_input_shape, dtype=torch.float32).to(device)
+    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).to(device)
+    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).to(device)
+    inputs = [input_ids, input_img, attention_mask]
+
+    with mindietorch.npu.stream(stream):
+        aie_out = aie_model(*inputs)
+    stream.synchronize()
+
+    if isinstance(aie_out, tuple) or isinstance(aie_out, list):
+        aie_out = (x.cpu() for x in aie_out)
+    else:
+        aie_out = aie_out.cpu()
+
+    # ONNX
+    input_img = input_img.cpu().detach().numpy()
+    input_ids = input_ids.cpu().detach().numpy()
+    attention_mask = attention_mask.cpu().detach().numpy()
+
+    if args.use_gpu:
+        provider = "CUDAExecutionProvider"
+    else:
+        provider = "CPUExecutionProvider"
+
+    onnx_model = ort.InferenceSession(
+        args.clip_onnx_path,
+        providers=[provider]
+    )
+    onnx_inputs = {"input_ids": input_ids, "pixel_values": input_img, "attention_mask": attention_mask}
+    output_names = ["image_embeds", "text_embeds", "logits_per_text", "logits_per_image"]
+    onnx_out = onnx_model.run(output_names, onnx_inputs)
+
+    compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device-id", type=int, default=0, help="NPU device id")
+    parser.add_argument(
+        "--clip-aie-path",
+        type=str,
+        default="/Path/to/compiled/aie_or_ts_model"
+    )
+    parser.add_argument(
+        "--clip-onnx-path",
+        type=str,
+        default="/Path/to/onnx_model"
+    )
+    parser.add_argument(
+        "--hf-model-path",
+        default="/Path/to/Huggingface_model_path",
+        type=str,
+        help="Huggingface CLIP Model Path."
+    )
+    parser.add_argument("--text-batchsize", type=int, default=80)
+    parser.add_argument("--image-batchsize", type=int, default=1)
+    parser.add_argument("--token-len", type=int, default=52)
+    parser.add_argument('--sim-threshold', type=float, default=0.99)
+    parser.add_argument("--use-gpu", action="store_true")
+
+    return parser.parse_args()
+
+
+def main():
+    compare_args = parse_args()
+    mindietorch.set_device(compare_args.device_id)
+    logging.info('=== Compare the outputs of ONNX and AIE ===')
+    compare(compare_args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
-- 
Gitee

From 0729d7bb1273898d78db95d7d0d525560b391cf1 Mon Sep 17 00:00:00 2001
From: commc
Date: Wed, 4 Sep 2024 18:36:33 +0800
Subject: [PATCH 2/4] Change how the config file is loaded
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../MindIE-Torch/built-in/multimodal/perf_test_aie.py  | 10 +++-------
 .../MindIE-Torch/built-in/multimodal/perf_test_onnx.py |  9 +++------
 .../MindIE-Torch/built-in/multimodal/precision_test.py | 10 +++-------
 3 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
index f268c4680d..ceff8b8648 100644
--- a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
+++ b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
@@ -18,6 +18,7 @@ import argparse
 import time
 import torch
 import mindietorch
+from transformers import AutoConfig
 
 logging.basicConfig(level=logging.INFO)
 
@@ -50,14 +51,9 @@ def test_clip(args):
     else:
         model = torch.load(args.clip_aie_path)
     model.eval().to(device)
+    config = AutoConfig.from_pretrained(args.hf_model_path)
 
-    hf_config_path = os.path.join(args.hf_model_path, "config.json")
-    if not os.path.exists(hf_config_path):
-        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
-    with open(hf_config_path, "r") as f:
-        config_dict = json.load(f)
-
-    image_width = config_dict["vision_config"]["image_size"]
+    image_width = config.vision_config.image_size
     img_input_shape = (args.image_batchsize, 3, image_width, image_width)
     text_input_shape = (args.text_batchsize, args.token_len)
     input_img = torch.randn(img_input_shape, dtype=torch.float32).to(device)
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
index 106b7b87d2..3dbc7f9d67 100644
--- a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
+++ b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
@@ -18,6 +18,7 @@ import argparse
 import time
 import torch
 import onnxruntime as ort
+from transformers import AutoConfig
 
 logging.basicConfig(level=logging.INFO)
 
@@ -43,13 +44,9 @@
 
 
 def test_clip(args, provider):
-    hf_config_path = os.path.join(args.hf_model_path, "config.json")
-    if not os.path.exists(hf_config_path):
-        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
-    with open(hf_config_path, "r") as f:
-        config_dict = json.load(f)
+    config = AutoConfig.from_pretrained(args.hf_model_path)
 
-    image_width = config_dict["vision_config"]["image_size"]
+    image_width = config.vision_config.image_size
     img_input_shape = (args.image_batchsize, 3, image_width, image_width)
     text_input_shape = (args.text_batchsize, args.token_len)
     input_img = torch.randn(img_input_shape, dtype=torch.float32).detach().numpy()
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py b/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py
index 8a46d0e965..6995367ed9 100644
--- a/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py
+++ b/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py
@@ -21,6 +21,7 @@ import torch
 import onnxruntime as ort
 import numpy as np
 import torch.nn.functional as F
+from transformers import AutoConfig
 
 logging.basicConfig(level=logging.INFO)
 
@@ -50,14 +51,9 @@ def compare(args):
     else:
         aie_model = torch.load(args.clip_aie_path)
     aie_model.eval().to(device)
-
-    hf_config_path = os.path.join(args.hf_model_path, "config.json")
-    if not os.path.exists(hf_config_path):
-        raise FileNotFoundError(f"config.json not found at {args.hf_model_path}: {hf_config_path}")
-    with open(hf_config_path, "r") as f:
-        config_dict = json.load(f)
+    config = AutoConfig.from_pretrained(args.hf_model_path)
 
-    image_width = config_dict["vision_config"]["image_size"]
+    image_width = config.vision_config.image_size
     img_input_shape = (args.image_batchsize, 3, image_width, image_width)
     text_input_shape = (args.text_batchsize, args.token_len)
     input_img = torch.randn(img_input_shape, dtype=torch.float32).to(device)
-- 
Gitee

From 2ee22f5ed1d87c290541118e3d22e953e9e4450d Mon Sep 17 00:00:00 2001
From: commc
Date: Thu, 5 Sep 2024 17:08:09 +0800
Subject: [PATCH 3/4] Rework to the correct file hierarchy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../built-in/multimodal/CLIP/perf_test_aie.py  |  95 +++++++++++++
 .../multimodal/CLIP/perf_test_onnx.py          |  94 +++++++++++++
 .../multimodal/CLIP/precision_test.py          | 130 ++++++++++++++++++
 3 files changed, 319 insertions(+)
 create mode 100644 MindIE/MindIE-Torch/built-in/multimodal/CLIP/perf_test_aie.py
 create mode 100644 MindIE/MindIE-Torch/built-in/multimodal/CLIP/perf_test_onnx.py
 create mode 100644 MindIE/MindIE-Torch/built-in/multimodal/CLIP/precision_test.py

diff --git a/MindIE/MindIE-Torch/built-in/multimodal/CLIP/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/multimodal/CLIP/perf_test_aie.py
new file mode 100644
index 0000000000..ceff8b8648
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/multimodal/CLIP/perf_test_aie.py
@@ -0,0 +1,95 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+import logging
+import argparse
+import time
+import torch
+import mindietorch
+from transformers import AutoConfig
+
+logging.basicConfig(level=logging.INFO)
+
+
+def test(inputs, model, stream, meta=""):
+    # warmup
+    for _ in range(10):
+        with mindietorch.npu.stream(stream):
+            model(*inputs)
+        stream.synchronize()
+
+    # performance test
+    num_infer = 100
+    start = time.time()
+    for _ in range(num_infer):
+        with mindietorch.npu.stream(stream):
+            model(*inputs)
+        stream.synchronize()
+    end = time.time()
+
+    logging.info("%s latency: %.2f ms", meta, (end - start) / num_infer * 1000)
+    logging.info("%s throughput: %.2f fps", meta, num_infer / (end - start))
+
+
+def test_clip(args):
+    device = f'npu:{args.device_id}'
+    stream = mindietorch.npu.Stream(device)
+    if args.clip_aie_path.endswith(".ts"):
+        model = torch.jit.load(args.clip_aie_path)
+    else:
+        model = torch.load(args.clip_aie_path)
+    model.eval().to(device)
+    config = AutoConfig.from_pretrained(args.hf_model_path)
+
+    image_width = config.vision_config.image_size
+    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
+    text_input_shape = (args.text_batchsize, args.token_len)
+    input_img = torch.randn(img_input_shape, dtype=torch.float32).to(device)
+    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).to(device)
+    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).to(device)
+    inputs = [input_ids, input_img, attention_mask]
+
+    test(inputs, model, stream, "CLIP")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device-id", type=int, help="NPU device id", default=0)
+    parser.add_argument(
+        "--clip-aie-path",
+        type=str,
+        default="/Path/to/compiled/aie_or_ts_model"
+    )
+    parser.add_argument(
+        "--hf-model-path",
+        default="/Path/to/Huggingface_model_path",
+        type=str,
+        help="Huggingface CLIP Model Path."
+    )
+    parser.add_argument("--text-batchsize", type=int, default=80)
+    parser.add_argument("--image-batchsize", type=int, default=1)
+    parser.add_argument("--token-len", type=int, default=52)
+
+    return parser.parse_args()
+
+
+def main():
+    perf_args = parse_args()
+    mindietorch.set_device(perf_args.device_id)
+    test_clip(perf_args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/CLIP/perf_test_onnx.py b/MindIE/MindIE-Torch/built-in/multimodal/CLIP/perf_test_onnx.py
new file mode 100644
index 0000000000..3dbc7f9d67
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/multimodal/CLIP/perf_test_onnx.py
@@ -0,0 +1,94 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+import logging
+import argparse
+import time
+import torch
+import onnxruntime as ort
+from transformers import AutoConfig
+
+logging.basicConfig(level=logging.INFO)
+
+
+def test(encoder_path, provider, output_names, onnx_inputs, meta=""):
+    onnx_model = ort.InferenceSession(
+        encoder_path,
+        providers=[provider]
+    )
+
+    # warmup
+    for _ in range(10):
+        onnx_model.run(output_names, onnx_inputs)
+    # performance test
+    num_infer = 100
+    start = time.time()
+    for _ in range(num_infer):
+        onnx_model.run(output_names, onnx_inputs)
+    end = time.time()
+
+    logging.info("%s latency: %.2f ms", meta, (end - start) / num_infer * 1000)
+    logging.info("%s throughput: %.2f fps", meta, num_infer / (end - start))
+
+
+def test_clip(args, provider):
+    config = AutoConfig.from_pretrained(args.hf_model_path)
+
+    image_width = config.vision_config.image_size
+    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
+    text_input_shape = (args.text_batchsize, args.token_len)
+    input_img = torch.randn(img_input_shape, dtype=torch.float32).detach().numpy()
+    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).detach().numpy()
+    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).detach().numpy()
+
+    onnx_inputs = {"input_ids": input_ids, "pixel_values": input_img, "attention_mask": attention_mask}
+    output_names = ["image_embeds", "text_embeds", "logits_per_text", "logits_per_image"]
+
+    test(args.onnx_path, provider, output_names, onnx_inputs, "CLIP")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--onnx-path",
+        type=str,
+        default="/Path/to/onnx_model"
+    )
+    parser.add_argument(
+        "--hf-model-path",
+        default="/Path/to/Huggingface_model_path",
+        type=str,
+        help="Huggingface CLIP Model Path."
+    )
+    parser.add_argument("--text-batchsize", type=int, default=80)
+    parser.add_argument("--image-batchsize", type=int, default=1)
+    parser.add_argument("--token-len", type=int, default=52)
+    parser.add_argument("--use-gpu", action="store_true")
+
+    return parser.parse_args()
+
+
+def main():
+    perf_args = parse_args()
+    if perf_args.use_gpu:
+        provider = "CUDAExecutionProvider"
+    else:
+        provider = "CPUExecutionProvider"
+
+    test_clip(perf_args, provider)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/CLIP/precision_test.py b/MindIE/MindIE-Torch/built-in/multimodal/CLIP/precision_test.py
new file mode 100644
index 0000000000..6995367ed9
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/multimodal/CLIP/precision_test.py
@@ -0,0 +1,130 @@
+# Copyright 2024 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+import logging
+import argparse
+import torch
+import mindietorch
+import torch
+import onnxruntime as ort
+import numpy as np
+import torch.nn.functional as F
+from transformers import AutoConfig
+
+logging.basicConfig(level=logging.INFO)
+
+
+def compare_onnx_aie_output(onnx_out, aie_out, sim_threshold=0.99):
+    num_sim = 0
+    for i, (a, b) in enumerate(zip(onnx_out, aie_out)):
+        a = a.reshape(1, -1).astype(np.float32)
+        b = b.reshape(1, -1)
+        sim = F.cosine_similarity(torch.from_numpy(a), b, dim=1)
+        if sim > sim_threshold:
+            num_sim += 1
+        else:
+            logging.info('Output %d similarity: %f', i, sim)
+
+    logging.info('Number of outputs to compare: %d', len(onnx_out))
+    logging.info('Number of outputs with cosine similarity > %.2f: %d', sim_threshold, num_sim)
+
+
+def compare(args):
+    # MindIETorch
+    device = f'npu:{args.device_id}'
+    stream = mindietorch.npu.Stream(device)
+
+    if args.clip_aie_path.endswith(".ts"):
+        aie_model = torch.jit.load(args.clip_aie_path)
+    else:
+        aie_model = torch.load(args.clip_aie_path)
+    aie_model.eval().to(device)
+    config = AutoConfig.from_pretrained(args.hf_model_path)
+
+    image_width = config.vision_config.image_size
+    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
+    text_input_shape = (args.text_batchsize, args.token_len)
+    input_img = torch.randn(img_input_shape, dtype=torch.float32).to(device)
+    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).to(device)
+    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).to(device)
+    inputs = [input_ids, input_img, attention_mask]
+
+    with mindietorch.npu.stream(stream):
+        aie_out = aie_model(*inputs)
+    stream.synchronize()
+
+    if isinstance(aie_out, tuple) or isinstance(aie_out, list):
+        aie_out = (x.cpu() for x in aie_out)
+    else:
+        aie_out = aie_out.cpu()
+
+    # ONNX
+    input_img = input_img.cpu().detach().numpy()
+    input_ids = input_ids.cpu().detach().numpy()
+    attention_mask = attention_mask.cpu().detach().numpy()
+
+    if args.use_gpu:
+        provider = "CUDAExecutionProvider"
+    else:
+        provider = "CPUExecutionProvider"
+
+    onnx_model = ort.InferenceSession(
+        args.clip_onnx_path,
+        providers=[provider]
+    )
+    onnx_inputs = {"input_ids": input_ids, "pixel_values": input_img, "attention_mask": attention_mask}
+    output_names = ["image_embeds", "text_embeds", "logits_per_text", "logits_per_image"]
+    onnx_out = onnx_model.run(output_names, onnx_inputs)
+
+    compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device-id", type=int, default=0, help="NPU device id")
+    parser.add_argument(
+        "--clip-aie-path",
+        type=str,
+        default="/Path/to/compiled/aie_or_ts_model"
+    )
+    parser.add_argument(
+        "--clip-onnx-path",
+        type=str,
+        default="/Path/to/onnx_model"
+    )
+    parser.add_argument(
+        "--hf-model-path",
+        default="/Path/to/Huggingface_model_path",
+        type=str,
+        help="Huggingface CLIP Model Path."
+    )
+    parser.add_argument("--text-batchsize", type=int, default=80)
+    parser.add_argument("--image-batchsize", type=int, default=1)
+    parser.add_argument("--token-len", type=int, default=52)
+    parser.add_argument('--sim-threshold', type=float, default=0.99)
+    parser.add_argument("--use-gpu", action="store_true")
+
+    return parser.parse_args()
+
+
+def main():
+    compare_args = parse_args()
+    mindietorch.set_device(compare_args.device_id)
+    logging.info('=== Compare the outputs of ONNX and AIE ===')
+    compare(compare_args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
-- 
Gitee

From b6d075ea15848d2f9cc163d00a30b01049571d57 Mon Sep 17 00:00:00 2001
From: commc
Date: Thu, 5 Sep 2024 17:09:53 +0800
Subject: [PATCH 4/4] Remove files from the incorrect directory structure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../built-in/multimodal/perf_test_aie.py      |  95 -------------
 .../built-in/multimodal/perf_test_onnx.py     |  94 -------------
 .../built-in/multimodal/precision_test.py     | 130 ------------------
 3 files changed, 319 deletions(-)
 delete mode 100644 MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
 delete mode 100644 MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
 delete mode 100644 MindIE/MindIE-Torch/built-in/multimodal/precision_test.py

diff --git a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
deleted file mode 100644
index ceff8b8648..0000000000
--- a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_aie.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import json
-import logging
-import argparse
-import time
-import torch
-import mindietorch
-from transformers import AutoConfig
-
-logging.basicConfig(level=logging.INFO)
-
-
-def test(inputs, model, stream, meta=""):
-    # warmup
-    for _ in range(10):
-        with mindietorch.npu.stream(stream):
-            model(*inputs)
-        stream.synchronize()
-
-    # performance test
-    num_infer = 100
-    start = time.time()
-    for _ in range(num_infer):
-        with mindietorch.npu.stream(stream):
-            model(*inputs)
-        stream.synchronize()
-    end = time.time()
-
-    logging.info("%s latency: %.2f ms", meta, (end - start) / num_infer * 1000)
-    logging.info("%s throughput: %.2f fps", meta, num_infer / (end - start))
-
-
-def test_clip(args):
-    device = f'npu:{args.device_id}'
-    stream = mindietorch.npu.Stream(device)
-    if args.clip_aie_path.endswith(".ts"):
-        model = torch.jit.load(args.clip_aie_path)
-    else:
-        model = torch.load(args.clip_aie_path)
-    model.eval().to(device)
-    config = AutoConfig.from_pretrained(args.hf_model_path)
-
-    image_width = config.vision_config.image_size
-    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
-    text_input_shape = (args.text_batchsize, args.token_len)
-    input_img = torch.randn(img_input_shape, dtype=torch.float32).to(device)
-    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).to(device)
-    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).to(device)
-    inputs = [input_ids, input_img, attention_mask]
-
-    test(inputs, model, stream, "CLIP")
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--device-id", type=int, help="NPU device id", default=0)
-    parser.add_argument(
-        "--clip-aie-path",
-        type=str,
-        default="/Path/to/compiled/aie_or_ts_model"
-    )
-    parser.add_argument(
-        "--hf-model-path",
-        default="/Path/to/Huggingface_model_path",
-        type=str,
-        help="Huggingface CLIP Model Path."
-    )
-    parser.add_argument("--text-batchsize", type=int, default=80)
-    parser.add_argument("--image-batchsize", type=int, default=1)
-    parser.add_argument("--token-len", type=int, default=52)
-
-    return parser.parse_args()
-
-
-def main():
-    perf_args = parse_args()
-    mindietorch.set_device(perf_args.device_id)
-    test_clip(perf_args)
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py b/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
deleted file mode 100644
index 3dbc7f9d67..0000000000
--- a/MindIE/MindIE-Torch/built-in/multimodal/perf_test_onnx.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import json
-import logging
-import argparse
-import time
-import torch
-import onnxruntime as ort
-from transformers import AutoConfig
-
-logging.basicConfig(level=logging.INFO)
-
-
-def test(encoder_path, provider, output_names, onnx_inputs, meta=""):
-    onnx_model = ort.InferenceSession(
-        encoder_path,
-        providers=[provider]
-    )
-
-    # warmup
-    for _ in range(10):
-        onnx_model.run(output_names, onnx_inputs)
-    # performance test
-    num_infer = 100
-    start = time.time()
-    for _ in range(num_infer):
-        onnx_model.run(output_names, onnx_inputs)
-    end = time.time()
-
-    logging.info("%s latency: %.2f ms", meta, (end - start) / num_infer * 1000)
-    logging.info("%s throughput: %.2f fps", meta, num_infer / (end - start))
-
-
-def test_clip(args, provider):
-    config = AutoConfig.from_pretrained(args.hf_model_path)
-
-    image_width = config.vision_config.image_size
-    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
-    text_input_shape = (args.text_batchsize, args.token_len)
-    input_img = torch.randn(img_input_shape, dtype=torch.float32).detach().numpy()
-    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).detach().numpy()
-    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).detach().numpy()
-
-    onnx_inputs = {"input_ids": input_ids, "pixel_values": input_img, "attention_mask": attention_mask}
-    output_names = ["image_embeds", "text_embeds", "logits_per_text", "logits_per_image"]
-
-    test(args.onnx_path, provider, output_names, onnx_inputs, "CLIP")
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--onnx-path",
-        type=str,
-        default="/Path/to/onnx_model"
-    )
-    parser.add_argument(
-        "--hf-model-path",
-        default="/Path/to/Huggingface_model_path",
-        type=str,
-        help="Huggingface CLIP Model Path."
-    )
-    parser.add_argument("--text-batchsize", type=int, default=80)
-    parser.add_argument("--image-batchsize", type=int, default=1)
-    parser.add_argument("--token-len", type=int, default=52)
-    parser.add_argument("--use-gpu", action="store_true")
-
-    return parser.parse_args()
-
-
-def main():
-    perf_args = parse_args()
-    if perf_args.use_gpu:
-        provider = "CUDAExecutionProvider"
-    else:
-        provider = "CPUExecutionProvider"
-
-    test_clip(perf_args, provider)
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py b/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py
deleted file mode 100644
index 6995367ed9..0000000000
--- a/MindIE/MindIE-Torch/built-in/multimodal/precision_test.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright 2024 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import json
-import logging
-import argparse
-import torch
-import mindietorch
-import torch
-import onnxruntime as ort
-import numpy as np
-import torch.nn.functional as F
-from transformers import AutoConfig
-
-logging.basicConfig(level=logging.INFO)
-
-
-def compare_onnx_aie_output(onnx_out, aie_out, sim_threshold=0.99):
-    num_sim = 0
-    for i, (a, b) in enumerate(zip(onnx_out, aie_out)):
-        a = a.reshape(1, -1).astype(np.float32)
-        b = b.reshape(1, -1)
-        sim = F.cosine_similarity(torch.from_numpy(a), b, dim=1)
-        if sim > sim_threshold:
-            num_sim += 1
-        else:
-            logging.info('Output %d similarity: %f', i, sim)
-
-    logging.info('Number of outputs to compare: %d', len(onnx_out))
-    logging.info('Number of outputs with cosine similarity > %.2f: %d', sim_threshold, num_sim)
-
-
-def compare(args):
-    # MindIETorch
-    device = f'npu:{args.device_id}'
-    stream = mindietorch.npu.Stream(device)
-
-    if args.clip_aie_path.endswith(".ts"):
-        aie_model = torch.jit.load(args.clip_aie_path)
-    else:
-        aie_model = torch.load(args.clip_aie_path)
-    aie_model.eval().to(device)
-    config = AutoConfig.from_pretrained(args.hf_model_path)
-
-    image_width = config.vision_config.image_size
-    img_input_shape = (args.image_batchsize, 3, image_width, image_width)
-    text_input_shape = (args.text_batchsize, args.token_len)
-    input_img = torch.randn(img_input_shape, dtype=torch.float32).to(device)
-    input_ids = torch.randint(high=1000, size=text_input_shape, dtype=torch.int32).to(device)
-    attention_mask = torch.ones(text_input_shape, dtype=torch.int32).to(device)
-    inputs = [input_ids, input_img, attention_mask]
-
-    with mindietorch.npu.stream(stream):
-        aie_out = aie_model(*inputs)
-    stream.synchronize()
-
-    if isinstance(aie_out, tuple) or isinstance(aie_out, list):
-        aie_out = (x.cpu() for x in aie_out)
-    else:
-        aie_out = aie_out.cpu()
-
-    # ONNX
-    input_img = input_img.cpu().detach().numpy()
-    input_ids = input_ids.cpu().detach().numpy()
-    attention_mask = attention_mask.cpu().detach().numpy()
-
-    if args.use_gpu:
-        provider = "CUDAExecutionProvider"
-    else:
-        provider = "CPUExecutionProvider"
-
-    onnx_model = ort.InferenceSession(
-        args.clip_onnx_path,
-        providers=[provider]
-    )
-    onnx_inputs = {"input_ids": input_ids, "pixel_values": input_img, "attention_mask": attention_mask}
-    output_names = ["image_embeds", "text_embeds", "logits_per_text", "logits_per_image"]
-    onnx_out = onnx_model.run(output_names, onnx_inputs)
-
-    compare_onnx_aie_output(onnx_out, aie_out, args.sim_threshold)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--device-id", type=int, default=0, help="NPU device id")
-    parser.add_argument(
-        "--clip-aie-path",
-        type=str,
-        default="/Path/to/compiled/aie_or_ts_model"
-    )
-    parser.add_argument(
-        "--clip-onnx-path",
-        type=str,
-        default="/Path/to/onnx_model"
-    )
-    parser.add_argument(
-        "--hf-model-path",
-        default="/Path/to/Huggingface_model_path",
-        type=str,
-        help="Huggingface CLIP Model Path."
-    )
-    parser.add_argument("--text-batchsize", type=int, default=80)
-    parser.add_argument("--image-batchsize", type=int, default=1)
-    parser.add_argument("--token-len", type=int, default=52)
-    parser.add_argument('--sim-threshold', type=float, default=0.99)
-    parser.add_argument("--use-gpu", action="store_true")
-
-    return parser.parse_args()
-
-
-def main():
-    compare_args = parse_args()
-    mindietorch.set_device(compare_args.device_id)
-    logging.info('=== Compare the outputs of ONNX and AIE ===')
-    compare(compare_args)
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
-- 
Gitee