diff --git a/AscendIE/TorchAIE/built-in/cv/detection/ssd/README.md b/AscendIE/TorchAIE/built-in/cv/detection/ssd/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bdb8fefd76e2f47d462e30747371bf0f7f230080
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/cv/detection/ssd/README.md
@@ -0,0 +1,223 @@
+# SSD Model - Inference Guide
+
+- [Overview](#ZH-CN_TOPIC_0000001172161501)
+
+  - [Input and Output Data](#ZH-CN_TOPIC_0000001126281702)
+
+- [Inference Environment](#ZH-CN_TOPIC_0000001126281702)
+
+- [Quick Start](#ZH-CN_TOPIC_0000001126281700)
+
+  - [Getting the Source Code](#section4622531142816)
+  - [Preparing the Dataset](#section183221994411)
+  - [Model Inference](#section741711594517)
+
+- [Inference Performance & Accuracy](#ZH-CN_TOPIC_0000001172201573)
+
+  ******
+
+# Overview
+
+SSD casts detection as a regression problem, performing object localization and classification in a single forward pass. Building on the anchors of Faster R-CNN, it introduces the similar concept of prior boxes. The algorithm modifies the traditional VGG16 backbone: the FC6 and FC7 layers are converted into convolutional layers, and all Dropout layers as well as FC8 are removed. It also adds a feature-pyramid-style detection scheme that predicts objects on feature maps with different receptive fields.
+
+- Reference implementation:
+
+  ```shell
+  url=https://github.com/open-mmlab/mmdetection.git
+  branch=master
+  commit_id=a21eb25535f31634cef332b09fc27d28956fb24b
+  model_name=ssd
+  ```
+
+## Input and Output Data
+
+- Input data
+
+  | Input  | Data type | Shape                     | Layout |
+  | ------ | --------- | ------------------------- | ------ |
+  | input  | RGB_FP32  | batchsize x 3 x 300 x 300 | NCHW   |
+
+- Output data
+
+  | Output | Data type | Shape                 | Layout |
+  | ------ | --------- | --------------------- | ------ |
+  | boxes  | FLOAT32   | batchsize x 8732 x 4  | ND     |
+  | labels | FLOAT32   | batchsize x 8732 x 80 | ND     |
+
+# Inference Environment
+
+- This model is run by switching between two environments. The environment used for inference (including the plugin and the driver) is listed below.
+
+  **Table 1** Version compatibility
+
+  | Component         | Version          | Setup guide |
+  |-------------------|------------------|-------------|
+  | Firmware & driver | 23.0.rc1         | [PyTorch inference environment setup](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/pies) |
+  | CANN              | 7.0.RC1.alpha003 | - |
+  | Python            | 3.9.11           | - |
+  | PyTorch           | 2.0.1            | - |
+  | Torch_AIE         | 6.3.rc2          | - |
+
+- A second environment is used for pre-processing, post-processing and model export. It is recommended to build a conda virtual environment for it and install the required packages there:
+
+```
+conda create --name ssd python=3.7.16
+```
+
+# Quick Start
+
+## Getting the Source Code
+
+1. Install the dependencies.
+   ```shell
+   pip3 install -r ssd-requirements.txt
+   ```
+
+2. Get the SSD source code and patch mmdetection.
+   ```shell
+   git clone https://github.com/open-mmlab/mmdetection.git
+   cd mmdetection
+   git reset --hard a21eb25535f31634cef332b09fc27d28956fb24b
+   patch -p1 < ../update_ssd_mmdet.diff
+   pip install -v -e .
+   cd ..
+   ```
+   Place the `onnx_helper.py` file under the `mmdetection/mmdet/core/export` directory.
+
+3. Then run the following (on a first install this step may take a long time):
+   ```
+   pip3 install --no-cache-dir mmcv-full==1.2.7
+   ```
+
+## Preparing the Dataset
+
+1. Get the original dataset. (For extraction, refer to `tar -xvf *.tar` and `unzip *.zip`.)
+
+   The inference dataset is [coco_val_2017](http://images.cocodataset.org). After downloading, store it at `dataset=/root/datasets/coco`.
+
+   Directory structure:
+
+   ```
+   ├── coco
+   │   ├── val2017
+   │   ├── annotations
+   │       ├── instances_val2017.json
+   ```
+
+2. Preprocess the data (use the torch 1.8 environment).
+
+   Convert the original dataset into the binary inputs expected by the model by running the `ssd_preprocess.py` script.
+
+   ```shell
+   python ssd_preprocess.py \
+       --image_folder_path $dataset/val2017 \
+       --bin_folder_path val2017_ssd_bin
+   ```
+
+   - Parameters:
+
+     - --image_folder_path: path to the original validation images (.jpg).
+     - --bin_folder_path: output path for the binary files (.bin).
+
+   One binary file is generated per image.
+
+3. Generate the dataset info file (use the torch 1.8 environment).
+
+   Run the `get_info.py` script to generate the image info file; the line format it must produce is illustrated in the sketch after this step.
+   ```shell
+   python get_info.py jpg $dataset/val2017 coco2017_ssd_jpg.info
+   ```
+
+   - Parameters:
+
+     - First argument: format of the dataset files.
+     - Second argument: relative path to the preprocessed data files.
+     - Third argument: name of the info file to generate.
+
+   After the script completes, `coco2017_ssd_jpg.info` is generated in the current directory.
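+   The info file is consumed later by `ssd_postprocess.py`, which parses each
+   line as the whitespace-separated fields `index path width height`. The
+   snippet below is an illustrative sketch of that format only (it is not the
+   actual `get_info.py`); the image directory is an assumed example path.
+
+   ```python
+   import os
+   from PIL import Image
+
+   # Sketch: write one "<index> <path> <width> <height>" line per image.
+   image_dir = "/root/datasets/coco/val2017"   # assumed dataset location
+   names = sorted(n for n in os.listdir(image_dir) if n.endswith(".jpg"))
+   with open("coco2017_ssd_jpg.info", "w") as info:
+       for idx, name in enumerate(names):
+           path = os.path.join(image_dir, name)
+           with Image.open(path) as img:
+               width, height = img.size
+           info.write("{} {} {} {}\n".format(idx, path, width, height))
+   ```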
+## Model Inference
+
+1. Model conversion.
+
+   Use PyTorch to convert the .pth weight file into a .ts (TorchScript) file.
+   1. Get the weight file.
+
+      Download the trained weights: [ssd300_coco_20200307-a92d2092.pth](http://download.openmmlab.com/mmdetection/v2.0/ssd/ssd300_coco/ssd300_coco_20200307-a92d2092.pth)
+
+   2. Export the TorchScript model.
+
+      Use `export.py` to export the .ts file (use the torch 1.8 environment).
+
+      ```
+      python export.py \
+          --checkpoint ./ssd300_coco_20200307-a92d2092.pth \
+          --mmdet_path ./mmdetection \
+          --shape=300 \
+          --mean 123.675 116.28 103.53 \
+          --std 1 1 1
+      ```
+
+      - Parameters:
+
+        - checkpoint: path to the original .pth file
+        - mmdet_path: path to the cloned mmdetection repository
+        - shape: input image size
+
+2. Run inference and verification.
+
+   1. Run inference (use the torch 2.0.1 environment).
+      ```shell
+      python3 acc_dataset.py --ts_path ./ssd300_coco.ts --img_bin_path ./coco2017_bin --save_dir ./pyinfer_res_npu
+      ```
+
+      - Parameters:
+
+        - ts_path: path to the exported .ts file
+        - img_bin_path: path to the folder of .bin files produced by preprocessing
+        - save_dir: directory in which to save the inference results
+
+   2. Verify accuracy.
+
+      Run coco_eval.py to evaluate mAP accuracy (use the torch 1.8 environment):
+
+      ```shell
+      det_path=postprocess_out
+      python ssd_postprocess.py \
+          --bin_data_path=out/2022_*/ \
+          --score_threshold=0.02 \
+          --test_annotation=coco2017_ssd_jpg.info \
+          --nms_pre 200 \
+          --det_results_path ${det_path}
+      python txt_to_json.py --npu_txt_path ${det_path}
+      python coco_eval.py --ground_truth /root/datasets/coco/annotations/instances_val2017.json
+      ```
+
+      - Parameters:
+
+        - --bin_data_path: path where the inference results are stored.
+        - --score_threshold: score threshold.
+        - --test_annotation: info file of the original images.
+        - --nms_pre: maximum number of boxes kept per image before NMS.
+        - --det_results_path: output path of the post-processing.
+        - --npu_txt_path: path of the post-processed .txt results.
+        - --ground_truth: path to the instances_val2017.json file.
+
+# Inference Performance & Accuracy
+
+Inference is invoked through the ACL interface; refer to the following data for performance and accuracy.
+
+|                | mAP      |
+| -------------- | -------- |
+| 310P3 accuracy | mAP=25.4 |
+
+
+| Throughput | 310*4    | 310P3    | 310B1 |
+| ---------- | -------- | -------- | ----- |
+| bs1        | 179.194  | 298.5514 | 75.42 |
+| bs4        | 207.596  | 337.0112 | 77.9  |
+| bs8        | 211.7312 | 323.5662 | 79.77 |
+| bs16       | 211.288  | 318.1392 | 77.84 |
+| bs32       | 200.2948 | 318.7303 | 79.78 |
+| bs64       | 196.4192 | 313.0790 | 48.36 |
+| Best batch | 211.7312 | 337.0112 | 79.77 |
\ No newline at end of file
diff --git a/AscendIE/TorchAIE/built-in/cv/detection/ssd/onnx_helper.py b/AscendIE/TorchAIE/built-in/cv/detection/ssd/onnx_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..9abd220bafabe507e127b871daa316658aa8c066
--- /dev/null
+++ b/AscendIE/TorchAIE/built-in/cv/detection/ssd/onnx_helper.py
@@ -0,0 +1,245 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+
+import torch
+
+
+def dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape):
+    """Clip boxes dynamically for onnx.
+
+    Since torch.clamp cannot have dynamic `min` and `max`, we scale the
+    boxes by 1/max_shape and clamp in the range [0, 1].
+
+    Args:
+        x1 (Tensor): The x1 for bounding boxes.
+        y1 (Tensor): The y1 for bounding boxes.
+        x2 (Tensor): The x2 for bounding boxes.
+        y2 (Tensor): The y2 for bounding boxes.
+        max_shape (Tensor, torch.Size, list or tuple): The (H, W) of the
+            original image.
+    Returns:
+        tuple(Tensor): The clipped x1, y1, x2, y2.
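+
+    Example (illustrative sketch added here for clarity, not part of the
+    original OpenMMLab code; assumes a 300x300 network input)::
+
+        x1 = torch.tensor([-5.0, 120.0])
+        y1 = torch.tensor([10.0, -3.0])
+        x2 = torch.tensor([310.0, 150.0])
+        y2 = torch.tensor([290.0, 320.0])
+        x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, (300, 300))
+        # x1 -> [0., 120.], y1 -> [10., 0.], x2 -> [300., 150.], y2 -> [290., 300.]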
+ """ + # assert isinstance( + # max_shape, + # torch.Tensor), '`max_shape` should be tensor of (h,w) for onnx, got {}'.format(max_shape.__class__.__name__) + + assert isinstance(max_shape, (torch.Tensor, torch.Size, list, tuple)), '`max_shape` should be ' + \ + 'torch.Tensor/torch.Size/list/tuple of (h, w) for onnx, got {}'.format(max_shape.__class__.__name__) + if not isinstance(max_shape, torch.Tensor): + max_shape = torch.tensor(max_shape, dtype=x1.dtype, device=x1.device) + else: + max_shape = max_shape.type_as(x1) + + # scale by 1/max_shape + x1 = x1 / max_shape[1] + y1 = y1 / max_shape[0] + x2 = x2 / max_shape[1] + y2 = y2 / max_shape[0] + + # clamp [0, 1] + x1 = torch.clamp(x1, 0, 1) + y1 = torch.clamp(y1, 0, 1) + x2 = torch.clamp(x2, 0, 1) + y2 = torch.clamp(y2, 0, 1) + + # scale back + x1 = x1 * max_shape[1] + y1 = y1 * max_shape[0] + x2 = x2 * max_shape[1] + y2 = y2 * max_shape[0] + return x1, y1, x2, y2 + + +def get_k_for_topk(k, size): + """Get k of TopK for onnx exporting. + + The K of TopK in TensorRT should not be a Tensor, while in ONNX Runtime + it could be a Tensor.Due to dynamic shape feature, we have to decide + whether to do TopK and what K it should be while exporting to ONNX. + If returned K is less than zero, it means we do not have to do + TopK operation. + + Args: + k (int or Tensor): The set k value for nms from config file. + size (Tensor or torch.Size): The number of elements of \ + TopK's input tensor + Returns: + tuple: (int or Tensor): The final K for TopK. + """ + ret_k = -1 + if k <= 0 or size <= 0: + return ret_k + if torch.onnx.is_in_onnx_export(): + is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT' + if is_trt_backend: + # TensorRT does not support dynamic K with TopK op + if 0 < k < size: + ret_k = k + else: + # Always keep topk op for dynamic input in onnx for ONNX Runtime + ret_k = torch.where(k < size, k, size) + elif k < size: + ret_k = k + else: + # ret_k is -1 + pass + return ret_k + + +def add_dummy_nms_for_onnx(boxes, + scores, + max_output_boxes_per_class=1000, + iou_threshold=0.5, + score_threshold=0.05, + pre_top_k=-1, + after_top_k=-1, + labels=None): + """Create a dummy onnx::NonMaxSuppression op while exporting to ONNX. + + This function helps exporting to onnx with batch and multiclass NMS op. + It only supports class-agnostic detection results. That is, the scores + is of shape (N, num_bboxes, num_classes) and the boxes is of shape + (N, num_boxes, 4). + + Args: + boxes (Tensor): The bounding boxes of shape [N, num_boxes, 4] + scores (Tensor): The detection scores of shape + [N, num_boxes, num_classes] + max_output_boxes_per_class (int): Maximum number of output + boxes per class of nms. Defaults to 1000. + iou_threshold (float): IOU threshold of nms. Defaults to 0.5 + score_threshold (float): score threshold of nms. + Defaults to 0.05. + pre_top_k (bool): Number of top K boxes to keep before nms. + Defaults to -1. + after_top_k (int): Number of top K boxes to keep after nms. + Defaults to -1. + labels (Tensor, optional): It not None, explicit labels would be used. + Otherwise, labels would be automatically generated using + num_classed. Defaults to None. + + Returns: + tuple[Tensor, Tensor]: dets of shape [N, num_det, 5] + and class labels of shape [N, num_det]. 
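+
+    Example (illustrative sketch added here for clarity, not part of the
+    original OpenMMLab code; the shapes follow the SSD300 export in this
+    repository and the threshold values are assumptions)::
+
+        boxes = torch.rand(1, 8732, 4)      # (N, num_boxes, 4)
+        scores = torch.rand(1, 8732, 80)    # (N, num_boxes, num_classes)
+        dets, labels = add_dummy_nms_for_onnx(
+            boxes, scores,
+            max_output_boxes_per_class=200,
+            iou_threshold=0.45,
+            score_threshold=0.02,
+            pre_top_k=200,
+            after_top_k=200)
+        # dets: (1, 200, 5) as [x1, y1, x2, y2, score]; labels: (1, 200)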
+ """ + max_output_boxes_per_class = torch.LongTensor([max_output_boxes_per_class]) + iou_threshold = torch.tensor([iou_threshold], dtype=torch.float32) + score_threshold = torch.tensor([score_threshold], dtype=torch.float32) + batch_size = scores.shape[0] + num_class = scores.shape[2] + + if pre_top_k > 0: + nms_pre = torch.tensor(pre_top_k, device=scores.device, dtype=torch.long) + nms_pre = get_k_for_topk(nms_pre, boxes.shape[1]) + + if nms_pre > 0: + max_scores, _ = scores.max(-1) + _, topk_inds = max_scores.topk(nms_pre) + batch_inds = torch.arange(batch_size).view( + -1, 1).expand_as(topk_inds).long() + # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501 + # transformed_inds = boxes.shape[1] * batch_inds + topk_inds + transformed_inds = (boxes.shape[1] * batch_inds.int()) + topk_inds.int() + transformed_inds = transformed_inds.long() + boxes = boxes.reshape(-1, 4)[transformed_inds, :].reshape( + batch_size, -1, 4) + scores = scores.reshape(-1, num_class)[transformed_inds, :].reshape( + batch_size, -1, num_class) + if labels is not None: + labels = labels.reshape(-1, 1)[transformed_inds].reshape( + batch_size, -1) + + scores = scores.permute(0, 2, 1) + num_box = boxes.shape[1] + # turn off tracing to create a dummy output of nms + state = torch._C._get_tracing_state() + # dummy indices of nms's output + num_fake_det = 2 + batch_inds = torch.randint(batch_size, (num_fake_det, 1)) + cls_inds = torch.randint(num_class, (num_fake_det, 1)) + box_inds = torch.randint(num_box, (num_fake_det, 1)) + indices = torch.cat([batch_inds, cls_inds, box_inds], dim=1) + output = indices + setattr(DummyONNXNMSop, 'output', output) + + # open tracing + torch._C._set_tracing_state(state) + selected_indices = DummyONNXNMSop.apply(boxes, scores, + max_output_boxes_per_class, + iou_threshold, score_threshold) + + batch_inds, cls_inds = selected_indices[:, 0], selected_indices[:, 1] + box_inds = selected_indices[:, 2] + if labels is None: + labels = torch.arange(num_class, dtype=torch.long).to(scores.device) + labels = labels.view(1, num_class, 1).expand_as(scores) + scores = scores.reshape(-1, 1) + boxes = boxes.reshape(batch_size, -1).repeat(1, num_class).reshape(-1, 4) + # pos_inds = (num_class * batch_inds + cls_inds) * num_box + box_inds # original + pos_inds = (num_class * batch_inds.int()) + cls_inds.int() + pos_inds = (pos_inds * num_box.int()) + box_inds.int() + pos_inds = pos_inds.long() + # pos_inds = (batch_inds.new_tensor(num_class) * batch_inds + cls_inds) * batch_inds.new_tensor(num_box) + box_inds + mask = scores.new_zeros(scores.shape) + # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501 + # PyTorch style code: mask[batch_inds, box_inds] += 1 + mask[pos_inds, :] += 1 + scores = scores * mask + boxes = boxes * mask + + scores = scores.reshape(batch_size, -1) + boxes = boxes.reshape(batch_size, -1, 4) + labels = labels.reshape(batch_size, -1) + + if boxes.dtype != torch.float: + boxes = boxes.float() + scores = scores.float() + + if after_top_k > 0: + nms_after = torch.tensor( + after_top_k, device=scores.device, dtype=torch.long) + nms_after = get_k_for_topk(nms_after, num_box * num_class) + + if nms_after > 0: + _, topk_inds = scores.topk(nms_after) + batch_inds = torch.arange(batch_size).view(-1, 1).expand_as(topk_inds).long() + # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501 + batch_inds = scores.shape[1] * batch_inds + # transformed_inds = batch_inds + topk_inds + 
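+            # (added note) the flattened gather index below is computed with
+            # int32 arithmetic and cast back to int64 only for the actual
+            # indexing, matching the onnx2tensorrt workaround referenced above.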
transformed_inds = batch_inds.int() + topk_inds.int() + transformed_inds = transformed_inds.long() + scores = scores.reshape(-1, 1)[transformed_inds, :].reshape( + batch_size, -1) + boxes = boxes.reshape(-1, 4)[transformed_inds, :].reshape( + batch_size, -1, 4) + labels = labels.reshape(-1, 1)[transformed_inds, :].reshape( + batch_size, -1) + + scores = scores.unsqueeze(2) + dets = torch.cat([boxes, scores], dim=2) + return dets, labels + + +class DummyONNXNMSop(torch.autograd.Function): + """DummyONNXNMSop. + + This class is only for creating onnx::NonMaxSuppression. + """ + + @staticmethod + def forward(ctx, boxes, scores, max_output_boxes_per_class, iou_threshold, + score_threshold): + + return DummyONNXNMSop.output + + @staticmethod + def symbolic(g, boxes, scores, max_output_boxes_per_class, iou_threshold, + score_threshold): + return g.op( + 'NonMaxSuppression', + boxes, + scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + outputs=1) diff --git a/AscendIE/TorchAIE/built-in/cv/detection/ssd/ssd_postprocess.py b/AscendIE/TorchAIE/built-in/cv/detection/ssd/ssd_postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..09dc47196e648d85a7351a7fe00533c0d8a527f9 --- /dev/null +++ b/AscendIE/TorchAIE/built-in/cv/detection/ssd/ssd_postprocess.py @@ -0,0 +1,294 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""coco postprocess""" + +import os +import numpy as np +import argparse +import cv2 +import warnings +import torch +import time +try: + from torch import npu_batch_nms as NMSOp + NMS_ON_NPU = True +except: + from torchvision.ops import batched_nms as NMSOp + NMS_ON_NPU = False + +CLASSES = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', + 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'] + + +def coco_postprocess(bbox, image_size, net_input_width, net_input_height): + """ + This function is postprocessing for FasterRCNN output. 
+ + Before calling this function, reshape the raw output of FasterRCNN to + following form + numpy.ndarray: + [x, y, width, height, confidence, probability of 80 classes] + shape: (100,) + The postprocessing restore the bounding rectangles of FasterRCNN output + to origin scale and filter with non-maximum suppression. + + :param bbox: a numpy array of the FasterRCNN output + :param image_path: a string of image path + :return: three list for best bound, class and score + """ + w = image_size[0] + h = image_size[1] + scale_w = net_input_width / w + scale_h = net_input_height / h + + # cal predict box on the image src + pbox = bbox.copy() + pbox[:, 0] = (bbox[:, 0]) / scale_w + pbox[:, 1] = (bbox[:, 1]) / scale_h + pbox[:, 2] = (bbox[:, 2]) / scale_w + pbox[:, 3] = (bbox[:, 3]) / scale_h + return pbox + + +def np_clip_bbox(bboxes, max_shape): + x1, y1, x2, y2 = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3] + h, w = max_shape + x1 = x1.clip(min=0, max=w) + y1 = y1.clip(min=0, max=h) + x2 = x2.clip(min=0, max=w) + y2 = y2.clip(min=0, max=h) + bboxes = np.stack([x1, y1, x2, y2], axis=-1) + return bboxes + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--bin_data_path", + default="/onnx/ssd/cpp_infer_res_cpu") + parser.add_argument("--test_annotation", default="./coco2017_ssd_jpg.info") + parser.add_argument("--det_results_path", default="./postprocess_out_cpu/") + parser.add_argument("--net_out_num", default=2, type=int) + parser.add_argument("--num_pred_box", default=8732, type=int) + parser.add_argument("--nms_pre", default=200, type=int) + parser.add_argument("--net_input_width", default=300, type=int) + parser.add_argument("--net_input_height", default=300, type=int) + parser.add_argument("--min_bbox_size", default=0.01, type=float) + parser.add_argument("--score_threshold", default=0.02, type=float) + parser.add_argument("--nms", default=True, type=bool) + parser.add_argument("--iou_threshold", default=0.45, type=float) + parser.add_argument("--max_per_img", default=200, type=int) + parser.add_argument("--ifShowDetObj", action="store_true", default=True, + help="if input the para means True, neither False.") + parser.add_argument("--start", default=0, type=float) + parser.add_argument("--end", default=1, type=float) + parser.add_argument("--device", default=0, type=int) + parser.add_argument("--clear_cache", action='store_true') + flags = parser.parse_args() + + # generate dict according to annotation file for query resolution + # load width and height of input images + img_size_dict = dict() + with open(flags.test_annotation)as f: + for line in f: + temp = line.split(" ") + img_file_path = temp[1] + img_name = temp[1].split("/")[-1].split(".")[0] + img_width = int(temp[2]) + img_height = int(temp[3]) + img_size_dict[img_name] = (img_width, img_height, img_file_path) + + # read bin file for generate predict result + bin_path = flags.bin_data_path # 推理结果保存路径 + det_results_path = flags.det_results_path + os.makedirs(det_results_path, exist_ok=True) + total_img = set([name[:name.rfind('_')] + for name in os.listdir(bin_path) if "bin" in name]) + total_img = sorted(total_img) # list of img names (str) + num_img = len(total_img) # 5000 + start = int(flags.start * num_img) + end = int(flags.end * num_img) + task_len = end - start + 1 + + finished = 0 + time_start = time.time() + for img_id in range(start, end): + # for img_id, bin_file in enumerate(sorted(total_img)): + bin_file = total_img[img_id] + path_base = os.path.join(bin_path, bin_file) 
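+        # (added note) every image is expected to contribute two raw output
+        # files under bin_data_path: '<name>_0.bin' with the predicted boxes
+        # (num_pred_box x 4) and '<name>_1.bin' with the per-class scores
+        # (num_pred_box x 80); both are loaded and reshaped below.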
+ det_results_file = os.path.join(det_results_path, bin_file + ".txt") + if os.path.exists(det_results_file) and not flags.clear_cache: + continue + + # load all detected output tensor + bbox_file = path_base + "_" + str(0) + ".bin" + score_file = path_base + "_" + str(1) + ".bin" + assert os.path.exists( + bbox_file), '[ERROR] file `{}` not exist'.format(bbox_file) + assert os.path.exists( + score_file), '[ERROR] file `{}` not exist'.format(score_file) + bboxes = np.fromfile(bbox_file, dtype="float32").reshape( + flags.num_pred_box, 4) + scores = np.fromfile(score_file, dtype="float32").reshape( + flags.num_pred_box, 80) + + bboxes = torch.from_numpy(bboxes) + scores = torch.from_numpy(scores) + try: + bboxes = bboxes.npu(flags.device) + scores = scores.npu(flags.device) + except: + warnings.warn('npu is not available, running on cpu') + + max_scores, _ = scores.max(-1) # shape of [8732], torch.float32 + keep_inds = (max_scores > flags.score_threshold).nonzero( + as_tuple=False).view(-1) + bboxes = bboxes[keep_inds, :] + scores = scores[keep_inds, :] + + if flags.nms_pre > 0 and flags.nms_pre < bboxes.shape[0]: + max_scores, _ = scores.max(-1) # shape: torch.Size([2738]) dtype:torch.float32 + _, topk_inds = max_scores.topk(flags.nms_pre) + bboxes = bboxes[topk_inds, :] # shape: torch.Size([200, 4]) + scores = scores[topk_inds, :] # shape: torch.Size([200, 80]) + + # clip bbox border + bboxes[:, 0::2].clamp_(min=0, max=flags.net_input_width - 1) + bboxes[:, 1::2].clamp_(min=0, max=flags.net_input_height - 1) + + # remove small bbox + bboxes_width_height = bboxes[:, 2:] - bboxes[:, :2] + valid_bboxes = bboxes_width_height > flags.min_bbox_size + keep_inds = (valid_bboxes[:, 0] & valid_bboxes[:, 1] + ).nonzero(as_tuple=False).view(-1) + bboxes = bboxes[keep_inds, :] + scores = scores[keep_inds, :] + + # rescale bbox to original image size + original_img_info = img_size_dict[bin_file] + rescale_factor = torch.tensor([ + original_img_info[0] / flags.net_input_width, + original_img_info[1] / flags.net_input_height] * 2, + dtype=bboxes.dtype, device=bboxes.device) + bboxes *= rescale_factor + + if flags.nms: + if NMS_ON_NPU: + # repeat bbox for each class + # (N, 4) -> (B, N, 80, 4), where B = 1 is the batchsize + bboxes = bboxes[None, :, None, :].repeat(1, 1, 80, 1) + # (N, 80) -> (B, N, 80), where B = 1 is the batchsize + scores = scores[None, :, :] + + # bbox batched nms + bboxes, scores, labels, num_total_bboxes = \ + NMSOp( + bboxes.half(), scores.half(), + score_threshold=flags.score_threshold, + iou_threshold=flags.iou_threshold, + max_size_per_class=flags.max_per_img, + max_total_size=flags.max_per_img) + bboxes = bboxes[0, :num_total_bboxes, :] + scores = scores[0, :num_total_bboxes] + class_idxs = labels[0, :num_total_bboxes] + else: + # repeat bbox and class idx for each class + bboxes = bboxes[:, None, :].repeat( + 1, 80, 1) # (N, 4) -> (N, 80, 4) + class_idxs = torch.arange(80, dtype=torch.long, device=bboxes.device + )[None, :].repeat(bboxes.shape[0], 1) # (80) -> (N, 80) + + # reshape bbox for torch nms + bboxes = bboxes.view(-1, 4) + scores = scores.view(-1) + class_idxs = class_idxs.view(-1) + + # bbox batched nms + keep_inds = NMSOp(bboxes, scores, class_idxs, + flags.iou_threshold) + bboxes = bboxes[keep_inds] + scores = scores[keep_inds] + class_idxs = class_idxs[keep_inds] + else: + # repeat bbox and class idx for each class + bboxes = bboxes[:, None, :].repeat( + 1, 80, 1) # (N, 4) -> (N, 80, 4) + class_idxs = torch.arange(80, dtype=torch.long, device=bboxes.device + 
)[None, :].repeat(bboxes.shape[0], 1) # (80) -> (N, 80) + + # reshape bbox for torch nms + bboxes = bboxes.view(-1, 4) + scores = scores.view(-1) + class_idxs = class_idxs.view(-1) + + # keep topk max_per_img bbox + if flags.max_per_img > 0 and flags.max_per_img < bboxes.shape[0]: + _, topk_inds = scores.topk(flags.max_per_img) + bboxes = bboxes[topk_inds, :] + scores = scores[topk_inds] + class_idxs = class_idxs[topk_inds] + + # move to cpu if running on npu + if bboxes.device != 'cpu': + bboxes = bboxes.cpu() + scores = scores.cpu() + class_idxs = class_idxs.cpu() + + # convert to numpy.ndarray + bboxes = bboxes.numpy() + scores = scores.numpy() + class_idxs = class_idxs.numpy() + + # make det result file + if flags.ifShowDetObj == True: + imgCur = cv2.imread(original_img_info[2]) + + det_results_str = '' + for idx in range(bboxes.shape[0]): + x1, y1, x2, y2 = bboxes[idx, :] + predscore = scores[idx] + class_ind = class_idxs[idx] + + class_name = CLASSES[int(class_ind)] + det_results_str += "{} {} {} {} {} {}\n".format( + class_name, predscore, x1, y1, x2, y2) + if flags.ifShowDetObj == True: + imgCur = cv2.rectangle(imgCur, (int(x1), int( + y1)), (int(x2), int(y2)), (0, 255, 0), 1) + imgCur = cv2.putText(imgCur, class_name + '|' + str(predscore), + (int(x1), int(y1)), cv2.FONT_HERSHEY_SIMPLEX, + 0.5, (0, 0, 255), 1) + + if flags.ifShowDetObj == True: + cv2.imwrite(os.path.join(det_results_path, bin_file + + '.jpg'), imgCur, [int(cv2.IMWRITE_JPEG_QUALITY), 70]) + + with open(det_results_file, "w") as detf: + detf.write(det_results_str) + + finished += 1 + speed = finished / (time.time() - time_start) + print('processed {:5d}/{:<5d} images, speed: {:.2f}FPS'.format( + finished, task_len, speed), end='\r')
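+
+# (added note) each per-image .txt written above lists one detection per line
+# as '<class_name> <score> <x1> <y1> <x2> <y2>'; this plain-text format is what
+# the txt_to_json.py step in the README converts before coco_eval.py computes
+# the mAP.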