diff --git a/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/.keep b/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test.py b/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test.py new file mode 100644 index 0000000000000000000000000000000000000000..cfcd5066e715f7151f45e42be2843f6bf4e3563e --- /dev/null +++ b/ACL_PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test.py @@ -0,0 +1 @@ +import torch diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/LICENSE b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..517057158039f458ca1ac1341be58c1148b6b552 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/LICENSE @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2017, +All rights reserved. +Copyright 2022 Huawei Technologies Co., Ltd + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/OXInterface.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/OXInterface.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc4bc085fa14ef6c0f4434a0e703d94d241af8f7 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/OXInterface.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/coco_eval.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/coco_eval.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fdcebb56a7f46785cded01bb7d2a3ff13e7cff89 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/coco_eval.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac3e653e5de063ae2cb252f772cbf1310ca05563 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cb1ae46dc90919c05212f3fa6b666291923fff8 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/engine.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91c1504cf6d9d4e1146ca2351e73c6bd73723713 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ca77d8fa5294d321996f19a34eff2ff991fb3a3 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/hubconf.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/npu_fused_adamw.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/npu_fused_adamw.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0f72da710f540d1da76f6d5cbc5b7f9e47a67b83 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/npu_fused_adamw.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/train_npu_hw.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/train_npu_hw.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aeffc9d3c26caf6a223af3a9cb046ba86c35bddd Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/__pycache__/train_npu_hw.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/coco_eval.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/coco_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..b7bbe6bd8e01b9e5f45e0bb6a1503403f9fa8121 
--- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/coco_eval.py @@ -0,0 +1,81 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Train and eval functions used in main.py +""" + +import os +import torch +import util.misc as utils +from datasets.coco_eval import CocoEvaluator +import onnxruntime + + +@torch.no_grad() +def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir): + model.eval() + criterion.eval() + + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Test:' + + iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75] + + + # ort_session = onnxruntime.InferenceSession('model_file/detr_640.onnx') + for samples, targets in metric_logger.log_every(data_loader, 10, header): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + print(samples.tensors.shape) + + # onnx_input=torch.randn(1,3,750,800).numpy() + + # ort_inputs = {ort_session.get_inputs()[0].name:samples.tensors.cpu().numpy()} + # # print('inputs',ort_inputs) + # ort_outs = ort_session.run(None, ort_inputs) + # out={'pred_logits':torch.from_numpy(ort_outs[0]).cuda(), + # 'pred_boxes':torch.from_numpy(ort_outs[1]).cuda()} + # outputs=out + # loss_dict=criterion(out,targets) + + outputs = model(samples) + + loss_dict = criterion(outputs, targets) + + weight_dict = criterion.weight_dict + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), + **loss_dict_reduced_scaled, + **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + results = postprocessors['bbox'](outputs, orig_target_sizes) + print(len(results[0]['scores']),results[0]['scores']) + print(results[0]['boxes']) + print(results[0]['labels']) + print(postprocessors.keys()) + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + coco_evaluator.update(res) + + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + coco_evaluator.accumulate() + coco_evaluator.summarize() + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + + return stats, coco_evaluator \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__init__.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..571b126ea4ed8db85bc75ff7947d674b8a5a2099 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import torch.utils.data +import torchvision + +from .coco import build as build_coco + + +def get_coco_api_from_dataset(dataset): + for _ in range(10): + # if isinstance(dataset, torchvision.datasets.CocoDetection): + # break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, torchvision.datasets.CocoDetection): + return dataset.coco + + +def build_dataset(image_set, args): + if args.dataset_file == 'coco': + return build_coco(image_set, args) + if args.dataset_file == 'coco_panoptic': + # to avoid making panopticapi required for coco + from .coco_panoptic import build as build_coco_panoptic + return build_coco_panoptic(image_set, args) + raise ValueError(f'dataset {args.dataset_file} not supported') diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..816728a01044b95762478d70b371822e2ffcd19e Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69983b6ba7fbe379137776a3c57350fb23ab5b16 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/__init__.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..689f41ab91d174dea6ef11587aa851ca4b5aec2e Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd72017b858cf9e469a04406af63d77a8f454623 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0a4bb3ac128cd0c9e7f43d79e205e0142980752 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a6c4d22531816b7721e85752cdb07aa03d9f7468 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/coco_eval.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-36.pyc 
b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f011b0b88ed8100cdf362fe3682d349a156ea783 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0224c4baf0778ef1ab589c72a71b8707f21c6286 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/panoptic_eval.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..763db09f26c435b163b41b135013ab5df796043a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55f3cbecf4b2d8b549aa7a097884fd136156f530 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/__pycache__/transforms.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..e94e8d5026dec45f152fd3d525e36f5af0cb6ad9 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco.py @@ -0,0 +1,166 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +COCO dataset which returns image_id for evaluation. 
+ +Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py +""" +from pathlib import Path + +import torch +import torch.utils.data +import torchvision +from pycocotools import mask as coco_mask + +import datasets.transforms as T + + +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms, return_masks): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + self.prepare = ConvertCocoPolysToMask(return_masks) + + def __getitem__(self, idx): + img, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = {'image_id': image_id, 'annotations': target} + img, target = self.prepare(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __init__(self, return_masks=False): + self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # print(boxes) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + if self.return_masks: + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + if self.return_masks: + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + if self.return_masks: + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + target["area"] = area[keep] + target["iscrowd"] = iscrowd[keep] + + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + + return image, target + + +def make_coco_transforms(image_set): + + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] + + if 
image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSelect( + # T.RandomResize(scales, max_size=1333), + T.pad_resize(), + T.Compose([ + # T.RandomResize([400, 500, 600]), + T.pad_resize(), + T.RandomSizeCrop(384, 600), + # T.RandomResize(scales, max_size=1333), + T.pad_resize(), + ]) + ), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + + T.pad_resize(), + # T.RandomResize(sizes=(640,640),), + # T.RandomResize([800], max_size=1333), + + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + + +def build(image_set, args): + root = Path(args.coco_path) + assert root.exists(), f'provided COCO path {root} does not exist' + mode = 'instances' + PATHS = { + "train": (root / "train2017", root / "annotations" / f'{mode}_train2017.json'), + "val": (root / "val2017", root / "annotations" / f'{mode}_val2017.json'), + } + + img_folder, ann_file = PATHS[image_set] + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) + return dataset diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_eval.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..9487c08fd6b5da041facd4bd6c0b13c40a16df7d --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_eval.py @@ -0,0 +1,257 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +COCO evaluator that works in distributed mode. + +Mostly copy-paste from https://github.com/pytorch/vision/blob/edfd5a7/references/detection/coco_eval.py +The difference is that there is less copy-pasting from pycocotools +in the end of the file, as python3 can suppress prints with contextlib +""" +import os +import contextlib +import copy +import numpy as np +import torch + +from pycocotools.cocoeval import COCOeval +from pycocotools.coco import COCO +import pycocotools.mask as mask_util + +from util.misc import all_gather + + +class CocoEvaluator(object): + def __init__(self, coco_gt, iou_types): + assert isinstance(iou_types, (list, tuple)) + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt = coco_gt + + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + + # suppress pycocotools prints + with open(os.devnull, 'w') as devnull: + with contextlib.redirect_stdout(devnull): + coco_dt = COCO.loadRes(self.coco_gt, results) if results else COCO() + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + img_ids, eval_imgs = evaluate(coco_eval) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print("IoU metric: {}".format(iou_type)) + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + 
if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + elif iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + elif iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + else: + raise ValueError("Unknown iou type {}".format(iou_type)) + + def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + keypoints = keypoints.flatten(start_dim=1).tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + 'keypoints': keypoint, + "score": scores[k], + } + for k, keypoint in enumerate(keypoints) + ] + ) + return coco_results + + +def convert_to_xywh(boxes): + xmin, ymin, xmax, ymax = boxes.unbind(1) + return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) + + +def merge(img_ids, eval_imgs): + all_img_ids = all_gather(img_ids) + all_eval_imgs = all_gather(eval_imgs) + + merged_img_ids = [] + for p in all_img_ids: + merged_img_ids.extend(p) + + merged_eval_imgs = [] + for p in all_eval_imgs: + merged_eval_imgs.append(p) + + merged_img_ids = np.array(merged_img_ids) + merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) + + # keep only unique (and in sorted order) images + merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) + merged_eval_imgs = merged_eval_imgs[..., idx] + + return merged_img_ids, merged_eval_imgs + + +def create_common_coco_eval(coco_eval, img_ids, eval_imgs): + img_ids, eval_imgs = merge(img_ids, eval_imgs) + img_ids = list(img_ids) + eval_imgs = list(eval_imgs.flatten()) + + coco_eval.evalImgs = eval_imgs + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) + + +################################################################# +# From pycocotools, just removed the prints and fixed +# a Python3 bug about unicode not defined +################################################################# + + +def evaluate(self): + ''' + Run per image evaluation on 
given images and store results (a list of dict) in self.evalImgs + :return: None + ''' + # tic = time.time() + # print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) + # print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + # this is NOT in the pycocotools code, but could be done outside + evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) + self._paramsEval = copy.deepcopy(self.params) + # toc = time.time() + # print('DONE (t={:0.2f}s).'.format(toc-tic)) + return p.imgIds, evalImgs + +################################################################# +# end of straight copy from pycocotools, just removing the prints +################################################################# diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_panoptic.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_panoptic.py new file mode 100644 index 0000000000000000000000000000000000000000..b24f615c2faa14b422829e2edad996e2b5b84248 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/coco_panoptic.py @@ -0,0 +1,99 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import json +from pathlib import Path + +import numpy as np +import torch +from PIL import Image + +from panopticapi.utils import rgb2id +from util.box_ops import masks_to_boxes + +from .coco import make_coco_transforms + + +class CocoPanoptic: + def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True): + with open(ann_file, 'r') as f: + self.coco = json.load(f) + + # sort 'images' field so that they are aligned with 'annotations' + # i.e., in alphabetical order + self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id']) + # sanity check + if "annotations" in self.coco: + for img, ann in zip(self.coco['images'], self.coco['annotations']): + assert img['file_name'][:-4] == ann['file_name'][:-4] + + self.img_folder = img_folder + self.ann_folder = ann_folder + self.ann_file = ann_file + self.transforms = transforms + self.return_masks = return_masks + + def __getitem__(self, idx): + ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx] + img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg') + ann_path = Path(self.ann_folder) / ann_info['file_name'] + + img = Image.open(img_path).convert('RGB') + w, h = img.size + if "segments_info" in ann_info: + masks = np.asarray(Image.open(ann_path), dtype=np.uint32) + masks = rgb2id(masks) + + ids = np.array([ann['id'] for ann in ann_info['segments_info']]) + masks = masks == ids[:, None, None] + + masks = torch.as_tensor(masks, dtype=torch.uint8) + labels = torch.tensor([ann['category_id'] for ann in ann_info['segments_info']], dtype=torch.int64) + + target = {} + target['image_id'] = torch.tensor([ann_info['image_id'] if "image_id" in ann_info else ann_info["id"]]) + if self.return_masks: + target['masks'] = masks + target['labels'] = labels + + target["boxes"] = masks_to_boxes(masks) + + target['size'] = torch.as_tensor([int(h), int(w)]) + target['orig_size'] = torch.as_tensor([int(h), int(w)]) + if "segments_info" in ann_info: + for name in ['iscrowd', 'area']: + target[name] = torch.tensor([ann[name] for ann in ann_info['segments_info']]) + + if self.transforms is not None: + img, target = self.transforms(img, target) + + return img, target + + def __len__(self): + return len(self.coco['images']) + + def get_height_and_width(self, idx): + img_info = self.coco['images'][idx] + height = img_info['height'] + width = img_info['width'] + return height, width + + +def build(image_set, args): + img_folder_root = Path(args.coco_path) + ann_folder_root = Path(args.coco_panoptic_path) + assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist' + assert ann_folder_root.exists(), f'provided COCO path {ann_folder_root} does not exist' + mode = 'panoptic' + PATHS = { + "train": ("train2017", Path("annotations") / f'{mode}_train2017.json'), + "val": ("val2017", Path("annotations") / f'{mode}_val2017.json'), + } + + img_folder, ann_file = PATHS[image_set] + img_folder_path = img_folder_root / img_folder + ann_folder = ann_folder_root / f'{mode}_{img_folder}' + ann_file = ann_folder_root / ann_file + + dataset = CocoPanoptic(img_folder_path, ann_folder, ann_file, + transforms=make_coco_transforms(image_set), return_masks=args.masks) + + return dataset diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/panoptic_eval.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/panoptic_eval.py new file mode 100644 index 
0000000000000000000000000000000000000000..9cb4f83409046a5c2a87643ee005e52a440aae74 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/panoptic_eval.py @@ -0,0 +1,44 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import json +import os + +import util.misc as utils + +try: + from panopticapi.evaluation import pq_compute +except ImportError: + pass + + +class PanopticEvaluator(object): + def __init__(self, ann_file, ann_folder, output_dir="panoptic_eval"): + self.gt_json = ann_file + self.gt_folder = ann_folder + if utils.is_main_process(): + if not os.path.exists(output_dir): + os.mkdir(output_dir) + self.output_dir = output_dir + self.predictions = [] + + def update(self, predictions): + for p in predictions: + with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f: + f.write(p.pop("png_string")) + + self.predictions += predictions + + def synchronize_between_processes(self): + all_predictions = utils.all_gather(self.predictions) + merged_predictions = [] + for p in all_predictions: + merged_predictions += p + self.predictions = merged_predictions + + def summarize(self): + if utils.is_main_process(): + json_data = {"annotations": self.predictions} + predictions_json = os.path.join(self.output_dir, "predictions.json") + with open(predictions_json, "w") as f: + f.write(json.dumps(json_data)) + return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) + return None diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/transforms.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..0419a812365f5b2878c1f19daffcedf7d89558a5 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/datasets/transforms.py @@ -0,0 +1,330 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Transforms and data augmentation for both image + bbox. +""" +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F +from PIL import Image, ImageDraw +from util.box_ops import box_xyxy_to_cxcywh +from util.misc import interpolate +import numpy as np + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? 
+ target['masks'] = target['masks'][:, i:i + h, j:j + w] + fields.append("masks") + + # remove elements for which the boxes or masks that have zero area + if "boxes" in target or "masks" in target: + # favor boxes selection when defining which elements to keep + # this is compatible with previous implementation + if "boxes" in target: + cropped_boxes = target['boxes'].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target['masks'].flatten(1).any(1) + + for field in fields: + target[field] = target[field][keep] + + return cropped_image, target + + +def hflip(image, target): + flipped_image = F.hflip(image) + + w, h = image.size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) + target["boxes"] = boxes + + if "masks" in target: + target['masks'] = target['masks'].flip(-1) + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + # for i in target['boxes']: + # draw=ImageDraw.Draw(image) + # draw.line([(i[0].item(), i[1].item()),(i[2].item(),i[1].item()), + # (i[2].item(), i[3].item()),(i[0].item(),i[3].item()), + # (i[0].item(), i[1].item())], width=2, fill='red') + # image.show() + + size = get_size(image.size, size, max_size) + rescaled_image = F.resize(image, size) + + # rescaled_image = F.resize(image, size=(1280, 720)) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes + torch.as_tensor([200, 200, 200, 200]) + scaled_boxes = scaled_boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target['masks'] = interpolate( + target['masks'][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + + # print('pad:',target['boxes']) + # for i in target['boxes']: + # draw=ImageDraw.Draw(rescaled_image) + # draw.line([(i[0].item(), i[1].item()),(i[2].item(),i[1].item()), + # (i[2].item(), i[3].item()),(i[0].item(),i[3].item()), + # (i[0].item(), i[1].item())], width=2, fill='red') + # rescaled_image.show() + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target 
= target.copy() + # should we do something wrt the original size? + target["size"] = torch.tensor(padded_image.size[::-1]) + if "masks" in target: + target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int): + self.min_size = min_size + self.max_size = max_size + + def __call__(self, img: PIL.Image.Image, target: dict): + w = random.randint(self.min_size, min(img.width, self.max_size)) + h = random.randint(self.min_size, min(img.height, self.max_size)) + region = T.RandomCrop.get_params(img, [h, w]) + return crop(img, target, region) + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.)) + crop_left = int(round((image_width - crop_width) / 2.)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return hflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class pad_resize(object): + def __init__(self, sizes=None): + # assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + + def __call__(self, img, target=None): + + img, target = Pad_img(img, target) + return resize(img, target, size=(1280, 1280)) + + +def Pad_img(image, target): + # assumes that we only pad on the bottom right corners + + h, w = image.size + pad_value = int(abs(h - w) / 2) + + if h > w: + padded_image = F.pad(image, (0, pad_value, 0, pad_value), fill=0) + else: + padded_image = F.pad(image, (pad_value, 0, pad_value, 0), fill=0) + h_, w_ = padded_image.size + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes + torch.as_tensor([abs(h - h_) / 2, abs(w - w_) / 2, abs(h - h_) / 2, abs(w - w_) / 2]) + target["boxes"] = scaled_boxes + + if target is None: + return padded_image, None + + target["size"] = torch.tensor([h_, w_]) + + return padded_image, target + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, img, target): + return F.to_tensor(img), target + + +class RandomErasing(object): + + def 
__init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = F.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image, None + target = target.copy() + h, w = image.shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/engine.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/engine.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a78c8342468ebee4116a72630ecc5b2147679a --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/engine.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Train and eval functions used in main.py +""" +import math +import os +import sys +from typing import Iterable + +import torch +from apex import amp +import util.misc as utils +from datasets.coco_eval import CocoEvaluator +from datasets.panoptic_eval import PanopticEvaluator +import time + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, batch_size: int, epoch: int, max_norm: float = 0): + + model.train() + criterion.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Epoch: [{}]'.format(epoch) + print_freq = 1 + + for samples, targets in metric_logger.log_every(data_loader, batch_size, print_freq, header): + optimizer.zero_grad() + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + outputs = model(samples) + + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + losses_reduced_scaled = sum(loss_dict_reduced_scaled.values()) + + loss_value = losses_reduced_scaled.item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + print(loss_dict_reduced) + sys.exit(1) + + with amp.scale_loss(losses, optimizer) as scaled_loss: + scaled_loss.backward() + # losses.backward() + if max_norm > 0: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) + optimizer.step() + metric_logger.update(loss=loss_value, 
**loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir): + model.eval() + criterion.eval() + + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Test:' + + iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + # coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75] + + panoptic_evaluator = None + if 'panoptic' in postprocessors.keys(): + panoptic_evaluator = PanopticEvaluator( + data_loader.dataset.ann_file, + data_loader.dataset.ann_folder, + output_dir=os.path.join(output_dir, "panoptic_eval"), + ) + # ort_session = onnxruntime.InferenceSession('model_file/detr_640.onnx') + for samples, targets in metric_logger.log_every(data_loader, 10, header): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + outputs = model(samples) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), + **loss_dict_reduced_scaled, + **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + results = postprocessors['bbox'](outputs, orig_target_sizes) + if 'segm' in postprocessors.keys(): + target_sizes = torch.stack([t["size"] for t in targets], dim=0) + results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + if coco_evaluator is not None: + coco_evaluator.update(res) + + if panoptic_evaluator is not None: + res_pano = postprocessors["panoptic"](outputs, target_sizes, orig_target_sizes) + for i, target in enumerate(targets): + image_id = target["image_id"].item() + file_name = f"{image_id:012d}.png" + res_pano[i]["image_id"] = image_id + res_pano[i]["file_name"] = file_name + + panoptic_evaluator.update(res_pano) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + if coco_evaluator is not None: + coco_evaluator.synchronize_between_processes() + if panoptic_evaluator is not None: + panoptic_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + if coco_evaluator is not None: + coco_evaluator.accumulate() + coco_evaluator.summarize() + panoptic_res = None + if panoptic_evaluator is not None: + panoptic_res = panoptic_evaluator.summarize() + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + if coco_evaluator is not None: + if 'bbox' in 
postprocessors.keys(): + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + if 'segm' in postprocessors.keys(): + stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() + if panoptic_res is not None: + stats['PQ_all'] = panoptic_res["All"] + stats['PQ_th'] = panoptic_res["Things"] + stats['PQ_st'] = panoptic_res["Stuff"] + return stats, coco_evaluator diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/hubconf.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..328c3306d03dc65b71898ead44b2b3f8164de4a0 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/hubconf.py @@ -0,0 +1,168 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import torch + +from models.backbone import Backbone, Joiner +from models.detr import DETR, PostProcess +from models.position_encoding import PositionEmbeddingSine +from models.segmentation import DETRsegm, PostProcessPanoptic +from models.transformer import Transformer + +dependencies = ["torch", "torchvision"] + + +def _make_detr(backbone_name: str, dilation=False, num_classes=91, mask=False): + hidden_dim = 256 + backbone = Backbone(backbone_name, train_backbone=True, return_interm_layers=mask, dilation=dilation) + pos_enc = PositionEmbeddingSine(hidden_dim // 2, normalize=True) + backbone_with_pos_enc = Joiner(backbone, pos_enc) + backbone_with_pos_enc.num_channels = backbone.num_channels + transformer = Transformer(d_model=hidden_dim, return_intermediate_dec=True) + detr = DETR(backbone_with_pos_enc, transformer, num_classes=num_classes, num_queries=100) + if mask: + return DETRsegm(detr) + return detr + + +def detr_resnet50(pretrained=False, num_classes=91, return_postprocessor=False): + """ + DETR R50 with 6 encoder and 6 decoder layers. + + Achieves 42/62.4 AP/AP50 on COCO val5k. + """ + model = _make_detr("resnet50", dilation=False, num_classes=num_classes) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth", map_location="cpu", check_hash=True + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcess() + return model + + +def detr_resnet50_dc5(pretrained=False, num_classes=91, return_postprocessor=False): + """ + DETR-DC5 R50 with 6 encoder and 6 decoder layers. + + The last block of ResNet-50 has dilation to increase + output resolution. + Achieves 43.3/63.1 AP/AP50 on COCO val5k. + """ + model = _make_detr("resnet50", dilation=True, num_classes=num_classes) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r50-dc5-f0fb7ef5.pth", map_location="cpu", check_hash=True + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcess() + return model + + +def detr_resnet101(pretrained=False, num_classes=91, return_postprocessor=False): + """ + DETR-DC5 R101 with 6 encoder and 6 decoder layers. + + Achieves 43.5/63.8 AP/AP50 on COCO val5k. 
+ """ + model = _make_detr("resnet101", dilation=False, num_classes=num_classes) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r101-2c7b67e5.pth", map_location="cpu", check_hash=True + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcess() + return model + + +def detr_resnet101_dc5(pretrained=False, num_classes=91, return_postprocessor=False): + """ + DETR-DC5 R101 with 6 encoder and 6 decoder layers. + + The last block of ResNet-101 has dilation to increase + output resolution. + Achieves 44.9/64.7 AP/AP50 on COCO val5k. + """ + model = _make_detr("resnet101", dilation=True, num_classes=num_classes) + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r101-dc5-a2e86def.pth", map_location="cpu", check_hash=True + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcess() + return model + + +def detr_resnet50_panoptic( + pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False +): + """ + DETR R50 with 6 encoder and 6 decoder layers. + Achieves 43.4 PQ on COCO val5k. + + threshold is the minimum confidence required for keeping segments in the prediction + """ + model = _make_detr("resnet50", dilation=False, num_classes=num_classes, mask=True) + is_thing_map = {i: i <= 90 for i in range(250)} + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r50-panoptic-00ce5173.pth", + map_location="cpu", + check_hash=True, + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcessPanoptic(is_thing_map, threshold=threshold) + return model + + +def detr_resnet50_dc5_panoptic( + pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False +): + """ + DETR-DC5 R50 with 6 encoder and 6 decoder layers. + + The last block of ResNet-50 has dilation to increase + output resolution. + Achieves 44.6 on COCO val5k. + + threshold is the minimum confidence required for keeping segments in the prediction + """ + model = _make_detr("resnet50", dilation=True, num_classes=num_classes, mask=True) + is_thing_map = {i: i <= 90 for i in range(250)} + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r50-dc5-panoptic-da08f1b1.pth", + map_location="cpu", + check_hash=True, + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcessPanoptic(is_thing_map, threshold=threshold) + return model + + +def detr_resnet101_panoptic( + pretrained=False, num_classes=250, threshold=0.85, return_postprocessor=False +): + """ + DETR-DC5 R101 with 6 encoder and 6 decoder layers. + + Achieves 45.1 PQ on COCO val5k. 
+ + threshold is the minimum confidence required for keeping segments in the prediction + """ + model = _make_detr("resnet101", dilation=False, num_classes=num_classes, mask=True) + is_thing_map = {i: i <= 90 for i in range(250)} + if pretrained: + checkpoint = torch.hub.load_state_dict_from_url( + url="https://dl.fbaipublicfiles.com/detr/detr-r101-panoptic-40021d53.pth", + map_location="cpu", + check_hash=True, + ) + model.load_state_dict(checkpoint["model"]) + if return_postprocessor: + return model, PostProcessPanoptic(is_thing_map, threshold=threshold) + return model diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__init__.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a3f26531befaf6abb215e48a0ef4bfc3da1c7c04 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .detr import build + + +def build_model(args): + return build(args) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..412367e3f4bbf044d602c58174d344e91abcd382 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5a9664cfcc4178ecfaef6baa75383eab6cf2f20a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/__init__.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de63f498e3348a44805c347917b7e681251135f8 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d72cf5d1aa2f98e62ec7462b03e9f0bf9f53edc3 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/backbone.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce70cc4869f118856944389e0a555174d5bf678e Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4739f63cb5dd95eccfaf35b4aadc85493664cc9f Binary files /dev/null and 
b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/detr.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0adfb0bcbde625e011cf77144d73aa22397f8604 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6f51dca35d0b794935a0b0a263c79cc040c87a4a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/matcher.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d4b076f3c107d8a1804b6192ae965dc596ceb988 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7aa0717efd318b39a44233205b2c25f204707e3a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/position_encoding.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64ceb75c03bb4c1cfaca8528f33cfbadbe2d833e Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..078aea0bdc5837558360986b882ca5ef5a427baa Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/segmentation.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1141c355a432b454a34b445858d828194293c4bb Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e00b3332583fa7bb3f462d40de05091169ef664 Binary files /dev/null and 
b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/__pycache__/transformer.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/backbone.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..9976e2e3586b6b32f878e0c8ecbdabc4dea18938 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/backbone.py @@ -0,0 +1,118 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Backbone modules. +""" +from collections import OrderedDict + +import torch +import torch.nn.functional as F +import torchvision +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter +from typing import Dict, List + +from util.misc import NestedTensor, is_main_process + +from .position_encoding import build_position_encoding + + +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. + """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + num_batches_tracked_key = prefix + 'num_batches_tracked' + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class BackboneBase(nn.Module): + + def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool): + super().__init__() + for name, parameter in backbone.named_parameters(): + if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: + parameter.requires_grad_(False) + if return_interm_layers: + return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} + else: + return_layers = {'layer4': "0"} + self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) + self.num_channels = num_channels + + def forward(self, tensor_list: NestedTensor): + xs = self.body(tensor_list.tensors) + out: Dict[str, NestedTensor] = {} + for name, x in xs.items(): + m = tensor_list.mask + assert m is not None + mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] + out[name] = NestedTensor(x, mask) + return out + + +class Backbone(BackboneBase): + """ResNet backbone with frozen BatchNorm.""" + def __init__(self, name: str, + train_backbone: bool, + return_interm_layers: bool, + dilation: bool): + backbone = getattr(torchvision.models, name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) + 
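Editor's note (not part of the diff): FrozenBatchNorm2d above folds the running statistics into a fixed affine transform. A minimal standalone check, assuming only stock torch/torch.nn, that this matches an eval-mode nn.BatchNorm2d with the same buffers (both use eps = 1e-5):

import torch
import torch.nn as nn

# Eval-mode BatchNorm2d reduces to x * scale + bias with
# scale = w / sqrt(running_var + eps) and bias = b - running_mean * scale,
# which is exactly the affine form FrozenBatchNorm2d precomputes.
bn = nn.BatchNorm2d(8).eval()
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 2.0)

x = torch.randn(2, 8, 4, 4)
w = bn.weight.reshape(1, -1, 1, 1)
b = bn.bias.reshape(1, -1, 1, 1)
rm = bn.running_mean.reshape(1, -1, 1, 1)
rv = bn.running_var.reshape(1, -1, 1, 1)
scale = w * (rv + bn.eps).rsqrt()
frozen = x * scale + (b - rm * scale)

print(torch.allclose(bn(x), frozen, atol=1e-5))  # True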
num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 + super().__init__(backbone, train_backbone, num_channels, return_interm_layers) + + +class Joiner(nn.Sequential): + def __init__(self, backbone, position_embedding): + super().__init__(backbone, position_embedding) + + def forward(self, tensor_list: NestedTensor): + xs = self[0](tensor_list) + out: List[NestedTensor] = [] + pos = [] + for name, x in xs.items(): + out.append(x) + # position encoding + pos.append(self[1](x).to(x.tensors.dtype)) + return out, pos + + +def build_backbone(args): + position_embedding = build_position_encoding(args) + train_backbone = args.lr_backbone > 0 + return_interm_layers = args.masks + backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) + model = Joiner(backbone, position_embedding) + model.num_channels = backbone.num_channels + return model diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/detr.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/detr.py new file mode 100644 index 0000000000000000000000000000000000000000..44209c82421e3dbad7c59514a7c4b21030a46d32 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/detr.py @@ -0,0 +1,368 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR model and criterion classes. +""" +import torch +import torch.nn.functional as F +from torch import nn + +from util import box_ops +from util.misc import (NestedTensor, nested_tensor_from_tensor_list,_onnx_nested_tensor_from_tensor_list, + accuracy, get_world_size, interpolate, + is_dist_avail_and_initialized) + +from .backbone import build_backbone +from .matcher import build_matcher +from .segmentation import (DETRsegm, PostProcessPanoptic, PostProcessSegm, + dice_loss, sigmoid_focal_loss) +from .transformer import build_transformer + + +class DETR(nn.Module): + """ This is the DETR module that performs object detection """ + def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False): + """ Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + hidden_dim = transformer.d_model + self.class_embed = nn.Linear(hidden_dim, num_classes + 1) + self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + self.query_embed = nn.Embedding(num_queries, hidden_dim) + self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) + self.backbone = backbone + self.aux_loss = aux_loss + + def forward(self, samples: NestedTensor): + """ The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x (num_classes + 1)] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, height, width). 
These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. + """ + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + features, pos = self.backbone(samples) + src, mask = features[-1].decompose() + assert mask is not None + hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] + outputs_class = self.class_embed(hs) + outputs_coord = self.bbox_embed(hs).sigmoid() + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} + if self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + +class SetCriterion(nn.Module): + """ This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """ Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
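Editor's note (not part of the diff): for reference, a small illustrative sketch of the data contract implied by the docstrings above -- model outputs as a dict of batched tensors, targets as one dict per image. All values are made up.

import torch

batch_size, num_queries, num_classes = 2, 100, 91
outputs = {
    "pred_logits": torch.randn(batch_size, num_queries, num_classes + 1),  # +1 for no-object
    "pred_boxes": torch.rand(batch_size, num_queries, 4),                  # (cx, cy, w, h) in [0, 1]
}
targets = [
    {"labels": torch.tensor([3, 17]), "boxes": torch.rand(2, 4)},  # image 0: two objects
    {"labels": torch.tensor([58]),    "boxes": torch.rand(1, 4)},  # image 1: one object
]
# criterion(outputs, targets) then returns a dict of scalar losses such as
# {"loss_ce": ..., "loss_bbox": ..., "loss_giou": ..., "cardinality_error": ..., "class_error": ...}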
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer('empty_weight', empty_weight) + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J].to('cpu') for t, (_, J) in zip(targets, indices)]).to('npu') + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {'loss_ce': loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs['pred_logits'] + device = pred_logits.device + tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i].to('cpu') for t, (_, i) in zip(targets, indices)], dim=0).to('npu') + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + + losses = {} + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(box_ops.generalized_box_iou( + box_ops.box_cxcywh_to_xyxy(src_boxes), + box_ops.box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], + mode="bilinear", align_corners=False) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'labels': self.loss_labels, + 'cardinality': self.loss_cardinality, + 'boxes': self.loss_boxes, + 'masks': self.loss_masks + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. 
+ continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs = {'log': False} + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k + f'_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +class PostProcess(nn.Module): + """ This module converts the model's output into the format expected by the coco api""" + @torch.no_grad() + def forward(self, outputs, target_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch + For evaluation, this must be the original image size (before any data augmentation) + For visualization, this should be the image size after data augment, but before padding + """ + out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] + + assert len(out_logits) == len(target_sizes) + assert target_sizes.shape[1] == 2 + + prob = F.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + sc = torch.ones_like(scale_fct) + value = torch.max(scale_fct.half(), dim=1)[0] + value = torch.unsqueeze(value, dim=0).t() + scale_fct_value = sc * value + sc_value = (scale_fct_value - scale_fct) / 2 + sc_ex = torch.unsqueeze(sc_value, dim=1) + boxes = boxes * scale_fct_value[:, None, :] + boxes_one = torch.ones_like(boxes) + boxex_ten = boxes_one * sc_ex + boxes = boxes - boxex_ten + + results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] + + return results + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def build(args): + # the `num_classes` naming here is somewhat misleading. + # it indeed corresponds to `max_obj_id + 1`, where max_obj_id + # is the maximum id for a class in your dataset. For example, + # COCO has a max_obj_id of 90, so we pass `num_classes` to be 91. + # As another example, for a dataset that has a single class with id 1, + # you should pass `num_classes` to be 2 (max_obj_id + 1). 
+ # For more details on this, check the following discussion + # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223 + num_classes = 20 if args.dataset_file != 'coco' else 91 + if args.dataset_file == "coco_panoptic": + # for panoptic, we just add a num_classes that is large enough to hold + # max_obj_id + 1, but the exact value doesn't really matter + num_classes = 250 + device = torch.device(args.device) + + backbone = build_backbone(args) + + transformer = build_transformer(args) + + model = DETR( + backbone, + transformer, + num_classes=num_classes, + num_queries=args.num_queries, + aux_loss=args.aux_loss, + ) + if args.masks: + model = DETRsegm(model, freeze_detr=(args.frozen_weights is not None)) + matcher = build_matcher(args) + weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} + weight_dict['loss_giou'] = args.giou_loss_coef + if args.masks: + weight_dict["loss_mask"] = args.mask_loss_coef + weight_dict["loss_dice"] = args.dice_loss_coef + # TODO this is a hack + if args.aux_loss: + aux_weight_dict = {} + for i in range(args.dec_layers - 1): + aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + losses = ['labels', 'boxes', 'cardinality'] + if args.masks: + losses += ["masks"] + print('losses',losses) + + criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, + eos_coef=args.eos_coef, losses=losses) + criterion.to(device) + postprocessors = {'bbox': PostProcess()} + if args.masks: + postprocessors['segm'] = PostProcessSegm() + if args.dataset_file == "coco_panoptic": + is_thing_map = {i: i <= 90 for i in range(201)} + postprocessors["panoptic"] = PostProcessPanoptic(is_thing_map, threshold=0.85) + + return model, criterion, postprocessors diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/matcher.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..7a4483e4c49d2d679bb48eef87a642695be05ba3 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/matcher.py @@ -0,0 +1,95 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Modules to compute the matching cost and solve the corresponding LSAP. +""" +import torch +from scipy.optimize import linear_sum_assignment +from torch import nn + +from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou + + +class HungarianMatcher(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). 
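Editor's note (not part of the diff): the matcher below delegates the actual assignment to SciPy. A toy example with made-up costs, showing how linear_sum_assignment resolves a 4-query / 2-target cost matrix for one image:

import torch
from scipy.optimize import linear_sum_assignment

# Rows = 4 predictions, columns = 2 ground-truth objects; lower cost is better.
cost = torch.tensor([[0.9, 0.2],
                     [0.1, 0.8],
                     [0.7, 0.7],
                     [0.6, 0.3]])
row_ind, col_ind = linear_sum_assignment(cost.numpy())
# Every target gets exactly one prediction; unmatched predictions are supervised as "no object".
print(list(zip(row_ind.tolist(), col_ind.tolist())))  # [(0, 1), (1, 0)]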
+ """ + + def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost + cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost + """ + super().__init__() + self.cost_class = cost_class + self.cost_bbox = cost_bbox + self.cost_giou = cost_giou + assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" + + @torch.no_grad() + def forward(self, outputs, targets): + """ Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["pred_logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + + targets_labels = [v["labels"].to('cpu') for v in targets] + targets_boxes = [v["boxes"].to('cpu') for v in targets] + + tgt_ids = torch.cat(targets_labels).to('npu') + tgt_bbox = torch.cat(targets_boxes).to('npu') + + # tgt_ids = torch.cat([v["labels"] for v in targets]) + # tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. 
+ cost_class = -out_prob[:, tgt_ids] + + # Compute the L1 cost between boxes + cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost betwen boxes + + cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + + # Final cost matrix + C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou + C = C.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +def build_matcher(args): + return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/position_encoding.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..6afc9190d485dc8b94d623fa398d37aba8a47b9f --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/position_encoding.py @@ -0,0 +1,91 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Various positional encodings for the transformer. +""" +import math +import torch +from torch import nn + +from util.misc import NestedTensor + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + # not_mask = ~mask + not_mask = (~mask).float() + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class PositionEmbeddingLearned(nn.Module): + """ + Absolute pos embedding, learned. 
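Editor's note (not part of the diff): a quick illustrative shape check for the sine position embedding defined above (mask and normalization omitted, dummy sizes). With hidden_dim = 256, each spatial axis receives num_pos_feats = 128 channels and the result matches the backbone feature map layout.

import torch

num_pos_feats, temperature = 128, 10000
b, h, w = 2, 25, 38
not_mask = torch.ones(b, h, w)
y_embed = not_mask.cumsum(1)
x_embed = not_mask.cumsum(2)

dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)

pos_x = x_embed[:, :, :, None] / dim_t
pos_y = y_embed[:, :, :, None] / dim_t
pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=4).flatten(3)
pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=4).flatten(3)
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
print(pos.shape)  # torch.Size([2, 256, 25, 38]) -- [batch, hidden_dim, H, W]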
+ """ + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + nn.init.uniform_(self.row_embed.weight) + nn.init.uniform_(self.col_embed.weight) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) + j = torch.arange(h, device=x.device) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + + pos = torch.cat([ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(args): + N_steps = args.hidden_dim // 2 + if args.position_embedding in ('v2', 'sine'): + # TODO find a better way of exposing other arguments + position_embedding = PositionEmbeddingSine(N_steps, normalize=True) + elif args.position_embedding in ('v3', 'learned'): + position_embedding = PositionEmbeddingLearned(N_steps) + else: + raise ValueError(f"not supported {args.position_embedding}") + + return position_embedding diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/segmentation.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..01faa8851838661a930440b5f6ccf68ca2e6fb8d --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/segmentation.py @@ -0,0 +1,363 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +This file provides the definition of the convolutional heads used to predict masks, as well as the losses +""" +import io +from collections import defaultdict +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from PIL import Image + +import util.box_ops as box_ops +from util.misc import NestedTensor, interpolate, nested_tensor_from_tensor_list + +try: + from panopticapi.utils import id2rgb, rgb2id +except ImportError: + pass + + +class DETRsegm(nn.Module): + def __init__(self, detr, freeze_detr=False): + super().__init__() + self.detr = detr + + if freeze_detr: + for p in self.parameters(): + p.requires_grad_(False) + + hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead + self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0.0) + self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim) + + def forward(self, samples: NestedTensor): + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + features, pos = self.detr.backbone(samples) + + bs = features[-1].tensors.shape[0] + + src, mask = features[-1].decompose() + assert mask is not None + src_proj = self.detr.input_proj(src) + hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1]) + + outputs_class = self.detr.class_embed(hs) + outputs_coord = self.detr.bbox_embed(hs).sigmoid() + out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]} + if self.detr.aux_loss: + out['aux_outputs'] = self.detr._set_aux_loss(outputs_class, outputs_coord) + + # FIXME h_boxes takes the last one computed, keep this in mind + bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask) + + seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors]) + 
outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + out["pred_masks"] = outputs_seg_masks + return out + + +def _expand(tensor, length: int): + return tensor.unsqueeze(1).repeat(1, int(length), 1, 1, 1).flatten(0, 1) + + +class MaskHeadSmallConv(nn.Module): + """ + Simple convolutional head, using group norm. + Upsampling is done using a FPN approach + """ + + def __init__(self, dim, fpn_dims, context_dim): + super().__init__() + + inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = torch.nn.GroupNorm(8, dim) + self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) + self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) + self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) + self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) + self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) + self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1) + + self.dim = dim + + self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, a=1) + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): + x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + + x = self.lay1(x) + x = self.gn1(x) + x = F.relu(x) + x = self.lay2(x) + x = self.gn2(x) + x = F.relu(x) + + cur_fpn = self.adapter1(fpns[0]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay3(x) + x = self.gn3(x) + x = F.relu(x) + + cur_fpn = self.adapter2(fpns[1]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay4(x) + x = self.gn4(x) + x = F.relu(x) + + cur_fpn = self.adapter3(fpns[2]) + if cur_fpn.size(0) != x.size(0): + cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) + x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = self.lay5(x) + x = self.gn5(x) + x = F.relu(x) + + x = self.out_lay(x) + return x + + +class MHAttentionMap(nn.Module): + """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" + + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): + super().__init__() + self.num_heads = num_heads + self.hidden_dim = hidden_dim + self.dropout = nn.Dropout(dropout) + + self.q_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + self.k_linear = nn.Linear(query_dim, hidden_dim, bias=bias) + + nn.init.zeros_(self.k_linear.bias) + nn.init.zeros_(self.q_linear.bias) + nn.init.xavier_uniform_(self.k_linear.weight) + nn.init.xavier_uniform_(self.q_linear.weight) + self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 + + def forward(self, q, k, mask: Optional[Tensor] = None): + q = self.q_linear(q) + k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + qh = 
q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) + kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) + + if mask is not None: + weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) + weights = F.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = self.dropout(weights) + return weights + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +class PostProcessSegm(nn.Module): + def __init__(self, threshold=0.5): + super().__init__() + self.threshold = threshold + + @torch.no_grad() + def forward(self, results, outputs, orig_target_sizes, max_target_sizes): + assert len(orig_target_sizes) == len(max_target_sizes) + max_h, max_w = max_target_sizes.max(0)[0].tolist() + outputs_masks = outputs["pred_masks"].squeeze(2) + outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) + outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu() + + for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + img_h, img_w = t[0], t[1] + results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) + results[i]["masks"] = F.interpolate( + results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" + ).byte() + + return results + + +class PostProcessPanoptic(nn.Module): + """This class converts the output of the model to the final panoptic result, in the format expected by the + coco panoptic API """ + + def __init__(self, is_thing_map, threshold=0.85): + """ + Parameters: + is_thing_map: This is a whose keys are the class ids, and the values a boolean indicating whether + the class is a thing (True) or a stuff (False) class + threshold: confidence threshold: segments with confidence lower than this will be deleted + """ + 
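Editor's note (not part of the diff): a toy sanity check, illustrative only, for the dice_loss and sigmoid_focal_loss defined above. The helpers below restate the same formulas for a single flattened mask; a confident correct prediction should score far lower than a confident wrong one.

import torch
import torch.nn.functional as F

def dice(logits, targets):
    # Same formula as dice_loss above, specialized to one already-flattened mask.
    probs = logits.sigmoid()
    numerator = 2 * (probs * targets).sum(1)
    denominator = probs.sum(1) + targets.sum(1)
    return (1 - (numerator + 1) / (denominator + 1)).mean()

def focal(logits, targets, alpha=0.25, gamma=2):
    prob = logits.sigmoid()
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce * (1 - p_t) ** gamma
    return ((alpha * targets + (1 - alpha) * (1 - targets)) * loss).mean()

targets = torch.tensor([[1.0, 1.0, 0.0, 0.0]])
good = torch.tensor([[8.0, 8.0, -8.0, -8.0]])  # confident and correct
bad = -good                                    # confident and wrong
print(dice(good, targets) < dice(bad, targets))    # tensor(True)
print(focal(good, targets) < focal(bad, targets))  # tensor(True)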
super().__init__() + self.threshold = threshold + self.is_thing_map = is_thing_map + + def forward(self, outputs, processed_sizes, target_sizes=None): + """ This function computes the panoptic prediction from the model's predictions. + Parameters: + outputs: This is a dict coming directly from the model. See the model doc for the content. + processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the + model, ie the size after data augmentation but before batching. + target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size + of each prediction. If left to None, it will default to the processed_sizes + """ + if target_sizes is None: + target_sizes = processed_sizes + assert len(processed_sizes) == len(target_sizes) + out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"] + assert len(out_logits) == len(raw_masks) == len(target_sizes) + preds = [] + + def to_tuple(tup): + if isinstance(tup, tuple): + return tup + return tuple(tup.cpu().tolist()) + + for cur_logits, cur_masks, cur_boxes, size, target_size in zip( + out_logits, raw_masks, raw_boxes, processed_sizes, target_sizes + ): + # we filter empty queries and detection below threshold + scores, labels = cur_logits.softmax(-1).max(-1) + keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold) + cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) + cur_scores = cur_scores[keep] + cur_classes = cur_classes[keep] + cur_masks = cur_masks[keep] + cur_masks = interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep]) + + h, w = cur_masks.shape[-2:] + assert len(cur_boxes) == len(cur_classes) + + # It may be that we have several predicted masks for the same stuff class. 
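Editor's note (not part of the diff): the panoptic merge performed below assigns every pixel to the query whose soft mask response is strongest at that pixel. A tiny illustration with made-up numbers of that argmax step:

import torch

h, w = 3, 4
cur_masks = torch.randn(5, h * w)  # 5 kept queries, masks already flattened as in the code below
m_id = cur_masks.transpose(0, 1).softmax(-1).argmax(-1).view(h, w)
print(m_id)                        # each entry in [0, 5): the winning query id per pixel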
+ # In the following, we track the list of masks ids for each stuff class (they are merged later on) + cur_masks = cur_masks.flatten(1) + stuff_equiv_classes = defaultdict(lambda: []) + for k, label in enumerate(cur_classes): + if not self.is_thing_map[label.item()]: + stuff_equiv_classes[label.item()].append(k) + + def get_ids_area(masks, scores, dedup=False): + # This helper function creates the final panoptic segmentation image + # It also returns the area of the masks that appears on the image + + m_id = masks.transpose(0, 1).softmax(-1) + + if m_id.shape[-1] == 0: + # We didn't detect any mask :( + m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + else: + m_id = m_id.argmax(-1).view(h, w) + + if dedup: + # Merge the masks corresponding to the same stuff class + for equiv in stuff_equiv_classes.values(): + if len(equiv) > 1: + for eq_id in equiv: + m_id.masked_fill_(m_id.eq(eq_id), equiv[0]) + + final_h, final_w = to_tuple(target_size) + + seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + + np_seg_img = ( + torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy() + ) + m_id = torch.from_numpy(rgb2id(np_seg_img)) + + area = [] + for i in range(len(scores)): + area.append(m_id.eq(i).sum().item()) + return area, seg_img + + area, seg_img = get_ids_area(cur_masks, cur_scores, dedup=True) + if cur_classes.numel() > 0: + # We know filter empty masks as long as we find some + while True: + filtered_small = torch.as_tensor( + [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + ) + if filtered_small.any().item(): + cur_scores = cur_scores[~filtered_small] + cur_classes = cur_classes[~filtered_small] + cur_masks = cur_masks[~filtered_small] + area, seg_img = get_ids_area(cur_masks, cur_scores) + else: + break + + else: + cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + + segments_info = [] + for i, a in enumerate(area): + cat = cur_classes[i].item() + segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a}) + del cur_classes + + with io.BytesIO() as out: + seg_img.save(out, format="PNG") + predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + preds.append(predictions) + return preds diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/transformer.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..dcd536750acbfea7e4d514acb6e60154dc28ddbd --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/models/transformer.py @@ -0,0 +1,297 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Transformer class. 
+ +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +import copy +from typing import Optional, List + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + + +class Transformer(nn.Module): + + def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, + num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False, + return_intermediate_dec=False): + super().__init__() + + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, + return_intermediate=return_intermediate_dec) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, query_embed, pos_embed): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + mask = mask.flatten(1) + + tgt = torch.zeros_like(query_embed) + memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) + hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, + pos=pos_embed, query_pos=query_embed) + return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer(output, src_mask=mask, + src_key_padding_mask=src_key_padding_mask, pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer(output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, query_pos=query_pos) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: 
+ intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, 
query_pos) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def build_transformer(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + ) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/requirements.txt b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb8f7823ba7a47d8edcf966f7652a8bf4ddb86ea --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/requirements.txt @@ -0,0 +1,9 @@ +cython +git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI&egg=pycocotools +submitit +torch>=1.5.0 +torchvision>=0.6.0 +git+https://github.com/cocodataset/panopticapi.git#egg=panopticapi +scipy +onnx +onnxruntime diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/run_with_submitit.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/run_with_submitit.py new file mode 100644 index 
0000000000000000000000000000000000000000..b6780def01e2f3266b24889403f11d95fffddafe --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/run_with_submitit.py @@ -0,0 +1,111 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +A script to run multinode training with submitit. +""" +import argparse +import os +import uuid +from pathlib import Path + +import main as detection +import submitit + + +def parse_args(): + detection_parser = detection.get_args_parser() + parser = argparse.ArgumentParser("Submitit for detection", parents=[detection_parser]) + parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") + parser.add_argument("--nodes", default=4, type=int, help="Number of nodes to request") + parser.add_argument("--timeout", default=60, type=int, help="Duration of the job") + parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") + return parser.parse_args() + + +def get_shared_folder() -> Path: + user = os.getenv("USER") + if Path("/checkpoint/").is_dir(): + p = Path(f"/checkpoint/{user}/experiments") + p.mkdir(exist_ok=True) + return p + raise RuntimeError("No shared folder available") + + +def get_init_file(): + # Init file must not exist, but it's parent dir must exist. + os.makedirs(str(get_shared_folder()), exist_ok=True) + init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" + if init_file.exists(): + os.remove(str(init_file)) + return init_file + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + import main as detection + + self._setup_gpu_args() + detection.main(self.args) + + def checkpoint(self): + import os + import submitit + from pathlib import Path + + self.args.dist_url = get_init_file().as_uri() + checkpoint_file = os.path.join(self.args.output_dir, "checkpoint.pth") + if os.path.exists(checkpoint_file): + self.args.resume = checkpoint_file + print("Requeuing ", self.args) + empty_trainer = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty_trainer) + + def _setup_gpu_args(self): + import submitit + from pathlib import Path + + job_env = submitit.JobEnvironment() + self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) + self.args.gpu = job_env.local_rank + self.args.rank = job_env.global_rank + self.args.world_size = job_env.num_tasks + print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + + +def main(): + args = parse_args() + if args.job_dir == "": + args.job_dir = get_shared_folder() / "%j" + + # Note that the folder will depend on the job_id, to easily track experiments + executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) + + # cluster setup is defined by environment variables + num_gpus_per_node = args.ngpus + nodes = args.nodes + timeout_min = args.timeout + + executor.update_parameters( + mem_gb=40 * num_gpus_per_node, + gpus_per_node=num_gpus_per_node, + tasks_per_node=num_gpus_per_node, # one task per GPU + cpus_per_task=10, + nodes=nodes, + timeout_min=timeout_min, # max is 60 * 72 + ) + + executor.update_parameters(name="detr") + + args.dist_url = get_init_file().as_uri() + args.output_dir = args.job_dir + + trainer = Trainer(args) + job = executor.submit(trainer) + + print("Submitted job_id:", job.job_id) + + +if __name__ == "__main__": + main() diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/env_npu.sh 
b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/env_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..280fca96da61c2d983f9a690fd0c044bbcba9167 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/env_npu.sh @@ -0,0 +1,71 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Host侧Event日志开启标志,0-关闭/1-开启 +export 
ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +#设置Device侧日志等级为error +${install_path}/driver/tools/msnpureport -g error +#关闭Device侧Event日志 +${install_path}/driver/tools/msnpureport -e disable +export BMMV2_ENABLE=1 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/gcc7.3.0/lib64:${LD_LIBRARY_PATH} \ No newline at end of file diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_full_8p.sh b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..f5c6678e2234a3ed7df73137b2b02fa6097eeaa4 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_full_8p.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DETR_for_PyTorch" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=300 +# 学习率 +learning_rate=0.0001 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
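For context, train_full_8p.sh (like the performance scripts added later in this patch) treats --data_path as mandatory, accepts optional overrides such as --workers, sources test/env_npu.sh on its own when no platform flag is set, and changes into the model root automatically if launched from inside test/. A minimal launch sketch, with a placeholder dataset path that is purely illustrative:

    # only the COCO dataset path is required; --workers is optional
    bash test/train_full_8p.sh --data_path=/path/to/coco --workers=64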
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + + + +device_id_list=0,1,2,3,4,5,6,7 +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u train_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --multiprocessing_distributed \ + --dist_url='tcp://127.0.0.1:50000' \ + --dist_backend='hccl' \ + --epochs=${train_epochs} \ + --lr=${learning_rate} \ + --world_size=1 \ + --batch_size=${batch_size} \ + --device_num=8 \ + --rank=0 \ + --device_list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' test/output/0/train_0.log|awk -F " " '{print $NF}'|awk -F ":" '{print $2}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# #输出训练精度,需要模型审视修改 +train_accuracy=`grep -a "Average Precision" test/output/0/train_0.log|awk -F "=" '{print $NF}'|awk 'END {print}'` +# #打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git 
a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_1p.sh b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..6e9b33650979cfc0c46f96e4afd8899f5a66578b --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_1p.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DETR_for_PyTorch" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=1 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + + +nohup taskset -c 0-23 python3.7 -u train_npu.py \ + --coco_path=${data_path} \ + --workers=${workers} \ + --gpu=${ASCEND_DEVICE_ID} \ + --epochs=${train_epochs} \ + --opt_level='O0' \ + --batch_size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' test/output/0/train_0.log|awk -F " " '{print $NF}'|awk -F ":" '{print $2}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# #输出训练精度,需要模型审视修改 +train_accuracy=`grep -a "Average Precision" test/output/0/train_0.log|awk -F "=" '{print $NF}'|awk 'END {print}'` +# #打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep -a loss 
${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F "\t" '{print $2}'|awk -F ":" '{print $2}'|awk 'END {print}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log + + diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_8p.sh b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..c4182b7a1f04ec6a40aa4020df6270bb6172af21 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/test/train_performance_8p.sh @@ -0,0 +1,147 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="DETR_for_PyTorch" +# 训练batch_size +batch_size=8 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=1 +# 学习率 +learning_rate=0.0001 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
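A note on the metrics reported by these test scripts: TrainingTime is derived from the measured throughput as batch_size * 1000 / FPS, i.e. the average duration of one training iteration in milliseconds. A worked example with illustrative numbers only:

    # with batch_size=8 and a measured FPS of 40 images/sec:
    awk 'BEGIN{printf "%.2f\n", 8*1000/40}'   # -> 200.00 (ms per iteration)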
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + + + +device_id_list=0,1,2,3,4,5,6,7 +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u train_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --multiprocessing_distributed \ + --dist_url='tcp://127.0.0.1:50000' \ + --dist_backend='hccl' \ + --epochs=${train_epochs} \ + --lr=${learning_rate} \ + --world_size=1 \ + --batch_size=${batch_size} \ + --device_num=8 \ + --rank=0 \ + --device_list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'FPS' test/output/0/train_0.log|awk -F " " '{print $NF}'|awk -F ":" '{print $2}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +# #输出训练精度,需要模型审视修改 +train_accuracy=`grep -a "Average Precision" test/output/0/train_0.log|awk -F "=" '{print $NF}'|awk 'END {print}'` +# #打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git 
a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/train_npu.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/train_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..6f7358263440c5a5c417ba68533ae5953d26554b --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/train_npu.py @@ -0,0 +1,375 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import argparse +import datetime +import json +import random +import time +from pathlib import Path + +import numpy as np +import torch +from torch.utils.data import DataLoader, DistributedSampler + +import datasets +import util.misc as utils +from datasets import build_dataset, get_coco_api_from_dataset +from engine import evaluate, train_one_epoch +from models import build_model + +import apex +from apex import amp +from apex.parallel import convert_syncbn_model +from apex.parallel import DistributedDataParallel +import torch.distributed as dist +import os +import warnings + +def get_args_parser(): + parser = argparse.ArgumentParser('Set transformer detector', add_help=False) + parser.add_argument('--lr', default=1e-4, type=float) + parser.add_argument('--lr_backbone', default=1e-5, type=float) + parser.add_argument('--batch_size', default=8, type=int) + parser.add_argument('--weight_decay', default=1e-4, type=float) + parser.add_argument('--epochs', default=400, type=int) + parser.add_argument('--lr_drop', default=200, type=int) + parser.add_argument('--clip_max_norm', default=0.1, type=float, + help='gradient clipping max norm') + + # Model parameters + parser.add_argument('--frozen_weights', type=str, default=None, + help="Path to the pretrained model. If set, only the mask head will be trained") + # * Backbone + parser.add_argument('--backbone', default='resnet50', type=str, + help="Name of the convolutional backbone to use") + parser.add_argument('--dilation', action='store_true', + help="If true, we replace stride with dilation in the last convolutional block (DC5)") + parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), + help="Type of positional embedding to use on top of the image features") + + # * Transformer + parser.add_argument('--enc_layers', default=6, type=int, + help="Number of encoding layers in the transformer") + parser.add_argument('--dec_layers', default=6, type=int, + help="Number of decoding layers in the transformer") + parser.add_argument('--dim_feedforward', default=2048, type=int, + help="Intermediate size of the feedforward layers in the transformer blocks") + parser.add_argument('--hidden_dim', default=256, type=int, + help="Size of the embeddings (dimension of the transformer)") + parser.add_argument('--dropout', default=0.1, type=float, + help="Dropout applied in the transformer") + parser.add_argument('--nheads', default=8, type=int, + help="Number of attention heads inside the transformer's attentions") + parser.add_argument('--num_queries', default=100, type=int, + help="Number of query slots") + parser.add_argument('--pre_norm', action='store_true') + + # * Segmentation + parser.add_argument('--masks', action='store_true', + help="Train segmentation head if the flag is provided") + + # Loss + parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false', + help="Disables auxiliary decoding losses (loss at each layer)") + # * Matcher + parser.add_argument('--set_cost_class', default=1, type=float, + help="Class coefficient in the matching cost") + parser.add_argument('--set_cost_bbox', default=5, 
type=float, + help="L1 box coefficient in the matching cost") + parser.add_argument('--set_cost_giou', default=2, type=float, + help="giou box coefficient in the matching cost") + # * Loss coefficients + parser.add_argument('--mask_loss_coef', default=1, type=float) + parser.add_argument('--dice_loss_coef', default=1, type=float) + parser.add_argument('--bbox_loss_coef', default=5, type=float) + parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--eos_coef', default=0.1, type=float, + help="Relative classification weight of the no-object class") + + # dataset parameters + parser.add_argument('--dataset_file', default='coco') + parser.add_argument('--coco_path', type=str, default='/opt/npu/dataset/coco') + parser.add_argument('--coco_panoptic_path', type=str) + parser.add_argument('--remove_difficult', action='store_true') + + parser.add_argument('--output_dir', default='', + help='path where to save, empty for no saving') + parser.add_argument('--device', default='npu', + help='device to use for training / testing') + parser.add_argument('--seed', default=42, type=int) + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true') + parser.add_argument('--num_workers', default=8, type=int) + + # edit this for 8p + parser.add_argument('--dist-backend', type=str, default='hccl') + parser.add_argument('--distributed', type=bool, default=True) + parser.add_argument('--world-size', type=int, default=-1) + parser.add_argument('--rank', type=int, default=-1) + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument('--addr', type=str, default='127.0.0.1') + parser.add_argument('--device_num', type=int, default=-1) + parser.add_argument('--workers', type=int, default=32) + parser.add_argument('--device-list', default='', type=str) + parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:50000') + parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') + parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') + warnings.filterwarnings('ignore') + #############end################# + return parser + +def main(args): + torch.manual_seed(args.seed) + ############################## + # edit this for 8p + os.environ['MASTER_ADDR'] = args.addr + os.environ['MASTER_PORT'] = '29888' + os.environ['LOCAL_DEVICE_ID'] = str(0) + print("+++++++++++++++++++++++++++LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + print('==========>args.world_size: ', args.world_size) + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + if args.device_list != '': + ngpus_per_node = len(args.device_list.split(',')) + elif args.device_num != -1: + ngpus_per_node = args.device_num + elif args.device == 'npu': + ngpus_per_node = int(os.environ["RANK_SIZE"]) + else: + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + # The child process uses the environment variables of the parent process, + # we have to set LOCAL_DEVICE_ID for every proc + if args.device == 'npu': + main_worker(args.local_rank, ngpus_per_node, args) + else: + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + ############################## + +def main_worker(gpu, ngpus_per_node, args): + + if args.frozen_weights is not None: + assert args.masks, "Frozen training is meant for segmentation only" + #####################begin############################## + if args.device_list != '': + print(args.device_list) + args.gpu = int(args.device_list.split(',')[gpu]) + else: + args.gpu = gpu + + print("[npu id:", args.gpu, "]", "++++++++++++++++ before set LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + os.environ['LOCAL_DEVICE_ID'] = str(args.gpu) + print("[npu id:", args.gpu, "]", "++++++++++++++++ LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + + if args.gpu is not None: + print("[npu id:", args.gpu, "]", "Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, # init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(loc) + + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + print("[npu id:", args.gpu, "]", args) + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + ##################end################ + + # device = torch.device(args.device) + + # fix the seed for reproducibility + seed = 
args.seed + utils.get_rank() + np.random.seed(seed) + random.seed(seed) + + model, criterion, postprocessors = build_model(args) + # model.to(device) + # model = convert_syncbn_model(model) + model = model.to(loc) + model_without_ddp = model + param_dicts = [ + {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, + { + "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], + "lr": args.lr_backbone, + }, + ] + + # utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + + optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, + weight_decay=args.weight_decay) + + # optimizer = apex.optimizers.NpuFusedAdamW(param_dicts, lr=args.lr, + # weight_decay=args.weight_decay) + + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) + + # model.to(device) + opt_level = 'O0' + model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level) + for ls in amp._amp_state.loss_scalers: + ls._scale_seq_len = 50 + ls._loss_scale = 2. ** 24 + + + if args.distributed: + # model = DistributedDataParallel(model) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) + model_without_ddp = model.module + # n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + n_parameters = sum(p.numel() for p in model.parameters()) + print('number of params:', n_parameters) + + dataset_train = build_dataset(image_set='train', args=args) + dataset_val = build_dataset(image_set='val', args=args) + + if args.distributed: + sampler_train = DistributedSampler(dataset_train) + sampler_val = DistributedSampler(dataset_val, shuffle=False) + else: + sampler_train = torch.utils.data.RandomSampler(dataset_train) + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + batch_sampler_train = torch.utils.data.BatchSampler( + sampler_train, args.batch_size, drop_last=True) + + data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, + collate_fn=utils.collate_fn, num_workers=args.num_workers) + data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, + drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) + + if args.dataset_file == "coco_panoptic": + # We also evaluate AP during panoptic training, on original coco DS + coco_val = datasets.coco.build("val", args) + base_ds = get_coco_api_from_dataset(coco_val) + else: + base_ds = get_coco_api_from_dataset(dataset_val) + + if args.frozen_weights is not None: + checkpoint = torch.load(args.frozen_weights, map_location='cpu') + model_without_ddp.detr.load_state_dict(checkpoint['model']) + + output_dir = Path(args.output_dir) + if args.resume: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + + if args.eval: + test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, + data_loader_val, base_ds, loc, args.output_dir) + if args.output_dir: + utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, 
output_dir / "eval.pth") + return + + print("Start training") + start_time = time.time() + best = 0 + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + sampler_train.set_epoch(epoch) + + train_stats = train_one_epoch( + model, criterion, data_loader_train, optimizer, loc, epoch, + args.clip_max_norm) + lr_scheduler.step() + + if args.output_dir: + checkpoint_paths = [output_dir / 'checkpoint.pth'] + # extra checkpoint before LR drop and every 100 epochs + if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: + checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') + for checkpoint_path in checkpoint_paths: + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'args': args, + }, checkpoint_path) + + test_stats, coco_evaluator = evaluate( + model, criterion, postprocessors, data_loader_val, base_ds, loc, args.output_dir + ) + + map = coco_evaluator.coco_eval['bbox'].stats[0] + if map >= best: + print(map) + best = map + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'epoch': epoch, + 'args': args, + }, 'output/checkpoint_{}.pth'.format(map)) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + **{f'test_{k}': v for k, v in test_stats.items()}, + 'epoch': epoch, + 'n_parameters': n_parameters} + + if args.output_dir and utils.is_main_process(): + with (output_dir / "log.txt").open("a") as f: + f.write(json.dumps(log_stats) + "\n") + + # for evaluation logs + if coco_evaluator is not None: + (output_dir / 'eval').mkdir(exist_ok=True) + if "bbox" in coco_evaluator.coco_eval: + filenames = ['latest.pth'] + if epoch % 50 == 0: + filenames.append(f'{epoch:03}.pth') + for name in filenames: + torch.save(coco_evaluator.coco_eval["bbox"].eval, + output_dir / "eval" / name) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()]) + args = parser.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__init__.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ce80ead7a8df0dfa09aa3e9fcd147d99516d47a Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75ce5c70ceb5cb340aed14946d0ec6cec79cec06 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/__init__.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..38266ba3db12c3b64edef438c8b1c9e8d99964e5 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af354e4cc519a03b4350ce2dab1c2d0eb96df0b2 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/box_ops.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-36.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cbd42f01283591f7d74d3a07653b1a6c2acf7f8c Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-36.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-37.pyc b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b3b4b214ac31f0fde1248254220e74a874f5f27 Binary files /dev/null and b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/__pycache__/misc.cpython-37.pyc differ diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/box_ops.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/box_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..9c088e5bacc88ff7217fc971f5db889f5bb45b39 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/box_ops.py @@ -0,0 +1,88 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Utilities for bounding box manipulation and GIoU. 
+""" +import torch +from torchvision.ops.boxes import box_area + + +def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), + (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def box_xyxy_to_cxcywh(x): + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, + (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/ + + The boxes should be in [x0, y0, x1, y1] format + + Returns a [N, M] pairwise matrix, where N = len(boxes1) + and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / area + + +def masks_to_boxes(masks): + """Compute the bounding boxes around the provided masks + + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. + + Returns a [N, 4] tensors, with the boxes in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + + y = torch.arange(0, h, dtype=torch.float) + x = torch.arange(0, w, dtype=torch.float) + y, x = torch.meshgrid(y, x) + + x_mask = (masks * x.unsqueeze(0)) + x_max = x_mask.flatten(1).max(-1)[0] + x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + y_mask = (masks * y.unsqueeze(0)) + y_max = y_mask.flatten(1).max(-1)[0] + y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + return torch.stack([x_min, y_min, x_max, y_max], 1) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/misc.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..e08159ebe1860387ac74dc913004454e9d26cbf9 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/misc.py @@ -0,0 +1,489 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Misc functions, including distributed helpers. +Mostly copy-paste from torchvision references. +""" +import os +import subprocess +import time +from collections import defaultdict, deque +import datetime +import pickle +from typing import Optional, List + +import torch +import torch.distributed as dist +from torch import Tensor + +# needed due to empty tensor bug in pytorch and torchvision 0.5 +import torchvision + +if float(torchvision.__version__.split(".")[1]) < 7.0: + from torchvision.ops import _new_empty_tensor + from torchvision.ops.misc import _output_size + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! + """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float16, device='npu') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("npu") + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device="npu") + size_list = [torch.tensor([0], device="npu") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.empty((max_size,), dtype=torch.int8, device="npu")) + if local_size != max_size: + padding = torch.empty(size=(max_size - local_size,), dtype=torch.int8, device="npu") + tensor = torch.cat((tensor, padding), dim=0) + + dist.all_gather(tensor_list, tensor.char()) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. 
+ """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, batch_size, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + if torch.npu.is_available(): + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}', + 'max mem: {memory:.0f}', + 'FPS:{fps:.4f}' + ]) + else: + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ]) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + fps = 1 / (float(str(iter_time)) / batch_size) + if torch.npu.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.npu.max_memory_allocated() / MB, + fps=fps)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + + +def get_sha(): + cwd = os.path.dirname(os.path.abspath(__file__)) + + def _run(command): + return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() + + sha = 'N/A' + diff = "clean" + branch = 'N/A' + try: + sha = _run(['git', 'rev-parse', 'HEAD']) + subprocess.check_output(['git', 'diff'], cwd=cwd) + diff = _run(['git', 'diff-index', 'HEAD']) + diff = "has uncommited changes" if diff else "clean" + branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) + except Exception: + pass + message = f"sha: {sha}, 
status: {diff}, branch: {branch}" + return message + + +def collate_fn(batch): + batch = list(zip(*batch)) + batch[0] = nested_tensor_from_tensor_list(batch[0]) + return tuple(batch) + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + + # batch_shape = [len(tensor_list)] + [3,640,640] + # h,w=tensor_list[0].shape[::-2] + # pad_value = int(abs(h - w) / 2) + # if h > w: + # padded_image = torch.nn.functional.pad(tensor_list[0], (pad_value, pad_value, 0, 0)) + # else: + # padded_image = torch.nn.functional.pad(tensor_list[0], (0, 0, pad_value, pad_value)) + # padded_image=padded_image.resize_((3,640,640)) + # tensor_list=[padded_image] + # + + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + # print(pad_img.shape) + # print(img.shape) + # p + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], :img.shape[2]] = False + else: + raise ValueError('not supported') + return NestedTensor(tensor, mask) + + +# _onnx_nested_tensor_from_tensor_list() is an implementation of +# nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
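To make the padding behaviour of nested_tensor_from_tensor_list above concrete, here is a small sketch; the image sizes are invented purely for illustration, and it assumes the helpers defined in this file are in scope. Two images of different spatial sizes are zero-padded to the per-axis maximum, and the boolean mask is True exactly where a position is padding:

    import torch
    # two dummy images with different heights/widths (shapes are illustrative only)
    imgs = [torch.rand(3, 480, 640), torch.rand(3, 512, 600)]
    nt = nested_tensor_from_tensor_list(imgs)
    tensors, mask = nt.decompose()
    print(tensors.shape)  # torch.Size([2, 3, 512, 640]) -- padded to the max height/width
    print(mask.shape)     # torch.Size([2, 512, 640]); True marks padded positions

In DETR this mask is what ultimately feeds the key_padding_mask arguments of the attention layers seen earlier in transformer.py. The ONNX-traceable variant referred to in the comment above follows next.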
+@torch.jit.unused +def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: + # max_size = [] + # for i in range(tensor_list[0].dim()): + # max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64) + # max_size.append(max_size_i) + # max_size = tuple(max_size) + + # work around for + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # m[: img.shape[1], :img.shape[2]] = False + # which is not yet supported in onnx + padded_imgs = [] + padded_masks = [] + for img in tensor_list: + # padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] + padding = [torch.tensor(0), torch.tensor(0), torch.tensor(0)] + padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) + padded_imgs.append(padded_img) + + m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) + padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) + padded_masks.append(padded_mask.to(torch.bool)) + + tensor = torch.stack(padded_imgs) + mask = torch.stack(padded_masks) + + return NestedTensor(tensor, mask=mask) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.npu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.npu = args.rank % torch.npu.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.npu.set_device(args.npu) + args.dist_backend = 'hccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +@torch.no_grad() +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + if target.numel() == 0: + return [torch.zeros([], device=output.device)] + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): + # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor + """ 
+ Equivalent to nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. + """ + if float(torchvision.__version__.split(".")[1]) < 7.0: + if input.numel() > 0: + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners + ) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/plot_utils.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/plot_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0f24bed0d3fe4624aeb231ddd02633f2e58e4bff --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/util/plot_utils.py @@ -0,0 +1,107 @@ +""" +Plotting utilities to visualize training logs. +""" +import torch +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt + +from pathlib import Path, PurePath + + +def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): + ''' + Function to plot specific fields from training log(s). Plots both training and test results. + + :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file + - fields = which results to plot from each log file - plots both training and test for each field. + - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots + - log_name = optional, name of log file if different than default 'log.txt'. + + :: Outputs - matplotlib plots of results in fields, color coded for each log file. + - solid lines are training results, dashed lines are test results. + + ''' + func_name = "plot_utils.py::plot_logs" + + # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, + # convert single Path to list to avoid 'not iterable' error + + if not isinstance(logs, list): + if isinstance(logs, PurePath): + logs = [logs] + print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") + else: + raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ + Expect list[Path] or single Path obj, received {type(logs)}") + + # Quality checks - verify valid dir(s), that every item in list is Path object, and that log_name exists in each dir + for i, dir in enumerate(logs): + if not isinstance(dir, PurePath): + raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") + if not dir.exists(): + raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") + # verify log_name exists + fn = Path(dir / log_name) + if not fn.exists(): + print(f"-> missing {log_name}. 
Have you gotten to Epoch 1 in training?") + print(f"--> full path of missing log file: {fn}") + return + + # load log file(s) and plot + dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] + + fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) + + for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): + for j, field in enumerate(fields): + if field == 'mAP': + coco_eval = pd.DataFrame( + np.stack(df.test_coco_eval_bbox.dropna().values)[:, 1] + ).ewm(com=ewm_col).mean() + axs[j].plot(coco_eval, c=color) + else: + df.interpolate().ewm(com=ewm_col).mean().plot( + y=[f'train_{field}', f'test_{field}'], + ax=axs[j], + color=[color] * 2, + style=['-', '--'] + ) + for ax, field in zip(axs, fields): + ax.legend([Path(p).name for p in logs]) + ax.set_title(field) + + +def plot_precision_recall(files, naming_scheme='iter'): + if naming_scheme == 'exp_id': + # name becomes exp_id + names = [f.parts[-3] for f in files] + elif naming_scheme == 'iter': + names = [f.stem for f in files] + else: + raise ValueError(f'not supported {naming_scheme}') + fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) + for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): + data = torch.load(f) + # precision is n_iou, n_points, n_cat, n_area, max_det + precision = data['precision'] + recall = data['params'].recThrs + scores = data['scores'] + # take precision for all classes, all areas and 100 detections + precision = precision[0, :, :, 0, -1].mean(1) + scores = scores[0, :, :, 0, -1].mean(1) + prec = precision.mean() + rec = data['recall'][0, :, 0, -1].mean() + print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + + f'score={scores.mean():0.3f}, ' + + f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' + ) + axs[0].plot(recall, precision, c=color) + axs[1].plot(recall, scores, c=color) + + axs[0].set_title('Precision / Recall') + axs[0].legend(names) + axs[1].set_title('Scores / Recall') + axs[1].legend(names) + return fig, axs diff --git a/PyTorch/contrib/cv/detection/DETR_for_PyTorch/validate.py b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/validate.py new file mode 100644 index 0000000000000000000000000000000000000000..a64e1b2400152648bcf7091a59ecb48dd8a11601 --- /dev/null +++ b/PyTorch/contrib/cv/detection/DETR_for_PyTorch/validate.py @@ -0,0 +1,261 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import argparse +import datetime +import os +import random +import time +from pathlib import Path +import numpy as np +import torch +from torch.utils.data import DataLoader, DistributedSampler +from datasets.coco_eval import CocoEvaluator +import util.misc as utils +from datasets import build_dataset, get_coco_api_from_dataset +from models import build_model +from util import box_ops + +def get_args_parser(): + parser = argparse.ArgumentParser('Set transformer detector', add_help=False) + parser.add_argument('--lr', default=1e-4, type=float) + parser.add_argument('--lr_backbone', default=1e-5, type=float) + parser.add_argument('--batch_size', default=1, type=int) + parser.add_argument('--weight_decay', default=1e-4, type=float) + parser.add_argument('--epochs', default=300, type=int) + parser.add_argument('--lr_drop', default=200, type=int) + parser.add_argument('--clip_max_norm', default=0.1, type=float, + help='gradient clipping max norm') + + # Model parameters + parser.add_argument('--frozen_weights', type=str, default=None, + help="Path to the pretrained model. 
If set, only the mask head will be trained") + # * Backbone + parser.add_argument('--backbone', default='resnet50', type=str, + help="Name of the convolutional backbone to use") + parser.add_argument('--dilation', action='store_true', + help="If true, we replace stride with dilation in the last convolutional block (DC5)") + parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'), + help="Type of positional embedding to use on top of the image features") + + # * Transformer + parser.add_argument('--enc_layers', default=6, type=int, + help="Number of encoding layers in the transformer") + parser.add_argument('--dec_layers', default=6, type=int, + help="Number of decoding layers in the transformer") + parser.add_argument('--dim_feedforward', default=2048, type=int, + help="Intermediate size of the feedforward layers in the transformer blocks") + parser.add_argument('--hidden_dim', default=256, type=int, + help="Size of the embeddings (dimension of the transformer)") + parser.add_argument('--dropout', default=0.1, type=float, + help="Dropout applied in the transformer") + parser.add_argument('--nheads', default=8, type=int, + help="Number of attention heads inside the transformer's attentions") + parser.add_argument('--num_queries', default=100, type=int, + help="Number of query slots") + parser.add_argument('--pre_norm', action='store_true') + + # * Segmentation + parser.add_argument('--masks', action='store_true', + help="Train segmentation head if the flag is provided") + + # Loss + parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false', + help="Disables auxiliary decoding losses (loss at each layer)") + # * Matcher + parser.add_argument('--set_cost_class', default=1, type=float, + help="Class coefficient in the matching cost") + parser.add_argument('--set_cost_bbox', default=5, type=float, + help="L1 box coefficient in the matching cost") + parser.add_argument('--set_cost_giou', default=2, type=float, + help="giou box coefficient in the matching cost") + # * Loss coefficients + parser.add_argument('--mask_loss_coef', default=1, type=float) + parser.add_argument('--dice_loss_coef', default=1, type=float) + parser.add_argument('--bbox_loss_coef', default=5, type=float) + parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--eos_coef', default=0.1, type=float, + help="Relative classification weight of the no-object class") + + # dataset parameters + parser.add_argument('--dataset_file', default='coco') + parser.add_argument('--coco_path', type=str,default='/home/xu/SJH/datasets/coco') + parser.add_argument('--coco_panoptic_path', type=str) + parser.add_argument('--remove_difficult', action='store_true') + + parser.add_argument('--output_dir', default='output', + help='path where to save, empty for no saving') + parser.add_argument('--device', default='cuda', + help='device to use for training / testing') + parser.add_argument('--seed', default=42, type=int) + parser.add_argument('--resume', default='model_file/detr.pth', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, metavar='N', + help='start epoch') + parser.add_argument('--eval', action='store_true',default=True,) + parser.add_argument('--num_workers', default=2, type=int) + + # distributed training parameters + parser.add_argument('--world_size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed 
training') + return parser + + +@torch.no_grad() +def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir): + model.eval() + criterion.eval() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) + header = 'Test:' + iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) + coco_evaluator = CocoEvaluator(base_ds, iou_types) + + # ort_session = onnxruntime.InferenceSession('model_file/detr_640.onnx') + for file,(samples, targets) in zip(os.listdir('/home/xu/xiaoxiong/effdet/coco_data/val2017'),metric_logger.log_every(data_loader, 10, header)): + samples = samples.to(device) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + print(samples.tensors.shape) + print(os.path.join('/home/xu/xiaoxiong/DETR/detr_bin','{}.bin'.format(file.split('.')[0]))) + samples.tensors.cpu().numpy().tofile(os.path.join('/home/xu/xiaoxiong/DETR/detr_bin','{}.bin'.format(file.split('.')[0]))) + + # ort_inputs = {ort_session.get_inputs()[0].name:samples.tensors.cpu().numpy()} + # # print('inputs',ort_inputs) + # ort_outs = ort_session.run(None, ort_inputs) + # out={'pred_logits':torch.from_numpy(ort_outs[0]).cuda(), + # 'pred_boxes':torch.from_numpy(ort_outs[1]).cuda()} + # outputs=out + # loss_dict=criterion(out,targets) + + outputs = model(samples) + loss_dict = criterion(outputs, targets) + weight_dict = criterion.weight_dict + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + loss_dict_reduced_scaled = {k: v * weight_dict[k] + for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_unscaled = {f'{k}_unscaled': v + for k, v in loss_dict_reduced.items()} + metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), + **loss_dict_reduced_scaled, + **loss_dict_reduced_unscaled) + metric_logger.update(class_error=loss_dict_reduced['class_error']) + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + + results = postprocessors['bbox'](outputs, orig_target_sizes) + + # print(len(results[0]['scores']),results[0]['scores']) + # print(results[0]['boxes']) + # print(results[0]['labels']) + # print(postprocessors.keys()) + + res = {target['image_id'].item(): output for target, output in zip(targets, results)} + + # if 2592 in res.keys(): + # print(orig_target_sizes) + # print(res[2592]) + # for i,value in enumerate(res[2592]['scores']): + # if value>0.5: + # print(i,value) + # print(res[2592]['boxes'][i]) + # p + coco_evaluator.update(res) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + coco_evaluator.accumulate() + coco_evaluator.summarize() + stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() + + return stats, coco_evaluator + +def main(args): + utils.init_distributed_mode(args) + print("git:\n {}\n".format(utils.get_sha())) + + if args.frozen_weights is not None: + assert args.masks, "Frozen training is meant for segmentation only" + print(args) + + device = torch.device(args.device) + + # fix the seed for reproducibility + seed = args.seed + utils.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + model, criterion, postprocessors = 
build_model(args) + model.to(device) + + model_without_ddp = model + print(args.distributed) + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) + print('number of params:', n_parameters) + + param_dicts = [ + {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, + { + "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], + "lr": args.lr_backbone, + }, + ] + optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, + weight_decay=args.weight_decay) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) + + + dataset_val = build_dataset(image_set='val', args=args) + + if args.distributed: + sampler_val = DistributedSampler(dataset_val, shuffle=False) + else: + sampler_val = torch.utils.data.SequentialSampler(dataset_val) + + data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, + drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) + + + base_ds = get_coco_api_from_dataset(dataset_val) + + if args.frozen_weights is not None: + checkpoint = torch.load(args.frozen_weights, map_location='cpu') + model_without_ddp.detr.load_state_dict(checkpoint['model']) + + output_dir = Path(args.output_dir) + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + + start_time = time.time() + if args.eval: + print('start validate') + test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, + data_loader_val, base_ds, device, args.output_dir) + if args.output_dir: + utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") + return + + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()]) + args = parser.parse_args() + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + main(args) diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/.keep b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Dockerfile b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..360861ede17fb0ab697fbcac190acde7c1e29fef --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Dockerfile @@ -0,0 +1,5 @@ +ARG FROM_IMAGE_NAME +FROM ${FROM_IMAGE_NAME} + +COPY requirements.txt . 
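+# Illustrative build command (the base image name is an assumption; pass whichever
+# Ascend-PyTorch base image is available in your environment):
+#   docker build --build-arg FROM_IMAGE_NAME=<ascend-pytorch-base-image> -t transformer-xl:latest .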
+RUN pip3.7 install -r requirements.txt
diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Readme.md b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..9575325dc67228acfddd34d75da5926015248460
--- /dev/null
+++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/Readme.md
@@ -0,0 +1,62 @@
+# Transformer-xl
+
+This implements training of Transformer-xl on the enwik8 dataset, mainly modified from [kimiyoung/transformer-xl](https://github.com/kimiyoung/transformer-xl/tree/master/pytorch).
+
+## Transformer-xl Detail
+
+As of the current date, Ascend-PyTorch is still inefficient for contiguous operations. Therefore, Transformer-xl is re-implemented with alternative semantics such as custom OPs.
+
+
+## Requirements
+
+- Install PyTorch ([pytorch.org](http://pytorch.org))
+- `pip install -r requirements.txt`
+
+## Data Preparation
+- `bash getdata.sh`
+
+## Training and Evaluation
+
+To train or evaluate a model, run the scripts under `test/` with the path to the enwik8 dataset:
+
+
+```bash
+#env
+cd transformer-xl
+dos2unix ./test/*.sh
+
+# 1p train perf
+bash test/train_performance_1p.sh --data_path=xxxx
+
+# 8p train perf
+bash test/train_performance_8p.sh --data_path=xxxx
+
+# 8p train full
+bash test/train_full_8p.sh --data_path=xxxx
+
+# 1p eval
+bash test/train_eval_1p.sh --data_path=xxxx --pth_path=xxxx
+
+```
+
+- Parameter description:
+```bash
+#--data //dataset path, can be changed to the directory of your own dataset
+#--restart_dir //path of the checkpoint to load, can be changed to your own model file
+#--addr //host address
+#--max_step //maximum number of training steps
+#--batch-size //training batch size
+#--lr //initial learning rate, default: 0.00025
+#--device-list //devices used for multi-card training, e.g. 8 cards: '0,1,2,3,4,5,6,7'
+#--amp //whether to use mixed precision
+#--loss-scale //loss scale value
+#--opt-level //mixed precision level
+```
+
+
+## Transformer-xl training result

+| bpc | FPS | Npu_nums | Epochs | AMP_Type |
+| :------: | :------: | :------: | :------: | :------: |
+| - | 8300 | 1 | 1 | O2 |
+| 1.09 | 44500 | 8 | 50 | O2 |
\ No newline at end of file
diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/data_utils.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/data_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3642f37ceff7853d0e1eecb85c1232eb5ead5bd0
--- /dev/null
+++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/data_utils.py
@@ -0,0 +1,258 @@
+import os, sys
+import glob
+import numpy as np
+import torch
+
+from utils.vocabulary import Vocab
+
+
+class LMOrderedIterator(object):
+    def __init__(self, data, bsz, bptt, device='npu:0', ext_len=None):
+        """
+            data -- LongTensor -- the LongTensor is strictly ordered
+        """
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+        self.device = device
+
+        # Work out how cleanly we can divide the dataset into bsz parts.
+        self.n_step = data.size(0) // bsz
+
+        # Trim off any extra elements that wouldn't cleanly fit (remainders).
+        data = data.narrow(0, 0, self.n_step * bsz)
+
+        # Evenly divide the data across the bsz batches.
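+        # For example, with a stream of 100 tokens and bsz=4: n_step is 25 and
+        # self.data below has shape [25, 4]; column b then holds tokens
+        # 25*b .. 25*b+24 as one contiguous stream.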
+ self.data = data.view(bsz, -1).t().contiguous().to(device) + + # Number of mini-batches + self.n_batch = (self.n_step + self.bptt - 1) // self.bptt + + def get_batch(self, i, bptt=None): + if bptt is None: bptt = self.bptt + seq_len = min(bptt, self.data.size(0) - 1 - i) + + end_idx = i + seq_len + beg_idx = max(0, i - self.ext_len) + + data = self.data[beg_idx:end_idx] + target = self.data[i+1:i+1+seq_len] + + return data, target, seq_len + + def get_fixlen_iter(self, start=0): + for i in range(start, self.data.size(0) - 1, self.bptt): + yield self.get_batch(i) + + def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3): + max_len = self.bptt + max_deviation * std + i = start + while True: + bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2. + bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='npu:0', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data = data.to(self.device) + target = target.to(self.device) + + yield data, target, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='npu:0', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return 
sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class Corpus(object): + def __init__(self, path, dataset, *args, **kwargs): + self.dataset = dataset + self.vocab = Vocab(*args, **kwargs) + + if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: + self.vocab.count_file(os.path.join(path, 'train.txt')) + self.vocab.count_file(os.path.join(path, 'valid.txt')) + self.vocab.count_file(os.path.join(path, 'test.txt')) + elif self.dataset == 'wt103': + self.vocab.count_file(os.path.join(path, 'train.txt')) + elif self.dataset == 'lm1b': + train_path_pattern = os.path.join( + path, '1-billion-word-language-modeling-benchmark-r13output', + 'training-monolingual.tokenized.shuffled', 'news.en-*') + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ['ptb', 'wt2', 'wt103']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True) + elif self.dataset in ['enwik8', 'text8']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True, add_eos=False) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True, add_eos=False) + elif self.dataset == 'lm1b': + self.train = train_paths + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + + def get_iterator(self, split, *args, **kwargs): + if split == 'train': + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == 'lm1b': + kwargs['shuffle'] = True + data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs) + elif split in ['valid', 'test.py']: + data = self.valid if split == 'valid' else self.test + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(data, *args, **kwargs) + elif self.dataset == 'lm1b': + data_iter = LMShuffledIterator(data, *args, **kwargs) + + return data_iter + + +def get_lm_corpus(datadir, dataset): + fn = os.path.join(datadir, 'cache.pt') + if os.path.exists(fn): + print('Loading cached dataset...') + corpus = torch.load(fn) + else: + print('Producing dataset {}...'.format(dataset)) + kwargs = {} + if dataset in ['wt103', 'wt2']: + kwargs['special'] = [''] + kwargs['lower_case'] = False + elif dataset == 'ptb': + kwargs['special'] = [''] + kwargs['lower_case'] = True + elif dataset == 'lm1b': + kwargs['special'] = [] + kwargs['lower_case'] = False + kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt') + elif dataset in ['enwik8', 'text8']: + pass + corpus = Corpus(datadir, dataset, **kwargs) + torch.save(corpus, fn) + return corpus + + diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/eval_npu.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/eval_npu.py new file mode 100644 index 
0000000000000000000000000000000000000000..2cc3b6a73c7957fd2ff286ed41ef199263108385 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/eval_npu.py @@ -0,0 +1,378 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import math +import os +import torch +import torch.nn as nn +import torch.optim as optim + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from apex import amp +import apex +from utils.exp_utils import get_logger + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', 
type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=100000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=10, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--pth', type=str, default='', + help='eval checkpoint') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 
0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. If supplied, this argument' + ' supersedes --static-loss-scale.') +parser.add_argument('--no_log', action='store_true', + help='do not log the eval result') +parser.add_argument('--split', default='valid', + choices=['all','valid','test']) + +args = parser.parse_args() +args.tied = not args.not_tied + +if args.d_embed < 0: + args.d_embed = args.d_model + +assert args.ext_len >= 0, 'extended context length must be non-negative' +assert args.batch_size % args.batch_chunk == 0 + +args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) +args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) + +# Get logger +logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) +logging = get_logger('log.txt', log_=not args.no_log) + +loc = "npu:0" +torch.npu.set_device(loc) + +############################################################################### +# Load data +############################################################################### +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) +args.n_token = ntokens + +va_iter = corpus.get_iterator('valid', args.batch_size, args.eval_tgt_len, + device=loc, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test.py', args.batch_size, args.eval_tgt_len, + device=loc, ext_len=args.ext_len) + +# adaptive softmax / embedding +cutoffs, tie_projs = [], [False] +if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + +############################################################################### +# Build the model +############################################################################### +def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + +def init_bias(bias): + nn.init.constant_(bias, 0.0) + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in 
range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + +def update_dropout(m): + classname = m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + +def update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + +model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) +model.apply(weights_init) +model.word_emb.apply(weights_init) +args.n_all_param = sum([p.nelement() for p in model.parameters()]) +args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + +model = model.to(loc) + +#### optimizer +if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) +elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) +elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + + +logging('=' * 100) +logging('#params = {}'.format(args.n_all_param)) +logging('#non emb params = {}'.format(args.n_nonemb_param)) + +# Load the best saved model. 
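+# The checkpoint given by --pth is deserialized directly onto the NPU device
+# selected above (map_location=loc) before its state_dict is loaded into the model.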
+with open(args.pth, 'rb') as f: + model.load_state_dict(torch.load(f, map_location=loc)) + +logging('Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}'.format( + args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len)) + +model.reset_length(args.tgt_len, args.ext_len, args.mem_len) +if args.clamp_len > 0: + model.clamp_len = args.clamp_len +if args.same_length: + model.same_length = True + +############################################################################### +# Evaluation code +############################################################################### + +def evaluate(eval_iter): + model.eval() + total_len, total_loss = 0, 0. + start_time = time.time() + with torch.no_grad(): + mems = tuple() + for idx, (data, target, seq_len) in enumerate(eval_iter): + ts = time.time() + ret = model(data,target,*mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.item() + total_len += seq_len + #print('eval_batch id: {} use time: {:.2f} ms '.format(idx, (time.time()-ts)*1000)) + total_time = time.time() - start_time + logging('Time : {:.2f}s, FPS: {:.2f} characters/s'.format( + total_time, total_len*args.batch_size*args.eval_tgt_len/total_time)) + return total_loss / total_len + + +# Run on test.py data. +if args.split == 'all': + test_loss = evaluate(te_iter) + valid_loss = evaluate(va_iter) +elif args.split == 'valid': + valid_loss = evaluate(va_iter) + test_loss = None +elif args.split == 'test': + test_loss = evaluate(te_iter) + valid_loss = None + +def format_log(loss, split): + if args.dataset in ['enwik8', 'text8']: + log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format( + split, loss, loss / math.log(2)) + else: + log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format( + split, loss, math.exp(loss)) + return log_str + +log_str = '' +if valid_loss is not None: + log_str += format_log(valid_loss, 'valid') +if test_loss is not None: + log_str += format_log(test_loss, 'test.py') + +logging('=' * 100) +logging(log_str) +logging('=' * 100) diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/getdata.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/getdata.sh new file mode 100644 index 0000000000000000000000000000000000000000..8043d1a1aa96cc492928aed06d34a0c4ed6f0f6c --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/getdata.sh @@ -0,0 +1,18 @@ +echo "=== Acquiring datasets ===" +echo "---" + +mkdir -p data +cd data + +echo "- Downloading enwik8 (Character)" +if [[ ! -d 'enwik8' ]]; then + mkdir -p enwik8 + cd enwik8 + wget --continue http://mattmahoney.net/dc/enwik8.zip + wget https://raw.githubusercontent.com/salesforce/awd-lstm-lm/master/data/enwik8/prep_enwik8.py + python3 prep_enwik8.py + cd .. 
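+    # prep_enwik8.py is expected to leave train.txt, valid.txt and test.txt under
+    # data/enwik8, which is the layout that Corpus in data_utils.py reads.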
+fi + +echo "---" +echo "Happy language modeling :)" diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/mem_transformer.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/mem_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..92a55122f9fec99dc3a32cd6c38fcc9ca008625a --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/mem_transformer.py @@ -0,0 +1,851 @@ +import sys +import math +import functools + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +sys.path.append('utils') +from utils.proj_adaptive_softmax import ProjectedAdaptiveLogSoftmax +from utils.log_uniform_sampler import LogUniformSampler, sample_logits + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + # output = self.layer_norm((inp + core_out).squeeze()) + # output = output.unsqueeze(1) + + return output + +class MultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + pre_lnorm=False): + super(MultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.q_net = nn.Linear(d_model, n_head * d_head, bias=False) + self.kv_net = nn.Linear(d_model, 2 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def forward(self, h, attn_mask=None, mems=None): + ##### multihead attention + # [hlen x bsz x n_head x d_head] + + if mems is not None: + c = torch.cat([mems, h], 0) + else: + c = h + + if self.pre_lnorm: + ##### layer normalization + c = self.layer_norm(c) + + head_q = self.q_net(h) + head_k, head_v = torch.chunk(self.kv_net(c), 2, -1) + + head_q = head_q.view(h.size(0), h.size(1), self.n_head, self.d_head) + head_k = head_k.view(c.size(0), c.size(1), self.n_head, self.d_head) + head_v = head_v.view(c.size(0), c.size(1), self.n_head, self.d_head) + + # [qlen x klen x bsz x n_head] + attn_score = torch.einsum('ibnd,jbnd->ijbn', (head_q, head_k)) + attn_score.mul_(self.scale) + if attn_mask is not None and 
attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # [qlen x klen x bsz x n_head] + [klen x bsz x n_head x d_head] -> [qlen x bsz x n_head x d_head] + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, head_v)) + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = h + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(h + attn_out) + + return output + +class RelMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False): + super(RelMultiHeadAttn, self).__init__() + + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + def _parallelogram_mask(self, h, w, left=False): + mask = torch.ones((h, w)).byte() + m = min(h, w) + mask[:m,:m] = torch.triu(mask[:m,:m]) + mask[-m:,-m:] = torch.tril(mask[-m:,-m:]) + + if left: + return mask + else: + return mask.flip(0) + + def _shift(self, x, qlen, klen, mask, left=False): + if qlen > 1: + zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)), + device=x.device, dtype=x.dtype) + else: + zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype) + + if left: + mask = mask.flip(1) + x_padded = torch.cat([zero_pad, x], dim=1).expand(qlen, -1, -1, -1) + else: + x_padded = torch.cat([x, zero_pad], dim=1).expand(qlen, -1, -1, -1) + + x = x_padded.masked_select(mask[:,:,None,None]) \ + .view(qlen, klen, x.size(2), x.size(3)) + + return x + + def _rel_shift(self, x, zero_triu=False): + zero_pad = torch.zeros((x.size(0), 1, *x.size()[2:]), + device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded = x_padded.view(x.size(1) + 1, x.size(0), *x.size()[2:]) + + x = x_padded[1:].view_as(x) + + if zero_triu: + ones = torch.ones((x.size(0), x.size(1))) + x = x * torch.tril(ones, x.size(1) - x.size(0))[:,:,None,None] + + return x + + def forward(self, w, r, attn_mask=None, mems=None): + + raise NotImplementedError + +class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelPartialLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + 
w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + + #### compute attention score + rw_head_q = w_head_q + r_w_bias # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + attn_mask_bool = attn_mask.bool() + if attn_mask is not None and attn_mask_bool.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None, :, :, None], -float('inf')).bool().type_as(attn_score) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:, :, :, None], -float('inf')).bool().type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class RelLearnableMultiHeadAttn(RelMultiHeadAttn): + def __init__(self, *args, **kwargs): + super(RelLearnableMultiHeadAttn, self).__init__(*args, **kwargs) + + def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None): + # r_emb: [klen, n_head, d_head], used for term B + # r_w_bias: [n_head, d_head], used for term C + # r_bias: [klen, n_head], used for term D + + qlen, bsz = w.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) + + if klen > r_emb.size(0): + r_emb_pad = r_emb[0:1].expand(klen-r_emb.size(0), -1, -1) + r_emb = torch.cat([r_emb_pad, r_emb], 0) + r_bias_pad = r_bias[0:1].expand(klen-r_bias.size(0), -1) + r_bias = torch.cat([r_bias_pad, r_bias], 0) + else: + r_emb = r_emb[-klen:] + r_bias = r_bias[-klen:] + + #### compute attention score + rw_head_q = w_head_q + r_w_bias[None] # qlen x bsz x n_head x d_head + + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + B_ 
= torch.einsum('ibnd,jnd->ijbn', (w_head_q, r_emb)) # qlen x klen x bsz x n_head + D_ = r_bias[None, :, None] # 1 x klen x 1 x n_head + BD = self._rel_shift(B_ + D_) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and attn_mask.any().item(): + if attn_mask.dim() == 2: + attn_score.masked_fill_(attn_mask[None,:,:,None], -float('inf')) + elif attn_mask.dim() == 3: + attn_score.masked_fill_(attn_mask[:,:,:,None], -float('inf')) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + output = w + attn_out + else: + ##### residual connection + layer normalization + output = self.layer_norm(w + attn_out) + + return output + +class DecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, **kwargs): + super(DecoderLayer, self).__init__() + + self.dec_attn = MultiHeadAttn(n_head, d_model, d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelLearnableMultiHeadAttn(n_head, d_model, d_head, dropout, + **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r_emb, r_w_bias, r_bias, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, r_emb, r_w_bias, r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm')) + + def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None): + output = self.dec_attn(dec_inp, r, r_w_bias, r_r_bias, + attn_mask=dec_attn_mask, + mems=mems) + output = self.pos_ff(output) + + return output + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + print("n_token:", n_token) + print("d_embed:", d_embed) + # self.emb_layers.append( + # nn.Embedding(n_token, 512, sparse=sample_softmax>0) + # ) + 
self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i))) + + def forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed = emb_flat.view(*inp.size(), self.d_proj) + + embed.mul_(self.emb_scale) + + return embed + +class MemTransformerLM(nn.Module): + def __init__(self, n_token, n_layer, n_head, d_model, d_head, d_inner, + dropout, dropatt, tie_weight=True, d_embed=None, + div_val=1, tie_projs=[False], pre_lnorm=False, + tgt_len=None, ext_len=None, mem_len=None, + cutoffs=[], adapt_inp=False, + same_length=False, attn_type=0, clamp_len=-1, + sample_softmax=-1): + super(MemTransformerLM, self).__init__() + self.n_token = n_token + + d_embed = d_model if d_embed is None else d_embed + self.d_embed = d_embed + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.word_emb = AdaptiveEmbedding(n_token, d_embed, d_model, cutoffs, + div_val=div_val) + self.drop = nn.Dropout(dropout) + + self.n_layer = n_layer + + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + self.max_klen = tgt_len + ext_len + mem_len + + self.attn_type = attn_type + + self.layers = nn.ModuleList() + if attn_type == 0: # the default attention + for i in range(n_layer): + self.layers.append( + RelPartialLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type == 1: # learnable embeddings + for i in range(n_layer): + self.layers.append( + RelLearnableDecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + elif attn_type in [2, 3]: # absolute embeddings + for i in range(n_layer): + self.layers.append( + DecoderLayer( + n_head, d_model, d_head, d_inner, dropout, + dropatt=dropatt, pre_lnorm=pre_lnorm) + ) + + self.sample_softmax = sample_softmax + # use sampled softmax + if sample_softmax > 0: + self.out_layer = nn.Linear(d_model, n_token) + if tie_weight: + self.out_layer.weight = self.word_emb.weight + self.tie_weight = tie_weight + self.sampler = LogUniformSampler(n_token, sample_softmax) + + # use adaptive softmax (including standard softmax) + else: + # dump_tensor(n_token, 'n_token.pt') + # dump_tensor(d_embed, 'd_embed.pt') + # dump_tensor(d_model, 'd_model.pt') + # dump_tensor(cutoffs, 'cutoffs.pt') + # dump_tensor(div_val, 'div_val.pt') + + self.crit = 
ProjectedAdaptiveLogSoftmax(n_token, d_embed, d_model, + cutoffs, div_val=div_val) + + if tie_weight: + for i in range(len(self.crit.out_layers)): + self.crit.out_layers[i].weight = self.word_emb.emb_layers[i].weight + + if tie_projs: + for i, tie_proj in enumerate(tie_projs): + if tie_proj and div_val == 1 and d_model != d_embed: + self.crit.out_projs[i] = self.word_emb.emb_projs[0] + elif tie_proj and div_val != 1: + self.crit.out_projs[i] = self.word_emb.emb_projs[i] + + self.same_length = same_length + self.clamp_len = clamp_len + + self._create_params() + + def backward_compatible(self): + self.sample_softmax = -1 + + def _create_params(self): + if self.attn_type == 0: # default attention + self.pos_emb = PositionalEmbedding(self.d_model) + self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head)) + elif self.attn_type == 1: # learnable + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.n_head, self.d_head)) + self.r_bias = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head)) + elif self.attn_type == 2: # absolute standard + self.pos_emb = PositionalEmbedding(self.d_model) + elif self.attn_type == 3: # absolute deeper SA + self.r_emb = nn.Parameter(torch.Tensor( + self.n_layer, self.max_klen, self.n_head, self.d_head)) + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def init_mems(self): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer+1): + empty = torch.empty(0, dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. 
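+        # Worked example of the caching window computed below (a sketch using the
+        # values from the unit test at the bottom of this file: tgt_len = mem_len = 36,
+        # ext_len = 0, so qlen = mlen = 36 once the memory is full):
+        #     end_idx = mlen + max(0, qlen - 0 - self.ext_len) = 36 + 36 = 72
+        #     beg_idx = max(0, end_idx - self.mem_len)         = 72 - 36 = 36
+        # Of the mlen + qlen = 72 concatenated hidden states, only the most recent
+        # mem_len = 36 are kept as the memory for the next forward pass.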
+ with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def _forward(self, dec_inp, mems=None): + qlen, bsz = dec_inp.size() + word_emb = self.word_emb(dec_inp.long()) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones(qlen, klen) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = (torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len)).byte()[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones(qlen, klen), diagonal=1+mlen).byte()[:,:,None] + + hids = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, pos_emb, self.r_w_bias, + self.r_r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + elif self.attn_type == 1: # learnable + core_out = self.drop(word_emb) + hids.append(core_out) + for i, layer in enumerate(self.layers): + if self.clamp_len > 0: + r_emb = self.r_emb[i][-self.clamp_len :] + r_bias = self.r_bias[i][-self.clamp_len :] + else: + r_emb, r_bias = self.r_emb[i], self.r_bias[i] + + mems_i = None if mems is None else mems[i] + core_out = layer(core_out, r_emb, self.r_w_bias[i], + r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i) + hids.append(core_out) + elif self.attn_type == 2: # absolute + pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb + pos_emb[-qlen:]) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and i == 0: + mems_i += pos_emb[:mlen] + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + elif self.attn_type == 3: + core_out = self.drop(word_emb) + + hids.append(core_out) + for i, layer in enumerate(self.layers): + mems_i = None if mems is None else mems[i] + if mems_i is not None and mlen > 0: + cur_emb = self.r_emb[i][:-qlen] + cur_size = cur_emb.size(0) + if cur_size < mlen: + cur_emb_pad = cur_emb[0:1].expand(mlen-cur_size, -1, -1) + cur_emb = torch.cat([cur_emb_pad, cur_emb], 0) + else: + cur_emb = cur_emb[-mlen:] + mems_i += cur_emb.view(mlen, 1, -1) + core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1) + + core_out = layer(core_out, dec_attn_mask=dec_attn_mask, + mems=mems_i) + hids.append(core_out) + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + return core_out, new_mems + + def forward(self, data, target, *mems): + # nn.DataParallel does not allow size(0) tensors to be broadcasted. + # So, have to initialize size(0) mems inside the model forward. + # Moreover, have to return new_mems to allow nn.DataParallel to piece + # them together. 
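+        # Minimal usage sketch of this interface, mirroring the __main__ test below
+        # and the train() loop in train_1p_npu.py: the first call passes no memories,
+        # and each later call feeds back the memories returned by the previous one.
+        #
+        #     mems = tuple()
+        #     for data, target, seq_len in train_iter:
+        #         ret = model(data, target, *mems)
+        #         loss, mems = ret[0], ret[1:]
+        #
+        # Returning new_mems together with the loss is what lets nn.DataParallel
+        # gather the per-replica memories described above.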
+ if not mems: mems = self.init_mems() + + tgt_len = target.size(0) + hidden, new_mems = self._forward(data, mems=mems) + + pred_hid = hidden[-tgt_len:] + if self.sample_softmax > 0 and self.training: + assert self.tie_weight + logit = sample_logits(self.word_emb, + self.out_layer.bias, target, pred_hid, self.sampler) + loss = -F.log_softmax(logit, -1)[:, :, 0] + else: + loss = self.crit(pred_hid.view(-1, pred_hid.size(-1)), target.view(-1)) + loss = loss.view(tgt_len, -1) + loss = loss.npu() + + if new_mems is None: + return [loss] + else: + return [loss] + new_mems + +def set_device(obj, device='cpu'): + if isinstance(obj, (tuple, list)): + dump = [] + for item in obj: + dump.append(set_device(item, device)) + return dump + elif isinstance(obj, dict): + dump = {} + for k, v in obj.items(): + dump[k] = set_device(v, device) + return dump + elif isinstance(obj, torch.Tensor): + return obj.to(device) + else: + return obj + + +def dump_tensor(output, name): + dump = set_device(output, 'cpu') + torch.save(dump, name) + print('%s dump success!' % (name)) + + +def load_tensor(name, device): + output = torch.load(name) + dump = set_device(output, device) + print('%s load success!' % (name)) + return dump + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='unit test') + + parser.add_argument('--n_layer', type=int, default=4, help='') + parser.add_argument('--n_rel_layer', type=int, default=4, help='') + parser.add_argument('--n_head', type=int, default=2, help='') + parser.add_argument('--d_head', type=int, default=2, help='') + parser.add_argument('--d_model', type=int, default=200, help='') + parser.add_argument('--d_embed', type=int, default=200, help='') + parser.add_argument('--d_inner', type=int, default=200, help='') + parser.add_argument('--dropout', type=float, default=0.0, help='') + parser.add_argument('--cuda', action='store_true', help='') + parser.add_argument('--seed', type=int, default=1111, help='') + parser.add_argument('--multi_gpu', action='store_true', help='') + + args = parser.parse_args() + + #device = torch.device("cuda" if args.cuda else "cpu") + device = torch.device("npu:0") + + B = 4 + tgt_len, mem_len, ext_len = 36, 36, 0 + data_len = tgt_len * 20 + args.n_token = 10000 + + import data_utils + + data = torch.LongTensor(data_len*B).random_(0, args.n_token).to(device) + diter = data_utils.LMOrderedIterator(data, B, tgt_len, device=device, ext_len=ext_len) + + cutoffs = [args.n_token // 2] + tie_projs = [False] + [True] * len(cutoffs) + + for div_val in [1, 2]: + for d_embed in [200, 100]: + model = MemTransformerLM(args.n_token, args.n_layer, args.n_head, + args.d_model, args.d_head, args.d_inner, args.dropout, + dropatt=args.dropout, tie_weight=True, + d_embed=d_embed, div_val=div_val, + tie_projs=tie_projs, pre_lnorm=True, + tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len, + cutoffs=cutoffs, attn_type=0).to(device) + + print(sum(p.numel() for p in model.parameters())) + + mems = tuple() + for idx, (inp, tgt, seqlen) in enumerate(diter): + print('batch {}'.format(idx)) + out = model(inp, tgt, *mems) + mems = out[1:] diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/modelzoo_level.txt b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/modelzoo_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f29898a44e5414055c4a4dbb4f0998260f9809 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/modelzoo_level.txt @@ -0,0 +1,3 @@ +FuncStatus:OK +PerfStatus:POK +PrecisionStatus:POK \ 
No newline at end of file diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/requirements.txt b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f4eaeb22a91cbb52bc225de73f755272ad3fe53 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/requirements.txt @@ -0,0 +1,5 @@ +torchvision +tqdm +numpy +itertools +argparse \ No newline at end of file diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/.keep b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/env_npu.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/env_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..280fca96da61c2d983f9a690fd0c044bbcba9167 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/env_npu.sh @@ -0,0 +1,71 @@ +#!/bin/bash +export install_path=/usr/local/Ascend + +if [ -d ${install_path}/toolkit ]; then + export LD_LIBRARY_PATH=/usr/include/hdf5/lib/:/usr/local/:/usr/local/lib/:/usr/lib/:${install_path}/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons:${path_lib}:${LD_LIBRARY_PATH} + export PATH=${install_path}/fwkacllib/ccec_compiler/bin:${install_path}/fwkacllib/bin:$PATH + export PYTHONPATH=${install_path}/fwkacllib/python/site-packages:${install_path}/tfplugin/python/site-packages:${install_path}/toolkit/python/site-packages:$PYTHONPATH + export PYTHONPATH=/usr/local/python3.7.5/lib/python3.7/site-packages:$PYTHONPATH + export ASCEND_OPP_PATH=${install_path}/opp +else + if [ -d ${install_path}/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:${install_path}/nnae/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/nnae/latest/fwkacllib/ccec_compiler/bin/:${install_path}/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/nnae/latest/fwkacllib/python/site-packages/:${install_path}/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/nnae/latest + else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/:${install_path}/driver/lib64/common/:${install_path}/driver/lib64/driver/:${install_path}/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:${install_path}/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:${install_path}/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=${install_path}/ascend-toolkit/latest/opp/ + export 
OPTION_EXEC_EXTERN_PLUGIN_PATH=${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:${install_path}/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:${install_path}/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=${install_path}/ascend-toolkit/latest + fi +fi + + +#将Host日志输出到串口,0-关闭/1-开启 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +#设置默认日志级别,0-debug/1-info/2-warning/3-error +export ASCEND_GLOBAL_LOG_LEVEL=3 +#设置Host侧Event日志开启标志,0-关闭/1-开启 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +#设置是否开启taskque,0-关闭/1-开启 +export TASK_QUEUE_ENABLE=1 +#设置是否开启PTCopy,0-关闭/1-开启 +export PTCOPY_ENABLE=1 +#设置是否开启combined标志,0-关闭/1-开启 +export COMBINED_ENABLE=1 +#设置特殊场景是否需要重新编译,不需要修改 +export DYNAMIC_OP="ADD#MUL" +#HCCL白名单开关,1-关闭/0-开启 +export HCCL_WHITELIST_DISABLE=1 +#设置Device侧日志等级为error +${install_path}/driver/tools/msnpureport -g error +#关闭Device侧Event日志 +${install_path}/driver/tools/msnpureport -e disable +export BMMV2_ENABLE=1 + +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) + +echo ${path_lib} + +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/usr/local/gcc7.3.0/lib64:${LD_LIBRARY_PATH} \ No newline at end of file diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_eval_1p.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_eval_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..9b97a2636da9102d40e867388137a1c15d22a4c0 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_eval_1p.sh @@ -0,0 +1,121 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" +# checkpoint文件路径,以实际路径为准 +pth_path="" +# 训练epoch +train_epochs=50 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +python3 -u eval_npu.py --split valid \ + --data=${data_path} \ + --pth=${pth_path} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/Eval_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Eval' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/Eval_${ASCEND_DEVICE_ID}.log|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + + +#最后一个迭代loss值,不需要修改 +#ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/Eval_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_full_8p.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_full_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..f77b282e9aa79914f9737735ce722d58f8710262 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_full_8p.sh @@ -0,0 +1,145 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=50 +# 学习率 +learning_rate=0.00025 +# 加载数据进程数 +workers=124 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` 
+cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +device_id_list=0,1,2,3,4,5,6,7 +export RANK_SIZE=8 +currentDir=$(cd "$(dirname "$0")";pwd) +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u $(dirname $currentDir)/train_8p_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --data=${data_path} \ + --multiprocessing-distributed \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend='hccl' \ + --world-size=1 \ + --device_num=8 \ + --max_step=400000 \ + --rank=0 \ + --device-list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +FPS=`grep -a 'fps' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $20}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'Eval' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep epoch ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_1p.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_1p.sh new file mode 100644 index 0000000000000000000000000000000000000000..18157a65d0efb238c9b696aab00bd74edf5bc672 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_1p.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=1 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=1 +# 指定训练所使用的npu device卡id +device_id=0 +# 加载数据进程数 +workers=128 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --device_id* ]];then + device_id=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi +# 校验是否指定了device_id,分动态分配device_id与手动指定device_id,此处不需要修改 +if [ $ASCEND_DEVICE_ID ];then + echo "device id is ${ASCEND_DEVICE_ID}" +elif [ ${device_id} ];then + export ASCEND_DEVICE_ID=${device_id} + echo "device id is ${ASCEND_DEVICE_ID}" +else + "[Error] device id must be config" + exit 1 +fi + + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ];then + test_path_dir=${cur_path} + cd .. 
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi +python3.7 -u ./train_1p_npu.py \ + --data=${data_path} \ + --seed=1111 \ + --workers=${workers} \ + --gpu=${ASCEND_DEVICE_ID} \ + --eval-interval=4000 \ + --log-interval=1 \ + --max_step=100 \ + --epochs=${train_epochs} \ + --static-loss-scale=128 \ + --batch_size=${batch_size} > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + + +##################获取训练数据################ +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +#FPS=`grep -a 'FPS' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F " " '{print $11}'|awk 'END {print}'` +#FPS=${FPS#* } + +grep "fps" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $20}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${ASCEND_DEVICE_ID}_fps.log +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log | awk '{a+=$1} END {if (NR != 0) printf("%.3f",a/NR)}'` + +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'bpc' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v test|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'|awk 'END {print}'` +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} + +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000}'` + +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log + + diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_8p.sh b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_8p.sh new file mode 100644 index 0000000000000000000000000000000000000000..b1840ab3cb057206e47ce43487720cf7311434e4 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/test/train_performance_8p.sh @@ -0,0 +1,150 @@ +#!/bin/bash + +################基础配置参数,需要模型审视修改################## +# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE +# 网络名称,同目录名称 +Network="Transformer_XL_for_PyTorch" +# 训练batch_size +batch_size=22 +# 训练使用的npu卡数 +export RANK_SIZE=8 +# 数据集路径,保持为空,不需要修改 +data_path="" + +# 训练epoch +train_epochs=50 +# 学习率 +learning_rate=0.00025 +# 加载数据进程数 +workers=124 + + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --workers* ]];then + workers=`echo ${para#*=}` + elif [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + fi +done + +# 校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_diename=${cur_path##*/} +if [ x"${cur_path_last_diename}" == x"test" ];then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi + + +#################创建日志输出目录,不需要修改################# +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/${ASCEND_DEVICE_ID} ];then + rm -rf ${test_path_dir}/output/${ASCEND_DEVICE_ID} + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#################启动训练脚本################# +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +device_id_list=0,1,2,3,4,5,6,7 +export RANK_SIZE=8 +currentDir=$(cd "$(dirname "$0")";pwd) +KERNEL_NUM=$(($(nproc)/8)) +for i in $(seq 0 7) +do + PID_START=$((KERNEL_NUM * i)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + taskset -c $PID_START-$PID_END python3.7 -u $(dirname $currentDir)/train_8p_npu.py \ + --addr=$(hostname -I |awk '{print $1}') \ + --workers=$(nproc) \ + --data=${data_path} \ + --multiprocessing-distributed \ + --dist-url='tcp://127.0.0.1:50000' \ + --dist-backend='hccl' \ + --world-size=1 \ + --device_num=8 \ + --log-interval=1 \ + --eval-interval=4000 \ + --max_step=100 \ + --rank=0 \ + --device-list=${device_id_list} \ + --local_rank=$i > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & +done + + +wait + + + +##################获取训练数据################ +#训练结束时间,不需要修改 + +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +grep "fps" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log | awk '{print $20}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log +FPS=`cat ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}_fps.log|awk '{a+=$1} END {if (NR !=0) printf("%.3f", a/NR)}'` +#打印,不需要修改 +echo "Final Performance characters/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep -a 'bpc' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|grep -v test|awk 'END 
{print}'|awk -F "|" '{print $NF}'|awk -F " " '{print $NF}'` + +#打印,不需要修改 +echo "Final Train bpc : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +BatchSize=${batch_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +##获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} +#单迭代训练时长 +TrainingTime=`awk 'BEGIN{printf "%.2f\n", '${batch_size}'*1000/'${FPS}'}'` + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "epoch" ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_$ASCEND_DEVICE_ID.log|grep -v test|awk -F "|" '{print $6}' | awk -F " " '{print $NF}' > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_1p_npu.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_1p_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..8f213a8237389c9d4942c7bc2ce1d172923850cb --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_1p_npu.py @@ -0,0 +1,542 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse +import time +import math +import os +import itertools + +import torch +import torch.nn as nn +import torch.optim as optim + +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from utils.data_parallel import BalancedDataParallel +from apex import amp +import apex + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', default='adam', type=str, + choices=['adam', 'sgd', 'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--epochs', type=int, default=50, + help='train epochs') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=100000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=22, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, 
default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +parser.add_argument('--workers', type=int, default=64, + help='workers num') +parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=4000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. 
If supplied, this argument' + ' supersedes --static-loss-scale.') +args = parser.parse_args() +args.tied = not args.not_tied + +if args.d_embed < 0: + args.d_embed = args.d_model + +assert args.ext_len >= 0, 'extended context length must be non-negative' +assert args.batch_size % args.batch_chunk == 0 + +args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) +args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) +logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) + +device = torch.device('npu:0') + +############################################################################### +# Load data +############################################################################### +corpus = get_lm_corpus(args.data, args.dataset) +ntokens = len(corpus.vocab) +args.n_token = ntokens + +eval_batch_size = 10 +tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len, + device=device, ext_len=args.ext_len) +va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) +te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len, + device=device, ext_len=args.ext_len) + +# adaptive softmax / embedding +cutoffs, tie_projs = [], [False] +if args.adaptive: + assert args.dataset in ['wt103', 'lm1b'] + if args.dataset == 'wt103': + cutoffs = [20000, 40000, 200000] + tie_projs += [True] * len(cutoffs) + elif args.dataset == 'lm1b': + cutoffs = [60000, 100000, 640000] + tie_projs += [False] * len(cutoffs) + +############################################################################### +# Build the model +############################################################################### +def init_weight(weight): + if args.init == 'uniform': + nn.init.uniform_(weight, -args.init_range, args.init_range) + elif args.init == 'normal': + nn.init.normal_(weight, 0.0, args.init_std) + +def init_bias(bias): + nn.init.constant_(bias, 0.0) + +def weights_init(m): + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, args.init_std) + if hasattr(m, 'bias') and m.bias is not None: + init_bias(m.bias) + elif classname.find('TransformerLM') != -1: + if hasattr(m, 'r_emb'): + init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + init_bias(m.r_bias) + +def update_dropout(m): + classname = m.__class__.__name__ + if classname.find('Dropout') != -1: + if hasattr(m, 'p'): + m.p = args.dropout + +def 
update_dropatt(m): + if hasattr(m, 'dropatt'): + m.dropatt.p = args.dropatt + +if args.restart: + with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f: + model = torch.load(f) + model.apply(update_dropout) + model.apply(update_dropatt) +else: + model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing +args.n_all_param = sum([p.nelement() for p in model.parameters()]) +args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + + +if args.multi_gpu: + model = model.to(device) + if args.gpu0_bsz >= 0: + para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, + model, dim=1).to(device) + else: + para_model = nn.DataParallel(model, dim=1).to(device) +else: + para_model = model.to(device) + +#### optimizer +if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) +elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) +elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + +################################################################################################### +opt_level = "O2" +model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, loss_scale=128.0, combine_grad=True) +################################################################################################### + + +#### scheduler +if args.scheduler == 'cosine': + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, + args.max_step, eta_min=args.eta_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, + args.max_step, eta_min=args.eta_min) +elif args.scheduler == 'inv_sqrt': + + def lr_lambda(step): + if step == 0 and args.warmup_step == 0: + return 1. + else: + return 1. 
/ (step ** 0.5) if step > args.warmup_step \ + else step / (args.warmup_step ** 1.5) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) + +elif args.scheduler == 'dev_perf': + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) +elif args.scheduler == 'constant': + pass + + +if args.restart: + if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): + with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: + opt_state_dict = torch.load(f) + optimizer.load_state_dict(opt_state_dict) + else: + print('Optimizer was not saved. Start from scratch.') + +logging('=' * 100) +for k, v in args.__dict__.items(): + logging(' - {} : {}'.format(k, v)) +logging('=' * 100) +logging('#params = {}'.format(args.n_all_param)) +logging('#non emb params = {}'.format(args.n_nonemb_param)) + +############################################################################### +# Training code +############################################################################### + +def evaluate(eval_iter): + model.eval() + if args.mem_len == 0: + model.reset_length(args.eval_tgt_len, + args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) + else: + model.reset_length(args.eval_tgt_len, + args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) + + # Evaluation + total_len, total_loss = 0, 0. + with torch.no_grad(): + mems = tuple() + for i, (data, target, seq_len) in enumerate(eval_iter): + if args.max_eval_steps > 0 and i >= args.max_eval_steps: + break + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.float().item() + total_len += seq_len + + model.reset_length(args.tgt_len, args.ext_len, args.mem_len) + model.train() + + return total_loss / total_len + + +def train(): + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + model.train() + if args.batch_chunk > 1: + mems = [tuple() for _ in range(args.batch_chunk)] + else: + mems = tuple() + train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter + for batch, (data, target, seq_len) in enumerate(train_iter): + model.zero_grad() + if args.batch_chunk > 1: + data_chunks = torch.chunk(data, args.batch_chunk, 1) + target_chunks = torch.chunk(target, args.batch_chunk, 1) + for i in range(args.batch_chunk): + data_i = data_chunks[i].contiguous() + target_i = target_chunks[i].contiguous() + ret = para_model(data_i, target_i, *mems[i]) + loss, mems[i] = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) / args.batch_chunk + #################################################################### + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + #################################################################### + with torch.no_grad(): + train_loss += loss.float().bool().item() + else: + ret = para_model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.float().mean().type_as(loss) + #################################################### + with torch.no_grad(): + train_loss += loss.float().item() + ################################################################### + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + + + optimizer.step() + if args.sample_softmax > 0: + optimizer_sparse.step() + + # 
step-wise learning rate annealing + train_step += 1 + if args.scheduler in ['cosine', 'constant', 'dev_perf']: + # linear warmup stage + if train_step < args.warmup_step: + curr_lr = args.lr * train_step / args.warmup_step + optimizer.param_groups[0]['lr'] = curr_lr + if args.sample_softmax > 0: + optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2 + else: + if args.scheduler == 'cosine': + scheduler.step(train_step) + if args.sample_softmax > 0: + scheduler_sparse.step(train_step) + elif args.scheduler == 'inv_sqrt': + scheduler.step(train_step) + + if train_step % args.log_interval == 0: + cur_loss = train_loss / args.log_interval + elapsed = time.time() - log_start_time + log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \ + '| ms/batch {:5.2f} | loss {:5.2f} | fps {:.2f}'.format( + epoch, train_step, batch+1, optimizer.param_groups[0]['lr'], + elapsed * 1000 / args.log_interval, cur_loss, args.log_interval*args.batch_size*args.tgt_len/elapsed) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2)) + else: + log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss)) + logging(log_str) + train_loss = 0 + log_start_time = time.time() + + if train_step % args.eval_interval == 0: + ts = time.time() + val_loss = evaluate(va_iter) + print('evaluation use time {} s'.format(time.time()-ts)) + logging('-' * 100) + log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \ + '| valid loss {:5.2f}'.format( + train_step // args.eval_interval, train_step, + (time.time() - eval_start_time), val_loss) + if args.dataset in ['enwik8', 'text8']: + log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2)) + else: + log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss)) + logging(log_str) + logging('-' * 100) + # Save the model if the validation loss is the best we've seen so far. + if not best_val_loss or val_loss < best_val_loss: + if not args.debug: + with open('model.pt', 'wb') as f: + torch.save(model.state_dict(), f) + with open('optimizer.pt', 'wb') as f: + torch.save(optimizer.state_dict(), f) + best_val_loss = val_loss + + # dev-performance based learning rate annealing + if args.scheduler == 'dev_perf': + scheduler.step(val_loss) + if args.sample_softmax > 0: + scheduler_sparse.step(val_loss) + + eval_start_time = time.time() + + if train_step == args.max_step: + break + +# Loop over epochs. +train_step = 0 +train_loss = 0 +best_val_loss = None + +log_start_time = time.time() +eval_start_time = time.time() + +# At any point you can hit Ctrl + C to break out of training early. +try: + for epoch in itertools.count(start=1): + train() + if train_step == args.max_step: + logging('-' * 100) + logging('End of training') + break +except KeyboardInterrupt: + logging('-' * 100) + logging('Exiting from training early') + +## Load the best saved model. +#with open('model.pt', 'rb') as f: +# model.load_state_dict(torch.load(f, map_location=device)) +#para_model = model.to(device) + +## Run on test data. 
+#test_loss = evaluate(te_iter) +#logging('=' * 100) +#if args.dataset in ['enwik8', 'text8']: +# logging('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( +# test_loss, test_loss / math.log(2))) +#else: +# logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( +# test_loss, math.exp(test_loss))) +#logging('=' * 100) diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_8p_npu.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_8p_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..fb782e84f1777212f930e0741f409b6e376da96b --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/train_8p_npu.py @@ -0,0 +1,644 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import time +import math +import os, sys +import itertools +import numpy as np + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.multiprocessing as mp +from data_utils import get_lm_corpus +from mem_transformer import MemTransformerLM +from utils.exp_utils import create_exp_dir +from utils.data_parallel import BalancedDataParallel +from apex import amp +import torch.distributed as dist +import apex +import warnings + + +parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model') +parser.add_argument('--data', type=str, default='../data/enwik8', + help='location of the data corpus') +parser.add_argument('--dataset', type=str, default='enwik8', + choices=['wt103', 'lm1b', 'enwik8', 'text8'], + help='dataset name') +parser.add_argument('--n_layer', type=int, default=12, + help='number of total layers') +parser.add_argument('--n_head', type=int, default=8, + help='number of heads') +parser.add_argument('--d_head', type=int, default=64, + help='head dimension') +parser.add_argument('--d_embed', type=int, default=-1, + help='embedding dimension') +parser.add_argument('--d_model', type=int, default=512, + help='model dimension') +parser.add_argument('--d_inner', type=int, default=2048, + help='inner dimension in FF') +parser.add_argument('--dropout', type=float, default=0.1, + help='global dropout rate') +parser.add_argument('--dropatt', type=float, default=0.0, + help='attention probability dropout rate') +parser.add_argument('--init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--emb_init', default='normal', type=str, + help='parameter initializer to use.') +parser.add_argument('--init_range', type=float, default=0.1, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--emb_init_range', type=float, default=0.01, + help='parameters initialized by U(-init_range, init_range)') +parser.add_argument('--init_std', type=float, default=0.02, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--proj_init_std', type=float, default=0.01, + help='parameters initialized by N(0, init_std)') +parser.add_argument('--optim', 
default='adam', type=str, + choices=['adam', 'sgd', 'adagrad'], + help='optimizer to use.') +parser.add_argument('--lr', type=float, default=0.00025, + help='initial learning rate (0.00025|5 for adam|sgd)') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--scheduler', default='cosine', type=str, + choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'], + help='lr scheduler to use.') +parser.add_argument('--warmup_step', type=int, default=0, + help='upper epoch limit') +parser.add_argument('--decay_rate', type=float, default=0.5, + help='decay factor when ReduceLROnPlateau is used') +parser.add_argument('--lr_min', type=float, default=0.0, + help='minimum learning rate during annealing') +parser.add_argument('--clip', type=float, default=0.25, + help='gradient clipping') +parser.add_argument('--clip_nonemb', action='store_true', + help='only clip the gradient of non-embedding params') +parser.add_argument('--max_step', type=int, default=1000, + help='upper epoch limit') +parser.add_argument('--batch_size', type=int, default=22, + help='batch size') +parser.add_argument('--batch_chunk', type=int, default=1, + help='split batch into chunks to save memory') +parser.add_argument('--tgt_len', type=int, default=512, + help='number of tokens to predict') +parser.add_argument('--eval_tgt_len', type=int, default=128, + help='number of tokens to predict for evaluation') +parser.add_argument('--ext_len', type=int, default=0, + help='length of the extended context') +parser.add_argument('--mem_len', type=int, default=512, + help='length of the retained previous heads') +parser.add_argument('--not_tied', action='store_true', + help='do not tie the word embedding and softmax weights') +parser.add_argument('--seed', type=int, default=1111, + help='random seed') +# parser.add_argument('--npu', default=True, help='use NPU') +parser.add_argument('--adaptive', action='store_true', + help='use adaptive softmax') +parser.add_argument('--div_val', type=int, default=1, + help='divident value for adapative input and softmax') +parser.add_argument('--pre_lnorm', action='store_true', + help='apply LayerNorm to the input instead of the output') +parser.add_argument('--varlen', action='store_true', + help='use variable length') +parser.add_argument('--multi_gpu', action='store_true', + help='use multiple GPU') +parser.add_argument('--log-interval', type=int, default=200, + help='report interval') +parser.add_argument('--eval-interval', type=int, default=1000, + help='evaluation interval') +parser.add_argument('--work_dir', default='LM-TFM', type=str, + help='experiment directory.') +parser.add_argument('--restart', action='store_true', + help='restart training from the saved checkpoint') +parser.add_argument('--restart_dir', type=str, default='', + help='restart dir') +parser.add_argument('--debug', action='store_true', + help='run in debug mode (do not create exp dir)') +parser.add_argument('--same_length', action='store_true', + help='use the same attn length for all tokens') +parser.add_argument('--attn_type', type=int, default=0, + help='attention type. 
0 for ours, 1 for Shaw et al,' + '2 for Vaswani et al, 3 for Al Rfou et al.') +parser.add_argument('--clamp_len', type=int, default=-1, + help='use the same pos embeddings after clamp_len') +parser.add_argument('--eta_min', type=float, default=0.0, + help='min learning rate for cosine scheduler') +parser.add_argument('--gpu0_bsz', type=int, default=-1, + help='batch size on gpu 0') +parser.add_argument('--max_eval_steps', type=int, default=-1, + help='max eval steps') +parser.add_argument('--sample_softmax', type=int, default=-1, + help='number of samples in sampled softmax') +parser.add_argument('--patience', type=int, default=0, + help='patience') +parser.add_argument('--finetune_v2', action='store_true', + help='finetune v2') +parser.add_argument('--finetune_v3', action='store_true', + help='finetune v3') +parser.add_argument('--static-loss-scale', type=float, default=128.0, + help='Static loss scale, positive power of 2 values can ' + 'improve fp16 convergence.') +parser.add_argument('--dynamic-loss-scale', action='store_true', + help='Use dynamic loss scaling. If supplied, this argument' + ' supersedes --static-loss-scale.') +#edit this for 8p +parser.add_argument('--dist-backend', type=str, default='hccl') +parser.add_argument('--world-size', type=int, default=-1) +parser.add_argument('--rank', type=int, default=-1) +parser.add_argument('--local_rank', type=int, default=0) +parser.add_argument('--addr', type=str, default='127.0.0.1') +parser.add_argument('--device_num', type=int, default=-1) +parser.add_argument('--workers', type=int, default=32) +parser.add_argument('--device-list', default='', type=str) +parser.add_argument('--dist-url', type=str, default='tcp://127.0.0.1:50000') +parser.add_argument('--device', type=str, default='npu') +parser.add_argument('--gpu', default=None, type=int, help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +warnings.filterwarnings('ignore') +#############end################# + +def main(): + args = parser.parse_args() + args.tied = not args.not_tied + torch.manual_seed(args.seed) + + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + ############################## + # edit this for 8p + os.environ['MASTER_ADDR'] = args.addr + os.environ['MASTER_PORT'] = '29888' + os.environ['LOCAL_DEVICE_ID'] = str(0) + print("+++++++++++++++++++++++++++LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + if args.device_list != '': + ngpus_per_node = len(args.device_list.split(',')) + elif args.device_num != -1: + ngpus_per_node = args.device_num + elif args.device == 'npu': + ngpus_per_node = int(os.environ["RANK_SIZE"]) + else: + ngpus_per_node = torch.cuda.device_count() + if args.multiprocessing_distributed: + args.world_size = ngpus_per_node * args.world_size + if args.device == 'npu': + main_worker(args.local_rank, ngpus_per_node,args) + else: + main_worker(args.gpu, ngpus_per_node, args) + ############################## + + +def main_worker(gpu, ngpus_per_node, args): + + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + if args.d_embed < 0: + args.d_embed = args.d_model + + assert args.ext_len >= 0, 'extended context length must be non-negative' + assert args.batch_size % args.batch_chunk == 0 + + args.work_dir = '{}-{}'.format(args.work_dir, args.dataset) + args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S')) + logging = create_exp_dir(args.work_dir, + scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug) + + if args.device_list != '': + args.gpu = int(args.device_list.split(',')[gpu]) + else: + args.gpu = gpu + + print("[npu id:", args.gpu, "]", "++++++++++++++++ before set LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + os.environ['LOCAL_DEVICE_ID'] = str(args.gpu) + print("[npu id:", args.gpu, "]", "++++++++++++++++ LOCAL_DEVICE_ID:", os.environ['LOCAL_DEVICE_ID']) + + if args.gpu is not None: + print("[npu id:", args.gpu, "]", "Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + args.rank = args.rank * ngpus_per_node + gpu + + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, + world_size=args.world_size, rank=args.rank) + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(loc) + + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + print("[npu id:", args.gpu, "]", args) + print("[npu id:", args.gpu, "]", "===============main_worker()=================") + + + ############################################################################### + # Load data + ############################################################################### + corpus = get_lm_corpus(args.data, args.dataset) + ntokens = len(corpus.vocab) + args.n_token = ntokens + + eval_batch_size = 10 + tr_iter = 
corpus.get_iterator('train', args.batch_size, args.tgt_len,
+                                  device=loc, ext_len=args.ext_len)
+    va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len,
+                                  device=loc, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len,
+                                  device=loc, ext_len=args.ext_len)
+
+    # adaptive softmax / embedding
+    cutoffs, tie_projs = [], [False]
+    if args.adaptive:
+        assert args.dataset in ['wt103', 'lm1b']
+        if args.dataset == 'wt103':
+            cutoffs = [20000, 40000, 200000]
+            tie_projs += [True] * len(cutoffs)
+        elif args.dataset == 'lm1b':
+            cutoffs = [60000, 100000, 640000]
+            tie_projs += [False] * len(cutoffs)
+
+    ###############################################################################
+    # Build the model
+    ###############################################################################
+    def init_weight(weight):
+        if args.init == 'uniform':
+            nn.init.uniform_(weight, -args.init_range, args.init_range)
+        elif args.init == 'normal':
+            nn.init.normal_(weight, 0.0, args.init_std)
+
+    def init_bias(bias):
+        nn.init.constant_(bias, 0.0)
+
+    def weights_init(m):
+        classname = m.__class__.__name__
+        if classname.find('Linear') != -1:
+            if hasattr(m, 'weight') and m.weight is not None:
+                init_weight(m.weight)
+            if hasattr(m, 'bias') and m.bias is not None:
+                init_bias(m.bias)
+        elif classname.find('AdaptiveEmbedding') != -1:
+            if hasattr(m, 'emb_projs'):
+                for i in range(len(m.emb_projs)):
+                    if m.emb_projs[i] is not None:
+                        nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std)
+        elif classname.find('Embedding') != -1:
+            if hasattr(m, 'weight'):
+                init_weight(m.weight)
+        elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
+            if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
+                init_weight(m.cluster_weight)
+            if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
+                init_bias(m.cluster_bias)
+            if hasattr(m, 'out_projs'):
+                for i in range(len(m.out_projs)):
+                    if m.out_projs[i] is not None:
+                        nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std)
+        elif classname.find('LayerNorm') != -1:
+            if hasattr(m, 'weight'):
+                nn.init.normal_(m.weight, 1.0, args.init_std)
+            if hasattr(m, 'bias') and m.bias is not None:
+                init_bias(m.bias)
+        elif classname.find('TransformerLM') != -1:
+            if hasattr(m, 'r_emb'):
+                init_weight(m.r_emb)
+            if hasattr(m, 'r_w_bias'):
+                init_weight(m.r_w_bias)
+            if hasattr(m, 'r_r_bias'):
+                init_weight(m.r_r_bias)
+            if hasattr(m, 'r_bias'):
+                init_bias(m.r_bias)
+
+    def update_dropout(m):
+        classname = m.__class__.__name__
+        if classname.find('Dropout') != -1:
+            if hasattr(m, 'p'):
+                m.p = args.dropout
+
+    def update_dropatt(m):
+        if hasattr(m, 'dropatt'):
+            m.dropatt.p = args.dropatt
+
+    if args.restart:
+        with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f:
+            model = MemTransformerLM(ntokens, args.n_layer, args.n_head, args.d_model,
+                args.d_head, args.d_inner, args.dropout, args.dropatt,
+                tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val,
+                tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len,
+                ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs,
+                same_length=args.same_length, attn_type=args.attn_type,
+                clamp_len=args.clamp_len, sample_softmax=args.sample_softmax)
+            model.apply(weights_init)
+            model.word_emb.apply(weights_init)
+            model = model.to(loc)
+            ckpt = torch.load(f, map_location=loc)
+            model.load_state_dict(ckpt)
+            model.apply(update_dropout)
+            model.apply(update_dropatt)
+    else:
+        model = MemTransformerLM(ntokens, args.n_layer,
args.n_head, args.d_model, + args.d_head, args.d_inner, args.dropout, args.dropatt, + tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val, + tie_projs=tie_projs, pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len, + ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs, + same_length=args.same_length, attn_type=args.attn_type, + clamp_len=args.clamp_len, sample_softmax=args.sample_softmax) + model.apply(weights_init) + model.word_emb.apply(weights_init) # ensure embedding init is not overridden by out_layer in case of weight sharing + + + + args.n_all_param = sum([p.nelement() for p in model.parameters()]) + args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()]) + + + + #### optimizer + if args.optim.lower() == 'sgd': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2) + optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom) + else: + optimizer = optim.SGD(model.parameters(), lr=args.lr, + momentum=args.mom) + elif args.optim.lower() == 'adam': + if args.sample_softmax > 0: + dense_params, sparse_params = [], [] + for param in model.parameters(): + if param.size() == model.word_emb.weight.size(): + sparse_params.append(param) + else: + dense_params.append(param) + optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr) + optimizer = optim.Adam(dense_params, lr=args.lr) + else: + #optimizer = optim.Adam(model.parameters(), lr=args.lr) + optimizer = apex.optimizers.NpuFusedAdam(model.parameters(), lr=args.lr) + elif args.optim.lower() == 'adagrad': + optimizer = optim.Adagrad(model.parameters(), lr=args.lr) + + model = model.to(loc) + ################################################################################################### + opt_level = "O2" + model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level, loss_scale=128.0, combine_grad=True) + ################################################################################################### + + if args.multi_gpu: + + if args.gpu0_bsz >= 0: + para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk, + model, dim=1).to(loc) + else: + para_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) + else: + para_model = model.to(loc) + + #### scheduler + if args.scheduler == 'cosine': + scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, + args.max_step, eta_min=args.eta_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse, + args.max_step, eta_min=args.eta_min) + elif args.scheduler == 'inv_sqrt': + def lr_lambda(step): + if step == 0 and args.warmup_step == 0: + return 1. + else: + return 1. 
/ (step ** 0.5) if step > args.warmup_step \ + else step / (args.warmup_step ** 1.5) + scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda) + elif args.scheduler == 'dev_perf': + scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + if args.sample_softmax > 0: + scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse, + factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min) + elif args.scheduler == 'constant': + pass + + + if args.restart: + if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')): + with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f: + opt_state_dict = torch.load(f, map_location=loc) + optimizer.load_state_dict(opt_state_dict) + else: + print('Optimizer was not saved. Start from scratch.') + + logging('=' * 100) + for k, v in args.__dict__.items(): + logging(' - {} : {}'.format(k, v)) + logging('=' * 100) + logging('#params = {}'.format(args.n_all_param)) + logging('#non emb params = {}'.format(args.n_nonemb_param)) + + ############################################################################### + # Training code + ############################################################################### + + def evaluate(eval_iter): + model.eval() + if args.mem_len == 0: + model.reset_length(args.eval_tgt_len, + args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len) + else: + model.reset_length(args.eval_tgt_len, + args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len) + + # Evaluation + total_len, total_loss = 0, 0. + with torch.no_grad(): + mems = tuple() + for i, (data, target, seq_len) in enumerate(eval_iter): + if args.max_eval_steps > 0 and i >= args.max_eval_steps: + break + ret = model(data, target, *mems) + loss, mems = ret[0], ret[1:] + loss = loss.mean() + total_loss += seq_len * loss.float().item() + total_len += seq_len + + model.reset_length(args.tgt_len, args.ext_len, args.mem_len) + model.train() + return total_loss / total_len + + + def train(): + # Turn on training mode which enables dropout. 
+        global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
+
+        model.train()
+        if args.batch_chunk > 1:
+            mems = [tuple() for _ in range(args.batch_chunk)]
+        else:
+            mems = tuple()
+        train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter
+        for batch, (data, target, seq_len) in enumerate(train_iter):
+            model.zero_grad()
+            if args.batch_chunk > 1:
+                data_chunks = torch.chunk(data, args.batch_chunk, 1)
+                target_chunks = torch.chunk(target, args.batch_chunk, 1)
+                for i in range(args.batch_chunk):
+                    data_i = data_chunks[i].contiguous()
+                    target_i = target_chunks[i].contiguous()
+                    ret = para_model(data_i, target_i, *mems[i])
+                    loss, mems[i] = ret[0], ret[1:]
+                    loss = loss.float().mean().type_as(loss) / args.batch_chunk
+                    ####################################################################
+                    with amp.scale_loss(loss, optimizer) as scaled_loss:
+                        scaled_loss.backward()
+                    ####################################################################
+                    with torch.no_grad():
+                        train_loss += loss.float().item()
+            else:
+                ret = para_model(data, target, *mems)
+                loss, mems = ret[0], ret[1:]
+                loss = loss.float().mean().type_as(loss)
+                ####################################################
+                with torch.no_grad():
+                    train_loss += loss.float().item()
+                ###################################################################
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+
+
+            optimizer.step()
+            if args.sample_softmax > 0:
+                optimizer_sparse.step()
+
+            # step-wise learning rate annealing
+            train_step += 1
+            if args.scheduler in ['cosine', 'constant', 'dev_perf']:
+                # linear warmup stage
+                if train_step < args.warmup_step:
+                    curr_lr = args.lr * train_step / args.warmup_step
+                    optimizer.param_groups[0]['lr'] = curr_lr
+                    if args.sample_softmax > 0:
+                        optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
+                else:
+                    if args.scheduler == 'cosine':
+                        scheduler.step(train_step)
+                        if args.sample_softmax > 0:
+                            scheduler_sparse.step(train_step)
+            elif args.scheduler == 'inv_sqrt':
+                scheduler.step(train_step)
+
+            if train_step % args.log_interval == 0:
+                cur_loss = train_loss / args.log_interval
+                elapsed = time.time() - log_start_time
+                log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
+                          '| ms/batch {:5.2f} | loss {:5.2f} | fps {:.2f}'.format(
+                    epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
+                    elapsed * 1000 / args.log_interval, cur_loss, args.log_interval*args.batch_size*args.tgt_len*8/elapsed)
+                if args.dataset in ['enwik8', 'text8']:
+                    log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2))
+                else:
+                    log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
+                logging(log_str)
+                train_loss = 0
+                log_start_time = time.time()
+
+            if train_step % args.eval_interval == 0:
+                ts = time.time()
+                val_loss = evaluate(va_iter)
+                print('evaluation use time {} s'.format(time.time()-ts))
+                logging('-' * 100)
+                log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
+                          '| valid loss {:5.2f}'.format(
+                    train_step // args.eval_interval, train_step,
+                    (time.time() - ts), val_loss)
+                if args.dataset in ['enwik8', 'text8']:
+                    log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2))
+                else:
+                    log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
+                logging(log_str)
+                logging('-' * 100)
+                # Save the model if the validation loss is the best we've seen so far.
+ if not best_val_loss or val_loss < best_val_loss: + if not args.debug: + with open('model.pt', 'wb') as f: + torch.save(model.state_dict(), f) + with open('optimizer.pt', 'wb') as f: + torch.save(optimizer.state_dict(), f) + best_val_loss = val_loss + + # dev-performance based learning rate annealing + if args.scheduler == 'dev_perf': + scheduler.step(val_loss) + if args.sample_softmax > 0: + scheduler_sparse.step(val_loss) + + eval_start_time = time.time() + + if train_step == args.max_step: + sys.exit() + + # At any point you can hit Ctrl + C to break out of training early. + try: + for epoch in itertools.count(start=1): + train() + if train_step == args.max_step: + logging('-' * 100) + logging('End of training') + sys.exit() + except KeyboardInterrupt: + logging('-' * 100) + logging('Exiting from training early') + + # # Load the best saved model. + # with open('model.pt', 'rb') as f: + # model.load_state_dict(torch.load(f, map_location=loc)) + # para_model = model.to(loc) + + # # Run on test data. + # test_loss = evaluate(te_iter) + # logging('=' * 100) + # if args.dataset in ['enwik8', 'text8']: + # logging('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format( + # test_loss, test_loss / math.log(2))) + # else: + # logging('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format( + # test_loss, math.exp(test_loss))) + # logging('=' * 100) + + +if __name__ == '__main__': + global train_step, train_loss, best_val_loss, eval_start_time, log_start_time + train_step = 0 + train_loss = 0 + best_val_loss = None + log_start_time = time.time() + eval_start_time = time.time() + main() diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/.keep b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/adaptive_softmax.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/adaptive_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..3c54a69204525d62466eb58245af2a3165798bed --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/adaptive_softmax.py @@ -0,0 +1,102 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
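+# Adaptive softmax (Grave et al., "Efficient softmax approximation for GPUs"):
+# the vocabulary is split into a frequent-token shortlist plus tail clusters, so
+# the full output distribution is computed over the head only, and tail tokens
+# are scored within their own, much smaller, cluster.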
+import torch +import torch.nn as nn +import torch.nn.functional as F + +class AdaptiveLogSoftmax(nn.Module): + def __init__(self, in_features, n_classes, cutoffs, keep_order=False): + super(AdaptiveLogSoftmax, self).__init__() + + cutoffs = list(cutoffs) + + if (cutoffs != sorted(cutoffs)) \ + or (min(cutoffs) <= 0) \ + or (max(cutoffs) >= (n_classes - 1)) \ + or (len(set(cutoffs)) != len(cutoffs)) \ + or any([int(c) != c for c in cutoffs]): + + raise ValueError("cutoffs should be a sequence of unique, positive " + "integers sorted in an increasing order, where " + "each value is between 1 and n_classes-1") + + self.in_features = in_features + self.n_classes = n_classes + self.cutoffs = cutoffs + [n_classes] + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.in_features)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.keep_order = keep_order + + + def forward(self, hidden, target, weight, bias, keep_order=False): + if hidden.size(0) != target.size(0): + raise RuntimeError('Input and target should have the same size ' + 'in the batch dimension.') + + head_weight = torch.cat( + [weight[:self.shortlist_size], self.cluster_weight], dim=0) + head_bias = torch.cat( + [bias[:self.shortlist_size], self.cluster_bias], dim=0) + + head_logit = F.linear(hidden, head_weight, bias=head_bias) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, h_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < h_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i = weight[l_idx:h_idx] + bias_i = bias[l_idx:h_idx] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = F.linear(hidden_i, weight_i, bias=bias_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + print(f'target_i[:,None]: {target_i[:, None]}') + print(f'target_i[:,None].shape: {target_i[:, None].shape}') + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/data_parallel.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..1b48aaaa644e8310cce7cb4d04d14b9d832d39ff --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/data_parallel.py @@ -0,0 +1,109 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from torch.nn.parallel import DataParallel +import torch +from torch.nn.parallel._functions import Scatter +from torch.nn.parallel.parallel_apply import parallel_apply + +def scatter(inputs, target_gpus, chunk_sizes, dim=0): + r""" + Slices tensors into approximately equal chunks and + distributes them across given GPUs. Duplicates + references to objects that are not tensors. + """ + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + try: + return Scatter.apply(target_gpus, chunk_sizes, dim, obj) + except: + print('obj', obj.size()) + print('dim', dim) + print('chunk_sizes', chunk_sizes) + quit() + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict) and len(obj) > 0: + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + try: + return scatter_map(inputs) + finally: + scatter_map = None + +def scatter_kwargs(inputs, kwargs, target_gpus, chunk_sizes, dim=0): + r"""Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, chunk_sizes, dim) if inputs else [] + kwargs = scatter(kwargs, target_gpus, chunk_sizes, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs + +class BalancedDataParallel(DataParallel): + def __init__(self, gpu0_bsz, *args, **kwargs): + self.gpu0_bsz = gpu0_bsz + super().__init__(*args, **kwargs) + + def forward(self, *inputs, **kwargs): + if not self.device_ids: + return self.module(*inputs, **kwargs) + if self.gpu0_bsz == 0: + device_ids = self.device_ids[1:] + else: + device_ids = self.device_ids + inputs, kwargs = self.scatter(inputs, kwargs, device_ids) + if len(self.device_ids) == 1: + return self.module(*inputs[0], **kwargs[0]) + replicas = self.replicate(self.module, self.device_ids) + if self.gpu0_bsz == 0: + replicas = replicas[1:] + outputs = self.parallel_apply(replicas, device_ids, inputs, kwargs) + + ######################################3 + # outputs=outputs.to('cpu') + # self.output_device='cpu' + print(f'outputs: {outputs}') + print(f'type(outputs): {type(outputs)}') + print(f'len(outputs): {len(outputs)}') + print(f'self.output_device: {self.output_device}') + + + return self.gather(outputs, self.output_device) + + def parallel_apply(self, replicas, device_ids, inputs, kwargs): + return parallel_apply(replicas, inputs, kwargs, device_ids) + + def scatter(self, inputs, kwargs, device_ids): + bsz = inputs[0].size(self.dim) + num_dev = len(self.device_ids) + gpu0_bsz = self.gpu0_bsz + bsz_unit = (bsz - gpu0_bsz) // (num_dev - 1) + if gpu0_bsz < bsz_unit: + chunk_sizes = [gpu0_bsz] + [bsz_unit] * (num_dev - 1) + delta = bsz - sum(chunk_sizes) + for i in range(delta): + chunk_sizes[i + 1] += 1 + if gpu0_bsz == 0: + chunk_sizes = chunk_sizes[1:] + else: + return 
super().scatter(inputs, kwargs, device_ids) + return scatter_kwargs(inputs, kwargs, device_ids, chunk_sizes, dim=self.dim) + diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/exp_utils.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/exp_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f290b8e70eed7448095e9af4b97341d0e89644eb --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/exp_utils.py @@ -0,0 +1,40 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import functools +import os +import shutil +import torch + + +def logging(s, log_path, print_=True, log_=True): + if print_: + print(s) + if log_: + with open(log_path, 'a+') as f_log: + f_log.write(s + '\n') + + +def get_logger(log_path, **kwargs): + return functools.partial(logging, log_path=log_path, **kwargs) + + +def create_exp_dir(dir_path, scripts_to_save=None, debug=False): + print('Experiment dir : {}'.format(dir_path)) + return get_logger(log_path='log.txt') + + +def save_checkpoint(model, optimizer, path, epoch): + torch.save(model, os.path.join(path, 'model_{}.pt'.format(epoch))) + torch.save(optimizer.state_dict(), os.path.join(path, 'optimizer_{}.pt'.format(epoch))) diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/log_uniform_sampler.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/log_uniform_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..4ebe1297479dd63b01cfcc2d553760fff26947b0 --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/log_uniform_sampler.py @@ -0,0 +1,111 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
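+# Log-uniform (Zipf-like) negative sampler and the sampled-softmax logits helper,
+# intended for the sampled-softmax path (--sample_softmax > 0).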
+import torch
+from torch import nn
+
+class LogUniformSampler(object):
+    def __init__(self, range_max, n_sample):
+        """
+        Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
+        `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
+
+        expected count can be approximated by 1 - (1 - p)^n
+        and we use a numerically stable version -expm1(num_tries * log1p(-p))
+
+        Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
+        """
+        with torch.no_grad():
+            self.range_max = range_max
+            log_indices = torch.arange(1., range_max+2., 1.).log_()
+            self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
+            # print('P', self.dist.numpy().tolist()[-30:])
+
+            self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
+
+        self.n_sample = n_sample
+
+    def sample(self, labels):
+        """
+            labels: [b1, b2]
+        Return
+            true_log_probs: [b1, b2]
+            samp_log_probs: [n_sample]
+            neg_samples: [n_sample]
+        """
+
+        n_sample = self.n_sample
+        n_tries = 2 * n_sample
+
+        with torch.no_grad():
+            neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()
+            device = labels.device
+            neg_samples = neg_samples.to(device)
+            true_log_probs = self.log_q[labels].to(device)
+            samp_log_probs = self.log_q[neg_samples].to(device)
+            return true_log_probs, samp_log_probs, neg_samples
+
+def sample_logits(embedding, bias, labels, inputs, sampler):
+    """
+        embedding: an nn.Embedding layer
+        bias: [n_vocab]
+        labels: [b1, b2]
+        inputs: [b1, b2, n_emb]
+        sampler: you may use a LogUniformSampler
+    Return
+        logits: [b1, b2, 1 + n_sample]
+    """
+    true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)
+    n_sample = neg_samples.size(0)
+    b1, b2 = labels.size(0), labels.size(1)
+    all_ids = torch.cat([labels.view(-1), neg_samples])
+    all_w = embedding(all_ids)
+    true_w = all_w[: -n_sample].view(b1, b2, -1)
+    sample_w = all_w[- n_sample:].view(n_sample, -1)
+
+    all_b = bias[all_ids]
+    true_b = all_b[: -n_sample].view(b1, b2)
+    sample_b = all_b[- n_sample:]
+
+    hit = (labels[:, :, None] == neg_samples).detach()
+
+    true_logits = torch.einsum('ijk,ijk->ij',
+        [true_w, inputs]) + true_b - true_log_probs
+    sample_logits = torch.einsum('lk,ijk->ijl',
+        [sample_w, inputs]) + sample_b - samp_log_probs
+    sample_logits.masked_fill_(hit, -1e30)
+    logits = torch.cat([true_logits[:, :, None], sample_logits], -1)
+
+    return logits
+
+
+if __name__ == '__main__':
+    S, B = 3, 4
+    n_vocab = 10000
+    n_sample = 5
+    H = 32
+
+    labels = torch.LongTensor(S, B).random_(0, n_vocab)
+    # LogUniformSampler takes (range_max, n_sample)
+    sampler = LogUniformSampler(n_vocab, n_sample)
+
+    embedding = nn.Embedding(n_vocab, H)
+    bias = torch.zeros(n_vocab)
+    inputs = torch.Tensor(S, B, H).normal_()
+
+    # sample_logits returns a single [S, B, 1 + n_sample] logits tensor
+    logits = sample_logits(embedding, bias, labels, inputs, sampler)
+    print('logits', logits.detach().numpy().tolist())
+    print('logits shape', logits.size())
+
diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/proj_adaptive_softmax.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/proj_adaptive_softmax.py
new file mode 100644
index 0000000000000000000000000000000000000000..886757190ce2b308dc3e3d6e66762f623cb9878a
--- /dev/null
+++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/proj_adaptive_softmax.py
@@ -0,0 +1,160 @@
+# coding: UTF-8
+# Copyright 2022 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+####################################################################################
+# edit
+# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])  # CUDA major version
+# CUDA_MINOR = int(torch.version.cuda.split('.')[1])  # CUDA minor version
+#####################################################################################
+
+class ProjectedAdaptiveLogSoftmax(nn.Module):
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+                 keep_order=False):
+        super(ProjectedAdaptiveLogSoftmax, self).__init__()
+        self.n_token = n_token
+        self.d_embed = d_embed
+        self.d_proj = d_proj
+
+        self.cutoffs = cutoffs + [n_token]
+        self.cutoff_ends = [0] + self.cutoffs
+        self.div_val = div_val
+
+        self.shortlist_size = self.cutoffs[0]
+        self.n_clusters = len(self.cutoffs) - 1
+        self.head_size = self.shortlist_size + self.n_clusters
+
+        if self.n_clusters > 0:
+            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
+            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
+
+        self.out_layers = nn.ModuleList()
+        self.out_projs = nn.ParameterList()
+
+        if div_val == 1:
+            for i in range(len(self.cutoffs)):
+                if d_proj != d_embed:
+                    self.out_projs.append(
+                        nn.Parameter(torch.Tensor(d_proj, d_embed))
+                    )
+                else:
+                    self.out_projs.append(None)
+
+            self.out_layers.append(nn.Linear(d_embed, n_token))
+        else:
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
+                d_emb_i = d_embed // (div_val ** i)
+
+                self.out_projs.append(
+                    nn.Parameter(torch.Tensor(d_proj, d_emb_i))
+                )
+
+                self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
+
+        self.keep_order = keep_order
+
+    def _compute_logit(self, hidden, weight, bias, proj):
+        if proj is None:
+            logit = F.linear(hidden, weight, bias=bias)
+        else:
+            proj_hid = F.linear(hidden, proj.t().contiguous())
+            logit = F.linear(proj_hid, weight, bias=bias)
+
+        return logit
+
+    def forward(self, hidden, target, keep_order=False):
+        '''
+            hidden :: [len*bsz x d_proj]
+            target :: [len*bsz]
+        '''
+
+        if hidden.size(0) != target.size(0):
+            raise RuntimeError('Input and target should have the same size '
+                               'in the batch dimension.')
+
+        if self.n_clusters == 0:
+            logit = self._compute_logit(hidden, self.out_layers[0].weight,
+                                        self.out_layers[0].bias, self.out_projs[0])
+
+            nll = -F.log_softmax(logit, dim=-1).gather(1, target.unsqueeze(1).long()).squeeze(1)
+
+        else:
+            weights, biases = [], []
+            for i in range(len(self.cutoffs)):
+                if self.div_val == 1:
+                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
+                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
+                else:
+                    weight_i = self.out_layers[i].weight
+                    bias_i = self.out_layers[i].bias
+
+                if i == 0:
+                    weight_i = torch.cat(
+                        [weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat(
+                        [bias_i, self.cluster_bias], dim=0)
+
+                weights.append(weight_i)
+                biases.append(bias_i)
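+
+            # The head softmax covers the shortlist tokens plus one "cluster token"
+            # per tail cluster; a tail word's log-prob is assembled below as
+            # log P(cluster | h) + log P(word | cluster, h).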
+ + head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] + + head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) + head_logprob = F.log_softmax(head_logit, dim=1) + + nll = torch.zeros_like(target, + dtype=hidden.dtype, device=hidden.device) + + offset = 0 + cutoff_values = [0] + self.cutoffs + for i in range(len(cutoff_values) - 1): + l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] + + mask_i = (target >= l_idx) & (target < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + target_i = target.index_select(0, indices_i) - l_idx + head_logprob_i = head_logprob.index_select(0, indices_i) + + if i == 0: + logprob_i = head_logprob_i.gather(1, target_i[:,None]).squeeze(1) + else: + weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] + + hidden_i = hidden.index_select(0, indices_i) + + tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) + tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + + logprob_i = head_logprob_i[:, -i] \ + + tail_logprob_i.gather(1, target_i[:,None]).squeeze(1) + + if (hasattr(self, 'keep_order') and self.keep_order) or keep_order: + nll.index_copy_(0, indices_i, -logprob_i) + else: + nll[offset:offset+logprob_i.size(0)].copy_(-logprob_i) + + offset += logprob_i.size(0) + + return nll diff --git a/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/vocabulary.py b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/vocabulary.py new file mode 100644 index 0000000000000000000000000000000000000000..a13e4183c243befab6f7167719931782f67c843c --- /dev/null +++ b/PyTorch/contrib/nlp/Transformer-xl_for_PyTorch/utils/vocabulary.py @@ -0,0 +1,178 @@ +# coding: UTF-8 +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
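+# Vocab maintains the symbol<->index mapping (built from corpus counts or a fixed
+# vocab file) and encodes tokenized text into LongTensor id sequences.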
+import os
+from collections import Counter, OrderedDict
+import torch
+
+
+class Vocab(object):
+    def __init__(self, special=[], min_freq=0, max_size=None, lower_case=True,
+                 delimiter=None, vocab_file=None):
+        self.counter = Counter()
+        self.special = special
+        self.min_freq = min_freq
+        self.max_size = max_size
+        self.lower_case = lower_case
+        self.delimiter = delimiter
+        self.vocab_file = vocab_file
+
+
+    def tokenize(self, line, add_eos=False, add_double_eos=False):
+        line = line.strip()
+        if self.lower_case:
+            line = line.lower()
+
+        if self.delimiter == '':
+            symbols = line
+        else:
+            symbols = line.split(self.delimiter)
+
+        if add_double_eos: # lm1b
+            return ['<S>'] + symbols + ['<S>']
+        elif add_eos:
+            return symbols + ['<eos>']
+        else:
+            return symbols
+
+
+
+    def count_file(self, path, verbose=False, add_eos=False):
+        if verbose: print('counting file {} ...'.format(path))
+        assert os.path.exists(path)
+
+        sents = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos)
+                self.counter.update(symbols)
+                sents.append(symbols)
+
+        return sents
+
+
+    def count_sents(self, sents, verbose=False):
+        """
+            sents : a list of sentences, each a list of tokenized symbols
+        """
+        if verbose: print('counting {} sents ...'.format(len(sents)))
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            self.counter.update(symbols)
+
+    def _build_from_file(self, vocab_file):
+        self.idx2sym = []
+        self.sym2idx = OrderedDict()
+
+        with open(vocab_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                symb = line.strip().split()[0]
+                self.add_symbol(symb)
+        self.unk_idx = self.sym2idx['<UNK>']
+
+    def build_vocab(self):
+        if self.vocab_file:
+            print('building vocab from {}'.format(self.vocab_file))
+            self._build_from_file(self.vocab_file)
+            print('final vocab size {}'.format(len(self)))
+        else:
+            print('building vocab with min_freq={}, max_size={}'.format(
+                self.min_freq, self.max_size))
+            self.idx2sym = []
+            self.sym2idx = OrderedDict()
+
+            for sym in self.special:
+                self.add_special(sym)
+
+            for sym, cnt in self.counter.most_common(self.max_size):
+                if cnt < self.min_freq: break
+                self.add_symbol(sym)
+
+            print('final vocab size {} from {} unique tokens'.format(
+                len(self), len(self.counter)))
+
+    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
+                    add_double_eos=False):
+        if verbose: print('encoding file {} ...'.format(path))
+        assert os.path.exists(path)
+        encoded = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    print('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos,
+                    add_double_eos=add_double_eos)
+                encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def encode_sents(self, sents, ordered=False, verbose=False):
+        if verbose: print('encoding {} sents ...'.format(len(sents)))
+        encoded = []
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                print('    line {}'.format(idx))
+            encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def add_special(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+
+    def add_symbol(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+
+    def get_sym(self, idx):
+        assert 0 <= idx < len(self), 'Index {} out of range'.format(idx)
+        return self.idx2sym[idx]
+
+    def get_idx(self, sym):
+        if sym in self.sym2idx:
+            return self.sym2idx[sym]
+        else:
+            assert '<eos>' not in sym
+            assert hasattr(self, 'unk_idx')
+            return self.sym2idx.get(sym, self.unk_idx)
+
+    def get_symbols(self, indices):
+        return [self.get_sym(idx) for idx in indices]
+
+    def get_indices(self, symbols):
+        return [self.get_idx(sym) for sym in symbols]
+
+    def convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.get_indices(symbols))
+
+    def convert_to_sent(self, indices, exclude=None):
+        if exclude is None:
+            return ' '.join([self.get_sym(idx) for idx in indices])
+        else:
+            return ' '.join([self.get_sym(idx) for idx in indices if idx not in exclude])
+
+    def __len__(self):
+        return len(self.idx2sym)