commit 95d4f2ca7d2f916960379f09233a14ddb13534d1 Author: Petr Masopust Date: Thu Jul 25 23:35:26 2019 +0200 Faceserver vectorizer initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..939889d --- /dev/null +++ b/.gitignore @@ -0,0 +1,71 @@ +# Runtime directories +ckpt/ +mAP_txt/ +summary/ +weight/ + +# IntelliJ IDEA +.idea/ +*.iml + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +# lib is NOT ignored +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ diff --git a/vectorizer/identification/anchors.py b/vectorizer/identification/anchors.py new file mode 100644 index 0000000..791cad0 --- /dev/null +++ b/vectorizer/identification/anchors.py @@ -0,0 +1,131 @@ +import numpy as np +import torch +import torch.nn as nn + + +class Anchors(nn.Module): + def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None, is_cuda=True): + super(Anchors, self).__init__() + + self.is_cuda = is_cuda + if pyramid_levels is None: + self.pyramid_levels = [3, 4, 5, 6, 7] + if strides is None: + self.strides = [2 ** x for x in self.pyramid_levels] + if sizes is None: + self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] + if ratios is None: + # self.ratios = np.array([1., 1.5, 2., 2.5, 3.]) + self.ratios = np.array([0.5, 1., 2.]) + if scales is None: + self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) + + def forward(self, image): + + image_shape = image.shape[2:] + image_shape = np.array(image_shape) + image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels] + + # compute anchors over all pyramid levels + all_anchors = np.zeros((0, 4)).astype(np.float32) + + for idx, p in enumerate(self.pyramid_levels): + anchors = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales) + shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors) + all_anchors = np.append(all_anchors, shifted_anchors, axis=0) + + all_anchors = np.expand_dims(all_anchors, axis=0) + all_anchors = torch.from_numpy(all_anchors.astype(np.float32)) + if self.is_cuda: + all_anchors = all_anchors.cuda() + + return all_anchors + + +def generate_anchors(base_size=16, ratios=None, scales=None): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales w.r.t. a reference window. 
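+    Returns an (N, 4) array of anchors in (x1, y1, x2, y2) form, centered at
+    the origin, with N = len(ratios) * len(scales).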
+ """ + + if ratios is None: + ratios = np.array([0.5, 1, 2]) + + if scales is None: + scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) + + num_anchors = len(ratios) * len(scales) + + # initialize output anchors + anchors = np.zeros((num_anchors, 4)) + + # scale base_size + anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T + + # compute areas of anchors + areas = anchors[:, 2] * anchors[:, 3] + + # correct for ratios + anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) + anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) + + # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) + anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T + anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T + + return anchors + + +def compute_shape(image_shape, pyramid_levels): + """Compute shapes based on pyramid levels. + + :param image_shape: + :param pyramid_levels: + :return: + """ + image_shape = np.array(image_shape[:2]) + image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] + return image_shapes + + +def anchors_for_shape( + image_shape, + pyramid_levels=None, + ratios=None, + scales=None, + strides=None, + sizes=None, +): + image_shapes = compute_shape(image_shape, pyramid_levels) + + # compute anchors over all pyramid levels + all_anchors = np.zeros((0, 4)) + for idx, p in enumerate(pyramid_levels): + anchors = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales) + shifted_anchors = shift(image_shapes[idx], strides[idx], anchors) + all_anchors = np.append(all_anchors, shifted_anchors, axis=0) + + return all_anchors + + +def shift(shape, stride, anchors): + shift_x = (np.arange(0, shape[1]) + 0.5) * stride + shift_y = (np.arange(0, shape[0]) + 0.5) * stride + + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + + shifts = np.vstack(( + shift_x.ravel(), shift_y.ravel(), + shift_x.ravel(), shift_y.ravel() + )).transpose() + + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = anchors.shape[0] + K = shifts.shape[0] + all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) + all_anchors = all_anchors.reshape((K * A, 4)) + + return all_anchors diff --git a/vectorizer/identification/detector.py b/vectorizer/identification/detector.py new file mode 100644 index 0000000..aafe2ab --- /dev/null +++ b/vectorizer/identification/detector.py @@ -0,0 +1,95 @@ +import numpy as np +import torch +from PIL import Image + +from torchvision import transforms + + +class Resizer(object): + """Convert ndarrays in sample to Tensors.""" + + def __call__(self, sample, min_side=800, max_side=1400): + image, annots, scale = sample['img'], sample['annot'], sample['scale'] + + rows, cols = image.size + + # scale = min_side / rows + + smallest_side = min(rows, cols) + + # rescale the image so the smallest side is min_side + scale = min_side / smallest_side + + # check if the largest side is now greater than max_side, which can happen + # when images have a large aspect ratio + largest_side = max(rows, cols) + + if largest_side * scale > max_side: + scale = max_side / largest_side + + # resize the image with the computed scale + image = np.array(image.resize((int(round((cols * scale))), int(round((rows * scale)))), resample=Image.BILINEAR)) + image = image / 255.0 + + rows, cols, cns = image.shape + + pad_w = 32 - rows % 32 + pad_h = 32 - cols % 32 + + new_image = np.zeros((rows + pad_w, cols + pad_h, 
cns)).astype(np.float32) + new_image[:rows, :cols, :] = image.astype(np.float32) + + annots[:, :4] *= scale + + return {'img': new_image, 'annot': annots, 'scale': scale} + + +class Normalizer(object): + def __init__(self): + self.mean = np.array([[[0.485, 0.456, 0.406]]]) + self.std = np.array([[[0.229, 0.224, 0.225]]]) + + def __call__(self, sample): + image, annots, scales = sample['img'], sample['annot'], sample['scale'] + + image = (image.astype(np.float32) - self.mean) / self.std + + sample = {'img': torch.from_numpy(image), 'annot': torch.from_numpy(annots), 'scale': scales} + return sample + + +def fan_detect(model, img_data, threshold=0.9, max_detections=100, is_cuda=True): + input_data = {'img': img_data, 'annot': np.zeros((0, 5)), 'scale': 1} + transform = transforms.Compose([Resizer(), Normalizer()]) + transformed = transform(input_data) + + model.eval() + with torch.no_grad(): + img_data = transformed['img'].permute(2, 0, 1).float().unsqueeze(dim=0) + if is_cuda: + img_data = img_data.cuda() + scores, labels, boxes = model(img_data) + if scores is None: + return np.array() + + scores = scores.cpu().numpy() + scale = transformed['scale'] + boxes = boxes.cpu().numpy() / scale + + indices = np.where(scores > threshold)[0] + scores = scores[indices] + scores_sort = np.argsort(-scores)[:max_detections] + image_boxes = boxes[indices[scores_sort], :] + + return image_boxes + + +def load_model(model_path, is_cuda=True): + # load possible cuda model as cpu + model = torch.load(model_path, map_location=lambda storage, location: storage) + if is_cuda: + model = model.cuda() + + model.anchors.is_cuda=is_cuda + + return model diff --git a/vectorizer/identification/losses.py b/vectorizer/identification/losses.py new file mode 100644 index 0000000..9e63f34 --- /dev/null +++ b/vectorizer/identification/losses.py @@ -0,0 +1,289 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def memprint(a): + print(a.shape) + print(a.element_size() * a.nelement()) + +def calc_iou(a, b): + step = 20 + IoU = torch.zeros((len(a), len(b))).cuda() + step_count = int(len(b) / step) + if len(b) % step != 0: + step_count += 1 + + area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) + + for i in range(step_count): + iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[i * step:(i+1) * step, 2]) + iw.sub_(torch.max(torch.unsqueeze(a[:, 0], 1), b[i * step:(i+1) * step, 0])) + + ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[i * step:(i+1) * step, 3]) + ih.sub_(torch.max(torch.unsqueeze(a[:, 1], 1), b[i * step:(i+1) * step, 1])) + + iw.clamp_(min=0) + ih.clamp_(min=0) + + iw.mul_(ih) + del ih + + ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area[i * step:(i+1) * step] - iw + ua = torch.clamp(ua, min=1e-8) + iw.div_(ua) + del ua + + IoU[:, i * step:(i+1) * step] = iw + + return IoU + + +def calc_iou_vis(a, b): + area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) + + iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0]) + ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1]) + + iw = torch.clamp(iw, min=0) + ih = torch.clamp(ih, min=0) + + intersection = iw * ih + + IoU = intersection / area + + return IoU + + +def IoG(box_a, box_b): + inter_xmin = torch.max(box_a[:, 0], box_b[:, 0]) + inter_ymin = torch.max(box_a[:, 1], box_b[:, 1]) + inter_xmax = torch.min(box_a[:, 2], box_b[:, 2]) + inter_ymax = torch.min(box_a[:, 3], box_b[:, 3]) + Iw = 
torch.clamp(inter_xmax - inter_xmin, min=0) + Ih = torch.clamp(inter_ymax - inter_ymin, min=0) + I = Iw * Ih + G = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1]) + return I / G + + +class FocalLoss(nn.Module): + def __init__(self, is_cuda=True): + super(FocalLoss, self).__init__() + self.is_cuda = is_cuda + + def forward(self, classifications, regressions, anchors, annotations): + alpha = 0.25 + gamma = 2.0 + batch_size = classifications.shape[0] + classification_losses = [] + regression_losses = [] + + anchor = anchors[0, :, :] + + anchor_widths = anchor[:, 2] - anchor[:, 0] + anchor_heights = anchor[:, 3] - anchor[:, 1] + anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths + anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights + + for j in range(batch_size): + + classification = classifications[j, :, :] + regression = regressions[j, :, :] + + bbox_annotation = annotations[j, :, :] + bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] + + if bbox_annotation.shape[0] == 0: + if self.is_cuda: + regression_losses.append(torch.tensor(0).float().cuda()) + classification_losses.append(torch.tensor(0).float().cuda()) + else: + regression_losses.append(torch.tensor(0).float()) + classification_losses.append(torch.tensor(0).float()) + + continue + + classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) + + IoU = calc_iou(anchor, bbox_annotation[:, :4]) # num_anchors x num_annotations + + IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1 + + # compute the loss for classification + targets = torch.ones(classification.shape) * -1 + if self.is_cuda: + targets = targets.cuda() + + targets[torch.lt(IoU_max, 0.4), :] = 0 + + positive_ful = torch.ge(IoU_max, 0.5) + positive_indices = positive_ful + + num_positive_anchors = positive_indices.sum() + + assigned_annotations = bbox_annotation[IoU_argmax, :] + + targets[positive_indices, :] = 0 + targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1 + try: + alpha_factor = torch.ones(targets.shape) + if self.is_cuda: + alpha_factor = alpha_factor.cuda() + alpha_factor *= alpha + except: + print(targets) + print(targets.shape) + + alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) + focal_weight = torch.where(torch.eq(targets, 1.), 1. 
- classification, classification) + focal_weight = alpha_factor * torch.pow(focal_weight, gamma) + + bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification)) + + # cls_loss = focal_weight * torch.pow(bce, gamma) + cls_loss = focal_weight * bce + + cls_zeros = torch.zeros(cls_loss.shape) + if self.is_cuda: + cls_zeros = cls_zeros.cuda() + cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, cls_zeros) + + classification_losses.append(cls_loss.sum() / torch.clamp(num_positive_anchors.float(), min=1.0)) + + # compute the loss for regression + + if positive_indices.sum() > 0: + assigned_annotations = assigned_annotations[positive_indices, :] + + anchor_widths_pi = anchor_widths[positive_indices] + anchor_heights_pi = anchor_heights[positive_indices] + anchor_ctr_x_pi = anchor_ctr_x[positive_indices] + anchor_ctr_y_pi = anchor_ctr_y[positive_indices] + + gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0] + gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1] + gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths + gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights + + # clip widths to 1 + gt_widths = torch.clamp(gt_widths, min=1) + gt_heights = torch.clamp(gt_heights, min=1) + + targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi + targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi + targets_dw = torch.log(gt_widths / anchor_widths_pi) + targets_dh = torch.log(gt_heights / anchor_heights_pi) + + targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh)) + targets = targets.t() + + if self.is_cuda: + targets = targets.cuda() / torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda() + else: + targets = targets / torch.Tensor([[0.1, 0.1, 0.2, 0.2]]) + + regression_diff = torch.abs(targets - regression[positive_indices, :]) + + regression_loss = torch.where( + torch.le(regression_diff, 1.0 / 9.0), + 0.5 * 9.0 * torch.pow(regression_diff, 2), + regression_diff - 0.5 / 9.0 + ) + regression_losses.append(regression_loss.mean()) + else: + if self.is_cuda: + regression_losses.append(torch.tensor(0).float().cuda()) + else: + regression_losses.append(torch.tensor(0).float()) + + return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses) \ + .mean(dim=0, keepdim=True) + + +class LevelAttentionLoss(nn.Module): + def __init__(self, is_cuda=True): + super(LevelAttentionLoss, self).__init__() + self.is_cuda = is_cuda + + def forward(self, img_batch_shape, attention_mask, bboxs): + h, w = img_batch_shape[2], img_batch_shape[3] + + mask_losses = [] + + batch_size = bboxs.shape[0] + for j in range(batch_size): + + bbox_annotation = bboxs[j, :, :] + bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] + + if bbox_annotation.shape[0] == 0: + if self.is_cuda: + mask_losses.append(torch.tensor(0).float().cuda()) + else: + mask_losses.append(torch.tensor(0).float()) + continue + + cond1 = torch.le(bbox_annotation[:, 0], w) + cond2 = torch.le(bbox_annotation[:, 1], h) + cond3 = torch.le(bbox_annotation[:, 2], w) + cond4 = torch.le(bbox_annotation[:, 3], h) + cond = cond1 * cond2 * cond3 * cond4 + + bbox_annotation = bbox_annotation[cond, :] + + if bbox_annotation.shape[0] == 0: + if self.is_cuda: + mask_losses.append(torch.tensor(0).float().cuda()) + else: + mask_losses.append(torch.tensor(0).float()) + continue + + bbox_area = (bbox_annotation[:, 2] - bbox_annotation[:, 0]) * ( + bbox_annotation[:, 3] - bbox_annotation[:, 1]) + + mask_loss = [] + for id in 
range(len(attention_mask)): + + attention_map = attention_mask[id][j, 0, :, :] + + min_area = (2 ** (id + 5)) ** 2 * 0.5 + max_area = (2 ** (id + 5) * 1.58) ** 2 * 2 + + level_bbox_indice1 = torch.ge(bbox_area, min_area) + level_bbox_indice2 = torch.le(bbox_area, max_area) + + level_bbox_indice = level_bbox_indice1 * level_bbox_indice2 + + level_bbox_annotation = bbox_annotation[level_bbox_indice, :].clone() + + # level_bbox_annotation = bbox_annotation.clone() + + attention_h, attention_w = attention_map.shape + + if level_bbox_annotation.shape[0]: + level_bbox_annotation[:, 0] *= attention_w / w + level_bbox_annotation[:, 1] *= attention_h / h + level_bbox_annotation[:, 2] *= attention_w / w + level_bbox_annotation[:, 3] *= attention_h / h + + mask_gt = torch.zeros(attention_map.shape) + if self.is_cuda: + mask_gt = mask_gt.cuda() + + for i in range(level_bbox_annotation.shape[0]): + x1 = max(int(level_bbox_annotation[i, 0]), 0) + y1 = max(int(level_bbox_annotation[i, 1]), 0) + x2 = min(math.ceil(level_bbox_annotation[i, 2]) + 1, attention_w) + y2 = min(math.ceil(level_bbox_annotation[i, 3]) + 1, attention_h) + + mask_gt[y1:y2, x1:x2] = 1 + + mask_gt = mask_gt[mask_gt >= 0] + mask_predict = attention_map[attention_map >= 0] + + mask_loss.append(F.binary_cross_entropy(mask_predict, mask_gt)) + mask_losses.append(torch.stack(mask_loss).mean()) + + return torch.stack(mask_losses).mean(dim=0, keepdim=True) diff --git a/vectorizer/identification/model_level_attention.py b/vectorizer/identification/model_level_attention.py new file mode 100644 index 0000000..1bcd663 --- /dev/null +++ b/vectorizer/identification/model_level_attention.py @@ -0,0 +1,385 @@ +import torch.nn as nn +import torch +import math +from identification.utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes +from identification.anchors import Anchors +from identification.losses import LevelAttentionLoss, FocalLoss +from torchvision.ops.boxes import nms as tv_nms + + +def nms(dets, thresh): + """Dispatch to either CPU or GPU NMS implementations. 
Accept dets as tensor""" + return tv_nms(dets[:, :4], dets[:, 4], thresh) + + +class PyramidFeatures(nn.Module): + def __init__(self, c3_size, c4_size, c5_size, feature_size=256): + super(PyramidFeatures, self).__init__() + + # upsample C5 to get P5 from the FPN paper + self.p5_1 = nn.Conv2d(c5_size, feature_size, kernel_size=1, stride=1, padding=0) + self.p5_upsampled = nn.Upsample(scale_factor=2, mode='nearest') + self.p5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) + + # add P5 elementwise to C4 + self.p4_1 = nn.Conv2d(c4_size, feature_size, kernel_size=1, stride=1, padding=0) + self.p4_upsampled = nn.Upsample(scale_factor=2, mode='nearest') + self.p4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) + + # add P4 elementwise to C3 + self.p3_1 = nn.Conv2d(c3_size, feature_size, kernel_size=1, stride=1, padding=0) + self.p3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) + + # "P6 is obtained via a 3x3 stride-2 conv on C5" + self.p6 = nn.Conv2d(c5_size, feature_size, kernel_size=3, stride=2, padding=1) + + # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6" + self.p7_1 = nn.ReLU() + self.p7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1) + + def forward(self, inputs): + c3, c4, c5 = inputs + + # TODO hack for old model + self.p5_1.padding_mode = 'zeros' + self.p5_2.padding_mode = 'zeros' + self.p4_1.padding_mode = 'zeros' + self.p4_2.padding_mode = 'zeros' + self.p3_1.padding_mode = 'zeros' + self.p3_2.padding_mode = 'zeros' + self.p6.padding_mode = 'zeros' + self.p7_2.padding_mode = 'zeros' + + p5_x = self.p5_1(c5) + p5_upsampled_x = self.p5_upsampled(p5_x) + p5_x = self.p5_2(p5_x) + + p4_x = self.p4_1(c4) + p4_x = p5_upsampled_x + p4_x + p4_upsampled_x = self.p4_upsampled(p4_x) + p4_x = self.p4_2(p4_x) + + p3_x = self.p3_1(c3) + p3_x = p3_x + p4_upsampled_x + p3_x = self.p3_2(p3_x) + + p6_x = self.p6(c5) + + p7_x = self.p7_1(p6_x) + p7_x = self.p7_2(p7_x) + + return [p3_x, p4_x, p5_x, p6_x, p7_x] + + +class RegressionModel(nn.Module): + def __init__(self, num_features_in, num_anchors=9, feature_size=256): + super(RegressionModel, self).__init__() + + self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) + self.act1 = nn.ReLU() + + self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act2 = nn.ReLU() + + self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act3 = nn.ReLU() + + self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act4 = nn.ReLU() + + self.output = nn.Conv2d(feature_size, num_anchors * 4, kernel_size=3, padding=1) + + def forward(self, x): + # TODO hack for old model + self.conv1.padding_mode = 'zeros' + self.conv2.padding_mode = 'zeros' + self.conv3.padding_mode = 'zeros' + self.conv4.padding_mode = 'zeros' + self.output.padding_mode = 'zeros' + + out = self.conv1(x) + out = self.act1(out) + + out = self.conv2(out) + out = self.act2(out) + + out = self.conv3(out) + out = self.act3(out) + + out = self.conv4(out) + out = self.act4(out) + + out = self.output(out) + + # out is B x C x W x H, with C = 4*num_anchors + out = out.permute(0, 2, 3, 1) + + return out.contiguous().view(out.shape[0], -1, 4) + + +class ClassificationModel(nn.Module): + def __init__(self, num_features_in, num_anchors=9, num_classes=80, feature_size=256): + super(ClassificationModel, self).__init__() + + self.num_classes = num_classes + 
self.num_anchors = num_anchors + + self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) + self.act1 = nn.ReLU() + + self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act2 = nn.ReLU() + + self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act3 = nn.ReLU() + + self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act4 = nn.ReLU() + + self.output = nn.Conv2d(feature_size, num_anchors * num_classes, kernel_size=3, padding=1) + self.output_act = nn.Sigmoid() + + def forward(self, x): + # TODO hack for old model + self.conv1.padding_mode = 'zeros' + self.conv2.padding_mode = 'zeros' + self.conv3.padding_mode = 'zeros' + self.conv4.padding_mode = 'zeros' + self.output.padding_mode = 'zeros' + + out = self.conv1(x) + out = self.act1(out) + + out = self.conv2(out) + out = self.act2(out) + + out = self.conv3(out) + out = self.act3(out) + + out = self.conv4(out) + out = self.act4(out) + + out = self.output(out) + out = self.output_act(out) + + # out is B x C x W x H, with C = n_classes + n_anchors + out1 = out.permute(0, 2, 3, 1) + + batch_size, width, height, channels = out1.shape + + out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes) + + return out2.contiguous().view(x.shape[0], -1, self.num_classes) + + +class LevelAttentionModel(nn.Module): + def __init__(self, num_features_in, feature_size=256): + super(LevelAttentionModel, self).__init__() + + self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) + self.act1 = nn.ReLU() + + self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act2 = nn.ReLU() + + self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act3 = nn.ReLU() + + self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) + self.act4 = nn.ReLU() + + self.conv5 = nn.Conv2d(feature_size, 1, kernel_size=3, padding=1) + + self.output_act = nn.Sigmoid() + + def forward(self, x): + # TODO hack for old model + self.conv1.padding_mode = 'zeros' + self.conv2.padding_mode = 'zeros' + self.conv3.padding_mode = 'zeros' + self.conv4.padding_mode = 'zeros' + self.conv5.padding_mode = 'zeros' + + out = self.conv1(x) + out = self.act1(out) + + out = self.conv2(out) + out = self.act2(out) + + out = self.conv3(out) + out = self.act3(out) + + out = self.conv4(out) + out = self.act4(out) + + out = self.conv5(out) + out_attention = self.output_act(out) + + return out_attention + + +class ResNet(nn.Module): + def __init__(self, num_classes, block, layers, is_cuda=True): + self.inplanes = 64 + super(ResNet, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + if block == BasicBlock: + fpn_sizes = [self.layer2[layers[1] - 1].conv2.out_channels, self.layer3[layers[2] - 1].conv2.out_channels, + self.layer4[layers[3] - 1].conv2.out_channels] + elif block == Bottleneck: + fpn_sizes = [self.layer2[layers[1] - 1].conv3.out_channels, self.layer3[layers[2] - 1].conv3.out_channels, + self.layer4[layers[3] - 1].conv3.out_channels] + else: + 
raise Exception("Invalid block type") + + self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) + + self.regressionModel = RegressionModel(256) + self.classificationModel = ClassificationModel(256, num_classes=num_classes) + self.levelattentionModel = LevelAttentionModel(256) + + self.anchors = Anchors(is_cuda=is_cuda) + + self.regressBoxes = BBoxTransform(is_cuda=is_cuda) + + self.clipBoxes = ClipBoxes() + + self.levelattentionLoss = LevelAttentionLoss(is_cuda=is_cuda) + + self.focalLoss = FocalLoss(is_cuda=is_cuda) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + # init.xavier_normal(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + prior = 0.01 + + self.classificationModel.output.weight.data.fill_(0) + self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior)) + + self.regressionModel.output.weight.data.fill_(0) + self.regressionModel.output.bias.data.fill_(0) + + self.levelattentionModel.conv5.weight.data.fill_(0) + self.levelattentionModel.conv5.bias.data.fill_(0) + + self.freeze_bn() + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [block(self.inplanes, planes, stride, downsample)] + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def freeze_bn(self): + """Freeze BatchNorm layers.""" + for layer in self.modules(): + if isinstance(layer, nn.BatchNorm2d): + layer.eval() + + def forward(self, inputs): + if self.training: + img_batch, annotations = inputs + else: + img_batch = inputs + annotations = None + + # TODO hack for old model + self.conv1.padding_mode = 'zeros' + + x = self.conv1(img_batch) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x1 = self.layer1(x) + x2 = self.layer2(x1) + x3 = self.layer3(x2) + x4 = self.layer4(x3) + + features = self.fpn([x2, x3, x4]) + + attention = [self.levelattentionModel(feature) for feature in features] + + # i = 1 + # for level in attention: + # i += 1 + # level = level.squeeze(0) + # level = np.array(255 * unnormalize(level)).copy() + # level = np.transpose(level, (1, 2, 0)) + # plt.imsave(os.path.join('./output', str(i) + '.jpg'), level[:,:,0]) + + features = [features[i] * torch.exp(attention[i]) for i in range(len(features))] + + regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1) + + classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1) + + anchors = self.anchors(img_batch) + + if self.training: + clc_loss, reg_loss = self.focalLoss(classification, regression, anchors, annotations) + mask_loss = self.levelattentionLoss(img_batch.shape, attention, annotations) + return clc_loss, reg_loss, mask_loss + else: + # transformed_anchors = self.regressBoxes(anchors, regression) + transformed_anchors = self.clipBoxes(anchors, img_batch) + + scores = torch.max(classification, dim=2, keepdim=True)[0] + scores_over_thresh = (scores > 0.05)[0, :, 0] + + if scores_over_thresh.sum() == 0: + # no boxes to NMS, just return + # return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] + 
return [None, None, None] + + classification = classification[:, scores_over_thresh, :] + transformed_anchors = transformed_anchors[:, scores_over_thresh, :] + scores = scores[:, scores_over_thresh, :] + + anchors_nms_idx = nms(torch.cat([transformed_anchors, scores], dim=2)[0, :, :], 0.3) + nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1) + return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]] + + +def resnet18(num_classes, is_cuda=True): + return ResNet(num_classes, BasicBlock, [2, 2, 2, 2], is_cuda=is_cuda) + + +def resnet34(num_classes, is_cuda=True): + return ResNet(num_classes, BasicBlock, [3, 4, 6, 3], is_cuda=is_cuda) + + +def resnet50(num_classes, is_cuda=True): + return ResNet(num_classes, Bottleneck, [3, 4, 6, 3], is_cuda=is_cuda) + + +def resnet101(num_classes, is_cuda=True): + return ResNet(num_classes, Bottleneck, [3, 4, 23, 3], is_cuda=is_cuda) + + +def resnet152(num_classes, is_cuda=True): + return ResNet(num_classes, Bottleneck, [3, 8, 36, 3], is_cuda=is_cuda) diff --git a/vectorizer/identification/utils.py b/vectorizer/identification/utils.py new file mode 100644 index 0000000..dd6b076 --- /dev/null +++ b/vectorizer/identification/utils.py @@ -0,0 +1,282 @@ +import torch +import torch.nn as nn +import numpy as np + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + # TODO hack for old model + self.conv1.padding_mode = 'zeros' + self.conv2.padding_mode = 'zeros' + self.conv3.padding_mode = 'zeros' + if self.downsample is not None: + self.downsample[0].padding_mode = 'zeros' + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // 
reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid() + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + + +class BottleneckSE(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=16): + super(BottleneckSE, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes * 4, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class CBAMModule(nn.Module): + def __init__(self, channels, reduction): + super(CBAMModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.max_pool = nn.AdaptiveMaxPool2d(1) + self.fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1, + padding=0) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1, + padding=0) + self.sigmoid_channel = nn.Sigmoid() + self.conv_after_concat = nn.Conv2d(2, 1, kernel_size=7, stride=1, padding=3) + self.sigmoid_spatial = nn.Sigmoid() + + def forward(self, x): + module_input = x + avg = self.avg_pool(x) + mx = self.max_pool(x) + avg = self.fc1(avg) + mx = self.fc1(mx) + avg = self.relu(avg) + mx = self.relu(mx) + avg = self.fc2(avg) + mx = self.fc2(mx) + x = avg + mx + x = self.sigmoid_channel(x) + x = module_input * x + module_input = x + avg = torch.mean(x, 1, True) + mx, _ = torch.max(x, 1, True) + x = torch.cat((avg, mx), 1) + x = self.conv_after_concat(x) + x = self.sigmoid_spatial(x) + x = module_input * x + return x + + +class BottleneckCBAM(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=16): + super(BottleneckCBAM, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.se = CBAMModule(planes * 4, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class BBoxTransform(nn.Module): + def __init__(self, mean=None, std=None, is_cuda=True): + super(BBoxTransform, self).__init__() + if mean is None: + self.mean = 
torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32)) + if is_cuda: + self.mean = self.mean.cuda() + else: + self.mean = mean + if std is None: + self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)) + if is_cuda: + self.std = self.std.cuda() + else: + self.std = std + + def forward(self, boxes, deltas): + widths = boxes[:, :, 2] - boxes[:, :, 0] + heights = boxes[:, :, 3] - boxes[:, :, 1] + ctr_x = boxes[:, :, 0] + 0.5 * widths + ctr_y = boxes[:, :, 1] + 0.5 * heights + + dx = deltas[:, :, 0] * self.std[0] + self.mean[0] + dy = deltas[:, :, 1] * self.std[1] + self.mean[1] + dw = deltas[:, :, 2] * self.std[2] + self.mean[2] + dh = deltas[:, :, 3] * self.std[3] + self.mean[3] + + pred_ctr_x = ctr_x + dx * widths + pred_ctr_y = ctr_y + dy * heights + pred_w = torch.exp(dw) * widths + pred_h = torch.exp(dh) * heights + + pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w + pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h + pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w + pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h + + pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2) + + return pred_boxes + + +class ClipBoxes(nn.Module): + def __init__(self): + super(ClipBoxes, self).__init__() + + def forward(self, boxes, img): + batch_size, num_channels, height, width = img.shape + + boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) + boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) + + boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width) + boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height) + + return boxes diff --git a/vectorizer/recognition/angle.py b/vectorizer/recognition/angle.py new file mode 100644 index 0000000..cbb274d --- /dev/null +++ b/vectorizer/recognition/angle.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter + + +class AngleLinear(nn.Module): + def __init__(self, in_features, out_features): + super(AngleLinear, self).__init__() + self.W = Parameter(torch.FloatTensor(out_features, in_features)) + nn.init.xavier_uniform_(self.W) + + def forward(self, input): + x = F.normalize(input) + W = F.normalize(self.W) + return F.linear(x, W) + + +class AdaCos(nn.Module): + def __init__(self, num_classes, m=0.50, is_cuda=True): + super(AdaCos, self).__init__() + self.n_classes = num_classes + self.s = math.sqrt(2) * math.log(num_classes - 1) + self.base_s = self.s + self.m = m + self.criterion = nn.CrossEntropyLoss() + if is_cuda: + self.criterion = self.criterion.cuda() + + def forward(self, input, label): +# changed to fixed adacos +# theta = torch.acos(torch.clamp(input, -1.0 + 1e-7, 1.0 - 1e-7)) +# one_hot = torch.zeros_like(input) +# one_hot.scatter_(1, label.view(-1, 1).long(), 1) +# with torch.no_grad(): +# B_avg = torch.where(one_hot < 1, torch.exp(self.s * input), torch.zeros_like(input)) +# B_avg = torch.sum(B_avg) / input.size(0) +# theta_med = torch.median(theta) +# self.s = torch.log(B_avg) / torch.cos(torch.min(math.pi/4 * torch.ones_like(theta_med), theta_med)) +# # TODO why converge to infinity ? 
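+#            NOTE: this adaptive-scale block is commented out; the fixed scale
+#            s = sqrt(2) * log(num_classes - 1) set in __init__ is used instead.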
+# self.s = torch.clamp(self.s, self.base_s / 2, self.base_s * 2) +# print(self.s) + output = self.s * input + + return self.criterion(output, label) + + +class ArcFace(nn.Module): + def __init__(self, s=30.0, m=0.50, is_cuda=True): + super(ArcFace, self).__init__() + self.s = s + self.m = m + self.criterion = nn.CrossEntropyLoss() + if is_cuda: + self.criterion = self.criterion.cuda() + + def forward(self, input, label): + theta = torch.acos(torch.clamp(input, -1.0 + 1e-7, 1.0 - 1e-7)) + target_logits = torch.cos(theta + self.m) + one_hot = torch.zeros_like(input) + one_hot.scatter_(1, label.view(-1, 1).long(), 1) + output = input * (1 - one_hot) + target_logits * one_hot + output *= self.s + + return self.criterion(output, label) + + +class SphereFace(nn.Module): + def __init__(self, s=30.0, m=1.35, is_cuda=True): + super(SphereFace, self).__init__() + self.s = s + self.m = m + self.criterion = nn.CrossEntropyLoss() + if is_cuda: + self.criterion = self.criterion.cuda() + + def forward(self, input, label): + theta = torch.acos(torch.clamp(input, -1.0 + 1e-7, 1.0 - 1e-7)) + target_logits = torch.cos(self.m * theta) + one_hot = torch.zeros_like(input) + one_hot.scatter_(1, label.view(-1, 1).long(), 1) + output = input * (1 - one_hot) + target_logits * one_hot + output *= self.s + + return self.criterion(output, label) + + +class CosFace(nn.Module): + def __init__(self, s=30.0, m=0.35, is_cuda=True): + super(CosFace, self).__init__() + self.s = s + self.m = m + self.criterion = nn.CrossEntropyLoss() + if is_cuda: + self.criterion = self.criterion.cuda() + + def forward(self, input, label): + target_logits = input - self.m + one_hot = torch.zeros_like(input) + one_hot.scatter_(1, label.view(-1, 1).long(), 1) + output = input * (1 - one_hot) + target_logits * one_hot + output *= self.s + + return self.criterion(output, label) diff --git a/vectorizer/recognition/focal_loss.py b/vectorizer/recognition/focal_loss.py new file mode 100644 index 0000000..39e42d7 --- /dev/null +++ b/vectorizer/recognition/focal_loss.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +""" +Created on 18-6-7 上午10:11 + +@author: ronghuaiyang +""" + +import torch +import torch.nn as nn + + +class FocalLoss(nn.Module): + def __init__(self, gamma=0, eps=1e-7, is_cuda=True): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.eps = eps + self.ce = nn.CrossEntropyLoss() + if is_cuda: + self.ce = self.ce.cuda() + + def forward(self, inp, target): + logp = self.ce(inp, target) + p = torch.exp(-logp) + loss = (1 - p) ** self.gamma * logp + return loss.mean() diff --git a/vectorizer/recognition/nets.py b/vectorizer/recognition/nets.py new file mode 100644 index 0000000..96a7744 --- /dev/null +++ b/vectorizer/recognition/nets.py @@ -0,0 +1,129 @@ +import torchvision.models as models +from torch import nn + + +def resnet18(pretrained=False, **kwargs): + """Constructs a ResNet-18 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = models.resnet18(num_classes=512, **kwargs) + return model + + +def resnet34(pretrained=False, **kwargs): + """Constructs a ResNet-34 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = models.resnet34(num_classes=512, **kwargs) + return model + + +def resnet50(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. 
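+    (The torchvision ResNet is instantiated with num_classes=512, so the final
+    fc layer yields a 512-dimensional embedding; the pretrained flag is not
+    forwarded to torchvision here.)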
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = models.resnet50(num_classes=512, **kwargs) + return model + + +def resnet101(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = models.resnet101(num_classes=512, **kwargs) + return model + + +def resnet152(pretrained=False, **kwargs): + """Constructs a ResNet-152 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = models.resnet152(num_classes=512, **kwargs) + return model + +def sphere20(): + return sphere20a() + + +class sphere20a(nn.Module): + def __init__(self): + super(sphere20a, self).__init__() + #input = B*3*112*96 + self.conv1_1 = nn.Conv2d(3,64,3,2,1) #=>B*64*56*48 + self.relu1_1 = nn.PReLU(64) + self.conv1_2 = nn.Conv2d(64,64,3,1,1) + self.relu1_2 = nn.PReLU(64) + self.conv1_3 = nn.Conv2d(64,64,3,1,1) + self.relu1_3 = nn.PReLU(64) + + self.conv2_1 = nn.Conv2d(64,128,3,2,1) #=>B*128*28*24 + self.relu2_1 = nn.PReLU(128) + self.conv2_2 = nn.Conv2d(128,128,3,1,1) + self.relu2_2 = nn.PReLU(128) + self.conv2_3 = nn.Conv2d(128,128,3,1,1) + self.relu2_3 = nn.PReLU(128) + + self.conv2_4 = nn.Conv2d(128,128,3,1,1) #=>B*128*28*24 + self.relu2_4 = nn.PReLU(128) + self.conv2_5 = nn.Conv2d(128,128,3,1,1) + self.relu2_5 = nn.PReLU(128) + + + self.conv3_1 = nn.Conv2d(128,256,3,2,1) #=>B*256*14*12 + self.relu3_1 = nn.PReLU(256) + self.conv3_2 = nn.Conv2d(256,256,3,1,1) + self.relu3_2 = nn.PReLU(256) + self.conv3_3 = nn.Conv2d(256,256,3,1,1) + self.relu3_3 = nn.PReLU(256) + + self.conv3_4 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*12 + self.relu3_4 = nn.PReLU(256) + self.conv3_5 = nn.Conv2d(256,256,3,1,1) + self.relu3_5 = nn.PReLU(256) + + self.conv3_6 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*12 + self.relu3_6 = nn.PReLU(256) + self.conv3_7 = nn.Conv2d(256,256,3,1,1) + self.relu3_7 = nn.PReLU(256) + + self.conv3_8 = nn.Conv2d(256,256,3,1,1) #=>B*256*14*12 + self.relu3_8 = nn.PReLU(256) + self.conv3_9 = nn.Conv2d(256,256,3,1,1) + self.relu3_9 = nn.PReLU(256) + + self.conv4_1 = nn.Conv2d(256,512,3,2,1) #=>B*512*7*6 + self.relu4_1 = nn.PReLU(512) + self.conv4_2 = nn.Conv2d(512,512,3,1,1) + self.relu4_2 = nn.PReLU(512) + self.conv4_3 = nn.Conv2d(512,512,3,1,1) + self.relu4_3 = nn.PReLU(512) + + self.fc5 = nn.Linear(512*14*14,512) + # ORIGINAL for 112x96: self.fc5 = nn.Linear(512*7*6,512) + + + def forward(self, x): + x = self.relu1_1(self.conv1_1(x)) + x = x + self.relu1_3(self.conv1_3(self.relu1_2(self.conv1_2(x)))) + + x = self.relu2_1(self.conv2_1(x)) + x = x + self.relu2_3(self.conv2_3(self.relu2_2(self.conv2_2(x)))) + x = x + self.relu2_5(self.conv2_5(self.relu2_4(self.conv2_4(x)))) + + x = self.relu3_1(self.conv3_1(x)) + x = x + self.relu3_3(self.conv3_3(self.relu3_2(self.conv3_2(x)))) + x = x + self.relu3_5(self.conv3_5(self.relu3_4(self.conv3_4(x)))) + x = x + self.relu3_7(self.conv3_7(self.relu3_6(self.conv3_6(x)))) + x = x + self.relu3_9(self.conv3_9(self.relu3_8(self.conv3_8(x)))) + + x = self.relu4_1(self.conv4_1(x)) + x = x + self.relu4_3(self.conv4_3(self.relu4_2(self.conv4_2(x)))) + + x = x.view(x.size(0),-1) + x = self.fc5(x) + return x diff --git a/vectorizer/recognition/test.py b/vectorizer/recognition/test.py new file mode 100644 index 0000000..b2f279c --- /dev/null +++ b/vectorizer/recognition/test.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- +""" +Created on 18-5-30 下午4:55 + +@author: ronghuaiyang +""" +import os +import argparse + +from 
torch.utils.data import TensorDataset, DataLoader + +from recognition.nets import resnet18, resnet34, resnet50, resnet101, resnet152, sphere20 +import torch +import numpy as np +from torch.nn import DataParallel +from PIL import Image +from torchvision import transforms as T + + +imagesize = 224 +batch_size = 20 + + +class Dataset(torch.utils.data.Dataset): + def __init__(self, identity_list, root_path): + self.identity_list = identity_list + self.root_path = root_path + + normalize = T.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + self.transforms = T.Compose([ + T.Resize(imagesize), + T.ToTensor(), + normalize + ]) + + def __getitem__(self, index): + a, b, label = self.identity_list[index] + a_data = self.load_image(a) + b_data = self.load_image(b) + return a_data, b_data, label + + def load_image(self, p): + img_path = os.path.join(self.root_path, p) + data = Image.open(img_path) + if data is None: + return None + data = data.convert(mode="RGB") + data = self.transforms(data) + return data + + def __len__(self): + return len(self.identity_list) + + +def get_pair_list(pair_list): + print('Loading pair list') + with open(pair_list, 'r') as fd: + pairs = fd.readlines() + return [line.split() for line in pairs] + + +def load_img_data(identity_list, root_path): + dataset = Dataset(identity_list, root_path) + loader = DataLoader(dataset, + batch_size=batch_size, + shuffle=False, + # pin_memory=True, + num_workers=0) + return loader + + +def lfw_test2(model, identity_list, img_data, is_cuda=True): + print('Converting to features') + sims = [] + labels = [] + max_size = len(img_data) * batch_size + for i, sample in enumerate(img_data): + if i % 10 == 0: + print('%d of %d' % (i * batch_size, max_size)) + a_data, b_data, label = sample + if is_cuda: + a_data = a_data.cuda() + b_data = b_data.cuda() + + a_output = model(a_data).detach().cpu().numpy() + b_output = model(b_data).detach().cpu().numpy() + + for idx in range(batch_size): + sim = cosin_metric(a_output[idx], b_output[idx]) + sims.append(sim) + labels.append(np.bool(label[idx] == '1')) + + acc, th = cal_accuracy(sims, labels) + print('lfw face verification accuracy: ', acc, 'threshold: ', th) + return acc + + +def cosin_metric(x1, x2): + return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2)) + + +def cal_accuracy(y_score, y_true): + y_score = np.asarray(y_score) + y_true = np.asarray(y_true) + best_acc = 0 + best_th = 0 + for i in range(len(y_score)): + th = y_score[i] + y_test = (y_score >= th) + acc = np.mean((y_test == y_true).astype(int)) + if acc > best_acc: + best_acc = acc + best_th = th + + return best_acc, best_th + + +def main(args=None): + parser = argparse.ArgumentParser(description='Testing script for face identification.') + + parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152 or 20 for sphere', type=int, default=50) + parser.add_argument('--parallel', help='Run training with DataParallel', dest='parallel', + default=False, action='store_true') + parser.add_argument('--model', help='Path to model') + parser.add_argument('--batch_size', help='Batch size (default 50)', type=int, default=50) + parser.add_argument('--lfw_root', help='Path to LFW dataset') + parser.add_argument('--lfw_pair_list', help='Path to LFW pair list file') + + parser = parser.parse_args(args) + + is_cuda = torch.cuda.is_available() + print('CUDA available: {}'.format(is_cuda)) + + if parser.depth == 18: + model = resnet18() + elif parser.depth == 20: + model = sphere20() + elif 
parser.depth == 34: + model = resnet34() + elif parser.depth == 50: + model = resnet50() + elif parser.depth == 101: + model = resnet101() + elif parser.depth == 152: + model = resnet152() + else: + raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152') + + if parser.parallel: + model = DataParallel(model) + + # load_model(model, opt.test_model_path) + model.load_state_dict(torch.load(parser.model)) + if is_cuda: + model.cuda() + + identity_list = get_pair_list(parser.lfw_pair_list) + img_data = load_img_data(identity_list, parser.lfw_root) + + model.eval() + lfw_test2(model, identity_list, img_data, is_cuda=is_cuda) + + +if __name__ == '__main__': + main() diff --git a/vectorizer/recognition/train.py b/vectorizer/recognition/train.py new file mode 100644 index 0000000..141e557 --- /dev/null +++ b/vectorizer/recognition/train.py @@ -0,0 +1,201 @@ +import argparse +import os +import time + +import numpy as np +import torch +import torch.nn as nn +from PIL import Image +from torch.optim.lr_scheduler import StepLR +from torchvision import transforms as T + +from recognition.angle import AngleLinear, CosFace, SphereFace, ArcFace, AdaCos +from recognition.focal_loss import FocalLoss +from recognition.nets import resnet18, resnet34, resnet50, resnet101, resnet152, sphere20 +from recognition.test import lfw_test2, get_pair_list, load_img_data + + +class Dataset(torch.utils.data.Dataset): + def __init__(self, root, data_list_file, imagesize): + with open(os.path.join(data_list_file), 'r') as fd: + imgs = fd.readlines() + + imgs = [os.path.join(root, img[:-1]) for img in imgs] + self.labels = list(set([img.split()[1] for img in imgs])) + self.imgs = np.random.permutation(imgs) + + normalize = T.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + self.transforms = T.Compose([ + T.RandomResizedCrop(imagesize), + T.RandomHorizontalFlip(), + T.ToTensor(), + normalize + ]) + + def __getitem__(self, index): + sample = self.imgs[index] + splits = sample.split() + img_path = splits[0] + data = Image.open(img_path) + data = data.convert(mode="RGB") + data = self.transforms(data) + cls = self.label_to_class(splits[1]) + return data.float(), cls + + def __len__(self): + return len(self.imgs) + + def label_to_class(self, label): + for idx, v in enumerate(self.labels): + if v == label: + return idx + raise Exception("Unknown label %s" % label) + + def num_labels(self): + return len(self.labels) + + +def main(args=None): + parser = argparse.ArgumentParser(description='Training script for face identification.') + + parser.add_argument('--print_freq', help='Print every N batch (default 100)', type=int, default=100) + parser.add_argument('--epochs', help='Number of epochs', type=int, default=50) + parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152 or 20 for sphere', type=int, default=50) + parser.add_argument('--lr_step', help='Learning rate step (default 10)', type=int, default=10) + parser.add_argument('--lr', help='Learning rate (default 0.1)', type=float, default=0.1) + parser.add_argument('--weight_decay', help='Weight decay (default 0.0005)', type=float, default=0.0005) + parser.add_argument('--easy_margin', help='Use easy margin (default false)', dest='easy_margin', default=False, action='store_true') + parser.add_argument('--parallel', help='Run training with DataParallel', dest='parallel', + default=False, action='store_true') + parser.add_argument('--loss', help='One of focal_loss. 
cross_entropy, arcface, cosface, sphereface, adacos (default cross_entropy)', + type=str, default='cross_entropy') + parser.add_argument('--optimizer', help='One of sgd, adam (default sgd)', + type=str, default='sgd') + parser.add_argument('--batch_size', help='Batch size (default 16)', type=int, default=16) + parser.add_argument('--casia_list', help='Path to CASIA dataset file list (training)') + parser.add_argument('--casia_root', help='Path to CASIA images (training)') + parser.add_argument('--lfw_root', help='Path to LFW dataset (testing)') + parser.add_argument('--lfw_pair_list', help='Path to LFW pair list file (testing)') + parser.add_argument('--model_name', help='Name of the model to save') + + parser = parser.parse_args(args) + + is_cuda = torch.cuda.is_available() + print('CUDA available: {}'.format(is_cuda)) + + imagesize = 224 + if parser.depth == 18: + model = resnet18() + elif parser.depth == 20: + model = sphere20() + elif parser.depth == 34: + model = resnet34() + elif parser.depth == 50: + model = resnet50() + elif parser.depth == 101: + model = resnet101() + elif parser.depth == 152: + model = resnet152() + else: + raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152') + + # TODO split training dataset to train/validation and stop using test dataset for acc + train_dataset = Dataset(parser.casia_root, parser.casia_list, imagesize) + trainloader = torch.utils.data.DataLoader(train_dataset, + batch_size=parser.batch_size, + shuffle=True, + # pin_memory=True, + num_workers=0) + num_classes = train_dataset.num_labels() + + if parser.loss == 'focal_loss': + metric_fc = nn.Linear(512, num_classes) + criterion = FocalLoss(gamma=2, is_cuda=is_cuda) + elif parser.loss == 'cross_entropy': + metric_fc = nn.Linear(512, num_classes) + criterion = torch.nn.CrossEntropyLoss() + if is_cuda: + criterion = criterion.cuda() + elif parser.loss == 'cosface': + metric_fc = AngleLinear(512, num_classes) + criterion = CosFace(is_cuda=is_cuda) + elif parser.loss == 'arcface': + metric_fc = AngleLinear(512, num_classes) + criterion = ArcFace(is_cuda=is_cuda) + elif parser.loss == 'sphereface': + metric_fc = AngleLinear(512, num_classes) + criterion = SphereFace(is_cuda=is_cuda) + elif parser.loss == 'adacos': + metric_fc = AngleLinear(512, num_classes) + criterion = AdaCos(num_classes, is_cuda=is_cuda) + else: + raise ValueError('Unknown loss %s' % parser.loss) + + if parser.optimizer == 'sgd': + optimizer = torch.optim.SGD([{'params': model.parameters()}, {'params': metric_fc.parameters()}], + lr=parser.lr, weight_decay=parser.weight_decay) + elif parser.optimizer == 'adam': + optimizer = torch.optim.Adam([{'params': model.parameters()}, {'params': metric_fc.parameters()}], + lr=parser.lr, weight_decay=parser.weight_decay) + else: + raise ValueError('Unknown optimizer %s' % parser.optimizer) + + scheduler = StepLR(optimizer, step_size=parser.lr_step, gamma=0.1) + + if parser.parallel: + model = nn.DataParallel(model) + metric_fc = nn.DataParallel(metric_fc) + + if is_cuda: + model.cuda() + metric_fc.cuda() + + print(model) + print(metric_fc) + + identity_list = get_pair_list(parser.lfw_pair_list) + img_data = load_img_data(identity_list, parser.lfw_root) + + print('{} train iters per epoch:'.format(len(trainloader))) + + start = time.time() + last_acc = 0.0 + for i in range(parser.epochs): + scheduler.step() + + model.train() + for ii, data in enumerate(trainloader): + data_input, label = data + if is_cuda: + data_input = data_input.cuda() + label = label.cuda().long() 
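+            # The backbone maps the batch to 512-d embeddings; metric_fc turns
+            # them into per-class logits (a plain nn.Linear for cross_entropy /
+            # focal_loss, or AngleLinear cosine similarities for the
+            # angular-margin criteria such as arcface and cosface).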
+ feature = model(data_input) + output = metric_fc(feature) + loss = criterion(output, label) + optimizer.zero_grad() + loss.backward() + optimizer.step() + + iters = i * len(trainloader) + ii + + if iters % parser.print_freq == 0: + speed = parser.print_freq / (time.time() - start) + time_str = time.asctime(time.localtime(time.time())) + print('{} train epoch {} iter {} {} iters/s loss {}'.format(time_str, i, ii, speed, loss.item())) + + start = time.time() + + model.eval() + acc = lfw_test2(model, identity_list, img_data, is_cuda=is_cuda) + print('Accuracy: %f' % acc) + if last_acc < acc: + #TODO remove makedir + os.makedirs('./ckpt', exist_ok=True) + torch.save(model.state_dict(), './ckpt/' + parser.model_name + '_{}.pt'.format(i)) + torch.save(metric_fc.state_dict(), './ckpt/' + parser.model_name + '_metric_{}.pt'.format(i)) + + +if __name__ == '__main__': + main() diff --git a/vectorizer/requirements.txt b/vectorizer/requirements.txt new file mode 100644 index 0000000..71cd205 --- /dev/null +++ b/vectorizer/requirements.txt @@ -0,0 +1,4 @@ +Flask +Pillow +https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl +https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp37-cp37m-linux_x86_64.whl diff --git a/vectorizer/vectorizer/server.py b/vectorizer/vectorizer/server.py new file mode 100644 index 0000000..5ba39e7 --- /dev/null +++ b/vectorizer/vectorizer/server.py @@ -0,0 +1,82 @@ +import logging +import os +import sys +import tempfile + +from flask import Flask, request, abort, jsonify +from werkzeug.utils import secure_filename + +import torch +from recognition.nets import resnet50 +from torchvision import transforms as T +from PIL import Image +import identification.detector as fan + +is_cuda = torch.cuda.is_available() +fan_model = fan.load_model('ckpt/wider6_10.pt', is_cuda=is_cuda) + +# load recognition model +rec_model = resnet50() +rec_model.load_state_dict(torch.load('ckpt/recongition3_37.pt', map_location=lambda storage, location: storage)) +rec_model.eval() +if is_cuda: + rec_model = rec_model.cuda() + +# compute vectors +normalize = T.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + +imagesize = 224 +transforms = T.Compose([ + T.Resize((imagesize, imagesize)), + T.ToTensor(), + normalize +]) + +app = Flask(__name__) +UPLOAD_FOLDER = tempfile.gettempdir() +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER + +def compute_vector(data): + with torch.no_grad(): + data = transforms(data) + if is_cuda: + data = data.cuda() + mo = rec_model(data.unsqueeze(dim=0)) + return mo.detach().cpu().numpy() + + +@app.route('/vectorize', methods=['GET', 'POST']) +def upload_file(): + if request.method == 'GET': + return 'OK' + + if request.method == 'POST': + # check if the post request has the file part + if 'file' not in request.files: + abort(500) + f = request.files['file'] + if f: + filename = secure_filename(f.filename) + filepath = os.path.join(UPLOAD_FOLDER, filename) + f.save(filepath) + + img = Image.open(filepath) + data = img.convert(mode="RGB") + + with torch.no_grad(): + boxes = fan.fan_detect(fan_model, data, threshold=0.9, is_cuda=is_cuda).astype(int) + boxes = [b for b in boxes if abs(b[1] - b[0]) >= imagesize / 2 and abs(b[2] - b[0]) >= imagesize / 2] + + if boxes is None or len(boxes) == 0: + abort(404) + + extracted = [{'box': arr.tolist(), 'vector': compute_vector(img.crop((arr[0], arr[1], arr[2], arr[3]))).squeeze().tolist()} for arr in boxes] + return jsonify(extracted) + else: + abort(500) + + +if __name__ == '__main__': + 
logging.basicConfig() + app.run()
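For reference, a minimal client sketch for the /vectorize endpoint in vectorizer/vectorizer/server.py. It assumes the Flask app is running on Flask's default http://127.0.0.1:5000 and that the third-party requests package is available (it is not listed in requirements.txt); "face.jpg" is a placeholder filename.

    import requests

    # A GET is a simple liveness check; the handler returns the string 'OK'.
    assert requests.get("http://127.0.0.1:5000/vectorize").text == "OK"

    # POST an image; the server runs face detection and returns, for each
    # detected face, its bounding box and a 512-dimensional embedding vector.
    with open("face.jpg", "rb") as fh:
        resp = requests.post("http://127.0.0.1:5000/vectorize",
                             files={"file": ("face.jpg", fh, "image/jpeg")})

    resp.raise_for_status()  # the server answers 404 when no large-enough face is found
    for face in resp.json():
        print(face["box"], len(face["vector"]))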