parent 95d4f2ca7d
commit aa6f45fc8b
8 changed files with 7088 additions and 68 deletions
identification/csv_eval.py
@@ -0,0 +1,238 @@
import numpy as np

import torch


def compute_overlap(a, b):
    """
    Parameters
    ----------
    a: (N, 4) ndarray of float
    b: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
    """
    area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0])
    ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1])

    iw = np.maximum(iw, 0)
    ih = np.maximum(ih, 0)

    ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih

    ua = np.maximum(ua, np.finfo(float).eps)

    intersection = iw * ih

    return intersection / ua
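
def _example_compute_overlap():
    # Illustrative sketch only (box values invented): the IoU of a box with
    # itself is 1, and with a partially overlapping box it is intersection / union.
    boxes = np.array([[0., 0., 10., 10.]])                       # (N=1, 4) as x1, y1, x2, y2
    query = np.array([[0., 0., 10., 10.], [5., 5., 15., 15.]])   # (K=2, 4)
    print(compute_overlap(boxes, query))  # [[1.0  0.142857]] -- 25 / (100 + 100 - 25)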


def _compute_ap(recall, precision):
    """ Compute the average precision, given the recall and precision curves.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
    # Arguments
        recall: The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """
    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # compute the precision envelope
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap
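
def _example_compute_ap():
    # Worked toy example (values invented): with recall [0.5, 1.0] and precision
    # [1.0, 0.5], the sentinels give mrec = [0, 0.5, 1.0, 1.0], the precision
    # envelope gives mpre = [1.0, 1.0, 0.5, 0.0], so
    # AP = 0.5 * 1.0 + 0.5 * 0.5 = 0.75.
    ap = _compute_ap(np.array([0.5, 1.0]), np.array([1.0, 0.5]))
    assert abs(ap - 0.75) < 1e-9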


def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, is_cuda=True):
    """ Get the detections from the retinanet using the generator.
    The result is a list of lists such that the size is:
        all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes]
    # Arguments
        dataset         : The generator used to run images through the retinanet.
        retinanet       : The retinanet to run on the images.
        score_threshold : The score confidence threshold to use.
        max_detections  : The maximum number of detections to use per image.
        is_cuda         : Whether to run the model on CUDA.
    # Returns
        A list of lists containing the detections for each image in the generator.
    """
    all_detections = [[None for i in range(dataset.num_classes())] for j in range(len(dataset))]

    retinanet.eval()

    with torch.no_grad():

        for index in range(len(dataset)):
            data = dataset[index]
            scale = data['scale']

            # run network
            img_data = data['img'].permute(2, 0, 1).float().unsqueeze(dim=0)
            if is_cuda:
                img_data = img_data.cuda()
            scores, labels, boxes = retinanet(img_data)
            if isinstance(scores, torch.Tensor):
                scores = scores.cpu().numpy()
                labels = labels.cpu().numpy()
                boxes = boxes.cpu().numpy()

                # correct boxes for image scale
                boxes /= scale

                # select indices which have a score above the threshold
                indices = np.where(scores > score_threshold)[0]

                # select those scores
                scores = scores[indices]

                # find the order with which to sort the scores
                scores_sort = np.argsort(-scores)[:max_detections]

                # select detections
                image_boxes = boxes[indices[scores_sort], :]
                image_scores = scores[scores_sort]
                image_labels = labels[indices[scores_sort]]
                image_detections = np.concatenate(
                    [image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1)

                # copy detections to all_detections
                for label in range(dataset.num_classes()):
                    all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1]
            else:
                # no detections for this image
                for label in range(dataset.num_classes()):
                    all_detections[index][label] = np.zeros((0, 5))

            print('{}/{}'.format(index + 1, len(dataset)), end='\r')

    return all_detections


def _get_annotations(generator):
    """ Get the ground truth annotations from the generator.
    The result is a list of lists such that the size is:
        all_annotations[num_images][num_classes] = annotations[num_annotations, 5]
    # Arguments
        generator : The generator used to retrieve ground truth annotations.
    # Returns
        A list of lists containing the annotations for each image in the generator.
    """
    all_annotations = [[None for i in range(generator.num_classes())] for j in range(len(generator))]

    for i in range(len(generator)):
        # load the annotations
        annotations = generator.load_annotations(i)

        # copy annotations to all_annotations
        for label in range(generator.num_classes()):
            all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy()

        print('{}/{}'.format(i + 1, len(generator)), end='\r')

    return all_annotations


def evaluate(
        generator,
        retinanet,
        iou_threshold=0.5,
        score_threshold=0.05,
        max_detections=100,
        is_cuda=True,
        save_path=None
):
    """ Evaluate a given dataset using a given retinanet.
    # Arguments
        generator       : The generator that represents the dataset to evaluate.
        retinanet       : The retinanet to evaluate.
        iou_threshold   : The IoU threshold above which a detection counts as a true positive.
        score_threshold : The score confidence threshold to use for detections.
        max_detections  : The maximum number of detections to use per image.
        is_cuda         : Whether to run the model on CUDA.
        save_path       : The path to save images with visualized detections to.
    # Returns
        A dict mapping class labels to (average precision, number of annotations) pairs.
    """

    # gather all detections and annotations

    all_detections = _get_detections(generator, retinanet, score_threshold=score_threshold,
                                     max_detections=max_detections, is_cuda=is_cuda)
    all_annotations = _get_annotations(generator)

    average_precisions = {}

    for label in range(generator.num_classes()):
        false_positives = np.zeros((0,))
        true_positives = np.zeros((0,))
        scores = np.zeros((0,))
        num_annotations = 0.0

        for i in range(len(generator)):
            detections = all_detections[i][label]
            annotations = all_annotations[i][label]
            num_annotations += annotations.shape[0]
            detected_annotations = []

            for d in detections:
                scores = np.append(scores, d[4])

                if annotations.shape[0] == 0:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)
                    continue

                overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
                assigned_annotation = np.argmax(overlaps, axis=1)
                max_overlap = overlaps[0, assigned_annotation]

                if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
                    false_positives = np.append(false_positives, 0)
                    true_positives = np.append(true_positives, 1)
                    detected_annotations.append(assigned_annotation)
                else:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)

        # no annotations -> AP for this class is 0 (is this correct?)
        if num_annotations == 0:
            average_precisions[label] = 0, 0
            continue

        # sort by score
        indices = np.argsort(-scores)
        false_positives = false_positives[indices]
        true_positives = true_positives[indices]

        # compute cumulative false positives and true positives
        false_positives = np.cumsum(false_positives)
        true_positives = np.cumsum(true_positives)

        # compute recall and precision
        recall = true_positives / num_annotations
        precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)

        # compute average precision
        average_precision = _compute_ap(recall, precision)
        average_precisions[label] = average_precision, num_annotations

    print('\nmAP:')
    for label in range(generator.num_classes()):
        label_name = generator.label_to_name(label)
        print('{}: {}'.format(label_name, average_precisions[label][0]))

    return average_precisions
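
def _example_evaluate():
    # Minimal usage sketch; the file paths and checkpoint name are hypothetical.
    # CSVDataset, Resizer and Normalizer come from the dataloader module, as in train.py.
    from torchvision import transforms
    from identification.dataloader import CSVDataset, Resizer, Normalizer

    dataset = CSVDataset(train_file='val_annotations.csv', class_list='classes.csv',
                         transform=transforms.Compose([Resizer(), Normalizer()]))
    retinanet = torch.load('./ckpt/widernew1_0.pt')  # a model saved by train.py
    # prints per-class AP and returns {label: (AP, num_annotations)}
    return evaluate(dataset, retinanet, iou_threshold=0.5, is_cuda=torch.cuda.is_available())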
identification/dataloader.py
@@ -0,0 +1,566 @@
import torch
import numpy as np
import random
import csv
import os

from torch.utils.data import Dataset
from torch.utils.data.sampler import Sampler

from PIL import Image, ImageEnhance, ImageFilter


class CSVDataset(Dataset):
    """CSV dataset."""

    def __init__(self, train_file, class_list, transform=None):
        """
        Args:
            train_file (string): CSV file with training annotations
            class_list (string): CSV file with class list
            transform (optional): Transformation function
        """
        self.train_file = train_file
        self.class_list = class_list
        self.transform = transform

        # parse the provided class file
        try:
            with open(self.class_list, 'r', newline='') as file:
                self.classes = CSVDataset.load_classes(csv.reader(file, delimiter=' '))
        except ValueError as e:
            raise ValueError('invalid CSV class file: {}: {}'.format(self.class_list, e)) from None

        self.labels = {}
        for key, value in self.classes.items():
            self.labels[value] = key

        # csv with img_path, x1, y1, x2, y2, class_name
        try:
            with open(self.train_file, 'r', newline='') as file:
                self.image_data = CSVDataset._read_annotations(csv.reader(file, delimiter=' '), self.classes)
        except ValueError as e:
            raise ValueError('invalid CSV annotations file: {}: {}'.format(self.train_file, e)) from None
        self.image_names = list(self.image_data.keys())

    @staticmethod
    def _parse(value, function, fmt):
        """
        Parse a string into a value, and format a nice ValueError if it fails.
        Returns `function(value)`.
        Any `ValueError` raised is caught and a new `ValueError` is raised
        with message `fmt.format(e)`, where `e` is the caught `ValueError`.
        """
        try:
            return function(value)
        except ValueError as e:
            raise ValueError(fmt.format(e)) from None

    @staticmethod
    def load_classes(csv_reader):
        result = {}

        for line, row in enumerate(csv_reader):
            line += 1

            try:
                class_name, class_id = row
            except ValueError:
                raise ValueError("line {}: format should be 'class_name,class_id'".format(line)) from None
            class_id = CSVDataset._parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line))

            if class_name in result:
                raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name))
            result[class_name] = class_id
        return result

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, idx):
        img = self.load_image(idx)
        annot = self.load_annotations(idx)
        sample = {'img': img, 'annot': annot, 'scale': 1}
        if self.transform:
            sample = self.transform(sample)

        return sample

    def load_image(self, image_index):
        # image_names holds the image paths from the first CSV column
        img = Image.open(self.image_names[image_index])
        img = img.convert(mode="RGB")

        return img

    def load_annotations(self, image_index):
        # get ground truth annotations
        annotation_list = self.image_data[self.image_names[image_index]]
        annotations = np.zeros((0, 5))

        # some images appear to miss annotations (like image with id 257034)
        if len(annotation_list) == 0:
            return annotations

        # parse annotations
        for idx, a in enumerate(annotation_list):
            # some annotations have basically no width / height, skip them
            x1 = a['x1']
            x2 = a['x2']
            y1 = a['y1']
            y2 = a['y2']

            if (x2 - x1) < 1 or (y2 - y1) < 1:
                continue

            annotation = np.zeros((1, 5))

            annotation[0, 0] = x1
            annotation[0, 1] = y1
            annotation[0, 2] = x2
            annotation[0, 3] = y2

            annotation[0, 4] = self.name_to_label(a['class'])
            annotations = np.append(annotations, annotation, axis=0)

        return annotations

    @staticmethod
    def _read_annotations(csv_reader, classes):
        result = {}
        for line, row in enumerate(csv_reader):
            line += 1

            try:
                img_file, x1, y1, x2, y2, class_name = row[:6]
            except ValueError:
                raise ValueError(
                    "line {}: format should be 'img_file,x1,y1,x2,y2,class_name' or 'img_file,,,,,'".format(
                        line)) from None

            if img_file not in result:
                result[img_file] = []

            # If a row contains only an image path, it's an image without annotations.
            if (x1, y1, x2, y2, class_name) == ('.', '.', '.', '.', '.'):
                continue

            x1 = CSVDataset._parse(x1, int, 'line {}: malformed x1: {{}}'.format(line))
            y1 = CSVDataset._parse(y1, int, 'line {}: malformed y1: {{}}'.format(line))
            x2 = CSVDataset._parse(x2, int, 'line {}: malformed x2: {{}}'.format(line))
            y2 = CSVDataset._parse(y2, int, 'line {}: malformed y2: {{}}'.format(line))

            if class_name != 'ignore':
                # Check that the bounding box is valid.
                if x2 <= x1:
                    raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
                if y2 <= y1:
                    raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))

                # check that the current class name is actually known
                if class_name not in classes:
                    raise ValueError(
                        'line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes))

            result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name})
        return result

    def name_to_label(self, name):
        return self.classes[name]

    def label_to_name(self, label):
        return self.labels[label]

    def num_classes(self):
        return max(self.classes.values()) + 1

    def image_aspect_ratio(self, image_index):
        image = Image.open(self.image_names[image_index])
        return float(image.width) / float(image.height)

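
def _example_csv_dataset():
    # Usage sketch with hypothetical file names and contents. Note both readers
    # use a space delimiter, so the expected rows look like:
    #   classes.csv:      face 0
    #   annotations.csv:  images/img_001.jpg 42 17 180 200 face
    #                     images/img_002.jpg . . . . .     <- image without boxes
    dataset = CSVDataset(train_file='annotations.csv', class_list='classes.csv')
    print(len(dataset), dataset.num_classes())  # number of images, number of classes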


class WIDERDataset(Dataset):
    """WIDER dataset."""

    def __init__(self, train_file, img_prefix='', transform=None):
        """
        Args:
            train_file (string): WIDER txt file with training annotations
            img_prefix (string, optional): Prefix for images location
            transform (optional): Transformation function
        """
        self.train_file = train_file
        self.transform = transform
        self.img_prefix = img_prefix

        # The WIDER dataset has only faces. Extend for additional face properties (see below).
        self.classes = {'face': 0}

        self.labels = {}
        for key, value in self.classes.items():
            self.labels[value] = key

        # WIDER file definition example:
        # image name, number of faces, then per face: x y w h blur expression illumination invalid occlusion pose
        # 0--Parade/0_Parade_marchingband_1_117.jpg
        # 9
        # 69 359 50 36 1 0 0 0 0 1
        # 227 382 56 43 1 0 1 0 0 1
        # 296 305 44 26 1 0 0 0 0 1
        # 353 280 40 36 2 0 0 0 2 1
        # 885 377 63 41 1 0 0 0 0 1
        # 819 391 34 43 2 0 0 0 1 0
        # 727 342 37 31 2 0 0 0 0 1
        # 598 246 33 29 2 0 0 0 0 1
        # 740 308 45 33 1 0 0 0 2 1
        try:
            with open(self.train_file, 'r') as file:
                self.image_data = WIDERDataset._read_data(file)
        except ValueError as e:
            raise ValueError('invalid WIDER annotations file: {}: {}'.format(self.train_file, e)) from None
        self.image_names = list(self.image_data.keys())

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, idx):
        img = self.load_image(idx)
        annot = self.load_annotations(idx)
        sample = {'img': img, 'annot': annot, 'scale': 1, 'img_name': self.image_names[idx]}
        if self.transform:
            sample = self.transform(sample)

        return sample

    def load_image(self, image_index):
        print('Loading image %s' % self.image_names[image_index])
        img = Image.open(os.path.join(self.img_prefix, self.image_names[image_index]))
        img = img.convert(mode="RGB")

        return img

    def load_annotations(self, image_index):
        # get ground truth annotations
        annotation_list = self.image_data[self.image_names[image_index]]
        annotations = np.zeros((0, 5))

        # some images appear to miss annotations
        if len(annotation_list) == 0:
            return annotations

        # parse annotations
        for idx, a in enumerate(annotation_list):
            # some annotations have basically no width / height, skip them
            x1 = a['x1']
            x2 = a['x2']
            y1 = a['y1']
            y2 = a['y2']

            if (x2 - x1) < 1 or (y2 - y1) < 1:
                continue

            annotation = np.zeros((1, 5))

            annotation[0, 0] = x1
            annotation[0, 1] = y1
            annotation[0, 2] = x2
            annotation[0, 3] = y2

            annotation[0, 4] = self.name_to_label(a['class'])
            annotations = np.append(annotations, annotation, axis=0)

        return annotations

    @staticmethod
    def _read_data(reader):
        result = {}
        counter = 0
        img_file = None
        for line in reader:
            line = line.strip()
            if counter == 0:
                # this line is either a file name or a face count
                try:
                    counter = int(line)
                except ValueError:
                    if img_file and len(result[img_file]) == 0:
                        print("Warning - no faces: %s" % img_file)
                    img_file = line
            else:
                counter -= 1
                # coordinates, e.g. '370 170 9 13 2 0 0 0 2 0'; convert x y w h to corners
                nums = [int(x) for x in line.split()]
                result.setdefault(img_file, []).append({'x1': nums[0], 'x2': nums[0] + nums[2],
                                                        'y1': nums[1], 'y2': nums[1] + nums[3],
                                                        'class': 'face'})
        return result

    def name_to_label(self, name):
        return self.classes[name]

    def label_to_name(self, label):
        return self.labels[label]

    def num_classes(self):
        return max(self.classes.values()) + 1

    def image_aspect_ratio(self, image_index):
        image = Image.open(os.path.join(self.img_prefix, self.image_names[image_index]))
        return float(image.width) / float(image.height)

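
def _example_wider_parsing():
    # Parsing sketch for the format documented in WIDERDataset.__init__, using
    # just the first two faces of the sample above; x y w h becomes x1 y1 x2 y2.
    import io
    sample = io.StringIO('0--Parade/0_Parade_marchingband_1_117.jpg\n'
                         '2\n'
                         '69 359 50 36 1 0 0 0 0 1\n'
                         '227 382 56 43 1 0 1 0 0 1\n')
    parsed = WIDERDataset._read_data(sample)
    print(parsed['0--Parade/0_Parade_marchingband_1_117.jpg'][0])
    # {'x1': 69, 'x2': 119, 'y1': 359, 'y2': 395, 'class': 'face'}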


def collater(data):
    imgs = [s['img'] for s in data]
    annots = [s['annot'] for s in data]
    scales = [s['scale'] for s in data]

    # images arrive as (rows, cols, channels); pad every image in the batch
    # up to the largest height and width so they can be stacked
    heights = [int(s.shape[0]) for s in imgs]
    widths = [int(s.shape[1]) for s in imgs]
    batch_size = len(imgs)

    max_height = np.array(heights).max()
    max_width = np.array(widths).max()

    padded_imgs = torch.zeros(batch_size, max_height, max_width, 3)

    for i in range(batch_size):
        img = imgs[i]
        padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img

    max_num_annots = max(annot.shape[0] for annot in annots)
    if max_num_annots > 0:
        # pad annotation lists with -1 rows so every sample has the same count
        annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1
        for idx, annot in enumerate(annots):
            if annot.shape[0] > 0:
                annot_padded[idx, :annot.shape[0], :] = annot
    else:
        annot_padded = torch.ones((len(annots), 1, 5)) * -1

    # (B, H, W, C) -> (B, C, H, W)
    padded_imgs = padded_imgs.permute(0, 3, 1, 2)

    return {'img': padded_imgs, 'annot': annot_padded, 'scale': scales}

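
def _example_collater():
    # Padding sketch on two invented samples of different sizes: images are
    # padded to the largest height/width, annotation lists are padded with -1 rows.
    batch = [
        {'img': torch.zeros(100, 80, 3), 'annot': torch.tensor([[10., 10., 50., 50., 0.]]), 'scale': 1.0},
        {'img': torch.zeros(120, 60, 3), 'annot': torch.zeros(0, 5), 'scale': 1.0},
    ]
    out = collater(batch)
    print(out['img'].shape)  # torch.Size([2, 3, 120, 80])
    print(out['annot'][1])   # a single all -1 row: the second sample has no boxes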


class Resizer(object):
    """Rescale the image so its smallest side is min_side (capped by max_side), then pad to a multiple of 32."""

    def __call__(self, sample, min_side=800, max_side=1400):
        image, annots, scale = sample['img'], sample['annot'], sample['scale']

        cols, rows = image.size

        smallest_side = min(rows, cols)

        # rescale the image so the smallest side is min_side
        scale = min_side / smallest_side

        # check if the largest side is now greater than max_side, which can happen
        # when images have a large aspect ratio
        largest_side = max(rows, cols)

        if largest_side * scale > max_side:
            scale = max_side / largest_side

        # resize the image with the computed scale
        image = np.array(image.resize((int(round(cols * scale)), int(round(rows * scale))), resample=Image.BILINEAR))
        image = image / 255.0

        rows, cols, cns = image.shape

        # pad both spatial dims up to the next multiple of 32
        pad_rows = 32 - rows % 32
        pad_cols = 32 - cols % 32

        new_image = np.zeros((rows + pad_rows, cols + pad_cols, cns)).astype(np.float32)
        new_image[:rows, :cols, :] = image.astype(np.float32)

        annots[:, :4] *= scale

        return {'img': new_image, 'annot': annots, 'scale': scale}

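
def _example_resizer():
    # Scale arithmetic on an invented 1000x500 image: 800/500 = 1.6 would push the
    # long side to 1600 > 1400, so the cap gives scale = 1400/1000 = 1.4, and both
    # sides of the resized 700x1400 image are padded up to the next multiple of 32.
    img = Image.new('RGB', (1000, 500))  # (cols, rows)
    out = Resizer()({'img': img, 'annot': np.zeros((0, 5)), 'scale': 1})
    print(out['scale'])      # 1.4
    print(out['img'].shape)  # (704, 1408, 3)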


class Augmenter(object):
    """Randomly flip the image and its boxes horizontally."""

    def __call__(self, sample, flip_x=0.5):
        if np.random.rand() < flip_x:
            image, annots, scales = sample['img'], sample['annot'], sample['scale']
            image = image[:, ::-1, :]

            rows, cols, channels = image.shape

            x1 = annots[:, 0].copy()
            x2 = annots[:, 2].copy()

            x_tmp = x1.copy()

            # mirror the x coordinates around the image width
            annots[:, 0] = cols - x2
            annots[:, 2] = cols - x_tmp

            sample = {'img': image, 'annot': annots, 'scale': scales}

        return sample



class RandomCrop(object):
    """Randomly zero out everything outside a crop window, keeping boxes that stay mostly inside it."""

    def __call__(self, sample):
        image, annots, scales = sample['img'], sample['annot'], sample['scale']

        if not annots.shape[0]:
            return {'img': image, 'annot': annots, 'scale': scales}
        if random.choice([0, 1]):
            return {'img': image, 'annot': annots, 'scale': scales}
        else:
            rows, cols, cns = image.shape
            flag = 0
            while True:
                # give up after 10 attempts that keep no annotation
                flag += 1
                if flag > 10:
                    return {'img': image, 'annot': annots, 'scale': scales}

                crop_ratio = random.uniform(0.5, 1)
                rows_zero = int(rows * random.uniform(0, 1 - crop_ratio))
                cols_zero = int(cols * random.uniform(0, 1 - crop_ratio))
                crop_rows = int(rows * crop_ratio)
                crop_cols = int(cols * crop_ratio)
                '''
                # alternative: actually crop and resize back (needs cv2)
                new_image = image[rows_zero:rows_zero+crop_rows, cols_zero:cols_zero+crop_cols, :]
                new_image = cv2.resize(new_image, (cols, rows))
                #new_image = skimage.transform.resize(new_image, (rows, cols))

                new_annots = np.zeros((0, 5))
                for i in range(annots.shape[0]):
                    x1 = max(annots[i, 0] - cols_zero, 0)
                    y1 = max(annots[i, 1] - rows_zero, 0)
                    x2 = min(annots[i, 2] - cols_zero, crop_cols)
                    y2 = min(annots[i, 3] - rows_zero, crop_rows)
                    label = annots[i, 4]
                    if x1 + 10 < x2 and y1 + 10 < y2:
                        x1 /= crop_ratio
                        y1 /= crop_ratio
                        x2 /= crop_ratio
                        y2 /= crop_ratio
                        new_annots = np.append(new_annots, np.array([[x1, y1, x2, y2, label]]), axis=0)

                if not new_annots.shape[0]:
                    continue
                '''
                # zero out everything outside the crop window, keeping the image size
                new_image = np.zeros((rows, cols, cns))
                new_image[rows_zero:rows_zero + crop_rows, cols_zero:cols_zero + crop_cols, :] = \
                    image[rows_zero:rows_zero + crop_rows,
                          cols_zero:cols_zero + crop_cols,
                          :]

                # clip boxes to the crop window and drop those that become too small
                new_annots = np.zeros((0, 5))
                for i in range(annots.shape[0]):
                    x1 = max(cols_zero, annots[i, 0])
                    y1 = max(rows_zero, annots[i, 1])
                    x2 = min(cols_zero + crop_cols, annots[i, 2])
                    y2 = min(rows_zero + crop_rows, annots[i, 3])
                    label = annots[i, 4]
                    if x1 + 10 < x2 and y1 + 10 < y2:
                        new_annots = np.append(new_annots, np.array([[x1, y1, x2, y2, label]]), axis=0)

                if not new_annots.shape[0]:
                    continue

                return {'img': new_image, 'annot': new_annots, 'scale': scales}



class Color(object):
    """Randomly jitter brightness, color, contrast and sharpness, and occasionally blur."""

    def __call__(self, sample):
        image, annots, scales = sample['img'], sample['annot'], sample['scale']
        image = Image.fromarray(image)

        ratio = [0.5, 0.8, 1.2, 1.5]

        if random.choice([0, 1]):
            enh_bri = ImageEnhance.Brightness(image)
            brightness = random.choice(ratio)
            image = enh_bri.enhance(brightness)
        if random.choice([0, 1]):
            enh_col = ImageEnhance.Color(image)
            color = random.choice(ratio)
            image = enh_col.enhance(color)
        if random.choice([0, 1]):
            enh_con = ImageEnhance.Contrast(image)
            contrast = random.choice(ratio)
            image = enh_con.enhance(contrast)
        if random.choice([0, 1]):
            enh_sha = ImageEnhance.Sharpness(image)
            sharpness = random.choice(ratio)
            image = enh_sha.enhance(sharpness)
        if random.choice([0, 1]):
            image = image.filter(ImageFilter.BLUR)

        image = np.asarray(image)
        return {'img': image, 'annot': annots, 'scale': scales}



class Normalizer(object):
    """Normalize the image with ImageNet mean and std and convert the sample to tensors."""

    def __init__(self):
        self.mean = np.array([[[0.485, 0.456, 0.406]]])
        self.std = np.array([[[0.229, 0.224, 0.225]]])

    def __call__(self, sample):
        image, annots, scales = sample['img'], sample['annot'], sample['scale']

        image = (image.astype(np.float32) - self.mean) / self.std

        sample = {'img': torch.from_numpy(image), 'annot': torch.from_numpy(annots), 'scale': scales}
        return sample


class UnNormalizer(object):
    def __init__(self, mean=None, std=None):
        if mean is None:
            self.mean = [0.485, 0.456, 0.406]
        else:
            self.mean = mean
        if std is None:
            self.std = [0.229, 0.224, 0.225]
        else:
            self.std = std

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Normalized image tensor of size (C, H, W).
        Returns:
            Tensor: Image with the normalization undone (modified in place).
        """
        for t, m, s in zip(tensor, self.mean, self.std):
            t.mul_(s).add_(m)
        return tensor

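
def _example_unnormalize():
    # Round-trip sketch: UnNormalizer undoes Normalizer on one (C, H, W) image,
    # e.g. before drawing detections; it modifies the tensor in place, so clone first.
    img = torch.rand(3, 64, 64)  # stand-in for a normalized image from a batch
    restored = UnNormalizer()(img.clone())
    print(restored.shape)  # torch.Size([3, 64, 64])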


class AspectRatioBasedSampler(Sampler):
    def __init__(self, data_source, batch_size, drop_last):
        self.data_source = data_source
        self.batch_size = batch_size
        self.drop_last = drop_last
        self.groups = self.group_images()

    def __iter__(self):
        random.shuffle(self.groups)
        for group in self.groups:
            yield group

    def __len__(self):
        if self.drop_last:
            return len(self.data_source) // self.batch_size
        else:
            return (len(self.data_source) + self.batch_size - 1) // self.batch_size

    def group_images(self):
        # determine the order of the images, sorted by aspect ratio
        order = list(range(len(self.data_source)))
        order.sort(key=lambda x: self.data_source.image_aspect_ratio(x))

        # divide into groups, one group = one batch (wrapping around at the end)
        return [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in
                range(0, len(order), self.batch_size)]
identification/train.py
@@ -0,0 +1,207 @@
import argparse
import collections
import os

import numpy as np

import torch
import torch.optim as optim
from torchvision import transforms
import torch.utils.model_zoo as model_zoo

from identification.model_level_attention import resnet18, resnet34, resnet50, resnet101, resnet152
from torch.utils.data import DataLoader
from identification.csv_eval import evaluate
from identification.dataloader import WIDERDataset, AspectRatioBasedSampler, collater, Resizer, Augmenter, Normalizer, CSVDataset

is_cuda = torch.cuda.is_available()
print('CUDA available: {}'.format(is_cuda))

model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}

ckpt = False


def main(args=None):
    parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--wider_train', help='Path to file containing WIDER training annotations (see readme)')
    parser.add_argument('--wider_val',
                        help='Path to file containing WIDER validation annotations (optional, see readme)')
    parser.add_argument('--wider_train_prefix', help='Prefix path to WIDER train images')
    parser.add_argument('--wider_val_prefix', help='Prefix path to WIDER validation images')

    parser.add_argument('--csv_train', help='Path to file containing training annotations (see readme)')
    parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)')
    parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)')

    parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50)
    parser.add_argument('--epochs', help='Number of epochs', type=int, default=50)
    parser.add_argument('--batch_size', help='Batch size (default 2)', type=int, default=2)

    parser.add_argument('--model_name', help='Name of the model to save')
    parser.add_argument('--parallel', help='Run training with DataParallel', dest='parallel',
                        default=False, action='store_true')
    parser.add_argument('--pretrained', help='Pretrained model name in weight directory')

    parser = parser.parse_args(args)

    # Create the data loaders
    if parser.wider_train is None:
        dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes,
                                   transform=transforms.Compose([Resizer(), Augmenter(), Normalizer()]))
    else:
        dataset_train = WIDERDataset(train_file=parser.wider_train, img_prefix=parser.wider_train_prefix,
                                     transform=transforms.Compose([Resizer(), Augmenter(), Normalizer()]))

    if parser.wider_val is None:
        if parser.csv_val is None:
            dataset_val = None
            print('No validation annotations provided.')
        else:
            print('Loading CSV validation dataset')
            dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes,
                                     transform=transforms.Compose([Resizer(), Normalizer()]))
    else:
        print('Loading WIDER validation dataset')
        dataset_val = WIDERDataset(train_file=parser.wider_val, img_prefix=parser.wider_val_prefix,
                                   transform=transforms.Compose([Resizer(), Normalizer()]))

    print('Loading training dataset')
    sampler = AspectRatioBasedSampler(dataset_train, batch_size=parser.batch_size, drop_last=False)
    if parser.parallel:
        dataloader_train = DataLoader(dataset_train, num_workers=16, collate_fn=collater, batch_sampler=sampler)
    else:
        dataloader_train = DataLoader(dataset_train, collate_fn=collater, batch_sampler=sampler)

    # Create the model (model_level_attention backbone)
    if parser.depth == 18:
        retinanet = resnet18(num_classes=dataset_train.num_classes(), is_cuda=is_cuda)
    elif parser.depth == 34:
        retinanet = resnet34(num_classes=dataset_train.num_classes(), is_cuda=is_cuda)
    elif parser.depth == 50:
        retinanet = resnet50(num_classes=dataset_train.num_classes(), is_cuda=is_cuda)
    elif parser.depth == 101:
        retinanet = resnet101(num_classes=dataset_train.num_classes(), is_cuda=is_cuda)
    elif parser.depth == 152:
        retinanet = resnet152(num_classes=dataset_train.num_classes(), is_cuda=is_cuda)
    else:
        raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152')

    if ckpt:
        retinanet = torch.load('')
        print('Loaded checkpoint')
    else:
        print('Loading pretrained model')
        retinanet_dict = retinanet.state_dict()
        if parser.pretrained is None:
            pretrained_dict = model_zoo.load_url(model_urls['resnet' + str(parser.depth)])
        else:
            pretrained_dict = torch.load(parser.pretrained)
        # keep only the backbone weights that exist in the RetinaNet state dict
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in retinanet_dict}
        retinanet_dict.update(pretrained_dict)
        retinanet.load_state_dict(retinanet_dict)
        print('Loaded pretrained backbone')

    print(retinanet)
    if parser.parallel:
        retinanet = torch.nn.DataParallel(retinanet, device_ids=[0])
    if is_cuda:
        retinanet.cuda()

    retinanet.training = True

    optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)
    # optimizer = optim.SGD(retinanet.parameters(), lr=1e-3, momentum=0.9, weight_decay=1e-4)

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

    loss_hist = collections.deque(maxlen=500)

    retinanet.train()
    if parser.parallel:
        retinanet.module.freeze_bn()
    else:
        retinanet.freeze_bn()

    print('Num training images: {}'.format(len(dataset_train)))
    iters = 0
    for epoch_num in range(0, parser.epochs):

        retinanet.train()
        if parser.parallel:
            retinanet.module.freeze_bn()
        else:
            retinanet.freeze_bn()

        epoch_loss = []

        for iter_num, data in enumerate(dataloader_train):

            iters += 1

            optimizer.zero_grad()

            img_data = data['img'].float()
            annot_data = data['annot']
            if is_cuda:
                img_data = img_data.cuda()
                annot_data = annot_data.cuda()

                # log GPU memory usage (values in MiB)
                print("GPU memory allocated: %d max memory allocated: %d memory cached: %d max memory cached: %d" % (
                    torch.cuda.memory_allocated() / 1024 ** 2, torch.cuda.max_memory_allocated() / 1024 ** 2,
                    torch.cuda.memory_cached() / 1024 ** 2, torch.cuda.max_memory_cached() / 1024 ** 2))

            classification_loss, regression_loss, mask_loss = retinanet([img_data, annot_data])

            del img_data
            del annot_data

            classification_loss = classification_loss.mean()
            regression_loss = regression_loss.mean()
            mask_loss = mask_loss.mean()

            loss = classification_loss + regression_loss + mask_loss

            if bool(loss == 0):
                continue

            loss.backward()

            torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)

            optimizer.step()

            loss_hist.append(float(loss.item()))

            epoch_loss.append(float(loss.item()))

            print(
                'Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | '
                'Mask loss: {:1.5f} | Running loss: {:1.5f}'.format(
                    epoch_num, iter_num, float(classification_loss), float(regression_loss), float(mask_loss),
                    np.mean(loss_hist)))

            del classification_loss
            del regression_loss
            del loss

        if parser.wider_val is not None:
            print('Evaluating dataset')
            evaluate(dataset_val, retinanet, is_cuda=is_cuda)

        scheduler.step(np.mean(epoch_loss))

        # TODO: remove makedirs
        os.makedirs('./ckpt', exist_ok=True)
        if parser.parallel:
            torch.save(retinanet.module, './ckpt/' + parser.model_name + '_{}.pt'.format(epoch_num))
        else:
            torch.save(retinanet, './ckpt/' + parser.model_name + '_{}.pt'.format(epoch_num))


if __name__ == '__main__':
    main()
File diff suppressed because it is too large
@@ -0,0 +1,2 @@
python3 -m recognition.train --casia_list /home/ehp/tmp/datasets/CASIA-maxpy-clean/train.txt --casia_root /home/ehp/tmp/datasets/CASIA-maxpy-clean --lfw_root /home/ehp/tmp/datasets/lfw \
--lfw_pair_list /home/ehp/git/arcface/lfw_test_pair.txt --model_name recongition3 --batch_size 20 --loss adacos --print_freq 20 --depth 50
@@ -0,0 +1,7 @@
#python3 -m identification.train --wider_train /home/ehp/tmp/datasets/wider/sample.txt --wider_train_prefix /home/ehp/tmp/datasets/wider/sample/images \
#--wider_val /home/ehp/tmp/datasets/wider/sample_val.txt --wider_val_prefix /home/ehp/tmp/datasets/wider/sample_val/images \
#--depth 50 --epochs 30 --batch_size 1 --model_name wider_sample1

python3 -m identification.train --wider_train /home/ehp/tmp/datasets/wider/wider_face_train_bbx_gt.txt --wider_train_prefix /home/ehp/tmp/datasets/wider/WIDER_train/images \
--wider_val /home/ehp/tmp/datasets/wider/wider_face_val_bbx_gt.txt --wider_val_prefix /home/ehp/tmp/datasets/wider/WIDER_val/images \
--depth 50 --epochs 30 --batch_size 1 --model_name widernew1