# Source code for openem.Detect.RetinaNet

""" RetinaNet Object Detector for OpenEM """

import tensorflow as tf
import numpy as np
from openem.models import ImageModel

import cv2
import math
import copy

from openem.Detect import Detection
from openem.image import force_aspect

class RetinaNetPreprocessor:
    """ Perform preprocessing for RetinaNet inputs

    Meets the callable interface of openem.Detect.Preprocessor
    """
    def __init__(self, meanImage=None):
        """
        meanImage: np.ndarray or None
                   Mean image subtracted from each resized input. When None,
                   the ImageNet channel means (BGR order) are used instead.
        """
        self.mean_image = meanImage

    def __call__(self, image, requiredWidth, requiredHeight):
        """ Preprocess one image for insertion into the network.

        image: np.ndarray
               Input image (assumed BGR channel order — TODO confirm;
               see the commented-out cvtColor below).
        requiredWidth: int
               Width the network expects.
        requiredHeight: int
               Height the network expects.

        Returns the float32, aspect-padded, resized, mean-subtracted image.
        """
        #TODO: (Provide way to optionally convert channel ordering?)
        #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image.astype(np.float32)

        # Pad to the network aspect ratio before resizing so objects are
        # not distorted by a non-uniform scale.
        required_aspect = requiredWidth / requiredHeight
        image = force_aspect(image, required_aspect)
        resized_image = cv2.resize(image, (requiredWidth, requiredHeight))

        # BUG FIX: the original tested `if self.mean_image:`, which raises
        # ValueError for any multi-element numpy array ("truth value of an
        # array is ambiguous"). Compare against None instead.
        if self.mean_image is not None:
            # Broadcast subtraction is equivalent to the original
            # per-channel loop for a same-shaped mean image.
            resized_image -= self.mean_image
        else:
            # Use the ImageNet mean image by default; which in BGR is:
            imagenet_mean = np.array([103.939, 116.779, 123.68])
            resized_image -= imagenet_mean
        return resized_image
class RetinaNetDetector(ImageModel):
    def __init__(self, modelPath,
                 meanImage=None,
                 gpuFraction=1.0,
                 imageShape=(360, 720),
                 **kwargs):
        """ Initialize the RetinaNet Detector model

        modelPath: str
                   path-like object to frozen pb graph
        meanImage: np.array
                   Mean image subtracted from image prior to network
                   insertion. Can be None.
        imageShape: tuple
                   (height, width) of the image to feed into the detector
                   network.
        """
        super(RetinaNetDetector, self).__init__(
            modelPath,
            image_dims=imageShape,
            gpu_fraction=gpuFraction,
            input_name='input_1:0',
            output_name='nms/map/TensorArrayStack/TensorArrayGatherV3:0',
            **kwargs)
        self.input_shape[1:3] = imageShape[:2]
        self.image_shape = imageShape
        # Aspect ratio (width / height) the network expects.
        self.network_aspect = imageShape[1] / imageShape[0]

        # BUG FIX: the original tested `if meanImage:` (ambiguous numpy
        # truthiness, raises ValueError) and then constructed a misspelled
        # class name "RetinaNetPreprocesser" (NameError). Either defect
        # made the custom-mean path unusable.
        if meanImage is not None:
            # cv2.resize takes (width, height).
            resized_mean = cv2.resize(meanImage,
                                      (imageShape[1], imageShape[0]))
            self.preprocessor = RetinaNetPreprocessor(meanImage=resized_mean)
        else:
            self.preprocessor = RetinaNetPreprocessor(meanImage=None)

    def addImage(self, image, cookie=None):
        """ Queue an image for detection.

        image: np.ndarray
               Input image (height, width[, channels]).
        cookie: dict or None
               Optional per-image metadata; augmented here with the
               effective aspect-padded size so format_results can undo
               the network-input scaling.
        """
        # Determine the actual shape of the image as it goes into the
        # network, accounting for the padding force_aspect will add in
        # the preprocessor.
        img_height = image.shape[0]
        img_width = image.shape[1]
        img_aspect = img_width / img_height
        if math.isclose(img_aspect, self.network_aspect):
            img_size = image.shape[:2]
        elif img_aspect < self.network_aspect:
            # Image is boxier than we want: pad width out.
            new_width = round(img_height * self.network_aspect)
            img_size = (img_height, new_width)
        else:
            # Image is wider than we want: pad height out.
            new_height = round(img_width / self.network_aspect)
            img_size = (new_height, img_width)

        if cookie is None:
            cookie = {}
        cookie.update({"size": img_size})
        return super(RetinaNetDetector, self)._addImage(image,
                                                        self.preprocessor,
                                                        cookie)

    def format_results(self, detections, sizes, threshold, **kwargs):
        """ Convert raw network output into per-image Detection lists.

        detections: np.ndarray
               Raw NMS output, shape (num_images, num_boxes, 5 + classes):
               [x1, y1, x2, y2, label, score_0, score_1, ...] per box —
               presumably; verify against the frozen graph's output spec.
        sizes: sequence of (height, width)
               Aspect-padded source sizes recorded by addImage.
        threshold: float
               Minimum class confidence for a box to be reported.
        kwargs: may carry 'frame' (starting frame number) and 'video_id'.

        Returns a list (one entry per image) of lists of Detection.
        """
        # Clip boxes to the network input extents.
        detections[:, :, 0] = np.maximum(0, detections[:, :, 0])
        detections[:, :, 1] = np.maximum(0, detections[:, :, 1])
        detections[:, :, 2] = np.minimum(self.image_shape[1],
                                         detections[:, :, 2])
        detections[:, :, 3] = np.minimum(self.image_shape[0],
                                         detections[:, :, 3])

        num_images = detections.shape[0]
        for idx in range(num_images):
            # Correct boxes for image scale. Keep in mind there is a shift
            # here potentially to force an aspect ratio.
            h_scale = self.image_shape[0] / sizes[idx][0]
            w_scale = self.image_shape[1] / sizes[idx][1]
            detections[idx, :, 0] /= w_scale
            detections[idx, :, 1] /= h_scale
            detections[idx, :, 2] /= w_scale
            detections[idx, :, 3] /= h_scale

        # Change to (x, y, w, h) (MS COCO standard).
        detections[:, :, 2] -= detections[:, :, 0]
        detections[:, :, 3] -= detections[:, :, 1]

        results = []
        frame = kwargs.get('frame', None)
        video_id = kwargs.get('video_id', None)

        # Compute predicted labels and scores.
        for img_idx in range(num_images):
            if frame is not None:
                this_frame = frame + img_idx
            else:
                this_frame = None
            image_detections = []
            for detection in detections[img_idx, ...]:
                label = int(detection[4])
                max_confidence = detection[5 + label]
                if max_confidence >= threshold:
                    detection = Detection(location=detection[:4].tolist(),
                                          confidence=detection[5:].tolist(),
                                          # OpenEM uses 1-based indexing
                                          species=label + 1,
                                          frame=this_frame,
                                          video_id=video_id)
                    image_detections.append(detection)
            results.append(image_detections)
        return results