Source code for openem.Detect.RetinaNet

""" RetinaNet Object Detector for OpenEM """

import tensorflow as tf
import numpy as np
from openem.models import ImageModel

import cv2
import math

from openem.Detect import Detection
from openem.image import force_aspect

[docs]class RetinaNetPreprocessor:
    """ Perform preprocessinig for RetinaNet inputs
        Meets the callable interface of openem.Detect.Preprocessor
    """
    def __init__(self,meanImage=None):
        self.mean_image = meanImage

    def __call__(self, image, requiredWidth, requiredHeight):
        #TODO: (Provide way to optionally convert channel ordering?)
        #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image.astype(np.float32)

        required_aspect = requiredWidth / requiredHeight
        image = force_aspect(image, required_aspect)
        resized_image = cv2.resize(image, (requiredWidth, requiredHeight))
        if self.mean_image:
            for dim in [0,1,2]:
                resized_image[:,:,dim] -= self.mean_image[:,:,dim]
        else:
            # Use the ImageNet mean image by default; which in BGR is:
            imagenet_mean = np.array([103.939, 116.779, 123.68 ])
            resized_image -= imagenet_mean
        return resized_image

[docs]class RetinaNetDetector(ImageModel):
    def __init__(self, modelPath, meanImage=None, gpuFraction=1.0, imageShape=(360,720)):
        """ Initialize the RetinaNet Detector model
        modelPath: str
                   path-like object to frozen pb graph
        meanImage: np.array
                   Mean image subtracted from image prior to network
                   insertion. Can be None.
        image_shape: tuple
                   (height, width) of the image to feed into the detector network.
        """
        super(RetinaNetDetector,self).__init__(modelPath,
                                               gpuFraction,
                                               'input_1:0',
                                               'nms/map/TensorArrayStack/TensorArrayGatherV3:0')
        self.input_shape[1:3] = imageShape

        self.image_shape = imageShape
        self.network_aspect = imageShape[1] / imageShape[0]

        if meanImage:
            resized_mean = cv2.resize(meanImage,(imageShape[1],
                                                 imageShape[0]))
            self.preprocessor=RetinaNetPreprocesser(meanImage=resized_mean)
        else:
            self.preprocessor=RetinaNetPreprocessor(meanImage=None)

        self._imageSizes = None

    def addImage(self, image):
        if self._imageSizes is None:
            self._imageSizes = []

        # Determine the actual shape of the image as it goes into the network
        # To account for padding to aspect ratio
        img_height = image.shape[0]
        img_width = image.shape[1]
        img_aspect = img_width / img_height
        if math.isclose(img_aspect, self.network_aspect):
            self._imageSizes.append(image.shape)
        elif img_aspect < self.network_aspect:
            #Image is boxer than we want
            new_width = round(img_height * self.network_aspect)
            self._imageSizes.append((img_height, new_width))
        else:
            new_height = round(img_width / self.network_aspect)
            self._imageSizes.append((new_height,img_width))
        return super(RetinaNetDetector, self)._addImage(image,
                                                        self.preprocessor)

[docs]    def process(self, threshold=0.0, **kwargs):
        detections = super(RetinaNetDetector,self).process()

        # clip to image shape
        detections[:, :, 0] = np.maximum(0, detections[:, :, 0])
        detections[:, :, 1] = np.maximum(0, detections[:, :, 1])
        detections[:, :, 2] = np.minimum(self.image_shape[1], detections[:, :, 2])
        detections[:, :, 3] = np.minimum(self.image_shape[0], detections[:, :, 3])

        num_images = detections.shape[0]
        for idx in range(num_images):
            # correct boxes for image scale
            # Keep in mind there is a shift here potentially to force
            # an aspect ratio.
            h_scale = self.image_shape[0] / self._imageSizes[idx][0]
            w_scale = self.image_shape[1] / self._imageSizes[idx][1]

            detections[idx, :, 0] /= w_scale
            detections[idx, :, 1] /= h_scale
            detections[idx, :, 2] /= w_scale
            detections[idx, :, 3] /= h_scale


        # change to (x, y, w, h) (MS COCO standard)
        detections[:, :, 2] -= detections[:, :, 0]
        detections[:, :, 3] -= detections[:, :, 1]

        results=[]
        frame = kwargs.get('frame', None)
        video_id = kwargs.get('video_id', None)
        # compute predicted labels and scores
        num_imgs = detections.shape[0]
        for img_idx in range(num_imgs):
            if not frame is None:
                this_frame = frame + img_idx
            else:
                this_frame = None

            image_detections=[]
            for detection in detections[img_idx, ...]:
                label = int(detection[4])
                confidence = float(detection[5+label])
                if confidence > threshold:
                    detection = Detection(location=detection[:4].tolist(),
                                          confidence=confidence,
                                          # OpenEM uses 1-based indexing
                                          species=label+1,
                                          frame=this_frame,
                                          video_id=video_id)
                    image_detections.append(detection)
            results.append(image_detections)

        self._imageSizes = None
        return results