3

我一直在使用 Tensorflow 对象检测 API。就我而言,我正在尝试使用模型动物园(model zoo)中基于 KITTI 训练的模型(faster_rcnn_resnet101_kitti_2018_01_28)检测静止图像中的车辆,所用代码改自 GitHub 存储库中附带的 object_detection_tutorial Jupyter notebook。

我在下面附上了修改后的代码,但使用 GitHub 上的原始 notebook 也会得到相同的结果。

在 Amazon AWS g3.4xlarge(GPU)实例(使用深度学习 AMI)上的 Jupyter notebook 服务器中运行时,处理单张图像需要将近 4 秒。其中推理函数耗时 1.3-1.5 秒(请参见下面的代码)——与该模型报告的推理时间(20 毫秒)相比,这似乎异常高。虽然我并不指望达到报告的数值,但我的耗时似乎很不正常,对我的需求来说也不切实际。我打算一次处理超过 100 万张图像,无法承受 46 天的处理时间。鉴于该模型可用于视频帧捕获……我认为至少应该可以将每张图像的时间缩短到 1 秒以下。

我的问题是:

1) 有哪些解释/解决方案可以减少推理时间?

2) 将图像转换为 numpy(处理之前)需要 1.5 秒,这正常吗?

3) 如果这已经是我能期望的最佳性能,那么将模型改造为批量处理图像后,预计能节省多少时间?

谢谢你的帮助!

来自 python 笔记本的代码:

import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import json
import collections
import os.path
import datetime

from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image

# This is needed since the notebook is stored in the object_detection folder.
# NOTE(review): presumably this is what makes the `utils` package imported
# further down resolvable -- confirm against the actual working directory.
sys.path.append("..")

# This is needed to display the images.
# NOTE(review): get_ipython() is only available inside an IPython/Jupyter
# session; this line is expected to fail when run as a plain script.
get_ipython().magic('matplotlib inline')

# Setup variables.
# Root directory that is walked recursively for '.jpg' files to process.
PATH_TO_TEST_IMAGES_DIR = 'test_images'

# Name of the model directory; also used to build the frozen-graph path below.
MODEL_NAME = 'faster_rcnn_resnet101_kitti_2018_01_28'

# Path to frozen detection graph. This is the actual model that is used for the object detection.
PATH_TO_CKPT = MODEL_NAME + '/frozen_inference_graph.pb'

# Label map file that is loaded to turn class ids into display names.
PATH_TO_LABELS = os.path.join('data', 'kitti_label_map.pbtxt')

# Passed as max_num_classes when the label map is converted to categories.
NUM_CLASSES = 2

# Project-local helpers from the object detection `utils` package.
from utils import label_map_util
from utils import visualization_utils as vis_util

def get_scores(
    boxes,
    classes,
    scores,
    category_index,
    min_score_thresh=.5
):
  """Build a text label for every detection that passes the score threshold.

  Args:
    boxes: array whose first dimension enumerates the detections; only
      ``boxes.shape[0]`` is read.
    classes: per-detection class ids, indexable by detection index.
    scores: per-detection scores, indexable by detection index, or None to
      keep every detection and label it with its class name only.
    category_index: mapping from class id to a dict with a 'name' entry.
    min_score_thresh: a detection is kept only when its score is strictly
      greater than this value.

  Returns:
    A ``collections.defaultdict(list)`` keyed by detection index (not by box
    location). Each kept detection maps to a one-element list holding
    '<name>: <score>%' ('<score>%' if the name is empty), or just '<name>'
    when ``scores`` is None. Class ids missing from ``category_index`` are
    labelled 'N/A'.
  """
  box_to_display_str_map = collections.defaultdict(list)

  for i in range(boxes.shape[0]):
    # Written as "not (score > threshold)" so NaN scores are dropped exactly
    # as they were by the original comparison.
    if scores is not None and not scores[i] > min_score_thresh:
      continue

    if classes[i] in category_index:
      class_name = category_index[classes[i]]['name']
    else:
      class_name = 'N/A'
    display_str = str(class_name)

    # Without scores there is no percentage to show. The previous code
    # touched undefined colour-map names here and raised NameError.
    if scores is not None:
      percent = int(100*scores[i])
      if not display_str:
        display_str = '{}%'.format(percent)
      else:
        display_str = '{}: {}%'.format(display_str, percent)

    box_to_display_str_map[i].append(display_str)

  return box_to_display_str_map

def load_image_into_numpy_array(image):
  """Convert a PIL image into a (height, width, 3) uint8 numpy array.

  Args:
    image: a PIL ``Image`` in any mode; it is converted to RGB first so
      grayscale, palette and RGBA inputs also yield three channels instead
      of failing on the channel count.

  Returns:
    A new uint8 numpy array of shape (height, width, 3).
  """
  # np.array() copies the decoded pixel buffer in a single step. The former
  # image.getdata() route materialised one Python tuple per pixel, which is
  # what made this conversion take seconds on large photos.
  return np.array(image.convert('RGB'), dtype=np.uint8)

def run_inference_for_single_image(image, graph, sess=None):
  """Run the detection graph on one image and return its detections.

  Args:
    image: numpy image array. A batch dimension is added before it is fed to
      'image_tensor:0'; image.shape[0] and image.shape[1] are used as the
      target size when instance masks are reframed.
    graph: the tf.Graph holding the imported frozen detection model.
    sess: optional tf.Session bound to `graph`. When given, it is used as is
      and left open, so a caller can create one session and reuse it for
      every image. When omitted, a session is created and closed for this
      single call, as before.

  Returns:
    A dict with 'num_detections' (int), 'detection_classes' (uint8 array),
    'detection_boxes' and 'detection_scores', plus 'detection_masks' when
    the graph exposes that tensor. The batch dimension is stripped from
    every entry.
  """
  if sess is None:
    # Backward-compatible path: own the session for this call only.
    with tf.Session(graph=graph) as owned_sess:
      return run_inference_for_single_image(image, graph, owned_sess)

  with graph.as_default():
    # Get handles to input and output tensors
    all_tensor_names = {
        output.name for op in graph.get_operations() for output in op.outputs
    }
    tensor_dict = {}
    for key in [
        'num_detections', 'detection_boxes', 'detection_scores',
        'detection_classes', 'detection_masks'
    ]:
      tensor_name = key + ':0'
      if tensor_name in all_tensor_names:
        tensor_dict[key] = graph.get_tensor_by_name(tensor_name)

    if 'detection_masks' in tensor_dict:
      # The script never imported utils_ops, so this branch used to raise
      # NameError. Imported here, next to its only use.
      # NOTE(review): assumes `utils.ops` is importable like the other
      # `utils` modules used by this script -- confirm.
      from utils import ops as utils_ops
      # NOTE: this branch adds new ops to `graph` on every call.
      # The following processing is only for single image
      detection_boxes = tf.squeeze(tensor_dict['detection_boxes'], [0])
      detection_masks = tf.squeeze(tensor_dict['detection_masks'], [0])
      # Reframe is required to translate mask from box coordinates to image
      # coordinates and fit the image size.
      real_num_detection = tf.cast(tensor_dict['num_detections'][0], tf.int32)
      detection_boxes = tf.slice(
          detection_boxes, [0, 0], [real_num_detection, -1])
      detection_masks = tf.slice(
          detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
      detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
          detection_masks, detection_boxes, image.shape[0], image.shape[1])
      detection_masks_reframed = tf.cast(
          tf.greater(detection_masks_reframed, 0.5), tf.uint8)
      # Follow the convention by adding back the batch dimension
      tensor_dict['detection_masks'] = tf.expand_dims(
          detection_masks_reframed, 0)

    image_tensor = graph.get_tensor_by_name('image_tensor:0')

    # Run inference
    output_dict = sess.run(tensor_dict,
                           feed_dict={image_tensor: np.expand_dims(image, 0)})

  # all outputs are float32 numpy arrays, so convert types as appropriate
  output_dict['num_detections'] = int(output_dict['num_detections'][0])
  output_dict['detection_classes'] = output_dict[
      'detection_classes'][0].astype(np.uint8)
  output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
  output_dict['detection_scores'] = output_dict['detection_scores'][0]
  if 'detection_masks' in output_dict:
    output_dict['detection_masks'] = output_dict['detection_masks'][0]
  return output_dict

# Recursively collect every file under the test-image directory whose name
# ends in '.jpg' (compared case-insensitively).
exten = '.jpg'
TEST_IMAGE_PATHS = [
    os.path.join(dirpath, name)
    for dirpath, dirnames, files in os.walk(PATH_TO_TEST_IMAGES_DIR)
    for name in files
    if name.lower().endswith(exten)
]
print((len(TEST_IMAGE_PATHS), 'Images To Process'))

# Read the serialized frozen model from disk, then import it into a
# dedicated graph that is used for inference.
detection_graph = tf.Graph()
with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
  serialized_graph = fid.read()
od_graph_def = tf.GraphDef()
od_graph_def.ParseFromString(serialized_graph)
with detection_graph.as_default():
  tf.import_graph_def(od_graph_def, name='')

# Build the class-id -> category lookup used to label detections.
label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(
    label_map,
    max_num_classes=NUM_CLASSES,
    use_display_name=True,
)
category_index = label_map_util.create_category_index(categories)

# OUTPUTS_BASENAME is not defined anywhere in the visible script, so the
# export at the bottom used to fail with NameError only after every image
# had been processed. Resolve it up front and fall back to the model name.
try:
  OUTPUTS_BASENAME
except NameError:
  OUTPUTS_BASENAME = MODEL_NAME

# Flat list of (image number, step name, value) entries; dumped to JSON below.
myTimings = []


def _record_step(index, step):
  """Print and record one wall-clock timestamp for a processing step."""
  # A single timestamp is taken so the printed and the stored value agree.
  stamp = datetime.datetime.now().time()
  print(index, step + ":", stamp)
  myTimings.append((index, step, str(stamp)))


myX = 1
# image path -> list of label maps returned by get_scores.
myResults = collections.defaultdict(list)
for image_path in TEST_IMAGE_PATHS:
  if not os.path.exists(image_path):
    continue

  print(myX, "--------------------------------------",
        datetime.datetime.now().time())
  print(myX, "Image:", image_path)
  myTimings.append((myX, "Image", image_path))

  _record_step(myX, "Open")
  # The context manager closes the image file as soon as the pixels have
  # been copied into the numpy array.
  with Image.open(image_path) as image:
    _record_step(myX, "Numpy")
    image_np = load_image_into_numpy_array(image)

  # Kept so the "Expand" timing entry stays in the output. The expanded array
  # is not passed to inference, which adds the batch dimension itself.
  _record_step(myX, "Expand")
  image_np_expanded = np.expand_dims(image_np, axis=0)

  # Actual detection.
  _record_step(myX, "Detect")
  output_dict = run_inference_for_single_image(image_np, detection_graph)

  # Turn the raw detections into text labels (no drawing is done here).
  _record_step(myX, "Export")
  op = get_scores(
      output_dict['detection_boxes'],
      output_dict['detection_classes'],
      output_dict['detection_scores'],
      category_index,
      min_score_thresh=.2)
  myResults[image_path].append(op)

  _record_step(myX, "Done")
  myX = myX + 1

#save results
with open((OUTPUTS_BASENAME+'_Results.json'), 'w') as fout:
  json.dump(myResults, fout)
with open((OUTPUTS_BASENAME+'_Timings.json'), 'w') as fout:
  json.dump(myTimings, fout)

时间示例:

[1, "Image", "test_images/DE4T_11Jan2018/MFDC4612.JPG"]
[1, "Open", "19:20:08.029423"]
[1, "Numpy", "19:20:08.052679"]
[1, "Expand", "19:20:09.977166"]
[1, "Detect", "19:20:09.977250"]
[1, "Export", "19:23:13.902443"]
[1, "Done", "19:23:13.903012"]
[2, "Image", "test_images/DE4T_11Jan2018/MFDC4616.JPG"]
[2, "Open", "19:23:13.903885"]
[2, "Numpy", "19:23:13.906320"]
[2, "Expand", "19:23:15.756308"]
[2, "Detect", "19:23:15.756597"]
[2, "Export", "19:23:17.153233"]
[2, "Done", "19:23:17.153699"]
[3, "Image", "test_images/DE4T_11Jan2018/MFDC4681.JPG"]
[3, "Open", "19:23:17.154510"]
[3, "Numpy", "19:23:17.156576"]
[3, "Expand", "19:23:19.012935"]
[3, "Detect", "19:23:19.013013"]
[3, "Export", "19:23:20.323839"]
[3, "Done", "19:23:20.324307"]
[4, "Image", "test_images/DE4T_11Jan2018/MFDC4697.JPG"]
[4, "Open", "19:23:20.324791"]
[4, "Numpy", "19:23:20.327136"]
[4, "Expand", "19:23:22.175578"]
[4, "Detect", "19:23:22.175658"]
[4, "Export", "19:23:23.472040"]
[4, "Done", "19:23:23.472297"]
4

1 回答 1

1

1) 您可以直接加载视频而不是图像,然后修改“run_inference_for_single_image()”,使其只创建一次会话,并在该会话中加载图像/视频(重新创建计算图非常慢)。此外,您可以编辑 pipeline 配置文件以减少候选区域(proposals)的数量,这将直接加速推理。请注意,之后您必须重新导出计算图(https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/exporting_models.md)。批处理也有帮助(抱歉,我忘了具体能提升多少)。最后,您可以使用多进程来分担 CPU 相关的操作(绘制边界框、加载数据),以便更好地利用 GPU。

2) 将图像转换为 numpy(处理之前)需要 1.5 秒 <- 是的,这非常慢,还有很大的改进空间。

3) 虽然我不知道 AWS 上具体是哪款 GPU(K80?),但在完成上述所有修复后,您应该能够在 GeForce 1080 Ti 上达到 10 fps 以上,这与他们报告的 79 毫秒相符(faster_rcnn_resnet101 的 20 毫秒是从哪里得到的??)

于 2018-03-19T11:54:22.320 回答