mobilenet_v1.py

   1 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   2 #
   3 # Licensed under the Apache License, Version 2.0 (the "License");
   4 # you may not use this file except in compliance with the License.
   5 # You may obtain a copy of the License at
   6 #
   7 # http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 # Unless required by applicable law or agreed to in writing, software
  10 # distributed under the License is distributed on an "AS IS" BASIS,
  11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 # See the License for the specific language governing permissions and
  13 # limitations under the License.
  14 # =============================================================================
  15 """MobileNet v1.
  16
MobileNet is a general architecture and can be used for multiple use cases.
Depending on the use case, it can use a different input layer size and a
different head (for example: embeddings, localization and classification).
  20
  21 As described in https://arxiv.org/abs/1704.04861.
  22
  23   MobileNets: Efficient Convolutional Neural Networks for
  24     Mobile Vision Applications
  25   Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang,
  26     Tobias Weyand, Marco Andreetto, Hartwig Adam
  27
  28 100% Mobilenet V1 (base) with input size 224x224:
  29
  30 Layer                                                     params           macs
  31 --------------------------------------------------------------------------------
  32 MobilenetV1/Conv2d_0/Conv2D:                                 864      10,838,016
  33 MobilenetV1/Conv2d_1_depthwise/depthwise:                    288       3,612,672
  34 MobilenetV1/Conv2d_1_pointwise/Conv2D:                     2,048      25,690,112
  35 MobilenetV1/Conv2d_2_depthwise/depthwise:                    576       1,806,336
  36 MobilenetV1/Conv2d_2_pointwise/Conv2D:                     8,192      25,690,112
  37 MobilenetV1/Conv2d_3_depthwise/depthwise:                  1,152       3,612,672
  38 MobilenetV1/Conv2d_3_pointwise/Conv2D:                    16,384      51,380,224
  39 MobilenetV1/Conv2d_4_depthwise/depthwise:                  1,152         903,168
  40 MobilenetV1/Conv2d_4_pointwise/Conv2D:                    32,768      25,690,112
  41 MobilenetV1/Conv2d_5_depthwise/depthwise:                  2,304       1,806,336
  42 MobilenetV1/Conv2d_5_pointwise/Conv2D:                    65,536      51,380,224
  43 MobilenetV1/Conv2d_6_depthwise/depthwise:                  2,304         451,584
  44 MobilenetV1/Conv2d_6_pointwise/Conv2D:                   131,072      25,690,112
  45 MobilenetV1/Conv2d_7_depthwise/depthwise:                  4,608         903,168
  46 MobilenetV1/Conv2d_7_pointwise/Conv2D:                   262,144      51,380,224
  47 MobilenetV1/Conv2d_8_depthwise/depthwise:                  4,608         903,168
  48 MobilenetV1/Conv2d_8_pointwise/Conv2D:                   262,144      51,380,224
  49 MobilenetV1/Conv2d_9_depthwise/depthwise:                  4,608         903,168
  50 MobilenetV1/Conv2d_9_pointwise/Conv2D:                   262,144      51,380,224
  51 MobilenetV1/Conv2d_10_depthwise/depthwise:                 4,608         903,168
  52 MobilenetV1/Conv2d_10_pointwise/Conv2D:                  262,144      51,380,224
  53 MobilenetV1/Conv2d_11_depthwise/depthwise:                 4,608         903,168
  54 MobilenetV1/Conv2d_11_pointwise/Conv2D:                  262,144      51,380,224
  55 MobilenetV1/Conv2d_12_depthwise/depthwise:                 4,608         225,792
  56 MobilenetV1/Conv2d_12_pointwise/Conv2D:                  524,288      25,690,112
  57 MobilenetV1/Conv2d_13_depthwise/depthwise:                 9,216         451,584
  58 MobilenetV1/Conv2d_13_pointwise/Conv2D:                1,048,576      51,380,224
  59 --------------------------------------------------------------------------------
  60 Total:                                                 3,185,088     567,716,352
  61
  62
  63 75% Mobilenet V1 (base) with input size 128x128:
  64
  65 Layer                                                     params           macs
  66 --------------------------------------------------------------------------------
  67 MobilenetV1/Conv2d_0/Conv2D:                                 648       2,654,208
  68 MobilenetV1/Conv2d_1_depthwise/depthwise:                    216         884,736
  69 MobilenetV1/Conv2d_1_pointwise/Conv2D:                     1,152       4,718,592
  70 MobilenetV1/Conv2d_2_depthwise/depthwise:                    432         442,368
  71 MobilenetV1/Conv2d_2_pointwise/Conv2D:                     4,608       4,718,592
  72 MobilenetV1/Conv2d_3_depthwise/depthwise:                    864         884,736
  73 MobilenetV1/Conv2d_3_pointwise/Conv2D:                     9,216       9,437,184
  74 MobilenetV1/Conv2d_4_depthwise/depthwise:                    864         221,184
  75 MobilenetV1/Conv2d_4_pointwise/Conv2D:                    18,432       4,718,592
  76 MobilenetV1/Conv2d_5_depthwise/depthwise:                  1,728         442,368
  77 MobilenetV1/Conv2d_5_pointwise/Conv2D:                    36,864       9,437,184
  78 MobilenetV1/Conv2d_6_depthwise/depthwise:                  1,728         110,592
  79 MobilenetV1/Conv2d_6_pointwise/Conv2D:                    73,728       4,718,592
  80 MobilenetV1/Conv2d_7_depthwise/depthwise:                  3,456         221,184
  81 MobilenetV1/Conv2d_7_pointwise/Conv2D:                   147,456       9,437,184
  82 MobilenetV1/Conv2d_8_depthwise/depthwise:                  3,456         221,184
  83 MobilenetV1/Conv2d_8_pointwise/Conv2D:                   147,456       9,437,184
  84 MobilenetV1/Conv2d_9_depthwise/depthwise:                  3,456         221,184
  85 MobilenetV1/Conv2d_9_pointwise/Conv2D:                   147,456       9,437,184
  86 MobilenetV1/Conv2d_10_depthwise/depthwise:                 3,456         221,184
  87 MobilenetV1/Conv2d_10_pointwise/Conv2D:                  147,456       9,437,184
  88 MobilenetV1/Conv2d_11_depthwise/depthwise:                 3,456         221,184
  89 MobilenetV1/Conv2d_11_pointwise/Conv2D:                  147,456       9,437,184
  90 MobilenetV1/Conv2d_12_depthwise/depthwise:                 3,456          55,296
  91 MobilenetV1/Conv2d_12_pointwise/Conv2D:                  294,912       4,718,592
  92 MobilenetV1/Conv2d_13_depthwise/depthwise:                 6,912         110,592
  93 MobilenetV1/Conv2d_13_pointwise/Conv2D:                  589,824       9,437,184
  94 --------------------------------------------------------------------------------
  95 Total:                                                 1,800,144     106,002,432
  96
  97 """
  98
  99 # Tensorflow mandates these.
 100 from __future__ import absolute_import
 101 from __future__ import division
 102 from __future__ import print_function
 103
 104 from collections import namedtuple
 105
 106 import tensorflow as tf
 107
 108 slim = tf.contrib.slim
 109
 110 # Conv and DepthSepConv namedtuple define layers of the MobileNet architecture
 111 # Conv defines 3x3 convolution layers
 112 # DepthSepConv defines 3x3 depthwise convolution followed by 1x1 convolution.
 113 # stride is the stride of the convolution
 114 # depth is the number of channels or filters in a layer
 115 Conv = namedtuple('Conv', ['kernel', 'stride', 'depth'])
 116 DepthSepConv = namedtuple('DepthSepConv', ['kernel', 'stride', 'depth'])
 117
 118 # _CONV_DEFS specifies the MobileNet body
 119 _CONV_DEFS = [
 120     Conv(kernel=[3, 3], stride=2, depth=32),
 121     DepthSepConv(kernel=[3, 3], stride=1, depth=64),
 122     DepthSepConv(kernel=[3, 3], stride=2, depth=128),
 123     DepthSepConv(kernel=[3, 3], stride=1, depth=128),
 124     DepthSepConv(kernel=[3, 3], stride=2, depth=256),
 125     DepthSepConv(kernel=[3, 3], stride=1, depth=256),
 126     DepthSepConv(kernel=[3, 3], stride=2, depth=512),
 127     DepthSepConv(kernel=[3, 3], stride=1, depth=512),
 128     DepthSepConv(kernel=[3, 3], stride=1, depth=512),
 129     DepthSepConv(kernel=[3, 3], stride=1, depth=512),
 130     DepthSepConv(kernel=[3, 3], stride=1, depth=512),
 131     DepthSepConv(kernel=[3, 3], stride=1, depth=512),
 132     DepthSepConv(kernel=[3, 3], stride=2, depth=1024),
 133     DepthSepConv(kernel=[3, 3], stride=1, depth=1024)
 134 ]
 135
 136
 137 def mobilenet_v1_base(inputs,
 138                       final_endpoint='Conv2d_13_pointwise',
 139                       min_depth=8,
 140                       depth_multiplier=1.0,
 141                       conv_defs=None,
 142                       output_stride=None,
 143                       scope=None):
 144   """Mobilenet v1.
 145
 146   Constructs a Mobilenet v1 network from inputs to the given final endpoint.
 147
 148   Args:
 149     inputs: a tensor of shape [batch_size, height, width, channels].
 150     final_endpoint: specifies the endpoint to construct the network up to. It
 151       can be one of ['Conv2d_0', 'Conv2d_1_pointwise', 'Conv2d_2_pointwise',
 152       'Conv2d_3_pointwise', 'Conv2d_4_pointwise', 'Conv2d_5'_pointwise,
 153       'Conv2d_6_pointwise', 'Conv2d_7_pointwise', 'Conv2d_8_pointwise',
 154       'Conv2d_9_pointwise', 'Conv2d_10_pointwise', 'Conv2d_11_pointwise',
 155       'Conv2d_12_pointwise', 'Conv2d_13_pointwise'].
 156     min_depth: Minimum depth value (number of channels) for all convolution ops.
 157       Enforced when depth_multiplier < 1, and not an active constraint when
 158       depth_multiplier >= 1.
 159     depth_multiplier: Float multiplier for the depth (number of channels)
 160       for all convolution ops. The value must be greater than zero. Typical
 161       usage will be to set this value in (0, 1) to reduce the number of
 162       parameters or computation cost of the model.
 163     conv_defs: A list of ConvDef namedtuples specifying the net architecture.
 164     output_stride: An integer that specifies the requested ratio of input to
 165       output spatial resolution. If not None, then we invoke atrous convolution
 166       if necessary to prevent the network from reducing the spatial resolution
 167       of the activation maps. Allowed values are 8 (accurate fully convolutional
 168       mode), 16 (fast fully convolutional mode), 32 (classification mode).
 169     scope: Optional variable_scope.
 170
 171   Returns:
 172     tensor_out: output tensor corresponding to the final_endpoint.
 173     end_points: a set of activations for external use, for example summaries or
 174                 losses.
 175
 176   Raises:
 177     ValueError: if final_endpoint is not set to one of the predefined values,
 178                 or depth_multiplier <= 0, or the target output_stride is not
 179                 allowed.
 180   """
 181   depth = lambda d: max(int(d * depth_multiplier), min_depth)
 182   end_points = {}
 183
 184   # Used to find thinned depths for each layer.
 185   if depth_multiplier <= 0:
 186     raise ValueError('depth_multiplier is not greater than zero.')
 187
 188   if conv_defs is None:
 189     conv_defs = _CONV_DEFS
 190
 191   if output_stride is not None and output_stride not in [8, 16, 32]:
 192     raise ValueError('Only allowed output_stride values are 8, 16, 32.')
 193
 194   with tf.variable_scope(scope, 'MobilenetV1', [inputs]):
 195     with slim.arg_scope([slim.conv2d, slim.separable_conv2d], padding='SAME'):
 196       # The current_stride variable keeps track of the output stride of the
 197       # activations, i.e., the running product of convolution strides up to the
 198       # current network layer. This allows us to invoke atrous convolution
 199       # whenever applying the next convolution would result in the activations
 200       # having output stride larger than the target output_stride.
 201       current_stride = 1
 202
 203       # The atrous convolution rate parameter.
 204       rate = 1
 205
 206       net = inputs
 207       for i, conv_def in enumerate(conv_defs):
 208         end_point_base = 'Conv2d_%d' % i
 209
 210         if output_stride is not None and current_stride == output_stride:
 211           # If we have reached the target output_stride, then we need to employ
 212           # atrous convolution with stride=1 and multiply the atrous rate by the
 213           # current unit's stride for use in subsequent layers.
 214           layer_stride = 1
 215           layer_rate = rate
 216           rate *= conv_def.stride
 217         else:
 218           layer_stride = conv_def.stride
 219           layer_rate = 1
 220           current_stride *= conv_def.stride
 221
 222         if isinstance(conv_def, Conv):
 223           end_point = end_point_base
 224           net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel,
 225                             stride=conv_def.stride,
 226                             normalizer_fn=slim.batch_norm,
 227                             scope=end_point)
 228           end_points[end_point] = net
 229           if end_point == final_endpoint:
 230             return net, end_points
 231
 232         elif isinstance(conv_def, DepthSepConv):
 233           end_point = end_point_base + '_depthwise'
 234
 235           # By passing filters=None
 236           # separable_conv2d produces only a depthwise convolution layer
 237           net = slim.separable_conv2d(net, None, conv_def.kernel,
 238                                       depth_multiplier=1,
 239                                       stride=layer_stride,
 240                                       rate=layer_rate,
 241                                       normalizer_fn=slim.batch_norm,
 242                                       scope=end_point)
 243
 244           end_points[end_point] = net
 245           if end_point == final_endpoint:
 246             return net, end_points
 247
 248           end_point = end_point_base + '_pointwise'
 249
 250           net = slim.conv2d(net, depth(conv_def.depth), [1, 1],
 251                             stride=1,
 252                             normalizer_fn=slim.batch_norm,
 253                             scope=end_point)
 254
 255           end_points[end_point] = net
 256           if end_point == final_endpoint:
 257             return net, end_points
 258         else:
 259           raise ValueError('Unknown convolution type %s for layer %d'
 260                            % (conv_def.ltype, i))
 261   raise ValueError('Unknown final endpoint %s' % final_endpoint)
 262
 263
 264 def mobilenet_v1(inputs,
 265                  num_classes=1000,
 266                  dropout_keep_prob=0.999,
 267                  is_training=True,
 268                  min_depth=8,
 269                  depth_multiplier=1.0,
 270                  conv_defs=None,
 271                  prediction_fn=tf.contrib.layers.softmax,
 272                  spatial_squeeze=True,
 273                  reuse=None,
 274                  scope='MobilenetV1'):
 275   """Mobilenet v1 model for classification.
 276
 277   Args:
 278     inputs: a tensor of shape [batch_size, height, width, channels].
 279     num_classes: number of predicted classes.
 280     dropout_keep_prob: the percentage of activation values that are retained.
 281     is_training: whether is training or not.
 282     min_depth: Minimum depth value (number of channels) for all convolution ops.
 283       Enforced when depth_multiplier < 1, and not an active constraint when
 284       depth_multiplier >= 1.
 285     depth_multiplier: Float multiplier for the depth (number of channels)
 286       for all convolution ops. The value must be greater than zero. Typical
 287       usage will be to set this value in (0, 1) to reduce the number of
 288       parameters or computation cost of the model.
 289     conv_defs: A list of ConvDef namedtuples specifying the net architecture.
 290     prediction_fn: a function to get predictions out of logits.
 291     spatial_squeeze: if True, logits is of shape is [B, C], if false logits is
 292         of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
 293     reuse: whether or not the network and its variables should be reused. To be
 294       able to reuse 'scope' must be given.
 295     scope: Optional variable_scope.
 296
 297   Returns:
 298     logits: the pre-softmax activations, a tensor of size
 299       [batch_size, num_classes]
 300     end_points: a dictionary from components of the network to the corresponding
 301       activation.
 302
 303   Raises:
 304     ValueError: Input rank is invalid.
 305   """
 306   input_shape = inputs.get_shape().as_list()
 307   if len(input_shape) != 4:
 308     raise ValueError('Invalid input tensor rank, expected 4, was: %d' %
 309                      len(input_shape))
 310
 311   with tf.variable_scope(scope, 'MobilenetV1', [inputs, num_classes],
 312                          reuse=reuse) as scope:
 313     with slim.arg_scope([slim.batch_norm, slim.dropout],
 314                         is_training=is_training):
 315       net, end_points = mobilenet_v1_base(inputs, scope=scope,
 316                                           min_depth=min_depth,
 317                                           depth_multiplier=depth_multiplier,
 318                                           conv_defs=conv_defs)
 319       with tf.variable_scope('Logits'):
 320         kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7])
 321         net = slim.avg_pool2d(net, kernel_size, padding='VALID',
 322                               scope='AvgPool_1a')
 323         end_points['AvgPool_1a'] = net
 324         # 1 x 1 x 1024
 325         net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
 326         logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
 327                              normalizer_fn=None, scope='Conv2d_1c_1x1')
 328         if spatial_squeeze:
 329           logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
 330       end_points['Logits'] = logits
 331       if prediction_fn:
 332         end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
 333   return logits, end_points
 334
 335 mobilenet_v1.default_image_size = 224
 336
 337
 338 def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
 339   """Define kernel size which is automatically reduced for small input.
 340
 341   If the shape of the input images is unknown at graph construction time this
 342   function assumes that the input images are large enough.
 343
 344   Args:
 345     input_tensor: input tensor of size [batch_size, height, width, channels].
 346     kernel_size: desired kernel size of length 2: [kernel_height, kernel_width]
 347
 348   Returns:
 349     a tensor with the kernel size.
 350   """
 351   shape = input_tensor.get_shape().as_list()
 352   if shape[1] is None or shape[2] is None:
 353     kernel_size_out = kernel_size
 354   else:
 355     kernel_size_out = [min(shape[1], kernel_size[0]),
 356                        min(shape[2], kernel_size[1])]
 357   return kernel_size_out
 358
 359
 360 def mobilenet_v1_arg_scope(is_training=True,
 361                            weight_decay=0.00004,
 362                            stddev=0.09,
 363                            regularize_depthwise=False):
 364   """Defines the default MobilenetV1 arg scope.
 365
 366   Args:
 367     is_training: Whether or not we're training the model.
 368     weight_decay: The weight decay to use for regularizing the model.
 369     stddev: The standard deviation of the trunctated normal weight initializer.
 370     regularize_depthwise: Whether or not apply regularization on depthwise.
 371
 372   Returns:
 373     An `arg_scope` to use for the mobilenet v1 model.
 374   """
 375   batch_norm_params = {
 376       'is_training': is_training,
 377       'center': True,
 378       'scale': True,
 379       'decay': 0.9997,
 380       'epsilon': 0.001,
 381   }
 382
 383   # Set weight_decay for weights in Conv and DepthSepConv layers.
 384   weights_init = tf.truncated_normal_initializer(stddev=stddev)
 385   regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
 386   if regularize_depthwise:
 387     depthwise_regularizer = regularizer
 388   else:
 389     depthwise_regularizer = None
 390   with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
 391                       weights_initializer=weights_init,
 392                       activation_fn=tf.nn.relu6, normalizer_fn=slim.batch_norm):
 393     with slim.arg_scope([slim.batch_norm], **batch_norm_params):
 394       with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer):
 395         with slim.arg_scope([slim.separable_conv2d],
 396                             weights_regularizer=depthwise_regularizer) as sc:
 397           return sc