mobilenet.py

   1 # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
   2 #
   3 # Licensed under the Apache License, Version 2.0 (the "License");
   4 # you may not use this file except in compliance with the License.
   5 # You may obtain a copy of the License at
   6 #
   7 #     http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 # Unless required by applicable law or agreed to in writing, software
  10 # distributed under the License is distributed on an "AS IS" BASIS,
  11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 # See the License for the specific language governing permissions and
  13 # limitations under the License.
  14 # ==============================================================================
  15 """Mobilenet Base Class."""
  16
  17 from __future__ import absolute_import
  18 from __future__ import division
  19 from __future__ import print_function
  20 import collections
  21 import contextlib
  22 import copy
  23 import os
  24
  25 import tensorflow as tf
  26
  27
  28 slim = tf.contrib.slim
  29
  30
  31 @slim.add_arg_scope
  32 def apply_activation(x, name=None, activation_fn=None):
  33   return activation_fn(x, name=name) if activation_fn else x
  34
  35
  36 def _fixed_padding(inputs, kernel_size, rate=1):
  37   """Pads the input along the spatial dimensions independently of input size.
  38
  39   Pads the input such that if it was used in a convolution with 'VALID' padding,
  40   the output would have the same dimensions as if the unpadded input was used
  41   in a convolution with 'SAME' padding.
  42
  43   Args:
  44     inputs: A tensor of size [batch, height_in, width_in, channels].
  45     kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
  46     rate: An integer, rate for atrous convolution.
  47
  48   Returns:
  49     output: A tensor of size [batch, height_out, width_out, channels] with the
  50       input, either intact (if kernel_size == 1) or padded (if kernel_size > 1).
  51   """
  52   kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1),
  53                            kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)]
  54   pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1]
  55   pad_beg = [pad_total[0] // 2, pad_total[1] // 2]
  56   pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]]
  57   padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]],
  58                                   [pad_beg[1], pad_end[1]], [0, 0]])
  59   return padded_inputs
  60
  61
  62 def _make_divisible(v, divisor, min_value=None):
  63   if min_value is None:
  64     min_value = divisor
  65   new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
  66   # Make sure that round down does not go down by more than 10%.
  67   if new_v < 0.9 * v:
  68     new_v += divisor
  69   return new_v
  70
  71
  72 @contextlib.contextmanager
  73 def _set_arg_scope_defaults(defaults):
  74   """Sets arg scope defaults for all items present in defaults.
  75
  76   Args:
  77     defaults: dictionary/list of pairs, containing a mapping from
  78     function to a dictionary of default args.
  79
  80   Yields:
  81     context manager where all defaults are set.
  82   """
  83   if hasattr(defaults, 'items'):
  84     items = defaults.items()
  85   else:
  86     items = defaults
  87   if not items:
  88     yield
  89   else:
  90     func, default_arg = items[0]
  91     with slim.arg_scope(func, **default_arg):
  92       with _set_arg_scope_defaults(items[1:]):
  93         yield
  94
  95
  96 @slim.add_arg_scope
  97 def depth_multiplier(output_params,
  98                      multiplier,
  99                      divisible_by=8,
 100                      min_depth=8,
 101                      **unused_kwargs):
 102   if 'num_outputs' not in output_params:
 103     return
 104   d = output_params['num_outputs']
 105   output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by,
 106                                                  min_depth)
 107
 108
 109 _Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func'])
 110
 111
 112 def op(opfunc, **params):
 113   multiplier = params.pop('multiplier_transorm', depth_multiplier)
 114   return _Op(opfunc, params=params, multiplier_func=multiplier)
 115
 116
@slim.add_arg_scope
def mobilenet_base(  # pylint: disable=invalid-name
    inputs,
    conv_defs,
    multiplier=1.0,
    final_endpoint=None,
    output_stride=None,
    use_explicit_padding=False,
    scope=None,
    is_training=False):
  """Mobilenet base network.

  Constructs a network from inputs to the given final endpoint. By default
  the network is constructed in inference mode. To create network
  in training mode use:

  with slim.arg_scope(mobilenet.training_scope()):
     logits, endpoints = mobilenet_base(...)

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    conv_defs: A dictionary describing the net architecture. It must contain
      a 'spec' entry, the sequence of op(...) layers that are built in order.
      It may also contain 'defaults' and 'overrides' entries, each mapping a
      function (or a tuple of functions) to the arg_scope keyword arguments
      to apply to it.
    multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    final_endpoint: The name of the last layer to build, for early
      termination. Layers are named 'layer_1', 'layer_2', ... in spec order.
      For V1-based networks the last layer is "layer_14", for V2: "layer_20".
      If None, or if no layer has this name, the whole spec is built.
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous convolution
      if necessary to prevent the network from reducing the spatial resolution
      of the activation maps. Allowed values are 1 or any even number, excluding
      zero. Typical values are 8 (accurate fully convolutional mode), 16
      (fast fully convolutional mode), and 32 (classification mode).

      NOTE- output_stride relies on all consequent operators to support dilated
      operators via "rate" parameter. This might require wrapping non-conv
      operators to operate properly.

    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
      inputs so that the output dimensions are the same as if 'SAME' padding
      were used.
    scope: optional variable scope.
    is_training: How to setup batch_norm and other ops. Note: most of the time
      this does not need be set directly. Use mobilenet.training_scope() to set
      up training instead. This parameter is here for backward compatibility
      only. It is safe to set it to the value matching
      training_scope(is_training=...). It is also safe to explicitly set
      it to False, even if there is an outer training_scope set to training.
      (The network will be built in inference mode).
  Returns:
    tensor_out: output tensor of the last layer that was built.
    end_points: a dictionary of activations for external use, for example
                summaries or losses. It maps 'layer_N' to the output of the
                N-th layer, plus 'layer_N/<op name>' for every op inside a
                layer's scope whose name ends with 'output'.

  Raises:
    ValueError: multiplier <= 0, or the target output_stride is not
                allowed.
  """
  if multiplier <= 0:
    raise ValueError('multiplier is not greater than zero.')

  # Set conv defs defaults and overrides.
  conv_defs_defaults = conv_defs.get('defaults', {})
  conv_defs_overrides = conv_defs.get('overrides', {})
  if use_explicit_padding:
    # Copy before modifying so the caller's conv_defs is left untouched.
    conv_defs_overrides = copy.deepcopy(conv_defs_overrides)
    conv_defs_overrides[
        (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'}

  if output_stride is not None:
    if output_stride == 0 or (output_stride > 1 and output_stride % 2):
      raise ValueError('Output stride must be None, 1 or a multiple of 2.')

  # a) Set the tensorflow scope
  # b) set padding to default: note we might consider removing this
  # since it is also set by mobilenet_scope
  # c) set all defaults
  # d) set all extra overrides.
  with _scope_all(scope, default_scope='Mobilenet'), \
      slim.arg_scope([slim.batch_norm], is_training=is_training), \
      _set_arg_scope_defaults(conv_defs_defaults), \
      _set_arg_scope_defaults(conv_defs_overrides):
    # The current_stride variable keeps track of the output stride of the
    # activations, i.e., the running product of convolution strides up to the
    # current network layer. This allows us to invoke atrous convolution
    # whenever applying the next convolution would result in the activations
    # having output stride larger than the target output_stride.
    current_stride = 1

    # The atrous convolution rate parameter.
    rate = 1

    net = inputs
    # end_points maps 'layer_N' to the output of the N-th layer; scopes maps
    # the name scope of each layer's output back to its 'layer_N' name.
    end_points = {}
    scopes = {}
    for i, opdef in enumerate(conv_defs['spec']):
      # Work on a copy: the multiplier function is handed this dict to adjust
      # (depth_multiplier rewrites 'num_outputs' in place), and the stride,
      # rate and padding entries are overwritten below.
      params = dict(opdef.params)
      opdef.multiplier_func(params, multiplier)
      stride = params.get('stride', 1)
      if output_stride is not None and current_stride == output_stride:
        # If we have reached the target output_stride, then we need to employ
        # atrous convolution with stride=1 and multiply the atrous rate by the
        # current unit's stride for use in subsequent layers.
        layer_stride = 1
        layer_rate = rate
        rate *= stride
      else:
        layer_stride = stride
        layer_rate = 1
        current_stride *= stride
      # Update params.
      params['stride'] = layer_stride
      # Only insert rate to params if rate > 1.
      if layer_rate > 1:
        params['rate'] = layer_rate
      # Set padding
      if use_explicit_padding:
        if 'kernel_size' in params:
          net = _fixed_padding(net, params['kernel_size'], layer_rate)
        else:
          # No kernel size to pad for here, so the flag is forwarded to the
          # op itself. NOTE(review): presumably such ops do their own
          # explicit padding -- confirm against the op definitions.
          params['use_explicit_padding'] = True

      end_point = 'layer_%d' % (i + 1)
      try:
        net = opdef.op(net, **params)
      except Exception:
        # Report which layer failed, then re-raise the original error.
        print('Failed to create op %i: %r params: %r' % (i, opdef, params))
        raise
      end_points[end_point] = net
      # NOTE: this rebinds the `scope` argument; from here on it holds the
      # directory part of the output tensor's name, not the variable scope.
      scope = os.path.dirname(net.name)
      scopes[scope] = end_point
      if final_endpoint is not None and end_point == final_endpoint:
        break

    # Add all tensors that end with 'output' to
    # endpoints
    for t in net.graph.get_operations():
      scope = os.path.dirname(t.name)
      bn = os.path.basename(t.name)
      if scope in scopes and t.name.endswith('output'):
        end_points[scopes[scope] + '/' + bn] = t.outputs[0]
    return net, end_points
 262
 263
 264 @contextlib.contextmanager
 265 def _scope_all(scope, default_scope=None):
 266   with tf.variable_scope(scope, default_name=default_scope) as s,\
 267        tf.name_scope(s.original_name_scope):
 268     yield s
 269
 270
 271 @slim.add_arg_scope
 272 def mobilenet(inputs,
 273               num_classes=1001,
 274               prediction_fn=slim.softmax,
 275               reuse=None,
 276               scope='Mobilenet',
 277               base_only=False,
 278               **mobilenet_args):
 279   """Mobilenet model for classification, supports both V1 and V2.
 280
 281   Note: default mode is inference, use mobilenet.training_scope to create
 282   training network.
 283
 284
 285   Args:
 286     inputs: a tensor of shape [batch_size, height, width, channels].
 287     num_classes: number of predicted classes. If 0 or None, the logits layer
 288       is omitted and the input features to the logits layer (before dropout)
 289       are returned instead.
 290     prediction_fn: a function to get predictions out of logits
 291       (default softmax).
 292     reuse: whether or not the network and its variables should be reused. To be
 293       able to reuse 'scope' must be given.
 294     scope: Optional variable_scope.
 295     base_only: if True will only create the base of the network (no pooling
 296     and no logits).
 297     **mobilenet_args: passed to mobilenet_base verbatim.
 298       - conv_defs: list of conv defs
 299       - multiplier: Float multiplier for the depth (number of channels)
 300       for all convolution ops. The value must be greater than zero. Typical
 301       usage will be to set this value in (0, 1) to reduce the number of
 302       parameters or computation cost of the model.
 303       - output_stride: will ensure that the last layer has at most total stride.
 304       If the architecture calls for more stride than that provided
 305       (e.g. output_stride=16, but the architecture has 5 stride=2 operators),
 306       it will replace output_stride with fractional convolutions using Atrous
 307       Convolutions.
 308
 309   Returns:
 310     logits: the pre-softmax activations, a tensor of size
 311       [batch_size, num_classes]
 312     end_points: a dictionary from components of the network to the corresponding
 313       activation tensor.
 314
 315   Raises:
 316     ValueError: Input rank is invalid.
 317   """
 318   is_training = mobilenet_args.get('is_training', False)
 319   input_shape = inputs.get_shape().as_list()
 320   if len(input_shape) != 4:
 321     raise ValueError('Expected rank 4 input, was: %d' % len(input_shape))
 322
 323   with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as scope:
 324     inputs = tf.identity(inputs, 'input')
 325     net, end_points = mobilenet_base(inputs, scope=scope, **mobilenet_args)
 326     if base_only:
 327       return net, end_points
 328
 329     net = tf.identity(net, name='embedding')
 330
 331     with tf.variable_scope('Logits'):
 332       net = global_pool(net)
 333       end_points['global_pool'] = net
 334       if not num_classes:
 335         return net, end_points
 336       net = slim.dropout(net, scope='Dropout', is_training=is_training)
 337       # 1 x 1 x num_classes
 338       # Note: legacy scope name.
 339       logits = slim.conv2d(
 340           net,
 341           num_classes, [1, 1],
 342           activation_fn=None,
 343           normalizer_fn=None,
 344           biases_initializer=tf.zeros_initializer(),
 345           scope='Conv2d_1c_1x1')
 346
 347       logits = tf.squeeze(logits, [1, 2])
 348
 349       logits = tf.identity(logits, name='output')
 350     end_points['Logits'] = logits
 351     if prediction_fn:
 352       end_points['Predictions'] = prediction_fn(logits, 'Predictions')
 353   return logits, end_points
 354
 355
 356 def global_pool(input_tensor, pool_op=tf.nn.avg_pool):
 357   """Applies avg pool to produce 1x1 output.
 358
 359   NOTE: This function is funcitonally equivalenet to reduce_mean, but it has
 360   baked in average pool which has better support across hardware.
 361
 362   Args:
 363     input_tensor: input tensor
 364     pool_op: pooling op (avg pool is default)
 365   Returns:
 366     a tensor batch_size x 1 x 1 x depth.
 367   """
 368   shape = input_tensor.get_shape().as_list()
 369   if shape[1] is None or shape[2] is None:
 370     kernel_size = tf.convert_to_tensor(
 371         [1, tf.shape(input_tensor)[1],
 372          tf.shape(input_tensor)[2], 1])
 373   else:
 374     kernel_size = [1, shape[1], shape[2], 1]
 375   output = pool_op(
 376       input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID')
 377   # Recover output shape, for unknown shape.
 378   output.set_shape([None, 1, 1, None])
 379   return output
 380
 381
 382 def training_scope(is_training=True,
 383                    weight_decay=0.00004,
 384                    stddev=0.09,
 385                    dropout_keep_prob=0.8,
 386                    bn_decay=0.997):
 387   """Defines Mobilenet training scope.
 388
 389   Usage:
 390      with tf.contrib.slim.arg_scope(mobilenet.training_scope()):
 391        logits, endpoints = mobilenet_v2.mobilenet(input_tensor)
 392
 393      # the network created will be trainble with dropout/batch norm
 394      # initialized appropriately.
 395   Args:
 396     is_training: if set to False this will ensure that all customizations are
 397     set to non-training mode. This might be helpful for code that is reused
 398     across both training/evaluation, but most of the time training_scope with
 399     value False is not needed.
 400
 401     weight_decay: The weight decay to use for regularizing the model.
 402     stddev: Standard deviation for initialization, if negative uses xavier.
 403     dropout_keep_prob: dropout keep probability
 404     bn_decay: decay for the batch norm moving averages.
 405
 406   Returns:
 407     An argument scope to use via arg_scope.
 408   """
 409   # Note: do not introduce parameters that would change the inference
 410   # model here (for example whether to use bias), modify conv_def instead.
 411   batch_norm_params = {
 412       'is_training': is_training,
 413       'decay': bn_decay,
 414   }
 415
 416   if stddev < 0:
 417     weight_intitializer = slim.initializers.xavier_initializer()
 418   else:
 419     weight_intitializer = tf.truncated_normal_initializer(stddev=stddev)
 420
 421   # Set weight_decay for weights in Conv and FC layers.
 422   with slim.arg_scope(
 423       [slim.conv2d, slim.fully_connected, slim.separable_conv2d],
 424       weights_initializer=weight_intitializer,
 425       normalizer_fn=slim.batch_norm), \
 426       slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training),\
 427       slim.arg_scope([slim.batch_norm], **batch_norm_params), \
 428       slim.arg_scope([slim.dropout], is_training=is_training,
 429                      keep_prob=dropout_keep_prob), \
 430       slim.arg_scope([slim.conv2d], \
 431                      weights_regularizer=slim.l2_regularizer(weight_decay)), \
 432       slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s:
 433     return s