1 # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ==============================================================================
15 """Mobilenet Base Class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import contextlib
import copy
import os

import tensorflow as tf
# Module-wide shorthand for the TF-Slim API.
slim = tf.contrib.slim
def apply_activation(x, name=None, activation_fn=None):
  """Applies `activation_fn` to `x`, or passes `x` through unchanged.

  Args:
    x: value handed to the activation function.
    name: forwarded to `activation_fn` as the `name` keyword argument.
    activation_fn: callable invoked as `activation_fn(x, name=name)`. When it
      is falsy (e.g. None) no activation is applied.

  Returns:
    `activation_fn(x, name=name)`, or `x` itself when `activation_fn` is falsy.
  """
  if not activation_fn:
    return x
  return activation_fn(x, name=name)
def _fixed_padding(inputs, kernel_size, rate=1):
  """Pads the input along the spatial dimensions independently of input size.

  Pads the input such that if it was used in a convolution with 'VALID' padding,
  the output would have the same dimensions as if the unpadded input was used
  in a convolution with 'SAME' padding.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    kernel_size: The kernel to be used in the conv2d or max_pool2d operation,
      indexed as [kernel_height, kernel_width].
    rate: An integer, rate for atrous convolution.

  Returns:
    output: A tensor of size [batch, height_out, width_out, channels] with the
      input, either intact (if kernel_size == 1) or padded (if kernel_size > 1).
  """
  # Extent covered by a dilated kernel along one axis: k + (k - 1) * (rate - 1).
  # Height uses kernel_size[0] and width uses kernel_size[1], so that
  # non-square kernels are padded correctly along both axes.
  kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1),
                           kernel_size[1] + (kernel_size[1] - 1) * (rate - 1)]
  pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1]
  # When the total padding is odd, the extra row/column goes at the end.
  pad_beg = [pad_total[0] // 2, pad_total[1] // 2]
  pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]]
  # Batch and channel dimensions are never padded.
  padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]],
                                  [pad_beg[1], pad_end[1]], [0, 0]])
  return padded_inputs
62 def _make_divisible(v, divisor, min_value=None):
65 new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
66 # Make sure that round down does not go down by more than 10%.
72 @contextlib.contextmanager
73 def _set_arg_scope_defaults(defaults):
74 """Sets arg scope defaults for all items present in defaults.
77 defaults: dictionary/list of pairs, containing a mapping from
78 function to a dictionary of default args.
81 context manager where all defaults are set.
83 if hasattr(defaults, 'items'):
84 items = defaults.items()
90 func, default_arg = items[0]
91 with slim.arg_scope(func, **default_arg):
92 with _set_arg_scope_defaults(items[1:]):
# NOTE(review): only `output_params` is visible in the original signature. The
# remaining parameters (and the defaults of 8) are reconstructed from the names
# the body uses -- confirm against callers.
def depth_multiplier(output_params,
                     multiplier,
                     divisible_by=8,
                     min_depth=8,
                     **unused_kwargs):
  """Scales `output_params['num_outputs']` by `multiplier`, in place.

  Args:
    output_params: dict of op parameters. It is left untouched when it has no
      'num_outputs' entry.
    multiplier: factor applied to the current 'num_outputs' value.
    divisible_by: passed to `_make_divisible` as the divisor.
    min_depth: passed to `_make_divisible` as the lower bound.
    **unused_kwargs: accepted and ignored.

  Returns:
    None. The result is written back into `output_params`.
  """
  if 'num_outputs' not in output_params:
    return
  d = output_params['num_outputs']
  output_params['num_outputs'] = _make_divisible(d * multiplier, divisible_by,
                                                 min_depth)
# Description of one layer: the constructing callable, its keyword arguments
# and the function used to apply the depth multiplier to those arguments.
_Op = collections.namedtuple('Op', ['op', 'params', 'multiplier_func'])


def op(opfunc, **params):
  """Builds an `_Op` layer description.

  Args:
    opfunc: callable that constructs the layer.
    **params: keyword arguments stored for `opfunc`. The optional key
      'multiplier_transform' is removed from `params` and used as the layer's
      multiplier function in place of `depth_multiplier`. The historical
      misspelling 'multiplier_transorm' is still accepted for backward
      compatibility.

  Returns:
    An `_Op(op, params, multiplier_func)` namedtuple.
  """
  if 'multiplier_transform' in params:
    multiplier = params.pop('multiplier_transform')
    # The correctly spelled key wins. Drop the legacy one so it does not leak
    # into the op's own keyword arguments.
    params.pop('multiplier_transorm', None)
  elif 'multiplier_transorm' in params:
    multiplier = params.pop('multiplier_transorm')
  else:
    multiplier = depth_multiplier
  return _Op(opfunc, params=params, multiplier_func=multiplier)
# Registered with slim's arg_scope machinery: `training_scope` issues
# `slim.arg_scope([mobilenet_base, mobilenet], is_training=...)`, which only
# works for functions decorated with `add_arg_scope`.
@slim.add_arg_scope
def mobilenet_base(  # pylint: disable=invalid-name
    inputs,
    conv_defs,
    multiplier=1.0,
    final_endpoint=None,
    output_stride=None,
    use_explicit_padding=False,
    scope=None,
    is_training=False):
  """Mobilenet base network.

  Constructs a network from inputs to the given final endpoint. By default
  the network is constructed in inference mode. To create network
  in training mode use:

  with slim.arg_scope(mobilenet.training_scope()):
     logits, endpoints = mobilenet_base(...)

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    conv_defs: A dict describing the net architecture. 'spec' holds the list
      of op(...) layers; the optional 'defaults' and 'overrides' entries map
      functions to arg-scope keyword arguments.
    multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    final_endpoint: The name of last layer, for early termination. For
      V1-based networks the last layer is "layer_14", for V2: "layer_20".
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous convolution
      if necessary to prevent the network from reducing the spatial resolution
      of the activation maps. Allowed values are 1 or any even number, excluding
      zero. Typical values are 8 (accurate fully convolutional mode), 16
      (fast fully convolutional mode), and 32 (classification mode).

      NOTE- output_stride relies on all consequent operators to support dilated
      operators via "rate" parameter. This might require wrapping non-conv
      operators to operate properly.

    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
      inputs so that the output dimensions are the same as if 'SAME' padding
      had been used instead.
    scope: optional variable scope.
    is_training: How to setup batch_norm and other ops. Note: most of the time
      this does not need be set directly. Use mobilenet.training_scope() to set
      up training instead. This parameter is here for backward compatibility
      only. It is safe to set it to the value matching
      training_scope(is_training=...). It is also safe to explicitly set
      it to False, even if there is outer training_scope set to training.
      (The network will be built in inference mode).

  Returns:
    tensor_out: output tensor.
    end_points: a set of activations for external use, for example summaries or
      losses.

  Raises:
    ValueError: multiplier <= 0, or the target output_stride is not
      allowed.
  """
  if multiplier <= 0:
    raise ValueError('multiplier is not greater than zero.')

  # Set conv defs defaults and overrides.
  conv_defs_defaults = conv_defs.get('defaults', {})
  conv_defs_overrides = conv_defs.get('overrides', {})
  if use_explicit_padding:
    # Copy first so that the caller's conv_defs is not mutated.
    conv_defs_overrides = copy.deepcopy(conv_defs_overrides)
    conv_defs_overrides[
        (slim.conv2d, slim.separable_conv2d)] = {'padding': 'VALID'}

  if output_stride is not None:
    if output_stride == 0 or (output_stride > 1 and output_stride % 2):
      raise ValueError('Output stride must be None, 1 or a multiple of 2.')

  # a) Set the tensorflow scope
  # b) set padding to default: note we might consider removing this
  # since it is also set by mobilenet_scope
  # c) set all defaults
  # d) set all extra overrides.
  with _scope_all(scope, default_scope='Mobilenet'), \
      slim.arg_scope([slim.batch_norm], is_training=is_training), \
      _set_arg_scope_defaults(conv_defs_defaults), \
      _set_arg_scope_defaults(conv_defs_overrides):
    # The current_stride variable keeps track of the output stride of the
    # activations, i.e., the running product of convolution strides up to the
    # current network layer. This allows us to invoke atrous convolution
    # whenever applying the next convolution would result in the activations
    # having output stride larger than the target output_stride.
    current_stride = 1

    # The atrous convolution rate parameter.
    rate = 1

    net = inputs
    # Insert default parameters before the base scope which includes
    # any custom overrides set in mobilenet.
    end_points = {}
    # Maps the name scope of each layer's output to its end point name.
    scopes = {}
    for i, opdef in enumerate(conv_defs['spec']):
      # Work on a copy: the multiplier function edits the params in place.
      params = dict(opdef.params)
      opdef.multiplier_func(params, multiplier)
      stride = params.get('stride', 1)
      if output_stride is not None and current_stride == output_stride:
        # If we have reached the target output_stride, then we need to employ
        # atrous convolution with stride=1 and multiply the atrous rate by the
        # current unit's stride for use in subsequent layers.
        layer_stride = 1
        layer_rate = rate
        rate *= stride
      else:
        layer_stride = stride
        layer_rate = 1
        current_stride *= stride
      # Update params.
      params['stride'] = layer_stride
      # Only insert rate to params if rate > 1.
      if layer_rate > 1:
        params['rate'] = layer_rate
      # Set padding
      if use_explicit_padding:
        if 'kernel_size' in params:
          net = _fixed_padding(net, params['kernel_size'], layer_rate)
        else:
          # No kernel size at this level: let the op do its own padding.
          params['use_explicit_padding'] = True

      end_point = 'layer_%d' % (i + 1)
      try:
        net = opdef.op(net, **params)
      except Exception:
        # Report which layer failed, then let the original error propagate.
        print('Failed to create op %i: %r params: %r' % (i, opdef, params))
        raise
      end_points[end_point] = net
      op_scope = os.path.dirname(net.name)
      scopes[op_scope] = end_point
      if final_endpoint is not None and end_point == final_endpoint:
        break

    # Add all tensors that end with 'output' to
    # endpoints
    for t in net.graph.get_operations():
      op_scope = os.path.dirname(t.name)
      bn = os.path.basename(t.name)
      if op_scope in scopes and t.name.endswith('output'):
        end_points[scopes[op_scope] + '/' + bn] = t.outputs[0]
    return net, end_points
@contextlib.contextmanager
def _scope_all(scope, default_scope=None):
  """Enters a variable scope together with its original name scope.

  Args:
    scope: passed to `tf.variable_scope` as the scope.
    default_scope: passed to `tf.variable_scope` as `default_name`.

  Yields:
    The variable scope that was entered.
  """
  with tf.variable_scope(scope, default_name=default_scope) as s,\
       tf.name_scope(s.original_name_scope):
    yield s
# Registered with slim's arg_scope machinery: `training_scope` issues
# `slim.arg_scope([mobilenet_base, mobilenet], is_training=...)`, which only
# works for functions decorated with `add_arg_scope`.
@slim.add_arg_scope
def mobilenet(inputs,
              num_classes=1001,
              prediction_fn=slim.softmax,
              reuse=None,
              scope='Mobilenet',
              base_only=False,
              **mobilenet_args):
  """Mobilenet model for classification, supports both V1 and V2.

  Note: default mode is inference, use mobilenet.training_scope to create
  training network.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer
      is omitted and the input features to the logits layer (before dropout)
      are returned instead.
    prediction_fn: a function to get predictions out of logits
      (default softmax).
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.
    base_only: if True will only create the base of the network (no pooling
      and no logits).
    **mobilenet_args: passed to mobilenet_base verbatim.
      - conv_defs: list of conv defs
      - multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
      - output_stride: will ensure that the last layer has at most total stride.
      If the architecture calls for more stride than that provided
      (e.g. output_stride=16, but the architecture has 5 stride=2 operators),
      it will replace output_stride with fractional convolutions using Atrous
      Convolutions.

  Returns:
    logits: the pre-softmax activations, a tensor of size
      [batch_size, num_classes]
    end_points: a dictionary from components of the network to the corresponding
      activation tensor.

  Raises:
    ValueError: Input rank is invalid.
  """
  is_training = mobilenet_args.get('is_training', False)
  input_shape = inputs.get_shape().as_list()
  if len(input_shape) != 4:
    raise ValueError('Expected rank 4 input, was: %d' % len(input_shape))

  with tf.variable_scope(scope, 'Mobilenet', reuse=reuse) as var_scope:
    inputs = tf.identity(inputs, 'input')
    net, end_points = mobilenet_base(inputs, scope=var_scope, **mobilenet_args)
    if base_only:
      return net, end_points

    net = tf.identity(net, name='embedding')

    with tf.variable_scope('Logits'):
      net = global_pool(net)
      end_points['global_pool'] = net
      if not num_classes:
        # No classifier requested: hand back the pooled features.
        return net, end_points
      net = slim.dropout(net, scope='Dropout', is_training=is_training)
      # 1 x 1 x num_classes
      # Note: legacy scope name.
      logits = slim.conv2d(
          net,
          num_classes, [1, 1],
          activation_fn=None,
          normalizer_fn=None,
          biases_initializer=tf.zeros_initializer(),
          scope='Conv2d_1c_1x1')

      # Drop the two singleton spatial dimensions left by global pooling.
      logits = tf.squeeze(logits, [1, 2])

      logits = tf.identity(logits, name='output')
    end_points['Logits'] = logits
    if prediction_fn:
      end_points['Predictions'] = prediction_fn(logits, 'Predictions')
  return logits, end_points
def global_pool(input_tensor, pool_op=tf.nn.avg_pool):
  """Applies avg pool to produce 1x1 output.

  NOTE: This function is functionally equivalent to reduce_mean, but it has
  baked in average pool which has better support across hardware.

  Args:
    input_tensor: input tensor
    pool_op: pooling op (avg pool is default)

  Returns:
    a tensor batch_size x 1 x 1 x depth.
  """
  shape = input_tensor.get_shape().as_list()
  if shape[1] is None or shape[2] is None:
    # The spatial size is not known statically, so take the pooling window
    # from the runtime shape.
    kernel_size = tf.convert_to_tensor(
        [1, tf.shape(input_tensor)[1],
         tf.shape(input_tensor)[2], 1])
  else:
    kernel_size = [1, shape[1], shape[2], 1]
  output = pool_op(
      input_tensor, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID')
  # Recover output shape, for unknown shape.
  output.set_shape([None, 1, 1, None])
  return output
382 def training_scope(is_training=True,
383 weight_decay=0.00004,
385 dropout_keep_prob=0.8,
387 """Defines Mobilenet training scope.
390 with tf.contrib.slim.arg_scope(mobilenet.training_scope()):
391 logits, endpoints = mobilenet_v2.mobilenet(input_tensor)
     # the network created will be trainable with dropout/batch norm
394 # initialized appropriately.
396 is_training: if set to False this will ensure that all customizations are
397 set to non-training mode. This might be helpful for code that is reused
398 across both training/evaluation, but most of the time training_scope with
399 value False is not needed.
401 weight_decay: The weight decay to use for regularizing the model.
402 stddev: Standard deviation for initialization, if negative uses xavier.
403 dropout_keep_prob: dropout keep probability
404 bn_decay: decay for the batch norm moving averages.
407 An argument scope to use via arg_scope.
409 # Note: do not introduce parameters that would change the inference
410 # model here (for example whether to use bias), modify conv_def instead.
411 batch_norm_params = {
412 'is_training': is_training,
417 weight_intitializer = slim.initializers.xavier_initializer()
419 weight_intitializer = tf.truncated_normal_initializer(stddev=stddev)
421 # Set weight_decay for weights in Conv and FC layers.
423 [slim.conv2d, slim.fully_connected, slim.separable_conv2d],
424 weights_initializer=weight_intitializer,
425 normalizer_fn=slim.batch_norm), \
426 slim.arg_scope([mobilenet_base, mobilenet], is_training=is_training),\
427 slim.arg_scope([slim.batch_norm], **batch_norm_params), \
428 slim.arg_scope([slim.dropout], is_training=is_training,
429 keep_prob=dropout_keep_prob), \
430 slim.arg_scope([slim.conv2d], \
431 weights_regularizer=slim.l2_regularizer(weight_decay)), \
432 slim.arg_scope([slim.separable_conv2d], weights_regularizer=None) as s: