1 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # =============================================================================
17 MobileNet is a general architecture and can be used for multiple use cases.
18 Depending on the use case, it can use different input layer size and different
19 head (for example: embeddings, localization and classification).
21 As described in https://arxiv.org/abs/1704.04861.
23 MobileNets: Efficient Convolutional Neural Networks for
24 Mobile Vision Applications
25 Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang,
26 Tobias Weyand, Marco Andreetto, Hartwig Adam
28 100% Mobilenet V1 (base) with input size 224x224:
31 --------------------------------------------------------------------------------
32 MobilenetV1/Conv2d_0/Conv2D: 864 10,838,016
33 MobilenetV1/Conv2d_1_depthwise/depthwise: 288 3,612,672
34 MobilenetV1/Conv2d_1_pointwise/Conv2D: 2,048 25,690,112
35 MobilenetV1/Conv2d_2_depthwise/depthwise: 576 1,806,336
36 MobilenetV1/Conv2d_2_pointwise/Conv2D: 8,192 25,690,112
37 MobilenetV1/Conv2d_3_depthwise/depthwise: 1,152 3,612,672
38 MobilenetV1/Conv2d_3_pointwise/Conv2D: 16,384 51,380,224
39 MobilenetV1/Conv2d_4_depthwise/depthwise: 1,152 903,168
40 MobilenetV1/Conv2d_4_pointwise/Conv2D: 32,768 25,690,112
41 MobilenetV1/Conv2d_5_depthwise/depthwise: 2,304 1,806,336
42 MobilenetV1/Conv2d_5_pointwise/Conv2D: 65,536 51,380,224
43 MobilenetV1/Conv2d_6_depthwise/depthwise: 2,304 451,584
44 MobilenetV1/Conv2d_6_pointwise/Conv2D: 131,072 25,690,112
45 MobilenetV1/Conv2d_7_depthwise/depthwise: 4,608 903,168
46 MobilenetV1/Conv2d_7_pointwise/Conv2D: 262,144 51,380,224
47 MobilenetV1/Conv2d_8_depthwise/depthwise: 4,608 903,168
48 MobilenetV1/Conv2d_8_pointwise/Conv2D: 262,144 51,380,224
49 MobilenetV1/Conv2d_9_depthwise/depthwise: 4,608 903,168
50 MobilenetV1/Conv2d_9_pointwise/Conv2D: 262,144 51,380,224
51 MobilenetV1/Conv2d_10_depthwise/depthwise: 4,608 903,168
52 MobilenetV1/Conv2d_10_pointwise/Conv2D: 262,144 51,380,224
53 MobilenetV1/Conv2d_11_depthwise/depthwise: 4,608 903,168
54 MobilenetV1/Conv2d_11_pointwise/Conv2D: 262,144 51,380,224
55 MobilenetV1/Conv2d_12_depthwise/depthwise: 4,608 225,792
56 MobilenetV1/Conv2d_12_pointwise/Conv2D: 524,288 25,690,112
57 MobilenetV1/Conv2d_13_depthwise/depthwise: 9,216 451,584
58 MobilenetV1/Conv2d_13_pointwise/Conv2D: 1,048,576 51,380,224
59 --------------------------------------------------------------------------------
60 Total: 3,185,088 567,716,352
63 75% Mobilenet V1 (base) with input size 128x128:
66 --------------------------------------------------------------------------------
67 MobilenetV1/Conv2d_0/Conv2D: 648 2,654,208
68 MobilenetV1/Conv2d_1_depthwise/depthwise: 216 884,736
69 MobilenetV1/Conv2d_1_pointwise/Conv2D: 1,152 4,718,592
70 MobilenetV1/Conv2d_2_depthwise/depthwise: 432 442,368
71 MobilenetV1/Conv2d_2_pointwise/Conv2D: 4,608 4,718,592
72 MobilenetV1/Conv2d_3_depthwise/depthwise: 864 884,736
73 MobilenetV1/Conv2d_3_pointwise/Conv2D: 9,216 9,437,184
74 MobilenetV1/Conv2d_4_depthwise/depthwise: 864 221,184
75 MobilenetV1/Conv2d_4_pointwise/Conv2D: 18,432 4,718,592
76 MobilenetV1/Conv2d_5_depthwise/depthwise: 1,728 442,368
77 MobilenetV1/Conv2d_5_pointwise/Conv2D: 36,864 9,437,184
78 MobilenetV1/Conv2d_6_depthwise/depthwise: 1,728 110,592
79 MobilenetV1/Conv2d_6_pointwise/Conv2D: 73,728 4,718,592
80 MobilenetV1/Conv2d_7_depthwise/depthwise: 3,456 221,184
81 MobilenetV1/Conv2d_7_pointwise/Conv2D: 147,456 9,437,184
82 MobilenetV1/Conv2d_8_depthwise/depthwise: 3,456 221,184
83 MobilenetV1/Conv2d_8_pointwise/Conv2D: 147,456 9,437,184
84 MobilenetV1/Conv2d_9_depthwise/depthwise: 3,456 221,184
85 MobilenetV1/Conv2d_9_pointwise/Conv2D: 147,456 9,437,184
86 MobilenetV1/Conv2d_10_depthwise/depthwise: 3,456 221,184
87 MobilenetV1/Conv2d_10_pointwise/Conv2D: 147,456 9,437,184
88 MobilenetV1/Conv2d_11_depthwise/depthwise: 3,456 221,184
89 MobilenetV1/Conv2d_11_pointwise/Conv2D: 147,456 9,437,184
90 MobilenetV1/Conv2d_12_depthwise/depthwise: 3,456 55,296
91 MobilenetV1/Conv2d_12_pointwise/Conv2D: 294,912 4,718,592
92 MobilenetV1/Conv2d_13_depthwise/depthwise: 6,912 110,592
93 MobilenetV1/Conv2d_13_pointwise/Conv2D: 589,824 9,437,184
94 --------------------------------------------------------------------------------
95 Total: 1,800,144 106,002,432
99 # Tensorflow mandates these.
100 from __future__ import absolute_import
101 from __future__ import division
102 from __future__ import print_function
104 from collections import namedtuple
106 import tensorflow as tf
108 slim = tf.contrib.slim
# Conv and DepthSepConv namedtuples define the layers of the MobileNet
# architecture.
#   Conv: a regular convolution layer (3x3 in the default body below).
#   DepthSepConv: a depthwise convolution followed by a 1x1 (pointwise)
#     convolution.
# Fields shared by both:
#   kernel: spatial size of the convolution kernel.
#   stride: the stride of the convolution.
#   depth: the number of channels or filters in a layer.
Conv = namedtuple('Conv', ['kernel', 'stride', 'depth'])
DepthSepConv = namedtuple('DepthSepConv', ['kernel', 'stride', 'depth'])

# _CONV_DEFS specifies the MobileNet body: one entry per layer, in order.
# The five stride-2 layers give a total spatial reduction of 32.
_CONV_DEFS = [
    Conv(kernel=[3, 3], stride=2, depth=32),
    DepthSepConv(kernel=[3, 3], stride=1, depth=64),
    DepthSepConv(kernel=[3, 3], stride=2, depth=128),
    DepthSepConv(kernel=[3, 3], stride=1, depth=128),
    DepthSepConv(kernel=[3, 3], stride=2, depth=256),
    DepthSepConv(kernel=[3, 3], stride=1, depth=256),
    DepthSepConv(kernel=[3, 3], stride=2, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=2, depth=1024),
    DepthSepConv(kernel=[3, 3], stride=1, depth=1024),
]
def mobilenet_v1_base(inputs,
                      final_endpoint='Conv2d_13_pointwise',
                      min_depth=8,
                      depth_multiplier=1.0,
                      conv_defs=None,
                      output_stride=None,
                      scope=None):
  """Mobilenet v1.

  Constructs a Mobilenet v1 network from inputs to the given final endpoint.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    final_endpoint: specifies the endpoint to construct the network up to. It
      can be one of ['Conv2d_0', 'Conv2d_1_pointwise', 'Conv2d_2_pointwise',
      'Conv2d_3_pointwise', 'Conv2d_4_pointwise', 'Conv2d_5_pointwise',
      'Conv2d_6_pointwise', 'Conv2d_7_pointwise', 'Conv2d_8_pointwise',
      'Conv2d_9_pointwise', 'Conv2d_10_pointwise', 'Conv2d_11_pointwise',
      'Conv2d_12_pointwise', 'Conv2d_13_pointwise']. The intermediate
      'Conv2d_N_depthwise' endpoints are recorded too and are also accepted.
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    conv_defs: A list of Conv / DepthSepConv namedtuples specifying the net
      architecture. Defaults to _CONV_DEFS when None.
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous convolution
      if necessary to prevent the network from reducing the spatial resolution
      of the activation maps. Allowed values are 8 (accurate fully convolutional
      mode), 16 (fast fully convolutional mode), 32 (classification mode).
    scope: Optional variable_scope.

  Returns:
    tensor_out: output tensor corresponding to the final_endpoint.
    end_points: a dictionary mapping endpoint names to activations, for
      external use, for example summaries or losses.

  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values,
      or depth_multiplier <= 0, or the target output_stride is not
      allowed, or conv_defs contains an entry that is neither a Conv nor a
      DepthSepConv.
  """
  # Validate arguments before any graph construction takes place.
  if depth_multiplier <= 0:
    raise ValueError('depth_multiplier is not greater than zero.')

  if conv_defs is None:
    conv_defs = _CONV_DEFS

  if output_stride is not None and output_stride not in [8, 16, 32]:
    raise ValueError('Only allowed output_stride values are 8, 16, 32.')

  def depth(d):
    """Returns the thinned depth for a layer whose nominal depth is `d`."""
    return max(int(d * depth_multiplier), min_depth)

  end_points = {}

  with tf.variable_scope(scope, 'MobilenetV1', [inputs]):
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d], padding='SAME'):
      # The current_stride variable keeps track of the output stride of the
      # activations, i.e., the running product of convolution strides up to the
      # current network layer. This allows us to invoke atrous convolution
      # whenever applying the next convolution would result in the activations
      # having output stride larger than the target output_stride.
      current_stride = 1

      # The atrous convolution rate parameter.
      rate = 1

      net = inputs
      for i, conv_def in enumerate(conv_defs):
        end_point_base = 'Conv2d_%d' % i

        if output_stride is not None and current_stride == output_stride:
          # If we have reached the target output_stride, then we need to employ
          # atrous convolution with stride=1 and multiply the atrous rate by the
          # current unit's stride for use in subsequent layers.
          layer_stride = 1
          layer_rate = rate
          rate *= conv_def.stride
        else:
          layer_stride = conv_def.stride
          layer_rate = 1
          current_stride *= conv_def.stride

        if isinstance(conv_def, Conv):
          end_point = end_point_base
          # NOTE(review): this branch applies conv_def.stride directly, so
          # layer_stride / layer_rate only influence DepthSepConv layers.
          net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel,
                            stride=conv_def.stride,
                            normalizer_fn=slim.batch_norm,
                            scope=end_point)
          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points

        elif isinstance(conv_def, DepthSepConv):
          end_point = end_point_base + '_depthwise'

          # By passing filters=None
          # separable_conv2d produces only a depthwise convolution layer
          net = slim.separable_conv2d(net, None, conv_def.kernel,
                                      depth_multiplier=1,
                                      stride=layer_stride,
                                      rate=layer_rate,
                                      normalizer_fn=slim.batch_norm,
                                      scope=end_point)

          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points

          end_point = end_point_base + '_pointwise'

          net = slim.conv2d(net, depth(conv_def.depth), [1, 1],
                            stride=1,
                            normalizer_fn=slim.batch_norm,
                            scope=end_point)

          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points
        else:
          # The layer namedtuples have no `ltype` field, so report the type
          # name of the offending entry instead.
          raise ValueError('Unknown convolution type %s for layer %d'
                           % (type(conv_def).__name__, i))
  # Every layer was built without matching final_endpoint.
  raise ValueError('Unknown final endpoint %s' % final_endpoint)
def mobilenet_v1(inputs,
                 num_classes=1000,
                 dropout_keep_prob=0.999,
                 is_training=True,
                 min_depth=8,
                 depth_multiplier=1.0,
                 conv_defs=None,
                 prediction_fn=tf.contrib.layers.softmax,
                 spatial_squeeze=True,
                 reuse=None,
                 scope='MobilenetV1'):
  """Mobilenet v1 model for classification.

  Builds the convolutional body with mobilenet_v1_base and appends the
  classification head: average pooling, dropout and a 1x1 convolution that
  produces the logits.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    dropout_keep_prob: the percentage of activation values that are retained.
    is_training: whether is training or not.
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    conv_defs: A list of Conv / DepthSepConv namedtuples specifying the net
      architecture.
    prediction_fn: a function to get predictions out of logits. If falsy, no
      'Predictions' end point is added.
    spatial_squeeze: if True, logits is of shape [B, C], if False logits is
      of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    logits: the pre-softmax activations, a tensor of size
      [batch_size, num_classes]
    end_points: a dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: Input rank is invalid.
  """
  input_shape = inputs.get_shape().as_list()
  if len(input_shape) != 4:
    raise ValueError('Invalid input tensor rank, expected 4, was: %d' %
                     len(input_shape))

  with tf.variable_scope(scope, 'MobilenetV1', [inputs, num_classes],
                         reuse=reuse) as scope:
    with slim.arg_scope([slim.batch_norm, slim.dropout],
                        is_training=is_training):
      net, end_points = mobilenet_v1_base(inputs, scope=scope,
                                          min_depth=min_depth,
                                          depth_multiplier=depth_multiplier,
                                          conv_defs=conv_defs)
      with tf.variable_scope('Logits'):
        # Shrink the 7x7 pooling window when the feature map is smaller.
        kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7])
        net = slim.avg_pool2d(net, kernel_size, padding='VALID',
                              scope='AvgPool_1a')
        end_points['AvgPool_1a'] = net
        net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
        # A 1x1 convolution without activation or normalization acts as the
        # final classifier layer.
        logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                             normalizer_fn=None, scope='Conv2d_1c_1x1')
        if spatial_squeeze:
          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
      end_points['Logits'] = logits
      if prediction_fn:
        end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
  return logits, end_points

mobilenet_v1.default_image_size = 224
def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
  """Define kernel size which is automatically reduced for small input.

  If the shape of the input images is unknown at graph construction time this
  function assumes that the input images are large enough.

  Args:
    input_tensor: input tensor of size [batch_size, height, width, channels].
    kernel_size: desired kernel size of length 2: [kernel_height, kernel_width]

  Returns:
    The kernel size to use, of length 2. This is `kernel_size` itself when the
    height or width of `input_tensor` is not statically known, otherwise a new
    list with each entry clipped to the corresponding input dimension.
  """
  shape = input_tensor.get_shape().as_list()
  height, width = shape[1], shape[2]
  # Without a static spatial shape there is nothing to clip against.
  if height is None or width is None:
    return kernel_size
  return [min(height, kernel_size[0]), min(width, kernel_size[1])]
360 def mobilenet_v1_arg_scope(is_training=True,
361 weight_decay=0.00004,
363 regularize_depthwise=False):
364 """Defines the default MobilenetV1 arg scope.
367 is_training: Whether or not we're training the model.
368 weight_decay: The weight decay to use for regularizing the model.
369 stddev: The standard deviation of the truncated normal weight initializer.
370 regularize_depthwise: Whether or not apply regularization on depthwise.
373 An `arg_scope` to use for the mobilenet v1 model.
375 batch_norm_params = {
376 'is_training': is_training,
383 # Set weight_decay for weights in Conv and DepthSepConv layers.
384 weights_init = tf.truncated_normal_initializer(stddev=stddev)
385 regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
386 if regularize_depthwise:
387 depthwise_regularizer = regularizer
389 depthwise_regularizer = None
390 with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
391 weights_initializer=weights_init,
392 activation_fn=tf.nn.relu6, normalizer_fn=slim.batch_norm):
393 with slim.arg_scope([slim.batch_norm], **batch_norm_params):
394 with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer):
395 with slim.arg_scope([slim.separable_conv2d],
396 weights_regularizer=depthwise_regularizer) as sc: