--- /dev/null
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""MobileNet v1.
+
+MobileNet is a general architecture and can be used for multiple use cases.
+Depending on the use case, it can use different input layer sizes and
+different heads (for example: embeddings, localization and classification).
+
+As described in https://arxiv.org/abs/1704.04861.
+
+ MobileNets: Efficient Convolutional Neural Networks for
+ Mobile Vision Applications
+ Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang,
+ Tobias Weyand, Marco Andreetto, Hartwig Adam
+
+100% Mobilenet V1 (base) with input size 224x224:
+
+Layer params macs
+--------------------------------------------------------------------------------
+MobilenetV1/Conv2d_0/Conv2D: 864 10,838,016
+MobilenetV1/Conv2d_1_depthwise/depthwise: 288 3,612,672
+MobilenetV1/Conv2d_1_pointwise/Conv2D: 2,048 25,690,112
+MobilenetV1/Conv2d_2_depthwise/depthwise: 576 1,806,336
+MobilenetV1/Conv2d_2_pointwise/Conv2D: 8,192 25,690,112
+MobilenetV1/Conv2d_3_depthwise/depthwise: 1,152 3,612,672
+MobilenetV1/Conv2d_3_pointwise/Conv2D: 16,384 51,380,224
+MobilenetV1/Conv2d_4_depthwise/depthwise: 1,152 903,168
+MobilenetV1/Conv2d_4_pointwise/Conv2D: 32,768 25,690,112
+MobilenetV1/Conv2d_5_depthwise/depthwise: 2,304 1,806,336
+MobilenetV1/Conv2d_5_pointwise/Conv2D: 65,536 51,380,224
+MobilenetV1/Conv2d_6_depthwise/depthwise: 2,304 451,584
+MobilenetV1/Conv2d_6_pointwise/Conv2D: 131,072 25,690,112
+MobilenetV1/Conv2d_7_depthwise/depthwise: 4,608 903,168
+MobilenetV1/Conv2d_7_pointwise/Conv2D: 262,144 51,380,224
+MobilenetV1/Conv2d_8_depthwise/depthwise: 4,608 903,168
+MobilenetV1/Conv2d_8_pointwise/Conv2D: 262,144 51,380,224
+MobilenetV1/Conv2d_9_depthwise/depthwise: 4,608 903,168
+MobilenetV1/Conv2d_9_pointwise/Conv2D: 262,144 51,380,224
+MobilenetV1/Conv2d_10_depthwise/depthwise: 4,608 903,168
+MobilenetV1/Conv2d_10_pointwise/Conv2D: 262,144 51,380,224
+MobilenetV1/Conv2d_11_depthwise/depthwise: 4,608 903,168
+MobilenetV1/Conv2d_11_pointwise/Conv2D: 262,144 51,380,224
+MobilenetV1/Conv2d_12_depthwise/depthwise: 4,608 225,792
+MobilenetV1/Conv2d_12_pointwise/Conv2D: 524,288 25,690,112
+MobilenetV1/Conv2d_13_depthwise/depthwise: 9,216 451,584
+MobilenetV1/Conv2d_13_pointwise/Conv2D: 1,048,576 51,380,224
+--------------------------------------------------------------------------------
+Total: 3,185,088 567,716,352
+
+
+75% Mobilenet V1 (base) with input size 128x128:
+
+Layer params macs
+--------------------------------------------------------------------------------
+MobilenetV1/Conv2d_0/Conv2D: 648 2,654,208
+MobilenetV1/Conv2d_1_depthwise/depthwise: 216 884,736
+MobilenetV1/Conv2d_1_pointwise/Conv2D: 1,152 4,718,592
+MobilenetV1/Conv2d_2_depthwise/depthwise: 432 442,368
+MobilenetV1/Conv2d_2_pointwise/Conv2D: 4,608 4,718,592
+MobilenetV1/Conv2d_3_depthwise/depthwise: 864 884,736
+MobilenetV1/Conv2d_3_pointwise/Conv2D: 9,216 9,437,184
+MobilenetV1/Conv2d_4_depthwise/depthwise: 864 221,184
+MobilenetV1/Conv2d_4_pointwise/Conv2D: 18,432 4,718,592
+MobilenetV1/Conv2d_5_depthwise/depthwise: 1,728 442,368
+MobilenetV1/Conv2d_5_pointwise/Conv2D: 36,864 9,437,184
+MobilenetV1/Conv2d_6_depthwise/depthwise: 1,728 110,592
+MobilenetV1/Conv2d_6_pointwise/Conv2D: 73,728 4,718,592
+MobilenetV1/Conv2d_7_depthwise/depthwise: 3,456 221,184
+MobilenetV1/Conv2d_7_pointwise/Conv2D: 147,456 9,437,184
+MobilenetV1/Conv2d_8_depthwise/depthwise: 3,456 221,184
+MobilenetV1/Conv2d_8_pointwise/Conv2D: 147,456 9,437,184
+MobilenetV1/Conv2d_9_depthwise/depthwise: 3,456 221,184
+MobilenetV1/Conv2d_9_pointwise/Conv2D: 147,456 9,437,184
+MobilenetV1/Conv2d_10_depthwise/depthwise: 3,456 221,184
+MobilenetV1/Conv2d_10_pointwise/Conv2D: 147,456 9,437,184
+MobilenetV1/Conv2d_11_depthwise/depthwise: 3,456 221,184
+MobilenetV1/Conv2d_11_pointwise/Conv2D: 147,456 9,437,184
+MobilenetV1/Conv2d_12_depthwise/depthwise: 3,456 55,296
+MobilenetV1/Conv2d_12_pointwise/Conv2D: 294,912 4,718,592
+MobilenetV1/Conv2d_13_depthwise/depthwise: 6,912 110,592
+MobilenetV1/Conv2d_13_pointwise/Conv2D: 589,824 9,437,184
+--------------------------------------------------------------------------------
+Total: 1,800,144 106,002,432
+
+"""
+
+# Tensorflow mandates these.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+
+import tensorflow as tf
+
+# Short alias for the TF-Slim library; every layer below is built through it.
+slim = tf.contrib.slim
+
+# Conv and DepthSepConv namedtuples describe the layers of the MobileNet
+# architecture.
+# Conv describes a regular convolution layer.
+# DepthSepConv describes a depthwise convolution followed by a 1x1 (pointwise)
+# convolution.
+# kernel is the kernel size handed to the (depthwise) convolution; every entry
+# of _CONV_DEFS below uses [3, 3].
+# stride is the stride of the convolution
+# depth is the number of output channels of the layer before depth_multiplier
+# and min_depth are applied by mobilenet_v1_base.
+Conv = namedtuple('Conv', ['kernel', 'stride', 'depth'])
+DepthSepConv = namedtuple('DepthSepConv', ['kernel', 'stride', 'depth'])
+
+# _CONV_DEFS specifies the MobileNet body: one entry per layer, in network
+# order. The position of an entry in this list is the N used in the
+# 'Conv2d_N...' endpoint names built by mobilenet_v1_base.
+_CONV_DEFS = [
+ Conv(kernel=[3, 3], stride=2, depth=32),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=64),
+ DepthSepConv(kernel=[3, 3], stride=2, depth=128),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=128),
+ DepthSepConv(kernel=[3, 3], stride=2, depth=256),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=256),
+ DepthSepConv(kernel=[3, 3], stride=2, depth=512),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=512),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=512),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=512),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=512),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=512),
+ DepthSepConv(kernel=[3, 3], stride=2, depth=1024),
+ DepthSepConv(kernel=[3, 3], stride=1, depth=1024)
+]
+
+
+def mobilenet_v1_base(inputs,
+                      final_endpoint='Conv2d_13_pointwise',
+                      min_depth=8,
+                      depth_multiplier=1.0,
+                      conv_defs=None,
+                      output_stride=None,
+                      scope=None):
+  """Constructs a Mobilenet v1 network from inputs to the given final endpoint.
+
+  Args:
+    inputs: a tensor of shape [batch_size, height, width, channels].
+    final_endpoint: specifies the endpoint to construct the network up to. It
+      can be one of ['Conv2d_0', 'Conv2d_1_pointwise', 'Conv2d_2_pointwise',
+      'Conv2d_3_pointwise', 'Conv2d_4_pointwise', 'Conv2d_5_pointwise',
+      'Conv2d_6_pointwise', 'Conv2d_7_pointwise', 'Conv2d_8_pointwise',
+      'Conv2d_9_pointwise', 'Conv2d_10_pointwise', 'Conv2d_11_pointwise',
+      'Conv2d_12_pointwise', 'Conv2d_13_pointwise'], or the matching
+      'Conv2d_N_depthwise' endpoint of any depthwise separable layer.
+    min_depth: Minimum depth value (number of channels) for all convolution ops.
+      Every layer is built with max(int(depth * depth_multiplier), min_depth)
+      channels.
+    depth_multiplier: Float multiplier for the depth (number of channels)
+      for all convolution ops. The value must be greater than zero. Typical
+      usage will be to set this value in (0, 1) to reduce the number of
+      parameters or computation cost of the model.
+    conv_defs: A list of Conv / DepthSepConv namedtuples specifying the net
+      architecture. Defaults to _CONV_DEFS when None.
+    output_stride: An integer that specifies the requested ratio of input to
+      output spatial resolution. If not None, then we invoke atrous convolution
+      if necessary to prevent the network from reducing the spatial resolution
+      of the activation maps. Allowed values are 8 (accurate fully convolutional
+      mode), 16 (fast fully convolutional mode), 32 (classification mode).
+    scope: Optional variable_scope.
+
+  Returns:
+    tensor_out: output tensor corresponding to the final_endpoint.
+    end_points: a dict mapping the name of every endpoint built up to and
+      including final_endpoint to its activation, for external use, for
+      example summaries or losses.
+
+  Raises:
+    ValueError: if final_endpoint is not set to one of the predefined values,
+      or depth_multiplier <= 0, or the target output_stride is not
+      allowed, or conv_defs contains an entry that is neither a Conv nor a
+      DepthSepConv.
+  """
+  if depth_multiplier <= 0:
+    raise ValueError('depth_multiplier is not greater than zero.')
+
+  if conv_defs is None:
+    conv_defs = _CONV_DEFS
+
+  if output_stride is not None and output_stride not in [8, 16, 32]:
+    raise ValueError('Only allowed output_stride values are 8, 16, 32.')
+
+  # Used to find thinned depths for each layer.
+  depth = lambda d: max(int(d * depth_multiplier), min_depth)
+  end_points = {}
+
+  with tf.variable_scope(scope, 'MobilenetV1', [inputs]):
+    with slim.arg_scope([slim.conv2d, slim.separable_conv2d], padding='SAME'):
+      # The current_stride variable keeps track of the output stride of the
+      # activations, i.e., the running product of convolution strides up to the
+      # current network layer. This allows us to invoke atrous convolution
+      # whenever applying the next convolution would result in the activations
+      # having output stride larger than the target output_stride.
+      current_stride = 1
+
+      # The atrous convolution rate parameter.
+      rate = 1
+
+      net = inputs
+      for i, conv_def in enumerate(conv_defs):
+        end_point_base = 'Conv2d_%d' % i
+
+        if output_stride is not None and current_stride == output_stride:
+          # If we have reached the target output_stride, then we need to employ
+          # atrous convolution with stride=1 and multiply the atrous rate by the
+          # current unit's stride for use in subsequent layers.
+          layer_stride = 1
+          layer_rate = rate
+          rate *= conv_def.stride
+        else:
+          layer_stride = conv_def.stride
+          layer_rate = 1
+          current_stride *= conv_def.stride
+
+        if isinstance(conv_def, Conv):
+          end_point = end_point_base
+          # Use the stride/rate computed above rather than conv_def.stride so
+          # that a Conv layer placed after the target output_stride has been
+          # reached does not shrink the activations any further. For the
+          # default _CONV_DEFS these are conv_def.stride and 1 respectively.
+          net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel,
+                            stride=layer_stride,
+                            rate=layer_rate,
+                            normalizer_fn=slim.batch_norm,
+                            scope=end_point)
+          end_points[end_point] = net
+          if end_point == final_endpoint:
+            return net, end_points
+
+        elif isinstance(conv_def, DepthSepConv):
+          end_point = end_point_base + '_depthwise'
+
+          # By passing filters=None
+          # separable_conv2d produces only a depthwise convolution layer
+          net = slim.separable_conv2d(net, None, conv_def.kernel,
+                                      depth_multiplier=1,
+                                      stride=layer_stride,
+                                      rate=layer_rate,
+                                      normalizer_fn=slim.batch_norm,
+                                      scope=end_point)
+
+          end_points[end_point] = net
+          if end_point == final_endpoint:
+            return net, end_points
+
+          end_point = end_point_base + '_pointwise'
+
+          # The 1x1 convolution that mixes channels; it is what applies the
+          # (thinned) output depth of the layer.
+          net = slim.conv2d(net, depth(conv_def.depth), [1, 1],
+                            stride=1,
+                            normalizer_fn=slim.batch_norm,
+                            scope=end_point)
+
+          end_points[end_point] = net
+          if end_point == final_endpoint:
+            return net, end_points
+        else:
+          # Neither namedtuple has an `ltype` field, so report the Python type
+          # name of the offending entry instead.
+          raise ValueError('Unknown convolution type %s for layer %d'
+                           % (type(conv_def).__name__, i))
+  # Every layer was built without ever matching final_endpoint.
+  raise ValueError('Unknown final endpoint %s' % final_endpoint)
+
+
+def mobilenet_v1(inputs,
+ num_classes=1000,
+ dropout_keep_prob=0.999,
+ is_training=True,
+ min_depth=8,
+ depth_multiplier=1.0,
+ conv_defs=None,
+ prediction_fn=tf.contrib.layers.softmax,
+ spatial_squeeze=True,
+ reuse=None,
+ scope='MobilenetV1'):
+ """Mobilenet v1 model for classification.
+
+ Builds the network body with mobilenet_v1_base and adds a classification
+ head on top: average pooling, dropout and a 1x1 convolution that produces
+ the logits. output_stride is not forwarded, so the body is always built with
+ the default of mobilenet_v1_base.
+
+ Args:
+ inputs: a tensor of shape [batch_size, height, width, channels].
+ num_classes: number of predicted classes.
+ dropout_keep_prob: the percentage of activation values that are retained.
+ is_training: whether is training or not. Applied to the batch_norm and
+ dropout layers.
+ min_depth: Minimum depth value (number of channels) for all convolution ops.
+ Enforced when depth_multiplier < 1, and not an active constraint when
+ depth_multiplier >= 1.
+ depth_multiplier: Float multiplier for the depth (number of channels)
+ for all convolution ops. The value must be greater than zero. Typical
+ usage will be to set this value in (0, 1) to reduce the number of
+ parameters or computation cost of the model.
+ conv_defs: A list of Conv / DepthSepConv namedtuples specifying the net
+ architecture. Forwarded unchanged to mobilenet_v1_base.
+ prediction_fn: a function to get predictions out of logits. It is called as
+ prediction_fn(logits, scope='Predictions'); pass a falsy value to skip it.
+ spatial_squeeze: if True, logits is of shape is [B, C], if false logits is
+ of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
+ reuse: whether or not the network and its variables should be reused. To be
+ able to reuse 'scope' must be given.
+ scope: Optional variable_scope.
+
+ Returns:
+ logits: the pre-softmax activations, a tensor of size
+ [batch_size, num_classes] when spatial_squeeze is True; otherwise the
+ two spatial dimensions are kept.
+ end_points: a dictionary from components of the network to the corresponding
+ activation. On top of the endpoints of mobilenet_v1_base it holds
+ 'AvgPool_1a', 'Logits' and, when prediction_fn is set, 'Predictions'.
+
+ Raises:
+ ValueError: Input rank is invalid (the static rank of inputs is not 4).
+ """
+ input_shape = inputs.get_shape().as_list()
+ if len(input_shape) != 4:
+ raise ValueError('Invalid input tensor rank, expected 4, was: %d' %
+ len(input_shape))
+
+ # The scope object captured here is handed to mobilenet_v1_base so the body
+ # is built under the same variable scope as the head.
+ with tf.variable_scope(scope, 'MobilenetV1', [inputs, num_classes],
+ reuse=reuse) as scope:
+ with slim.arg_scope([slim.batch_norm, slim.dropout],
+ is_training=is_training):
+ net, end_points = mobilenet_v1_base(inputs, scope=scope,
+ min_depth=min_depth,
+ depth_multiplier=depth_multiplier,
+ conv_defs=conv_defs)
+ with tf.variable_scope('Logits'):
+ # Shrink the 7x7 pooling window when the final feature map is statically
+ # known to be smaller than that.
+ kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7])
+ net = slim.avg_pool2d(net, kernel_size, padding='VALID',
+ scope='AvgPool_1a')
+ end_points['AvgPool_1a'] = net
+ # 1 x 1 x 1024
+ # NOTE(review): the shape above presumably refers to the default 224x224
+ # input with depth_multiplier=1.0 -- confirm for other configurations.
+ net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
+ # The classifier is a 1x1 convolution with neither activation nor
+ # normalization, so its outputs are raw logits.
+ logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
+ normalizer_fn=None, scope='Conv2d_1c_1x1')
+ if spatial_squeeze:
+ logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
+ end_points['Logits'] = logits
+ if prediction_fn:
+ end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
+ return logits, end_points
+
+# Attribute attached to the function object itself.
+# NOTE(review): presumably read by callers to choose an input resolution --
+# confirm against the code that consumes it.
+mobilenet_v1.default_image_size = 224
+
+
+def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
+  """Define kernel size which is automatically reduced for small input.
+
+  Each spatial dimension is handled on its own: a dimension whose size is
+  known at graph construction time is clipped to that size, while a dimension
+  that is unknown is assumed to be large enough and keeps the requested kernel
+  size.
+
+  Args:
+    input_tensor: input tensor of size [batch_size, height, width, channels].
+    kernel_size: desired kernel size of length 2: [kernel_height, kernel_width]
+
+  Returns:
+    a list [kernel_height, kernel_width] with the (possibly reduced) kernel
+    size.
+  """
+  shape = input_tensor.get_shape().as_list()
+  # Static height and width of the input; either one is None when unknown.
+  height, width = shape[1], shape[2]
+  return [kernel_size[0] if height is None else min(height, kernel_size[0]),
+          kernel_size[1] if width is None else min(width, kernel_size[1])]
+
+
+def mobilenet_v1_arg_scope(is_training=True,
+                           weight_decay=0.00004,
+                           stddev=0.09,
+                           regularize_depthwise=False):
+  """Builds the default arg scope for the MobilenetV1 model.
+
+  Args:
+    is_training: Whether or not we're training the model.
+    weight_decay: The weight decay to use for regularizing the model.
+    stddev: The standard deviation of the truncated normal weight initializer.
+    regularize_depthwise: Whether or not to apply regularization on depthwise
+      (separable) convolutions.
+
+  Returns:
+    An `arg_scope` to use for the mobilenet v1 model.
+  """
+  initializer = tf.truncated_normal_initializer(stddev=stddev)
+  l2_penalty = tf.contrib.layers.l2_regularizer(weight_decay)
+  # Separable convolutions only get the L2 penalty when explicitly requested.
+  separable_penalty = l2_penalty if regularize_depthwise else None
+
+  # Defaults shared by regular and separable convolutions.
+  with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
+                      weights_initializer=initializer,
+                      activation_fn=tf.nn.relu6,
+                      normalizer_fn=slim.batch_norm):
+    # Batch norm hyper-parameters.
+    with slim.arg_scope([slim.batch_norm],
+                        is_training=is_training,
+                        center=True,
+                        scale=True,
+                        decay=0.9997,
+                        epsilon=0.001):
+      # Weight decay for regular convolutions.
+      with slim.arg_scope([slim.conv2d], weights_regularizer=l2_penalty):
+        # Innermost scope; the captured value carries every default above.
+        with slim.arg_scope([slim.separable_conv2d],
+                            weights_regularizer=separable_penalty) as sc:
+          return sc