# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
"""MobileNet v1.

MobileNet is a general architecture and can be used for multiple use cases.
Depending on the use case, it can use different input layer sizes and
different heads (for example: embeddings, localization and classification).

As described in https://arxiv.org/abs/1704.04861.

  MobileNets: Efficient Convolutional Neural Networks for
    Mobile Vision Applications
  Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang,
    Tobias Weyand, Marco Andreetto, Hartwig Adam

100% Mobilenet V1 (base) with input size 224x224:

Layer                                                     params           macs
--------------------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D:                                 864      10,838,016
MobilenetV1/Conv2d_1_depthwise/depthwise:                    288       3,612,672
MobilenetV1/Conv2d_1_pointwise/Conv2D:                     2,048      25,690,112
MobilenetV1/Conv2d_2_depthwise/depthwise:                    576       1,806,336
MobilenetV1/Conv2d_2_pointwise/Conv2D:                     8,192      25,690,112
MobilenetV1/Conv2d_3_depthwise/depthwise:                  1,152       3,612,672
MobilenetV1/Conv2d_3_pointwise/Conv2D:                    16,384      51,380,224
MobilenetV1/Conv2d_4_depthwise/depthwise:                  1,152         903,168
MobilenetV1/Conv2d_4_pointwise/Conv2D:                    32,768      25,690,112
MobilenetV1/Conv2d_5_depthwise/depthwise:                  2,304       1,806,336
MobilenetV1/Conv2d_5_pointwise/Conv2D:                    65,536      51,380,224
MobilenetV1/Conv2d_6_depthwise/depthwise:                  2,304         451,584
MobilenetV1/Conv2d_6_pointwise/Conv2D:                   131,072      25,690,112
MobilenetV1/Conv2d_7_depthwise/depthwise:                  4,608         903,168
MobilenetV1/Conv2d_7_pointwise/Conv2D:                   262,144      51,380,224
MobilenetV1/Conv2d_8_depthwise/depthwise:                  4,608         903,168
MobilenetV1/Conv2d_8_pointwise/Conv2D:                   262,144      51,380,224
MobilenetV1/Conv2d_9_depthwise/depthwise:                  4,608         903,168
MobilenetV1/Conv2d_9_pointwise/Conv2D:                   262,144      51,380,224
MobilenetV1/Conv2d_10_depthwise/depthwise:                 4,608         903,168
MobilenetV1/Conv2d_10_pointwise/Conv2D:                  262,144      51,380,224
MobilenetV1/Conv2d_11_depthwise/depthwise:                 4,608         903,168
MobilenetV1/Conv2d_11_pointwise/Conv2D:                  262,144      51,380,224
MobilenetV1/Conv2d_12_depthwise/depthwise:                 4,608         225,792
MobilenetV1/Conv2d_12_pointwise/Conv2D:                  524,288      25,690,112
MobilenetV1/Conv2d_13_depthwise/depthwise:                 9,216         451,584
MobilenetV1/Conv2d_13_pointwise/Conv2D:                1,048,576      51,380,224
--------------------------------------------------------------------------------
Total:                                                 3,185,088     567,716,352


75% Mobilenet V1 (base) with input size 128x128:

Layer                                                     params           macs
--------------------------------------------------------------------------------
MobilenetV1/Conv2d_0/Conv2D:                                 648       2,654,208
MobilenetV1/Conv2d_1_depthwise/depthwise:                    216         884,736
MobilenetV1/Conv2d_1_pointwise/Conv2D:                     1,152       4,718,592
MobilenetV1/Conv2d_2_depthwise/depthwise:                    432         442,368
MobilenetV1/Conv2d_2_pointwise/Conv2D:                     4,608       4,718,592
MobilenetV1/Conv2d_3_depthwise/depthwise:                    864         884,736
MobilenetV1/Conv2d_3_pointwise/Conv2D:                     9,216       9,437,184
MobilenetV1/Conv2d_4_depthwise/depthwise:                    864         221,184
MobilenetV1/Conv2d_4_pointwise/Conv2D:                    18,432       4,718,592
MobilenetV1/Conv2d_5_depthwise/depthwise:                  1,728         442,368
MobilenetV1/Conv2d_5_pointwise/Conv2D:                    36,864       9,437,184
MobilenetV1/Conv2d_6_depthwise/depthwise:                  1,728         110,592
MobilenetV1/Conv2d_6_pointwise/Conv2D:                    73,728       4,718,592
MobilenetV1/Conv2d_7_depthwise/depthwise:                  3,456         221,184
MobilenetV1/Conv2d_7_pointwise/Conv2D:                   147,456       9,437,184
MobilenetV1/Conv2d_8_depthwise/depthwise:                  3,456         221,184
MobilenetV1/Conv2d_8_pointwise/Conv2D:                   147,456       9,437,184
MobilenetV1/Conv2d_9_depthwise/depthwise:                  3,456         221,184
MobilenetV1/Conv2d_9_pointwise/Conv2D:                   147,456       9,437,184
MobilenetV1/Conv2d_10_depthwise/depthwise:                 3,456         221,184
MobilenetV1/Conv2d_10_pointwise/Conv2D:                  147,456       9,437,184
MobilenetV1/Conv2d_11_depthwise/depthwise:                 3,456         221,184
MobilenetV1/Conv2d_11_pointwise/Conv2D:                  147,456       9,437,184
MobilenetV1/Conv2d_12_depthwise/depthwise:                 3,456          55,296
MobilenetV1/Conv2d_12_pointwise/Conv2D:                  294,912       4,718,592
MobilenetV1/Conv2d_13_depthwise/depthwise:                 6,912         110,592
MobilenetV1/Conv2d_13_pointwise/Conv2D:                  589,824       9,437,184
--------------------------------------------------------------------------------
Total:                                                 1,800,144     106,002,432

"""
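
# A worked check of the tables above (arithmetic only, added for
# illustration): Conv2d_0 in the 100% / 224x224 model is a 3x3x3 -> 32
# convolution, so params = 3*3*3*32 = 864; with stride 2 its output is
# 112x112, so macs = 864 * 112 * 112 = 10,838,016, matching the first row.
# Similarly, Conv2d_1_depthwise has params = 3*3*32 = 288 and
# macs = 288 * 112 * 112 = 3,612,672.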

# Tensorflow mandates these.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import namedtuple

import tensorflow as tf

slim = tf.contrib.slim

# The Conv and DepthSepConv namedtuples define the layers of the MobileNet
# architecture:
# - Conv defines a 3x3 convolution layer.
# - DepthSepConv defines a 3x3 depthwise convolution followed by a 1x1
#   convolution.
# - stride is the stride of the convolution.
# - depth is the number of channels or filters in a layer.
Conv = namedtuple('Conv', ['kernel', 'stride', 'depth'])
DepthSepConv = namedtuple('DepthSepConv', ['kernel', 'stride', 'depth'])

# _CONV_DEFS specifies the MobileNet body.
_CONV_DEFS = [
    Conv(kernel=[3, 3], stride=2, depth=32),
    DepthSepConv(kernel=[3, 3], stride=1, depth=64),
    DepthSepConv(kernel=[3, 3], stride=2, depth=128),
    DepthSepConv(kernel=[3, 3], stride=1, depth=128),
    DepthSepConv(kernel=[3, 3], stride=2, depth=256),
    DepthSepConv(kernel=[3, 3], stride=1, depth=256),
    DepthSepConv(kernel=[3, 3], stride=2, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=1, depth=512),
    DepthSepConv(kernel=[3, 3], stride=2, depth=1024),
    DepthSepConv(kernel=[3, 3], stride=1, depth=1024)
]
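
# A minimal sketch of a custom conv_defs (illustrative; 'shallow_defs' is a
# hypothetical name, not used elsewhere in this file). Any prefix of
# _CONV_DEFS is itself a valid architecture for mobilenet_v1_base, e.g. a
# shallower body that stops at Conv2d_5:
#
#   shallow_defs = _CONV_DEFS[:6]
#   net, end_points = mobilenet_v1_base(images, conv_defs=shallow_defs,
#                                       final_endpoint='Conv2d_5_pointwise')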


def mobilenet_v1_base(inputs,
                      final_endpoint='Conv2d_13_pointwise',
                      min_depth=8,
                      depth_multiplier=1.0,
                      conv_defs=None,
                      output_stride=None,
                      scope=None):
  """Mobilenet v1.

  Constructs a Mobilenet v1 network from inputs to the given final endpoint.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    final_endpoint: specifies the endpoint to construct the network up to. It
      can be one of ['Conv2d_0', 'Conv2d_1_pointwise', 'Conv2d_2_pointwise',
      'Conv2d_3_pointwise', 'Conv2d_4_pointwise', 'Conv2d_5_pointwise',
      'Conv2d_6_pointwise', 'Conv2d_7_pointwise', 'Conv2d_8_pointwise',
      'Conv2d_9_pointwise', 'Conv2d_10_pointwise', 'Conv2d_11_pointwise',
      'Conv2d_12_pointwise', 'Conv2d_13_pointwise'].
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    conv_defs: A list of ConvDef namedtuples specifying the net architecture.
    output_stride: An integer that specifies the requested ratio of input to
      output spatial resolution. If not None, then we invoke atrous convolution
      if necessary to prevent the network from reducing the spatial resolution
      of the activation maps. Allowed values are 8 (accurate fully convolutional
      mode), 16 (fast fully convolutional mode), 32 (classification mode).
    scope: Optional variable_scope.

  Returns:
    tensor_out: output tensor corresponding to the final_endpoint.
    end_points: a set of activations for external use, for example summaries or
                losses.

  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values,
                or depth_multiplier <= 0, or the target output_stride is not
                allowed.
  """
  # depth() computes each layer's thinned depth: the nominal depth scaled by
  # depth_multiplier, but never below min_depth. For example, with
  # depth_multiplier=0.75 and min_depth=8, depth(64) == 48.
  depth = lambda d: max(int(d * depth_multiplier), min_depth)
  end_points = {}

  if depth_multiplier <= 0:
    raise ValueError('depth_multiplier is not greater than zero.')

  if conv_defs is None:
    conv_defs = _CONV_DEFS

  if output_stride is not None and output_stride not in [8, 16, 32]:
    raise ValueError('Only allowed output_stride values are 8, 16, 32.')

  with tf.variable_scope(scope, 'MobilenetV1', [inputs]):
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d], padding='SAME'):
      # The current_stride variable keeps track of the output stride of the
      # activations, i.e., the running product of convolution strides up to the
      # current network layer. This allows us to invoke atrous convolution
      # whenever applying the next convolution would result in the activations
      # having output stride larger than the target output_stride.
      current_stride = 1

      # The atrous convolution rate parameter.
      rate = 1

      net = inputs
      for i, conv_def in enumerate(conv_defs):
        end_point_base = 'Conv2d_%d' % i

        if output_stride is not None and current_stride == output_stride:
          # If we have reached the target output_stride, then we need to employ
          # atrous convolution with stride=1 and multiply the atrous rate by the
          # current unit's stride for use in subsequent layers.
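          # For example, with output_stride=16 the running product
          # current_stride reaches 16 after Conv2d_6, so the stride-2 layer
          # Conv2d_12 is applied with stride 1 and Conv2d_13 then runs as a
          # rate-2 atrous convolution (an illustrative trace, not part of the
          # original comments).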
          layer_stride = 1
          layer_rate = rate
          rate *= conv_def.stride
        else:
          layer_stride = conv_def.stride
          layer_rate = 1
          current_stride *= conv_def.stride

        if isinstance(conv_def, Conv):
          end_point = end_point_base
          net = slim.conv2d(net, depth(conv_def.depth), conv_def.kernel,
                            stride=conv_def.stride,
                            normalizer_fn=slim.batch_norm,
                            scope=end_point)
          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points

        elif isinstance(conv_def, DepthSepConv):
          end_point = end_point_base + '_depthwise'

          # By passing num_outputs=None, separable_conv2d produces only a
          # depthwise convolution layer.
          net = slim.separable_conv2d(net, None, conv_def.kernel,
                                      depth_multiplier=1,
                                      stride=layer_stride,
                                      rate=layer_rate,
                                      normalizer_fn=slim.batch_norm,
                                      scope=end_point)

          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points

          end_point = end_point_base + '_pointwise'

          net = slim.conv2d(net, depth(conv_def.depth), [1, 1],
                            stride=1,
                            normalizer_fn=slim.batch_norm,
                            scope=end_point)

          end_points[end_point] = net
          if end_point == final_endpoint:
            return net, end_points
        else:
          raise ValueError('Unknown convolution type %s for layer %d'
                           % (type(conv_def).__name__, i))
  raise ValueError('Unknown final endpoint %s' % final_endpoint)
262
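# Usage sketch for mobilenet_v1_base (an illustration, assuming TF 1.x graph
# mode; 'images' is a hypothetical 4-D float tensor):
#
#   images = tf.placeholder(tf.float32, [None, 224, 224, 3])
#   net, end_points = mobilenet_v1_base(images, output_stride=16)
#   # net is the Conv2d_13_pointwise activation; with output_stride=16 the
#   # final stride-2 layer becomes an atrous convolution, so net is
#   # 14x14x1024 instead of 7x7x1024.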

def mobilenet_v1(inputs,
                 num_classes=1000,
                 dropout_keep_prob=0.999,
                 is_training=True,
                 min_depth=8,
                 depth_multiplier=1.0,
                 conv_defs=None,
                 prediction_fn=tf.contrib.layers.softmax,
                 spatial_squeeze=True,
                 reuse=None,
                 scope='MobilenetV1'):
  """Mobilenet v1 model for classification.

  Args:
    inputs: a tensor of shape [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    dropout_keep_prob: the percentage of activation values that are retained.
    is_training: whether or not the model is being trained.
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    conv_defs: A list of ConvDef namedtuples specifying the net architecture.
    prediction_fn: a function to get predictions out of logits.
    spatial_squeeze: if True, logits is of shape [B, C]; if False, logits is
        of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse them, 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    logits: the pre-softmax activations, a tensor of size
      [batch_size, num_classes]
    end_points: a dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: Input rank is invalid.
  """
  input_shape = inputs.get_shape().as_list()
  if len(input_shape) != 4:
    raise ValueError('Invalid input tensor rank, expected 4, was: %d' %
                     len(input_shape))

  with tf.variable_scope(scope, 'MobilenetV1', [inputs, num_classes],
                         reuse=reuse) as scope:
    with slim.arg_scope([slim.batch_norm, slim.dropout],
                        is_training=is_training):
      net, end_points = mobilenet_v1_base(inputs, scope=scope,
                                          min_depth=min_depth,
                                          depth_multiplier=depth_multiplier,
                                          conv_defs=conv_defs)
      with tf.variable_scope('Logits'):
        kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7])
        net = slim.avg_pool2d(net, kernel_size, padding='VALID',
                              scope='AvgPool_1a')
        end_points['AvgPool_1a'] = net
        # 1 x 1 x 1024
        net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
        logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                             normalizer_fn=None, scope='Conv2d_1c_1x1')
        if spatial_squeeze:
          logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
      end_points['Logits'] = logits
      if prediction_fn:
        end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
  return logits, end_points

mobilenet_v1.default_image_size = 224
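
# Usage sketch for classification (an illustration; num_classes=1001 assumes
# an ImageNet-style label space with a background class):
#
#   images = tf.placeholder(
#       tf.float32, [None, mobilenet_v1.default_image_size,
#                    mobilenet_v1.default_image_size, 3])
#   logits, end_points = mobilenet_v1(images, num_classes=1001,
#                                     is_training=False)
#   probabilities = end_points['Predictions']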


def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
  """Define kernel size which is automatically reduced for small input.

  If the shape of the input images is unknown at graph construction time, this
  function assumes that the input images are large enough.

  Args:
    input_tensor: input tensor of size [batch_size, height, width, channels].
    kernel_size: desired kernel size of length 2: [kernel_height, kernel_width].

  Returns:
    a list with the reduced kernel size.
  """
  shape = input_tensor.get_shape().as_list()
  if shape[1] is None or shape[2] is None:
    kernel_size_out = kernel_size
  else:
    kernel_size_out = [min(shape[1], kernel_size[0]),
                       min(shape[2], kernel_size[1])]
  return kernel_size_out
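
# For example, a 128x128 input to mobilenet_v1 reaches the final pooling as a
# 4x4 feature map (128 / 32), so _reduced_kernel_size_for_small_input trims
# the requested [7, 7] kernel to [4, 4]; a 224x224 input yields a 7x7 map and
# keeps the full [7, 7] kernel.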


def mobilenet_v1_arg_scope(is_training=True,
                           weight_decay=0.00004,
                           stddev=0.09,
                           regularize_depthwise=False):
  """Defines the default MobilenetV1 arg scope.

  Args:
    is_training: Whether or not we're training the model.
    weight_decay: The weight decay to use for regularizing the model.
    stddev: The standard deviation of the truncated normal weight initializer.
    regularize_depthwise: Whether or not to apply regularization on the
      depthwise weights.

  Returns:
    An `arg_scope` to use for the mobilenet v1 model.
  """
  batch_norm_params = {
      'is_training': is_training,
      'center': True,
      'scale': True,
      'decay': 0.9997,
      'epsilon': 0.001,
  }

  # Set weight_decay for weights in Conv and DepthSepConv layers.
  weights_init = tf.truncated_normal_initializer(stddev=stddev)
  regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
  if regularize_depthwise:
    depthwise_regularizer = regularizer
  else:
    depthwise_regularizer = None
  with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                      weights_initializer=weights_init,
                      activation_fn=tf.nn.relu6, normalizer_fn=slim.batch_norm):
    with slim.arg_scope([slim.batch_norm], **batch_norm_params):
      with slim.arg_scope([slim.conv2d], weights_regularizer=regularizer):
        with slim.arg_scope([slim.separable_conv2d],
                            weights_regularizer=depthwise_regularizer) as sc:
          return sc
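

# Usage sketch combining the arg scope with the model (an illustration,
# assuming TF 1.x graph mode and a predefined 'images' batch):
#
#   with slim.arg_scope(mobilenet_v1_arg_scope(is_training=True)):
#     logits, end_points = mobilenet_v1(images, num_classes=1001)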