# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15 """Convolution blocks for mobilenet."""
19 import tensorflow as tf
21 slim = tf.contrib.slim


def _fixed_padding(inputs, kernel_size, rate=1):
  """Pads the input along the spatial dimensions independently of input size.

  Pads the input such that if it was used in a convolution with 'VALID'
  padding, the output would have the same dimensions as if the unpadded input
  was used in a convolution with 'SAME' padding.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
    rate: An integer, rate for atrous convolution.

  Returns:
    output: A tensor of size [batch, height_out, width_out, channels] with the
      input, either intact (if kernel_size == 1) or padded (if kernel_size > 1).
  """
  kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1),
                           kernel_size[1] + (kernel_size[1] - 1) * (rate - 1)]
  pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1]
  pad_beg = [pad_total[0] // 2, pad_total[1] // 2]
  pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]]
  padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]],
                                  [pad_beg[1], pad_end[1]], [0, 0]])
  return padded_inputs
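
# Illustrative sketch (not part of the original library): with
# kernel_size=[3, 3] and rate=2 the effective kernel is 5x5, so pad_total is
# [4, 4] and two rows/columns of zeros are added on every spatial side:
#
#   x = tf.zeros([1, 32, 32, 8])
#   y = _fixed_padding(x, [3, 3], rate=2)  # y has shape [1, 36, 36, 8]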


def _make_divisible(v, divisor, min_value=None):
  """Rounds v to the nearest multiple of divisor, staying within 10% of v."""
  if min_value is None:
    min_value = divisor
  new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
  # Make sure that rounding down does not go down by more than 10%.
  if new_v < 0.9 * v:
    new_v += divisor
  return new_v
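
# Illustrative sketch: the helper rounds to the nearest multiple of divisor,
# then bumps up one notch if rounding lost more than 10% of v:
#
#   _make_divisible(96, 8)  # -> 96 (already divisible)
#   _make_divisible(33, 8)  # -> 32 (32 >= 0.9 * 33, so the round-down stands)
#   _make_divisible(11, 8)  # -> 16 (8 < 0.9 * 11, so one extra divisor added)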


def _split_divisible(num, num_ways, divisible_by=8):
  """Evenly splits num into num_ways pieces, each a multiple of divisible_by."""
  assert num % divisible_by == 0
  assert num / num_ways >= divisible_by
  # Note: we want to round down; each split is then adjusted to match the total.
  base = num // num_ways // divisible_by * divisible_by
  result = []
  accumulated = 0
  for i in range(num_ways):
    r = base
    while accumulated + r < num * (i + 1) / num_ways:
      r += divisible_by
    result.append(r)
    accumulated += r
  assert accumulated == num
  return result
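
# Illustrative sketch: splitting 32 channels three ways with divisible_by=8
# yields pieces that are each a multiple of 8 and sum back to 32:
#
#   _split_divisible(32, 3)  # -> [16, 8, 8]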


@contextlib.contextmanager
def _v1_compatible_scope_naming(scope):
  if scope is None:  # Create uniquified separable blocks.
    with tf.variable_scope(None, default_name='separable') as s, \
        tf.name_scope(s.original_name_scope):
      yield ''
  else:
    # We use scope_depthwise, scope_pointwise for compatibility with V1
    # checkpoints, which provide numbered scopes.
    scope += '_'
    yield scope
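
# Illustrative sketch: with scope=None the context yields an empty prefix and
# relies on a uniquified variable scope ('separable', 'separable_1', ...);
# with an explicit scope it yields 'scope_' so that inner ops are named
# 'scope_depthwise' / 'scope_pointwise', matching V1 checkpoint layouts:
#
#   with _v1_compatible_scope_naming('Conv2d_1') as prefix:
#     print(prefix)  # 'Conv2d_1_'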


def split_separable_conv2d(input_tensor,
                           num_outputs,
                           scope=None,
                           normalizer_fn=None,
                           stride=1,
                           rate=1,
                           endpoints=None,
                           use_explicit_padding=False):
  """Separable mobilenet V1 style convolution.

  Depthwise convolution, with default non-linearity, followed by 1x1
  pointwise convolution. This is similar to slim.separable_conv2d, but
  differs in that it applies batch normalization and non-linearity to the
  depthwise stage. This matches the basic building block of the Mobilenet
  paper (https://arxiv.org/abs/1704.04861).

  Args:
    input_tensor: input tensor.
    num_outputs: number of outputs.
    scope: optional name of the scope. Note if provided it will use
      scope_depthwise for depthwise, and scope_pointwise for pointwise.
    normalizer_fn: which normalizer function to use for depthwise/pointwise.
    stride: stride.
    rate: output rate (also known as dilation rate).
    endpoints: optional, if provided, will export additional tensors to it.
    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
      inputs so that the output dimensions are the same as if 'SAME' padding
      were used.

  Returns:
    output tensor.
  """
  with _v1_compatible_scope_naming(scope) as scope:
    dw_scope = scope + 'depthwise'
    endpoints = endpoints if endpoints is not None else {}
    kernel_size = [3, 3]
    padding = 'SAME'
    if use_explicit_padding:
      padding = 'VALID'
      input_tensor = _fixed_padding(input_tensor, kernel_size, rate)
    # num_outputs=None makes slim.separable_conv2d depthwise-only.
    net = slim.separable_conv2d(
        input_tensor, None, kernel_size, depth_multiplier=1, stride=stride,
        rate=rate, normalizer_fn=normalizer_fn, padding=padding,
        scope=dw_scope)
    endpoints[dw_scope] = net

    pw_scope = scope + 'pointwise'
    net = slim.conv2d(net, num_outputs, [1, 1], stride=1,
                      normalizer_fn=normalizer_fn, scope=pw_scope)
    endpoints[pw_scope] = net
  return net
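
# Illustrative usage sketch (images is a hypothetical NHWC tensor):
#
#   images = tf.zeros([1, 224, 224, 32])
#   net = split_separable_conv2d(images, num_outputs=64, stride=2,
#                                normalizer_fn=slim.batch_norm)
#   # net has shape [1, 112, 112, 64]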


def expand_input_by_factor(n, divisible_by=8):
  return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by)
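
# Illustrative sketch: expand_input_by_factor returns a callable that
# expanded_conv evaluates against the actual input depth:
#
#   expand = expand_input_by_factor(6)
#   expand(num_inputs=16)  # -> 96 (16 * 6 is already divisible by 8)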


@slim.add_arg_scope
def expanded_conv(input_tensor,
                  num_outputs,
                  expansion_size=expand_input_by_factor(6),
                  stride=1,
                  rate=1,
                  kernel_size=(3, 3),
                  residual=True,
                  normalizer_fn=None,
                  split_projection=1,
                  split_expansion=1,
                  expansion_transform=None,
                  depthwise_location='expansion',
                  depthwise_channel_multiplier=1,
                  endpoints=None,
                  use_explicit_padding=False,
                  scope=None):
  """Depthwise Convolution Block with expansion.

  Builds a composite convolution that has the following structure:
  expansion (1x1) -> depthwise (kernel_size) -> projection (1x1)

  Args:
    input_tensor: input tensor.
    num_outputs: number of outputs in the final layer.
    expansion_size: the size of expansion, could be a constant or a callable.
      If the latter, it will be provided 'num_inputs' as an input. For forward
      compatibility it should accept arbitrary keyword arguments.
      Default will expand the input by a factor of 6.
    stride: depthwise stride.
    rate: depthwise rate (dilation).
    kernel_size: depthwise kernel.
    residual: whether to include a residual connection between input
      and output.
    normalizer_fn: batchnorm or otherwise.
    split_projection: how many ways to split the projection operator
      (that is conv expansion->bottleneck).
    split_expansion: how many ways to split the expansion op
      (that is conv bottleneck->expansion); ops will keep depth divisible
      by this value.
    expansion_transform: Optional function that takes expansion
      as a single input and returns output.
    depthwise_location: where to put the depthwise convolution; supported
      values are None, 'input', 'output', 'expansion'.
    depthwise_channel_multiplier: depthwise channel multiplier:
      each input channel will be replicated (with different filters)
      that many times. So if the input had c channels,
      the output will have c * depthwise_channel_multiplier channels.
    endpoints: An optional dictionary into which intermediate endpoints are
      placed. The keys "expansion_output", "depthwise_output",
      "projection_output" and "expansion_transform" are always populated, even
      if the corresponding functions are not invoked.
    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
      inputs so that the output dimensions are the same as if 'SAME' padding
      were used.
    scope: optional scope.

  Returns:
    Tensor of depth num_outputs.

  Raises:
    TypeError: on an invalid depthwise_location.
  """
  with tf.variable_scope(scope, default_name='expanded_conv') as s, \
       tf.name_scope(s.original_name_scope):
    prev_depth = input_tensor.get_shape().as_list()[3]
    if depthwise_location not in [None, 'input', 'output', 'expansion']:
      raise TypeError('%r is unknown value for depthwise_location' %
                      depthwise_location)
    padding = 'SAME'
    if use_explicit_padding:
      padding = 'VALID'
    depthwise_func = functools.partial(
        slim.separable_conv2d,
        num_outputs=None,
        kernel_size=kernel_size,
        depth_multiplier=depthwise_channel_multiplier,
        stride=stride,
        rate=rate,
        normalizer_fn=normalizer_fn,
        padding=padding,
        scope='depthwise')
    # b1 -> b2 * r -> b2
    #   i -> (o * r) (bottleneck) -> o
    input_tensor = tf.identity(input_tensor, 'input')
    net = input_tensor

    if depthwise_location == 'input':
      if use_explicit_padding:
        net = _fixed_padding(net, kernel_size, rate)
      net = depthwise_func(net, activation_fn=None)

    if callable(expansion_size):
      inner_size = expansion_size(num_inputs=prev_depth)
    else:
      inner_size = expansion_size

    if inner_size > net.shape[3]:
      net = split_conv(
          net,
          inner_size,
          num_ways=split_expansion,
          scope='expand',
          stride=1,
          normalizer_fn=normalizer_fn)
      net = tf.identity(net, 'expansion_output')
    if endpoints is not None:
      endpoints['expansion_output'] = net

    if depthwise_location == 'expansion':
      if use_explicit_padding:
        net = _fixed_padding(net, kernel_size, rate)
      net = depthwise_func(net)

    net = tf.identity(net, name='depthwise_output')
    if endpoints is not None:
      endpoints['depthwise_output'] = net
    if expansion_transform:
      net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor)
    # Note: in contrast with expansion, we always have a
    # projection to produce the desired output size.
    net = split_conv(
        net,
        num_outputs,
        num_ways=split_projection,
        stride=1,
        scope='project',
        normalizer_fn=normalizer_fn,
        activation_fn=tf.identity)
    if endpoints is not None:
      endpoints['projection_output'] = net
    if depthwise_location == 'output':
      if use_explicit_padding:
        net = _fixed_padding(net, kernel_size, rate)
      net = depthwise_func(net, activation_fn=None)

    if callable(residual):  # custom residual
      net = residual(input_tensor=input_tensor, output_tensor=net)
    elif (residual and
          # stride check enforces that we don't add residuals when spatial
          # dimensions are None
          stride == 1 and
          # depth matches
          net.get_shape().as_list()[3] ==
          input_tensor.get_shape().as_list()[3]):
      net += input_tensor
    return tf.identity(net, name='output')
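
# Illustrative usage sketch: with the defaults this builds the inverted
# bottleneck block of MobileNet V2 (https://arxiv.org/abs/1801.04381): expand
# 16 channels to 96 via 1x1, 3x3 depthwise at stride 2, project to 24 via 1x1
# (no residual here, since stride != 1 and the depths differ):
#
#   images = tf.zeros([1, 112, 112, 16])
#   net = expanded_conv(images, num_outputs=24, stride=2,
#                       normalizer_fn=slim.batch_norm)
#   # net has shape [1, 56, 56, 24]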


def split_conv(input_tensor,
               num_outputs,
               num_ways,
               scope,
               divisible_by=8,
               **kwargs):
  """Creates a split convolution.

  Split convolution splits the input and output into
  'num_ways' blocks of approximately the same size each,
  and only connects the $i$-th input to the $i$-th output.
  It is compatible with a normal convolution when num_ways=1.

  Args:
    input_tensor: input tensor.
    num_outputs: number of output filters.
    num_ways: number of blocks to split by.
    scope: scope for all the operators.
    divisible_by: make sure that every part is divisible by this.
    **kwargs: will be passed directly into the conv2d operator.

  Returns:
    tensor
  """
  b = input_tensor.get_shape().as_list()[3]

  if num_ways == 1 or min(b // num_ways,
                          num_outputs // num_ways) < divisible_by:
    # Don't do any splitting if we would end up with fewer than divisible_by
    # filters on either side.
    return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs)

  outs = []
  input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by)
  output_splits = _split_divisible(
      num_outputs, num_ways, divisible_by=divisible_by)
  inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope)
  base = scope
  for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)):
    scope = base + '_part_%d' % (i,)
    n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs)
    n = tf.identity(n, scope + '_output')
    outs.append(n)
  return tf.concat(outs, 3, name=scope + '_concat')
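
# Illustrative sketch: with num_ways=2, a 32 -> 48 channel 1x1 convolution
# becomes two independent 1x1 convolutions (16 -> 24 each) whose outputs are
# concatenated along the channel axis; with num_ways=1 it degenerates to a
# single slim.conv2d:
#
#   x = tf.zeros([1, 14, 14, 32])
#   y = split_conv(x, 48, num_ways=2, scope='split_example')
#   # y has shape [1, 14, 14, 48]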