resnet_v1.py

   1 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
   2 #
   3 # Licensed under the Apache License, Version 2.0 (the "License");
   4 # you may not use this file except in compliance with the License.
   5 # You may obtain a copy of the License at
   6 #
   7 # http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 # Unless required by applicable law or agreed to in writing, software
  10 # distributed under the License is distributed on an "AS IS" BASIS,
  11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 # See the License for the specific language governing permissions and
  13 # limitations under the License.
  14 # ==============================================================================
  15 """Contains definitions for the original form of Residual Networks.
  16
  17 The 'v1' residual networks (ResNets) implemented in this module were proposed
  18 by:
  19 [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
  20     Deep Residual Learning for Image Recognition. arXiv:1512.03385
  21
  22 Other variants were introduced in:
  23 [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
  24     Identity Mappings in Deep Residual Networks. arXiv: 1603.05027
  25
  26 The networks defined in this module utilize the bottleneck building block of
  27 [1] with projection shortcuts only for increasing depths. They employ batch
  28 normalization *after* every weight layer. This is the architecture used by
  29 MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and
  30 ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1'
  31 architecture and the alternative 'v2' architecture of [2] which uses batch
  32 normalization *before* every weight layer in the so-called full pre-activation
  33 units.
  34
  35 Typical use:
  36
   from libs.networks.slim_nets import resnet_v1
  38
  39 ResNet-101 for image classification into 1000 classes:
  40
  41    # inputs has shape [batch, 224, 224, 3]
  42    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
  43       net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False)
  44
  45 ResNet-101 for semantic segmentation into 21 classes:
  46
  47    # inputs has shape [batch, 513, 513, 3]
  48    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
      net, end_points = resnet_v1.resnet_v1_101(inputs,
                                                21,
                                                is_training=False,
                                                global_pool=False,
                                                output_stride=16,
                                                spatial_squeeze=False)
  54 """
  55 from __future__ import absolute_import
  56 from __future__ import division
  57 from __future__ import print_function
  58
  59 import tensorflow as tf
  60
  61 from libs.networks.slim_nets import resnet_utils
  62
  63
# Re-export the arg-scope factory from resnet_utils so callers can write
# resnet_v1.resnet_arg_scope(), as the usage examples in the module docstring do.
resnet_arg_scope = resnet_utils.resnet_arg_scope
# Short alias for TF-Slim, used by every definition below.
slim = tf.contrib.slim
  66
  67
  68 @slim.add_arg_scope
  69 def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1,
  70                outputs_collections=None, scope=None):
  71   """Bottleneck residual unit variant with BN after convolutions.
  72
  73   This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for
  74   its definition. Note that we use here the bottleneck variant which has an
  75   extra bottleneck layer.
  76
  77   When putting together two consecutive ResNet blocks that use this unit, one
  78   should use stride = 2 in the last unit of the first block.
  79
  80   Args:
  81     inputs: A tensor of size [batch, height, width, channels].
  82     depth: The depth of the ResNet unit output.
  83     depth_bottleneck: The depth of the bottleneck layers.
  84     stride: The ResNet unit's stride. Determines the amount of downsampling of
  85       the units output compared to its input.
  86     rate: An integer, rate for atrous convolution.
  87     outputs_collections: Collection to add the ResNet unit output.
  88     scope: Optional variable_scope.
  89
  90   Returns:
  91     The ResNet unit's output.
  92   """
  93   with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc:
  94     depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
  95     if depth == depth_in:
  96       shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
  97     else:
  98       shortcut = slim.conv2d(inputs, depth, [1, 1], stride=stride,
  99                              activation_fn=None, scope='shortcut')
 100
 101     residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1,
 102                            scope='conv1')
 103     residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride,
 104                                         rate=rate, scope='conv2')
 105     residual = slim.conv2d(residual, depth, [1, 1], stride=1,
 106                            activation_fn=None, scope='conv3')
 107
 108     output = tf.nn.relu(shortcut + residual)
 109
 110     return slim.utils.collect_named_outputs(outputs_collections,
 111                                             sc.original_name_scope,
 112                                             output)
 113
 114
 115 def resnet_v1(inputs,
 116               blocks,
 117               num_classes=None,
 118               is_training=True,
 119               global_pool=True,
 120               output_stride=None,
 121               include_root_block=True,
 122               spatial_squeeze=False,
 123               reuse=None,
 124               scope=None):
 125   """Generator for v1 ResNet models.
 126
 127   This function generates a family of ResNet v1 models. See the resnet_v1_*()
 128   methods for specific model instantiations, obtained by selecting different
 129   block instantiations that produce ResNets of various depths.
 130
 131   Training for image classification on Imagenet is usually done with [224, 224]
 132   inputs, resulting in [7, 7] feature maps at the output of the last ResNet
 133   block for the ResNets defined in [1] that have nominal stride equal to 32.
 134   However, for dense prediction tasks we advise that one uses inputs with
 135   spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
 136   this case the feature maps at the ResNet output will have spatial shape
 137   [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
 138   and corners exactly aligned with the input image corners, which greatly
 139   facilitates alignment of the features to the image. Using as input [225, 225]
 140   images results in [8, 8] feature maps at the output of the last ResNet block.
 141
 142   For dense prediction tasks, the ResNet needs to run in fully-convolutional
 143   (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
 144   have nominal stride equal to 32 and a good choice in FCN mode is to use
 145   output_stride=16 in order to increase the density of the computed features at
 146   small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.
 147
 148   Args:
 149     inputs: A tensor of size [batch, height_in, width_in, channels].
 150     blocks: A list of length equal to the number of ResNet blocks. Each element
 151       is a resnet_utils.Block object describing the units in the block.
 152     num_classes: Number of predicted classes for classification tasks. If None
 153       we return the features before the logit layer.
 154     is_training: whether is training or not.
 155     global_pool: If True, we perform global average pooling before computing the
 156       logits. Set to True for image classification, False for dense prediction.
 157     output_stride: If None, then the output will be computed at the nominal
 158       network stride. If output_stride is not None, it specifies the requested
 159       ratio of input to output spatial resolution.
 160     include_root_block: If True, include the initial convolution followed by
 161       max-pooling, if False excludes it.
 162     spatial_squeeze: if True, logits is of shape [B, C], if false logits is
 163         of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
 164     reuse: whether or not the network and its variables should be reused. To be
 165       able to reuse 'scope' must be given.
 166     scope: Optional variable_scope.
 167
 168   Returns:
 169     net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
 170       If global_pool is False, then height_out and width_out are reduced by a
 171       factor of output_stride compared to the respective height_in and width_in,
 172       else both height_out and width_out equal one. If num_classes is None, then
 173       net is the output of the last ResNet block, potentially after global
 174       average pooling. If num_classes is not None, net contains the pre-softmax
 175       activations.
 176     end_points: A dictionary from components of the network to the corresponding
 177       activation.
 178
 179   Raises:
 180     ValueError: If the target output_stride is not valid.
 181   """
 182   with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
 183     end_points_collection = sc.name + '_end_points'
 184     with slim.arg_scope([slim.conv2d, bottleneck,
 185                          resnet_utils.stack_blocks_dense],
 186                         outputs_collections=end_points_collection):
 187       with slim.arg_scope([slim.batch_norm], is_training=is_training):
 188         net = inputs
 189         if include_root_block:
 190           if output_stride is not None:
 191             if output_stride % 4 != 0:
 192               raise ValueError('The output_stride needs to be a multiple of 4.')
 193             output_stride /= 4
 194           net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
 195           net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
 196         net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
 197         if global_pool:
 198           # Global average pooling.
 199           net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
 200           # yjr_feature = tf.squeeze(net, [0, 1, 2])
 201         if num_classes is not None:
 202           net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
 203                             normalizer_fn=None, scope='logits')
 204         if spatial_squeeze:
 205           logits = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
 206         else:
 207           logits = net
 208         # Convert end_points_collection into a dictionary of end_points.
 209         end_points = slim.utils.convert_collection_to_dict(
 210             end_points_collection)
 211         if num_classes is not None:
 212           end_points['predictions'] = slim.softmax(logits, scope='predictions')
 213
 214         ###
 215         # end_points['yjr_feature'] = yjr_feature
 216         return logits, end_points
 217 resnet_v1.default_image_size = 224
 218
 219
 220 def resnet_v1_block(scope, base_depth, num_units, stride):
 221   """Helper function for creating a resnet_v1 bottleneck block.
 222
 223   Args:
 224     scope: The scope of the block.
 225     base_depth: The depth of the bottleneck layer for each unit.
 226     num_units: The number of units in the block.
 227     stride: The stride of the block, implemented as a stride in the last unit.
 228       All other units have stride=1.
 229
 230   Returns:
 231     A resnet_v1 bottleneck block.
 232   """
 233   return resnet_utils.Block(scope, bottleneck, [{
 234       'depth': base_depth * 4,
 235       'depth_bottleneck': base_depth,
 236       'stride': 1
 237   }] * (num_units - 1) + [{
 238       'depth': base_depth * 4,
 239       'depth_bottleneck': base_depth,
 240       'stride': stride
 241   }])
 242
 243
 244 def resnet_v1_50(inputs,
 245                  num_classes=None,
 246                  is_training=True,
 247                  global_pool=True,
 248                  output_stride=None,
 249                  spatial_squeeze=True,
 250                  reuse=None,
 251                  scope='resnet_v1_50'):
 252   """ResNet-50 model of [1]. See resnet_v1() for arg and return description."""
 253   blocks = [
 254       resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
 255       resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
 256       resnet_v1_block('block3', base_depth=256, num_units=6, stride=2),
 257       resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
 258   ]
 259   return resnet_v1(inputs, blocks, num_classes, is_training,
 260                    global_pool=global_pool, output_stride=output_stride,
 261                    include_root_block=True, spatial_squeeze=spatial_squeeze,
 262                    reuse=reuse, scope=scope)
 263 resnet_v1_50.default_image_size = resnet_v1.default_image_size
 264
 265
 266 def resnet_v1_101(inputs,
 267                   num_classes=None,
 268                   is_training=True,
 269                   global_pool=True,
 270                   output_stride=None,
 271                   spatial_squeeze=True,
 272                   reuse=None,
 273                   scope='resnet_v1_101'):
 274   """ResNet-101 model of [1]. See resnet_v1() for arg and return description."""
 275   blocks = [
 276       resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
 277       resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
 278       resnet_v1_block('block3', base_depth=256, num_units=23, stride=2),
 279       resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
 280   ]
 281   return resnet_v1(inputs, blocks, num_classes, is_training,
 282                    global_pool=global_pool, output_stride=output_stride,
 283                    include_root_block=True, spatial_squeeze=spatial_squeeze,
 284                    reuse=reuse, scope=scope)
 285 resnet_v1_101.default_image_size = resnet_v1.default_image_size
 286
 287
 288 def resnet_v1_152(inputs,
 289                   num_classes=None,
 290                   is_training=True,
 291                   global_pool=True,
 292                   output_stride=None,
 293                   spatial_squeeze=True,
 294                   reuse=None,
 295                   scope='resnet_v1_152'):
 296   """ResNet-152 model of [1]. See resnet_v1() for arg and return description."""
 297   blocks = [
 298       resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
 299       resnet_v1_block('block2', base_depth=128, num_units=8, stride=2),
 300       resnet_v1_block('block3', base_depth=256, num_units=36, stride=2),
 301       resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
 302   ]
 303   return resnet_v1(inputs, blocks, num_classes, is_training,
 304                    global_pool=global_pool, output_stride=output_stride,
 305                    include_root_block=True, spatial_squeeze=spatial_squeeze,
 306                    reuse=reuse, scope=scope)
 307 resnet_v1_152.default_image_size = resnet_v1.default_image_size
 308
 309
 310 def resnet_v1_200(inputs,
 311                   num_classes=None,
 312                   is_training=True,
 313                   global_pool=True,
 314                   output_stride=None,
 315                   spatial_squeeze=True,
 316                   reuse=None,
 317                   scope='resnet_v1_200'):
 318   """ResNet-200 model of [2]. See resnet_v1() for arg and return description."""
 319   blocks = [
 320       resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
 321       resnet_v1_block('block2', base_depth=128, num_units=24, stride=2),
 322       resnet_v1_block('block3', base_depth=256, num_units=36, stride=2),
 323       resnet_v1_block('block4', base_depth=512, num_units=3, stride=1),
 324   ]
 325   return resnet_v1(inputs, blocks, num_classes, is_training,
 326                    global_pool=global_pool, output_stride=output_stride,
 327                    include_root_block=True, spatial_squeeze=spatial_squeeze,
 328                    reuse=reuse, scope=scope)
 329 resnet_v1_200.default_image_size = resnet_v1.default_image_size