1 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ==============================================================================
15 """Contains model definitions for versions of the Oxford VGG network.
17 These model definitions were introduced in the following technical report:
19 Very Deep Convolutional Networks For Large-Scale Image Recognition
20 Karen Simonyan and Andrew Zisserman
21 arXiv technical report, 2015
22 PDF: http://arxiv.org/pdf/1409.1556.pdf
23 ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf
26 More information can be obtained from the VGG website:
27 www.robots.ox.ac.uk/~vgg/research/very_deep/
Usage:
  with slim.arg_scope(vgg.vgg_arg_scope()):
    outputs, end_points = vgg.vgg_a(inputs)

  with slim.arg_scope(vgg.vgg_arg_scope()):
    outputs, end_points = vgg.vgg_16(inputs)
40 from __future__ import absolute_import
41 from __future__ import division
42 from __future__ import print_function
44 import tensorflow as tf
46 slim = tf.contrib.slim
def vgg_arg_scope(weight_decay=0.0005):
  """Defines the VGG arg scope.

  Args:
    weight_decay: The l2 regularization coefficient.

  Returns:
    An arg_scope.
  """
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      activation_fn=tf.nn.relu,
                      weights_regularizer=slim.l2_regularizer(weight_decay),
                      biases_initializer=tf.zeros_initializer()):
    # All convolutions use SAME padding; the missing `return arg_sc` made this
    # function return None, breaking every `with slim.arg_scope(...)` caller.
    with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc:
      return arg_sc
def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          scope='vgg_a',
          fc_conv_padding='VALID'):
  """Oxford Net VGG 11-Layers version A Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    fc_conv_padding: the type of padding to use for the fully connected layer
      that is implemented as a convolutional layer. Use 'SAME' padding if you
      are applying the network in a fully convolutional manner and want to
      get a prediction map downsampled by a factor of 32 as an output.
      Otherwise, the output prediction map will be (input / 32) - 6 in case of
      'VALID' padding.

  Returns:
    the last op containing the log predictions and end_points dict.
  """
  with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc:
    end_points_collection = sc.name + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                        outputs_collections=end_points_collection):
      # VGG-A: 1-1-2-2-2 conv layers per stage, each followed by 2x2 max-pool.
      net = slim.repeat(inputs, 1, slim.conv2d, 64, [3, 3], scope='conv1')
      net = slim.max_pool2d(net, [2, 2], scope='pool1')
      net = slim.repeat(net, 1, slim.conv2d, 128, [3, 3], scope='conv2')
      net = slim.max_pool2d(net, [2, 2], scope='pool2')
      net = slim.repeat(net, 2, slim.conv2d, 256, [3, 3], scope='conv3')
      net = slim.max_pool2d(net, [2, 2], scope='pool3')
      net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv4')
      net = slim.max_pool2d(net, [2, 2], scope='pool4')
      net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [2, 2], scope='pool5')
      # Use conv2d instead of fully_connected layers.
      net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout6')
      net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout7')
      # Final logits layer: no activation/normalization so raw scores come out.
      net = slim.conv2d(net, num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='fc8')
      # Convert end_points_collection into a end_point dict.
      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        # Drop the 1x1 spatial dims so classification outputs are [batch, classes].
        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
vgg_a.default_image_size = 224
def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_16',
           fc_conv_padding='VALID'):
  """Oxford Net VGG 16-Layers version D Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    fc_conv_padding: the type of padding to use for the fully connected layer
      that is implemented as a convolutional layer. Use 'SAME' padding if you
      are applying the network in a fully convolutional manner and want to
      get a prediction map downsampled by a factor of 32 as an output.
      Otherwise, the output prediction map will be (input / 32) - 6 in case of
      'VALID' padding.

  Returns:
    the last op containing the log predictions and end_points dict.
  """
  with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
    end_points_collection = sc.name + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                        outputs_collections=end_points_collection):
      # VGG-D (16 layers): 2-2-3-3-3 conv layers per stage with 2x2 max-pools.
      net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
      net = slim.max_pool2d(net, [2, 2], scope='pool1')
      net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
      net = slim.max_pool2d(net, [2, 2], scope='pool2')
      net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
      net = slim.max_pool2d(net, [2, 2], scope='pool3')
      net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
      net = slim.max_pool2d(net, [2, 2], scope='pool4')
      net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [2, 2], scope='pool5')
      # Use conv2d instead of fully_connected layers.
      net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout6')
      net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout7')
      # Final logits layer: no activation/normalization so raw scores come out.
      net = slim.conv2d(net, num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='fc8')
      # Convert end_points_collection into a end_point dict.
      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        # Drop the 1x1 spatial dims so classification outputs are [batch, classes].
        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      # Local modification vs. upstream slim: also expose class probabilities.
      end_points['predictions'] = slim.softmax(net, scope='predictions')
      return net, end_points
vgg_16.default_image_size = 224
def vgg_19(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_19',
           fc_conv_padding='VALID'):
  """Oxford Net VGG 19-Layers version E Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    fc_conv_padding: the type of padding to use for the fully connected layer
      that is implemented as a convolutional layer. Use 'SAME' padding if you
      are applying the network in a fully convolutional manner and want to
      get a prediction map downsampled by a factor of 32 as an output.
      Otherwise, the output prediction map will be (input / 32) - 6 in case of
      'VALID' padding.

  Returns:
    the last op containing the log predictions and end_points dict.
  """
  with tf.variable_scope(scope, 'vgg_19', [inputs]) as sc:
    end_points_collection = sc.name + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                        outputs_collections=end_points_collection):
      # VGG-E (19 layers): 2-2-4-4-4 conv layers per stage with 2x2 max-pools.
      net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
      net = slim.max_pool2d(net, [2, 2], scope='pool1')
      net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
      net = slim.max_pool2d(net, [2, 2], scope='pool2')
      net = slim.repeat(net, 4, slim.conv2d, 256, [3, 3], scope='conv3')
      net = slim.max_pool2d(net, [2, 2], scope='pool3')
      net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv4')
      net = slim.max_pool2d(net, [2, 2], scope='pool4')
      net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [2, 2], scope='pool5')
      # Use conv2d instead of fully_connected layers.
      net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout6')
      net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
      net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                         scope='dropout7')
      # Final logits layer: no activation/normalization so raw scores come out.
      net = slim.conv2d(net, num_classes, [1, 1],
                        activation_fn=None,
                        normalizer_fn=None,
                        scope='fc8')
      # Convert end_points_collection into a end_point dict.
      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        # Drop the 1x1 spatial dims so classification outputs are [batch, classes].
        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
vgg_19.default_image_size = 224