TensorFlow 1.13.2

Sample code for an MNIST image-classification task based on the TensorFlow framework. Click here to download the training dataset.
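Both sample scripts below read the dataset from train_data/mnist.npz under the job's working directory. A minimal sanity check of that layout (the path comes from the sample code; the assertion message is illustrative):

import os

# The sample scripts load the dataset from <working dir>/train_data/mnist.npz.
data_path = os.path.join(os.getcwd(), 'train_data', 'mnist.npz')
assert os.path.isfile(data_path), 'place the downloaded mnist.npz under ./train_data/'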

Single-machine training (number of compute nodes = 1); sample code:

import os

import numpy as np
import tensorflow as tf
from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10),
    # with an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to a 4-D tensor whose 2nd and 3rd dimensions are the
    # image width and height, and whose final dimension is the number of
    # color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer computes 32 features for each 5x5 patch.
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer computes 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Reshape the tensor into a batch of vectors.
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and the loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)
    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle the data at the start of every epoch.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size]
            index += batch_size


def main(_):
    work_path = os.getcwd()
    # Load the MNIST dataset (downloads it if the file is absent).
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)

    # The loaded data has shape (-1, 28, 28), so reshape it into (-1, 784)
    # to feed into our network, and normalize the features to [0, 1].
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build the model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    opt = tf.train.RMSPropOptimizer(0.001)
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        tf.train.StopAtStepHook(last_step=20000),
        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Allow GPU memory to grow on demand and pin the process to GPU 0.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = '0'

    checkpoint_dir = './checkpoints'
    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or when an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    # Export the trained model in SavedModel format for serving.
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    saver = tf.train.Saver()
    inputs_classes = tf.saved_model.utils.build_tensor_info(image)
    outputs_classes = tf.saved_model.utils.build_tensor_info(predict)
    signature = (tf.saved_model.signature_def_utils.build_signature_def(
        inputs={tf.saved_model.signature_constants.CLASSIFY_INPUTS: inputs_classes},
        outputs={tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES: outputs_classes},
        method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))
    os.system("rm -rf ./output")
    with tf.Session() as sess:
        sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(sess, checkpoint_file)
        builder = tf.saved_model.builder.SavedModelBuilder('./output')
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images': signature},
            legacy_init_op=legacy_init_op)
        builder.save()


if __name__ == "__main__":
    tf.app.run()
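To sanity-check the exported model, the SavedModel written to ./output can be loaded back and queried through the 'predict_images' signature it was saved with. A minimal sketch, assuming the training script above has already run (the all-zero test image is purely illustrative):

import numpy as np
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    # Load the SavedModel exported by the script above.
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], './output')
    # Look up the tensor names recorded in the 'predict_images' signature.
    sig = meta_graph.signature_def['predict_images']
    image_name = sig.inputs[tf.saved_model.signature_constants.CLASSIFY_INPUTS].name
    predict_name = sig.outputs[tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES].name
    image = sess.graph.get_tensor_by_name(image_name)
    predict = sess.graph.get_tensor_by_name(predict_name)
    # Run one prediction on a dummy all-zero image.
    print(sess.run(predict, feed_dict={image: np.zeros((1, 784), np.float32)}))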

Distributed training (number of compute nodes > 1); sample code:

Note: this distributed demo does not shard the training data across workers; it is for reference only. A possible sharding approach is sketched below, followed by the full sample code.
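One simple way to add sharding, shown here as a standalone helper rather than part of the official sample: give each Horovod rank a disjoint strided slice of the arrays returned by load_data, so workers do not train on identical data. hvd.rank() and hvd.size() are the same Horovod calls used in the sample below; the slicing scheme itself is an illustrative choice.

import horovod.tensorflow as hvd

def shard_dataset(x, y):
    """Keep every hvd.size()-th example, offset by this worker's rank,
    so each worker trains on a disjoint subset of the data."""
    return x[hvd.rank()::hvd.size()], y[hvd.rank()::hvd.size()]

# In main(), after loading and reshaping MNIST, one could call:
#   x_train, y_train = shard_dataset(x_train, y_train)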

import os

import horovod.tensorflow as hvd
import numpy as np
import tensorflow as tf
from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10),
    # with an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to a 4-D tensor whose 2nd and 3rd dimensions are the
    # image width and height, and whose final dimension is the number of
    # color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer computes 32 features for each 5x5 patch.
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer computes 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Reshape the tensor into a batch of vectors.
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and the loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)
    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle the data at the start of every epoch.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size]
            index += batch_size


def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    work_path = os.getcwd()
    # Load the MNIST dataset (downloads it if the file is absent).
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)

    # The loaded data has shape (-1, 28, 28), so reshape it into (-1, 784)
    # to feed into our network, and normalize the features to [0, 1].
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build the model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Export the inference graph (before the optimizer is added) so a clean
    # serving graph can be re-imported after training.
    serve_graph_file = "./serve_graph.meta"
    tf.train.export_meta_graph(serve_graph_file, as_text=True)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())
    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),
        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=10000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None

    training_batch_generator = train_input_generator(x_train,
                                                     y_train, batch_size=100)
    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or when an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    # Only worker 0 exports the SavedModel.
    if hvd.rank() != 0:
        return

    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    # Re-import the clean serving graph and restore the trained weights.
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph(serve_graph_file)
    inputs_classes = tf.saved_model.utils.build_tensor_info(image)
    outputs_classes = tf.saved_model.utils.build_tensor_info(predict)
    signature = (tf.saved_model.signature_def_utils.build_signature_def(
        inputs={tf.saved_model.signature_constants.CLASSIFY_INPUTS: inputs_classes},
        outputs={tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES: outputs_classes},
        method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))
    os.system("rm -rf ./output")
    with tf.Session() as sess:
        sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(sess, checkpoint_file)
        builder = tf.saved_model.builder.SavedModelBuilder('./output')
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images': signature},
            legacy_init_op=legacy_init_op)
        builder.save()


if __name__ == "__main__":
    tf.app.run()
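When the number of compute nodes is greater than 1, the platform is expected to launch one such process per node. To experiment with the script outside BML, Horovod's standard launcher can be used, e.g. horovodrun -np 4 -H localhost:4 python train.py to run four processes on a single 4-GPU machine (the script name and process count here are placeholders).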