TensorFlow 1.13.2
Last updated: 2023-01-18
TensorFlow
Sample code for an MNIST image-classification task based on the TensorFlow framework. Click here to download the training dataset.
Single-node training (number of compute nodes = 1). Sample code:
Python
import os
import tensorflow as tf
import numpy as np
from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10), with
    # an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to a 4d tensor with the 2nd and 3rd dimensions being
    # image width and height, and the final dimension being the number of
    # color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch.
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Reshape tensor into a batch of vectors.
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)
    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle the training data at the start of every epoch.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                y_train[index:index + batch_size]
            index += batch_size
def main(_):
    work_path = os.getcwd()

    # Download and load the MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)

    # The shape of the downloaded data is (-1, 28, 28), hence we need to
    # reshape it into (-1, 784) to feed into our network. Also, we need to
    # normalize the features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    opt = tf.train.RMSPropOptimizer(0.001)
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        tf.train.StopAtStepHook(last_step=20000),
        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Let GPU memory grow on demand and pin the process to the first GPU.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = '0'

    checkpoint_dir = './checkpoints'
    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    # Export the trained model as a SavedModel with a classification signature.
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    saver = tf.train.Saver()
    inputs_classes = tf.saved_model.utils.build_tensor_info(image)
    outputs_classes = tf.saved_model.utils.build_tensor_info(predict)
    signature = (tf.saved_model.signature_def_utils.build_signature_def(
        inputs={tf.saved_model.signature_constants.CLASSIFY_INPUTS: inputs_classes},
        outputs={tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES: outputs_classes},
        method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

    os.system("rm -rf ./output")
    with tf.Session() as sess:
        sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(sess, checkpoint_file)
        builder = tf.saved_model.builder.SavedModelBuilder('./output')
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images': signature},
            legacy_init_op=legacy_init_op)
        builder.save()


if __name__ == "__main__":
    tf.app.run()
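After training, the script above writes a SavedModel with a single 'predict_images' signature to ./output. As a quick sanity check, the following minimal sketch (not part of the original sample; tensor names are resolved from the exported signature at load time) shows one way to load that SavedModel back and run inference:
Python
import numpy as np
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    # Load the SavedModel exported by the training script above.
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], './output')

    # Resolve input/output tensor names from the 'predict_images' signature.
    sig = meta_graph.signature_def['predict_images']
    input_name = sig.inputs[
        tf.saved_model.signature_constants.CLASSIFY_INPUTS].name
    output_name = sig.outputs[
        tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES].name

    # Run inference on a dummy batch of two flattened 28x28 images.
    images = np.zeros((2, 784), dtype=np.float32)
    classes = sess.run(output_name, feed_dict={input_name: images})
    print(classes)  # predicted class index for each image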
Distributed training (number of compute nodes > 1). Sample code:
Note: the distributed demo does not shard the training data across workers; it is provided for reference only (a possible sharding step is sketched after the code). It is meant to be launched with one process per GPU, e.g. via horovodrun or mpirun.
Python
import os
import tensorflow as tf
import horovod.tensorflow as hvd
import numpy as np
from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10), with
    # an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to a 4d tensor with the 2nd and 3rd dimensions being
    # image width and height, and the final dimension being the number of
    # color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch.
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Reshape tensor into a batch of vectors.
    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)
    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle the training data at the start of every epoch.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                y_train[index:index + batch_size]
            index += batch_size
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()
    work_path = os.getcwd()

    # Download and load the MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)

    # The shape of the downloaded data is (-1, 28, 28), hence we need to
    # reshape it into (-1, 784) to feed into our network. Also, we need to
    # normalize the features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Export a copy of the graph for serving now, before the Horovod
    # distributed optimizer is added to it.
    serve_graph_file = "./serve_graph.meta"
    tf.train.export_meta_graph(serve_graph_file, as_text=True)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())
    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)
    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),
        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=10000 // hvd.size()),
        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    # Horovod: export the SavedModel only on worker 0.
    if hvd.rank() != 0:
        return
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)

    # Rebuild the serving graph (without Horovod ops) and restore the trained
    # weights into it. build_tensor_info only records a tensor's name, dtype,
    # and shape, so the handles from the original graph are sufficient here.
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph(serve_graph_file)
    inputs_classes = tf.saved_model.utils.build_tensor_info(image)
    outputs_classes = tf.saved_model.utils.build_tensor_info(predict)
    signature = (tf.saved_model.signature_def_utils.build_signature_def(
        inputs={tf.saved_model.signature_constants.CLASSIFY_INPUTS: inputs_classes},
        outputs={tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES: outputs_classes},
        method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME))

    os.system("rm -rf ./output")
    with tf.Session() as sess:
        sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(sess, checkpoint_file)
        builder = tf.saved_model.builder.SavedModelBuilder('./output')
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images': signature},
            legacy_init_op=legacy_init_op)
        builder.save()


if __name__ == "__main__":
    tf.app.run()
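As noted above, the distributed demo feeds every worker the full training set, so each sample is processed once per worker per epoch. One possible sharding step is sketched below (the shard_for_rank helper is not part of the original sample): each Horovod rank keeps a disjoint, strided slice of the data before the batch generator is created.
Python
import horovod.tensorflow as hvd

def shard_for_rank(x_train, y_train):
    """Keep only this worker's shard of the training data.

    Rank r takes samples r, r + size, r + 2 * size, ..., so the shards are
    disjoint and nearly equal in length, and each epoch every sample is seen
    by exactly one worker.
    """
    x_shard = x_train[hvd.rank()::hvd.size()]
    y_shard = y_train[hvd.rank()::hvd.size()]
    return x_shard, y_shard

# In main(), after reshaping/normalizing MNIST and before creating the
# batch generator:
#     x_train, y_train = shard_for_rank(x_train, y_train)
Because train_input_generator reshuffles whatever arrays it receives, each worker would still see its shard in a different order every epoch.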