TensorFlow 1.13.2 Code Specification
Updated: 2023-01-18
MNIST image classification based on the TensorFlow 1.13.2 framework; the training dataset tf_train_data2.zip can be downloaded here.
Below is the training code for a single hyperparameter combination in a hyperparameter search job. The code receives the values configured on the platform through the argparse module; please keep them consistent.
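For reference, the sketch below shows how the platform-supplied hyperparameters reach the script: they are appended as command-line flags and parsed with argparse. The flag values here are hypothetical and only illustrate the interface.
Python
# Minimal, self-contained sketch of the argument interface (illustrative only;
# the concrete flag values are hypothetical and would normally be filled in by
# the platform for each trial).
import argparse

parser = argparse.ArgumentParser(description='argparse interface sketch')
parser.add_argument('--batch_size', type=int, default=100)
parser.add_argument('--lr', type=float, default=0.001)
parser.add_argument('--last_step', type=int, default=20000)

args = parser.parse_args(['--batch_size', '200', '--lr', '0.01', '--last_step', '50000'])
print(args.batch_size, args.lr, args.last_step)  # -> 200 0.01 50000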
Sample code: tensorflow1.13.2_autosearch.py
Python
# -*- coding:utf-8 -*-
""" tensorflow1 train demo """
import os
import time
import argparse

import numpy as np
import tensorflow as tf
from tensorflow import keras

from rudder_autosearch.sdk.amaas_tools import AMaasTools

tf.logging.set_verbosity(tf.logging.INFO)

def parse_arg():
    """parse arguments"""
    parser = argparse.ArgumentParser(description='tensorflow1.13.2 mnist Example')
    parser.add_argument('--train_dir', type=str, default='./train_data',
                        help='input data dir for training (default: ./train_data)')
    parser.add_argument('--test_dir', type=str, default='./test_data',
                        help='input data dir for test (default: ./test_data)')
    parser.add_argument('--output_dir', type=str, default='./output',
                        help='output dir for auto_search job (default: ./output)')
    parser.add_argument('--job_id', type=str, default="job-1234",
                        help='auto_search job id (default: "job-1234")')
    parser.add_argument('--trial_id', type=str, default="0-0",
                        help='auto_search id of a single trial (default: "0-0")')
    parser.add_argument('--metric', type=str, default="acc",
                        help='evaluation metric of the model')
    parser.add_argument('--data_sampling_scale', type=float, default=1.0,
                        help='sampling ratio of the data (default: 1.0)')
    parser.add_argument('--batch_size', type=int, default=100,
                        help='number of images input in an iteration (default: 100)')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='learning rate of the training (default: 0.001)')
    parser.add_argument('--last_step', type=int, default=20000,
                        help='number of steps to train (default: 20000)')
    args = parser.parse_args()
    args.output_dir = os.path.join(args.output_dir, args.job_id, args.trial_id)
    print("job_id: {}, trial_id: {}".format(args.job_id, args.trial_id))
    return args

def load_data(data_sampling_scale):
    """load data"""
    work_path = os.getcwd()
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)
    # randomly sample a subset of the training data
    np.random.seed(0)
    sample_data_num = int(data_sampling_scale * len(x_train))
    idx = np.arange(len(x_train))
    np.random.shuffle(idx)
    x_train, y_train = x_train[idx[:sample_data_num]], y_train[idx[:sample_data_num]]
    # The downloaded data has shape (-1, 28, 28), so reshape it to (-1, 784)
    # to feed into the network, and normalize the features to the range [0, 1].
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0
    return (x_train, x_test), (y_train, y_test)

def train_input_generator(x_train, y_train, batch_size=64):
    """train_input_generator"""
    assert len(x_train) == len(y_train)
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size]
            index += batch_size

def conv_model(feature, target, mode):
    """2-layer convolution model."""
    # Convert the target to a one-hot tensor of shape (batch_size, 10) with
    # an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)
    # Reshape feature to a 4-D tensor: the 2nd and 3rd dimensions are image
    # width and height, and the final dimension is the number of color channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])
    # First conv layer computes 32 features for each 5x5 patch.
    with tf.variable_scope('conv_layer1'):
        h_conv1 = tf.layers.conv2d(feature, 32, kernel_size=[5, 5],
                                   activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # Second conv layer computes 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = tf.layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                   activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Reshape the tensor into a batch of vectors.
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
    # Densely connected layer with 1024 neurons, with dropout during training.
    h_fc1 = tf.layers.dropout(
        tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (one per class) and the loss.
    logits = tf.layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)
    return tf.argmax(logits, 1), loss

class Model(object):
    """MNIST CNN model: graph construction, training, export and evaluation."""

    def __init__(self, args, train_test_data):
        self.args = args
        self.create_model()
        (self.x_train, self.x_test), (self.y_train, self.y_test) = train_test_data

    def create_model(self):
        """create_model"""
        with tf.name_scope('input'):
            self.image = tf.placeholder(tf.float32, [None, 784], name='image')
            self.label = tf.placeholder(tf.float32, [None], name='label')
        self.predict, self.loss = conv_model(self.image, self.label, tf.estimator.ModeKeys.TRAIN)
        opt = tf.train.RMSPropOptimizer(self.args.lr)
        self.global_step = tf.train.get_or_create_global_step()
        self.train_op = opt.minimize(self.loss, global_step=self.global_step)

    def run_train(self):
        """run_train"""
        hooks = [
            tf.train.StopAtStepHook(last_step=self.args.last_step),
            tf.train.LoggingTensorHook(tensors={'step': self.global_step, 'loss': self.loss},
                                       every_n_iter=10),
        ]
        # Pin the session to the first visible GPU and allocate memory on demand.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = '0'
        # Start every trial from an empty checkpoint directory.
        self.checkpoint_dir = '/checkpoints'
        os.system("rm -rf " + self.checkpoint_dir)
        training_batch_generator = train_input_generator(self.x_train,
                                                         self.y_train, batch_size=self.args.batch_size)
        # The MonitoredTrainingSession takes care of session initialization,
        # restoring from a checkpoint, saving to a checkpoint, and closing when
        # done or when an error occurs.
        with tf.train.MonitoredTrainingSession(checkpoint_dir=self.checkpoint_dir,
                                               hooks=hooks,
                                               config=config) as mon_sess:
            while not mon_sess.should_stop():
                # Run a training step synchronously.
                image_, label_ = next(training_batch_generator)
                mon_sess.run(self.train_op, feed_dict={self.image: image_, self.label: label_})

    def save_model(self):
        """save_model"""
        saver = tf.train.Saver()
        inputs_classes = tf.saved_model.utils.build_tensor_info(self.image)
        outputs_classes = tf.saved_model.utils.build_tensor_info(self.predict)
        signature = tf.saved_model.signature_def_utils.build_signature_def(
            inputs={tf.saved_model.signature_constants.CLASSIFY_INPUTS: inputs_classes},
            outputs={tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES: outputs_classes},
            method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME)

        with tf.Session() as sess:
            sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
            saver.restore(sess, tf.train.latest_checkpoint(self.checkpoint_dir))
            model_output_dir = self.args.output_dir
            builder = tf.saved_model.builder.SavedModelBuilder(model_output_dir)
            legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
            builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING],
                                                 signature_def_map={'predict_images': signature},
                                                 legacy_init_op=legacy_init_op)
            builder.save()

    def evaluate(self):
        """evaluate"""
        with tf.Session() as sess:
            sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(self.checkpoint_dir))
            y_pred = sess.run(self.predict, feed_dict={self.image: self.x_test})
            self.acc = sum(y_pred == self.y_test) / len(y_pred)
            print("accuracy: %f" % self.acc)
            return self.acc

def report_final(args, metric):
    """report_final_result"""
    # report the final result to the manager via the SDK
    amaas_tools = AMaasTools(args.job_id, args.trial_id)
    metric_dict = {args.metric: metric}
    for i in range(3):
        flag, ret_msg = amaas_tools.report_final_result(metric=metric_dict,
                                                        export_model_path=args.output_dir,
                                                        checkpoint_path="")
        print("End Report, metric:{}, ret_msg:{}".format(metric, ret_msg))
        if flag:
            break
        time.sleep(1)
    assert flag, "Failed to report the final result to the manager! " \
                 "Please check whether the manager's address and status are OK."

def main(_):
    """main"""
    # parse arguments
    args = parse_arg()
    # load the dataset
    train_test_data = load_data(args.data_sampling_scale)
    # define the model
    model = Model(args, train_test_data)
    # train the model
    model.run_train()
    # save the model
    model.save_model()
    # evaluate the model
    acc = model.evaluate()
    # report the result
    report_final(args, metric=acc)

if __name__ == "__main__":
    tf.app.run()
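After a trial finishes, the SavedModel exported under args.output_dir can be loaded back for a quick sanity check. The snippet below is a minimal sketch assuming the default job_id/trial_id path shown above; it is not part of the required sample.
Python
# Sanity-check sketch: load the exported SavedModel and run one prediction.
# The export path and the all-zero input are assumptions for illustration.
import numpy as np
import tensorflow as tf

export_dir = "./output/job-1234/0-0"  # args.output_dir of one trial
with tf.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    sig = meta_graph.signature_def["predict_images"]
    image_name = sig.inputs[tf.saved_model.signature_constants.CLASSIFY_INPUTS].name
    pred_name = sig.outputs[tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES].name
    dummy_image = np.zeros((1, 784), dtype=np.float32)  # one blank 28x28 image
    print(sess.run(pred_name, feed_dict={image_name: dummy_image}))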
The YAML configuration corresponding to the sample code is shown below; please keep the format consistent.
Sample content: tpe_search_demo.yml
Plain Text
# search algorithm parameters
search_strategy:
  algo: TPE_SEARCH        # search strategy: Bayesian search (TPE)
  params:
    n_startup_points: 5   # number of initial points | [1,20], int
    max_concurrent: 5     # maximum concurrency | [1,20], int

# sampling ratio of the data for a single trial, in %
data_sampling_scale: 100  # | (0,100], int

# maximum number of search trials
max_trial_num: 10         # | >0, int

# evaluation metric parameters
metrics:
  name: acc               # evaluation metric | any string, str
  goal: MAXIMIZE          # maximize or minimize | str, must be either MAXIMIZE or MINIMIZE (uppercase)
  expected_value: 100     # early-stopping threshold, in %: the whole search ends once the metric exceeds this value | unrestricted, int

# hyperparameter search space
search_space:
  batch_size:
    htype: choice
    value: [100, 200, 300, 400, 500, 600]
  lr:
    htype: loguniform
    value: [0.0001, 0.1]
  last_step:
    htype: choice
    value: [20000, 50000, 100000]
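The search_space keys above map one-to-one to the script's command-line flags (--batch_size, --lr, --last_step). As a rough illustration of the loguniform space for lr, one common reading is uniform sampling in log space between the two bounds; the platform's actual sampler may differ.
Python
# Illustrative only: one common interpretation of a loguniform search space,
# i.e. sampling uniformly in log space between the given bounds. This is an
# assumption about the semantics, not the platform's actual implementation.
import numpy as np

low, high = 1e-4, 1e-1  # the "lr" bounds from search_space above
lr_sample = np.exp(np.random.uniform(np.log(low), np.log(high)))
print("sampled lr: %.6f" % lr_sample)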