PaddlePaddle 2.0.0rc
更新时间:2023-01-18
Paddle
此处提供基于Paddle框架的MNIST图像分类示例代码,数据集请点击这里下载。
单机训练时(计算节点等于1),示例代码如下:
Python
1import os
2import numpy
3import paddle # 导入paddle模块
4import paddle.fluid as fluid
5import gzip
6import struct
# The training data is expected in a "train_data" directory located under
# the current working directory.
work_path = os.getcwd()
cluster_train_dir = "{}/train_data".format(work_path)
def load_data(file_dir, is_train=True):
    """Load the MNIST images and labels stored in ``file_dir``.

    The gzip archives are decompressed in memory, so the dataset files are
    never modified or deleted and the function can be called repeatedly.
    When an archive is missing but a decompressed file with the same name
    minus ``.gz`` exists, that file is read instead.

    :param file_dir: directory containing the MNIST idx files
    :param is_train: read the ``train-*`` files when true, otherwise the
        ``t10k-*`` files
    :return: ``(images, labels)`` -- a uint8 array of shape
        ``(len(labels), rows * cols)`` and a uint8 array of labels
    """
    prefix = 'train' if is_train else 't10k'
    image_path = file_dir + '/' + prefix + '-images-idx3-ubyte.gz'
    label_path = file_dir + '/' + prefix + '-labels-idx1-ubyte.gz'

    def read_idx_bytes(gz_path):
        # Prefer the archive; fall back to the decompressed sibling file so
        # directories that only hold the raw idx files still load.
        if os.path.exists(gz_path):
            with gzip.open(gz_path, 'rb') as zip_f:
                return zip_f.read()
        with open(gz_path[:-3], 'rb') as raw_f:
            return raw_f.read()

    # Label file: an 8-byte big-endian header (magic, count), then one byte
    # per label.
    label_buf = read_idx_bytes(label_path)
    labels = numpy.frombuffer(
        label_buf, dtype=numpy.uint8, offset=struct.calcsize('>II')).copy()

    # Image file: a 16-byte big-endian header (magic, count, rows, cols),
    # then one byte per pixel.
    image_buf = read_idx_bytes(image_path)
    _, _, rows, cols = struct.unpack_from('>IIII', image_buf, 0)
    pixels = numpy.frombuffer(
        image_buf, dtype=numpy.uint8, offset=struct.calcsize('>IIII'))
    # copy() gives the caller writable arrays that do not alias the buffers.
    images = pixels.reshape(len(labels), rows * cols).copy()
    return images, labels
def reader_creator(file_dir, is_train=True, buffer_size=100):
    """Create a sample reader over the MNIST data found in ``file_dir``.

    :param file_dir: directory handed to ``load_data``
    :param is_train: read the training split when true, otherwise the test
        split
    :param buffer_size: accepted for backward compatibility; it does not
        influence which samples are produced
    :return: a zero-argument generator function that yields
        ``(image_row, int_label)`` pairs in file order
    """
    images, labels = load_data(file_dir, is_train)

    def reader():
        """Yield every loaded sample exactly once.

        :return: generator of ``(image_row, int_label)`` pairs
        """
        # Walk the samples one by one instead of in buffer_size chunks, so
        # a trailing partial chunk is not silently dropped.
        for image, label in zip(images, labels):
            yield image, int(label)

    return reader
def softmax_regression():
    """Build the softmax-regression classifier.

    The network is a single fully connected layer with softmax activation.

    Return:
        predict -- output of the softmax layer (the classification result)
    """
    # Raw input image, declared with shape 1 x 28 x 28.
    image_input = fluid.layers.data(
        name='img', shape=[1, 28, 28], dtype='float32')
    # The output size has to equal the number of digit classes, 10.
    return fluid.layers.fc(input=image_input, size=10, act='softmax')
def multilayer_perceptron():
    """Build the multilayer-perceptron classifier.

    Two fully connected hidden layers with ReLU activation are followed by
    a fully connected output layer with softmax activation.

    Return:
        prediction -- output of the softmax layer (the classification result)
    """
    # Raw input image, declared with shape 1 x 28 x 28.
    features = fluid.layers.data(
        name='img', shape=[1, 28, 28], dtype='float32')
    # Stack the two ReLU hidden layers, 200 units each.
    for _ in range(2):
        features = fluid.layers.fc(input=features, size=200, act='relu')
    # The output size has to equal the number of digit classes, 10.
    return fluid.layers.fc(input=features, size=10, act='softmax')
def convolutional_neural_network():
    """Build the convolutional classifier.

    The 2-D input image passes through two convolution-pooling stages
    (batch normalization is applied after the first one) and a fully
    connected softmax output layer.

    Return:
        predict -- output of the softmax layer (the classification result)
    """
    # Raw input image, declared with shape 1 x 28 x 28.
    image_input = fluid.layers.data(
        name='img', shape=[1, 28, 28], dtype='float32')
    # Settings shared by both stages: 5x5 filters, pooling size 2 with
    # stride 2, ReLU activation.
    stage_options = dict(filter_size=5, pool_size=2, pool_stride=2, act="relu")
    # First stage: 20 filters, followed by batch normalization.
    stage_one = fluid.nets.simple_img_conv_pool(
        input=image_input, num_filters=20, **stage_options)
    stage_one = fluid.layers.batch_norm(stage_one)
    # Second stage: 50 filters.
    stage_two = fluid.nets.simple_img_conv_pool(
        input=stage_one, num_filters=50, **stage_options)
    # The output size has to equal the number of digit classes, 10.
    return fluid.layers.fc(input=stage_two, size=10, act='softmax')
def train_program():
    """Assemble the training network.

    Switches Paddle to static-graph mode, declares the label input, builds
    the classifier and attaches the loss and accuracy computations.

    Return:
        predict -- the classification result
        [avg_cost, acc] -- mean loss and classification accuracy
    """
    paddle.enable_static()
    # Label input named "label": the class of each input image.
    target = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # Swap in one of the alternatives below to use a different classifier:
    # predict = softmax_regression()  # softmax regression
    # predict = multilayer_perceptron()  # multilayer perceptron
    predict = convolutional_neural_network()  # LeNet5-style CNN
    # Cross-entropy between the prediction and the label, per sample.
    per_sample_cost = fluid.layers.cross_entropy(input=predict, label=target)
    mean_cost = fluid.layers.mean(per_sample_cost)
    accuracy = fluid.layers.accuracy(input=predict, label=target)
    metrics = [mean_cost, accuracy]
    return predict, metrics
def optimizer_program():
    """Create the optimizer used for training.

    :return: an Adam optimizer with a learning rate of 0.001
    """
    adam = fluid.optimizer.Adam(learning_rate=0.001)
    return adam
# Number of samples in one mini-batch.
BATCH_SIZE = 64
# Training samples are shuffled through a 500-sample buffer and then grouped
# into batches of BATCH_SIZE.
train_reader = paddle.batch(
    reader=paddle.reader.shuffle(
        reader=reader_creator(
            cluster_train_dir, is_train=True, buffer_size=100),
        buf_size=500),
    batch_size=BATCH_SIZE)
# Test samples are batched in file order, BATCH_SIZE at a time.
test_reader = paddle.batch(
    reader=reader_creator(cluster_train_dir, is_train=False, buffer_size=100),
    batch_size=BATCH_SIZE)
def event_handler(pass_id, batch_id, cost):
    """Print one line of training progress.

    :param pass_id: number printed after "Pass"
    :param batch_id: number printed after "Batch"
    :param cost: loss value printed after "Cost"
    """
    progress = (pass_id, batch_id, cost)
    print("Pass %d, Batch %d, Cost %f" % progress)
# The model runs on a single CPU.
place = fluid.CPUPlace()
# Build the network; train_program returns the prediction plus the mean
# loss and the accuracy.
prediction, (avg_loss, acc) = train_program()
# Raw input image, declared with shape 1 x 28 x 28.
img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
# Label input named "label": the class of each input image.
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
# Each fed sample has two parts: the img value first, then the label value.
feeder = fluid.DataFeeder(place=place, feed_list=[img, label])
# Optimize the mean loss with Adam.
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_loss)
# Number of training passes (epochs).
PASS_NUM = 1
epochs = list(range(PASS_NUM))
# Directory the trained model is written to.
save_dirname = "./output/"
def train_test(train_test_program,
               train_test_feed, train_test_reader):
    """Evaluate ``train_test_program`` on every batch of a reader.

    Relies on the module-level ``exe``, ``acc`` and ``avg_loss``.

    :param train_test_program: program passed to ``exe.run``
    :param train_test_feed: feeder whose ``feed`` method converts a batch
    :param train_test_reader: zero-argument callable returning the batches
    :return: ``(mean loss, mean accuracy)`` over all batches
    """
    # One (accuracy, loss) pair per evaluated batch.
    batch_results = []
    for batch in train_test_reader():
        acc_np, avg_loss_np = exe.run(
            program=train_test_program,
            feed=train_test_feed.feed(batch),
            fetch_list=[acc, avg_loss])
        batch_results.append((float(acc_np), float(avg_loss_np)))
    accuracies = [pair[0] for pair in batch_results]
    losses = [pair[1] for pair in batch_results]
    # Average the per-batch values over the whole reader.
    mean_accuracy = numpy.array(accuracies).mean()
    mean_loss = numpy.array(losses).mean()
    return mean_loss, mean_accuracy
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
main_program = fluid.default_main_program()
# NOTE(review): this clone is taken after optimizer.minimize() has already
# been called; the Paddle documentation recommends cloning the test program
# before minimize() -- confirm the clone contains no optimizer ops.
test_program = fluid.default_main_program().clone(for_test=True)
# One (epoch_id, avg_loss, acc) tuple per finished epoch.
lists = []
# Global training step counter across all epochs.
step = 0
for epoch_id in epochs:
    for data in train_reader():
        metrics = exe.run(main_program,
                          feed=feeder.feed(data),
                          fetch_list=[avg_loss, acc])
        if step % 100 == 0:
            # Report every 100 steps. event_handler expects
            # (pass_id, batch_id, cost), so the epoch goes first.
            event_handler(epoch_id, step, metrics[0])
        step += 1
    # Measure the classification quality after each epoch.
    avg_loss_val, acc_val = train_test(train_test_program=test_program,
                                       train_test_reader=test_reader,
                                       train_test_feed=feeder)
    print("Test with Epoch %d, avg_cost: %s, acc: %s" % (epoch_id, avg_loss_val, acc_val))
    lists.append((epoch_id, avg_loss_val, acc_val))
    # Save the trained model parameters for later inference.
    if save_dirname is not None:
        fluid.io.save_inference_model(save_dirname,
                                      ["img"], [prediction], exe,
                                      model_filename='model',
                                      params_filename='params')
# Pick the pass with the lowest test loss; skipped when nothing was trained.
if lists:
    best = min(lists, key=lambda result: float(result[1]))
    print('Best pass is %s, testing Avgcost is %s' % (best[0], best[1]))
    print('The classification accuracy is %.2f%%' % (float(best[2]) * 100))
216
分布式训练时(计算节点大于1),示例代码如下。说明:该分布式示例程序没有对数据做分片处理(每个计算节点都会读取完整的数据集),仅供参考。
Python
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3"""
4"""
5import os
6import gzip
7import struct
8import numpy as np
9from PIL import Image
10import time
11import paddle
12import paddle.distributed.fleet as fleet
13import paddle.static.nn as nn
14import paddle.fluid as fluid
15from paddle.io import Dataset
# File names of the gzip-compressed MNIST idx files inside the data directory.
TEST_IMAGE = 't10k-images-idx3-ubyte.gz'
TEST_LABEL = 't10k-labels-idx1-ubyte.gz'
TRAIN_IMAGE = 'train-images-idx3-ubyte.gz'
TRAIN_LABEL = 'train-labels-idx1-ubyte.gz'


class MNIST(Dataset):
    """MNIST dataset read from gzip-compressed idx files in a local directory.

    The whole split is loaded into memory when the object is created.
    ``self.images`` holds one flat float32 array per sample and
    ``self.labels`` one int64 array of shape ``(1,)`` per sample.
    """

    def __init__(self,
                 data_dir=None,
                 mode='train',
                 transform=None,
                 backend=None):
        """Load the requested split from ``data_dir``.

        Args:
            data_dir: directory containing the four MNIST ``.gz`` files.
            mode: ``'train'`` or ``'test'`` (case-insensitive).
            transform: optional callable applied to each image in
                ``__getitem__``.
            backend: ``'pil'`` or ``'cv2'``; when ``None`` the value of
                ``paddle.vision.get_image_backend()`` is used.

        Raises:
            AssertionError: if ``mode`` is neither ``'train'`` nor ``'test'``.
            ValueError: if ``backend`` is neither ``'pil'`` nor ``'cv2'``.
        """
        assert mode.lower() in ['train', 'test'], \
            "mode should be 'train' or 'test', but got {}".format(mode)
        if backend is None:
            backend = paddle.vision.get_image_backend()
        if backend not in ['pil', 'cv2']:
            raise ValueError(
                "Expected backend are one of ['pil', 'cv2'], but got {}"
                .format(backend))
        self.backend = backend
        self.mode = mode.lower()
        if self.mode == 'train':
            self.image_path = os.path.join(data_dir, TRAIN_IMAGE)
            self.label_path = os.path.join(data_dir, TRAIN_LABEL)
        else:
            self.image_path = os.path.join(data_dir, TEST_IMAGE)
            self.label_path = os.path.join(data_dir, TEST_LABEL)
        self.transform = transform
        # read dataset into memory
        self._parse_dataset()
        self.dtype = paddle.get_default_dtype()

    def _parse_dataset(self, buffer_size=100):
        """Decode the image and label files into ``self.images``/``self.labels``.

        Args:
            buffer_size: accepted for backward compatibility; the files are
                decoded in a single pass, so the sample count does not have
                to be a multiple of it.
        """
        with gzip.GzipFile(self.image_path, 'rb') as image_file:
            img_buf = image_file.read()
        with gzip.GzipFile(self.label_path, 'rb') as label_file:
            lab_buf = label_file.read()

        # Both headers are big-endian. Image file: 16 bytes
        # (magic, count, rows, cols). Label file: 8 bytes (magic, count).
        img_header = '>IIII'
        _, _, rows, cols = struct.unpack_from(img_header, img_buf, 0)
        lab_header = '>II'
        _, label_num = struct.unpack_from(lab_header, lab_buf, 0)

        # The label count from the header decides how many samples are read
        # from both files.
        pixels_per_image = rows * cols
        labels = np.frombuffer(
            lab_buf, dtype=np.uint8, count=label_num,
            offset=struct.calcsize(lab_header))
        images = np.frombuffer(
            img_buf, dtype=np.uint8, count=label_num * pixels_per_image,
            offset=struct.calcsize(img_header))
        images = images.reshape(label_num, pixels_per_image).astype('float32')

        # Remember the image size from the header for __getitem__.
        self._image_shape = [rows, cols]
        self.images = list(images)
        self.labels = list(labels.astype('int64').reshape(-1, 1))

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at ``idx``.

        The flat image is reshaped to the rows x cols size read from the
        file header. With the ``'pil'`` backend it is converted to a
        grayscale PIL image and returned without a dtype cast; otherwise it
        is cast to ``self.dtype``. ``self.transform`` is applied when set.
        """
        image, label = self.images[idx], self.labels[idx]
        image = np.reshape(image, self._image_shape)
        if self.backend == 'pil':
            image = Image.fromarray(image.astype('uint8'), mode='L')
        if self.transform is not None:
            image = self.transform(image)
        if self.backend == 'pil':
            return image, label.astype('int64')
        return image.astype(self.dtype), label.astype('int64')

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.labels)
def mlp_model():
    """Build a static-graph MLP classifier for 28 x 28 images.

    The batch dimension is left unspecified, so the network (and an
    inference model exported from it) accepts any batch size rather than
    exactly 64 samples.

    Returns:
        ``[x, y, prediction, avg_cost, acc_top1]`` -- the image input, the
        label input, the softmax output, the mean cross-entropy loss and the
        top-1 accuracy.
    """
    x = paddle.static.data(name="x", shape=[None, 28, 28], dtype='float32')
    y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
    # Flatten each image to a 784-element vector; -1 keeps the batch size
    # dynamic.
    x_flatten = paddle.reshape(x, [-1, 784])
    fc_1 = nn.fc(x=x_flatten, size=128, activation='tanh')
    fc_2 = nn.fc(x=fc_1, size=128, activation='tanh')
    prediction = nn.fc(x=fc_2, size=10, activation='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=y)
    acc_top1 = paddle.metric.accuracy(input=prediction, label=y, k=1)
    avg_cost = paddle.mean(x=cost)
    res = [x, y, prediction, avg_cost, acc_top1]
    return res
def train(epoch, exe, train_dataloader, cost, acc):
    """Run one pass over the training data and log progress every 100 steps.

    Args:
        epoch: epoch number, used only in the log line.
        exe: executor whose ``run`` method executes the program.
        train_dataloader: zero-argument callable returning the feed batches.
        cost: variable whose ``name`` is fetched as the loss.
        acc: variable whose ``name`` is fetched as the accuracy.

    The logged "total time cost" is the summed ``exe.run`` time of all steps
    so far in this call (data loading is not included) and "speed" is the
    number of steps per second over the last 100 steps.
    """
    log_interval = 100
    total_time = 0.0
    # exe.run time accumulated since the previous log line.
    window_time = 0.0
    main_program = paddle.static.default_main_program()
    for step, data in enumerate(train_dataloader(), start=1):
        start_time = time.time()
        loss_val, acc_val = exe.run(
            main_program,
            feed=data, fetch_list=[cost.name, acc.name])
        step_time = time.time() - start_time
        # Accumulate every step, not only the logged one, so the reported
        # total is the real elapsed compute time.
        total_time += step_time
        window_time += step_time
        if step % log_interval == 0:
            # A coarse clock can report zero elapsed time; avoid dividing
            # by zero in that case.
            speed = log_interval / window_time if window_time > 0 else float('inf')
            print(
                "epoch: %d, step:%d, train_loss: %f, train_acc: %f, total time cost = %f, speed: %f"
                % (epoch, step, loss_val[0], acc_val[0], total_time, speed))
            window_time = 0.0
def test(exe, test_dataloader, cost, acc):
    """Run one pass over the test data and log metrics every 100 steps.

    The program executed is whatever ``paddle.static.default_main_program()``
    returns when this function is called; no separate evaluation program is
    built here.

    Args:
        exe: executor whose ``run`` method executes the program.
        test_dataloader: zero-argument callable returning the feed batches.
        cost: variable whose ``name`` is fetched as the loss.
        acc: variable whose ``name`` is fetched as the accuracy.

    The logged "total time cost" is the summed ``exe.run`` time of all steps
    so far in this call (data loading is not included) and "speed" is the
    number of steps per second over the last 100 steps.
    """
    log_interval = 100
    total_time = 0.0
    # exe.run time accumulated since the previous log line.
    window_time = 0.0
    program = paddle.static.default_main_program()
    for step, data in enumerate(test_dataloader(), start=1):
        start_time = time.time()
        loss_val, acc_val = exe.run(
            program,
            feed=data, fetch_list=[cost.name, acc.name])
        step_time = time.time() - start_time
        # Accumulate every step, not only the logged one, so the reported
        # total is the real elapsed compute time.
        total_time += step_time
        window_time += step_time
        if step % log_interval == 0:
            # A coarse clock can report zero elapsed time; avoid dividing
            # by zero in that case.
            speed = log_interval / window_time if window_time > 0 else float('inf')
            print(
                "step:%d, test_loss: %f, test_acc: %f, total time cost = %f, speed: %f"
                % (step, loss_val[0], acc_val[0], total_time, speed))
            window_time = 0.0
def save(save_dir, feed_vars, fetch_vars, exe):
    """Export the inference model under ``save_dir`` with the prefix ``model``.

    Only the worker for which ``fleet.is_first_worker()`` is true writes the
    model; on every other worker the call does nothing.

    Args:
        save_dir: directory the model files are written to.
        feed_vars: input variables of the exported model.
        fetch_vars: output variables of the exported model.
        exe: executor passed to ``paddle.static.save_inference_model``.
    """
    target_prefix = os.path.join(save_dir, 'model')
    if not fleet.is_first_worker():
        return
    paddle.static.save_inference_model(
        target_prefix, feed_vars, fetch_vars, exe)
if __name__ == '__main__':
    # Training set directory.
    train_data = './train_data'
    # Validation set directory.
    test_data = './test_data'
    # Output directory for exported models.
    save_dir = './output'
    # Number of training epochs.
    epochs = 10
    # Validate every `test_interval` epochs.
    test_interval = 2
    # Export the model every `save_interval` epochs.
    save_interval = 2
    paddle.enable_static()
    paddle.vision.set_image_backend('cv2')
    # Training dataset.
    train_dataset = MNIST(data_dir=train_data, mode='train')
    # Validation dataset.
    test_dataset = MNIST(data_dir=test_data, mode='test')
    # Build the model.
    [x, y, pred, cost, acc] = mlp_model()
    place = paddle.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
    # Data loading.
    train_dataloader = paddle.io.DataLoader(
        train_dataset, feed_list=[x, y], drop_last=True,
        places=place, batch_size=64, shuffle=True, return_list=False)
    test_dataloader = paddle.io.DataLoader(
        test_dataset, feed_list=[x, y], drop_last=True,
        places=place, batch_size=64, return_list=False)
    # Initialize fleet for collective training.
    strategy = fleet.DistributedStrategy()
    fleet.init(is_collective=True, strategy=strategy)
    # Take a forward-only copy of the program BEFORE minimize() appends the
    # backward and optimizer ops; evaluating with the training program would
    # update the weights on the validation data.
    test_prog = paddle.static.default_main_program().clone(for_test=True)
    # Set up the optimizer.
    optimizer = paddle.optimizer.Adam()
    optimizer = fleet.distributed_optimizer(optimizer)
    optimizer.minimize(cost)
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())
    for epoch in range(epochs):
        train(epoch, exe, train_dataloader, cost, acc)
        # Count finished epochs (epoch + 1) so that the final epoch is
        # validated and saved as well.
        if (epoch + 1) % test_interval == 0:
            # test() runs the current default main program, so make the
            # forward-only copy the default while it executes.
            with paddle.static.program_guard(test_prog):
                test(exe, test_dataloader, cost, acc)
        # save model
        if (epoch + 1) % save_interval == 0:
            save(save_dir, [x], [pred], exe)