Sklearn 0.23.2 Code Conventions
Updated: 2023-01-18
This example solves a multiclass classification problem on structured data with the Sklearn 0.23.2 framework; the training dataset sklearn_train_data.zip can be downloaded here.
Below is the training code for a single hyperparameter combination in a hyperparameter search task. The code receives the values filled in on the platform through the argparse module, so please keep the argument names consistent. The framework supports publishing models saved in both pickle and joblib formats; when publishing to the model repository, select the corresponding model file.
sklearn0.23.2_autosearch.py sample code
Python
# -*- coding:utf-8 -*-
""" sklearn train demo """
import os
import argparse
import time
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import svm
import pandas as pd
import numpy as np
from rudder_autosearch.sdk.amaas_tools import AMaasTools

def parse_arg():
    """parse arguments"""
    parser = argparse.ArgumentParser(description='Sklearn iris Example')
    parser.add_argument('--train_dir', type=str, default='./train_data',
                        help='input data dir for training (default: ./train_data)')
    parser.add_argument('--test_dir', type=str, default='./test_data',
                        help='input data dir for test (default: ./test_data)')
    parser.add_argument('--output_dir', type=str, default='./output',
                        help='output dir for auto_search job (default: ./output)')
    parser.add_argument('--job_id', type=str, default="job-1234",
                        help='auto_search job id')
    parser.add_argument('--trial_id', type=str, default="0-0",
                        help='auto_search id of a single trial')
    parser.add_argument('--metric', type=str, default="f1_score",
                        help='evaluation metric of the model')
    parser.add_argument('--data_sampling_scale', type=float, default=1.0,
                        help='sampling ratio of the dataset for auto_search (default: 1.0)')
    parser.add_argument('--kernel', type=str, default='linear',
                        help='kernel function (default: "linear")')
    parser.add_argument('--C', type=float, default=1,
                        help='penalty term (default: 1)')
    parser.add_argument('--gamma', type=float, default=0.5,
                        help='parameter of the kernel (default: 0.5)')

    args = parser.parse_args()
    args.output_dir = os.path.join(args.output_dir, args.job_id, args.trial_id)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    print("job_id: {}, trial_id: {}".format(args.job_id, args.trial_id))
    return args

def load_data(train_dir, data_sampling_scale):
    """load data"""
    # 150 samples in total: 120 for training and 30 for testing (an 80/20 split)
    # each sample is an ndarray such as [6.4, 3.1, 5.5, 1.8]
    # the uploaded data is stored in ./train_data and ./test_data
    inputdata = pd.read_csv(train_dir + "/iris.csv")
    target = inputdata["Species"]
    inputdata = inputdata.drop(columns=["Species"])
    # 80/20 train/test split
    x_train, x_test, y_train, y_test = train_test_split(inputdata, target, test_size=0.2, random_state=0)
    train_data = np.concatenate([x_train, y_train.ravel().reshape([-1, 1])], axis=1)
    np.random.seed(0)
    np.random.shuffle(train_data)
    # subsample the training set according to data_sampling_scale
    train_data = train_data[0:int(data_sampling_scale * len(train_data))]
    x_train, y_train = train_data[:, 0:-1], train_data[:, -1]
    return (x_train, x_test), (y_train, y_test)

def save_model(model, output_dir):
    """save model in pickle format"""
    with open(output_dir + '/clf.pickle', 'wb') as f:
        pickle.dump(model, f)

def save_model_joblib(model, output_dir):
    """save model in joblib format"""
    try:
        import joblib
    except ImportError:
        from sklearn.externals import joblib
    joblib.dump(model, output_dir + '/clf.pkl')

def evaluate(model, x_test, y_test):
    """evaluate"""
    # micro-averaged f1_score for multiclass classification
    predict = model.predict(x_test)
    f1 = f1_score(y_test, predict, average="micro")
    print("f1_score: %f" % f1)
    return f1

def report_final(args, metric):
    """report the final result"""
    # report the result through the SDK, retrying up to 3 times
    amaas_tools = AMaasTools(args.job_id, args.trial_id)
    metric_dict = {args.metric: metric}
    for i in range(3):
        flag, ret_msg = amaas_tools.report_final_result(metric=metric_dict,
                                                        export_model_path=args.output_dir,
                                                        checkpoint_path="")
        print("End Report, metric:{}, ret_msg:{}".format(metric, ret_msg))
        if flag:
            break
        time.sleep(1)
    assert flag, "Report final result to manager failed! Please check whether the manager's " \
                 "address and status are OK!"

def main():
    """main"""
    # parse arguments
    args = parse_arg()
    # load the dataset
    (x_train, x_test), (y_train, y_test) = load_data(args.train_dir, args.data_sampling_scale)
    # define the model
    model = svm.SVC(C=args.C, kernel=args.kernel, gamma=args.gamma)
    # train the model
    model.fit(x_train, y_train)
    # save the model
    save_model(model, args.output_dir)
    # evaluate the model
    f1 = evaluate(model, x_test, y_test)
    # report the result
    report_final(args, metric=f1)

if __name__ == '__main__':
    main()
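Before publishing a saved model to the model repository, it can be useful to verify the exported file locally. The following is a minimal sketch, not part of the platform code: it assumes the trial output directory produced by the script above (e.g. ./output/job-1234/0-0) and loads the pickle artifact written by save_model(); the joblib variant is shown commented out since main() only calls save_model().
Python
# -*- coding:utf-8 -*-
""" minimal local check of an exported model (illustrative sketch) """
import pickle
import joblib

# assumed trial output directory written by sklearn0.23.2_autosearch.py
output_dir = './output/job-1234/0-0'

# load the pickle artifact saved by save_model()
with open(output_dir + '/clf.pickle', 'rb') as f:
    clf = pickle.load(f)

# alternatively, load the joblib artifact saved by save_model_joblib()
# clf = joblib.load(output_dir + '/clf.pkl')

# predict on one iris-like sample (4 features)
print(clf.predict([[6.4, 3.1, 5.5, 1.8]]))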
The yaml configuration corresponding to the sample code is shown below; please keep the format consistent.
random_search_demo.yml sample content
YAML
# search algorithm parameters
search_strategy:
  algo: RANDOM_SEARCH  # search strategy: random search

# data sampling ratio for a single trial, in %
data_sampling_scale: 100  # int in (0, 100]

# maximum number of search trials
max_trial_num: 10  # int > 0

# evaluation metric parameters
metrics:
  name: f1_score  # evaluation metric | any string, str type
  goal: MAXIMIZE  # maximize/minimize | str type, must be MAXIMIZE or MINIMIZE (in uppercase)
  expected_value: 100  # early-stop threshold in %; the whole search ends once the metric exceeds it | unbounded, int type

# search space
search_space:
  kernel:  # kernel function
    htype: choice
    value: ["linear", "rbf"]
  C:  # penalty term
    htype: loguniform
    value: [0.001, 1000]
  gamma:  # kernel parameter
    htype: loguniform
    value: [0.0001, 1]
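To illustrate how the search space drives each trial: a choice parameter is drawn uniformly from its value list, a loguniform parameter is drawn uniformly on a log scale between its two bounds, and the sampled combination is passed to the training script as argparse flags. The sketch below shows this behavior under those assumptions; the sampling and launch logic is illustrative and is not the platform's actual implementation.
Python
# -*- coding:utf-8 -*-
""" illustrative sketch: sample one trial from the search space and launch it """
import math
import random
import subprocess

def sample_choice(values):
    # choice: pick one entry uniformly at random
    return random.choice(values)

def sample_loguniform(low, high):
    # loguniform: uniform on the log scale between low and high
    return math.exp(random.uniform(math.log(low), math.log(high)))

# one hyperparameter combination drawn from random_search_demo.yml's search_space
params = {
    "kernel": sample_choice(["linear", "rbf"]),
    "C": sample_loguniform(0.001, 1000),
    "gamma": sample_loguniform(0.0001, 1),
}

# hand the sampled combination to the training script via its argparse flags;
# data_sampling_scale of 100% in the yaml corresponds to 1.0 on the command line
cmd = ["python", "sklearn0.23.2_autosearch.py",
       "--kernel", params["kernel"],
       "--C", str(params["C"]),
       "--gamma", str(params["gamma"]),
       "--data_sampling_scale", "1.0"]
print(" ".join(cmd))
subprocess.run(cmd)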