XGBoost 1.3.1代码规范
更新时间:2023-01-18
XGBoost 1.3.1代码规范
基于XGBoost 1.3.1框架的结构化数据的回归问题。
如下所示是其超参搜索任务中一个超参数组合的训练代码,代码会通过argparse模块接受在平台中填写的信息,请保持一致。另外该框架支持发布保存模型为pickle和joblib格式,并且在发布至模型仓库时需要选择相应的模型文件。
xgboost1.3.1_autosearch.py示例代码
Python
1# -*- coding:utf-8 -*-
2""" xgboost train demo """
3import xgboost as xgb
4from sklearn.model_selection import train_test_split
5from sklearn import datasets
6from sklearn.metrics import mean_squared_error
7import numpy as np
8import os
9import time
10import argparse
11from rudder_autosearch.sdk.amaas_tools import AMaasTools
12
def parse_arg():
    """Parse command-line arguments for one auto-search trial.

    The platform injects job/trial identity and the sampled hyper-parameters
    through these flags; names and defaults must stay in sync with the
    platform form.

    Returns:
        argparse.Namespace: parsed arguments. ``output_dir`` is rewritten to
        ``<output_dir>/<job_id>/<trial_id>`` and the directory is created.
    """
    parser = argparse.ArgumentParser(description='xgboost boston Example')
    parser.add_argument('--train_dir', type=str, default='./train_data',
                        help='input data dir for training (default: ./train_data)')
    parser.add_argument('--test_dir', type=str, default='./test_data',
                        help='input data dir for test (default: ./test_data)')
    parser.add_argument('--output_dir', type=str, default='./output',
                        help='output dir for auto_search job (default: ./output)')
    parser.add_argument('--job_id', type=str, default="job-1234",
                        help='auto_search job id (default: "job-1234")')
    parser.add_argument('--trial_id', type=str, default="0-0",
                        help='auto_search id of a single trial (default: "0-0")')
    parser.add_argument('--metric', type=str, default="mse",
                        help='evaluation metric of the model')
    parser.add_argument('--data_sampling_scale', type=float, default=1.0,
                        help='sampling ratio of the dataset for auto_search (default: 1.0)')
    parser.add_argument('--max_depth', type=int, default=6,
                        help='maximum depth of the tree (default: 6)')
    parser.add_argument('--gamma', type=float, default=0.1,
                        help='minimum loss reduction required for further splitting (default: 0.1)')
    parser.add_argument('--eta', type=float, default=0.1,
                        help='learning rate (default: 0.1)')
    parser.add_argument('--num_round', type=int, default=10,
                        help='number of trees (default: 10)')
    args = parser.parse_args()
    # Each trial gets its own directory: <output_dir>/<job_id>/<trial_id>.
    args.output_dir = os.path.join(args.output_dir, args.job_id, args.trial_id)
    # exist_ok=True avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(args.output_dir, exist_ok=True)
    print("job_id: {}, trial_id: {}".format(args.job_id, args.trial_id))
    return args
44
def load_data(data_sampling_scale):
    """Load the Boston housing data and return (train, test) splits.

    Args:
        data_sampling_scale (float): fraction (0, 1] of the training split
            kept after shuffling; the test split is never sub-sampled.

    Returns:
        tuple: ``((x_train, x_test), (y_train, y_test))`` numpy arrays.
    """
    # NOTE(review): load_boston was deprecated and removed in scikit-learn
    # >= 1.2 — this demo assumes an older scikit-learn; confirm the pinned
    # environment version.
    boston = datasets.load_boston()
    features, targets = boston.data, boston.target
    # Fixed-seed 80/20 split for reproducibility across trials.
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=0)
    # Glue labels onto the feature rows, shuffle deterministically, and keep
    # only the leading data_sampling_scale fraction of the training rows.
    merged = np.concatenate([x_train, y_train.reshape([-1, 1])], axis=1)
    np.random.seed(0)
    np.random.shuffle(merged)
    keep = int(data_sampling_scale * len(merged))
    merged = merged[:keep]
    x_train, y_train = merged[:, :-1], merged[:, -1]
    return (x_train, x_test), (y_train, y_test)
57
def save_model(model, output_dir):
    """Serialize *model* to ``<output_dir>/clf.pickle`` with pickle."""
    import pickle
    target = output_dir + '/clf.pickle'
    with open(target, 'wb') as fout:
        pickle.dump(model, fout)
63
def save_model_joblib(model, output_dir):
    """Serialize *model* to ``<output_dir>/clf.pkl`` with joblib."""
    import joblib
    target = output_dir + '/clf.pkl'
    joblib.dump(model, target)
68
def evaluate(model, x_test, y_test):
    """Score *model* on the held-out split and return the regression MSE.

    Args:
        model: trained xgboost Booster.
        x_test: test-split feature matrix.
        y_test: ground-truth targets for the test split.

    Returns:
        float: mean squared error of the predictions.
    """
    # Boosters consume DMatrix, not raw arrays.
    predictions = model.predict(xgb.DMatrix(x_test))
    mse = mean_squared_error(y_test, predictions)
    print("mean_squared_error: %f" % mse)
    return mse
77
def report_final(args, metric):
    """Report the trial's final metric and model path to the search manager.

    Retries up to 3 times, pausing one second between failed attempts.

    Args:
        args: parsed CLI namespace (job_id, trial_id, metric name, output_dir).
        metric (float): final evaluation value for this trial.

    Raises:
        RuntimeError: if every report attempt is rejected by the manager.
    """
    amaas_tools = AMaasTools(args.job_id, args.trial_id)
    metric_dict = {args.metric: metric}
    for attempt in range(3):
        flag, ret_msg = amaas_tools.report_final_result(metric=metric_dict,
                                                        export_model_path=args.output_dir,
                                                        checkpoint_path="")
        print("End Report, metric:{}, ret_msg:{}".format(metric, ret_msg))
        if flag:
            return
        if attempt < 2:
            # Back off briefly before retrying; skip the pause after the
            # final attempt since no retry follows.
            time.sleep(1)
    # Raise explicitly instead of `assert`, which is stripped under `-O`.
    raise RuntimeError("Report final result to manager failed! Please check "
                       "whether the manager's address or status is ok!")
93
def main():
    """Run one auto-search trial: load data, train, save, evaluate, report."""
    # Hyper-parameters and trial identity injected by the platform.
    args = parse_arg()
    # Optionally sub-sampled training data plus the fixed test split.
    (x_train, x_test), (y_train, y_test) = load_data(args.data_sampling_scale)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    # Booster parameters for this trial's hyper-parameter combination.
    params = {"gamma": args.gamma, 'max_depth': args.max_depth,
              'eta': args.eta, 'objective': 'reg:squarederror'}
    # Train, persist in joblib format, evaluate, then report the metric
    # back to the auto-search manager.
    booster = xgb.train(params, dtrain, args.num_round)
    save_model_joblib(booster, args.output_dir)
    mse = evaluate(booster, x_test, y_test)
    report_final(args, metric=mse)

if __name__ == '__main__':
    main()
示例代码对应的yaml配置如下,请保持格式一致
cmaes_search_demo.yml示例内容
Plain Text
#搜索算法参数
search_strategy:
  algo: CMAES_SEARCH #搜索策略:进化-cmaes算法
  params:
    population_num: 8 #种群个体数量 | [1,10] int类型
    round: 10 #迭代轮数 |[5,50] int类型
    step_size: 1.0 # 学习步长 |(0,10] float类型

#单次训练时数据的采样比例,单位%
data_sampling_scale: 100 #|(0,100] int类型

#评价指标参数
metrics:
  name: mse #评价指标 | 任意字符串 str类型
  goal: MINIMIZE #最大值/最小值 | str类型 MAXIMIZE or MINIMIZE 必须为这两个之一(仅支持大写)
  expected_value: 10 #早停标准值,评价指标超过该值则结束整个超参搜索,单位% |无限制 int类型

#搜索参数空间
search_space:
  max_depth:
    htype: randint
    value: [3, 10]
  num_round:
    htype: randint
    value: [1, 8]
  gamma:
    htype: uniform
    value: [0.1, 1]
  eta:
    htype: loguniform
    value: [0.01, 1]