Blackhole 1.0.0
更新时间:2023-01-18
Blackhole 1.0.0
Blackhole是百度自研的高性能数据科学引擎,CodeLab中内嵌了该引擎。通过异构加速计算、超大数据处理、高效数据存储等技术,单机Blackhole在数据分析和机器学习等场景下,相比开源Pandas/Sklearn性能可提升7倍以上,并拥有TB级的单机超大数据处理能力,同时提供与Pandas、Sklearn基本一致的易用接口。参考文档请点击这里查看。
本文使用Blackhole中随机森林算法对希格斯玻色子的信号进行预测,并采用准确率评估指标对模型性能进行评估。参考kaggle竞赛-希格斯玻色子机器学习挑战,HIGGS数据集由加利福尼亚大学机器学习与智能系统中心提供,用于预测希格斯玻色子的信号。
训练数据集点击这里下载。
单机训练(计算节点数为1),示例代码如下:
Python
1# Blackhole train demo
2#!/usr/bin/env python
3# -*- coding: utf-8 -*-
4"""
5 Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
6 File: demo.py
7 Blackhole demo
8 本demo背景参考kaggle竞赛-希格斯玻色子机器学习挑战,HIGGS数据集由加利福尼亚大学机器学习与智能系统中心提供,用于预测希格斯玻色子的信号。
9 本demo使用blackhole中随机森林算法对希格斯玻色子的信号进行预测,并采用准确率评估指标对模型性能进行评估。
10 数据集中第1列为标签列,其后28列为特征列.
11 数据集地址: https://archive.ics.uci.edu/ml/datasets/HIGGS
12 https://codelab-dataset.cdn.bcebos.com/small/competition/higgs.zip
13 竞赛地址: https://www.kaggle.com/c/higgs-boson/overview
14"""
15import os
16import logging
17import shutil
18import blackhole
19import blackhole.gibbons as pd
20from blackhole.ml.metrics import accuracy_score
21from blackhole.ml.model_selection import train_test_split
22from blackhole.ml.ensemble import RandomForestClassifier
# Configure the root logger once at import time: INFO level, with timestamp,
# source file path, line number and level name in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
)
25
26
def load_data(csv_file):
    """
    Load a HIGGS csv file into a dataframe with explicit column names.

    The first column is named 'label'; the following 28 columns are named
    'col-2' ... 'col-29'.

    :param csv_file: path of the csv file to read
    :return: the dataframe returned by ``pd.read_csv``
    :raises FileNotFoundError: if ``csv_file`` does not exist
    """
    # Raise explicitly rather than assert: assert statements are stripped
    # when Python runs with -O, which would silently skip this check.
    if not os.path.exists(csv_file):
        raise FileNotFoundError("%s not exists" % csv_file)
    logging.info("Load data from %s", csv_file)
    col_names = ['label'] + ["col-{}".format(i) for i in range(2, 30)]  # Assign column names
    data = pd.read_csv(csv_file, names=col_names)
    return data
37
38
def split_data(data, test_ratio=0.3):
    """
    Separate the features from the label and split both into train/test parts.

    :param data: dataframe with a 'label' column; all other columns are
        treated as features
    :param test_ratio: test data ratio; it is converted to an absolute row
        count, ``int(len(data) * test_ratio)``, before being passed as
        ``test_size``
    :return: dict with the keys "X_train", "X_test", "y_train" and "y_test"
    """
    train_ratio = 1.0 - test_ratio
    logging.info("Split_data, train ratio: %s, split_ratio: %s", train_ratio, test_ratio)

    # Every column except 'label' is a feature column.
    feature_columns = data.columns.difference(['label'])
    features = data[feature_columns]
    labels = data['label']

    test_rows = int(len(data) * test_ratio)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=test_rows)
    logging.info("After split, train_data: %s, test_data: %s", len(X_train), len(X_test))

    return dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
56
57
def create_model(model, params):
    """
    Build a model by calling ``model`` with ``params`` as keyword arguments.

    :param model: callable that constructs the model (main passes
        RandomForestClassifier)
    :param params: dict of keyword arguments forwarded to ``model``
    :return: the object returned by ``model(**params)``
    """
    logging.info("Create model from %s, params: %s", model, params)
    return model(**params)
67
68
def fit(model_instance, X_train, y_train, X_test, y_test):
    """
    Train the model and log its accuracy on the held-out test split.

    :param model_instance: model object exposing ``fit`` and ``predict``
    :param X_train: training features
    :param y_train: training labels
    :param X_test: held-out features used for evaluation
    :param y_test: held-out labels compared against the predictions
    :return: the same ``model_instance`` that was passed in, after fitting
    """
    logging.info("Fit model...")
    model_instance.fit(X_train, y_train)
    pre = model_instance.predict(X_test)
    # NOTE(review): arguments are passed as (predicted, actual), whereas the
    # sklearn convention is (y_true, y_pred). Plain accuracy should not depend
    # on the order - confirm against Blackhole's API before reordering.
    accuracy = accuracy_score(pre, y_test)
    # The score is computed on X_test / y_test, so it is reported as test
    # accuracy (the message previously mislabelled it as train accuracy).
    logging.info("Test accuracy: %s", accuracy)
    return model_instance
84
85
def save_model(model_instance, output_path):
    """
    Save a model to ``output_path``, replacing whatever is already there.

    :param model_instance: model object to save
    :param output_path: destination path; any existing directory tree at
        this path is removed before saving
    :return: ``output_path``
    """
    # Start from a clean destination; removal errors (such as the path not
    # existing yet) are deliberately ignored.
    shutil.rmtree(output_path, ignore_errors=True)
    blackhole.ml.save_model(model_instance, output_path)
    logging.info("Save model to %s", output_path)
    return output_path
96
97
def predict(model_path, X_test):
    """
    Load a saved model from ``model_path`` and run it on ``X_test``.

    :param model_path: path a model was previously saved to
    :param X_test: dataframe, should not contain label
    :return: the predictions returned by the loaded model's ``predict``
    :raises FileNotFoundError: if ``model_path`` does not exist
    """
    # Raise explicitly rather than assert: assert statements are stripped
    # when Python runs with -O, which would silently skip this check.
    if not os.path.exists(model_path):
        raise FileNotFoundError("%s not exists" % model_path)
    logging.info("Load model from %s, predict ...", model_path)
    model = blackhole.ml.load_model(model_path)
    pred = model.predict(X_test)
    # Log the prediction count together with a preview of the first rows.
    logging.info("Predict_result number: %s, show top 5: \n%s", len(pred), pred.head())
    return pred
110
111
def main():
    """
    Run the demo end to end: load and split the training data, train and
    save a random forest model, then reload it and predict on the test csv.

    File directory example:
    |-- demo.py
    |-- output
    |   |-- bhml.meta
    |   `-- bhml.model
    |-- test_data
    |   `-- HIGGS.csv
    `-- train_data
        `-- HIGGS.csv

    :return: None
    """
    # step 1, load and split data
    train_csv_file = "./train_data/HIGGS.csv"  # csv file is in train_data folder
    train_data = load_data(train_csv_file)
    train_test_data_dict = split_data(train_data)
    X_train = train_test_data_dict['X_train']
    X_test = train_test_data_dict['X_test']
    y_train = train_test_data_dict['y_train']
    y_test = train_test_data_dict['y_test']

    # step 2, create model, fit and save
    model_params = {
        'n_estimators': 25,
        'max_depth': 13,
    }
    bh_RandomForestClassifier = create_model(RandomForestClassifier, model_params)
    bh_RandomForestClassifier = fit(bh_RandomForestClassifier, X_train, y_train, X_test, y_test)
    saved_model_path = "./output/"  # output path is in ./output
    save_model(bh_RandomForestClassifier, saved_model_path)

    # step 3, predict
    test_csv_file = "./test_data/HIGGS.csv"  # csv file is in test_data folder
    test_data = load_data(test_csv_file)
    # load_data always names the first column 'label', while predict expects a
    # dataframe without the label. Select the features with the same
    # expression split_data uses, so the model sees the same feature columns
    # it was trained on.
    # NOTE(review): assumes the test csv has the same 29-column layout as the
    # training csv - confirm.
    X_predict = test_data[test_data.columns.difference(['label'])]
    predict(saved_model_path, X_predict)
148
149
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()