Blackhole 1.0.0
更新时间:2023-01-18
Blackhole 1.0.0
Blackhole是百度自研的高性能数据科学引擎,CodeLab中内嵌了该引擎。通过异构加速计算、超大数据处理、高效数据存储等技术,单机Blackhole在数据分析和机器学习等场景相比开源Pandas/Sklearn性能可提升7倍以上、拥有TB级的单机超大数据处理能力,同时提供和Pandas、Sklearn基本一致的易用接口。参考文档点击这里查看。
本文使用Blackhole中随机森林算法对希格斯玻色子的信号进行预测,并采用准确率评估指标对模型性能进行评估。参考kaggle竞赛-希格斯玻色子机器学习挑战,HIGGS数据集由加利福尼亚大学机器学习与智能系统中心提供,用于预测希格斯玻色子的信号。
训练数据集点击这里下载。
单机训练(计算节点数为1),示例代码如下:
# Blackhole train demo
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
File: demo.py
Blackhole demo
本demo背景参考kaggle竞赛-希格斯玻色子机器学习挑战,HIGGS数据集由加利福尼亚大学机器学习与智能系统中心提供,用于预测希格斯玻色子的信号。
本demo使用blackhole中随机森林算法对希格斯玻色子的信号进行预测,并采用准确率评估指标对模型性能进行评估。
数据集中第1列为标签列,其后28列为特征列.
数据集地址: https://archive.ics.uci.edu/ml/datasets/HIGGS
https://codelab-dataset.cdn.bcebos.com/small/competition/higgs.zip
竞赛地址: https://www.kaggle.com/c/higgs-boson/overview
"""
import os
import logging
import shutil
import blackhole
import blackhole.gibbons as pd
from blackhole.ml.metrics import accuracy_score
from blackhole.ml.model_selection import train_test_split
from blackhole.ml.ensemble import RandomForestClassifier
# Configure the root logger once at import time: INFO level, with timestamp,
# source path, line number and level name in front of every message.
logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
level=logging.INFO)
def load_data(csv_file):
    """
    Load a CSV file into a dataframe with explicit column names.

    The first column is named ``label``; the following 28 columns are named
    ``col-2`` .. ``col-29``.

    :param csv_file: path of the CSV file to read
    :return: the dataframe produced by ``pd.read_csv``
    :raises FileNotFoundError: if ``csv_file`` does not exist
    """
    # Raise explicitly rather than assert: assert statements are removed when
    # Python runs with -O, which would silently drop this check.
    if not os.path.exists(csv_file):
        raise FileNotFoundError("%s not exists" % csv_file)
    logging.info("Load data from %s" % csv_file)
    col_names = ['label'] + ["col-{}".format(i) for i in range(2, 30)]  # Assign column names
    data = pd.read_csv(csv_file, names=col_names)
    return data
def split_data(data, test_ratio=0.3):
    """
    Separate features from the label and split both into train/test parts.

    :param data: dataframe containing a ``label`` column plus feature columns
    :param test_ratio: fraction of rows to put in the test part
    :return: dict with keys ``X_train``, ``X_test``, ``y_train``, ``y_test``
    """
    train_ratio = 1.0 - test_ratio
    logging.info("Split_data, train ratio: %s, split_ratio: %s" % (train_ratio, test_ratio))

    # Everything except the label column is a feature.
    feature_columns = data.columns.difference(['label'])
    features = data[feature_columns]
    labels = data['label']

    # The splitter is given an absolute row count rather than a fraction.
    test_rows = int(len(data) * test_ratio)
    parts = train_test_split(features, labels, test_size=test_rows)
    X_train, X_test, y_train, y_test = parts

    logging.info("After split, train_data: %s, test_data: %s" % (len(X_train), len(X_test)))
    return dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
def create_model(model, params):
    """
    Instantiate a model by calling ``model`` with ``params`` as keyword arguments.

    :param model: callable (typically an estimator class) to instantiate
    :param params: mapping of keyword arguments passed to ``model``
    :return: the object returned by ``model(**params)``
    """
    description = "Create model from %s, params: %s" % (model, params)
    logging.info(description)
    return model(**params)
def fit(model_instance, X_train, y_train, X_test, y_test):
    """
    Train the model and log its accuracy on the held-out test split.

    :param model_instance: model object exposing ``fit`` and ``predict``
    :param X_train: training features
    :param y_train: training labels
    :param X_test: held-out features used for evaluation
    :param y_test: held-out labels used for evaluation
    :return: the same ``model_instance``, after fitting
    """
    logging.info("Fit model...")
    model_instance.fit(X_train, y_train)
    pre = model_instance.predict(X_test)
    # Ground truth first, predictions second, following the sklearn-style
    # (y_true, y_pred) convention.
    # NOTE(review): assumes blackhole's accuracy_score mirrors sklearn's signature.
    accuracy = accuracy_score(y_test, pre)
    # The score is computed on the test split, so label it as such.
    logging.info("Test accuracy: %s" % accuracy)
    return model_instance
def save_model(model_instance, output_path):
    """
    Save a model to ``output_path`` after clearing any previous content there.

    :param model_instance: model object to persist
    :param output_path: directory the model is written to
    :return: ``output_path``, unchanged
    """
    # Remove an existing output tree first; errors (e.g. a missing path) are ignored.
    shutil.rmtree(output_path, ignore_errors=True)
    blackhole.ml.save_model(model_instance, output_path)
    message = "Save model to %s" % output_path
    logging.info(message)
    return output_path
def predict(model_path, X_test):
    """
    Load a saved model from disk and run prediction on the given features.

    :param model_path: path the model was previously saved to
    :param X_test: dataframe, should not contain label
    :return: the result of ``model.predict(X_test)``
    :raises FileNotFoundError: if ``model_path`` does not exist
    """
    # Raise explicitly rather than assert: assert statements are removed when
    # Python runs with -O, which would silently drop this check.
    if not os.path.exists(model_path):
        raise FileNotFoundError("%s not exists" % model_path)
    logging.info("Load model from %s, predict ..." % model_path)
    model = blackhole.ml.load_model(model_path)
    pred = model.predict(X_test)
    logging.info("Predict_result number: %s, show top 5: \n%s" % (len(pred), pred.head()))
    return pred
def main():
    """
    Run the demo end to end: load and split data, train and save a random
    forest model, then reload the model and predict on the test file.

    :return: None

    File directory example:
    |-- demo.py
    |-- output
    |   |-- bhml.meta
    |   `-- bhml.model
    |-- test_data
    |   `-- HIGGS.csv
    `-- train_data
        `-- HIGGS.csv
    """
    # step 1, load and split data
    train_csv_file = "./train_data/HIGGS.csv"  # csv file is in train_data folder
    train_data = load_data(train_csv_file)
    train_test_data_dict = split_data(train_data)
    X_train = train_test_data_dict['X_train']
    X_test = train_test_data_dict['X_test']
    y_train = train_test_data_dict['y_train']
    y_test = train_test_data_dict['y_test']

    # step 2, create model, fit and save
    model_params = {
        'n_estimators': 25,
        'max_depth': 13,
    }
    classifier = create_model(RandomForestClassifier, model_params)
    classifier = fit(classifier, X_train, y_train, X_test, y_test)
    saved_model_path = "./output/"  # output path is in ./output
    save_model(classifier, saved_model_path)

    # step 3, predict
    test_csv_file = "./test_data/HIGGS.csv"  # csv file is in test_data folder
    test_data = load_data(test_csv_file)
    # load_data always names the first column 'label', but predict() expects
    # features only. Select exactly the columns the model was trained on so
    # the label is dropped and the feature set matches X_train.
    test_features = test_data[X_train.columns]
    predict(saved_model_path, test_features)
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()