机器学习

ML09-lightGBM算法

2019-08-13  本文已影响83人  杨强AT南京

本主题主要是lightGBM的入门与调优思路介绍;包含如下几个内容:
  1. lightGBM的原生分类实现
  2. lightGBM的sklearn分类实现
  3. lightGBM的原生回归实现
  4. lightGBM的sklearn回归实现
  5. lightGBM的主要参数说明,以及参数的调优思路入门;

注意:
  1.lightGBM本身提供交叉验证训练(lgb.cv),也可以配合网格搜索做参数优化;其详细用法不在本文解释范围内,仅在最后的调优例子中简单使用lgb.cv。
  2.数据集采用的是sklearn提供的iris数据集与SP500数据集。


一. lightGBM的原生分类例子

说明

使用步骤

1. 准备数据

import sklearn.datasets  as ds

data, target = ds.load_iris(return_X_y=True)
data.shape,target.shape
((150, 4), (150,))
data = data [0:100, :]
target = target[0:100]
data.shape,target.shape
((100, 4), (100,))

2. 训练集与测试集切分

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
X_train.shape, X_test.shape, y_train.shape,y_test.shape
((80, 4), (20, 4), (80,), (20,))

3. lightGBM的基本使用模式

import lightgbm as lgb

# Wrap the training features and labels in LightGBM's Dataset container.
lgb_train = lgb.Dataset(X_train,label = y_train)
# Train with an empty parameter dict, so every parameter keeps its default
# (the default objective is regression, hence continuous scores, not labels).
gbm = lgb.train({},train_set=lgb_train)
# Predict on the held-out test set.
y_pred = gbm.predict(X_test)
# Show the raw predictions; they are converted to labels and scored by hand
# in the next step.
print(y_pred)
[1.22846467e-05 1.22846467e-05 1.22846467e-05 9.99985723e-01
 9.99985723e-01 9.99985723e-01 1.22846467e-05 1.22846467e-05
 9.99985723e-01 9.99985723e-01 9.99985723e-01 9.99985723e-01
 9.99985723e-01 9.99985723e-01 1.22846467e-05 9.99985723e-01
 1.22846467e-05 9.99985723e-01 9.99985723e-01 9.99985723e-01]

4. 测试结果手工评估

y_test
array([0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])
y_pred[y_pred>0.7] = 1
y_pred[y_pred<0.3] = 0
y_pred
array([0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
       1., 1., 1.])
import numpy as np
# Use the builtin int as the dtype: the np.int alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
y_pred = y_pred.astype(int)
y_pred
array([0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1])
corr = (y_pred == y_test).sum()
print(F'正确数:{corr}')
正确数:20

二. lightGBM的sklearn分类例子

说明

使用步骤

1. 准备数据

import sklearn.datasets  as ds
from sklearn.model_selection import train_test_split

data, target = ds.load_iris(return_X_y=True)
data = data [0:100, :]
target = target[0:100]
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)

2. 训练

import lightgbm as lgb
classifier = lgb.LGBMClassifier()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

3. 手工评估

y_pred
array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1])
corr = (y_pred == y_test).sum()
print(F'正确数:{corr}')
正确数:20

三. lightGBM的原生回归实现

准备数据

1. 读取数据

import pandas as pd

# Load the stock CSV into a DataFrame (path is relative to the working directory).
data = pd.read_csv("data_stocks.csv")

2. 可视化数据

%matplotlib inline
# import seaborn as sns; 
import matplotlib.pyplot as plt

# -----seaborn-----
# 保证每个点是相同间隔,不使用DATE作为x轴,而是采用整数序列
# data['IDX'] = data.index
# data[0: 5]
# sns.set(context='paper',  style='ticks')
# ax = sns.lineplot(x="IDX", y="SP500", data=data)
# ax.figure.set_size_inches(15, 4)
# ----------------

figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.set_xlim(0, data.shape[0])
ax.set_ylim(2250, 2550)
ax.plot('SP500', data=data, color=(1, 0, 0, 1))

plt.show()
指数曲线

3. 数据清洗

data.drop('DATE', axis=1, inplace=True)

4.训练集与测试集切分

from  sklearn.model_selection import train_test_split
# train, test = train_test_split(data, test_size=0.2, shuffle=False)
# ((33012, 502), (8254, 502))

# Order-preserving 80/20 split: the first four fifths of the rows become the
# training set and the remaining rows the test set (no shuffling).
split_at = data.shape[0] * 4 // 5
train, test = data[:split_at], data[split_at:]

5. 数据归一化

from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training rows only and rescale them to [-1, 1];
# the test rows are then transformed with the same fitted min/max values.
scaler = MinMaxScaler(feature_range=(-1, 1))
train = scaler.fit_transform(train)
test = scaler.transform(test)

6. 特征与标签

train_data = train[:, 1:]
train_label = train[:, 0]

test_data = test[:, 1:]
test_label = test[:, 0]

7. 归一化的数据可视化

%matplotlib inline
# import seaborn as sns; 
import matplotlib.pyplot as plt


figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.set_xlim(0, data.shape[0])
ax.plot(range(len(train_label)), train_label, color=(1, 0, 0, 1))   # 训练集

ax.plot(range(len(train_label), len(train_label) + len(test_label)), test_label, color=(0, 0, 1, 1))  # 测试集
plt.show()
归一化数据可视化曲线

训练与测试

1. 训练

import lightgbm as lgb

lgb_train = lgb.Dataset(train_data,label = train_label)
# 训练 
gbm = lgb.train({},train_set=lgb_train)

2. 预测

predict_train = gbm.predict(train_data)
predict_test = gbm.predict(test_data)

3. 可视化

%matplotlib inline
# import seaborn as sns; 
import matplotlib.pyplot as plt


figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.set_xlim(0, data.shape[0])
# 原始数据
ax.plot(range(len(train_label)), train_label, color=(1, 0, 0, 1))   # 训练集
ax.plot(range(len(train_label), len(train_label) + len(test_label)), test_label, color=(0, 0, 1, 1))  # 测试集

# 预测数据
ax.plot(range(len(train_label)), predict_train, color=(1, 1, 0, 1))   # 训练集
ax.plot(range(len(train_label), len(train_label) + len(test_label)), predict_test, color=(0, 1, 1, 1))  # 测试集


plt.show()
训练集与测试集回归曲线

4. 训练集预测可视化

%matplotlib inline
# import seaborn as sns; 
import matplotlib.pyplot as plt

figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.plot(predict_train, color=(1, 0, 0, 1))
ax.plot(train_label, color=(0, 0, 1, 1))

plt.show()
训练集独立可视化

5. 测试集预测可视化

%matplotlib inline
# import seaborn as sns; 
import matplotlib.pyplot as plt

figure = plt.figure(figsize=(15,4))
ax = figure.add_axes((0.1, 0.1, 0.8, 0.8))
ax.plot(predict_test, color=(1, 0, 0, 1))
ax.plot(test_label, color=(0, 0, 1, 1))

plt.show()
测试集拟合曲线可视化

四. lightGBM的sklearn回归实现

import lightgbm as lgb

regressor = lgb.LGBMRegressor()
regressor.fit(train_data, train_label)

五. lightGBM的主要参数说明

鸢尾花三类分类例子

import sklearn.datasets  as ds
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np

data, target = ds.load_iris(return_X_y=True)

X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
print(X_test.shape)

lgb_train = lgb.Dataset(X_train,label = y_train)
# 训练 
gbm = lgb.train({},train_set=lgb_train)
# 预测数据集
y_pred = gbm.predict(X_test)

# 评估模型
print(y_pred)
print(y_test)
(30, 4)
[ 2.94937766e-03  2.04888134e+00  1.08704226e+00  2.04229695e+00
  1.41606515e+00  2.04163640e+00  1.92434531e+00  1.29981967e+00
 -6.66079203e-03 -1.56922140e-03  1.95794545e+00 -5.36818708e-03
  2.94937766e-03  2.94937766e-03  7.28622873e-05  7.28622873e-05
  1.02877711e+00  1.04163151e+00  1.98902325e+00  1.05445874e+00
  1.19528594e+00  2.00206072e+00  1.08159577e+00  6.82782116e-04
  2.94937766e-03  2.94937766e-03  1.76885071e+00  1.54081382e+00
  1.02567221e+00  1.20616748e+00]
[0 2 1 2 2 2 2 1 0 0 2 0 0 0 0 0 1 1 2 1 1 2 1 0 0 0 1 2 1 1]


lightGBM的特征

lightGBM的应用实验

| 数据 | 任务 | 链接 | #训练集总数 | #特征数 | 说明 |
| --- | --- | --- | --- | --- | --- |
| Higgs | Binary classification | 链接 | 10,500,000 | 28 | last 500,000 samples were used as test set |
| Yahoo LTR | Learning to rank | 链接 | 473,134 | 700 | set1.train as train, set1.test as test |
| MS LTR | Learning to rank | 链接 | 2,270,296 | 137 | {S1,S2,S3} as train set, {S5} as test set |
| Expo | Binary classification | 链接 | 11,000,000 | 700 | last 1,000,000 samples were used as test set |
| Allstate | Binary classification | 链接 | 13,184,290 | 4228 | last 1,000,000 samples were used as test set |

lightGBM的参数

1. 参数格式

2. 常用参数-核心参数

  1. task参数

    • 缺省值 = train
    • 参数类型 = enum
    • 值列表:
      • train/training:训练
      • predict/prediction/test:预测
      • convert_model:模型存储格式转换
      • refit/refit_tree:使用新数据重新训练已经训练的模型;
    • 参数别名: task_type
  2. objective参数

    • 缺省值:regression
    • 类型: enum
    • 参数别名:objective_type/app/application
    • 值列表:
      1. 回归:regression
        • regression
        • regression_l1
        • huber
        • fair
        • poisson
        • quantile
        • mape
        • gamma
        • tweedie
      2. 逻辑回归分类:binary:二分类0或者1
        • binary
      3. 多类分类:multi-class classification:多分类
        • multiclass
        • multiclassova
        • 注意:num_class不是objective的取值,而是一个独立参数;使用multiclass/multiclassova时必须通过num_class指定类别数量
      4. 概率标签分类:cross-entropy:标签是0-1之间的概率
        • cross_entropy
        • cross_entropy_lambda
      5. 排名:lambdarank
        • lambdarank
  3. boosting参数

    • 缺省值:gbdt,
    • 类型:enum
    • 取值列表:
      • gbdt
      • rf
      • dart
      • goss
  4. num_iterations参数

    • 迭代次数
  5. learning_rate参数

    • 学习率
  6. num_leaves参数

    • 叶子数量

3. 控制参数

4. 评估度量参数

5. 使用参数的例子

import sklearn.datasets  as ds
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np

data, target = ds.load_iris(return_X_y=True)

X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
print(X_test.shape)

lgb_train = lgb.Dataset(X_train,label = y_train)
# 训练 
params = {
    'task': 'train',
    'objective': 'multiclass',
    'num_class': 3,
}
gbm = lgb.train(params, train_set=lgb_train)   # 返回模型
# Predict on the test set; the result is indexed as (sample, class) below.
y_pred = gbm.predict(X_test)

# Take the highest-scoring class per sample once and reuse it for both the
# printout and the comparison with the true labels.
y_label = y_pred.argmax(axis=1)
print(y_label)
print(y_test)
e_result = y_label == y_test
print(F"准确数:{e_result.sum()}")
(30, 4)
[1 1 1 2 2 0 2 1 2 0 2 2 0 0 2 1 0 0 1 2 2 1 2 2 0 0 0 0 1 2]
[1 2 1 2 2 0 2 1 1 0 2 2 0 0 1 1 0 0 1 2 2 1 2 2 0 0 0 0 1 2]
准确数:27       # 优化前的效果

六. 参数与调优例子

1. 准确率

import sklearn.datasets  as ds
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import numpy as np
import pandas as pd

data, target = ds.load_iris(return_X_y=True)

X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)

lgb_train = lgb.Dataset(X_train,label = y_train)
params = {
    'task': 'train',
    'objective': 'multiclass',
    'num_class': 3,
}
# Exhaustive search over (num_leaves, max_depth) using 3-fold cross-validation;
# the pair with the lowest mean multi-class error is kept.
min_merror = float('Inf')   # lowest mean CV error seen so far
best_params = {}             # best (num_leaves, max_depth) pair found

for num_leaves in range(20,50):
    for max_depth in range(3,8):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        # Early stopping is passed as a callback: the early_stopping_rounds
        # and verbose_eval keyword arguments were removed from lgb.cv in
        # LightGBM 4.0, while the callback form works on 3.x and 4.x alike.
        cv_results = lgb.cv(
            params,
            lgb_train,
            seed=2018,
            nfold=3,
            metrics=['multi_error'],
            callbacks=[lgb.early_stopping(stopping_rounds=30, verbose=False)])
        # The result key is 'multi_error-mean' on LightGBM 3.x and
        # 'valid multi_error-mean' on 4.x, so look it up by suffix.
        mean_key = next(k for k in cv_results if k.endswith('multi_error-mean'))
        mean_merror = min(cv_results[mean_key])
        # Strict '<' keeps the first combination that reaches the minimum.
        if mean_merror < min_merror:
            min_merror = mean_merror
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
# Write the winning combination back so the final model is trained with it.
params['num_leaves'] = best_params['num_leaves']
params['max_depth'] = best_params['max_depth']
print(params['num_leaves'], params['max_depth'] )
20 3    # 优化的最佳叶子节点数量(20)与深度(3)
gbm = lgb.train(params, train_set=lgb_train)   # 返回模型
# 预测数据集
y_pred = gbm.predict(X_test)

# 评估模型
print(y_pred.argmax(axis=1))
print(y_test)

e_result = y_pred.argmax(axis=1) == y_test
print(F"准确数:{e_result.sum()}")
[0 1 2 1 0 2 1 1 0 0 0 2 0 0 2 1 1 1 0 0 1 2 2 2 1 2 2 0 2 0]
[0 1 2 1 0 2 1 1 0 0 0 1 0 0 2 1 1 1 0 0 1 2 2 2 1 2 2 0 2 0]
准确数:29       #  优化后的效果。

上一篇 下一篇

猜你喜欢

热点阅读