微软大杀器 LightGBM 在R中安装及使用注解
2017-11-02 本文已影响214人
飘舞的鼻涕
安装
安装R版本的 lightgbm, 相较于之前的 install.packages('xx') 分分钟完事, 会略显繁琐, 笔者在安装之初也是填了n次坑, 与 巨硬的R包作者 来往了好几次才成功, 故将安装过程笔记放在这里, 以飨后来人
注,任何疑问都返回 github blog留言,或者加QQ群..[174225475].. 共同探讨进步
- 非GPU版本
- 1.0 官方安装指导 传送门
- 1.1 安装前的准备
安装 git 和 cmake
注: lightgbm 不支持 32-bit R/Rtools
- 1.2.1 windows 平台
安装 64 位 Rtools 并将 启动路径放置于 环境变量的path中
也可以直接运行代码 :
library(devtools)
options(devtools.install.args = "--no-multiarch") # if you have 64-bit R only, you can skip this
install_github("Microsoft/LightGBM", subdir = "R-package")
- 1.2.2 linux 平台
先安装 Open MPI
尔后运行以下代码 :
git clone --recursive https://github.com/Microsoft/LightGBM ; cd LightGBM
mkdir build ; cd build
cmake -DUSE_MPI=ON ..
make -j4
Note: glibc >= 2.14 is required.
- 1.2.3 osx 平台
先安装 gcc 和 Open MPI :
brew install openmpi
brew install cmake
brew install gcc --without-multilib
尔后 :
git clone --recursive https://github.com/Microsoft/LightGBM ; cd LightGBM
export CXX=g++-7 CC=gcc-7
mkdir build ; cd build
cmake -DUSE_MPI=ON ..
make -j4
- GPU版本
参考资料
应用
1. 回归
# Regression example: fit a LightGBM regressor on the labelled rows of `df1`
# and predict `price_1mi` for the remaining rows.
# Assumes `df1` and dplyr are already loaded, and that `label1 == 0` marks the
# rows that still need a prediction -- TODO confirm against the data.
library(lightgbm)
# lgb.unloader(wipe = TRUE)

train_dt <- df1 %>% filter(label1 != 0)
test_dt <- df1 %>% filter(label1 == 0)

# Random ~80/20 train/validation split; the seed makes the split reproducible.
set.seed(123)
indx1 <- sample(
  c(0, 1),
  size = nrow(train_dt),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
train1 <- train_dt %>% filter(indx1 == 0)
valid1 <- train_dt %>% filter(indx1 == 1)

# Build the numeric feature matrix for one data set.
# The target `price_1mi` is dropped together with the id / flag columns;
# leaving it in would let the model train on its own label.
to_features <- function(dt) {
  dt %>%
    select(-ids1, -label1, -price_1mi) %>%
    data.matrix() # coerce every column to numeric
}

bia1 <- to_features(train1)
bia2 <- train1$price_1mi
dtrain <- lgb.Dataset(
  data = bia1,
  label = bia2,
  # Dataset-level settings go through `params`, not through `...`
  params = list(is_sparse = FALSE),
  # colnames / categorical_feature declare which columns are categorical
  colnames = colnames(bia1),
  categorical_feature = c("cateVar1", "cateVar2")
)

bia3 <- to_features(valid1)
bia4 <- valid1$price_1mi
dtest <- lgb.Dataset.create.valid(
  dataset = dtrain,
  data = bia3,
  label = bia4
)
valids <- list(test = dtest)

# All training settings live in `params`; newer lightgbm releases no longer
# accept them through `...` of lgb.train().
params <- list(
  objective = "regression",
  metric = "l2",       # the letter L followed by 2, not the number twelve
  min_data = 1,        # alias of min_data_in_leaf: minimum rows per leaf
  learning_rate = 0.1  # smaller is slower but may be more accurate
)
lgb1 <- lgb.train(
  params = params,
  data = dtrain,
  nrounds = 300,
  valids = valids,
  early_stopping_rounds = 20 # stop after 20 rounds without improvement
)

bia7 <- to_features(test_dt)
pre.lgb <- predict(lgb1, bia7)
2. 分类
根据官网demo文件提示
- MultiClass模型中 label必须是 数值型且必须起始于0
# LightGBM needs numeric labels, so the factor is converted to its integer codes.
# For multiclass the labels must start at 0 (0, 1, 2, 3, ...).
# as.numeric() on a factor yields codes starting at 1, hence the "- 1".
iris$Species <- as.numeric(as.factor(iris$Species)) - 1
- Binary模型中, label必须是 数值型[0,1]
so, 重要的事情说三遍! 重要的事情说三遍! 重要的事情说三遍 ... 也不一定记得住:
lightgbm 做分类预测, label 必须是 数值 且 起始于0
2.1 二分类
# Binary classification example: train on the rows of info1.csv whose
# `overdue` value is known and score the rows where it is missing.
library(bit64)
library(data.table)
library(dplyr)
library(lightgbm)

info1 <- fread("./data2/info1.csv", header = TRUE, encoding = "UTF-8")
train_dt <- info1 %>% filter(!is.na(overdue))
test_dt <- info1 %>% filter(is.na(overdue))

# Random ~80/20 train/validation split (valid_inx == 1 marks validation rows).
set.seed(123)
train_dt$valid_inx <- sample(
  c(1, 0),
  nrow(train_dt),
  replace = TRUE,
  prob = c(0.2, 0.8)
)
train_part <- train_dt %>% filter(valid_inx == 0)
valid_part <- train_dt %>% filter(valid_inx == 1)

# The label is flipped (1 - overdue) for training and flipped back on the
# predictions below. Assumes `overdue` is coded 0/1 -- TODO confirm.
bia1 <- train_part %>%
  select(-userid, -overdue, -valid_inx) %>%
  data.matrix()
bia2 <- 1 - train_part$overdue
bia3 <- valid_part %>%
  select(-userid, -overdue, -valid_inx) %>%
  data.matrix()
bia4 <- 1 - valid_part$overdue

dtrain <- lgb.Dataset(
  data = bia1,
  label = bia2,
  # Dataset-level settings go through `params`, not through `...`
  params = list(is_sparse = FALSE),
  # colnames / categorical_feature declare which columns are categorical
  colnames = colnames(bia1),
  categorical_feature = c("sex", "occupation", "education", "marriage")
)
dtest <- lgb.Dataset.create.valid(dtrain, data = bia3, label = bia4)
valids <- list(test = dtest)

## --- way2
# Every training setting, including the bagging options, is kept in one list;
# newer lightgbm releases no longer accept them through `...` of lgb.train().
param <- list(
  num_leaves = 70,          # max leaves per tree (LightGBM's default is 31)
  min_data_in_leaf = 1,
  learning_rate = 0.1,      # smaller is slower but may be more accurate
  is_unbalance = TRUE,      # training set is class-imbalanced
  nthread = 3,
  verbose = 1,
  metric = c("AUC", "binary_logloss"), # evaluation metrics
  objective = "binary",
  bagging_fraction = 0.7,   # share of training rows sampled per bagging round
  bagging_freq = 10,        # re-sample every 10 iterations
  bagging_seed = 1          # seed for the bagging sampler
)
lgb2 <- lgb.train(
  params = param,
  data = dtrain,
  nrounds = 200,
  early_stopping_rounds = 10,
  valids = valids
)

pred2 <- predict(lgb2, data.matrix(test_dt %>% select(-userid, -overdue)))
# data.frame() keeps each column's own type; cbind() would first coerce
# userid and the probability into a single common type.
pred20 <- data.frame(
  userid = test_dt$userid,
  probability = 1 - pred2, # flip back to the original `overdue` orientation
  stringsAsFactors = FALSE
)
2.2 多分类
多分类 与 二分类参数设置上的不同主要有:
- 需要设置 num_class(label类别数量)
# num_class is the number of distinct label values (3 in this example)
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
- predict 结果输出形式可以自定义
# Predicted class probabilities as a (30x3) matrix, requested with
# reshape = TRUE: one row per observation, one column per class.
# class1 class2 class3
# obs1 obs1 obs1
# obs2 obs2 obs2
# .... .... ....
# `model` and `test` are not defined in this snippet; they come from the
# full iris demo.
my_preds <- predict(model, test[, 1:4], reshape = TRUE)
# Raw scores, i.e. the values before the Sigmoid/Softmax transformation
my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
# Index of the leaf each observation falls into
# NOTE(review): newer lightgbm releases appear to have changed these predict()
# arguments (reshape / rawscore / predleaf) -- confirm against the installed version.
my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)
下面是lightgbm官网对 iris 数据集的分类预测demo, 供实践参考
# Multiclass demo on the iris data set (adapted from the LightGBM R demos).
# library() fails loudly if the package is missing; require() only returns FALSE.
library(lightgbm)

# Load the iris data set that ships with R
data(iris)

# LightGBM needs numeric labels, and multiclass labels must start at 0
# (0, 1, 2, ...). as.numeric() on a factor starts at 1, hence the "- 1".
iris$Species <- as.numeric(as.factor(iris$Species)) - 1

# 80% train / 20% validation:
# the last 10 samples of each class are held out for validation
train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
valids <- list(test = dtest)

# Method 1 of training
# All training settings are kept in `params`; newer lightgbm releases no
# longer accept them through `...` of lgb.train().
params <- list(
  objective = "multiclass",
  metric = "multi_error",
  num_class = 3,
  min_data = 1,
  learning_rate = 1
)
model <- lgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,
  valids = valids,
  early_stopping_rounds = 10
)

# NOTE(review): newer lightgbm releases appear to have changed the predict()
# arguments used below (reshape / rawscore / predleaf) -- confirm against the
# installed version before running.

# Probability of each class, one column per class:
my_preds <- predict(model, test[, 1:4], reshape = TRUE)
#>            [,1]       [,2]       [,3]
#> [1,] 0.82590130 0.08704935 0.08704935
#> [2,] 0.82590130 0.08704935 0.08704935
#> [3,] 0.82590130 0.08704935 0.08704935
#> [4,] 0.82590130 0.08704935 0.08704935
#> [5,] 0.82590130 0.08704935 0.08704935
#> [6,] 0.82590130 0.08704935 0.08704935

# Raw scores, i.e. the values before the Sigmoid/Softmax transformation
my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
#>      [,1]  [,2]  [,3]
#> [1,] 1.50 -0.75 -0.75
#> [2,] 1.50 -0.75 -0.75
#> [3,] 1.50 -0.75 -0.75
#> [4,] 1.50 -0.75 -0.75
#> [5,] 1.50 -0.75 -0.75
#> [6,] 1.50 -0.75 -0.75

# Index of the leaf each observation falls into
my_preds <- predict(model, test[, 1:4], predleaf = TRUE)
#>       [,1] [,2] [,3]
#>  [7,]    0    0    0
#>  [8,]    0    0    0
#>  [9,]    0    0    0
#> [10,]    0    0    0
#> [11,]    1    6    0
#> [12,]    2    6    0
3. 参数调优
## For faster speed
# Use bagging by setting bagging_fraction and bagging_freq
# Use feature sub-sampling by setting feature_fraction
# Use small max_bin
# Use save_binary to speed up data loading in future learning
# Use parallel learning, refer to parallel learning guide.
## For better accuracy
# Use large max_bin (may be slower)
# Use small learning_rate with large num_iterations
# Use large num_leaves(may cause over-fitting)
# Use bigger training data
# Try dart
## Deal with over-fitting
# Use small max_bin
# Use small num_leaves
# Use min_data_in_leaf and min_sum_hessian_in_leaf
# Use bagging by setting bagging_fraction and bagging_freq
# Use feature sub-sampling by setting feature_fraction
# Use bigger training data
# Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
# Try max_depth to avoid growing deep tree