
Notes on installing and using Microsoft's heavy-hitter LightGBM in R

2017-11-02  飘舞的鼻涕

Installation

Installing the R version of lightgbm is noticeably more involved than the usual install.packages('xx') that finishes in a minute. The author fell into quite a few pits during the initial install and went back and forth with Microsoft's R package maintainers several times before it worked, so the installation notes are recorded here for the benefit of those who come later.
Note: for any questions, please leave a comment on the github blog, or join QQ group [174225475] to discuss and improve together.

  1. Non-GPU version
  2. GPU version
    References
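For quick reference, here is a minimal sketch of the non-GPU route as it works today (not necessarily the exact steps the author went through in 2017); lightgbm has since been published on CRAN, so the simplest path is a plain install.packages() call:

# Simplest route today: lightgbm is on CRAN (CPU build)
install.packages('lightgbm')

# Building from source, roughly following the repository's R-package docs
# (run in a shell, requires git and a C++ toolchain):
#   git clone --recursive https://github.com/microsoft/LightGBM
#   cd LightGBM
#   Rscript build_r.R

library(lightgbm)  # check that the package loads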

Applications

1. Regression

train_dt <- df1 %>% filter(label1!=0)
test_dt <- df1 %>% filter(label1==0)
indx1 <- sample(c(0,1),size = nrow(train_dt),replace = TRUE,prob = c(0.8,0.2))  
train1 <- train_dt %>% filter(indx1==0)
valid1 <- train_dt %>% filter(indx1==1)

library(lightgbm)
#lgb.unloader(wipe = TRUE)
bia1 <- train1 %>% 
  select(-ids1,-label1) %>% 
  data.matrix() # change class(Vars) into numeric
bia2 <- train1$price_1mi
dtrain <- lgb.Dataset(data=bia1,
                      label=bia2,
                      is_sparse=FALSE,
                      # colnames/categorical_feature are used to specify categorical features
                      colnames = colnames(train1 %>% select(-ids1,-label1)),
                      categorical_feature = c('cateVar1','cateVar2'))
 
bia3 <- valid1 %>% 
  select(-ids1,-label1) %>%
  data.matrix() # change class(Vars) into numeric
bia4 <- valid1$price_1mi
dtest <- lgb.Dataset.create.valid(dataset=dtrain,
                                  data=bia3,
                                  label=bia4)
valids <- list(test=dtest)

params <- list(objective = "regression", metric = "l2") # "l2" is lowercase L-two, not twelve
lgb1 <- lgb.train(params=params,
                  data=dtrain,
                  valids = valids,
                  min_data = 1, # minimum number of data points in a leaf
                  learning_rate = 0.1, # smaller is slower but may be more accurate
                  nrounds = 300,
                  early_stopping_rounds = 20) # stop if no improvement over the last 20 rounds

bia7 <- test_dt %>% 
  select(-ids1,-label1) %>%
  data.matrix() # change class(Vars) into numeric

pre.lgb <- predict(lgb1, bia7)
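After training, it is often useful to sanity-check the model on the held-out data and to look at which variables matter most. A short sketch (not part of the original post) using the objects defined above:

# RMSE on the validation set (bia3 = features, bia4 = labels from above)
pred_valid <- predict(lgb1, bia3)
sqrt(mean((pred_valid - bia4)^2))

# gain/cover/frequency importance of each feature
imp <- lgb.importance(lgb1, percentage = TRUE)
head(imp, 10)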

2. Classification

Following the hints in the official demo files:

# We must convert factors to numeric
# They must be starting from number 0 to use multiclass
# For instance: 0, 1, 2, 3, 4, 5...
iris$Species <- as.numeric(as.factor(iris$Species)) - 1

So, say the important thing three times! Say it three times! Say it three times ... and you still might not remember it:

For classification with lightgbm, the label must be numeric and must start from 0.

2.1 Binary classification

library(bit64)
library(data.table)
library(dplyr)
library(lightgbm)

info1 <- fread('./data2/info1.csv',header = TRUE,encoding = 'UTF-8')

train_dt <- info1 %>% filter(!is.na(overdue))
test_dt <- info1 %>% filter(is.na(overdue))

set.seed(123)
train_dt$valid_inx <- sample(c(1,0),nrow(train_dt),replace = TRUE,prob = c(0.2,0.8))

bia1 <- train_dt %>% filter(valid_inx==0) %>% 
  select(-userid,-overdue,-valid_inx) %>% data.matrix()
bia2 <- 1-(train_dt %>% filter(valid_inx==0))$overdue
bia3 <- train_dt %>% filter(valid_inx==1) %>% 
  select(-userid,-overdue,-valid_inx) %>% data.matrix()
bia4 <- 1-(train_dt %>% filter(valid_inx==1))$overdue

dtrain <- lgb.Dataset(data = bia1, 
                      label = bia2,
                      is_sparse = FALSE,
                      # colnames/categorical_feature are used to specify categorical features
                      colnames = colnames(bia1),
                      categorical_feature = c('sex','occupation','education','marriage'))
dtest <- lgb.Dataset.create.valid(dtrain, 
                                  data = bia3, 
                                  label = bia4)
valids <- list(test = dtest)


## --- way2
param <- list(num_leaves = 70, # guideline: num_leaves <= 2^(max_depth)
              min_data_in_leaf = 1,
              learning_rate = 0.1, # smaller is slower but may be more accurate
              is_unbalance = TRUE, # set TRUE for an unbalanced training set
              nthread = 3,
              verbose = 1,
              metric = c("auc", "binary_logloss"), # evaluation metrics
              objective = "binary")
lgb2 <- lgb.train(params=param, 
                  data=dtrain,
                  nrounds = 200,
                  early_stopping_rounds = 10,
                  valids = valids,
                  bagging_fraction = 0.7, # fraction of data randomly sampled for each bagging iteration
                  bagging_freq = 10, # perform bagging every 10 iterations
                  bagging_seed = 1) # random seed for bagging

pred2 <- predict(lgb2,data.matrix(test_dt %>% select(-userid,-overdue)))
pred20 <- as.data.frame(cbind(userid=test_dt$userid,probability = 1-pred2))
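A small follow-up sketch (not in the original post): because the label was flipped to 1 - overdue for training, pred2 is the probability of not being overdue, so pred20$probability is the overdue probability; a plain 0.5 cut-off turns it into class labels, and the booster keeps the best iteration found by early stopping:

lgb2$best_iter   # best iteration found by early stopping
lgb2$best_score  # corresponding validation score

# classify with a simple 0.5 threshold on the overdue probability
pred20$pred_class <- as.integer(pred20$probability > 0.5)
table(pred20$pred_class)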

2.2 Multiclass classification

The main differences in parameter settings between multiclass and binary classification are:

  1. num_class (the number of label classes) must be set:
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
  2. The output form of predict can be customized:
# A (30x3) matrix with the predictions, use parameter reshape
# class1 class2 class3
#   obs1   obs1   obs1
#   obs2   obs2   obs2
#   ....   ....   ....
my_preds <- predict(model, test[, 1:4], reshape = TRUE)

# We can also get the predicted scores before the Sigmoid/Softmax application
my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)

# We can also get the leaf index
my_preds <- predict(model, test[, 1:4], predleaf = TRUE, reshape = TRUE)

Below is the official lightgbm classification demo on the iris dataset, for practical reference:

require(lightgbm)
# We load the default iris dataset shipped with R
data(iris)

# We must convert factors to numeric
# They must be starting from number 0 to use multiclass
# For instance: 0, 1, 2, 3, 4, 5...
iris$Species <- as.numeric(as.factor(iris$Species)) - 1

# We cut the data set into 80% train and 20% validation
# The 10 last samples of each class are for validation

train <- as.matrix(iris[c(1:40, 51:90, 101:140), ])
test <- as.matrix(iris[c(41:50, 91:100, 141:150), ])
dtrain <- lgb.Dataset(data = train[, 1:4], label = train[, 5])
dtest <- lgb.Dataset.create.valid(dtrain, data = test[, 1:4], label = test[, 5])
valids <- list(test = dtest)

# Method 1 of training
params <- list(objective = "multiclass", metric = "multi_error", num_class = 3)
model <- lgb.train(params = params,
                   data = dtrain,
                   nrounds = 100,
                   valids = valids,
                   min_data = 1,
                   learning_rate = 1,
                   early_stopping_rounds = 10)

# probability for each class, one column per class:
my_preds <- predict(model, test[, 1:4], reshape = TRUE)
            [,1]       [,2]       [,3]
 [1,] 0.82590130 0.08704935 0.08704935
 [2,] 0.82590130 0.08704935 0.08704935
 [3,] 0.82590130 0.08704935 0.08704935
 [4,] 0.82590130 0.08704935 0.08704935
 [5,] 0.82590130 0.08704935 0.08704935
 [6,] 0.82590130 0.08704935 0.08704935
# We can also get the predicted scores before the Sigmoid/Softmax application
my_preds <- predict(model, test[, 1:4], rawscore = TRUE, reshape = TRUE)
       [,1]  [,2]  [,3]
 [1,]  1.50 -0.75 -0.75
 [2,]  1.50 -0.75 -0.75
 [3,]  1.50 -0.75 -0.75
 [4,]  1.50 -0.75 -0.75
 [5,]  1.50 -0.75 -0.75
 [6,]  1.50 -0.75 -0.75
# We can also get the leaf index
my_preds <- predict(model, test[, 1:4], predleaf = TRUE)
      [,1] [,2] [,3]
 [7,]    0    0    0
 [8,]    0    0    0
 [9,]    0    0    0
[10,]    0    0    0
[11,]    1    6    0
[12,]    2    6    0
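To turn the per-class probability matrix back into 0-based class labels (not shown in the official demo), one can simply take the column with the highest probability:

prob <- predict(model, test[, 1:4], reshape = TRUE)  # class probabilities, one column per class
pred_class <- max.col(prob) - 1                      # most probable column, shifted back to 0-based labels
table(predicted = pred_class, actual = test[, 5])    # confusion table against the held-out labels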

3. Parameter tuning

## For faster speed
# Use bagging by setting bagging_fraction and bagging_freq
# Use feature sub-sampling by setting feature_fraction
# Use small max_bin
# Use save_binary to speed up data loading in future learning
# Use parallel learning, refer to parallel learning guide.

## For better accuracy
# Use large max_bin (may be slower)
# Use small learning_rate with large num_iterations
# Use large num_leaves(may cause over-fitting)
# Use bigger training data
# Try dart

## Deal with over-fitting
# Use small max_bin
# Use small num_leaves
# Use min_data_in_leaf and min_sum_hessian_in_leaf
# Use bagging by setting bagging_fraction and bagging_freq
# Use feature sub-sampling by setting feature_fraction
# Use bigger training data
# Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
# Try max_depth to avoid growing deep tree
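As an illustration only (the values below are placeholders, not recommendations for any particular dataset), the over-fitting advice above might translate into a parameter list like this:

params_tuned <- list(objective        = "binary",
                     metric           = "auc",
                     learning_rate    = 0.05,  # small learning_rate with more rounds
                     num_leaves       = 31,    # keep num_leaves moderate
                     max_depth        = 6,     # limit tree depth
                     min_data_in_leaf = 20,
                     feature_fraction = 0.8,   # feature sub-sampling
                     bagging_fraction = 0.7,   # bagging
                     bagging_freq     = 5,
                     lambda_l1        = 0.1,   # L1/L2 regularization
                     lambda_l2        = 0.1)
model <- lgb.train(params = params_tuned,
                   data = dtrain,
                   nrounds = 1000,
                   valids = valids,
                   early_stopping_rounds = 50)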

References

lightgbm R-package github
lightgbm demos
