# [R - ml] Model optimization methods: bagging
# 2018-11-23
# Author: XuXiaolu
# Build the data ----
# Synthetic regression data: y is 1..1000 and each predictor is y
# scaled by independent uniform noise on [0, 2], so the predictors are
# noisy linear transforms of the response.
set.seed(10)
y <- seq_len(1000)  # seq_len(), not the seq(1:n) anti-pattern
x1 <- y * runif(1000, min = 0, max = 2)
x2 <- y * runif(1000, min = 0, max = 2)
x3 <- y * runif(1000, min = 0, max = 2)
my.data <- data.frame(y, x1, x2, x3)
# Linear model ----
# Baseline: ordinary least squares of y on every other column, fitted
# on the full data set; print the coefficient table.
lm.fit <- lm(y ~ ., data = my.data)
summary(lm.fit)
# Train / test split ----
# Hold out 25% of the rows for testing, refit the linear model on the
# remaining 75%, and score it with test-set RMSE.
set.seed(10)
n.train <- floor(0.75 * nrow(my.data))
in.train <- sample(nrow(my.data), size = n.train)
training <- my.data[in.train, ]
testing <- my.data[-in.train, ]
lm.fit <- lm(y ~ ., data = training)
pred <- predict(lm.fit, newdata = testing)
rmse <- sqrt(mean((testing$y - pred)^2))
# rmse = 161.1543
# Bagging ----
# Manual bagging: fit `iterations` linear models, each on a random 25%
# subsample of the training rows, and collect their test-set
# predictions as columns of one matrix (one column per iteration).
library(foreach)  # library() fails loudly if the package is missing; require() only warns
iterations <- 1000
length.sample <- 1 / 4
size <- floor(nrow(training) * length.sample)
pred <- foreach(i = seq_len(iterations), .combine = cbind) %do% {
  in.train <- sample(nrow(training), size = size)  # draw a 25% subsample
  lm.fit <- lm(y ~ ., data = training[in.train, ])
  predict(lm.fit, newdata = testing)
}
# Peek at the first rows of the per-iteration prediction matrix.
head(pred)
# Bagging aggregation rule:
#   regression models  -> average the predictions
#   classifiers        -> majority vote
final.pred <- rowMeans(pred)
rmse <- sqrt(mean((testing$y - final.pred)^2))
# rmse = 161.0751
# The gain from bagging is small on this data set.
# Bagging helps more when the base models vary a lot across subsamples.
# Define a custom bagging function ----
#' Bagged linear-model predictions.
#'
#' Fits `iterations` linear models, each on a random
#' `1 / length_divisor` fraction of the training rows, predicts on the
#' test set with every model, and averages the predictions row-wise.
#' Requires the foreach package to be attached.
#'
#' @param training A data.frame the base models are fitted on.
#' @param testing A data.frame to predict on.
#' @param formula Model formula passed to `lm()`.
#' @param length_divisor Subsample size is `nrow(training) / length_divisor`.
#' @param iterations Number of base models to fit.
#' @return A numeric vector of length `nrow(testing)` with the averaged
#'   predictions.
bagging <- function(training, testing, formula = y ~ x1 + x2 + x3,
                    length_divisor = 4, iterations = 100) {
  sub_size <- floor(nrow(training) / length_divisor)
  predictions <- foreach(i = seq_len(iterations), .combine = cbind) %do% {
    # sample() without replacement already yields unique row indices, so
    # the training set can be indexed directly — the original's logical
    # mask via %in% (and its "posistions" typo) was redundant; lm() is
    # row-order invariant, so the fitted model is identical.
    rows <- sample(nrow(training), size = sub_size)
    fit <- lm(formula, data = training[rows, ])
    predict(fit, newdata = testing)
  }
  rowMeans(predictions)
}