Chapter 7: Model Evaluation
2021-11-27 zd200572
7.2 k-fold cross-validation of model performance
This method helps guard against overfitting: the data are split into 10 folds, each fold serves once as the test set, and the remaining nine folds train the model.
library(modeldata)
library(e1071)
data(mlc_churn)
churnTrain <- mlc_churn
# Assign each row to one of 10 folds (contiguous blocks; see the note below)
ind <- cut(1:nrow(churnTrain), breaks = 10, labels = FALSE)
accuracies <- c()
for (i in 1:10) {
  # Train on the nine folds other than fold i
  fit <- svm(churn ~ ., churnTrain[ind != i, ])
  # Predict the held-out fold and count correct predictions
  predictions <- predict(fit, churnTrain[ind == i, !names(churnTrain) %in% c("churn")])
  correct_count <- sum(predictions == churnTrain[ind == i, "churn", drop = TRUE])
  accuracies <- append(accuracies, correct_count / nrow(churnTrain[ind == i, ]))
}
accuracies
[1] 0.902 0.922 0.904 0.884 0.890 0.874 0.902 0.906 0.874 0.920
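Note that cut() over the unshuffled row numbers yields contiguous folds; if the rows carry any ordering, randomizing the fold assignment is safer, e.g.:
# Shuffle the fold labels so each fold is a random sample of rows (seed is arbitrary)
set.seed(1)
ind <- sample(cut(1:nrow(churnTrain), breaks = 10, labels = FALSE))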
7.3 Cross-validation with the e1071 package
# Cross-validation with e1071
library(e1071)
churnTrain <- mlc_churn[, !names(mlc_churn) %in% c("state", "area_code", "account_length")]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob = c(0.7, 0.3))
trainset <- churnTrain[ind == 1, ]
testset <- churnTrain[ind == 2, ]
# tune.svm runs 10-fold cross-validation internally; the argument is tunecontrol
tuned <- tune.svm(churn ~ ., data = trainset, gamma = 10^-2,
                  cost = 10^2, tunecontrol = tune.control(cross = 10))
summary(tuned)
# Error estimation of ‘svm’ using 10-fold cross validation: 0.07473914
tuned$performances
gamma cost error dispersion
1 0.01 100 0.07473914 0.01605983
svmfit <- tuned$best.model
table(trainset[, "churn", drop = TRUE], predict(svmfit))
yes no
yes 400 93
no 14 2972
The tune function performs parameter optimization via grid search; see ?tune for the other tuning wrappers.
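For example, a wider grid can be searched in a single call (the gamma and cost ranges below are my own illustrative choices, not values from the text):
# Grid search over several gamma/cost combinations, 10-fold CV each
tuned_grid <- tune.svm(churn ~ ., data = trainset,
                       gamma = 10^(-3:-1), cost = 10^(1:3),
                       tunecontrol = tune.control(cross = 10))
summary(tuned_grid)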
7.4 Cross-validation with the caret package
# caret
library(caret)
# 10-fold cross-validation, repeated 3 times
control <- trainControl(method = "repeatedcv",number = 10,
repeats = 3)
model <- train(churn ~ ., data = trainset, method = "rpart",
               preProcess = "scale", trControl = control)
model
CART
3479 samples
19 predictor
2 classes: 'yes', 'no'
Pre-processing: scaled (69)
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 3131, 3132, 3130, 3130, 3131, 3132, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.04056795 0.9301563 0.6720876
0.05578093 0.9147378 0.5774005
0.07505071 0.8731365 0.2416613
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.04056795.
trainControl sets the resampling scheme via method; available options include boot, boot632, cv, repeatedcv, LOOCV, LGOCV, none, oob, adaptive_cv, adaptive_boot, and adaptive_LGOCV.
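For example, a leave-group-out scheme might be configured like this (p and number are illustrative values):
# 25 random 80/20 training/holdout splits
control_lgocv <- trainControl(method = "LGOCV", p = 0.8, number = 25)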
7.5 Ranking variable importance with the caret package
After a supervised model has been trained, we can assess how important each feature is by perturbing the input values and measuring how sensitively the model's output responds.
# Rank variable importance
importance <- varImp(model, scale = FALSE)
importance
rpart variable importance
only 20 most important variables shown (out of 69)
Overall
international_planyes 196.919
total_day_minutes 172.870
total_day_charge 161.582
number_customer_service_calls 155.274
total_intl_minutes 133.221
total_intl_charge 124.817
total_intl_calls 48.700
voice_mail_planyes 40.912
total_eve_charge 31.116
total_eve_minutes 31.116
...
plot(importance)
Finally, a plot!
Extension
Classification packages such as rpart store variable importance in the fitted model object, where it can be viewed and printed directly.
library(rpart)
model.rp <- rpart(churn~.,data = trainset)
model.rp$variable.importance
total_day_charge total_day_minutes state
161.951011 161.951011 85.145010
total_intl_minutes number_customer_service_calls total_intl_charge
80.504234 76.323349 75.141041
...
7.6 Ranking variable importance with the rminer package
This one took a lot of effort; it seems to work only with numeric variables.
# rminer
install.packages("rminer")
library(rminer)
model <- fit(Species ~ ., iris, model = "svm")
variable.importance <- Importance(model, iris, method = "sensv")
L <- list(runs = 1, sen = t(variable.importance$imp),
          sresponses = variable.importance$sresponses)
mgraph(L, graph = "IMP", leg = names(iris), col = "gray", Grid = 4)
7.7 Finding highly correlated features with the caret package
Drop the non-numeric attributes, compute the correlation matrix, and with the cutoff set to 0.75 pick out the highly correlated features.
# findCorrelation
new_train <- trainset[, !names(trainset) %in% c("churn",
             "international_plan", "voice_mail_plan")]
cor_mat <- cor(new_train)
highlyCorrelated <- findCorrelation(cor_mat, cutoff = 0.75)
names(new_train)[highlyCorrelated]
[1] "total_night_minutes" "total_intl_charge" "total_day_charge" "total_eve_charge"
The leaps, genetic, and anneal functions in the subselect package can achieve the same effect.
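As a rough sketch of the subselect route (the kmin/kmax values are illustrative choices, and the exact call pattern should be checked against ?anneal):
library(subselect)
# Simulated-annealing search for well-conditioned subsets of 2 to 5 variables
anneal_res <- anneal(cor_mat, kmin = 2, kmax = 5)
anneal_res$bestsets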
7.8 Feature selection with the caret package
Feature selection finds the subset of attributes with the lowest prediction error, helping us judge which features are actually needed to build an accurate model. The recursive feature elimination function rfe selects a qualifying feature subset automatically.
# Feature selection with caret
library(modeldata)
library(caret)
data(mlc_churn)
churnTrain <- mlc_churn
ind <- sample(2, nrow(churnTrain), replace = TRUE,
              prob = c(0.7, 0.3))
trainset <- churnTrain[ind==1,]
testset <- churnTrain[ind==2,]
# Feature transformation: turn the yes/no factors into 0/1 binary indicator columns
intl_plan <- model.matrix(~ trainset.international_plan - 1,
data = data.frame(trainset$international_plan))
colnames(intl_plan) <- c("trainset.international_planno"="intl_no",
"trainset.international_planyes"="intl_yes")
voice_plan <- model.matrix(~ trainset.voice_mail_plan - 1,
data = data.frame(trainset$voice_mail_plan))
colnames(voice_plan) <- c("trainset.voice_mail_planno"="voice_no",
"trainset.voice_mail_planyes"="voice_yes")
trainset$international_plan <- NULL
trainset$voice_mail_plan <- NULL
trainset <- cbind(intl_plan, voice_plan,trainset)
# Same operations for testset (see the dummyVars sketch after this block for a one-step alternative)
intl_plan <- model.matrix(~ testset.international_plan - 1,
data = data.frame(testset$international_plan))
colnames(intl_plan) <- c("testset.international_planno"="intl_no",
"testset.international_planyes"="intl_yes")
voice_plan <- model.matrix(~ testset.voice_mail_plan - 1,
data = data.frame(testset$voice_mail_plan))
colnames(voice_plan) <- c("testset.voice_mail_planno"="voice_no",
"testset.voice_mail_planyes"="voice_yes")
testset$international_plan <- NULL
testset$voice_mail_plan <- NULL
testset <- cbind(intl_plan, voice_plan,testset)
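Rather than repeating the model.matrix calls for each data set, caret's dummyVars can encode both sets from a single specification; a minimal sketch (its columns follow caret's own level-naming convention, not the intl_no/voice_no names used above):
# One dummy-coding specification, applied to both splits
dv <- dummyVars(~ international_plan + voice_mail_plan, data = mlc_churn)
train_dummies <- predict(dv, newdata = churnTrain[ind == 1, ])
test_dummies  <- predict(dv, newdata = churnTrain[ind == 2, ])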
# Build a feature-selection control around linear discriminant analysis, resampled with cv
ldaControl <- rfeControl(functions = ldaFuncs, method = "cv")
# Backward feature elimination; drop columns 5-7 (state, account_length, area_code)
ldaProfile <- rfe(trainset[, names(trainset) != "churn"][, -c(5, 6, 7)],
                  trainset[, "churn", drop = TRUE],
                  sizes = c(1:18), rfeControl = ldaControl)
ldaProfile
Recursive feature selection
Outer resampling method: Cross-Validated (10 fold)
Resampling performance over subset size:
Variables Accuracy Kappa AccuracySD KappaSD Selected
1 0.8554 0.02245 0.009841 0.06008
2 0.8554 0.02245 0.009841 0.06008
3 0.8494 0.23064 0.013363 0.10679
...
plot(ldaProfile, type = c("o","g"))
# The best subset of variables
ldaProfile$optVariables
# The fitted model
ldaProfile$fit
Call:
lda(x, y)
Prior probabilities of groups:
yes no
0.1417074 0.8582926
Group means:
postResample(predict(ldaProfile, testset[, names(testset) != "churn"]),
             testset[, "churn", drop = TRUE])
Accuracy Kappa
0.8520710 0.2523709
Plotting time again.
Extension
7.9 Evaluating regression model performance
Common metrics are the root mean square error (RMSE), the relative squared error (RSE), and the coefficient of determination (R-squared).
# Evaluating regression model performance
library(car)
data("Quartet")
plot(Quartet$x,Quartet$y3)
lmfit <- lm(y3 ~ x, data = Quartet)
abline(lmfit, col = "red")
predicted <- predict(lmfit, newdata = Quartet["x"])
actual <- Quartet$y3
# RMSE here is caret's function; that package is a treasure trove, it has everything
rmse <- RMSE(predicted, actual)
mu <- mean(actual)
rse <- mean((predicted-actual)^2)/mean((mu-actual)^2)
Rsquare <- 1-rse
c(rmse,rse,Rsquare)
[1] 1.118286 0.333676 0.666324
# Refit with rlm (robust regression) from the MASS package
library(MASS)
plot(Quartet$x, Quartet$y3)
rlmfit <- rlm(y3 ~ x, data = Quartet)
abline(rlmfit, col = "red")
predicted <- predict(rlmfit, newdata = Quartet["x"])
rmse <- (mean((predicted - actual)^2))^0.5
rse <- mean((predicted - actual)^2) / mean((mu - actual)^2)
rsquare <- 1 - rse
c(rmse, rse, rsquare)
[1] 1.2790448 0.4365067 0.5634933
The same old plot again. The advantage of rlm: it is insensitive to outliers.
Extension
library(e1071)
# 10-fold cross-validation of the linear model via e1071's tune
tune(lm, y3 ~ x, data = Quartet)
Error estimation of ‘lm’ using 10-fold cross validation: 2.351897
caret's train function can run the same cross-validation, as can cv.lm in the DAAG package.
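A minimal sketch of the caret route (the control settings are my own illustrative choice of a plain 10-fold CV):
# 10-fold CV of the linear model via caret::train
lm_cv <- train(y3 ~ x, data = Quartet, method = "lm",
               trControl = trainControl(method = "cv", number = 10))
lm_cv$results  # RMSE, Rsquared, MAE across the folds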
7.10 Evaluating predictive power with a confusion matrix
From a confusion matrix we can derive performance measures such as precision, recall, specificity, and accuracy.
# Confusion matrix
svm.model <- train(churn~., data=trainset,method = "svmRadial")
svm.pred <- predict(svm.model, testset[,names(testset)!="churn"])
table(svm.pred, testset[,"churn",drop=TRUE])
svm.pred yes no
yes 12 1
no 198 1290
confusionMatrix(svm.pred,testset[,"churn",drop=TRUE])
Confusion Matrix and Statistics
Reference
Prediction yes no
yes 12 1
no 198 1290
Accuracy : 0.8674
95% CI : (0.8492, 0.8842)
No Information Rate : 0.8601
P-Value [Acc > NIR] : 0.2183
Kappa : 0.0928
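The default output reports sensitivity and specificity; the precision and recall mentioned above are available through confusionMatrix's mode argument:
# Report precision/recall instead of sensitivity/specificity
confusionMatrix(svm.pred, testset[, "churn", drop = TRUE],
                mode = "prec_recall")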
7.11 Evaluating predictive power with ROCR
The receiver operating characteristic (ROC) curve is a common way to visualize the performance of a binary classifier: it plots the true positive rate against the false positive rate at different cutoffs. The area under the curve (AUC) is the usual summary measure of classification performance.
install.packages("ROCR")
library(ROCR)
svmfit <- svm(churn ~ ., data = trainset, prob = TRUE)
pred <- predict(svmfit, testset[, names(testset) != "churn"], probability = TRUE)
# Take the probability column for the "yes" label (safer by name than by position)
pred.prob <- attr(pred, "probabilities")
pred.to.roc <- pred.prob[, "yes"]
# Build the ROCR prediction object
pred.rocr <- prediction(pred.to.roc, testset$churn)
# Performance: AUC, then the TPR/FPR curve
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr")
plot(perf.tpr.rocr, colorize = T, main = paste("AUC:", (perf.rocr@y.values)))
7.12 Comparing ROC curves with the caret package
install.packages("pROC")
library(pROC)
# 10-fold CV repeated 3 times, keeping class probabilities
control <- trainControl(method = "repeatedcv", number = 10,
repeats = 3,classProbs = TRUE,
summaryFunction = twoClassSummary)
# glm classifier
glm.model <- train(churn ~., data = trainset,method = "glm",
metric = "ROC", trControl = control)
# svm classifier
svm.model <- train(churn ~., data = trainset,method = "svmRadial",
metric = "ROC", trControl = control)
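# rpart classifier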
rpart.model <- train(churn ~., data = trainset,method = "rpart",
metric = "ROC", trControl = control)
# Predict class probabilities with each model
glm.probs <- predict(glm.model, testset[,names(testset) != "churn"], type = "prob")
svm.probs <- predict(svm.model, testset[,names(testset) != "churn"], type = "prob")
rpart.probs <- predict(rpart.model, testset[,names(testset) != "churn"], type = "prob")
# Draw the three ROC curves in one plot
glm.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
predictor = glm.probs$yes,
levels = levels(testset[,"churn",drop=TRUE]))
plot(glm.ROC, type = "S", col = 'red')
svm.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
predictor = svm.probs$yes,
levels = levels(testset[,"churn",drop=TRUE]))
plot(svm.ROC, add = TRUE, col = 'green')
rpart.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
predictor = rpart.probs$yes,
levels = levels(testset[,"churn",drop=TRUE]))
plot(rpart.ROC, add = TRUE, col = 'blue')
I was stuck here for a long time, always getting the error Error in colnames(data) : argument "data" is missing, with no default. It turned out to be caused by tibbles not dropping dimensions by default; adding drop = TRUE solved it.
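A quick illustration of that behavior, assuming testset is still a tibble:
testset[, "churn"]               # stays a one-column tibble
testset[, "churn", drop = TRUE]  # drops to a plain factor vector, as roc() expects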
7.13 Comparing model performance with the caret package
# Collect the resampling results of the three models
cv.values <- resamples(list(glm=glm.model, svm=svm.model, rpart = rpart.model))
summary(cv.values)
Call:
summary.resamples(object = cv.values)
Models: glm, svm, rpart
Number of resamples: 30
ROC
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glm 0.7483007 0.7977494 0.8187351 0.8200275 0.8357939 0.8953608 0
svm 0.8356721 0.8594691 0.8722842 0.8778421 0.9018812 0.9184929 0
rpart 0.6446078 0.7146965 0.7666979 0.7773052 0.8459407 0.9173395 0
Sens
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glm 0.1600000 0.2088235 0.2672549 0.2667320 0.3184314 0.40 0
svm 0.2600000 0.3879412 0.4454902 0.4460654 0.5049020 0.66 0
rpart 0.2352941 0.3800000 0.4411765 0.4622876 0.5637255 0.76 0
Spec
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
glm 0.9509804 0.9639639 0.9673203 0.9666242 0.9705882 0.9771242 0
svm 0.9346405 0.9542484 0.9672131 0.9641094 0.9729749 0.9836601 0
rpart 0.9705882 0.9836601 0.9901639 0.9885492 0.9934641 1.0000000 0
dotplot(cv.values, metric = "ROC")
bwplot(cv.values, layout=c(3,1))
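To check whether the differences between models are statistically meaningful, resamples objects support diff:
# Pairwise differences of the resampled metrics, with t-tests in the summary
model.diff <- diff(cv.values)
summary(model.diff)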
Extension
The densityplot, splom, and xyplot functions can also be used for visualization.
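For instance (a sketch; metric names as printed in the summary above):
# Density curves of the ROC metric for the three models
densityplot(cv.values, metric = "ROC", auto.key = TRUE)
# Scatterplot matrix comparing the models on ROC
splom(cv.values, metric = "ROC")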