R语言实战17:分类

2019-08-22  本文已影响0人  冬之心

knitr::opts_chunk$set(echo = TRUE)
包 函数 分类方法
基本包 glm() 逻辑回归
rpart rpart() 经典决策树
party ctree() 条件推断树
randomForest randomForest() 经典决策树的随机森林
party cforest() 条件推断树的随机森林
e1071 svm() 支持向量机

数据准备

# Read the Wisconsin breast-cancer data set from the UCI Machine
# Learning Repository. Missing values are coded as "?" in the raw file.
loc <- "http://archive.ics.uci.edu/ml/machine-learning-databases/"
ds <- "breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url <- paste0(loc, ds)
breast <- read.table(url, sep = ",", header = FALSE, na.strings = "?")
# Column names follow the data set's documentation ("maginalAdhesion"
# keeps the book's original spelling so downstream code is unaffected).
names(breast) <- c(
  "ID", "clumpThickness", "sizeUniformity", "shapeUniformity",
  "maginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
  "blandChromatin", "normalNucleoli", "mitosis", "class"
)

df <- breast[-1]  # drop the ID column

# Recode the outcome (2 = benign, 4 = malignant) as a labelled factor.
df$class <- factor(df$class, levels = c(2, 4),
                   labels = c("benign", "malignant"))

# Split into a 70% training set and a 30% validation set.
# floor() makes the truncation of the non-integer sample size explicit
# rather than relying on sample()'s implicit coercion.
set.seed(1234)
train <- sample(nrow(df), floor(0.7 * nrow(df)))
df.train <- df[train, ]      # 70% of rows for training
df.validate <- df[-train, ]  # remaining 30% for validation

# Check that the class distribution is similar in both sets.
table(df.train$class)
table(df.validate$class)

Logit回归

# Fit a logistic regression model on the training set.
fit.logit <- glm(class ~ ., data = df.train, family = binomial())
summary(fit.logit)

# Score the validation set. predict() returns log-odds by default;
# type = "response" yields fitted probabilities instead.
prob <- predict(fit.logit, df.validate, type = "response")
logit.pred <- factor(prob > .5, levels = c(FALSE, TRUE),
                     labels = c("benign", "malignant"))

# Assess accuracy by cross-tabulating predicted vs. actual class.
logit.perf <- table(df.validate$class, logit.pred,
                    dnn = c("Actual", "Predicted"))
logit.perf

决策树

经典决策树

library(rpart)
set.seed(1234)
# Grow a classification tree using the information (entropy) criterion.
dtree <- rpart(class ~ ., data = df.train, method = "class",
               parms = list(split = "information"))

# cptable lists, for each tree size: CP (complexity parameter),
# nsplit (number of splits), rel error (training error), xerror
# (10-fold cross-validated error) and xstd (its standard error).
dtree$cptable

# Plot CV error against CP. The dashed line marks min(xerror) + 1 SE;
# the 1-SE rule picks the smallest tree whose xerror is below it.
plotcp(dtree)

# Prune using a CP chosen automatically by the 1-SE rule instead of a
# hard-coded constant, so the script stays valid under re-sampling.
cp.tab <- dtree$cptable
se1 <- min(cp.tab[, "xerror"]) +
  cp.tab[which.min(cp.tab[, "xerror"]), "xstd"]
best.cp <- cp.tab[which(cp.tab[, "xerror"] <= se1)[1], "CP"]
dtree.pruned <- prune(dtree, cp = best.cp)

# package 'rpart.plot' was not available for R 3.6.0, so the fancy
# tree plot is left commented out.
#library(rpart.plot)
#prp(dtree.pruned, type=2, extra=104, fallen.leaves=TRUE, main="Decision Tree")

# Classify the validation set with the pruned tree and cross-tabulate.
dtree.pred <- predict(dtree.pruned, df.validate, type = "class")
dtree.perf <- table(df.validate$class, dtree.pred,
                    dnn = c("Actual", "Predicted"))
dtree.perf
Rplot02.png

条件推断树


library(party)
# Conditional inference tree: split selection uses permutation-test
# p-values, so no pruning step is required.
fit.ctree <- ctree(class ~ ., data = df.train)
plot(fit.ctree, main = "Conditional Inference Tree")

# Predict validation-set classes and cross-tabulate against the truth.
# (Variable renamed from the typo "ctree.perd" to "ctree.perf" for
# consistency with logit.perf / dtree.perf / forest.perf / svm.perf.)
ctree.pred <- predict(fit.ctree, df.validate, type = "response")
ctree.perf <- table(df.validate$class, ctree.pred,
                    dnn = c("Actual", "Predicted"))
ctree.perf
Rplot.png
library(partykit)
# Visualize the pruned rpart tree via partykit's plot method, as an
# alternative since rpart.plot could not be installed.
plot(as.party(dtree.pruned))
Rplot01.png

随机森林

library(randomForest)
set.seed(1234)
# Grow a random forest. na.roughfix imputes missing predictor values,
# and importance = TRUE stores variable-importance measures.
fit.forest <- randomForest(class ~ ., data = df.train,
                           na.action = na.roughfix, importance = TRUE)
fit.forest

# Variable importance, type = 2: total decrease in node impurity
# (Gini index) attributable to each predictor.
importance(fit.forest, type = 2)

# Classify the validation sample and tabulate predicted vs. actual.
forest.pred <- predict(fit.forest, df.validate)
forest.perf <- table(df.validate$class, forest.pred,
                     dnn = c("Actual", "Predicted"))
forest.perf

支持向量机(SVM)

library(e1071)

# Fit an SVM with the default radial-basis kernel parameters.
set.seed(1234)
fit.svm <- svm(class ~ ., data = df.train)
fit.svm

# Evaluate on the complete cases of the validation set, since the
# fitted SVM cannot score rows with missing predictors.
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class, svm.pred,
                  dnn = c("Actual", "Predicted"))
svm.perf

# Tune gamma and cost over a coarse log-scale grid by cross-validation.
set.seed(1234)
tuned <- tune.svm(class ~ ., data = df.train,
                  gamma = 10^(-6:1), cost = 10^(-10:10))
tuned

# Refit with the winning parameters read from the tuning object itself
# instead of hard-coding gamma = 0.01, cost = 1, then re-evaluate.
fit.svm <- svm(class ~ ., data = df.train,
               gamma = tuned$best.parameters$gamma,
               cost = tuned$best.parameters$cost)
svm.pred <- predict(fit.svm, na.omit(df.validate))
svm.perf <- table(na.omit(df.validate)$class, svm.pred,
                  dnn = c("Actual", "Predicted"))
svm.perf
上一篇 下一篇

猜你喜欢

热点阅读