机器学习(R语言) part1.preprocessing
2016-08-29 本文已影响305人
任海亮
本文是Johns Hopkins Univerisity<Practical Machine Learning>的课堂笔记
文章是Rmarkdown中编写的(需要安装Rstudio和knitr)
首先安装caret包
install.packages("rmarkdown")
install.packages("knitr")
install.packages("caret")
{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE)
{r pack, warning=FALSE}
#导入caret包和ggplot2包
library(kernlab)#import spam
library(caret)
library(ggplot2)
random split划分数据集
{r, warning=FALSE}
data(spam)
str(spam)
## 'data.frame': 4601 obs. of 58 variables:
## $ make : num 0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
## $ address : num 0.64 0.28 0 0 0 0 0 0 0 0.12 ...
## $ all : num 0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
## $ num3d : num 0 0 0 0 0 0 0 0 0 0 ...
## $ our : num 0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
## $ over : num 0 0.28 0.19 0 0 0 0 0 0 0.32 ...
## $ remove : num 0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
## $ internet : num 0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
## $ order : num 0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
## $ mail : num 0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
## $ receive : num 0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
## $ will : num 0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
## $ people : num 0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
## $ report : num 0 0.21 0 0 0 0 0 0 0 0 ...
## $ addresses : num 0 0.14 1.75 0 0 0 0 0 0 0.12 ...
## $ free : num 0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
## $ business : num 0 0.07 0.06 0 0 0 0 0 0 0 ...
## $ email : num 1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
## $ you : num 1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
## $ credit : num 0 0 0.32 0 0 0 0 0 3.53 0.06 ...
## $ your : num 0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
## $ font : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num000 : num 0 0.43 1.16 0 0 0 0 0 0 0.19 ...
## $ money : num 0 0.43 0.06 0 0 0 0 0 0.15 0 ...
## $ hp : num 0 0 0 0 0 0 0 0 0 0 ...
## $ hpl : num 0 0 0 0 0 0 0 0 0 0 ...
## $ george : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num650 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lab : num 0 0 0 0 0 0 0 0 0 0 ...
## $ labs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ telnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num857 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 0 0 0 0 0 0.15 0 ...
## $ num415 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num85 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ technology : num 0 0 0 0 0 0 0 0 0 0 ...
## $ num1999 : num 0 0.07 0 0 0 0 0 0 0 0 ...
## $ parts : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ direct : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ cs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ meeting : num 0 0 0 0 0 0 0 0 0 0 ...
## $ original : num 0 0 0.12 0 0 0 0 0 0.3 0 ...
## $ project : num 0 0 0 0 0 0 0 0 0 0.06 ...
## $ re : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ edu : num 0 0 0.06 0 0 0 0 0 0 0 ...
## $ table : num 0 0 0 0 0 0 0 0 0 0 ...
## $ conference : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charSemicolon : num 0 0 0.01 0 0 0 0 0 0 0.04 ...
## $ charRoundbracket : num 0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
## $ charSquarebracket: num 0 0 0 0 0 0 0 0 0 0 ...
## $ charExclamation : num 0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
## $ charDollar : num 0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
## $ charHash : num 0 0.048 0.01 0 0 0 0 0 0.022 0 ...
## $ capitalAve : num 3.76 5.11 9.82 3.54 3.54 ...
## $ capitalLong : num 61 101 485 40 40 15 4 11 445 43 ...
## $ capitalTotal : num 278 1028 2259 191 191 ...
## $ type : Factor w/ 2 levels "nonspam","spam": 2 2 2 2 2 2 2 2 2 2 ...
inTrain <- createDataPartition(y=spam$type,p =0.75,list=FALSE)
training <-spam[inTrain,]
testing <-spam[-inTrain,]
dim(training)
建模
{r, warning=FALSE}
set.seed(32343)
modelFit <- train(type~.,data = training,method ="glm")
modelFit
## Generalized Linear Model
##
## 3451 samples
## 57 predictor
## 2 classes: 'nonspam', 'spam'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9135469 0.8170884
##
##
modelFit$finalModel
##
## Call: NULL
##
## Coefficients:
## (Intercept) make address
## -1.502e+00 -6.005e-01 -1.306e-01
## all num3d our
## 1.062e-01 2.070e+00 4.989e-01
## over remove internet
## 5.783e-01 2.091e+00 6.461e-01
## order mail receive
## 6.232e-01 5.494e-02 2.028e-01
## will people report
## -1.649e-01 7.905e-02 1.167e-01
## addresses free business
## 1.001e+00 1.473e+00 9.881e-01
## email you credit
## 9.061e-02 8.283e-02 6.958e-01
## your font num000
## 2.186e-01 1.971e-01 2.628e+00
## money hp hpl
## 6.129e-01 -1.860e+00 -9.281e-01
## george num650 lab
## -1.011e+01 7.645e-01 -2.274e+00
## labs telnet num857
## -3.314e-01 -2.708e-01 1.333e+00
## data num415 num85
## -6.654e-01 2.115e+00 -2.018e+00
## technology num1999 parts
## 8.414e-01 8.711e-02 1.335e+00
## pm direct cs
## -7.447e-01 -1.470e-01 -4.479e+01
## meeting original project
## -3.197e+00 -8.725e-01 -1.326e+00
## re edu table
## -9.060e-01 -1.234e+00 -1.879e+00
## conference charSemicolon charRoundbracket
## -3.710e+00 -1.332e+00 -3.722e-01
## charSquarebracket charExclamation charDollar
## -7.725e-01 2.528e-01 6.307e+00
## charHash capitalAve capitalLong
## 2.252e+00 -2.200e-03 9.859e-03
## capitalTotal
## 6.601e-04
##
## Degrees of Freedom: 3450 Total (i.e. Null); 3393 Residual
## Null Deviance: 4628
## Residual Deviance: 1366 AIC: 1482
predictionglm <-predict(modelFit,newdata=testing)
head(predictionglm)
## [1] spam spam spam spam spam spam
## Levels: nonspam spam
使用混淆矩阵,准确率,KAPPA,P-VALUE等
{r, warning=FALSE}
confusionMatrix(predictionglm,testing$type)
## Confusion Matrix and Statistics
##
## Reference
## Prediction nonspam spam
## nonspam 663 43
## spam 34 410
##
## Accuracy : 0.933
## 95% CI : (0.917, 0.9468)
## No Information Rate : 0.6061
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8593
## Mcnemar's Test P-Value : 0.3619
##
## Sensitivity : 0.9512
## Specificity : 0.9051
## Pos Pred Value : 0.9391
## Neg Pred Value : 0.9234
## Prevalence : 0.6061
## Detection Rate : 0.5765
## Detection Prevalence : 0.6139
## Balanced Accuracy : 0.9281
##
## 'Positive' Class : nonspam
data slicing
folds <-createFolds(y=spam$type,k=10,list = TRUE, returnTrain = FALSE)
sapply(folds,length)
folds[[1]][1:10]
folds
flods2 <-createResample(y=spam$type,times =10,list = TRUE)
sapply(flods2,length)
flods2[[1]][1:10]
tme<-1:1000
folds3 <- createTimeSlices(y=tme,initialWindow = 20,horizon = 10)#time slice
names(folds3)
folds3$train[[1]]
folds3$test[[1]]
args(train.default)#train option
args(trainControl)
plot predictor
install.packages("ISLR")#use the dataset wage
{r wage data, warning=FALSE}
library(ISLR)
library(ggplot2)
library(caret)
data(Wage)
str(Wage)
## 'data.frame': 3000 obs. of 12 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ sex : Factor w/ 2 levels "1. Male","2. Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
get data
wagedata <- createDataPartition(y=Wage$wage,p=0.7,list=FALSE)
#划分数据集
trainingwage <-Wage[wagedata,]
testingwage <-Wage[-wagedata,]
dim(trainingwage)
featurePlot(x= trainingwage[,c("age","education")],
y=trainingwage$wage,plot = "pairs")
featurePlot(trainingwage[,c("age")],trainingwage$wage,plot="scatter")
Paste_Image.png
Paste_Image.png
add regression smoothers
{r, warning=FALSE}
ggplot(Wage,aes(age,wage,color=education))+geom_point()+geom_smooth(method='lm',formula = y~x)
Paste_Image.png
TABLE
{r, warning=FALSE}
t1 <- table(trainingwage$education,trainingwage$jobclass)
t1
##
## 1. Industrial 2. Information
## 1. < HS Grad 145 57
## 2. HS Grad 433 241
## 3. Some College 250 211
## 4. College Grad 184 289
## 5. Advanced Degree 72 220
prop.table(t1,1)
ggplot(Wage,aes(wage,color=education))+geom_density()
Paste_Image.png
preprcocessing
preprocessfunction in Caret
{r, warning=FALSE}
preobj <-preProcess(training[,-58],method = c('center','scale'))
traincapavess <-predict(preobj,training[,-58])$capitalAve
mean(traincapavess)
## [1] 1.051522e-17
sd(traincapavess)
## [1] 1
set.seed(32343)
modelFit<-train(type~.,data=training,preProcess=c("center","scale"),method="glm")
deal with missing value,用KNN 赋值,eg
library(RANN)
#新建一列capAve
training$capAve <-training$capitalAve
#随机一些NA值给新建列
selectNA <-rbinom(dim(training)[1],size=1,prob=0.05)==1
training$capAve[selectNA] <- NA
#用KNN方法赋值
preobj1 <-preProcess(training[,-58],method='knnImpute')
capAve <- predict(preobj1,training[,-58])$capAve
Covariate creation
raw data to covariate, balance is summarization vs information loss
from tidy covariate to new covariates like sqrt the covariate
should be done only on the training set
the best approach is throught exploratory analysis (plotting/tables)
new covariate should be add to data frames
deal dummy variables哑变量
table(trainingwage$jobclass)
##
## 1. Industrial 2. Information
## 1084 1018
dummies <-dummyVars(wage~jobclass, data=trainingwage)
tr <- predict(dummies,newdata=trainingwage)
remove zero covariates移除零相关变量
nsv <- nearZeroVar(trainingwage,saveMetrics = TRUE)
nsv
## freqRatio percentUnique zeroVar nzv
## year 1.011561 0.33301618 FALSE FALSE
## age 1.081081 2.85442436 FALSE FALSE
## sex 0.000000 0.04757374 TRUE TRUE
## maritl 3.174009 0.23786870 FALSE FALSE
## race 8.386473 0.19029496 FALSE FALSE
## education 1.424947 0.23786870 FALSE FALSE
## region 0.000000 0.04757374 TRUE TRUE
## jobclass 1.064833 0.09514748 FALSE FALSE
## health 2.497504 0.09514748 FALSE FALSE
## health_ins 2.279251 0.09514748 FALSE FALSE
## logwage 1.011905 18.93434824 FALSE FALSE
## wage 1.011905 18.93434824 FALSE FALSE
so you can through away sex region, that won't help to the prediction parameters
sbline basis
preprocessing with PCA twoway:SVD & PCA
smallSpam <-spam[,c(34,32)]
prco <-prcomp(smallSpam)
plot(prco$x[,1],prco$x[,2])
Paste_Image.png
plot PCA
1.用prcomp方法
typecolor <-((spam$type =="spam")*1+1)
table(typecolor)
## typecolor
## 1 2
## 2788 1813
prco <- prcomp(log10(spam[,-58]+1))#计算整个DATA的PCA,log是为了看起来更高斯分布
plot(prco$x[,1],prco$x[,2],col= typecolor,xlab="PC1",ylab="PC2")
Paste_Image.png
2.plot PCA with caret
preprospam <- preProcess(log10(spam[,-58]+1),method = "pca",pcaComp =2)
spamPC <- predict(preprospam,log10(spam[,-58]+1))
plot(spamPC[,1],spamPC[,2],col=typecolor)
Paste_Image.png
3.alternative option
modelfit <-train(training$type~.,method = "glm",preProcess ="pca",data=training)
confusionMatrix(testing$type,predict(modelfit,testing))
prediction with regression
data(faithful)
#splitdata in train and test
intrainf <-createDataPartition(y=faithful$waiting,p=0.5,list=FALSE)
trainFaith<-faithful(intrainf)
testFaith<-faithful(-intrainf)
lm1 <-lm(erution~waiting,data=trainFaith)
summary(lm1)
plot(trainFaith$waiting,trainFaith$eruptions)
lines(trainFaith$waiting,lm1$fitted)
#predict
newdata <-data.frame(waiting=80)
predict(lm1,newdata)
#calculate RMSE
sqrt(sum(lm1$fitted-trainFaith$eruptions)^2)
sqrt(sum(predict((lm1,newdata=testFaith)-testFaith$eruptions)^2))
#predict intervals