数据-R语言-图表-决策-Linux-Python

R语言实战_终篇

2017-03-29  本文已影响258人  00e6a8fd618f

经过近三个月的学习,匆忙将《R in action》过了一遍,自己实现了其中的绝大部分代码。当然这个过程是机械重复而非创造性的工作。不过编程初期就是这样把,不断重复别人的代码,领会编程的奥妙。

其实初期的学习因为没有问题导向,看的比较浅薄,以至于在后来统计课程的需要,自己亲手做了一个小的项目才意识到学习方法需要改进、统计知识需要补充。好在通过整个过程,各个方面都在提高。

之前一直在按照章节更新学习的笔记。其实最有帮助的只是代码的注释。因为后边的学习没有详尽的笔记,只保存了部分代码及注释,分享在此希望能对后学者有所帮助。另,如有任何问题欢迎通过各种方式联系我,大家一起讨论。

接下来将投入到R在交通方面的具体问题学习中。具体的近期任务是mlogit及交通相关的几个packages的学习。


#13 泊松回归

#载入程序包
library(rrcov)
library(robust)

#查看数据
data(breslow.dat, package="robust")
names(breslow.dat)
summary(breslow.dat[c(6, 7, 8, 10)]) #查看所研究变量的统计信息
#基础信息图表
opar <- par(no.readonly=TRUE) #创建新的绘图参数
par(mfrow=c(1,2)) #一行两列放置图形
attach(breslow.dat)
hist(sumY, breaks=20, #绘制sumY直方图,分组参考20
     xlab="Geizure Count",
     main="Distribution of Seizures")
boxplot(sumY ~ Trt, xlab="Treamtment",  #按照Trt分组,对sumY绘制箱线图
        main="Group Comparisons")
par(opar)
#进行泊松回归
fit <- glm(sumY ~ Base + Age + Trt, data=breslow.dat, family=poisson())
summary(fit) #显示模型主要结果
#解释模型参数
coef(fit) #显示模型系数(此为对数系数)
exp(coef(fit)) #指数化系数
#过度离势检验
library(qcc)
qcc.overdispersion.test(breslow.dat$sumY, type="poisson") #是否存在过度离势检验
fit.od <- glm(sumY ~ Base + Age + Trt, data=breslow.dat,
              family=quasipoisson()) #类泊松方法拟合
summary(fit.od)
#时间段变化的泊松回归
fittime <- glm(sumY ~ Base + Age + Trt, data=breslow.dat,
               offset= log(time), family=poisson())

opar <- par(no.readonly=TRUE) #

#14主成份和因子分析
#主成分分析-主成分个数判断-三种特征值判别准则
library(psych)
fa.parallel(USJudgeRatings[,-1],  #所有观测(列),删去第一列
            fa="pc", n.iter=100, #100个随机数据矩阵对比
            #原书代码不能执行,修改fa="pc"
            show.legend=FALSE, 
            main="Scree plot with parallel analysis")
abline(h=1, lwd=1, col="green") #手动添加直线

#代码清单14-1 主成分分析
library(psych)
pc <- principal(USJudgeRatings[,-1], nfactors=1)
pc
#case2 
library(psych)
fa.parallel(Harman23.cor$cov, n.obs=302, #n.obs观测数
            fa="pc", n.iter=100,
            show.legend=FALSE, main="Scree plot with parallel analysis")
#代码清单14-2 身高测量指标的主成分分析
library(psych)
pc <- principal(Harman23.cor$cov, 
                nfactors=2, rotate="none")
pc
#代码清单14-3 方差极大旋转的主成分分析
rc <- principal(Harman23.cor$cov, nfactors=2, rotate="varimax")
rc
#代码清单14-4 从原始数据中获取成分得分
library(psych)
pc <- principal(USJudgeRatings[,-1], nfactors=1, score=TRUE)
head(pc$scores) #查看各个观测对象在主成分上的得分,head
cor(USJudgeRatings$CONT, pc$score) #变量CONT与评分间的相关系数
#代码清单14-5 获取主成分得分的系数
library(psych)
rc <- principal(Harman23.cor$cov, nfactors=2, rotate="varimax")
round(unclass(rc$weights),2)

#14.3 探索性因子分析
options(digits=2) #显示小数位数
covariances <- ability.cov$cov #ability.cov协方差矩阵数据转换为相关系数矩阵
correlations <- cov2cor(covariances)  #cov2cor()转化相关系数矩阵
##判断需提取的因子数
fa.parallel(correlations, n.obs=112,  #观测数112
            fa="both", n.iter=100, #fa="both"同时展示PCA和EFA分析结果
            main="Scree plots with parallel analysis")
#提取公共因子
##代码清单14-5 为旋转的主轴迭代因子法
fa <- fa(correlations, nfactors=2, rotate="none", fm="pa")
fa
#因子旋转
##代码清单14-7 正交旋转提取因子
fa.varimax <- fa(correlations, nfacotrs=2, rotate="varimax", fm="pa")
fa.varimax
#斜交旋转提取因子
fa.promax <- fa(correlations, nfactors=2, rotate="promax", fm="pa")
fa.promax
#因子结构矩阵(因子载荷阵)求解函数:F = P * Phi
fsm <- function(oblique) {
  if  (class(oblique)[2] == "fa" & is.null(oblique$Phi)) {
       waning("Object doesn't look oblique EFA")
  } else {
       P <- unclass(oblique$loading)
       F <- P %*% oblique$Phi
       colnames(F) <- c("PA1", "PA2")
       return(F)
     }
}
#使用函数
fsm(fa.promax)
#绘图结果可视化
factor.plot(fa.promax, labels=rownames(fa.promax$loadings)) #轴为载荷大小
fa.diagram(fa.promax, simple=FALSE)


#第15章 处理缺失数据的高级方法
install.packages(c("VIM", "mice"))
library(VIM)
library(mice)

data(sleep, package="VIM") #加载数据集
sleep[complete.cases(sleep),] #列出没有缺失值的行
sleep[!complete.cases(sleep),] #列出含有缺失值的行

sum(is.na(sleep$Dream)) #Dream数据有多少个缺失值
mean(is.na(sleep$Dream)) #缺失占总体比例
mean(!complete.cases(sleep)) #含有确缺失数据的观测的比例

library(mice)
data(sleep, package="VIM")
md.pattern(sleep)

library(VIM)
aggr(sleep, prop=FALSE, numbers=TRUE)
matrixplot(sleep)

#用相关性探索缺失值
x <- as.data.frame(abs(is.na(sleep))) #计算缺失数据
head(sleep, n=5)  #查看原始数据
head(x, n=5) #查看缺失数据(1为缺失)
y <- x[which(sd(x) > 0)] #提取*含*缺失值的变量
cor(y) #指示变量间相关系数

#实例分析(行删除)
complete.cases(sleep) #识别缺失值行
newdata <- sleep[complete.cases(sleep), ]
na.omit(sleep)
options(digits=1) #显示小数点后1位
cor(na.omit(sleep))
#多重插补
library(mice)
data(sleep, package="VIM")
imp <- mice(sleep, seed=1234)
fit <- with(imp, lm(Dream ~ Span + Gest))
pooled <- pool(fit)
summary(pooled)

#16 高级图形进阶
library(lattice)
singer
histogram(~height | voice.part, data=singer,
          main="Distribution of Heights by Voice Pitch",
          xlab="Height (inches)")
#16-1 lattice绘图示例
library(lattice)
attach(mtcars)
mtcars
gear <- factor(gear, levels=c(3, 4, 5),
               labels=c("3 gears", "4 gears", "5 gears")) #分组标签
cyl <- factor(cyl,levels=c(4, 6, 8),
              labels=c("4 cylinders", "6 cylinders", "8 cylinders"))
densityplot(~mpg, main="Density Plot",
            xlab="Miles per Gallon") #核密度图
densityplot(~mpg | cyl,
            main="Density Plot by Number of Cylinders",
            xlab="Miles per Gallon") #条件变量核密度图
bwplot(cyl ~ mpg | gear,
       main="Box Plots by Cylinders and Gears",
       xlab="Miles per Gallon", ylab="Cylinders") #箱线图
xyplot(mpg ~ wt | cyl * gear,
       main="Scatter Plots by Cylinders and Gears",
       xlab="Car Weight", ylab="Miles per Gallon") #散点图
cloud(mpg ~ wt * qsec | cyl,
      main="3D Scatter Plots by Cylinders") #三维散点图
dotplot(cyl ~ mpg | gear, 
        main="Dot Plots by Number of Gears and Cylinders",
        xlab="Miles Per Gallon") #点图
splom(mtcars[c(1, 3, 4, 5, 6)],
      main="Scatter Plot Matrix for mtcars Data") #散点图矩阵
detach(mtcars)
library(lattice)
mygraph <- densityplot(~height | voice.part, data=singer) #存储图形至mygraph
#条件变量 因子-离散变量
displacement <- equal.count(mtcars$disp, number=3, overlap=0)
xyplot(mpg ~ wt | displacement, data=mtcars,
       main="Miles per Gallon vs. Weight by Engine Displacement",
       xlab="Weight", ylab="Miles per Gallon",
       layout=c(3, 1), aspect=1.5) #aspect宽高比
#自定义面板函数xyplot
displacement <- equal.count(mtcars$disp, number=3, overlap=0)
              #equal.count连续变量离散化
mypanel <- function(x, y) {
             panel.xyplot(x, y, pch=19) #pch=19填充圆圈
             panel.rug(x, y) #轴须线
             panel.grid(h=-1, v=-1) #水平和竖直网格线,负值强制对齐轴标签
             panel.lmline(x, y, col="red", lwd=1, lty=2)
           } #自定义面板函数
xyplot(mpg~wt | displacement, data=mtcars,
       layout=c(3, 1),
       aspect=1.5, #锁定宽高比
       main="Miles per Gallon vs. Weight by Engine Displacement",
       xlab="Weight",
       ylab="Miles per Gallon",
       panel=mypanel)
#自定义面板函数和额外选系选项的plot
library(lattice)
mtcars$transmission <- factor(mtcars$am, levels=c(0, 1),
                              labels=c("Automatic", "Manual"))
panel.smoother <- function(x, y) {
                    panel.grid(h=-1, v=-1) #强制对齐轴标签的网格线
                    panel.xyplot(x, y) #散点图
                    panel.loess(x, y) #绘制非参数拟合曲线
                    panel.abline(h=mean(y), lwd=2, lty=2, col="green")#添加均直线
                  }
xyplot(mpg~disp|transmission, data=mtcars,
       scales=list(cex=.8, col="red"),
       panel=panel.smoother,
       xlab="Displacement", ylab="Miles per Gallon",
       main="MPG vs Displacement by Transmission Type",
       sub="Dotted lines aer Group Means", aspect=1) #sub下标题
#叠加显示分组变量图形
library(lattice)
mtcars$transmission <- factor(mtcars$am, levels=c(0, 1),
                               labels=c("Automatic", "Manual"))
densityplot(~mpg, data=mtcars,
            group=transmission,
            main="MPG Distribution by Transmission Type",
            xlab="Miles per Gallon",
            auto.key=TRUE)#auto.key手动添加图例
auto.key=list(space="right", columns=1, title="Transmission")
#16-4
library(lattice)
mtcars$transmission <- factor(mtcars$am, levels=c(0, 1),
                              labels=c("Automatic", "Manual"))
colors = c("red", "blue")
lines = c(1, 2)
points = c(16, 17)
key.trans <- list(title="Transmission", 
                  space="bottom", columns=2,
                  #图例摆放在下方,两栏
                  text=list(levels(mtcars$transmission)),
                  #水平名称
                  points=list(pch=points, col=colors),
                  lines=list(col=colors, lty=lines),
                  cex.title=1, cex=.9)
densityplot(~mpg, data=mtcars,
            group=transmission,
            main="MPG Distribution by Transmission Type", 
            xlab="Miles per Gallon",
            pch=points, lty=lines, col=colors,
            lwd=2, jitter=.005,
            key=key.trans)
#16-5 包含分组变量和条件变量以及自定义图例的xyplot
library(lattice)
colors <- "darkgreen"
symbols <- c(1:12)
linetype <- c(1:3)
CO2
key.species <- list(title="Plant",
                    space="right",#标签在右
                    text=list(levels(CO2$Plant)),
                    points=list(pch=symbols, col=colors))

xyplot(uptake~conc|Type*Treatment, data=CO2,
       group=Plant,
       type="o",
       pch=symbols, col=colors, lty=linetype,
       main="Carbon Dioxide Uptake\nin Grass Plants",
       ylab=expression(paste("Uptake",
              bgroup("(", italic(frac("umol", "m"^2)), ")"))),
       xlab=expression(paset("Concentration",
              bgroup("(", italic(frac(mL,L)), ")"))),
       sub="Grass Species:Echinochloa crus-galli",
       key=key.species)
#图形参数
show.settings()
mysettings <- trellis.par.get()
mysettings$superpose.symbol
mysettings$superpose.symbol$pch <- c(1:10)
trellis.par.set(mysettings)
show.settings()
#页面摆放
#splite
library(lattice)
graph1 <- histogram(~height|voice.part, data=singer,
                    main="Heights of Choral Singers by Voice Part")
graph2 <- densityplot(~height, data=singer, group=voice.part,
                      plot.points=FALSE, auto.key=list(columns=4))
plot(graph1, split=c(1, 1, 1, 2))
  plot(graph2, split=c(1, 2, 1, 2), newpage=FALSE)
#position
library(lattice)
graph1 <- histogram(~height|voice.part, data=singer,
                    main="Heights of Choral Singers by Voice Part")
graph2 <- densityplot(~height, data=singer, group=voice.part,
                      plot.point=FALSE, auto.key=list(columns=4))
plot(graph1, position=c(0, 3, 1, 1))
plot(graph2, position=c(0, 0, 1, 3), newpage=FALSE)
#ggplot2
library(ggplot2)
mtcars$cylinder <- as.factor(mtcars$cyl)
qplot(cylinder, mpg, data=mtcars, geom=c("boxplot", "jitter"),
      fill=cylinder,
      main="Box plots with superimposed data points",
      xlab="Number of Cylinders",
      ylab="Miles per Gallon")
#case2
library(ggplot2)
transmission <- factor(mtcars$am, levels=c(0, 1),
                       labels=c("Automatic", "Manual"))
qplot(wt, mpg, data=mtcars, 
      color=transmission, shape=transmission,
      geom=c("point", "smooth"),
      method="lm", formula=y~x,
      xlab="Weight", ylab="Miles Per Gallon",
      main="Regression Example")
#case3
library(ggplot2)
mtcars$cyl <- factor(mtcars$cyl, levels=c(4, 6, 8),
                     labels=c("4 cylinders", "6 cylinders", "8 cylinders"))
mtcars$am <- factor(mtcars$am, levels=c(0, 1),
                    labels=c("Automatic", "Manual"))
qplot(wt, mpg, data=mtcars, facets=am ~ cyl, size=hp)
#case4
library(ggplot2)
data(singer, package="lattice")
qplot(height, data=singer, geom=c("density"),
      facets=voice.part ~ . , fill=voice.part)

#交互式图形
install.packages(c("playwith", "latticist", "iplot", "rggobi"))
#identify()
plot(mtcars$wt, mtcars$mpg)
identify(mtcars$wt, mtcars$mpg, labels=row.names(mtcars))
#playwith
library(playwith)
library(lattice)
playwith(
  xyplot(mpg~wt|factor(cyl)*factor(am),
         data=mtcars, subscripts=TRUE,
         type=c("r", "p"))
)
#latticist
install.packages("latticist")
library(latticist)
mtcars$cyl <- factor(mtcars$cyl)
mtcars$gear <- factor(mtcars$gear)
latticist(mtcars, use.playwith=TRUE)
#iplots
library(iplots)
attach(mtcars)
cylinders <- factor(cyl)
gears <- factor(gear)
transmission <- factor(am)
ihist(mpg)
ibar(gears)
iplot(mpg, wt)
ibox(mtcars[c("mpg", "wt", "qsec", "disp", "hp")])
ipcp(mtcars[c("mpg", "wt", "qsec", "disp", "hp")])
imosaic(transmission, cylinders)
detach(mtcars)

##########
alpha
cov
ability.cov
数据集:Harman74.cor
install.packages("qcc")
install.packages(c("playwith", "latticist", "iplot", "rggobi"))

其实一个详尽的笔记应该是更好的学习分享方式,之后会注意,尽量不再出现现在这样的烂尾工程。


——2017.3.29
——实验中心510

上一篇 下一篇

猜你喜欢

热点阅读