第四章 散点图
第一节 绘制基本散点图
library(gcookbook)
View(heightweight)
heightweight[,c("ageYear","heightIn")]
library(ggplot2)
ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point()
ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(shape=21)
ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(size=1.5)
简单散点图
第二节 使用点的形和颜色属性,并基于某变量对数据进行分组
heightweight[,c("sex","ageYear","heightIn")]
ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=sex))+
geom_point()
ggplot(heightweight,aes(x = ageYear,y = heightIn,shape=sex))+
geom_point()
ggplot(heightweight,aes(x = ageYear,y = heightIn,shape=sex,colour=sex))+
geom_point()
分类变量映射点的属性
第三节 使用不同的默认设置的点形
ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(shape=3)
ggplot(heightweight,aes(x = ageYear,y = heightIn,shape=sex))+
geom_point(size=3)+
scale_shape_manual(values = c(1,4)) #修改电形
hw <- heightweight
hw$weightGroup <- cut(hw$weightLb,breaks = c(-Inf,100,Inf))
ggplot(hw,aes(hw$ageYear,hw$heightIn,shape=sex,fill=weightGroup))+
geom_point(size=2.5)+
scale_shape_manual(values = c(21,24))+
scale_fill_manual(values = c(NA,"black"),
guide=guide_legend(override.aes = list(shape=21)))
scale_shape_manual
第四节 将连续变量映射到点的颜色或大小属性上
ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=weightLb))+
geom_point()
ggplot(heightweight,aes(x = ageYear,y = heightIn,size=weightLb))+
geom_point()
连续变量映射到点的属性
将连续变量映射至颜色上,并 scale_fill_gradient()赋予颜色的范围
ggplot(heightweight,aes(x = ageYear,y = heightIn,fill=weightLb))+
geom_point(shape=21,size=2.5)+
scale_fill_gradient(low = "black",high = "white")
连续变量映射至颜色上,并 scale_fill_gradient()赋予颜色的范围
使用离散变量代替色阶
range(heightweight$weightLb)
ggplot(heightweight,aes(x = ageYear,y = heightIn,fill=weightLb))+
geom_point(shape=21,size=2.5)+
scale_fill_gradient(low = "black",high = "white",
breaks=seq(70,170,by=20),
guide=guide_legend())
使用离散变量代替色阶
将连续变量映射给大小,将分类变量映射给颜色
ggplot(heightweight,aes(x = ageYear,y = heightIn,size=weightLb,colour=sex))+
geom_point(alpha=.5)+
scale_size_area()+ #使数据点面积正比于变量值
scale_color_brewer(palette = "Set1")
连续变量映射给大小,将分类变量映射给颜色
第五节 处理图形重叠
当绘制的点数目巨大时,我们怎么办?怎么解决重叠?
sp <- ggplot(diamonds,aes(x = carat,y = price))
sp+geom_point()
图形重叠
解决办法1,使用半透明的点
sp+geom_point(alpha=.5)
sp+geom_point(alpha=.1)
通过透明度调节alpha=.1
解决办法2,将数据分箱,并用矩形表示(适用于量化分析)
sp+stat_bin2d()
sp+stat_bin2d(bins = 50)+
scale_fill_gradient(low = "lightblue",high = "red",limits=c(0,6000))
数据分箱
解决办法3,将数据分箱,并以六边形表示
library(hexbin)
sp+stat_bin_hex()+
scale_fill_gradient(low = "lightblue",high = "red",limits=c(0,8000))
sp+stat_bin_hex()+
scale_fill_gradient(low = "lightblue",high = "red",
breaks=c(0,250,500,1000,2000,4000,6000),
limits=c(0,8000))
数据分箱,并以六边形表示
解决办法4,离散性数据直接使用箱型图,或者添加随即扰动
sp1 <- ggplot(ChickWeight,aes(x = Time,y = weight))
sp1+geom_point()
离散型数据
sp1+geom_point(position = "jitter")
sp1+geom_point(position = position_jitter(width = .3,height = 0))
sp1+geom_boxplot(aes(group=Time))
扰动
箱型图
第六节 添加回归模拟线
*线性模拟线
library(gcookbook)
View(heightweight)
heightweight[,c("ageYear","heightIn")]
library(ggplot2)
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn))
sp+geom_point()+geom_smooth(method = lm)
回归模拟线
sp+geom_point()+geom_smooth(method = lm,level = 0.99) #99%的置信区间 level
sp+geom_point()+geom_smooth(method = lm,se = F)#没有置信区间 se
sp+geom_point(colour="grey60")+geom_smooth(method = lm,se=F,colour="black")
最后一条代码
sp+geom_point()+geom_smooth() #默认情况下是局部加权多项式,即loess曲线
默认情况下是局部加权多项式,即loess曲线
*logistic模型
library(MASS)
b <- biopsy
b$cla <- factor(ifelse(b$class=="benign",0,1),levels = c(0,1))
ggplot(b,aes(x = V1,y = cla))+
geom_point(position = position_jitter(width = .3,height=0.06),
alpha = 0.4,shape = 21,size = 1.5)+
stat_smooth(method = glm,family=binomial)
分类变量使用logistic回归
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=sex))+
geom_point(alpha=.5)+ #使数据点面积正比于变量值
scale_color_brewer(palette = "Set1")
sp+stat_smooth(method = lm,se=F,fullrange = T)
分类变量映射至颜色后进一步添加模拟线
第七节 根据已有的模型向散点图添加拟合线
library(gcookbook)
data(heightweight)
model <- lm(heightIn ~ ageYear,+I(ageYear^2),data=heightweight)
model
xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)
predicted <- data.frame(ageYear=seq(xmin,xmax,length.out=100))
predicted$heightIn <- predict(model,predicted)
head(predicted)
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(colour="grey40")
sp+geom_line(data=predicted,size=1)
根据模型heightIn ~ ageYear,+I(ageYear^2)添加点图,预测后添加模拟线
#定义predictvals()函数
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(colour="grey40")
predictvals <- function(model,xvar,yvar,xrange=NULL,samples=100,...){
if(is.null(xrange)){
if(any(class(model)%in%c("lm","glm")))
xrange <- range(model$model[[xvar]])
else if(any(class(model)%in%"loess"))
xrange <- range(model$x)
}
newdata <- data.frame(x=seq(xrange[1],xrange[2],length.out=samples))
names(newdata) <- xvar
newdata[[yvar]] <- predict(model,newdata=newdata, ...)
newdata
}
modelinear <- lm(heightIn~ageYear,heightweight) #使用lm和loess建模
modelloess <- loess(heightIn~ageYear,heightweight)
lm_predict <- predictvals(modelinear,"ageYear","heightIn") #调用predictivals()函数进行预测
loess_predict <- predictvals(modelloess,"ageYear","heightIn")
sp+geom_line(data = lm_predict,colour="red",size=.8)+
geom_line(data = loess_predict,colour="blue",size=.8)
使用lm和loess建模,调用predictivals()函数,并将结果传递给geomline
第八节 添加来自多个模型的模拟曲线
make_model <- function(data){
lm(heightIn~ageYear,data)
}
library(gcookbook)
library(plyr)
models <- dlply(heightweight,"sex",.fun=make_model)
models
predvals <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn")
predvals
ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=sex))+
geom_point()+
geom_line(data = predvals)
#> table(predvals$sex)
#f m
#100 100
#> range(predvals$ageYear[predvals$sex=="f"])
#[1] 11.58 17.50
#> range(predvals$ageYear[predvals$sex=="m"])
#[1] 11.58 17.17
根据性别添加多条模拟线
predvals <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn",
xrang=range(heightweight$ageYear))
ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=sex))+
geom_point()+
geom_line(data = predvals)
根据性别添加多条模拟线
第九节 向散点图添加模型系数
library(gcookbook)
model <- lm(heightIn~ageYear,data = heightweight)
summary(model)
Call:
lm(formula = heightIn ~ ageYear, data = heightweight)
Residuals:
Min 1Q Median 3Q Max
-8.3517 -1.9006 0.1378 1.9071 8.3371
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 37.4356 1.8281 20.48 <2e-16 ***
ageYear 1.7483 0.1329 13.15 <2e-16 ***
Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 ‘ ’ 1
Residual standard error: 2.989 on 234 degrees of freedom
Multiple R-squared: 0.4249, Adjusted R-squared: 0.4225
F-statistic: 172.9 on 1 and 234 DF, p-value: < 2.2e-16
pred <- predictvals(model,"ageYear","heightIn")
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point()+
geom_line(data = pred)
sp+annotate("text",label = "r^2=0.42",x=16.5,y = 52)
sp+annotate("text",label = "r^2==0.42",x=16.5,y = 52,parse=T)
添加公式
第十节 向散点图添加边际地毯
library(ggplot2)
ggplot(faithful,aes(x = eruptions,y = waiting))+
geom_point()+
geom_rug()
ggplot(faithful,aes(x = eruptions,y = waiting))+
geom_point()+
geom_rug(position = "jitter",size = .2) #添加扰动,增加线宽
geom_rug(position = "jitter",size = .2)
第十一节 像散点图添加标签
library(gcookbook)
subset(countries,Year==2009 & healthexp>2000)
sp <- ggplot(subset(countries,Year==2009 & healthexp>2000),
aes(x = healthexp,y = infmortality))+
geom_point()
sp+annotate("text",x = 4350,y = 5.4,label = "Canada")+
annotate("text",x = 7400,y = 6.8, label = "USA")
annotate
sp+geom_text(aes(label = Name),size=4)
sp+geom_text(aes(label = Name),size=4,vjust=0) #vjust文本的上下基线调整
sp+geom_text(aes(y=infmortality+.1, label = Name),size=4,vjust=0)
sp+geom_text(aes(label = Name),size=4,hjust=0) #hjust文本左右对齐
sp+geom_text(aes(x = healthexp+100,label = Name),size=4,hjust=0)
hjust右对齐
adat <- subset(countries,Year==2009 & healthexp>2000)
adat$Name1 <- adat$Name
idx <- adat$Name1%in%c("Canada","Denmark","Austria","Norway","Sweden","Portugal","Ireland")
adat$Name1[!idx] <- NA
ggplot(adat,aes(x = healthexp,y = infmortality))+ geom_point()+
geom_text(aes(x = healthexp+100,label = Name1),size=4,hjust=0) +
xlim(2000,10000)
为个别点添加注释
第十二节 绘制气泡图
cdat <- subset(countries,Year==2009&
Name%in% c("Canada","Ireland","United States"))
p <- ggplot(cdat,aes(x = healthexp,y = infmortality,size=GDP))+
geom_point(shape=21,colour="black",fill="cornsilk")
p+scale_size_area(max_size = 15)
通过scale_size_area添加形状
总结
基本绘图的语句不多,但熟能生巧,这些图都基本版图的堆积以及变形!