第四章 散点图
第一节 绘制基本散点图
library(gcookbook)
View(heightweight)
heightweight[,c("ageYear","heightIn")]
library(ggplot2)
ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point()
ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(shape=21)
ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(size=1.5)
data:image/s3,"s3://crabby-images/f03a1/f03a183384c3e74b5d2d79462076e77e17e1aac4" alt=""
第二节 使用点的形和颜色属性,并基于某变量对数据进行分组
heightweight[,c("sex","ageYear","heightIn")]
ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=sex))+
geom_point()
ggplot(heightweight,aes(x = ageYear,y = heightIn,shape=sex))+
geom_point()
ggplot(heightweight,aes(x = ageYear,y = heightIn,shape=sex,colour=sex))+
geom_point()
data:image/s3,"s3://crabby-images/f76cb/f76cbe27b74de3f20d7424badfddc534c168cd04" alt=""
第三节 使用不同的默认设置的点形
ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(shape=3)
ggplot(heightweight,aes(x = ageYear,y = heightIn,shape=sex))+
geom_point(size=3)+
scale_shape_manual(values = c(1,4)) #修改电形
hw <- heightweight
hw$weightGroup <- cut(hw$weightLb,breaks = c(-Inf,100,Inf))
ggplot(hw,aes(hw$ageYear,hw$heightIn,shape=sex,fill=weightGroup))+
geom_point(size=2.5)+
scale_shape_manual(values = c(21,24))+
scale_fill_manual(values = c(NA,"black"),
guide=guide_legend(override.aes = list(shape=21)))
data:image/s3,"s3://crabby-images/31da8/31da896cb0b8947b02a97cc8b533695aa23e11f7" alt=""
第四节 将连续变量映射到点的颜色或大小属性上
ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=weightLb))+
geom_point()
ggplot(heightweight,aes(x = ageYear,y = heightIn,size=weightLb))+
geom_point()
data:image/s3,"s3://crabby-images/6ff16/6ff16cdf897270594ed4ad56516197443d4fb1b7" alt=""
将连续变量映射至颜色上,并 scale_fill_gradient()赋予颜色的范围
ggplot(heightweight,aes(x = ageYear,y = heightIn,fill=weightLb))+
geom_point(shape=21,size=2.5)+
scale_fill_gradient(low = "black",high = "white")
data:image/s3,"s3://crabby-images/b3b7a/b3b7a7464980d39654c5f6bceff15cf953eb43bb" alt=""
使用离散变量代替色阶
range(heightweight$weightLb)
ggplot(heightweight,aes(x = ageYear,y = heightIn,fill=weightLb))+
geom_point(shape=21,size=2.5)+
scale_fill_gradient(low = "black",high = "white",
breaks=seq(70,170,by=20),
guide=guide_legend())
data:image/s3,"s3://crabby-images/da851/da85105362dd778cccb35ea8e8b6acb167798e12" alt=""
将连续变量映射给大小,将分类变量映射给颜色
ggplot(heightweight,aes(x = ageYear,y = heightIn,size=weightLb,colour=sex))+
geom_point(alpha=.5)+
scale_size_area()+ #使数据点面积正比于变量值
scale_color_brewer(palette = "Set1")
data:image/s3,"s3://crabby-images/69ec0/69ec006eef936606145bda904849f2045c8ba292" alt=""
第五节 处理图形重叠
当绘制的点数目巨大时,我们怎么办?怎么解决重叠?
sp <- ggplot(diamonds,aes(x = carat,y = price))
sp+geom_point()
data:image/s3,"s3://crabby-images/902db/902db7c5586691a4e8c601588295d2053e422255" alt=""
解决办法1,使用半透明的点
sp+geom_point(alpha=.5)
sp+geom_point(alpha=.1)
data:image/s3,"s3://crabby-images/a89e5/a89e5d853c436b518c6723c835f836bbe3af2f22" alt=""
解决办法2,将数据分箱,并用矩形表示(适用于量化分析)
sp+stat_bin2d()
sp+stat_bin2d(bins = 50)+
scale_fill_gradient(low = "lightblue",high = "red",limits=c(0,6000))
data:image/s3,"s3://crabby-images/97ed5/97ed5195ce7d9a48cbc8992dbfffd9fe751fb067" alt=""
解决办法3,将数据分箱,并以六边形表示
library(hexbin)
sp+stat_bin_hex()+
scale_fill_gradient(low = "lightblue",high = "red",limits=c(0,8000))
sp+stat_bin_hex()+
scale_fill_gradient(low = "lightblue",high = "red",
breaks=c(0,250,500,1000,2000,4000,6000),
limits=c(0,8000))
data:image/s3,"s3://crabby-images/eeff0/eeff0441465905b27e84092b2390134a1134a14c" alt=""
解决办法4,离散性数据直接使用箱型图,或者添加随即扰动
sp1 <- ggplot(ChickWeight,aes(x = Time,y = weight))
sp1+geom_point()
data:image/s3,"s3://crabby-images/cdb1b/cdb1b2408924ef764d83952931d599a6a432c7f6" alt=""
sp1+geom_point(position = "jitter")
sp1+geom_point(position = position_jitter(width = .3,height = 0))
sp1+geom_boxplot(aes(group=Time))
data:image/s3,"s3://crabby-images/932c6/932c6029f81715883c970e9ee98336a9336218f9" alt=""
data:image/s3,"s3://crabby-images/13caa/13caab2fedfce00c2274841fa2eb37bf2a0f5cf9" alt=""
第六节 添加回归模拟线
*线性模拟线
library(gcookbook)
View(heightweight)
heightweight[,c("ageYear","heightIn")]
library(ggplot2)
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn))
sp+geom_point()+geom_smooth(method = lm)
data:image/s3,"s3://crabby-images/6bc10/6bc10b07d4a77e859968a6c7db50e959bf69b5a3" alt=""
sp+geom_point()+geom_smooth(method = lm,level = 0.99) #99%的置信区间 level
sp+geom_point()+geom_smooth(method = lm,se = F)#没有置信区间 se
sp+geom_point(colour="grey60")+geom_smooth(method = lm,se=F,colour="black")
data:image/s3,"s3://crabby-images/c499f/c499f5016386df79330fec948216fa39a2672e57" alt=""
sp+geom_point()+geom_smooth() #默认情况下是局部加权多项式,即loess曲线
data:image/s3,"s3://crabby-images/b1285/b1285d45784faeb92da3e9b6c768f804e569644c" alt=""
*logistic模型
library(MASS)
b <- biopsy
b$cla <- factor(ifelse(b$class=="benign",0,1),levels = c(0,1))
ggplot(b,aes(x = V1,y = cla))+
geom_point(position = position_jitter(width = .3,height=0.06),
alpha = 0.4,shape = 21,size = 1.5)+
stat_smooth(method = glm,family=binomial)
data:image/s3,"s3://crabby-images/fb0a8/fb0a820affb863a76fd0105167a298f2869e4dca" alt=""
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=sex))+
geom_point(alpha=.5)+ #使数据点面积正比于变量值
scale_color_brewer(palette = "Set1")
sp+stat_smooth(method = lm,se=F,fullrange = T)
data:image/s3,"s3://crabby-images/503bb/503bbb9df975de119d474b43b638919bb6890bbc" alt=""
第七节 根据已有的模型向散点图添加拟合线
library(gcookbook)
data(heightweight)
model <- lm(heightIn ~ ageYear,+I(ageYear^2),data=heightweight)
model
xmin <- min(heightweight$ageYear)
xmax <- max(heightweight$ageYear)
predicted <- data.frame(ageYear=seq(xmin,xmax,length.out=100))
predicted$heightIn <- predict(model,predicted)
head(predicted)
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(colour="grey40")
sp+geom_line(data=predicted,size=1)
data:image/s3,"s3://crabby-images/6cd6a/6cd6afe35c6efb9fce6839536e0c069588409c8d" alt=""
#定义predictvals()函数
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point(colour="grey40")
predictvals <- function(model,xvar,yvar,xrange=NULL,samples=100,...){
if(is.null(xrange)){
if(any(class(model)%in%c("lm","glm")))
xrange <- range(model$model[[xvar]])
else if(any(class(model)%in%"loess"))
xrange <- range(model$x)
}
newdata <- data.frame(x=seq(xrange[1],xrange[2],length.out=samples))
names(newdata) <- xvar
newdata[[yvar]] <- predict(model,newdata=newdata, ...)
newdata
}
modelinear <- lm(heightIn~ageYear,heightweight) #使用lm和loess建模
modelloess <- loess(heightIn~ageYear,heightweight)
lm_predict <- predictvals(modelinear,"ageYear","heightIn") #调用predictivals()函数进行预测
loess_predict <- predictvals(modelloess,"ageYear","heightIn")
sp+geom_line(data = lm_predict,colour="red",size=.8)+
geom_line(data = loess_predict,colour="blue",size=.8)
data:image/s3,"s3://crabby-images/734dd/734dddf0d31eb5188dc99fc1d134d2ab18e2ee18" alt=""
第八节 添加来自多个模型的模拟曲线
make_model <- function(data){
lm(heightIn~ageYear,data)
}
library(gcookbook)
library(plyr)
models <- dlply(heightweight,"sex",.fun=make_model)
models
predvals <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn")
predvals
ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=sex))+
geom_point()+
geom_line(data = predvals)
#> table(predvals$sex)
#f m
#100 100
#> range(predvals$ageYear[predvals$sex=="f"])
#[1] 11.58 17.50
#> range(predvals$ageYear[predvals$sex=="m"])
#[1] 11.58 17.17
data:image/s3,"s3://crabby-images/52c79/52c79eb319990c1312d90c78c96ca522014b6e6a" alt=""
predvals <- ldply(models,.fun = predictvals,xvar="ageYear",yvar="heightIn",
xrang=range(heightweight$ageYear))
ggplot(heightweight,aes(x = ageYear,y = heightIn,colour=sex))+
geom_point()+
geom_line(data = predvals)
data:image/s3,"s3://crabby-images/7ce43/7ce43af9c83337d8c55d1065272c89cb36807f97" alt=""
第九节 向散点图添加模型系数
library(gcookbook)
model <- lm(heightIn~ageYear,data = heightweight)
summary(model)
Call:
lm(formula = heightIn ~ ageYear, data = heightweight)
Residuals:
Min 1Q Median 3Q Max
-8.3517 -1.9006 0.1378 1.9071 8.3371
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 37.4356 1.8281 20.48 <2e-16 ***
ageYear 1.7483 0.1329 13.15 <2e-16 ***
Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 ‘ ’ 1
Residual standard error: 2.989 on 234 degrees of freedom
Multiple R-squared: 0.4249, Adjusted R-squared: 0.4225
F-statistic: 172.9 on 1 and 234 DF, p-value: < 2.2e-16
pred <- predictvals(model,"ageYear","heightIn")
sp <- ggplot(heightweight,aes(x = ageYear,y = heightIn))+
geom_point()+
geom_line(data = pred)
sp+annotate("text",label = "r^2=0.42",x=16.5,y = 52)
sp+annotate("text",label = "r^2==0.42",x=16.5,y = 52,parse=T)
data:image/s3,"s3://crabby-images/679e3/679e3f9d93690624947e423a2c9aa01e0804333f" alt=""
第十节 向散点图添加边际地毯
library(ggplot2)
ggplot(faithful,aes(x = eruptions,y = waiting))+
geom_point()+
geom_rug()
ggplot(faithful,aes(x = eruptions,y = waiting))+
geom_point()+
geom_rug(position = "jitter",size = .2) #添加扰动,增加线宽
data:image/s3,"s3://crabby-images/9df96/9df9614fab5c97a1fb530a651ee8db0818b351e3" alt=""
第十一节 像散点图添加标签
library(gcookbook)
subset(countries,Year==2009 & healthexp>2000)
sp <- ggplot(subset(countries,Year==2009 & healthexp>2000),
aes(x = healthexp,y = infmortality))+
geom_point()
sp+annotate("text",x = 4350,y = 5.4,label = "Canada")+
annotate("text",x = 7400,y = 6.8, label = "USA")
data:image/s3,"s3://crabby-images/31683/31683d6e77b87ad48200eac9a1941251838d36a0" alt=""
sp+geom_text(aes(label = Name),size=4)
sp+geom_text(aes(label = Name),size=4,vjust=0) #vjust文本的上下基线调整
sp+geom_text(aes(y=infmortality+.1, label = Name),size=4,vjust=0)
sp+geom_text(aes(label = Name),size=4,hjust=0) #hjust文本左右对齐
sp+geom_text(aes(x = healthexp+100,label = Name),size=4,hjust=0)
data:image/s3,"s3://crabby-images/f68be/f68be735d1a3c8426327950e1c13611574eb2436" alt=""
adat <- subset(countries,Year==2009 & healthexp>2000)
adat$Name1 <- adat$Name
idx <- adat$Name1%in%c("Canada","Denmark","Austria","Norway","Sweden","Portugal","Ireland")
adat$Name1[!idx] <- NA
ggplot(adat,aes(x = healthexp,y = infmortality))+ geom_point()+
geom_text(aes(x = healthexp+100,label = Name1),size=4,hjust=0) +
xlim(2000,10000)
data:image/s3,"s3://crabby-images/d3b6b/d3b6be6bd01c441ec911e273401bf97b599466ce" alt=""
第十二节 绘制气泡图
cdat <- subset(countries,Year==2009&
Name%in% c("Canada","Ireland","United States"))
p <- ggplot(cdat,aes(x = healthexp,y = infmortality,size=GDP))+
geom_point(shape=21,colour="black",fill="cornsilk")
p+scale_size_area(max_size = 15)
data:image/s3,"s3://crabby-images/32ede/32edead5e88bb4d8c2cf91928657c7cbf39d7ae7" alt=""
总结
基本绘图的语句不多,但熟能生巧,这些图都基本版图的堆积以及变形!