生信人的20个R语言习题-高级
2020-03-01 本文已影响0人
DrKu
- 安装一些R包:
数据包: ALL, CLL, pasilla, airway
软件包:limma,DESeq2,clusterProfiler
工具包:reshape2
绘图包:ggplot2
不同领域的R包使用频率不一样,在生物信息学领域,尤其需要掌握bioconductor系列包。
if(!require(ALL)) BiocManager::install("ALL")
if(!require(CLL)) BiocManager::install("CLL")
if(!require(pasilla)) BiocManager::install("pasilla")
if(!require(airway)) BiocManager::install("airway")
if(!require(limma))BiocManager::install("limma")
if(!require(DESeq2)) BiocManager::install("DESeq2")
if(!require(clusterProfiler)) BiocManager::install("clusterProfiler")
if(!require(reshape2))install.packages("reshape2")
if(!require(ggplot2))install.packages("ggplot2")
- 了解ExpressionSet对象,比如CLL包里面就有data(sCLLex) ,找到它包含的元素,提取其表达矩阵(使用exprs函数),查看其大小
A. 参考:http://www.bio-info-trainee.com/bioconductor_China/software/limma.html
B. 参考:https://github.com/bioconductor-china/basic/blob/master/ExpressionSet.md
suppressPackageStartupMessages(library(CLL))
data(sCLLex)
str(sCLLex)
exprSet <- exprs(sCLLex)
dim(exprSet)
# [1] 12625 22
- 了解 str,head,help函数,作用于 第二步提取到的表达矩阵
str(exprSet)
# num [1:12625, 1:22] 5.74 2.29 3.31 1.09 7.54 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
# ..$ : chr [1:22] "CLL11.CEL" "CLL12.CEL" "CLL13.CEL" "CLL14.CEL" ...
head(exprSet)
# CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL CLL16.CEL CLL17.CEL CLL18.CEL CLL19.CEL CLL20.CEL CLL21.CEL CLL22.CEL
# 1000_at 5.743132 6.219412 5.523328 5.340477 5.229904 4.920686 5.325348 4.826131 5.212387 5.285830 5.581859 6.251678
# 1001_at 2.285143 2.291229 2.287986 2.295313 2.662170 2.278040 2.350796 2.325163 2.432635 2.256547 2.348389 2.263849
# 1002_f_at 3.309294 3.318466 3.354423 3.327130 3.365113 3.568353 3.502440 3.394410 3.617099 3.279726 3.391734 3.306811
# 1003_s_at 1.085264 1.117288 1.084010 1.103217 1.074243 1.073097 1.091264 1.076470 1.132558 1.096870 1.138386 1.061184
# 1004_at 7.544884 7.671801 7.474025 7.152482 6.902932 7.368660 6.456285 6.824862 7.304803 8.757756 6.695295 7.372185
# 1005_at 5.083793 7.610593 7.631311 6.518594 5.059087 4.855161 5.176975 4.874563 4.097580 9.250585 7.657463 7.683677
# CLL23.CEL CLL24.CEL CLL2.CEL CLL3.CEL CLL4.CEL CLL5.CEL CLL6.CEL CLL7.CEL CLL8.CEL CLL9.CEL
# 1000_at 5.480752 5.216033 5.966942 5.397508 5.281720 5.414718 5.460626 5.897821 5.253883 5.214155
# 1001_at 2.264434 2.344079 2.350073 2.406846 2.341961 2.372928 2.356978 2.222276 2.254772 2.358544
# 1002_f_at 3.341444 3.798335 3.427736 3.453564 3.412944 3.411922 3.396466 3.247276 3.255148 3.365746
# 1003_s_at 1.046188 1.082169 1.501367 1.191339 1.139510 1.153548 1.135671 1.074631 1.166247 1.151184
# 1004_at 7.616749 6.839187 7.545673 8.802729 7.307752 7.491507 8.063202 7.014543 8.019108 7.432331
# 1005_at 8.700667 5.546996 9.611601 5.732182 6.483191 6.072558 9.919125 5.149411 4.989604 5.339848
help(str)
help(head)
- 安装并了解 hgu95av2.db 包,看看 ls("package:hgu95av2.db") 后 显示的那些变量
library(hgu95av2.db)
ls("package:hgu95av2.db")
#
# [1] "hgu95av2" "hgu95av2.db" "hgu95av2_dbconn" "hgu95av2_dbfile" "hgu95av2_dbInfo"
# [6] "hgu95av2_dbschema" "hgu95av2ACCNUM" "hgu95av2ALIAS2PROBE" "hgu95av2CHR" "hgu95av2CHRLENGTHS"
# [11] "hgu95av2CHRLOC" "hgu95av2CHRLOCEND" "hgu95av2ENSEMBL" "hgu95av2ENSEMBL2PROBE" "hgu95av2ENTREZID"
# [16] "hgu95av2ENZYME" "hgu95av2ENZYME2PROBE" "hgu95av2GENENAME" "hgu95av2GO" "hgu95av2GO2ALLPROBES"
# [21] "hgu95av2GO2PROBE" "hgu95av2MAP" "hgu95av2MAPCOUNTS" "hgu95av2OMIM" "hgu95av2ORGANISM"
# [26] "hgu95av2ORGPKG" "hgu95av2PATH" "hgu95av2PATH2PROBE" "hgu95av2PFAM" "hgu95av2PMID"
# [31] "hgu95av2PMID2PROBE" "hgu95av2PROSITE" "hgu95av2REFSEQ" "hgu95av2SYMBOL" "hgu95av2UNIGENE"
# [36] "hgu95av2UNIPROT"
- 理解 head(toTable(hgu95av2SYMBOL)) 的用法,找到 TP53 基因对应的探针ID
head(toTable(hgu95av2SYMBOL))
# probe_id symbol
# 1 1000_at MAPK3
# 2 1001_at TIE1
# 3 1002_f_at CYP2C19
# 4 1003_s_at CXCR5
# 5 1004_at CXCR5
# 6 1005_at DUSP1
ids <- toTable(hgu95av2SYMBOL)
ids[ids$symbol=="TP53",]
# probe_id symbol
# 966 1939_at TP53
# 997 1974_s_at TP53
# 1420 31618_at TP53
- 理解探针与基因的对应关系,总共多少个基因,基因最多对应多少个探针,是哪些基因,是不是因为这些基因很长,所以在其上面设计多个探针呢?
length(unique(ids$symbol))
# [1] 8585
tail(sort(table(ids$symbol)))
# YME1L1 GAPDH INPP4A MYB PTGER3 STAT1
# 7 8 8 8 8 8
- 第二步提取到的表达矩阵是12625个探针在22个样本的表达量矩阵,找到那些不在 hgu95av2.db 包收录的对应着SYMBOL的探针。
A. 提示:有1165个探针是没有对应基因名字的。
ids_out = rownames(exprSet[(!rownames(exprSet) %in% ids$probe_id),])
ids_out
# [1] "1007_s_at" "1047_s_at" "1089_i_at" "108_g_at" "1090_f_at" "1099_s_at" "1104_s_at" "1106_s_at"
# [9] "1116_at" "1122_f_at"
- 过滤表达矩阵,删除那1165个没有对应基因名字的探针。
save(exprSet,ids,file = "scllex_expr_ids.Rdata")
load(file = "scllex_expr_ids.Rdata")
ids_out = rownames(exprSet[(!rownames(exprSet) %in% ids$probe_id),])
dim(exprSet)
# [1] 12625 22 过滤前
exprSet=exprSet[!(rownames(exprSet) %in% ids_out),]
dim(exprSet)
# [1] 11460 22 过滤后
- 整合表达矩阵,多个探针对应一个基因的情况下,只保留在所有样本里面平均表达量最大的那个探针。
A.提示,理解 tapply,by,aggregate,split 函数 , 首先对每个基因找到最大表达量的探针。
B. 然后根据得到探针去过滤原始表达矩阵
exprSet[1:5,1:5]
# CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
# 1000_at 5.743132 6.219412 5.523328 5.340477 5.229904
# 1001_at 2.285143 2.291229 2.287986 2.295313 2.662170
# 1002_f_at 3.309294 3.318466 3.354423 3.327130 3.365113
# 1003_s_at 1.085264 1.117288 1.084010 1.103217 1.074243
# 1004_at 7.544884 7.671801 7.474025 7.152482 6.902932
ids[1:5,1:2]
# probe_id symbol
# 1 1000_at MAPK3
# 2 1001_at TIE1
# 3 1002_f_at CYP2C19
# 4 1003_s_at CXCR5
# 5 1004_at CXCR5
p <- identical(ids$probe_id,rownames(exprSet))
if(!p) exprSet=exprSet[,match(ids$probe_id,rownames(exprSet))]
ids$mean <- apply(exprSet,1,mean)
ids= ids[order(ids$symbol,ids$mean,decreasing = T),]
dim(ids)
# [1] 11460 3
ids=ids[!duplicated(ids$symbol),]
dim(ids)
# [1] 8585 3
exprSet=exprSet[ids$probe_id,]
dim(exprSet)
# [1] 8585 22
- 把过滤后的表达矩阵更改行名为基因的symbol,因为这个时候探针和基因是一对一关系了。
rownames(exprSet)=ids$symbol
exprSet[1:5,1:5]
# CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
# ZZZ3 6.645791 7.350613 6.333290 6.603640 6.711462
# ZZEF1 5.289264 6.677600 4.447104 7.008260 6.046429
# ZYX 3.949769 5.423343 3.540189 5.234420 3.603839
# ZWINT 4.316881 2.705329 3.131087 2.821306 2.963397
# ZW10 4.382004 4.355469 4.336743 4.304551 4.482850
- 对第10步得到的表达矩阵进行探索,先画第一个样本的所有基因的表达量的boxplot,hist,density , 然后画所有样本的 这些图
A. 参考:http://bio-info-trainee.com/tmp/basic_visualization_for_expression_matrix.html
B. 理解ggplot2的绘图语法,数据和图形元素的映射关系
par(mfrow = c(3,1))
boxplot(exprSet[,1])
hist(exprSet[,1])
plot(density(exprSet[,1]))
for (i in 1:ncol(exprSet)) {
par(mfrow = c(1,3))
boxplot(exprSet[,i],ylab=colnames(exprSet)[i])
hist(exprSet[,i],xlab =colnames(exprSet)[i],main =colnames(exprSet)[i])
plot(density(exprSet[,i]),main =colnames(exprSet)[i])
}
dat=data.frame(exprSet)
p = list()
library(ggplot2)
for (i in 1:ncol(exprSet)) {
p[[i]] = ggplot(data = dat,mapping = aes_string(y=colnames(dat)[i]))+
geom_boxplot()+
theme_bw()
}
library(patchwork)
wrap_plots(p,nrow = 3,guides = "collect")
- 理解统计学指标mean,median,max,min,sd,var,mad并计算出每个基因在所有样本的这些统计学指标,最后按照mad值排序,取top 50 mad值的基因,得到列表。
A. 注意:这个题目出的并不合规,请仔细看。
dat=data.frame(exprSet)
dat$mean=apply(exprSet, 1, mean)
dat$median=apply(exprSet, 1, median)
dat$max=apply(exprSet, 1, max)
dat$min=apply(exprSet, 1, min)
dat$sd=apply(exprSet, 1, sd)
dat$var=apply(exprSet, 1, var)
dat$mad=apply(exprSet, 1, mad)
dat=dat[order(dat$mad,decreasing = T),]
top50_mad =rownames(dat)[1:50]
top50_mad
# [1] "FAM30A" "IGF2BP3" "DMD" "TCF7" "SLAMF1" "FOS" "LGALS1" "IGLC1" "ZAP70" "FCN1" "LHFPL2" "HBB"
# [13] "S100A8" "GUSBP11" "COBLL1" "VIPR1" "PCDH9" "IGH" "ZNF804A" "TRIB2" "OAS1" "CCL3" "GNLY" "CYBB"
# [25] "VAMP5" "RNASE6" "RGS2" "PLXNC1" "CAPG" "RBM38" "VCAN" "APBB2" "ARF6" "TGFBI" "NR4A2" "S100A9"
# [37] "ZNF266" "TSPYL2" "CLEC2B" "FLNA" "H1FX" "DUSP5" "DUSP6" "ANXA4" "LPL" "THEMIS2" "P2RY14" "ARHGAP44"
# [49] "TNFSF9" "PFN2"
- 根据第12步骤得到top 50 mad值的基因列表来取表达矩阵的子集,并且热图可视化子表达矩阵。试试看其它5种热图的包的不同效果。
dat13 <- exprSet[top50_mad,]
pheatmap::pheatmap(dat13,scale = "row")
- 取不同统计学指标mean,median,max,min,sd,var,mad的各top50基因列表,使用UpSetR包来看他们之间的overlap情况。
dat=dat[order(dat$mean,decreasing = T),]
top50_mean =rownames(dat)[1:50]
dat=dat[order(dat$median,decreasing = T),]
top50_median =rownames(dat)[1:50]
dat=dat[order(dat$max,decreasing = T),]
top50_max =rownames(dat)[1:50]
dat=dat[order(dat$min,decreasing = T),]
top50_min =rownames(dat)[1:50]
dat=dat[order(dat$sd,decreasing = T),]
top50_sd =rownames(dat)[1:50]
dat=dat[order(dat$var,decreasing = T),]
top50_var =rownames(dat)[1:50]
input <- c(
"top50_mean" = 50,
"top50_median"=50,
"top50_max"= 50,
"top50_min"= 50,
"top50_sd"= 50,
"top50_var"= 50,
"top50_mad"= 50,
"top50_mean&top50_median" = length(intersect(top50_mean,top50_median)),
"top50_mean&top50_min" = length(intersect(top50_mean,top50_min)),
"top50_mean&top50_max" = length(intersect(top50_mean,top50_max)),
"top50_mean&top50_sd" = length(intersect(top50_mean,top50_sd)),
"top50_mean&top50_var" = length(intersect(top50_mean,top50_var)),
"top50_mean&top50_mad" = length(intersect(top50_mean,top50_mad)),
"top50_median&top50_max" = length(intersect(top50_mean,top50_max)),
"top50_median&top50_min" = length(intersect(top50_mean,top50_min)),
"top50_median&top50_sd" = length(intersect(top50_mean,top50_sd))
)
###列表中两两关系只做了一部分
data <- UpSetR::fromExpression(input)
upset(data,nsets = 7)
- 在第二步的基础上面提取CLL包里面的data(sCLLex) 数据对象的样本的表型数据。
suppressPackageStartupMessages(library(CLL))
data(sCLLex)
pdata <- pData(sCLLex)
identical(rownames(pdata),colnames(exprSet))
- 对所有样本的表达矩阵进行聚类并且绘图,然后添加样本的临床表型数据信息(更改样本名)
Cor <- data.frame(cor(exprSet))
pheatmap::pheatmap(Cor)
group_list <- pdata$Disease
annotation_col = data.frame(
group = group_list
)
rownames(annotation_col) =colnames(exprSet)
pheatmap::pheatmap(Cor,annotation_col = annotation_col)
dat2 <- exprSet
save(dat2,group_list,annotation_col,file = "16.Rdata")
- 对所有样本的表达矩阵进行PCA分析并且绘图,同样要添加表型信息。
load(file = "16.Rdata")
dat3=as.data.frame(t(dat2))
library(FactoMineR)#画主成分分析图需要加载这两个包
library(factoextra)
# pca的统一操作走起
dat.pca <- PCA(dat3, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
#palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
print(pca_plot)
- 根据表达矩阵及样本分组信息进行批量T检验,得到检验结果表格
t.test(dat2[1,]~group_list)
# t = -0.18376, df = 10.212, p-value = 0.8578
pvals=apply(dat2, 1, function(x){
t.test(as.numeric(x)~group_list)$p.value
})
p.adj=p.adjust(pvals,method = "BH")
p.adj=data.frame(p.adj)
head(p.adj)
# p.adj
# ZZZ3 0.9854230
# ZZEF1 0.6284552
# ZYX 0.9756691
# ZWINT 0.9756691
# ZW10 0.9848623
# ZSWIM8 0.9756691
- 使用limma包对表达矩阵及样本分组信息进行差异分析,得到差异分析表格,重点看logFC和P值,画个火山图(就是logFC和-log10(P值)的散点图。)。
library(limma)
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
head(design)
exprSet=dat2
rownames(design)=colnames(exprSet)
design
contrast.matrix<-makeContrasts("progres.-stable",
levels = design)
contrast.matrix ##这个矩阵声明,我们要把 Tumor 组跟 Normal 进行差异分析比较
deg = function(exprSet,design,contrast.matrix){
##step1
fit <- lmFit(exprSet,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix)
##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
return(nrDEG)
}
deg = deg(exprSet,design,contrast.matrix)
head(deg)
# logFC AveExpr t P.Value adj.P.Val B
# TBC1D2B -1.0285 5.621 -5.837 8.241e-06 0.02237 3.352
# CLIC1 0.9888 9.954 5.773 9.560e-06 0.02237 3.231
# DLEU1 1.8302 6.951 5.741 1.029e-05 0.02237 3.171
# SH3BP2 -1.3836 4.463 -5.735 1.042e-05 0.02237 3.160
# GPM6A 2.5472 6.915 5.043 5.269e-05 0.08731 1.822
# YTHDC2 -0.5187 7.602 -4.874 7.881e-05 0.08731 1.485
plot(deg$logFC,-log10(deg$P.Value))
#20. 对T检验结果的P值和limma包差异分析的P值画散点图,看看哪些基因相差很大。
plot(pvals)
pvals=apply(dat2, 1, function(x){
t.test(as.numeric(x)~group_list)$p.value
})
pvals=data.frame(pvals)
p <- identical(rownames(deg),rownames(pvals))
if(!p) deg=deg[match(rownames(pvals),rownames(deg)),]
plot(deg$P.Value,pvals$pvals)
- 对T检验结果的P值和limma包差异分析的P值画散点图,看看哪些基因相差很大。
plot(pvals)
pvals=apply(dat2, 1, function(x){
t.test(as.numeric(x)~group_list)$p.value
})
pvals=data.frame(pvals)
p <- identical(rownames(deg),rownames(pvals))
if(!p) deg=deg[match(rownames(pvals),rownames(deg)),]
plot(deg$P.Value,pvals$pvals)