2022-07-20 生信技能树R语言小作业(高级)
1.了解ExpressionSet对象,比如CLL包里面就有data(sCLLex) ,找到它包含的元素,提取其表达矩阵(使用exprs函数),查看其大小
参考:http://www.bio-info-trainee.com/bioconductor_China/software/limma.html
suppressPackageStartupMessages(library(CLL))
data(sCLLex)
exprSet=exprs(sCLLex) ##sCLLex是依赖于CLL这个package的一个对象
samples=sampleNames(sCLLex)
pdata=pData(sCLLex)
group_list=as.character(pdata[,2])
dim(exprSet)
[1] 12625 22
2.了解 str,head,help函数,作用于 第二步提取到的表达矩阵
str(exprSet) #查看exprSet的结构·
num [1:12625, 1:22] 5.74 2.29 3.31 1.09 7.54 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:12625] "1000_at" "1001_at" "1002_f_at" "1003_s_at" ...
..$ : chr [1:22] "CLL11.CEL" "CLL12.CEL" "CLL13.CEL" "CLL14.CEL" ...
head(exprSet) #默认查看数据前5行
help()#可获得相应函数的帮助信息
3.安装并了解 hgu95av2.db 包,看看 ls("package:hgu95av2.db") 后 显示的那些变量
library(hgu95av2.db)
ls("package:hgu95av2.db")
[1] "hgu95av2" "hgu95av2.db"
[3] "hgu95av2_dbconn" "hgu95av2_dbfile"
[5] "hgu95av2_dbInfo" "hgu95av2_dbschema"
[7] "hgu95av2ACCNUM" "hgu95av2ALIAS2PROBE"
[9] "hgu95av2CHR" "hgu95av2CHRLENGTHS"
[11] "hgu95av2CHRLOC" "hgu95av2CHRLOCEND"
[13] "hgu95av2ENSEMBL" "hgu95av2ENSEMBL2PROBE"
[15] "hgu95av2ENTREZID" "hgu95av2ENZYME"
[17] "hgu95av2ENZYME2PROBE" "hgu95av2GENENAME"
[19] "hgu95av2GO" "hgu95av2GO2ALLPROBES"
[21] "hgu95av2GO2PROBE" "hgu95av2MAP"
[23] "hgu95av2MAPCOUNTS" "hgu95av2OMIM"
[25] "hgu95av2ORGANISM" "hgu95av2ORGPKG"
[27] "hgu95av2PATH" "hgu95av2PATH2PROBE"
[29] "hgu95av2PFAM" "hgu95av2PMID"
[31] "hgu95av2PMID2PROBE" "hgu95av2PROSITE"
[33] "hgu95av2REFSEQ" "hgu95av2SYMBOL"
[35] "hgu95av2UNIPROT"
4.理解 head(toTable(hgu95av2SYMBOL)) 的用法,找到 TP53 基因对应的探针ID
head(toTable(hgu95av2SYMBOL))
probe_id symbol
1 1000_at MAPK3
2 1001_at TIE1
3 1002_f_at CYP2C19
4 1003_s_at CXCR5
5 1004_at CXCR5
6 1005_at DUSP1
ids %>%
+ filter(symbol=="TP53")
probe_id symbol
1 1939_at TP53
2 1974_s_at TP53
3 31618_at TP53
5.理解探针与基因的对应关系,总共多少个基因,基因最多对应多少个探针,是哪些基因,是不是因为这些基因很长,所以在其上面设计多个探针呢?
length(unique(ids$symbol))
#[1] 8582
ids%>%
count(.$symbol) %>%
slice_max(n)
.$symbol n
1 GAPDH 8
2 INPP4A 8
3 MYB 8
4 PTGER3 8
5 STAT1 8
6.第二步提取到的表达矩阵是12625个探针在22个样本的表达量矩阵,找到那些不在 hgu95av2.db 包收录的对应着SYMBOL的探针。
提示:有1168个探针是没有对应基因名字的。
dim(exprSet)
[1] 12625 22
exprSet=as.data.frame(exprSet)
no_probe=exprSet[!rownames(exprSet)%in%ids$probe_id,]
no_probe1=rownames(no_probe)
dim(no_probe)
[1] 1168 22
#另外一种方法
table(rownames(exprSet) %in% ids$probe_id)
FALSE TRUE
1168 11457
7.过滤表达矩阵,删除那1165个没有对应基因名字的探针。
#未过滤前
dim(exprSet)
#[1] 12625 22
#过滤后
exprSet=exprSet[rownames(exprSet)%in%ids$probe_id,]
dim(exprSet)
#[1] 11457 22
8.整合表达矩阵,多个探针对应一个基因的情况下,只保留在所有样本里面平均表达量最大的那个探针。
exprSet %>%
mutate(rowmean=rowMeans(.)) %>%
arrange(-rowmean) %>%
distinct(rowmean,.keep_all = TRUE) %>%
slice_max(rowmean)
#32748_at
9.把过滤后的表达矩阵更改行名为基因的symbol,因为这个时候探针和基因是一对一关系了
exprSet1=exprSet %>%
rownames_to_column("probe_id") %>%
inner_join(ids,by="probe_id") %>%
select(-probe_id) %>%
mutate(rowmean=rowMeans(.[,-23])) %>%
arrange(-rowmean) %>%
distinct(symbol,.keep_all = TRUE) %>%
column_to_rownames("symbol") %>%
select(-rowmean)
dim(exprSet1)
[1] 8582 22
10.对第9步得到的表达矩阵进行探索,先画第一个样本的所有基因的表达量的boxplot,hist,density ,然后画所有样本的 这些图
exprSet1[1:5,1:5]
CLL11.CEL CLL12.CEL CLL13.CEL CLL14.CEL CLL15.CEL
RPS27 14.04283 14.06055 13.97243 14.06541 14.22010
RPL9 13.59772 13.06800 13.66365 13.61514 13.75394
RPL37A 13.70027 12.52326 13.74436 13.67659 13.94459
RPS23 13.40957 13.08543 13.48360 13.44412 13.44006
B2M 13.34431 13.68734 13.32253 13.41613 13.49689
exp=exprSet1 #重命名一下
exp=exp %>%
mutate(genename=rownames(exp))
#分组 宽变长
exp=exp %>%
mutate(genename=rownames(exp))
exprSet_L=exp %>%
pivot_longer(-23,
names_to ="sample",
values_to="values"
)
dim(exprSet_L)
[1] 188804 3
exprSet_L1=exprSet_L %>%
mutate(group=rep(pdata$Disease,times=8582))
head(exprSet_L1)
# A tibble: 6 x 4
genename sample values group
<chr> <chr> <dbl> <fct>
1 RPS27 CLL11.CEL 14.0 progres.
2 RPS27 CLL12.CEL 14.1 stable
3 RPS27 CLL13.CEL 14.0 progres.
4 RPS27 CLL14.CEL 14.1 progres.
5 RPS27 CLL15.CEL 14.2 progres.
6 RPS27 CLL16.CEL 14.4 progres.
#改一下sample的后缀,方便画图
exprSet_L1=exprSet_L1 %>%
mutate(sample=str_replace(.$sample,".CEL",""))
head(exprSet_L1)
# A tibble: 6 x 4
genename sample values group
<chr> <chr> <dbl> <fct>
1 RPS27 CLL11 14.0 progres.
2 RPS27 CLL12 14.1 stable
3 RPS27 CLL13 14.0 progres.
4 RPS27 CLL14 14.1 progres.
5 RPS27 CLL15 14.2 progres.
6 RPS27 CLL16 14.4 progres.
### ggplot2画图
library(ggplot2)
exprSet_L1 %>%
ggplot(mapping = aes(x=sample,y=values,fill=group))+
geom_boxplot()
exprSet_L1 %>%
ggplot(mapping = aes(values,fill=group))+
geom_histogram(bins = 200)
exprSet_L1 %>%
ggplot(mapping = aes(values,col=group))+
geom_density()



11.理解统计学指标mean,median,max,min,sd,var,mad并计算出每个基因在所有样本的这些统计学指标,最后按照mad值排序,取top 50 mad值的基因,得到列表。
exprSet1 %>%
mutate(mean=rowMeans(.))
exprSet1 %>%
mutate(median=apply(exprSet1,1,median))
exprSet1 %>%
mutate(max=apply(exprSet1,1,max))
exprSet1 %>%
mutate(min=apply(exprSet1,1,min))
exprSet1 %>%
mutate(sd=apply(exprSet1,1,sd))
exprSet1 %>%
mutate(var=apply(exprSet1,1,var))
exprSet1=exprSet1 %>%
mutate(mad=apply(exprSet1,1,mad)) %>%
head(n=50)
rownames(exprSet1)
[1] "FAM30A" "IGF2BP3" "DMD" "TCF7" "SLAMF1"
[6] "FOS" "LGALS1" "IGLC1" "ZAP70" "FCN1"
[11] "LHFPL2" "HBB" "S100A8" "GUSBP11" "COBLL1"
[16] "VIPR1" "PCDH9" "IGH" "ZNF804A" "TRIB2"
[21] "OAS1" "CCL3" "GNLY" "CYBB" "VAMP5"
[26] "RNASE6" "RGS2" "PLXNC1" "CAPG" "RBM38"
[31] "VCAN" "APBB2" "ARF6" "TGFBI" "NR4A2"
[36] "S100A9" "ZNF266" "TSPYL2" "CLEC2B" "FLNA"
[41] "H1-10" "DUSP5" "DUSP6" "ANXA4" "LPL"
[46] "THEMIS2" "P2RY14" "ARHGAP44" "TNFSF9" "PFN2"
12.第11步骤得到top 50 mad值的基因列表来取表达矩阵的子集,并且热图可视化子表达矩阵。
library(pheatmap)
choose_gene=rownames(test)
choose_matrix=exprSet1[choose_gene,]
choose_matrix=t(scale(t(choose_matrix)))
pheatmap(choose_matrix)

13.取不同统计学指标mean,median,max,mean,sd,var,mad的各top50基因列表,使用UpSetR包来看他们之间的overlap情况。
g_mean = tail(sort(apply(exprSet1,1,mean)),50)
g_median = tail(sort(apply(exprSet1,1,median)),50)
g_max = tail(sort(apply(exprSet1,1,max)),50)
g_min = tail(sort(apply(exprSet1,1,min)),50)
g_sd = tail(sort(apply(exprSet1,1,sd)),50)
g_var = tail(sort(apply(exprSet1,1,var)),50)
g_mad = tail(sort(apply(exprSet1,1,mad)),50)
## UpSetR
# 看下说明书 https://cran.r-project.org/web/packages/UpSetR/README.html
library(UpSetR)
g_all <- unique(c(names(g_mean),names(g_median),names(g_max),names(g_min),
names(g_sd),names(g_var),names(g_mad) ))
dat=data.frame(g_all=g_all,
g_mean=ifelse(g_all %in% names(g_mean) ,1,0),
g_median=ifelse(g_all %in% names(g_median) ,1,0),
g_max=ifelse(g_all %in% names(g_max) ,1,0),
g_min=ifelse(g_all %in% names(g_min) ,1,0),
g_sd=ifelse(g_all %in% names(g_sd) ,1,0),
g_var=ifelse(g_all %in% names(g_var) ,1,0),
g_mad=ifelse(g_all %in% names(g_mad) ,1,0)
)
upset(dat,nsets = 7)
14.在第二步的基础上面提取CLL包里面的data(sCLLex) 数据对象的样本的表型数据。
pdata=pData(sCLLex)
group_list=as.character(pdata[,2])
group_list
[1] "progres." "stable" "progres." "progres." "progres."
[6] "progres." "stable" "stable" "progres." "stable"
[11] "progres." "stable" "progres." "stable" "stable"
[16] "progres." "progres." "progres." "progres." "progres."
[21] "progres." "stable"
15.对所有样本的表达矩阵进行聚类并且绘图,然后添加样本的临床表型数据信息(更改样本名)
head(exprSet1)
group_list <- as.character(pData(sCLLex)[,2])
colnames(exprSet1) <- paste(group_list, 1:22, sep = '')
head(exprSet1)
t.exp <- t(exprSet1)
t.exp[1:5,1:5]
hc <- hclust(dist(t.exp))
plot(as.dendrogram(hc))

exp_cluster <- t(exprSet1)
exp_clust_dist <- dist(exp_cluster, method = 'euclidean')
hc <- hclust(exp_clust_dist,'ward.D')
library(factoextra)
fviz_dend(hc, k=4, cex = 0.5,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800", "#FC4E07"),
color_labels_by_k = TRUE, rect = TRUE)

16.对所有样本的表达矩阵进行PCA分析并且绘图,同样要添加表型信息。
res.pca <- prcomp(t(exprSet1), scale = TRUE)
library(factoextra)
fviz_pca_ind(res.pca,col.ind = group_list)

library(devtools)
install_github('sinhrks/ggfortify')
library(ggfortify)
df <- as.data.frame(t(exprSet1))
df$group <- group_list
autoplot(prcomp(df[,1:(ncol(df)-1)]), data=df,
label = TRUE, colour = 'group')

不知道两幅图区别为什么这么大
install.packages(c('FactoMineR','factoextra'))
library(FactoMineR)#画主成分分析图需要加载这两个包
library(factoextra)
exp1 <- t(exprSet1) #画PCA图时要求是行名是样本名,列名时探针名,因此此时需要转换
exp1 <- as.data.frame(exp1)#将matrix转换为data.frame
exp1[1:4,1:4]
exp1 <- cbind(exp1,group_list) #cbind横向追加,即将分组信息追加到最后一列
exp1.pca <- PCA(exp1[,-8583],graph=FALSE)
exp1.pca <- PCA(exp1[,-col(exp1],graph=FALSE)#和上面方式相同
fviz_pca_ind(exp1.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = exp1$group_list, # color by groups
palette = c("#00AFBB", "#E7B800"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)

17.根据表达矩阵及样本分组信息进行批量T检验,得到检验结果表格
dat = exprSet1
group_list=as.factor(group_list)
group1 = which(group_list == levels(group_list)[1])
group2 = which(group_list == levels(group_list)[2])
dat1 = dat[, group1]
dat2 = dat[, group2]
dat = cbind(dat1, dat2)
pvals = apply(exprSet1, 1, function(x){
t.test(as.numeric(x)~group_list)$p.value
})
p.adj = p.adjust(pvals, method = "BH")
avg_1 = rowMeans(dat1)
avg_2 = rowMeans(dat2)
log2FC = avg_2-avg_1
DEG_t.test = cbind(avg_1, avg_2, log2FC, pvals, p.adj)
DEG_t.test=DEG_t.test[order(DEG_t.test[,4]),]
DEG_t.test=as.data.frame(DEG_t.test)
# 查看t检验结果表格,包含log2FC、pvals和p.adj等,通常认为t<0.05即有统计学意义
head(DEG_t.test)
avg_1 avg_2 log2FC pvals p.adj
SGSM2 7.875615 8.791753 0.9161377 1.629755e-05 0.1398656
PDE8A 6.622749 7.965007 1.3422581 4.058944e-05 0.1656022
DLEU1 7.616197 5.786041 -1.8301554 6.965416e-05 0.1656022
LDOC1 4.456446 2.152471 -2.3039752 8.993339e-05 0.1656022
USP6NL 5.988866 7.058738 1.0698718 9.648226e-05 0.1656022
COMMD4 4.157971 3.407405 -0.7505660 2.454557e-04 0.2586085
#这个还不是很清楚 下去再学习一下
18.使用limma包对表达矩阵及样本分组信息进行差异分析,得到差异分析表格,重点看logFC和P值,画个火山图(就是logFC和-log10(P值)的散点图)。
suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet1)
design
contrast.matrix<-makeContrasts(paste0(unique(group_list),collapse = "-"),levels = design)
contrast.matrix
##step1
fit <- lmFit(exprSet1,design)
##step2
fit2 <- contrasts.fit(fit, contrast.matrix) ##这一步很重要,大家可以自行看看效果
fit2 <- eBayes(fit2) ## default no trend !!!
##eBayes() with trend=TRUE
##step3
tempOutput = topTable(fit2, coef=1, n=Inf)
nrDEG = na.omit(tempOutput)
#write.csv(nrDEG2,"limma_notrend.results.csv",quote = F)
head(nrDEG)
logFC AveExpr t P.Value adj.P.Val
TBC1D2B -1.0284628 5.620700 -5.837402 8.240608e-06 0.02235789
CLIC1 0.9888221 9.954273 5.772846 9.559632e-06 0.02235789
DLEU1 1.8301554 6.950685 5.740901 1.029016e-05 0.02235789
SH3BP2 -1.3835699 4.463438 -5.735431 1.042083e-05 0.02235789
GPM6A 2.5471980 6.915045 5.043199 5.268465e-05 0.08728084
YTHDC2 -0.5187135 7.602354 -4.873697 7.881534e-05 0.08728084
B
TBC1D2B 3.352264
CLIC1 3.231210
DLEU1 3.171073
SH3BP2 3.160761
GPM6A 1.821981
YTHDC2 1.485228
#火山图
library(ggplot2)
library(ggrepel)
library(dplyr)
data <- nrDEG
### 如果没有gene这一列就需要添加
data$gene <- rownames(data)
## 仔细观察data数据
## 如果是你自己的数据,至少有三列
## logFC,P.Value,gene
ggplot(data=data, aes(x=logFC, y =-log10(P.Value))) +
## 三个部分分别画点
geom_point(data=subset(data,abs(data$logFC) <= 1),aes(size=abs(logFC)),color="black",alpha=0.1) +
geom_point(data=subset(data,data$P.Value<0.05 & data$logFC > 1),aes(size=abs(logFC)),color="red",alpha=0.2) +
geom_point(data=subset(data,data$P.Value<0.05 & data$logFC < -1),aes(size=abs(logFC)),color="green",alpha=0.2) +
## 画线
geom_hline(yintercept = -log10(0.05),lty=4,lwd=0.6,alpha=0.8)+
geom_vline(xintercept = c(1,-1),lty=4,lwd=0.6,alpha=0.8)+
## 主题
theme_bw()+
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"))+
labs(x="log2 (fold change)",y="-log10 (q-value)")+
theme(plot.title = element_text(hjust = 0.5))+
theme(legend.position='none')+
## 标签
geom_text_repel(data=subset(data, abs(logFC) > 3), aes(label=gene),col="black",alpha = 0.8)

19.对T检验结果的P值和limma包差异分析的P值画散点图,看看哪些基因相差很大。
### different P values
head(nrDEG)
head(DEG_t.test)
# 将limma生成的nrDEG与t检验合并
DEG_t.test=DEG_t.test[rownames(nrDEG),]
## 可以看到logFC是相反的
plot(DEG_t.test[,3],nrDEG[,1])
# 可以看到使用limma包和t.test本身的p值差异尚可接受
plot(DEG_t.test[,4],nrDEG[,4])
plot(-log10(DEG_t.test[,4]),-log10(nrDEG[,4]))



参考:盘一盘 生信技能树R语言小作业(高级) - 简书 (jianshu.com)
感觉还有很多细节需要自己摸索一下 待完善