2020-05-08 复现GSE137675 RNA-Seq结果
2020-05-08 本文已影响0人
猫叽先森
- 将上游分析最后得到的counts.txt读入到R中
rm(list = ls()) # 魔幻操作,一键清空~
options(stringsAsFactors = F) # 关闭字符串转因子选项
# 读入featureCounts的表达矩阵
data <- read.table("counts.txt",sep = "\t",header = T,row.names = 1)
data[1:3,]
# 查看低/零表达基因情况
dim(data)
## [1] 60662 12
table(rowSums(data==0)==ncol(data))
## FALSE TRUE
## 40892 19770
# 去除低表达和不表达的基因
exprSet <- data[apply(data,1,sum)!=0,]
dim(exprSet)
## [1] 40892 12
#对exprSet行名进行重命名(去除点号和后面的数字,去除重复的基因名)
tmp0 <- exprSet
tmp0$mean <- apply(tmp0,1,mean)
tmp0 <- tmp0[order(tmp0$mean,decreasing = T),]
str <- stringr::str_split(rownames(tmp0),"\\.",simplify=T)
tmp0$id <- str[,1]
tmp0 <- tmp0[!duplicated(tmp0$id),]
rownames(tmp0) <- tmp0$id
exprSet <- tmp0[,-(ncol(tmp0)-1)]
# 保存预处理后的表达矩阵
save(exprSet,file = "data/Step01-featureCounts2exprSet.Rdata")
- 读入作者上传的表达矩阵
GSE137675_Reads_count_featureCounts.xlsx
,
并跟自己处理的结果做样本相关性分析。
代码参考:https://mp.weixin.qq.com/s/b7UHpcicVu6yjj4aM_BGvw
rm(list = ls()) # 魔幻操作,一键清空~
options(stringsAsFactors = F) # 关闭字符串转因子选项
# 读入作者上传的表达矩阵
tmp <- read.csv("GSE137675_Reads_count_featureCounts.csv",row.names = 1)
dim(tmp)
## [1] 40892 12
tmp2 <- tmp[apply(tmp,1,sum)!=0,]
## [1] 35904 12
tmp2$id <- rownames(tmp2)
# 合并作者上传的表达矩阵和自己走上游分析流程得到的表达矩阵
m <- merge(exprSet,tmp2,by='id')
rownames(m) <- m$id
m <- m[,-1]
dim(m)
## [1] 31523 24
pheatmap::pheatmap(cor(m))
pheatmap::pheatmap(cor(log2(m+1)))
cpmM <- log(edgeR::cpm(m+1))
pheatmap::pheatmap(cor(cpmM))
hg1 <- names(tail(sort(apply(cpmM[,1:12],1,mad)),500))
hg2 <- names(tail(sort(apply(cpmM[,13:24],1,mad)),500))
hg <- intersect(hg1,hg2)
hgM <- cpmM[hg,]
pheatmap::pheatmap(cor(hgM))
corrplot.png
对照表
CM2005.1.RNA-seq.rep1 | SRR10139778 |
CM2005.1.RNA-seq.rep2 | SRR10139779 |
CRMM1.RNA-seq.rep1 | SRR10139780 |
CRMM1.RNA-seq.rep2 | SRR10139781 |
OCM1.RNA-seq.rep1 | SRR10139782 |
OCM1.RNA-seq.rep2 | SRR10139783 |
OCM1a.RNA-seq.rep1 | SRR10139784 |
OCM1a.RNA-seq.rep2 | SRR10139785 |
OM431.RNA-seq.rep1 | SRR10139786 |
OM431.RNA-seq.rep2 | SRR10139787 |
PIG1.RNA-seq.rep1 | SRR10139788 |
PIG1.RNA-seq.rep2 | SRR10139789 |
- 使用R包DESeq2分析基因差异表达
rm(list = ls()) # 魔幻操作,一键清空~
options(stringsAsFactors = F) # 关闭字符串转因子选项
# 读取基因表达矩阵信息
load(file = "data/Step01-featureCounts2exprSet.Rdata")
# 查看分组信息和表达矩阵数据
dim(exprSet)
## [1] 40884 13
colnames(exprSet)
## [1] "SRR10139778" "SRR10139779" "SRR10139780" "SRR10139781" "SRR10139782"
## [6] "SRR10139783" "SRR10139784" "SRR10139785" "SRR10139786" "SRR10139787"
## [11] "SRR10139788" "SRR10139789" "id"
exprSet <- exprSet[,-ncol(exprSet)]
group_list <- c(rep("tumor",10),rep("normal",2))
names(group_list)<-colnames(exprSet)
group_list <- as.factor(group_list)
suppressMessages(library(DESeq2))
#### 第一步,构建DESeq2的DESeq对象
colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,
design = ~ group_list)
#### 第二步,进行差异表达分析
f <- 'data/Step03-DESeq2_dds2.Rdata'
if(!file.exists(f)){
dds2 <- DESeq(dds)
# 保存差异表达分析结果
save(dds2, file = "data/Step03-DESeq2_dds2.Rdata")
}
#### 第三步,提取差异分析结果
load(file = "data/Step03-DESeq2_dds2.Rdata")
# 提取差异分析结果,tumor组对normal组的差异分析结果
res <- results(dds2,contrast=c("group_list","tumor","normal"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG <- as.data.frame(resOrdered)
# 去除差异分析结果中包含NA值的行
DESeq2_DEG = na.omit(DEG)
# 取出包含logFC和P值的两列
nrDEG=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
save(nrDEG, DESeq2_DEG, file = "data/Step03-DESeq2_nrDEG.Rdata")
DEG <- nrDEG
logFC_cutoff <- 1.5
DEG$change = as.factor(ifelse(DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) > logFC_cutoff,
ifelse(DEG$log2FoldChange > logFC_cutoff ,'UP','DOWN'),'STABLE'))
table(DEG$change)
## DOWN STABLE UP
## 1555 20684 1964
DOWN | STABLE | UP |
---|---|---|
1555 | 20684 | 1964 |
作者的差异基因分析结果:
DEG_res.png