04. EdegeR Limma DESeq
Introduction
除了利用ballgown流程,目前比较流行的方法还是通过序列比对和计数获得的计数矩阵,然后用广义线性模型去评估和处理这些复杂实验条件下的数据;常见的R语言工具包有edgeR, DESeq2, and limma+voom(Law et al. 2014Stark, Grzelak, and Hadfield 2019).
值得注意的是:计数矩阵必须使用原始计数矩阵,便于评估和标准化。
EdgeR
1. load matirx and build DEGlist
library(edgeR)
# Load gene(/transcript) count matrix and labels
gene_count <- as.matrix(read.csv("RNAseq/gene_count_matrix.csv", row.names = "gene_id"))
pheno_data <- read.csv("RNAseq/geuvadis_phenodata.csv", header = TRUE)
# colnames(gene_count) <- rownames(pheno_data)
# head(pheno_data, 3)
# Check sample IDs are match to each other in orders
all(rownames(pheno_data) %in% colnames(gene_count))
#> [1] FALSE
all(pheno_data[,1] %in% colnames(gene_count))
#> [1] TRUE
# Create a DGEList
dge_list <- DGEList(counts = gene_count, group = pheno_data$group)
2. Removing genes that are lowly expressed
Plotting the distribution log-CPM values shows that a sizeable proportion of genes within each sample are either unexpressed or lowly-expressed with log-CPM values that are small or negative.
# Transformations from the raw-scale
log_cpm <- cpm(dge_list, log = TRUE, prior.count = 2)
# summary(lcpm)
# set group index
mygroup <- "sex"
# filtering uses CPM values
keep_exprs <- filterByExpr(dge_list, group = pheno_data[ ,mygroup])
dge_clean <- dge_list[keep_exprs, , keep.lib.sizes=FALSE]
# Transformation
clean_lcpm <- cpm(dge_clean, log = TRUE, prior.count = 2)
#
library(RColorBrewer)
nsamples <- ncol(dge_list)
col <- brewer.pal(nsamples, "Paired")
samplenames <- colnames(dge_list)
# set new plot plate
dev.new()
par(mfrow = c(1,2))
# density of raw data
plot(density(log_cpm[,1]), col=col[1], main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
for (i in 2:nsamples){
den <- density(log_cpm[,i])
lines(den$x, den$y, col=col[i])
}
legend("topright", samplenames, text.col=col, bty="n")
# density of filtered data
plot(density(clean_lcpm[,1]), col = col[1], main = '', xlab = '')
title(main="B. Filtered data", xlab="Log-cpm")
for (i in 2:nsamples){
den <- density(clean_lcpm[,i])
lines(den$x, den$y, col = col[i])
}
legend("topright", samplenames, text.col=col, bty="n")
![](https://img.haomeiwen.com/i19589140/eef2e14bc6ab0db6.png)
3. Normalising gene expression distributions
dge_clean <- calcNormFactors(dge_clean, method = "TMM")
# dge_clean$samples$norm.factors
clean_lcpm <- cpm(dge_clean, log=TRUE)
par(mfrow=c(1,2))
boxplot(log_cpm, las=2,main="")
title(main="A. Example: Unnormalised data", ylab="Log-cpm")
boxplot(clean_lcpm, las=2, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
![](https://img.haomeiwen.com/i19589140/b1c73a74c12d4bcb.png)
4. Distances
- heat map
library(pheatmap)
require(gridExtra)
p1 <- pheatmap(mat = log_cpm,
show_rownames = FALSE,
color = palette,
main = "raw lcpm")
p2 <- pheatmap(mat = clean_lcpm,
show_rownames = FALSE,
color = palette,
main = "clean lcpm")
# extract plot list
plot_list <- list(p1[[4]], p2[[4]])
grid.arrange(arrangeGrob(grobs= plot_list,ncol=2))
![](https://img.haomeiwen.com/i19589140/9222aa9b3a0d1cb4.png)
- Pearson correlation
library(corrplot)
mat <- cor(clean_lcpm)
corrplot(corr = mat, is.corr = FALSE, type = "upper",
col = palette,
diag = FALSE)
![](https://img.haomeiwen.com/i19589140/71575212d6ec77eb.png)
- MDS Plot
library(ggplot2)
dist_mat <- dist(t(clean_lcpm))
mds_data <- cmdscale(dist_mat)
# scale the plot data
mds_data <- as.data.frame(scale(mds_data))
colnames(mds_data) <- c("MDS 1", "MDS 2")
mds_data$name <- rownames(mds_data)
ggplot(data = mds_data) +
geom_point(aes(x = `MDS 1`, y = `MDS 2`), size = 3) +
geom_text(aes(x = `MDS 1` + 0.12, y =`MDS 2`,
label = name), size = 2) +
theme_bw()
![](https://img.haomeiwen.com/i19589140/ba8c41ff3c81a78d.png)
- PCA Analysis
# library(ggplot2)
pca_fit <- prcomp(t(clean_lcpm))
components <- summary(pca_fit )$importance[2,c(1,2)]
plot_data <- as.data.frame(pca_fit$x)
plot_data$name <- row.names(plot_data)
ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2),
size = 2) +
geom_text(aes(x = PC1*1.2, y = PC2, label = name)) +
xlab(label = paste0("PC 1 (",
round(components[1], 4)*100, "%)")) +
ylab(label = paste0("PC 2 (",
round(components[2], 4)*100, "%)")) +
theme_bw()
![](https://img.haomeiwen.com/i19589140/86738a5db813d076.png)
5. Differential expression analysis
- Removing heteroscedascity from count data
design <- model.matrix(~0 + factor(pheno_data[ ,mygroup]))
colnames(design) <- c("group1", "group2")
contrast_matrix <- makeContrasts(group1-group2, levels=design)
v <- voom(dge_clean, design, plot=TRUE)
![](https://img.haomeiwen.com/i19589140/c3285dba099c764a.png)
- Fitting linear models
vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contrast_matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")
DE_gene <- topTable(efit, coef = 1, adjust="BH", n=Inf)
# head map
library(pheatmap)
first <- colorRampPalette(c('#999999','#ffffff'))(10)
second <- colorRampPalette(c('#99d8c9','#66c2a4'))(100)
third <- colorRampPalette(c('#fc8d59','#d53e4f'))(100)
palette <- c(first, second, third)
pheatmap(mat = DE_gene,
show_rownames = FALSE,
#color = palette,
main = "raw lcpm")
# volcano plot
## adding threshold
attach(DE_gene)
DE_gene$threshold <- ifelse(`adj.P.Val` < 0.05 & abs(logFC) > 5,ifelse(logFC > 5, 'up', 'down'), 'no')
detach()
ggplot(DE_gene, aes(logFC,
-log10(adj.P.Val))) +
geom_point(aes(color = threshold)) +
scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
theme_bw()+
xlab(label = "log(fold-change)") +
ylab(label = expression(-log[10] (adj_Pvalue)))
# head map
gene_names <- rownames(DE_gene)[1:100]
i <- which(rownames(v) %in% gene_names)
pheatmap(mat = clean_lcpm[i,],
show_rownames = FALSE,
main = "clean lcpm")
![](https://img.haomeiwen.com/i19589140/288a1e782e53ff51.png)
- Stricter definition on significance
在某些时候,不仅仅需要用到校正p值阈值,还需要差异倍数的对数(lfc=1)也高于某个最小值来更为严格地定义显著性。
tfit <- treat(vfit, lfc=1)
dt <- decideTests(tfit)
DE_gene <- topTreat(efit, coef = 1, adjust="BH", n=Inf)
# adding threshold
attach(DE_gene)
DE_gene$threshold <- ifelse(`adj.P.Val` < 0.05 & abs(logFC) > 5,ifelse(logFC > 5, 'up', 'down'), 'no')
detach()
ggplot(DE_gene, aes(logFC,
-log10(adj.P.Val))) +
geom_point(aes(color = threshold)) +
scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
theme_bw()+
xlab(label = "log(fold-change)") +
ylab(label = expression(-log[10] (adj_Pvalue)))
![](https://img.haomeiwen.com/i19589140/5b8ac47d2230952e.png)
- 差异基因可视化2
DE_gene <- DE_gene[order(DE_gene$logFC, decreasing = FALSE), ]
DE_gene$xaixs <- seq(1, nrow(DE_gene), 1)
ggplot(DE_gene, aes(xaixs, logFC)) +
geom_point(aes(color = threshold)) +
geom_hline(yintercept=c(2,-2),linetype=2,size=0.25)+
geom_hline(yintercept = c(0),linetype=1,size=0.5) +
xlab('rank of differentially expressed genes')+
theme_bw()
![](https://img.haomeiwen.com/i19589140/1fc91d5601439255.png)
6. GO enrichment
1. ID trasformation
GO和KEGG 分析比较烦的一点就是ID转换,由于数据是人类测序结果,我们就以 org.Hs.eg.db 包为例子,看看数据转换的情况。
key types 是里面最需要小心的信息:
1. Ensemble id: ENSG开头
2. Entrez id: 纯数字
3. Symbol id: 基因名称
4. Refseq id: NG, NM, NP等
数据间映射关系(总的来说,ENTREZID是进行GO分析最好的ID类型):
- org.Hs.egACCNUM:将Entrez ID标识符映射到GenBank的登录号
- org.Hs.egCHRLOC:获取映射到染色体位置的Entrez ID标识符
- org.Hs.egENSEMBL:用于Entrez ID与EnsemblID之间的映射
- org.Hs.egENSEMBLPROT:用于Entrez ID与Ensembl蛋白ID的映射
- org.Hs.egENSEMBLTRANS:用于Entrez ID与Ensembl transcript编号
- org.Hs.egENZYME:Entrez基因id和酶活性(EC)之间的图谱
- org.Hs.egGENENAME:Entrez ID与基因名称之间的图谱
- org.Hs.egGO:Entrez ID与基因本体论(GO) id之间的映射
- org.Hs.egMAP:Entrez ID和细胞遗传学图谱/条带之间的映射
- org.Hs.egOMIM:Map between Entrez Gene Identifiers and Mendelian Inheritance in Man (MIM) identifiers
- org.Hs.egPATH:Entrez ID和KEGG通路标识符之间的映射
- org.Hs.egREFSEQ:Entrez ID与RefSeq标识符之间的映射
- org.Hs.egSYMBOL:Entrez ID和基因符号之间的映射
- org.Hs.egUNIPROT:Map Uniprot accession numbers with Entrez Gene identifiers
2. Exercise
# RSQLite(v2.2.5) has some problems
options(connectionObserver = NULL)
library(org.Hs.eg.db)
library(clusterProfiler)
#
keytypes(org.Hs.eg.db)
temp_names <- rownames(DE_gene)[1:100]
# temp_names <- gsub('NR[0-9]*[|]', '', temp_names)
gene_names <- gsub('[|].*', '', temp_names)
# keytype = org.Hs.eg(XXX)
## stand form
head(keys(org.Hs.eg.db, keytype = "SYMBOL"))
head(keys(org.Hs.eg.db, keytype = "REFSEQ"))
# ID transformation
gene_list <- mapIds(org.Hs.eg.db, keys = gene_names,
column= "ENTREZID",
keytype = "REFSEQ")
gene_list <- clusterProfiler::bitr(gene_names, fromType = "ENSEMBL" , toType = "ENTREZID", OrgDb = "org.Hs.eg.db", drop = TRUE)
go_all <- clusterProfiler::enrichGO(gene_list, OrgDb = org.Hs.eg.db,
ont = 'ALL', pAdjustMethod = 'BH',
pvalueCutoff = 0.2, qvalueCutoff = 0.5,
keyType ='ENTREZID')
go_result <- go_all@result
write.table(x = go_result,
file = "RNAseq/GO_result.txt",
sep = "\t",
col.names = TRUE,
row.names = FALSE)
3. Visualization
3.1 dot plot
# dot plot
go_result <- read.table(file = "RNAseq/GO_result.txt",
sep = "\t",
header = TRUE,
stringsAsFactors = F)
plot_data <- go_result[order(go_result$Count, decreasing = T),]
plot_data$Description <- forcats::fct_reorder(plot_data$Description,
plot_data$Count)
ggplot(data = plot_data[1:10, ]) +
geom_point(aes(x = Count, y = Description, size = Count)) +
theme_bw() +
scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 40)) +
xlab(label = NULL) +
ylab(label = NULL)
![](https://img.haomeiwen.com/i19589140/c4e2126f86adaef6.png)
3.2 bar plot
result1 <- read.table(file = "RNAseq/GO_result.txt",
sep = "\t",
header = TRUE,
stringsAsFactors = F)
result2 <- read.table(file = "RNAseq/GO_result2.txt",
sep = "\t",
header = TRUE,
stringsAsFactors = F)
set_bar_plot <- function(result1, result2, myrank){
myrank = as.numeric(myrank)
## setting df1
temp1 <- subset(result1, select = c("Description", "qvalue",
"Count"))
temp1 <- na.omit(temp1)
temp1 <- unique(temp1)
temp1$qvalue <- as.numeric(temp1$qvalue)
df1 <- temp1[order(temp1$qvalue,
decreasing = F), ][c(1:myrank ), ]
## setting df2
temp2 <- subset(result2, select = c("Description", "qvalue",
"Count"))
temp2 <- na.omit(temp2)
temp2 <- unique(temp2)
df2 <- temp2[order(temp2$qvalue,
decreasing = F), ][c(1:myrank ), ]
## setting bar order for df1
df1_1 <- df1[!df1$Description %in% df2$Description, ]
df1_2 <- df1[df1$Description %in% df2$Description, ]
df1_1 <- df1_1[order(df1_1$Count, decreasing = FALSE), ]
df1_1$xlab <- seq(1,nrow(df1_1),1)
df1_1$Count <- df1_1$Count* -1
df1_2 <- df1[df1$Description %in% df2$Description, ]
df1_2 <- df1_2[order(df1_2$Count, decreasing = FALSE), ]
df1_2$xlab <- seq(nrow(df1_1)+1, (nrow(df1_1) + nrow(df1_2)), 1)
df1_2$Count <- df1_2$Count* -1
## setting bar order for df2
df2_1 <- df2[!df2$Description %in% df1$Description, ]
df2_2 <- df2[df2$Description %in% df1$Description, ]
df2_1 <- df2_1[order(df2_1$Count, decreasing = FALSE), ]
df2_1$xlab <- seq((myrank + nrow(df2_2) + 1),
(myrank + nrow(df2_2) + nrow(df2_1)),1)
df2_2 <- df2_2[order(df2_2$Count, decreasing = FALSE), ]
df2_2$xlab <- seq((myrank + 1), (myrank + nrow(df2_2)), 1)
df2_2$Count <- df2_2$Count
mylist <- list(df1_1, df1_2, df2_1, df2_2)
mydf <- dplyr::bind_rows(mylist)
return(mydf)
}
library(ggplot2)
ggplot()+
geom_col(data = df1, aes(x = xlab,
y = Count,
fill = qvalue)) +
geom_text(data = subset(df1, Count < 0),
aes(x = xlab, y = 0.1, label = Description),
hjust = "outward", color = "#c02c38",
size = 3) +
geom_text(data = subset(df1, Count > 0),
aes(x = xlab, y = -0.1, label = Description),
hjust = "inward", color = "#15559a",
size = 3) +
scale_fill_gradient(low = "#046582",
high = "#fa1e0e") +
scale_y_continuous(breaks = seq(-5,20,5)) +
coord_flip() +
theme_void() +
theme(legend.position = c(0.96,0.3),
axis.line.x = element_line(size = 1),
axis.ticks.x = element_line(size = 1),
axis.ticks.length=unit(0.06, "cm"),
axis.text.x = element_text())
![](https://img.haomeiwen.com/i19589140/23d2408564679bd3.png)
References
Law, C. W., Y. Chen, W. Shi, and G. K. Smyth. 2014. “Voom: Precision Weights Unlock Linear Model Analysis Tools for RNA-Seq Read Counts.” Journal Article. Genome Biol 15 (2): R29. https://doi.org/10.1186/gb-2014-15-2-r29.
Stark, R., M. Grzelak, and J. Hadfield. 2019. “RNA Sequencing: The Teenage Years.” Journal Article. Nat Rev Genet 20 (11): 631–56. https://doi.org/10.1038/s41576-019-0150-2.
Bioconductor: RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR