生信文献rna_seqscRNA-seq

复现2:AD与Normal细胞类型水平的差异基因挖掘(snRNA

2021-08-24  本文已影响0人  小贝学生信
overall design

提示:因为本次数据集涉及17W~细胞,所以使用的是服务器Rstudio,本地电脑比较困难。

1、读入数据

1.1 下载数据(GSE157827)--上传到服务器--批量改名(Read10X())--创建Seurat对象

setwd("~/scAD")
fs=list.files('./GSE157827_RAW/','^GSM')
fs
library(tidyverse)
samples=str_split(fs,'_',simplify = T)[,1]
#批量将文件名改为 Read10X()函数能够识别的名字
lapply(unique(samples),function(x){
  # x = unique(samples)[1]
  y=fs[grepl(x,fs)]
  folder=paste0("GSE157827_RAW/", paste(str_split(y[1],'_',simplify = T)[,1:2], collapse = "_"))
  dir.create(folder,recursive = T)
  #为每个样本创建子文件夹
  file.rename(paste0("GSE157827_RAW/",y[1]),file.path(folder,"barcodes.tsv.gz"))
  #重命名文件,并移动到相应的子文件夹里
  file.rename(paste0("GSE157827_RAW/",y[2]),file.path(folder,"features.tsv.gz"))
  file.rename(paste0("GSE157827_RAW/",y[3]),file.path(folder,"matrix.mtx.gz"))
})
library(Seurat)
samples=list.files("GSE157827_RAW/")
samples
dir <- file.path('./GSE157827_RAW',samples)
names(dir) <- samples
#创建Seurat对象
counts <- Read10X(data.dir = dir)
scRNA = CreateSeuratObject(counts)
dim(scRNA)   #查看基因数和细胞总数
#[1]  33538 179392
table(scRNA@meta.data$orig.ident)  #查看每个样本的细胞数
head(scRNA@meta.data)

1.2 metadata细节整理

scRNA$orig.ident = stringr::str_split(rownames(scRNA@meta.data), "_",simplify = T)[,2]
# AD、NC
scRNA$group = substr(scRNA$orig.ident,1,2)
head(scRNA@meta.data)
table(scRNA$group)
table(scRNA$orig.ident)
#为了绘图直观,factor因子化,并且设置指定的levels排序
ranks = order(as.numeric(substr(unique(scRNA$orig.ident),3,4)))
scRNA$orig.ident = factor(scRNA$orig.ident, levels = unique(scRNA$orig.ident)[ranks])
table(scRNA$orig.ident, scRNA$group)

2、质控、过滤低质量的细胞、基因

2.1 过滤前的指标可视化

#指标1:nFeature_RNA 表达基因数
feats <- c("nFeature_RNA", "nCount_RNA")
library(patchwork)
p_filt_b_1=VlnPlot(scRNA, features = feats, pt.size = 0, ncol = 2, 
        group.by = "group") + NoLegend()
p_filt_b_2=VlnPlot(scRNA, features = feats, pt.size = 0, ncol = 1, 
        group.by = "orig.ident") + NoLegend()
#指标2~3:线粒体、核糖体基因含量
mito_genes=rownames(scRNA)[grep("^MT-", rownames(scRNA))] ;mito_genes 
scRNA=PercentageFeatureSet(scRNA, "^MT-", col.name = "percent_mito")
fivenum(scRNA@meta.data$percent_mito)
ribo_genes=rownames(scRNA)[grep("^Rp[sl]", rownames(scRNA),ignore.case = T)];ribo_genes
scRNA=PercentageFeatureSet(scRNA, "^RP[SL]", col.name = "percent_ribo")
fivenum(scRNA@meta.data$percent_ribo)
feats <- c("percent_mito","percent_ribo")
p_filt_b_3 =VlnPlot(scRNA, features = feats, pt.size = 0, ncol = 2, 
        group.by = "group") + NoLegend()
p_filt_b_4=VlnPlot(scRNA, features = feats, pt.size = 0, ncol = 1, 
        group.by = "orig.ident") + NoLegend()
(p_filt_b_1 / p_filt_b_2) | (p_filt_b_3 / p_filt_b_4)

2.2 确定过滤细胞、基因的阈值

#细胞-表达基因数-筛掉过高、过低
retained_c_umi_low <- scRNA$nFeature_RNA > 300
retained_c_umi_high <- scRNA$nFeature_RNA < 8000
#细胞-线粒体/核糖体含量-筛掉过低
retained_c_mito <- scRNA$percent_mito < 14
retained_c_ribo <- scRNA$percent_ribo < 3
table(retained_c_mito & retained_c_ribo & retained_c_umi_low & retained_c_umi_high)
# FALSE   TRUE 
# 9876 169516
table(scRNA$group[retained_c_mito & retained_c_ribo & retained_c_umi_low & retained_c_umi_high])
# AD    NC 
# 90480 79036

#指标可视化
#devtools::install_github("gaospecial/ggVennDiagram")
veen_eles = list(`Filt_low_umi\n(300)` = umi_low,
                 `Filt_high_umi\n(8000)` = umi_high,
                 `Filt_high_mito\n(14%)` = mito,
                 `Filt_high_ribo\n(3%)` = ribo)
library("ggVennDiagram")
ggVennDiagram(veen_eles, set_size = 4)  +
  ggplot2::scale_fill_gradient(low="#ffffb2",high = "#f03b20")

值得注意的是:snRNA-seq与scRNA-seq分析过程中最主要的不同便在于线粒体与核糖体比例。因为前者测的仅为核基因数据,所以理论上很少有核糖体、线粒体相关基因,因此需要过滤掉高表达核糖体、线粒体相关基因。

feature_rowsum = Matrix::rowSums(scRNA@assays$RNA@counts>0) 
head(feature_rowsum)
retained_f_low <- feature_rowsum > ncol(scRNA)*0.005
table(retained_f_low)
# FALSE  TRUE 
# 16262 17276
rankplot = data.frame(feature_count = sort(feature_rowsum),
                          gene = names(sort(retained_f_low)),
                          Rank = 1:length(feature_rowsum))
library(ggbreak)
ggplot(rankplot, aes(x=Rank, y=feature_count)) + 
  geom_point() + scale_y_break(c(10000,100000)) +
  geom_hline(yintercept=ncol(scRNA)*0.005, color="red") +
  geom_text(x=10000, y=4000, label="Feature cutoff : ncol(scRNA)*0.5%", size = 5)
scRNA_filt = scRNA[retained_f_low, retained_c_mito & retained_c_ribo & 
                                    retained_c_umi_low & retained_c_umi_high]
dim(scRNA_filt)
#[1]  17276 169516
table(scRNA_filt$group)
# AD    NC 
# 90480 79036

2.3 过滤后的指标可视化

feats <- c("nFeature_RNA", "nCount_RNA")
p_filt_a_1=VlnPlot(scRNA_filt, features = feats, pt.size = 0, ncol = 2, 
        group.by = "group") + NoLegend()
p_filt_a_2=VlnPlot(scRNA_filt, features = feats, pt.size = 0, ncol = 1, 
        group.by = "orig.ident") + NoLegend()
feats <- c("percent_mito","percent_ribo")
p_filt_a_3=VlnPlot(scRNA_filt, features = feats, pt.size = 0, ncol = 2, 
        group.by = "group") + NoLegend()
p_filt_a_4=VlnPlot(scRNA_filt, features = feats, pt.size = 0, ncol = 1, 
        group.by = "orig.ident") + NoLegend()
(p_filt_a_1 / p_filt_a_2) | (p_filt_a_3 / p_filt_a_4)

2.4 保存过滤后的数据(optional)

# saveRDS(scRNA_filt, file = "sce_filt.rds")
# scRNA = readRDS("sce_filt.rds")

3、标准化、高变基因、归一化、降维(PCA、UMAP)、分群

3.1 标准化、高变基因、归一化

library(future)
plan()
plan("multiprocess", workers = 4)
plan()

scRNA <- NormalizeData(scRNA, normalization.method = "LogNormalize", scale.factor = 10000)
scRNA <- FindVariableFeatures(scRNA, selection.method = "vst", nfeatures = 2000) 
scRNA <- ScaleData(scRNA, features = VariableFeatures(scRNA), 
                   vars.to.regress = c("nFeature_RNA","percent_mito"))

3.2 降维、分群

scRNA <- RunPCA(scRNA, features = VariableFeatures(scRNA)) 
scRNA = RunUMAP(scRNA, dims = pc.num)
#根据group,展示dimplot
p_dim = DimPlot(scRNA, group.by = "group")
scRNA <- FindNeighbors(scRNA, dims = pc.num) 
scRNA <- FindClusters(scRNA, resolution = c(0.01,0.05,0.1,0.2,0.5))
#不同分辨率的分群效果
p_cutree = clustree(scRNA@meta.data, prefix = "RNA_snn_res.")
Idents(scRNA) = scRNA$RNA_snn_res.0.05
table(scRNA@active.ident)
p_dim_1 = DimPlot(scRNA, label = T)

p_dim | p_cutree | p_dim_1

4、细胞类型注释

4.1 逐一展示每个细胞类型的marker gene dotplot,注释cluster

# astrocytes: AQP4, ADGRV1, GPC5, RYR3
# endothelial cells: CLDN5, ABCB1, EBF1
# excitatory neurons: CAMK2A, CBLN2, LDB2
# inhibitory neurons: GAD1, LHFPL3, PCDH15
# microglia: C3, LRMDA, DOCK8
# oligodendrocytes: MBP, PLP1, ST18
astrocytes = c("AQP4", "ADGRV1", "GPC5", "RYR3")
DotPlot(scRNA, features = astrocytes, 
        assay = "RNA") # 2

endothelial = c("CLDN5", "ABCB1", "EBF1")
DotPlot(scRNA, features = endothelial, 
        assay = "RNA") # 10

excitatory = c("CAMK2A", "CBLN2", "LDB2")
DotPlot(scRNA, features = excitatory, 
        assay = "RNA") # 0,6,8,9,11

inhibitory = c("GAD1", "LHFPL3", "PCDH15")
DotPlot(scRNA, features = inhibitory, 
        assay = "RNA") # 3,4,5

microglia = c("C3", "LRMDA", "DOCK8")
DotPlot(scRNA, features = microglia, 
        assay = "RNA") # 7

oligodendrocytes = c("MBP", "PLP1", "ST18")
DotPlot(scRNA, features = oligodendrocytes, 
        assay = "RNA") # 1
gene_list = list(
  Astro = astrocytes,
  Endo = endothelial,
  Excit = excitatory,
  Inhib = inhibitory,
  Mic = microglia,
  Oligo = oligodendrocytes
)
# 调整cluster的level因子顺序,从而较美观的展示dotplot
scRNA@active.ident = factor(scRNA@active.ident, levels = c(2,10,0,6,8,9,11,3,4,5,7,1))
p_dot_1 = DotPlot(scRNA, features = gene_list, 
        assay = "RNA") +
  theme(axis.text.x = element_text(angle = 60, vjust = 0.8, hjust=0.8))

p_dim_1 | p_dot_1

4.2 根据上述结果,注释细胞类型

scRNA$celltype = ifelse(scRNA@active.ident %in% c(2), "Astro",
                        ifelse(scRNA@active.ident %in% c(10), "Endo",
                               ifelse(scRNA@active.ident %in% c(7), "Mic",
                                      ifelse(scRNA@active.ident %in% c(1), "Oligo",
                                             ifelse(scRNA@active.ident %in% c(3,4,5), "Inhib",
                                                    "Excit")))))
table(scRNA$celltype)
Idents(scRNA) = scRNA$celltype
scRNA@active.ident = factor(scRNA@active.ident, levels = c("Astro","Endo","Excit","Inhib","Mic","Oligo"))
table(scRNA@active.ident)
p_dot_2= DotPlot(scRNA, features = gene_list, 
             assay = "RNA") +
  theme(axis.text.x = element_text(angle = 60, vjust = 0.8, hjust=0.8))
p_dim_2= DimPlot(scRNA, label = T)
p_dim_2 | p_dot_2 

4.3 复现paper1B-D

#Fig 1B
fig_1b=DimPlot(scRNA,  cols = c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02'))

#Fig 1C
ct_stat = table(scRNA$celltype)
ct_stat = data.frame(celltype = names(ct_stat),
                     total = as.numeric(ct_stat),
                     percentage = ct_stat/sum(ct_stat))
library(ggplot2)
fig_1c=ggplot(ct_stat, aes(x=celltype, fill=celltype, y = percentage.Freq)) +
  geom_bar(stat="identity",color = "black")  + NoLegend() +
  scale_fill_manual(values=c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02')) +
  theme_classic() + theme(axis.title.x = element_blank()) +
  ylab("Proportion of total nuclei(%)")

#Fig 1D
#Downsample 5000个细胞做Fingallmarker从而节省时间
table(scRNA@active.ident)
CTs = as.character(unique(scRNA@active.ident))
sampled = unlist(lapply(CTs, function(x){
            #x = CTs[1]
            index = scRNA@active.ident %in% x
            cells = names(scRNA@active.ident[index])
            if(length(cells) > 5000){
              sle_cell = sample(cells, 5000)
            } else {
              sle_cell = cells
            }
          }))

scRNA_sample = scRNA[ ,sampled]
dim(scRNA_sample)
table(scRNA_sample@active.ident)
#diff.wilcox = FindAllMarkers(scRNA_sample)
library(tidyverse)
all.markers = diff.wilcox %>% select(gene, everything()) %>%
  subset(p_val<0.05 & abs(diff.wilcox$avg_log2FC) > 0.5)
top5 = all.markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC)
top5 = CaseMatch(search = as.vector(top5$gene), match = rownames(scRNA_sample)) 
#save(top5, file = "top5.rda")
#load("top5.rda")
fig_1d = DoHeatmap(scRNA_sample, features = top5, angle = 0,
                   group.colors = c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02')) 

#合并
((fig_1b + labs(tag = "B"))+ (fig_1c +  labs(tag = "C"))  ) / (fig_1d +  labs(tag = "D"))

5、基于细胞类型的AD/NC差异分析(Fig 5A-D复现)

#Fig 2a
fig_2a=DimPlot(scRNA, split.by = "group", cols = c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02'))

#Fig 2b
ct_stat2 = as.data.frame(table(scRNA$celltype, scRNA$group))
sums = rep(c(sum(ct_stat2$Freq[1:6]),sum(ct_stat2$Freq[7:12])),each=6)
ct_stat2$percentage = ct_stat2$Freq/sums

ct_stat2$Var2 = factor(ct_stat2$Var2, levels = c("NC","AD"))
library(ggplot2)
fig_2b=ggplot(ct_stat2, aes(x=Var1, fill=Var2, y = percentage)) +
  geom_bar( position=position_dodge(width=0.8), width=0.6,
            stat="identity",color = "black")  + 
  scale_fill_manual(values=c('white','black')) +
  theme_classic() + ylab("Proportion of total nuclei(%)") + 
  theme(axis.title.x = element_blank(), legend.title = element_blank())
library(future)
plan()
plan("multiprocess", workers = 4)
plan()

head(scRNA@meta.data)
head(scRNA@meta.data[,c("group","celltype")])
scRNA$compare = paste(scRNA$group, scRNA$celltype, sep = "_")
table(scRNA$compare)
# AD_Astro  AD_Endo AD_Excit AD_Inhib   AD_Mic AD_Oligo 
# 10101     1621    32277    19271     4209    23001 
# NC_Astro  NC_Endo NC_Excit NC_Inhib   NC_Mic NC_Oligo 
# 8782      598    30050    18021     3714    17871 

ct = levels(scRNA@active.ident)
all_markers = lapply(ct, function(x){
              # x = ct[1]
              print(x)
              markers <- FindMarkers(scRNA, group.by = "compare",
                                     logfc.threshold = 0.1,
                                     ident.1 = paste("AD",x,sep = "_"),
                                     ident.2 = paste("NC",x,sep = "_"))
              #markers_sig <- subset(markers, p_val_adj < 0.1)
              return(markers)
            })
length(all_markers)
names(all_markers) = ct
lapply(all_markers,nrow)
all_markers_sig = lapply(all_markers, function(x){
  markers_sig <- subset(x, p_val_adj < 0.1)
  #markers_sig <- subset(x, p_val < 0.0001)
})
marker_stat = as.data.frame(lapply(all_markers_sig,function(x){
  # x=all_markers_sig[[1]]
  Up = sum(x$avg_log2FC>0)
  Down = sum(x$avg_log2FC<0)
  Total = length(x$avg_log2FC)
  return(c(Up, Down, Total))
}))
rownames(marker_stat) = c("Up","Down","Total")
marker_stat
library(gridExtra)
fig_2c = ggpubr::as_ggplot(tableGrob(marker_stat, theme = ttheme_default(base_size = 25)))
names(all_markers_sig)
Astro = rownames(all_markers_sig$Astro)
Endo = rownames(all_markers_sig$Endo)
`Excit+Inhib` = unique(c(rownames(all_markers_sig$Excit),
                  rownames(all_markers_sig$Inhib)))
Mic = rownames(all_markers_sig$Mic)
Oligo = rownames(all_markers_sig$Oligo)
veen_eles2 = list(Astro=Astro, Endo=Endo, 
                  `Excit+Inhib`=`Excit+Inhib`,
                  Mic=Mic, Oligo=Oligo)
library(VennDiagram)
library(ggplotify)
library(gridGraphics)
plt=venn.diagram(veen_eles2, filename=NULL)
p_tmp <- grobTree(plt)
fig_2d=as.ggplot(plot_grid(p_tmp))
(fig_2a + fig_2b)/(fig_2c + fig_2d) + plot_annotation(tag_levels = 'A')

6、以Astro为例的细胞再聚类深入分析(Fig 4A-D)

library(future)
plan()
plan("multiprocess", workers = 4)
plan()
#subcluster analysis
scRNA_astro = subset(scRNA, celltype == "Astro")
dim(scRNA_astro)
scRNA_astro <- NormalizeData(scRNA_astro, normalization.method = "LogNormalize", scale.factor = 10000)
scRNA_astro <- FindVariableFeatures(scRNA_astro, selection.method = "vst", nfeatures = 2000) 
scRNA_astro <- ScaleData(scRNA_astro, features = VariableFeatures(scRNA_astro), 
                   vars.to.regress = c("nFeature_RNA","percent_mito"))
scRNA_astro <- RunPCA(scRNA_astro, features = VariableFeatures(scRNA_astro)) 
ElbowPlot(scRNA_astro, ndims=30, reduction="pca") 
pc.num=1:30
scRNA_astro = RunUMAP(scRNA_astro, dims = pc.num)
DimPlot(scRNA_astro, group.by = "group")
DimPlot(scRNA_astro, split.by = "group")

scRNA_astro <- FindNeighbors(scRNA_astro, dims = pc.num)
scRNA_astro <- FindClusters(scRNA_astro, resolution = c(0.01,0.05,0.1,0.2,0.3,0.4,0.5))
clustree(scRNA_astro@meta.data, prefix = "RNA_snn_res.")
#选择resolution 0.3的分群结果
scRNA_astro$sle_reso = as.numeric(scRNA_astro$RNA_snn_res.0.3)
scRNA_astro$sle_reso = paste0("a",scRNA_astro$sle_reso)
table(scRNA_astro$sle_reso)
scRNA_astro$sle_reso = factor(scRNA_astro$sle_reso, levels = paste0("a",1:12))
Idents(scRNA_astro) = scRNA_astro$sle_reso
table(scRNA_astro@active.ident)
scRNA_astro$group = factor(scRNA_astro$group, levels = c("NC","AD"))
fig_4a=DimPlot(scRNA_astro, split.by = "group")
p_sub_dim = DimPlot(scRNA_astro, label = T)
scRNA_astro$subpop = ifelse(scRNA_astro@active.ident %in% c("a1","a5"), "Up",
                            ifelse(scRNA_astro@active.ident %in% c("a4","a3","a11"), 
                                   "Down","No change"))
table(scRNA_astro$subpop)
scRNA_astro$subpop = factor(scRNA_astro$subpop, levels = c("Up","Down","No change"))
fig_4b=DimPlot(scRNA_astro, group.by = "subpop", cols = c("#e41a1c","#4daf4a","#d9d9d9"))
marker_Astro <- FindMarkers(scRNA_astro, group.by = "subpop",
                       ident.1 = "Down",
                       ident.2 = "Up")
marker_Astro = marker_Astro[order(marker_Astro$avg_log2FC, decreasing = T),]
head(marker_Astro)
scRNA_astro_sub = subset(scRNA_astro, subpop != "No change")
scRNA_astro_sub$subpop = factor(scRNA_astro_sub$subpop, levels = c("Down","Up"))
library(future)
plan()
plan("multiprocess", workers = 4)
plan()
scRNA_astro_sub <- ScaleData(scRNA_astro_sub, features = rownames(marker_Astro), 
                   vars.to.regress = c("nFeature_RNA","percent_mito"))

#Doheatmap结果中,只展示部分感兴趣(与paper重合)的基因注释
paper = c("GLUD1","GLUL","PTN","TNIK","SLC4A4","GABRA2","VEGFA","ETNPPL","SLC1A3",
          "TUBB2B","SPARCL1","HES1","WIF1","HES5","APOE","SLC1A2","F3")
table(paper %in% rownames(marker_Astro)[1:100])
genes.to.label = paper[paper %in% rownames(marker_Astro)[1:100]]
labels <- rep(x = "transparent", times = length(x = rownames(marker_Astro)[1:100]))
labels[match(x = genes.to.label, table = rownames(marker_Astro)[1:100])] <- "black"
fig_4c=DoHeatmap(object = scRNA_astro_sub, features = rownames(marker_Astro)[1:100], 
          group.by = "subpop", group.colors = c("#4daf4a","#e41a1c"), angle = 0) +
  theme(axis.text.y = element_text(color = rev(x = labels)))
fig_4c
library(clusterProfiler)
library(org.Hs.eg.db)
ego <- enrichGO(gene          = rownames(marker_Astro)[1:200],
                keyType       = "SYMBOL",
                OrgDb         = org.Hs.eg.db,
                ont           = "BP",
                pAdjustMethod = "BH",
                pvalueCutoff  = 1,
                qvalueCutoff  = 1)
res = ego@result
interest_go = which(rownames(ego@result) %in% c("GO:0021953","GO:0050769","GO:0021872","GO:0021879",
                "GO:0007409","GO:0042552","GO:0008366","GO:0030900")
interest_go=ego@result$Description[interest_go]
fig_4d=barplot(ego, showCategory=interest_go, cols="pvalue") 
(fig_4a + fig_4b)/(fig_4c + fig_4d) + plot_layout(widths = c(2,1)) + 
  plot_annotation(tag_levels = 'A')
上一篇下一篇

猜你喜欢

热点阅读