转录因子分析
2022-07-07 本文已影响0人
sucycy
转录因子 (transcription factors, TFs) 是直接作用于基因组,与特定DNA序列结合 (TFBS/motif) ,调控DNA转录过程的一类蛋白质。转录因子可以调节基因组DNA开放性、募集RNA聚合酶进行转录过程、募集辅助因子调节特定的转录阶段,调控诸多生命进程,诸如免疫反应、发育模式等,F活性是根据靶标mRNA表达水平计算的,所以,可以将TF活性视为转录状态,所以,分析转录因子表达及其调控活性对于解析复杂生命活动具有重要意义。
在单细胞层次普及度最高的应该是SCENIC的方法,在这里我们主要就这种方法测试一下流程,SCENIC分为R版和Python版本,大家都在推荐使用python版本,因为R版本有几个函数实在是算的太慢了,由于用R用的顺手,所以本次是使用R做的测试,后边有时间会补充Python版本。
除了SCENIC的方法,我们还测试了DoRothEA,DoRothEA收集了文献,Chip-seq peaks,TF结合位点基序,基因表达推断等基因集,通过不考虑TF本身的基因表达,而是考虑其直接转录靶标的mRNA水平来计算活性。
测试1 SCENIC
1 首先还是加载需要的包
suppressMessages({
library(Seurat)
library(SCENIC)
library(AUCell)
library(GENIE3)
library(RcisTarget)
library(foreach)
})
2 构建数据库
#如果没有数据库,则需要下载一下,由于集群上已经有数据库了,所以可以直接用集群上的数据库
##1, For human:
dbFiles <- c("https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-500bp-upstream-7species.mc9nr.feather",
"https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg19/refseq_r45/mc9nr/gene_based/hg19-tss-centered-10kb-7species.mc9nr.feather")
# mc9nr: Motif collection version 9: 24k motifs
##2, For mouse:
dbFiles <- c("https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-500bp-upstream-7species.mc9nr.feather",
"https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-tss-centered-10kb-7species.mc9nr.feather")
# mc9nr: Motif collection version 9: 24k motifs
##3, For fly:
dbFiles <- c("https://resources.aertslab.org/cistarget/databases/drosophila_melanogaster/dm6/flybase_r6.02/mc8nr/gene_based/dm6-5kb-upstream-full-tx-11species.mc8nr.feather")
# mc8nr: Motif collection version 8: 20k motifs
##4, download
dir.create("cisTarget_databases"); #创建一个文件夹保存数据库
setwd("cisTarget_databases")
#如果3个参考数据库都想下载,每次设置变量dbFiles后,都要运行以下代码
for(featherURL in dbFiles)
{
download.file(featherURL, destfile=basename(featherURL)) # saved in current dir
}
#由于本来就已经下载了,直接输入文件夹
org <- "hgnc" #mgi 代表鼠标,hgnc 代表人,dmel 代表苍蝇 ,我们这儿pbmc3k为人的,选择hgnc
dbDir <- "/Database/RcisTarget" # RcisTarget databases location
myDatasetTitle <- "SCENIC" # choose a name for your analysis
dbs <- defaultDbNames[[org]]
scenicOptions <- initializeScenic(org=org, dbDir=dbDir, datasetTitle=myDatasetTitle, nCores=5) #nCores=10,代表开启10个线程计算
saveRDS(scenicOptions, "scenicOptions.rds")
2 读取并处理我们需要的数据
PRO<-readRDS('lung.diff_PRO.rds')
PRO1<-subset(PRO,idents = c('TCells','BCells','CancerCells','AT2','PlasmaCells'))
Idents(PRO1)<-'orig.ident'
PRO1<-subset(PRO1,idents = c('Sample_3a','Sample_3b','Sample_3c','Sample_3d'))#由于这个软件费时间,所以前期一直致力于把数据变小再变小
Idents(PRO1)<-'new_ident'
DimPlot(PRO1)
cellInfo <- data.frame(PRO@meta.data)
colnames(cellInfo)[which(colnames(cellInfo)=="orig.ident")] <- "sample"
colnames(cellInfo)[which(colnames(cellInfo)=="nFeature_RNA")] <- "nGene"
colnames(cellInfo)[which(colnames(cellInfo)=="nCount_RNA")] <- "nUMI"
colnames(cellInfo)[which(colnames(cellInfo)=="seurat_clusters")] <- "cluster"
colnames(cellInfo)[which(colnames(cellInfo)=="new_ident")] <- "celltype"
cellInfo <- cellInfo[,c("sample","nGene","nUMI","cluster","celltype")]
saveRDS(cellInfo, file="cellInfo.Rds")
subcell <- sample(colnames(PRO),1000)
scRNAsub <- PRO[,subcell]
#saveRDS(scRNAsub, "scRNAsub.rds")
exprMat <- as.matrix(scRNAsub@assays$RNA@counts)
3 转录因子分析
#基因表达量之和>细胞数*3%,且在1%的细胞中表达
genesKept <- geneFiltering(exprMat, scenicOptions,
minCountsPerGene = 3 * 0.01 * ncol(exprMat),
minSamples = ncol(exprMat) * 0.01)
exprMat_filtered <- exprMat[genesKept, ]
##计算相关性矩阵
runCorrelation(exprMat_filtered, scenicOptions)
##TF-Targets相关性回归分析
exprMat_filtered_log <- log2(exprMat_filtered+1)
runGenie3(exprMat_filtered_log, scenicOptions, nParts = 5)#巨慢,用了大概10个小时,实在不想等了,所以进行了投递
##,Motif验证共表达模块,推断共表达模块
runSCENIC_1_coexNetwork2modules(scenicOptions)
##推断转录调控网络(regulon)
runSCENIC_2_createRegulons(scenicOptions)
saveRDS(scenicOptions,'scenicOptions.rds')
4 转录因子活性评估与可视化
exprMat_all <- as.matrix(PRO@assays$RNA@counts)
exprMat_all <- log2(exprMat_all+1)
runSCENIC_3_scoreCells(scenicOptions, exprMat=exprMat_all) #根据基因的表达量进行评分,这一步会在int和out文件夹下生成一些过程文件
runSCENIC_4_aucell_binarize(scenicOptions, exprMat=exprMat_all) #转录因子二进制转换与可视化
5 Seurat可视化SCENIC结果
#regulonAUC矩阵
AUCmatrix <- readRDS("int/3.4_regulonAUC.Rds")
AUCmatrix <- AUCmatrix@assays@data@listData$AUC
AUCmatrix <- data.frame(t(AUCmatrix), check.names=F)
RegulonName_AUC <- colnames(AUCmatrix)
RegulonName_AUC <- gsub(' \\(','_',RegulonName_AUC)
RegulonName_AUC <- gsub('\\)','',RegulonName_AUC)
colnames(AUCmatrix) <- RegulonName_AUC
scRNAauc <- AddMetaData(PRO, AUCmatrix)
scRNAauc@assays$integrated <- NULL
saveRDS(scRNAauc,'scRNAauc.rds')
##导入二进制regulonAUC矩阵
BINmatrix <- readRDS("int/4.1_binaryRegulonActivity.Rds")
BINmatrix <- data.frame(t(BINmatrix), check.names=F)
RegulonName_BIN <- colnames(BINmatrix)
RegulonName_BIN <- gsub(' \\(','_',RegulonName_BIN)
RegulonName_BIN <- gsub('\\)','',RegulonName_BIN)
colnames(BINmatrix) <- RegulonName_BIN
scRNAbin <- AddMetaData(PRO, BINmatrix)
scRNAbin@assays$integrated <- NULL
#saveRDS(scRNAbin, 'scRNAbin.rds')
#绘图
p1<-FeaturePlot(scRNAauc, features='SOX2_extended_108g', reduction = 'umap')
p2<-FeaturePlot(scRNAbin, features='SOX2_extended_108g', reduction = 'umap')
p1
p2
![](https://img.haomeiwen.com/i25685886/55ae4ec45b7812da.png)
6 也可看一下关注的基因在每个cluster中的表达情况
VlnPlot(scRNAauc, features = "TCF4_extended_31g", pt.size = 0, group.by="new_ident")
![](https://img.haomeiwen.com/i25685886/122c62b7fbd655d5.png)
测试2 DoRothEA
1 加载需要用到的包
#conda activate singlecell_test
suppressMessages({
library(dorothea)
library(dplyr)
library(Seurat)
library(tibble)
library(pheatmap)
library(tidyr)
library(viper)
})
2 读取我们需要的数据库
dorothea_regulon_human <- get(data("dorothea_hs", package = "dorothea")) #人的数据库
#dorothea_regulon_mouse <- get(data("dorothea_mm", package = "dorothea")) #小鼠的数据库
regulon <- dorothea_regulon_human %>%dplyr::filter(confidence %in% c("A","B","C")) #TF把靶标之间的可信度评为A-E五个等级,在这里我们使用ABC 3个等级
3 读取自己需要的数据
PRO<-readRDS('lung.diff_PRO.rds')
PRO <- run_viper(PRO, regulon,options = list(method = "scale", minsize = 4,
eset.filter = FALSE, cores = 1,
verbose = FALSE))
DefaultAssay(object = PRO) <- "dorothea"
PRO <- ScaleData(PRO)
viper_scores_df <- GetAssayData(PRO, slot = "scale.data",assay = "dorothea") %>%data.frame(check.names = F) %>%t()
## 创建一个包含细胞和cluster的表格
CellsClusters <- data.frame(cell = names(Idents(PRO)),
cell_type = as.character(Idents(PRO)),
check.names = F)
## 加入Viper score信息
viper_scores_clusters <- viper_scores_df %>%data.frame() %>%rownames_to_column("cell") %>%gather(tf, activity, -cell) %>%inner_join(CellsClusters)
## 按照cluster 汇总Viper scores
summarized_viper_scores <- viper_scores_clusters %>%group_by(tf, cell_type) %>%summarise(avg = mean(activity),std = sd(activity))
## 筛选高可变的 TFs.
highly_variable_tfs <- summarized_viper_scores %>%group_by(tf) %>%mutate(var = var(avg)) %>%ungroup() %>%top_n(180, var) %>%distinct(tf)
4 画图
## 整理画图的数据
summarized_viper_scores_df <- summarized_viper_scores %>%semi_join(highly_variable_tfs, by = "tf") %>%dplyr::select(-std) %>%spread(tf, avg) %>%data.frame(row.names = 1, check.names = FALSE)
palette_length = 100
my_color = colorRampPalette(c("Darkblue", "white","red"))(palette_length)
my_breaks <- c(seq(min(summarized_viper_scores_df), 0,
length.out=ceiling(palette_length/2) + 1),
seq(max(summarized_viper_scores_df)/palette_length,
max(summarized_viper_scores_df),
length.out=floor(palette_length/2)))
viper_hmap <- pheatmap(t(summarized_viper_scores_df),fontsize=14,
fontsize_row = 10,
color=my_color, breaks = my_breaks,
main = "DoRothEA (ABC)", angle_col = 45,
treeheight_col = 0, border_color = NA)
print(viper_hmap)
![](https://img.haomeiwen.com/i25685886/175f8488affa338b.png)