scATAC分析实战(step4)
Step4-Run Cicero
Computing Gene Activity scores using Cicero and Co-Accessibility
用到的函数
Cicero
主要用到的包,简单看看,专门用来做scATAC-seq分析的
Precict cis-co-accessibility from single-cell chromatin accessibility data:
就像做表观测序的文章里面经常会做的东西,预测enhancer-promoter pairs
The main purpose of Cicero is to use single-cell chromatin accessibility data to predict regions of the genome that are more likely to be in physical proximity in the nucleus. This can be used to identify putative enhancer-promoter pairs, and to get a sense of the overall stucture of the cis-architecture of a genomic region.
不过因为是单细胞数据,不可能直接对稀疏counts做pairs预测,所以需要对单个细胞的数据根据相似性进行cluster,预处理
Because of the sparsity of single-cell data, cells must be aggregated by similarity to allow robust correction for various technical factors in the data.
Ultimately, Cicero provides a “Cicero co-accessibility” score between -1 and 1 between each pair of accessible peaks within a user defined distance where a higher score indicates higher co-accessibility.
一个CellDataSet对象示例:feature为chr上对应的peaks区间
ciceroObj
CellDataSet (storageMode: environment)
assayData: 132455 features, 1718 samples
element names: exprs
protocolData: none
phenoData
sampleNames: agg1017 agg679 ... agg2958 (1718 total)
varLabels: agg_cell Size_Factor num_genes_expressed
varMetadata: labelDescription
featureData
featureNames: chr1_713822_714322 chr1_752462_752962 ...
chrX_155260028_155260528 (132455 total)
fvarLabels: site_name chr ... num_cells_expressed (5 total)
fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:
library(cicero)
library(data.table)
library(Matrix)
library(GenomicRanges)
library(magrittr)
library(SummarizedExperiment)
library(optparse)
library(yaml)
library(Rcpp)
set.seed(1)
grToFeature <- function(gr){
peakinfo <- data.frame(
row.names = paste(seqnames(gr),start(gr),end(gr),sep="_"),
site_name = paste(seqnames(gr),start(gr),end(gr),sep="_"),
chr = gsub("chr","",as.character(seqnames(gr))),
bp1 = start(gr),
bp2 = end(gr)
)
return(peakinfo)
}#和下面的函数都是用来实现gr和data.frame数据格式的转换
featureToGR <- function(feature){
featureSplit <- stringr::str_split(paste0(feature), pattern = "_", n = 3, simplify = TRUE)
gr <- GRanges(featureSplit[,1],IRanges(as.integer(featureSplit[,2]),as.integer(featureSplit[,3])))
return(gr)
}
makeCDS <- function(se, binarize = TRUE){#这个函数用来制作the CellDataSet对象
peakinfo <- grToFeature(se)
mat <- assay(se)
if(binarize){
mat@x[which(mat@x > 0)] <- 1
}
cellinfo <- data.frame(colData(se))
cellinfo$cells <- rownames(cellinfo)
cds <- suppressWarnings(newCellDataSet(mat,
phenoData = methods::new("AnnotatedDataFrame", data = cellinfo),
featureData = methods::new("AnnotatedDataFrame", data = peakinfo),
expressionFamily=negbinomial.size(),
lowerDetectionLimit=0))
fData(cds)$chr <- as.character(fData(cds)$chr)
fData(cds)$bp1 <- as.numeric(as.character(fData(cds)$bp1))
fData(cds)$bp2 <- as.numeric(as.character(fData(cds)$bp2))
cds <- cds[order(fData(cds)$chr, fData(cds)$bp1),]
return(cds)
}
sourceCpp(code='
#include <Rcpp.h>
using namespace Rcpp;
using namespace std;
// Adapted from https://github.com/AEBilgrau/correlateR/blob/master/src/auxiliary_functions.cpp
// [[Rcpp::export]]
Rcpp::NumericVector rowCorCpp(IntegerVector idxX, IntegerVector idxY, Rcpp::NumericMatrix X, Rcpp::NumericMatrix Y) {
if(X.ncol() != Y.ncol()){
stop("Columns of Matrix X and Y must be equal length!");
}
if(max(idxX) > X.nrow()){
stop("Idx X greater than nrow of Matrix X");
}
if(max(idxY) > Y.nrow()){
stop("Idx Y greater than nrow of Matrix Y");
}
// Transpose Matrices
X = transpose(X);
Y = transpose(Y);
const int nx = X.ncol();
const int ny = Y.ncol();
// Centering the matrices
for (int j = 0; j < nx; ++j) {
X(Rcpp::_, j) = X(Rcpp::_, j) - Rcpp::mean(X(Rcpp::_, j));
}
for (int j = 0; j < ny; ++j) {
Y(Rcpp::_, j) = Y(Rcpp::_, j) - Rcpp::mean(Y(Rcpp::_, j));
}
// Compute 1 over the sample standard deviation
Rcpp::NumericVector inv_sqrt_ss_X(nx);
for (int i = 0; i < nx; ++i) {
inv_sqrt_ss_X(i) = 1/sqrt(Rcpp::sum( X(Rcpp::_, i) * X(Rcpp::_, i) ));
}
Rcpp::NumericVector inv_sqrt_ss_Y(ny);
for (int i = 0; i < ny; ++i) {
inv_sqrt_ss_Y(i) = 1/sqrt(Rcpp::sum( Y(Rcpp::_, i) * Y(Rcpp::_, i) ));
}
//Calculate Correlations
const int n = idxX.size();
Rcpp::NumericVector cor(n);
for(int k = 0; k < n; k++){
cor[k] = Rcpp::sum( X(Rcpp::_, idxX[k] - 1) * Y(Rcpp::_, idxY[k] - 1) ) * inv_sqrt_ss_X(idxX[k] - 1) * inv_sqrt_ss_Y(idxY[k] - 1);
}
return(cor);
}'
)
getTxDbGenes <- function(txdb = NULL, orgdb = NULL, gr = NULL, ignore.strand = TRUE){
if (is.null(genome)) {
if (is.null(txdb) | is.null(orgdb)) {
stop("If no provided genome then you need txdb and orgdb!")
}
}
if (is.null(gr)) {
genes <- GenomicFeatures::genes(txdb)
}else {
genes <- suppressWarnings(subsetByOverlaps(GenomicFeatures::genes(txdb), gr, ignore.strand = ignore.strand))#取出gr中的gene
}
if (length(genes) > 1) {
mcols(genes)$symbol <- suppressMessages(mapIds(orgdb,
keys = mcols(genes)$gene_id, column = "SYMBOL", keytype = "ENTREZID",
multiVals = "first"))
genes <- sort(sortSeqlevels(genes), ignore.strand = TRUE)
names(genes) <- NULL
out <- genes
}else {
out <- GRanges(seqnames(gr), ranges = IRanges(0, 0), gene_id = 0, symbol = "none")[-1]
}
return(out)
}
运行
library(BSgenome.Hsapiens.UCSC.hg19)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
#Read input
obj <- readRDS("results/scATAC-Summarized-Experiment.rds")
colData(obj)
DataFrame with 3770 rows and 5 columns
FRIP uniqueFrags Clusters
<numeric> <numeric> <character>
PBMC#GAGGTCCGTCTCTGCT-1 0.617957106506725 2751 Cluster1
PBMC#CCCTGATGTCTTAGCA-1 0.630749574105622 2348 Cluster1
PBMC#CTCGCTAGTTTCACCC-1 0.606741573033708 1958 Cluster1
PBMC#TATGTTCGTTTAGGAA-1 0.617335631617828 4061 Cluster4
PBMC#TCACTCGAGGGAAATG-1 0.626215219577606 2983 Cluster1
mdata <- colData(obj)
tssWindow <- 2500
flank <- 250*10^3
corCutOff <- 0.35
bsgenome <- BSgenome.Hsapiens.UCSC.hg19
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
orgdb <- org.Hs.eg.db
#Reduced Dimensions
dimred <- data.frame(
row.names = colnames(obj),
colData(obj)$UMAP1, #这里我换成了tsne
colData(obj)$UMAP2
)
#Get ChromSizes
chromSizes <- seqlengths(bsgenome)[paste0("chr",c(1:22,"X"))]
genome <- data.frame(names(chromSizes),chromSizes)
rownames(genome) <- NULL
#Get CDS cicero要求输入CDS对象
obj <- makeCDS(obj, binarize = TRUE)
obj <- detectGenes(obj)#detectGenes:Counts how many cells each feature in a CellDataSet object that are detectably expressed above a minimum threshold.
obj <- estimateSizeFactors(obj)
ciceroObj <- make_cicero_cds(obj, k = 50, reduced_coordinates = dimred[colnames(obj),])#Function to generate an aggregated input CDS for cicero,其中k表示每个bin需要合并多少个细胞的数据
#Overlap QC metrics:
#Cells per bin: 50
#Maximum shared cells bin-bin: 44
#Mean shared cells bin-bin: 0.709124600058444
#Median shared cells bin-bin: 0
#Compute Correlations
message("Computing grouped correlations...")
gr <- featureToGR(featureData(ciceroObj)[[1]])#把500bp peaks转成gr
o <- suppressWarnings(as.matrix( findOverlaps(resize( resize(gr,1,"center"), 2*flank + 1, "center"), resize(gr,1,"center"), ignore.strand=TRUE) ))#每个peaks都归一化成+-250kb的窗口,计算这些归一化后的peaks和原来peaks中心的重叠情况,意思是如果peaks之间距离小于250kb就会算成重叠
o <- data.table::as.data.table(data.frame(i = matrixStats::rowMins(o), j = matrixStats::rowMaxs(o)))
o <- data.frame(o[!duplicated(o),])
o <- o[o[,1]!=o[,2],] #自己和自己重叠的删掉
#我认为这些步骤可能是在重新调整peaks区间,以适合计算correlation
o$cor <- rowCorCpp(o[,1], o[,2], assayData(ciceroObj)$exprs, assayData(ciceroObj)$exprs)#这是一个用c写的函数,计算correlation
connections <- data.frame(
Peak1 = featureData(ciceroObj)[[1]][o[,1]],
Peak2 = featureData(ciceroObj)[[1]][o[,2]],
coaccess = o[,3]
)
#Annotate CDS
message("Annotating Cell Data Set...")
genes <- getTxDbGenes(txdb=txdb,orgdb=orgdb)
names(genes) <- genes$symbol
genes <- resize(genes, 1, "start") %>% resize(tssWindow * 2 + 1, "center")
geneDF <- data.frame(chromosome=seqnames(genes),start=start(genes),end=end(genes), gene=genes$symbol)
obj <- annotate_cds_by_site(obj, geneDF)
#Prepare for Co-Accessibility
nSites <- Matrix::colSums(assayData(obj)$exprs)
names(nSites) <- row.names(pData(obj))
#Cicero with Correlations
message("Calculating normalized gene activities...")
ciceroGA <- normalize_gene_activities(build_gene_activity_matrix(obj, connections, coaccess_cutoff = corCutOff), nSites)
seCicero <- SummarizedExperiment(
assays = SimpleList(gA = ciceroGA),
rowRanges = genes[rownames(ciceroGA),],
colData = mdata
)
seCiceroLog <- SummarizedExperiment(
assays = SimpleList(logGA = log2(10^6 * ciceroGA + 1)),
rowRanges = genes[rownames(ciceroGA),],
colData = mdata
)
#Save Output
saveRDS(connections, "results/Peaks-Co-Accessibility.rds")
saveRDS(seCicero, "results/Cicero-Gene-Activity.rds")
saveRDS(seCiceroLog, "results/Cicero-Log2-Gene-Activity.rds")
connections:
connections.jpgCiceroGA: 应该是根据peaks的accessibility推断潜在gene activity,生成gene activity sparse matrix
ciceroGA
13954 x 3770 sparse Matrix of class "dgCMatrix"
ciceroGA[1:4,1:4]
4 x 4 sparse Matrix of class "dgCMatrix"
PBMC#GAGGTCCGTCTCTGCT-1 PBMC#CCCTGATGTCTTAGCA-1
A1BG . .
A1BG-AS1 . .
A2M 0.0008241604 .
A4GALT . .
PBMC#CTCGCTAGTTTCACCC-1 PBMC#TATGTTCGTTTAGGAA-1
A1BG . .
A1BG-AS1 . .
A2M . .
A4GALT . .
关于 build_gene_activity_matrix 函数:
The initial function is called build_gene_activity_matrix. This function takes an input CDS and a Cicero connection list, and outputs an unnormalized table of gene activity scores. IMPORTANT: the input CDS must have a column in the fData table called “gene” which indicates the gene if that peak is a promoter, and NA if the peak is distal.