生物信息学与算法生物信息学从零开始学

scATAC分析实战(step4)

2020-05-01  本文已影响0人  Shaoqian_Ma

Step4-Run Cicero

Computing Gene Activity scores using Cicero and Co-Accessibility

用到的函数

Cicero

主要用到的包,简单看看,专门用来做scATAC-seq分析的

Precict cis-co-accessibility from single-cell chromatin accessibility data:

就像做表观测序的文章里面经常会做的东西,预测enhancer-promoter pairs

The main purpose of Cicero is to use single-cell chromatin accessibility data to predict regions of the genome that are more likely to be in physical proximity in the nucleus. This can be used to identify putative enhancer-promoter pairs, and to get a sense of the overall stucture of the cis-architecture of a genomic region.

不过因为是单细胞数据,不可能直接对稀疏counts做pairs预测,所以需要对单个细胞的数据根据相似性进行cluster,预处理

Because of the sparsity of single-cell data, cells must be aggregated by similarity to allow robust correction for various technical factors in the data.

Ultimately, Cicero provides a “Cicero co-accessibility” score between -1 and 1 between each pair of accessible peaks within a user defined distance where a higher score indicates higher co-accessibility.

一个CellDataSet对象示例:feature为chr上对应的peaks区间

ciceroObj
CellDataSet (storageMode: environment)
assayData: 132455 features, 1718 samples 
  element names: exprs 
protocolData: none
phenoData
  sampleNames: agg1017 agg679 ... agg2958 (1718 total)
  varLabels: agg_cell Size_Factor num_genes_expressed
  varMetadata: labelDescription
featureData
  featureNames: chr1_713822_714322 chr1_752462_752962 ...
    chrX_155260028_155260528 (132455 total)
  fvarLabels: site_name chr ... num_cells_expressed (5 total)
  fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:  
library(cicero)
library(data.table)
library(Matrix)
library(GenomicRanges)
library(magrittr)
library(SummarizedExperiment)
library(optparse)
library(yaml)
library(Rcpp)
set.seed(1)

grToFeature <- function(gr){
    peakinfo <- data.frame(
        row.names = paste(seqnames(gr),start(gr),end(gr),sep="_"),
        site_name = paste(seqnames(gr),start(gr),end(gr),sep="_"),
        chr = gsub("chr","",as.character(seqnames(gr))),
        bp1 = start(gr),
        bp2 = end(gr)
    )
    return(peakinfo)
}#和下面的函数都是用来实现gr和data.frame数据格式的转换

featureToGR <- function(feature){
    featureSplit <- stringr::str_split(paste0(feature), pattern = "_", n = 3, simplify = TRUE)
    gr <- GRanges(featureSplit[,1],IRanges(as.integer(featureSplit[,2]),as.integer(featureSplit[,3])))
    return(gr)
}

makeCDS <- function(se, binarize = TRUE){#这个函数用来制作the CellDataSet对象
    peakinfo <- grToFeature(se)
    mat <- assay(se)
    if(binarize){
        mat@x[which(mat@x > 0)] <- 1
    }
    cellinfo <- data.frame(colData(se))
    cellinfo$cells <- rownames(cellinfo)
    cds <-  suppressWarnings(newCellDataSet(mat,
                              phenoData = methods::new("AnnotatedDataFrame", data = cellinfo),
                              featureData = methods::new("AnnotatedDataFrame", data = peakinfo),
                              expressionFamily=negbinomial.size(),
                              lowerDetectionLimit=0))
    fData(cds)$chr <- as.character(fData(cds)$chr)
    fData(cds)$bp1 <- as.numeric(as.character(fData(cds)$bp1))
    fData(cds)$bp2 <- as.numeric(as.character(fData(cds)$bp2))
    cds <- cds[order(fData(cds)$chr, fData(cds)$bp1),]
    return(cds)
}


sourceCpp(code='
  #include <Rcpp.h>

  using namespace Rcpp;
  using namespace std;

  // Adapted from https://github.com/AEBilgrau/correlateR/blob/master/src/auxiliary_functions.cpp
  // [[Rcpp::export]]
  Rcpp::NumericVector rowCorCpp(IntegerVector idxX, IntegerVector idxY, Rcpp::NumericMatrix X, Rcpp::NumericMatrix Y) {
    
    if(X.ncol() != Y.ncol()){
      stop("Columns of Matrix X and Y must be equal length!");
    }

    if(max(idxX) > X.nrow()){
      stop("Idx X greater than nrow of Matrix X");
    }

    if(max(idxY) > Y.nrow()){
      stop("Idx Y greater than nrow of Matrix Y");
    }

    // Transpose Matrices
    X = transpose(X);
    Y = transpose(Y);
    
    const int nx = X.ncol();
    const int ny = Y.ncol();

    // Centering the matrices
    for (int j = 0; j < nx; ++j) {
      X(Rcpp::_, j) = X(Rcpp::_, j) - Rcpp::mean(X(Rcpp::_, j));
    }

    for (int j = 0; j < ny; ++j) {
      Y(Rcpp::_, j) = Y(Rcpp::_, j) - Rcpp::mean(Y(Rcpp::_, j));
    }

    // Compute 1 over the sample standard deviation
    Rcpp::NumericVector inv_sqrt_ss_X(nx);
    for (int i = 0; i < nx; ++i) {
      inv_sqrt_ss_X(i) = 1/sqrt(Rcpp::sum( X(Rcpp::_, i) * X(Rcpp::_, i) ));
    }

    Rcpp::NumericVector inv_sqrt_ss_Y(ny);
    for (int i = 0; i < ny; ++i) {
      inv_sqrt_ss_Y(i) = 1/sqrt(Rcpp::sum( Y(Rcpp::_, i) * Y(Rcpp::_, i) ));
    }

    //Calculate Correlations
    const int n = idxX.size();
    Rcpp::NumericVector cor(n);
    for(int k = 0; k < n; k++){
      cor[k] = Rcpp::sum( X(Rcpp::_, idxX[k] - 1) * Y(Rcpp::_, idxY[k] - 1) ) * inv_sqrt_ss_X(idxX[k] - 1) * inv_sqrt_ss_Y(idxY[k] - 1);
    } 

    return(cor);

  }'
)

getTxDbGenes <- function(txdb = NULL, orgdb = NULL, gr = NULL, ignore.strand = TRUE){
    
    if (is.null(genome)) {
        if (is.null(txdb) | is.null(orgdb)) {
            stop("If no provided genome then you need txdb and orgdb!")
        }
    }
        
    if (is.null(gr)) {
        genes <- GenomicFeatures::genes(txdb)
    }else {
        genes <- suppressWarnings(subsetByOverlaps(GenomicFeatures::genes(txdb), gr, ignore.strand = ignore.strand))#取出gr中的gene
    }
    
    if (length(genes) > 1) {
        mcols(genes)$symbol <- suppressMessages(mapIds(orgdb, 
            keys = mcols(genes)$gene_id, column = "SYMBOL", keytype = "ENTREZID", 
            multiVals = "first"))
        genes <- sort(sortSeqlevels(genes), ignore.strand = TRUE)
        names(genes) <- NULL
        out <- genes
    }else {
        out <- GRanges(seqnames(gr), ranges = IRanges(0, 0), gene_id = 0, symbol = "none")[-1]
    }

    return(out)

}

运行

library(BSgenome.Hsapiens.UCSC.hg19)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)

#Read input
obj <- readRDS("results/scATAC-Summarized-Experiment.rds")
 colData(obj)
DataFrame with 3770 rows and 5 columns
                                     FRIP uniqueFrags    Clusters
                                <numeric>   <numeric> <character>
PBMC#GAGGTCCGTCTCTGCT-1 0.617957106506725        2751    Cluster1
PBMC#CCCTGATGTCTTAGCA-1 0.630749574105622        2348    Cluster1
PBMC#CTCGCTAGTTTCACCC-1 0.606741573033708        1958    Cluster1
PBMC#TATGTTCGTTTAGGAA-1 0.617335631617828        4061    Cluster4
PBMC#TCACTCGAGGGAAATG-1 0.626215219577606        2983    Cluster1
mdata <- colData(obj)
tssWindow <- 2500
flank <- 250*10^3
corCutOff <- 0.35
bsgenome <- BSgenome.Hsapiens.UCSC.hg19
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
orgdb <- org.Hs.eg.db

#Reduced Dimensions
dimred <- data.frame(
        row.names = colnames(obj), 
        colData(obj)$UMAP1, #这里我换成了tsne
        colData(obj)$UMAP2
    )

#Get ChromSizes
chromSizes <- seqlengths(bsgenome)[paste0("chr",c(1:22,"X"))]
genome <- data.frame(names(chromSizes),chromSizes)
rownames(genome) <- NULL

#Get CDS  cicero要求输入CDS对象
obj <- makeCDS(obj, binarize = TRUE)
obj <- detectGenes(obj)#detectGenes:Counts how many cells each feature in a CellDataSet object that are detectably expressed above a minimum threshold.
obj <- estimateSizeFactors(obj)
ciceroObj <- make_cicero_cds(obj, k = 50, reduced_coordinates = dimred[colnames(obj),])#Function to generate an aggregated input CDS for cicero,其中k表示每个bin需要合并多少个细胞的数据
#Overlap QC metrics:    
#Cells per bin: 50
#Maximum shared cells bin-bin: 44
#Mean shared cells bin-bin: 0.709124600058444
#Median shared cells bin-bin: 0

#Compute Correlations
message("Computing grouped correlations...")
gr <- featureToGR(featureData(ciceroObj)[[1]])#把500bp peaks转成gr
o <- suppressWarnings(as.matrix( findOverlaps(resize( resize(gr,1,"center"), 2*flank + 1, "center"), resize(gr,1,"center"), ignore.strand=TRUE) ))#每个peaks都归一化成+-250kb的窗口,计算这些归一化后的peaks和原来peaks中心的重叠情况,意思是如果peaks之间距离小于250kb就会算成重叠
o <- data.table::as.data.table(data.frame(i = matrixStats::rowMins(o), j = matrixStats::rowMaxs(o)))
o <- data.frame(o[!duplicated(o),])
o <- o[o[,1]!=o[,2],]   #自己和自己重叠的删掉
#我认为这些步骤可能是在重新调整peaks区间,以适合计算correlation
o$cor <- rowCorCpp(o[,1], o[,2], assayData(ciceroObj)$exprs, assayData(ciceroObj)$exprs)#这是一个用c写的函数,计算correlation
connections <- data.frame(
    Peak1 = featureData(ciceroObj)[[1]][o[,1]], 
    Peak2 = featureData(ciceroObj)[[1]][o[,2]], 
    coaccess = o[,3]
    )

#Annotate CDS
message("Annotating Cell Data Set...")
genes <- getTxDbGenes(txdb=txdb,orgdb=orgdb)
names(genes) <- genes$symbol
genes <- resize(genes, 1, "start") %>% resize(tssWindow * 2 + 1, "center")
geneDF <- data.frame(chromosome=seqnames(genes),start=start(genes),end=end(genes), gene=genes$symbol)
obj <- annotate_cds_by_site(obj, geneDF)

#Prepare for Co-Accessibility
nSites <- Matrix::colSums(assayData(obj)$exprs)
names(nSites) <- row.names(pData(obj))

#Cicero with Correlations
message("Calculating normalized gene activities...")
ciceroGA <- normalize_gene_activities(build_gene_activity_matrix(obj, connections, coaccess_cutoff = corCutOff), nSites)

seCicero <- SummarizedExperiment(
    assays = SimpleList(gA = ciceroGA),
    rowRanges = genes[rownames(ciceroGA),],
    colData = mdata
)

seCiceroLog <- SummarizedExperiment(
    assays = SimpleList(logGA = log2(10^6 * ciceroGA + 1)),
    rowRanges = genes[rownames(ciceroGA),],
    colData = mdata
)

#Save Output
saveRDS(connections, "results/Peaks-Co-Accessibility.rds")
saveRDS(seCicero, "results/Cicero-Gene-Activity.rds")
saveRDS(seCiceroLog, "results/Cicero-Log2-Gene-Activity.rds")

connections:

connections.jpg

CiceroGA: 应该是根据peaks的accessibility推断潜在gene activity,生成gene activity sparse matrix

ciceroGA
13954 x 3770 sparse Matrix of class "dgCMatrix"
ciceroGA[1:4,1:4]
4 x 4 sparse Matrix of class "dgCMatrix"
         PBMC#GAGGTCCGTCTCTGCT-1 PBMC#CCCTGATGTCTTAGCA-1
A1BG                .                                  .
A1BG-AS1            .                                  .
A2M                 0.0008241604                       .
A4GALT              .                                  .
         PBMC#CTCGCTAGTTTCACCC-1 PBMC#TATGTTCGTTTAGGAA-1
A1BG                           .                       .
A1BG-AS1                       .                       .
A2M                            .                       .
A4GALT                         .                       .

关于 build_gene_activity_matrix 函数:

The initial function is called build_gene_activity_matrix. This function takes an input CDS and a Cicero connection list, and outputs an unnormalized table of gene activity scores. IMPORTANT: the input CDS must have a column in the fData table called “gene” which indicates the gene if that peak is a promoter, and NA if the peak is distal.

引用:[https://doi.org/10.1038/s41587-019-0206-z]

上一篇下一篇

猜你喜欢

热点阅读