pagoda1(scde包)+pagoda2

2020-11-03  本文已影响0人  一只烟酒僧

最近在使用pagoda做单细胞的基因集分析,不得不说,算法是真的慢,,,

pbmc3k数据集
5个线程跑满
用时为:
knn.error.models
    user   system  elapsed
6033.772   23.041 2105.214

pagoda.varnorm
 user   system  elapsed
3387.265   49.590  734.011

pagoda.pathway.wPCA
 user    system   elapsed
 8713.815 12737.006  1091.431

pagoda.gene.clusters
user    system   elapsed
11324.177  8925.026  1199.914

大概用时:2105+734+1091+1199=5129  约为1.4h

一个快速跑的示例文件

library(scde)
library(SeuratData)
data("pbmc3k")
#导入为count矩阵,同时要改为integer
pbmc3k<-apply(as.matrix(pbmc3k@assays$RNA@counts),2,function(x){storage.mode(x)<-"integer";x})
pbmc3k[1:6,1:66]
#为了加快速度,我取其中的1/10的细胞数用于后续分析
set.seed(2020)
pbmc3k<-pbmc3k[,sample(1:ncol(pbmc3k),round(ncol(pbmc3k)/10),replace = F)]
dim(pbmc3k)
#去掉低质量的细胞和基因
pbmc3k<-clean.counts(pbmc3k,min.lib.size = 200,min.reads = 10)
dim(pbmc3k)
#对每个细胞构建误差模型,
knn <- knn.error.models(pbmc3k, 
                        #groups = ,可选
                        k = ncol(pbmc3k)/4, #猜测整个细胞群可以分为4个群
                        n.cores = 5, #用10个核
                        min.count.threshold = 2, #质控参数参数
                        min.nonfailed = 5, #质控参数
                        max.model.plots = 10)

#Normalizing variance
varinfo <- pagoda.varnorm(knn, #
                          counts = pbmc3k, 
                          trim = 3/ncol(pbmc3k), 
                          max.adj.var = 5, 
                          n.cores = 5, 
                          plot = TRUE)
#Controlling for sequencing depth
varinfo <- pagoda.subtract.aspect(varinfo, colSums(pbmc3k[, rownames(knn)]>0))

#Evaluate overdispersion of pre-defined gene sets
library(org.Hs.eg.db)
library(msigdbr)
C2<-msigdbr(category = "C2")
C2_kegg<-C2[grep("KEGG",C2$gs_subcat),]
C2_kegg<-split(C2_kegg$human_gene_symbol,C2_kegg$gs_name)
C2_kegg
C2_kegg<-list2env(C2_kegg)
#对每个基因集计算加权的第一个主成分的量级
pwpca <- pagoda.pathway.wPCA(varinfo, 
                            C2_kegg,
                             n.components = 1, 
                             n.cores = 5)
#对上述结果做统计学检验
df <- pagoda.top.aspects(pwpca, 
                         return.table = F,
                         plot = F, 
                         z.score = 2)
dim(df)
# #寻找denovo的基因集
clpca <- pagoda.gene.clusters(varinfo, trim = 7.1/ncol(varinfo$mat), n.clusters = 50, n.cores = 10, plot = TRUE)
df <- pagoda.top.aspects(pwpca, clpca, n.cells = NULL, z.score = qnorm(0.01/2, lower.tail = FALSE))
#列先聚类,在做热图(很奇怪!!)
hc <- pagoda.cluster.cells(df, varinfo)
pagoda.view.aspects(df,cell.clustering = hc, box = TRUE, labCol = NA, margins = c(0.5, 20))

pheatmap(pbmc3k[names(df$gw[order(df$gw,decreasing = T)][1:200]),hc$order],
         scale = "row",
         cluster_rows = F,breaks = seq(-5,5,10/100),cluster_cols = F)


scde下的pagoda1实在是太慢了!!!!!
无奈还是要使用pagoda2包进行pagoda分析
但是pagoda2的对象结构有些特殊,属于对象中嵌套操作函数。而比较坑爹的是这种嵌套的操作函数却没有对应的帮助文档,,,,
为此我去浏览了pagoda2工程文件的全部内容


image.png

当我们打开pagoda2/R路径时,进入存放所有脚本的地方


image.png

如果对一些说明文档中未提及的函数感兴趣,可以直接通过.R脚本查看函数的源代码!

    testPathwayOverdispersion=function(setenv, 
                                       type='counts', 
                                       max.pathway.size=1e3, 
                                       min.pathway.size=10, 
                                       n.randomizations=5, 
                                       verbose=FALSE, 
                                       score.alpha=0.05,
                                       plot=FALSE, 
                                       cells=NULL,
                                       adjusted.pvalues=TRUE,
                                       z.score = qnorm(0.05/2, lower.tail = FALSE), 
                                       use.oe.scale = FALSE, return.table=FALSE,
                                       name='pathwayPCA',
                                       correlation.distance.threshold=0.2,
                                       loading.distance.threshold=0.01,
                                       top.aspects=Inf,
                                       recalculate.pca=FALSE,
                                       save.pca=TRUE) {
      nPcs <- 1;
      
      if(type=='counts') {
        x <- counts;
        # apply scaling if using raw counts
        x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      if(!is.null(cells)) {
        x <- x[cells,]
      }
      
      proper.gene.names <- colnames(x);
      
      if(is.null(misc[['pwpca']]) || recalculate.pca) {
        if(verbose) {
          message("determining valid pathways")
        }
        
        # determine valid pathways
        gsl <- ls(envir = setenv)
        gsl.ng <- unlist(mclapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names),mc.cores=n.cores,mc.preschedule=TRUE))
        gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
        names(gsl) <- gsl
        
        if(verbose) {
          message("processing ", length(gsl), " valid pathways")
        }
        
        cm <- Matrix::colMeans(x)
        
        pwpca <- papply(gsl, function(sn) {
          lab <- proper.gene.names %in% get(sn, envir = setenv)
          if(sum(lab)<1) { return(NULL) }
          pcs <- irlba(x[,lab], nv=nPcs, nu=0, center=cm[lab])
          pcs$d <- pcs$d/sqrt(nrow(x))
          pcs$rotation <- pcs$v;
          pcs$v <- NULL;
          
          # get standard deviations for the random samples
          ngenes <- sum(lab)
          z <- do.call(rbind,lapply(seq_len(n.randomizations), function(i) {
            si <- sample(ncol(x), ngenes)
            pcs <- irlba(x[,si], nv=nPcs, nu=0, center=cm[si])$d
          }))
          z <- z/sqrt(nrow(x));
          
          # local normalization of each component relative to sampled PC1 sd
          avar <- pmax(0, (pcs$d^2-mean(z[, 1]^2))/sd(z[, 1]^2))
          
          if(avar>0.5) {
            # flip orientations to roughly correspond with the means
            pcs$scores <- as.matrix(t(x[,lab] %*% pcs$rotation) - as.numeric((cm[lab] %*% pcs$rotation)))
            cs <- unlist(lapply(seq_len(nrow(pcs$scores)), function(i) sign(cor(pcs$scores[i,], colMeans(t(x[, lab, drop = FALSE])*abs(pcs$rotation[, i]))))))
            pcs$scores <- pcs$scores*cs
            pcs$rotation <- pcs$rotation*cs
            rownames(pcs$rotation) <- colnames(x)[lab];
          } # don't bother otherwise - it's not significant
          return(list(xp=pcs,z=z,n=ngenes))
        }, n.cores = n.cores,mc.preschedule=TRUE)
        if(save.pca) {
          misc[['pwpca']] <<- pwpca;
        }
      } else {
        if(verbose) {
          message("re-using previous overdispersion calculations")
          pwpca <- misc[['pwpca']];
        }
      }
      
      if(verbose) {
        message("scoring pathway overdispersion signifcance")
      }
      
      # score overdispersion
      true.n.cells <- nrow(x)
      
      pagoda.effective.cells <- function(pwpca, start = NULL) {
        n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
        var <- unlist(lapply(pwpca, function(x) x$z[, 1]))
        if(is.null(start)) { start <- true.n.cells*2 } # start with a high value
        of <- function(p, v, sp) {
          sn <- p[1]
          vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
          residuals <- (v-vfit)^2
          return(sum(residuals))
        }
        x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(true.n.cells))
        return((x$par)^2+1/2)
      }
      n.cells <- pagoda.effective.cells(pwpca)
      
      vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {
        vars <- as.numeric((pwpca[[i]]$xp$d))
        cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$rotation)))
      })))
      
      # fix p-to-q mistake in qWishartSpike
      qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE)  {
        params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)
        qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)
      }
      
      # add right tail approximation to ptw, which gives up quite early
      pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {
        params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)
        q.tw <- (q - params$centering)/(params$scaling)
        p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)
        p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))
        p
      }
      
      vshift <- 0
      ev <- 0
      
      vdf$var <- vdf$var-(vshift-ev)*vdf$n
      basevar <- 1
      vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      #vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
      vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
      vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)
      vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      
      if(plot) {
        par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))
        un <- sort(unique(vdf$n))
        on <- order(vdf$n, decreasing = FALSE)
        pccol <- colorRampPalette(c("black", "grey70"), space = "Lab")(max(vdf$npc))
        plot(vdf$n, vdf$var/vdf$n, xlab = "gene set size", ylab = "PC1 var/n", ylim = c(0, max(vdf$var/vdf$n)), col = adjustcolor(pccol[vdf$npc],alpha=0.1),pch=19)
        lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)
        lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)
      }
      
      rs <- (vshift-ev)*vdf$n
      vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)
      vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)
      
      df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, stringsAsFactors = FALSE)
      if(adjusted.pvalues) {
        vdf$valid <- vdf$cz  >=  z.score
      } else {
        vdf$valid <- vdf$z  >=  z.score
      }
      
      if(!any(vdf$valid)) { stop("no significantly overdispersed pathways found at z.score threshold of ",z.score) };
      
      # apply additional filtering based on >0.5 sd above the local random estimate
      vdf$valid <- vdf$valid & unlist(lapply(pwpca,function(x) !is.null(x$xp$scores)))
      vdf$name <- names(pwpca)[vdf$i];
      
      if(return.table) {
        df <- df[vdf$valid, ]
        df <- df[order(df$score, decreasing = TRUE), ]
        return(df)
      }
      if(verbose) {
        message("compiling pathway reduction")
      }
      # calculate pathway reduction matrix
      
      # return scaled patterns
      xmv <- do.call(rbind, lapply(pwpca[vdf$valid], function(x) {
        xm <- x$xp$scores
      }))
      
      if(use.oe.scale) {
        xmv <- (xmv -rowMeans(xmv))* (as.numeric(vdf$oe[vdf$valid])/sqrt(apply(xmv, 1, var)))
        vdf$sd <- as.numeric(vdf$oe)
      } else {
        # chi-squared
        xmv <- (xmv-rowMeans(xmv)) * sqrt((qchisq(pnorm(vdf$z[vdf$valid], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv, 1, var))
        vdf$sd <- sqrt((qchisq(pnorm(vdf$z, lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells))
        
      }
      rownames(xmv) <- paste("#PC", vdf$npc[vdf$valid], "# ", names(pwpca)[vdf$i[vdf$valid]], sep = "")
      rownames(vdf) <- paste("#PC", vdf$npc, "# ", vdf$name, sep = "")
      misc[['pathwayODInfo']] <<- vdf
      
      # collapse gene loading
      if(verbose) {
        message("clustering aspects based on gene loading ... ",appendLF=FALSE)
      }
      tam2 <- pagoda.reduce.loading.redundancy(list(xv=xmv,xvw=matrix(1,ncol=ncol(xmv),nrow=nrow(xmv))),pwpca,NULL,plot=FALSE,distance.threshold=loading.distance.threshold,n.cores=n.cores)
      if(verbose) {
        message(nrow(tam2$xv)," aspects remaining")
      }
      if(verbose) {
        message("clustering aspects based on pattern similarity ... ",appendLF=FALSE)
      }
      tam3 <- pagoda.reduce.redundancy(tam2,distance.threshold=correlation.distance.threshold,top=top.aspects)
      if(verbose) {
        message(nrow(tam3$xv)," aspects remaining\n")
      }
      tam2$xvw <- tam3$xvw <- NULL; # to save space
      tam3$env <- setenv;
      
      # clean up aspect names, as GO ids are meaningless
      names(tam3$cnam) <- rownames(tam3$xv) <- paste0('aspect',1:nrow(tam3$xv))
      
      misc[['pathwayOD']] <<- tam3;
      reductions[[name]] <<- tam3$xv;
      return(invisible(tam3))
    },
    

#' @useDynLib pagoda2
#' @import MASS
#' @import Matrix
#' @importFrom Rcpp evalCpp
#' @import Rook
#' @import igraph
#' @import sccore
#' @importFrom irlba irlba
#' @importFrom parallel mclapply
#' @importFrom Rcpp sourceCpp
#' @importFrom magrittr %>%
NULL

#' A Reference Class, which holds and process single cell RNA-seq data.
#'
#' @field counts gene count matrix, normalized on total counts
#' @field clusters results of clustering
#' @field graphs graph representations of the dataset
#' @field reductions results of reductions, i.e. PCA
#' @field embeddings results of visualization algorithms, i.e. t-SNE or largeVis
#' @field diffgenes lists of differentially expressed genes
#' @field pathways pathway information
#' @field n.cores number of cores, used for the analyses
#' @field misc a list of miscelaneous structures
#' @field batch batch factor for the dataset
#' @field modelType plain (default) or raw (expression matrix taken as is without normalization, though log.scale still applies)
#' @field verbose verbosity level
#' @field depth cell size factor
#' @field genegraphs a slot to store graphical representations in gene space (i.e. gene kNN graphs)
#' @export Pagoda2
#' @exportClass Pagoda2
Pagoda2 <- setRefClass(
  "Pagoda2",
  
  fields=c('counts','clusters','graphs','reductions','embeddings','diffgenes',
           'pathways','n.cores','misc','batch','modelType','verbose','depth','batchNorm','mat','genegraphs'),
  
  methods = list(
    initialize=function(x, ..., modelType='plain', batchNorm='glm',
                        n.cores=parallel::detectCores(logical=FALSE), verbose=TRUE,
                        min.cells.per.gene=0, trim=round(min.cells.per.gene/2), min.transcripts.per.cell=10,
                        lib.sizes=NULL, log.scale=TRUE, keep.genes = NULL) {
      # # init all the output lists
      embeddings <<- list();
      graphs <<- list();
      diffgenes <<- list();
      reductions <<-list();
      clusters <<- list();
      pathways <<- list();
      genegraphs <<- list();
      misc <<-list(lib.sizes=lib.sizes, log.scale=log.scale, model.type=modelType, trim=trim);
      batch <<- NULL;
      counts <<- NULL;
      
      
      if(!missing(x) && ('Pagoda2' %in% class(x))) { # copy constructor
        callSuper(x, ..., modelType=modelType, batchNorm=batchNorm, n.cores=n.cores);
      } else {
        callSuper(..., modelType=modelType, batchNorm=batchNorm, n.cores=n.cores,verbose=verbose);
        if(!missing(x) && is.null(counts)) { # interpret x as a countMatrix
          if ('matrix' %in% class(x)) {
            x <- as(Matrix(x, sparse=TRUE), "dgCMatrix")
          }
          if(!('dgCMatrix' %in% class(x))) {
            stop("x is not of class dgCMatrix or matrix");
          }
          #if(any(x@x < 0)) {
          #  stop("x contains negative values");
          #}
          setCountMatrix(x, min.cells.per.gene=min.cells.per.gene, trim=trim, 
                         min.transcripts.per.cell=min.transcripts.per.cell, lib.sizes=lib.sizes,
                         log.scale=log.scale, keep.genes=keep.genes)
        }
      }
    },
    
    # provide the initial count matrix, and estimate deviance residual matrix (correcting for depth and batch)
    setCountMatrix=function(countMatrix, depthScale=1e3, min.cells.per.gene=30,
                            trim=round(min.cells.per.gene/2), min.transcripts.per.cell=10, 
                            lib.sizes=NULL, log.scale=FALSE, keep.genes = NULL) {
      # check names
      if(any(duplicated(rownames(countMatrix)))) {
        stop("duplicate gene names are not allowed - please reduce")
      }
      if(any(duplicated(colnames(countMatrix)))) {
        stop("duplicate cell names are not allowed - please reduce")
      }
      
      if(any(is.na(rownames(countMatrix)))) {
        stop("NA gene names are not allowed - please fix")
      }
      if(any(is.na(colnames(countMatrix)))) {
        stop("NA cell names are not allowed - please fix")
      }
      
      if(ncol(countMatrix)<3) { stop("too few cells remaining after min.count.per.cell filter applied - have you pre-filtered the count matrix to include only cells of a realistic size?") }
      
      counts <<- t(countMatrix)
      
      # Keep genes of sufficient coverage or genes that are in the keep.genes list
      counts <<- counts[,diff(counts@p) >= min.cells.per.gene | colnames(counts) %in% keep.genes]
      
      # Save the filtered count matrix in misc$rawCounts
      misc[['rawCounts']] <<- counts;
      misc$depthScale <<- depthScale;
      
      if (modelType == 'raw') {
        return()
      }
      
      if(!is.null(batch)) {
        if(!all(colnames(countMatrix) %in% names(batch))) { 
          stop("the supplied batch vector doesn't contain all the cells in its names attribute")
        }
        colBatch <- as.factor(batch[colnames(countMatrix)])
        batch <<- colBatch;
      }
      
      if(!is.null(lib.sizes)) {
        if(!all(colnames(countMatrix) %in% names(lib.sizes))) { 
          stop("the supplied lib.sizes vector doesn't contain all the cells in its names attribute")
        }
        lib.sizes <- lib.sizes[colnames(countMatrix)]
        depth <<- lib.sizes/mean(lib.sizes)*mean(Matrix::colSums(countMatrix))
      } else {
        depth <<- Matrix::colSums(countMatrix);
      }
      
      cell.filt.mask <- (depth >= min.transcripts.per.cell)
      counts <<- counts[cell.filt.mask,]
      depth <<- depth[cell.filt.mask]
      
      if (any(depth == 0)) {
        stop("Cells with zero expression over all genes are not allowed")
      }
      
      if(verbose) message(nrow(counts)," cells, ",ncol(counts)," genes; normalizing ... ")
      
      # get normalized matrix
      if(modelType=='linearObs') { # this shoudln't work well, since the depth dependency is not completely normalized out
        
        # winsorize in normalized space first in hopes of getting a more stable depth estimate
        if(trim>0) {
          counts <<- counts / as.numeric(depth);
          inplaceWinsorizeSparseCols(counts,trim,n.cores);
          counts <<- counts*as.numeric(depth);
          if(is.null(lib.sizes)) {
            depth <<- round(Matrix::rowSums(counts))
          }
        }
        
        ldepth <- log(depth);
        
        # rank cells, cut into n pieces
        n.depth.slices <- 20;
        #depth.fac <- as.factor(floor(rank(depth)/(length(depth)+1)*n.depth.slices)+1); names(depth.fac) <- rownames(counts);
        depth.fac <- cut(cumsum(sort(depth)),breaks=seq(0,sum(depth),length.out=n.depth.slices)); names(depth.fac) <- rownames(counts);
        depth.fac <- depth.fac[rank(depth)]
        # dataset-wide gene average
        gene.av <- (Matrix::colSums(counts)+n.depth.slices)/(sum(depth)+n.depth.slices)
        
        # pooled counts, df for all genes
        tc <- colSumByFac(counts,as.integer(depth.fac))[-1,,drop=FALSE]
        tc <- log(tc+1)- log(as.numeric(tapply(depth,depth.fac,sum))+1)
        md <- log(as.numeric(tapply(depth,depth.fac,mean)))
        # combined lm
        cm <- lm(tc ~ md)
        colnames(cm$coef) <- colnames(counts)
        # adjust counts
        # predict log(p) for each non-0 entry
        count.gene <- rep(1:counts@Dim[2],diff(counts@p))
        exp.x <- exp(log(gene.av)[count.gene] - cm$coef[1,count.gene] - ldepth[counts@i+1]*cm$coef[2,count.gene])
        counts@x <<- as.numeric(counts@x*exp.x/(depth[counts@i+1]/depthScale)); # normalize by depth as well
        # performa another round of trim
        if(trim>0) {
          inplaceWinsorizeSparseCols(counts,trim,n.cores);
        }
        
        
        # regress out on non-0 observations of ecah gene
        #non0LogColLmS(counts,mx,ldepth)
      } else if(modelType=='plain') {
        if(verbose) message("using plain model ")
        
        if(!is.null(batch)) {
          if(verbose) message("batch ... ")
          
          # dataset-wide gene average
          gene.av <- (Matrix::colSums(counts)+length(levels(batch)))/(sum(depth)+length(levels(batch)))
          
          # pooled counts, df for all genes
          tc <- colSumByFac(counts,as.integer(batch))[-1,,drop=FALSE]
          tc <- t(log(tc+1)- log(as.numeric(tapply(depth,batch,sum))+1))
          bc <- exp(tc-log(gene.av))
          
          # adjust every non-0 entry
          count.gene <- rep(1:counts@Dim[2],diff(counts@p))
          
          counts@x <<- as.numeric(counts@x/bc[cbind(count.gene,as.integer(batch)[counts@i+1])])
        }
        
        if(trim>0) {
          if(verbose) message("winsorizing ... ")
          counts <<- counts/as.numeric(depth);
          
          inplaceWinsorizeSparseCols(counts,trim,n.cores);
          counts <<- counts*as.numeric(depth);
          
          if(is.null(lib.sizes)) {
            depth <<- round(Matrix::rowSums(counts))
          }
        }
        
        counts <<- counts/as.numeric(depth/depthScale);
      } else {
        stop('modelType ',modelType,' is not implemented');
      }
      if(log.scale) {
        if(verbose) message("log scale ... ")
        counts@x <<- as.numeric(log(counts@x+1))
      }
      misc[['rescaled.mat']] <<- NULL;
      if(verbose) message("done.\n")
    },
    
    # adjust variance of the residual matrix, determine overdispersed sites
    adjustVariance=function(gam.k=5, alpha=5e-2, plot=FALSE, use.raw.variance=(.self$modelType=='raw'), use.unadjusted.pvals=FALSE, do.par=TRUE, max.adjusted.variance=1e3, min.adjusted.variance=1e-3, cells=NULL, verbose=TRUE, min.gene.cells=0, persist=is.null(cells), n.cores = .self$n.cores) {
      #persist <- is.null(cells) # persist results only if variance normalization is performed for all cells (not a subset)
      if(!is.null(cells)) { # translate cells into a rowSel boolean vector
        if(!(is.logical(cells) && length(cells)==nrow(counts))) {
          if(is.character(cells) || is.integer(cells)) {
            rowSel <- rep(FALSE,nrow(counts)); names(rowSel) <- rownames(counts); rowSel[cells] <- TRUE;
          } else {
            stop("cells argument must be either a logical vector over rows of the count matrix (cells), a vector of cell names or cell integer ids (row numbers)");
          }
        }
      } else {
        rowSel <- NULL;
      }
      
      if(verbose) message("calculating variance fit ...")
      df <- colMeanVarS(counts,rowSel,n.cores);
      
      if(use.raw.variance) { # use raw variance estimates without relative adjustments
        rownames(df) <- colnames(counts);
        vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
        df$lp <- df$lpa <- log(df$v);
        df$qv <- df$v
        df$gsf <- 1; # no rescaling of variance
        ods <- order(df$v,decreasing=TRUE); if(length(ods)>1e3) { ods <- ods[1:1e3] }
        if(persist) misc[['odgenes']] <<- rownames(df)[ods];
      } else {
        # gene-relative normalizaton 
        df$m <- log(df$m); df$v <- log(df$v);
        rownames(df) <- colnames(counts);
        vi <- which(is.finite(df$v) & df$nobs>=min.gene.cells);
        if(length(vi)<gam.k*1.5) { gam.k=1 };# too few genes
        if(gam.k<2) {
          if(verbose) message(" using lm ")
          m <- lm(v ~ m, data = df[vi,])
        } else {
          if(verbose) message(" using gam ")
          m <- mgcv::gam(as.formula(paste0('v ~ s(m, k = ',gam.k,')')), data = df[vi,])
        }
        df$res <- -Inf;  df$res[vi] <- resid(m,type='response')
        n.obs <- df$nobs; #diff(counts@p)
        suppressWarnings(df$lp <- as.numeric(pf(exp(df$res),n.obs,n.obs,lower.tail=FALSE,log.p=TRUE)))
        df$lpa <- bh.adjust(df$lp,log=TRUE)
        n.cells <- nrow(counts)
        df$qv <- as.numeric(qchisq(df$lp, n.cells-1, lower.tail = FALSE, log.p=TRUE)/n.cells)
        
        if(use.unadjusted.pvals) {
          ods <- which(df$lp<log(alpha))
        } else {
          ods <- which(df$lpa<log(alpha))
        }
        
        if(persist) misc[['odgenes']] <<- rownames(df)[ods];
        if(verbose) message(length(ods),' overdispersed genes ... ',length(ods) )
        
        df$gsf <- geneScaleFactors <- sqrt(pmax(min.adjusted.variance,pmin(max.adjusted.variance,df$qv))/exp(df$v));
        df$gsf[!is.finite(df$gsf)] <- 0;
      }
      
      if(persist) {
        if(verbose) message('persisting ... ')
        misc[['varinfo']] <<- df;
      }
      
      # rescale mat variance
      ## if(rescale.mat) {
      ##   if(verbose) cat("rescaling signal matrix ... ")
      ##   #df$gsf <- geneScaleFactors <- sqrt(1/exp(df$v));
      ##   inplaceColMult(counts,geneScaleFactors,rowSel);  # normalize variance of each gene
      ##   #inplaceColMult(counts,rep(1/mean(Matrix::colSums(counts)),ncol(counts))); # normalize the column sums to be around 1
      ##   if(persist) misc[['rescaled.mat']] <<- geneScaleFactors;
      ## }
      if(plot) {
        if(do.par) {
          par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
        }
        ## convert to log10 from natural log() by factor log10(exp(1))
        smoothScatter(log10(exp(1))*df$m,log10(exp(1))*df$v,main='',xlab='log10[ magnitude ]',ylab='log10[ variance ]')
        grid <- seq(min(log10(exp(1))*df$m[vi]),max(log10(exp(1))*df$m[vi]),length.out=1000)
        lines(grid,predict(m,newdata=data.frame(m=grid)),col="blue")
        ## NOTE: overdispersed genes ods calcuated with log()-based corrections...
        if(length(ods)>0) {
          points(log10(exp(1))*df$m[ods],log10(exp(1))*df$v[ods],pch='.',col=2,cex=1)
        }
        smoothScatter(log10(exp(1))*df$m[vi],log10(exp(1))*df$qv[vi],xlab='log10[ magnitude ]',ylab='',main='adjusted')
        abline(h=1,lty=2,col=8)
        if(is.finite(max.adjusted.variance)) { abline(h=max.adjusted.variance,lty=2,col=1) }
        points(log10(exp(1))*df$m[ods],log10(exp(1))*df$qv[ods],col=2,pch='.')
      }
      if(verbose) message("done.\n")
      return(invisible(df));
    },
    # make a Knn graph
    # note: for reproducibility, set.seed() and set n.cores=1
    makeKnnGraph=function(k=30,nrand=1e3,type='counts',weight.type='1m',odgenes=NULL,n.cores=.self$n.cores,distance='cosine',center=TRUE,x=NULL,verbose=TRUE,p=NULL, var.scale=(type == "counts")) {
      if(is.null(x)) {
        x.was.given <- FALSE;
        if(type=='counts') {
          x <- counts;
          # Scale Raw counts
        } else {
          if(type %in% names(reductions)) {
            x <- reductions[[type]];
          } else {
            stop('Specified reduction does not exist');
          }
        }
        
        if (var.scale) {
          x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
        }
        
        if(!is.null(odgenes)) {
          if(!all(odgenes %in% rownames(x))) { warning("not all of the provided odgenes are present in the selected matrix")}
          if(verbose) message("using provided odgenes ... ")
          x <- x[,odgenes]
        }
        
      } else { # is.null(x)
        x.was.given <- TRUE;
      }
      
      if(distance %in% c('cosine','angular')) {
        if(center) {
          x<- x - Matrix::rowMeans(x) # centering for consine distance
        }
        xn <- n2Knn(as.matrix(x), k, nThreads=n.cores, verbose=verbose, indexType='angular')
      } else if(distance %in% c('L2','euclidean')) {
        xn <- n2Knn(as.matrix(x), k, nThreads=n.cores, verbose=verbose, indexType='L2')
      } else {
        stop("unknown distance measure specified. Currently supported: angular, L2")
      }
      colnames(xn) <- rownames(xn) <- rownames(x);
      
      #if(weight.type=='rank') {
      #  xn$r <-  unlist(lapply(diff(c(0,which(diff(xn$s)>0),nrow(xn))),function(x) seq(x,1)))
      #}
      #xn <- xn[!xn$s==xn$e,]
      diag(xn) <- 0;
      xn <- Matrix::drop0(xn);
      
      #if(n.cores==1) { # for reproducibility, sort by node names
      #  if(verbose) cat("ordering neighbors for reproducibility ... ");
      #  xn <- xn[order(xn$s+xn$e),]
      #  if(verbose) cat("done\n");
      #}
      #df <- data.frame(from=rownames(x)[xn$s+1],to=rownames(x)[xn$e+1],weight=xn$d,stringsAsFactors=F)
      #if(weight.type=='rank') { df$rank <- xn$r }
      
      if(weight.type %in% c("cauchy","normal") && ncol(x)>sqrt(nrand)) {
        # generate some random pair data for scaling
        if(distance=='cosine') {
          #rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=T),sample(colnames(x),nrand,replace=T)),1,function(z) if(z[1]==z[2]) {return(NA); } else {1-cor(x[,z[1]],x[,z[2]])}))
          rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {1-sum(x[,z[1]]*x[,z[2]])/sqrt(sum(x[,z[1]]^2)*sum(x[,z[2]]^2))}))
        } else if(distance=='JS') {
          rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {jw.disR(x[,z[1]],x[,z[2]])}))
        } else if(distance=='L2') {
          rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {sqrt(sum((x[,z[1]]-x[,z[2]])^2))}))
        } else if(distance=='L1') {
          rd <- na.omit(apply(cbind(sample(colnames(x),nrand,replace=TRUE),sample(colnames(x),nrand,replace=TRUE)),1,function(z) if(z[1]==z[2]) {return(NA); } else {sum(abs(x[,z[1]]-x[,z[2]]))}))
        }
        suppressWarnings(rd.model <- MASS::fitdistr(rd,weight.type))
        if(weight.type=='cauchy') {
          xn@x <- 1/pcauchy(xn@x,location=rd.model$estimate['location'],scale=rd.model$estimate['scale'])-1
        } else {
          xn@x <- 1/pnorm(xn@x,mean=rd.model$estimate['mean'],sd=rd.model$estimate['sd'])-1
        }
      }
      xn@x <- pmax(0,xn@x);
      if(weight.type=='constant') { xn@x <- 1}
      if(weight.type=='1m') { xn@x <- pmax(0,1-xn@x) }
      #if(weight.type=='rank') { xn@x <- sqrt(df$rank) };
      # make a weighted edge matrix for the largeVis as well
      sxn <- (xn+t(xn))/2;
      g <- igraph::graph_from_adjacency_matrix(sxn,mode='undirected',weighted=TRUE)
      if(!x.was.given) {
        if(is.null(misc[['edgeMat']])) { misc[['edgeMat']] <<- list() }
        misc[['edgeMat']][[type]] <<- xn;
        graphs[[type]] <<- g;
      }
      return(invisible(g))
    },
    # calculate clusters based on the kNN graph
    getKnnClusters=function(type='counts',method=igraph::multilevel.community, name='community', subsampling.rate=0.8, n.subsamplings=10, cluster.stability.threshold=0.95, n.cores=.self$n.cores, g=NULL, min.cluster.size=1, persist=TRUE, return.details=FALSE, ...) {
      
      if(is.null(g)) {
        if(is.null(graphs[[type]])) { stop("call makeKnnGraph(type='",type,"', ...) first")}
        g <- graphs[[type]];
      }
      
      if(is.null(method)) {
        if(length(vcount(g))<2000) {
          method <- igraph::infomap.community;
        } else {
          method <- igraph::multilevel.community;
        }
      }
      
      # method <- igraph::multilevel.community; n.cores <- 20
      # n.subsamplings <- 10; cluster.stability.dilution <- 1.5; cluster.stability.fraction <- 0.9; subsampling.rate <- 0.8; metaclustering.method<- 'ward.D'
      # g <- r$graphs$PCA
      # x <- r$counts
      # cls <- method(g)
      
      #library(parallel)
      #x <- mclapply(1:5,function(z) method(g),mc.cores=20)
      
      cls <- method(g,...)
      cls.groups <- as.factor(membership(cls));
      
      # cleanup the clusters to remove very small ones
      if(min.cluster.size>1) {
        cn <- names(cls.groups);
        vg <- which(unlist(tapply(cls.groups,cls.groups,length))>=min.cluster.size);
        cls.groups <- as.integer(cls.groups); cls.groups[!cls.groups %in% vg] <- NA;
        cls.groups <- as.factor(cls.groups);
        names(cls.groups) <- cn;
      }
      
      if(persist) {
        clusters[[type]][[name]] <<- cls.groups;
        misc[['community']][[type]][[name]] <<- cls;
      }
      return(invisible(cls))
    },
    
    geneKnnbyPCA = function() {
      warning('geneKnnbyPCA is deprecated use makeGeneKnnGraph() instead');
      .self$makeGeneKnnGraph();
    },
    
    # take a given clustering and generate a hierarchical clustering
    # TODO: add support for innerOrder
    getHierarchicalDiffExpressionAspects = function(type='counts',groups=NULL,clusterName=NULL,method='ward.D', dist='pearson', persist=TRUE, z.threshold=2, n.cores=.self$n.cores, min.set.size=5 ) {
      if(type=='counts') {
        x <- counts;
      } else {
        x <- reductions[[type]];
      }
      if(is.null(groups)) {
        # retrieve clustering
        if(is.null(clusters)) stop("please generate clusters first")
        if(is.null(clusters[[type]])) stop(paste("please generate clusters for",type,"first"))
        if(is.null(clusterName)) { # use the first clustering
          if(length(clusters[[type]])<1) stop(paste("please generate clusters for",type,"first"))
          cl <- clusters[[type]][[1]];
        } else {
          cl <- clusters[[type]][[clusterName]]
          if(is.null(cl)) stop(paste("unable to find clustering",clusterName,'for',type))
          if(verbose) message("using ",clusterName," clustering for ",type," space\n")
        }
      } else {
        if(!all(rownames(x) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
        cl <- groups;
      }
      cl <- as.factor(cl[match(rownames(x),names(cl))]);
      
      if(dist %in% c('pearson','spearman')) {
        rx <- do.call(cbind,tapply(1:nrow(x),cl,function(ii) {
          if (length(ii) > 1) {
            Matrix::colMeans(x[ii,])
          } else {
            x[ii,]
          }
        }))
        d <- as.dist(1-cor(rx,method=dist))
      } else if(dist=='euclidean') {
        rx <- do.call(rbind,tapply(1:nrow(x),cl,function(ii) {
          if (legnth(ii) > 1) {
            Matrix::colMeans(x[ii,])
          } else {
            x[ii,]
          }
        }))
        d <- dist(rx)
      } else if(dist=='JS') {
        # this one has to be done on counts, even if we're working with a reduction
        cl <- as.factor(cl[match(rownames(misc[['rawCounts']]),names(cl))]);
        lvec <- colSumByFac(misc[['rawCounts']],as.integer(cl))[-1,] + 1
        d <- jsDist(t(lvec/pmax(1,Matrix::rowSums(lvec))));
        colnames(d) <- rownames(d) <- which(table(cl)>0)
        d <- as.dist(d)
      } else {
        stop("unknown distance",dist,"requested")
      }
      
      dd <- as.dendrogram(hclust(d,method=method))
      
      # walk down the dendrogram to generate diff. expression on every split
      diffcontrasts <- function(l,env) {
        v <- mget("contrasts",envir=env,ifnotfound=0)[[1]]
        if(!is.list(v)) v <- list();
        if(is.leaf(l)) return(NULL)
        lcl <- rep(NA,nrow(x));
        names(lcl) <- rownames(x)
        lcl[names(lcl) %in% names(cl)[cl %in% unlist(l[[1]])]] <- paste(unlist(l[[1]]),collapse='.');
        lcl[names(lcl) %in% names(cl)[cl %in% unlist(l[[2]])]] <- paste(unlist(l[[2]]),collapse='.');
        v <- c(v,list(as.factor(lcl)))
        assign("contrasts",v,envir=env)
        return(1);
      }
      
      de <- environment()
      assign('contrasts',NULL,envir=de)
      dc <- dendrapply(dd,diffcontrasts,env=de)
      dc <- get("contrasts",env=de)
      names(dc) <- unlist(lapply(dc,function(x) paste(levels(x),collapse=".vs.")))
      
      #dexp <- papply(dc,function(x) getDifferentialGenes(groups=x,z.threshold=z.threshold),n.cores=n.cores)
      
      x <- counts;
      x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p)); # apply variance scaling
      x <- t(x)
      dexp <- papply(dc,function(g) {
        dg <- getDifferentialGenes(groups=g,z.threshold=z.threshold)
        dg <- lapply(dg,function(x) x[x$Z>=z.threshold,])
        # calculate average profiles
        x <- x[rownames(x) %in% unlist(lapply(dg,rownames)),]
        if(nrow(x)<1) return(NULL);
        x <- x-rowMeans(x[,!is.na(g)])
        sf <- rep(1,nrow(x)); names(sf) <- rownames(x);
        if(nrow(dg[[1]])>0) {
          ig <- which(names(sf) %in% rownames(dg[[1]]))
          sf[ig] <- 1/length(ig)
        }
        if(nrow(dg[[2]])>0) {
          ig <- which(names(sf) %in% rownames(dg[[2]]))
          sf[ig] <- -1/length(ig)
        }
        sf <- sf*sqrt(length(sf))
        pt <- colSums(x*sf)
        pt[is.na(g)] <- 0;
        return(list(dg=dg,pt=pt))
      },n.cores=n.cores)
      
      
      dexp <- dexp[!unlist(lapply(dexp,is.null))]; # remove cases where nothing was reported
      dexp <- dexp[!unlist(lapply(dexp,function(x){class(x) == 'try-error'}))] ## remove cases that failed
      
      # fake pathwayOD output
      tamr <- list(xv=do.call(rbind,lapply(dexp,function(x) x$pt)),
                   cnam=lapply(sn(names(dexp)),function(n) c(n)))
      
      dgl <- lapply(dexp,function(d) as.character(unlist(lapply(d$dg,function(x) rownames(x)[x$Z>=z.threshold]))))
      tamr$env <- list2env(dgl[unlist(lapply(dgl,length))>=min.set.size])
      
      misc[['pathwayOD']] <<- tamr;
      
      # fake pathwayODInfo
      zs <- unlist(lapply(dexp,function(x) max(unlist(lapply(x$dg,function(y) y$Z)))))
      mval <- unlist(lapply(dexp,function(x) max(unlist(lapply(x$dg,function(y) y$M)))))
      vdf <- data.frame(i=1:nrow(tamr$xv),npc=1,valid=TRUE,sd=apply(tamr$xv,1,sd),cz=zs,z=zs,oe=mval,n=unlist(lapply(dexp,function(x) sum(unlist(lapply(x$dg,nrow))))))
      vdf$name <- rownames(vdf) <- names(dexp);
      misc[['pathwayODInfo']] <<- vdf
      
      return(invisible(tamr))
    },
    
    # Calculates gene Knn network for gene similarity
    # Author: Simon Steiger
    makeGeneKnnGraph = function(nPcs = 100, scale =TRUE , center=TRUE, fastpath =TRUE, maxit =1000, k = 30, n.cores = .self$n.cores, verbose =TRUE) {
      # Transpose first
      x <- t(counts);
      
      # TODO: factor out gene PCA calculation
      # Do the PCA
      nPcs <- min(nrow(x)-1,ncol(x)-1,nPcs);
      if (center) {
        cm <- Matrix::colMeans(x);
        pcs <- irlba(x, nv=nPcs, nu =0, center=cm, right_only = FALSE, fastpath = fastpath, maxit= maxit, reorth = TRUE);
      } else {
        pcs <- irlba(x, nv=nPcs, nu =0, right_only = FALSE, fastpath = fastpath, maxit= maxit, reorth = TRUE);
      }
      rownames(pcs$v) <- colnames(x);
      
      # Optional centering
      if (center) {
        pcs$center <- cm;
        pcas <- as.matrix(t(t(x %*% pcs$v) - t(cm  %*% pcs$v)))
      } else {
        pcas <- as.matrix(x %*% pcs$v);
      }
      
      # Keep the names
      rownames(pcas) <- rownames(x);
      colnames(pcas) <- paste0('PC',seq(ncol(pcas)));
      
      # Save into genegraphs slot
      #genegraphs$genePCs <<- pcs;
      #genegraphs$geneRotated <<- pcas;
      
      # Using cosine distance only here
      if(center) {
        pcas <- pcas - Matrix::rowMeans(pcas)
      }
      xn <- n2Knn(pcas, k, nThreads= n.cores, verbose=verbose)
      diag(xn) <- 0; # Remove self edges
      xn <- as(xn,'dgTMatrix'); # will drop 0s
      # Turn into a dataframe, convert from correlation distance into weight
      df <- data.frame('from'=rownames(pcas)[xn@i+1],'to'=rownames(pcas)[xn@j+1],'w'=pmax(1-xn@x,0),stringsAsFactors=FALSE)
      
      genegraphs$graph <<- df;
    },
    
    # calculate density-based clusters
    getDensityClusters=function(type='counts', embeddingType=NULL, name='density', v=0.7, s=1, ...) {
      if (!requireNamespace("dbscan", quietly = TRUE)) {
        stop("Package \"dbscan\" needed for this function to work. Please install it.", call. = FALSE)
      }
      
      if(is.null(embeddings[[type]])) { stop("first, generate embeddings for type ",type)}
      if(is.null(embeddingType)) {
        # take the first one
        embeddingType <- names(embeddings[[type]])[1]
        if(verbose) message("using ",embeddingType," embedding\n")
        emb <- embeddings[[type]][[embeddingType]]
        
      } else {
        emb <- embeddings[[type]][[embeddingType]]
        if(is.null(emb)) { stop("embedding ",embeddingType," for type ", type," doesn't exist")}
      }
      
      cl <- dbscan::dbscan(emb, ...)$cluster;
      cols <- rainbow(length(unique(cl)),v=v,s=s)[cl+1];    cols[cl==0] <- "gray70"
      names(cols) <- rownames(emb);
      clusters[[type]][[name]] <<- cols;
      misc[['clusters']][[type]][[name]] <<- cols;
      return(invisible(cols))
    },
    # determine subpopulation-specific genes
    
    getDifferentialGenes=function(type='counts',clusterType=NULL,groups=NULL,name='customClustering', z.threshold=3,upregulated.only=FALSE,verbose=FALSE, append.specificity.metrics=TRUE, append.auc=FALSE) {
      "Determine differentially expressed genes, comparing each group against all others using Wilcoxon rank sum test\n
      - type data type (currently only default 'counts' is supported)\n
      - clusterType optional cluster type to use as a group-defining factor\n
      - groups explicit cell group specification - a named cell factor (use NA in the factor to exclude cells from the comparison)\n
      - name name slot to store the results in\n
      - z.threshold minimal absolute Z score (adjusted) to report\n
      - upregulated.only whether to report only genes that are expressed significantly higher in each group\n
      - verbose verbose flag\n
      return a list, with each element of the list corresponding to a cell group in the provided/used factor (i.e. factor levels); Each element of a list is a data frame listing the differentially epxressed genes (row names), with the following columns: Z - adjusted Z score, with positive values indicating higher expression in a given group compare to the rest; M - log2 fold change; highest- a boolean flag indicating whether the expression of a given gene in a given vcell group was on average higher than in every other cell group; fe - fraction of cells in a given group having non-zero expression level of a given gene;\n
      Examples:\n\n
      result <- r$getDifferentialGenes(groups=r$clusters$PCA$multilevel);\n
      str(r$diffgenes)"
      
      # restrict counts to the cells for which non-NA value has been specified in groups
      
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          cols <- clusters[[type]][[1]]
        } else {
          cols <- clusters[[type]][[clusterType]]
          if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        cols <- groups;
      }
      cm <- counts;
      if(!all(rownames(cm) %in% names(cols))) { warning("cluster vector doesn't specify groups for all of the cells, dropping missing cells from comparison")}
      # determine a subset of cells that's in the cols and cols[cell]!=NA
      valid.cells <- rownames(cm) %in% names(cols)[!is.na(cols)];
      if(!all(valid.cells)) {
        # take a subset of the count matrix
        cm <- cm[valid.cells,]
      }
      # reorder cols
      cols <- as.factor(cols[match(rownames(cm),names(cols))]);
      
      cols <- as.factor(cols);
      if(verbose) {
        message("running differential expression with ",length(levels(cols))," clusters ... ")
      }
      # use offsets based on the base model
      
      # run wilcoxon test comparing each group with the rest
      lower.lpv.limit <- -100;
      # calculate rank per-column (per-gene) average rank matrix
      xr <- sparse_matrix_column_ranks(cm);
      # calculate rank sums per group
      grs <- colSumByFac(xr,as.integer(cols))[-1,,drop=FALSE]
      # calculate number of non-zero entries per group
      xr@x <- numeric(length(xr@x))+1
      gnzz <- colSumByFac(xr,as.integer(cols))[-1,,drop=FALSE]
      #group.size <- as.numeric(tapply(cols,cols,length));
      group.size <- as.numeric(tapply(cols,cols,length))[1:nrow(gnzz)]; group.size[is.na(group.size)]<-0; # trailing empty levels are cut off by colSumByFac
      # add contribution of zero entries to the grs
      gnz <- (group.size-gnzz)
      # rank of a 0 entry for each gene
      zero.ranks <- (nrow(xr)-diff(xr@p)+1)/2 # number of total zero entries per gene
      ustat <- t((t(gnz)*zero.ranks)) + grs - group.size*(group.size+1)/2
      # standardize
      n1n2 <- group.size*(nrow(cm)-group.size);
      # usigma <- sqrt(n1n2*(nrow(cm)+1)/12) # without tie correction
      # correcting for 0 ties, of which there are plenty
      usigma <- sqrt(n1n2*(nrow(cm)+1)/12)
      usigma <- sqrt((nrow(cm) +1 - (gnz^3 - gnz)/(nrow(cm)*(nrow(cm)-1)))*n1n2/12)
      x <- t((ustat - n1n2/2)/usigma); # standardized U value- z score
      
      
      # correct for multiple hypothesis
      if(verbose) {
        message("adjusting p-values ... ")
      }
      x <- matrix(qnorm(bh.adjust(pnorm(as.numeric(abs(x)), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE),ncol=ncol(x))*sign(x)
      rownames(x) <- colnames(cm); colnames(x) <- levels(cols)[1:ncol(x)];
      if(verbose) {
        message("done.\n")
      }
      
      # add fold change information
      log.gene.av <- log2(Matrix::colMeans(cm));
      group.gene.av <- colSumByFac(cm,as.integer(cols))[-1,,drop=FALSE] / (group.size+1);
      log2.fold.change <- log2(t(group.gene.av)) - log.gene.av;
      # fraction of cells expressing
      f.expressing <- t(gnzz / group.size);
      max.group <- max.col(log2.fold.change)
      
      ds <- lapply(1:ncol(x),function(i) {
        z <- x[,i];
        vi <- which((if (upregulated.only) z else abs(z)) >= z.threshold);
        r <- data.frame(Z=z[vi],M=log2.fold.change[vi,i],highest=max.group[vi]==i,fe=f.expressing[vi,i], Gene=rownames(x)[vi])
        rownames(r) <- r$Gene;
        r <- r[order(r$Z,decreasing=TRUE),]
        r
      })
      names(ds) <- colnames(x);
      
      if (append.specificity.metrics) {
        ds <- names(ds) %>% setNames(., .) %>%
          papply(function(n) appendSpecificityMetricsToDE(ds[[n]], cols, n, p2.counts=cm, append.auc=append.auc), n.cores=n.cores)
      }
      
      if(is.null(groups)) {
        if(is.null(clusterType)) {
          diffgenes[[type]][[names(clusters[[type]])[1]]] <<- ds;
        } else {
          diffgenes[[type]][[clusterType]] <<- ds;
        }
      } else {
        diffgenes[[type]][[name]] <<- ds;
      }
      
      return(invisible(ds))
    },
    
    plotDiffGeneHeatmap=function(type='counts',clusterType=NULL, groups=NULL, n.genes=100, z.score=2, gradient.range.quantile=0.95, inner.clustering=FALSE, gradientPalette=NULL, v=0.8, s=1, box=TRUE, drawGroupNames=FALSE, ... ) {
      if(!is.null(clusterType)) {
        x <- diffgenes[[type]][[clusterType]];
        if(is.null(x)) { stop("differential genes for the specified cluster type haven't been calculated") }
      } else {
        x <- diffgenes[[type]][[1]];
        if(is.null(x)) { stop("no differential genes found for data type ",type) }
      }
      
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          cols <- clusters[[type]][[1]]
        } else {
          cols <- clusters[[type]][[clusterType]]
          if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        # use clusters information
        if(!all(rownames(counts) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
        cols <- as.factor(groups[match(rownames(counts),names(groups))]);
      }
      cols <- as.factor(cols);
      # select genes to show
      if(!is.null(z.score)) {
        x <- lapply(x,function(d) d[d$Z >= z.score & d$highest==TRUE,])
        if(!is.null(n.genes)) {
          x <- lapply(x,function(d) {if(nrow(d)>0) { d[1:min(nrow(d),n.genes),]}})
        }
      } else {
        if(!is.null(n.genes)) {
          x <- lapply(x,function(d) {if(nrow(d)>0) { d[1:min(nrow(d),n.genes),]}})
        }
      }
      x <- lapply(x,rownames);
      # make expression matrix
      #browser()
      #x <- x[!unlist(lapply(x,is.null))]
      #cols <- cols[cols %in% names(x)]
      #cols <- droplevels(cols)
      em <- counts[,unlist(x)];
      # renormalize rows
      if(all(sign(em)>=0)) {
        if(is.null(gradientPalette)) {
          gradientPalette <- colorRampPalette(c('gray90','red'), space = "Lab")(1024)
        }
        em <- apply(em,1,function(x) {
          zlim <- as.numeric(quantile(x,p=c(1-gradient.range.quantile,gradient.range.quantile)))
          if(diff(zlim)==0) {
            zlim <- as.numeric(range(x))
          }
          x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
          x <- (x-zlim[1])/(zlim[2]-zlim[1])
        })
      } else {
        if(is.null(gradientPalette)) {
          gradientPalette <- colorRampPalette(c("blue", "grey90", "red"), space = "Lab")(1024)
        }
        em <- apply(em,1,function(x) {
          zlim <- c(-1,1)*as.numeric(quantile(abs(x),p=gradient.range.quantile))
          if(diff(zlim)==0) {
            zlim <- c(-1,1)*as.numeric(max(abs(x)))
          }
          x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
          x <- (x-zlim[1])/(zlim[2]-zlim[1])
        })
      }
      
      # cluster cell types by averages
      rowfac <- factor(rep(names(x),unlist(lapply(x,length))),levels=names(x))
      if(inner.clustering) {
        clclo <- hclust(as.dist(1-cor(do.call(cbind,tapply(1:nrow(em),rowfac,function(ii) Matrix::colMeans(em[ii,,drop=FALSE]))))),method='complete')$order
      } else {
        clclo <- 1:length(levels(rowfac))
      }
      
      if(inner.clustering) {
        # cluster genes within each cluster
        clgo <- tapply(1:nrow(em),rowfac,function(ii) {
          ii[hclust(as.dist(1-cor(t(em[ii,]))),method='complete')$order]
        })
      } else {
        clgo <- tapply(1:nrow(em),rowfac,I)
      }
      if(inner.clustering) {
        # cluster cells within each cluster
        clco <- tapply(1:ncol(em),cols,function(ii) {
          if(length(ii)>3) {
            ii[hclust(as.dist(1-cor(em[,ii,drop=FALSE])),method='complete')$order]
          } else {
            ii
          }
        })
      } else {
        clco <- tapply(1:ncol(em),cols,I)
      }
      #clco <- clco[names(clgo)]
      # filter down to the clusters that are included
      #vic <- cols %in% clclo
      colors <- fac2col(cols,v=v,s=s,return.details=TRUE)
      cellcols <- colors$colors[unlist(clco[clclo])]
      genecols <- rev(rep(colors$palette,unlist(lapply(clgo,length)[clclo])))
      bottomMargin <- ifelse(drawGroupNames,4,0.5);
      my.heatmap2(em[rev(unlist(clgo[clclo])),unlist(clco[clclo])],col=gradientPalette,Colv=NA,Rowv=NA,labRow=NA,labCol=NA,RowSideColors=genecols,ColSideColors=cellcols,margins=c(bottomMargin,0.5),ColSideColors.unit.vsize=0.05,RowSideColors.hsize=0.05,useRaster=T, box=box, ...)
      abline(v=cumsum(unlist(lapply(clco[clclo],length))),col=1,lty=3)
      abline(h=cumsum(rev(unlist(lapply(clgo[clclo],length)))),col=1,lty=3)
    },
    
    # recalculate library sizes using robust regression within clusters
    getRefinedLibSizes=function(clusterType=NULL, groups=NULL,type='counts') {
      if (!requireNamespace("robustbase", quietly = TRUE)) {
        stop("Package \"robustbase\" needed for this function to work. Please install it.", call. = FALSE)
      }
      
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          groups <- clusters[[type]][[1]]
        } else {
          groups <- clusters[[type]][[clusterType]]
          if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      }
      if(is.null(groups)) { stop("clustering must be determined first, or passed as a groups parameter") }
      
      # calculated pooled profiles per cluster
      lvec <- colSumByFac(misc[['rawCounts']],as.integer(groups))[-1,,drop=FALSE];
      lvec <- t(lvec/pmax(1,Matrix::rowSums(lvec)))*1e4
      
      # TODO: implement internal robust regression
      ## x <- misc[['rawCounts']]
      ## x <- x/as.numeric(depth);
      ## inplaceWinsorizeSparseCols(x,10);
      ## x <- x*as.numeric(depth);
      
      x <- mclapply(1:length(levels(groups)),function(j) {
        ii <- names(groups)[which(groups==j)]
        av <- lvec[,j]
        avi <- which(av>0)
        av <- av[avi]
        cvm <- as.matrix(misc[['rawCounts']][ii,avi])
        x <- unlist(lapply(ii,function(i) {
          cv <- cvm[i,]
          #as.numeric(coef(glm(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
          as.numeric(coef(robustbase::glmrob(cv~av+0,family=poisson(link='identity'),start=sum(cv)/1e4)))
        }))
        names(x) <- ii;
        x
      },mc.cores=30)
      
      lib.sizes <- unlist(x)[rownames(misc[['rawCounts']])]
      lib.sizes <- lib.sizes/mean(lib.sizes)*mean(Matrix::rowSums(misc[['rawCounts']]))
      
      depth <<- lib.sizes;
      return(invisible(lib.sizes))
    },
    
    # plot heatmap for a given set of genes
    plotGeneHeatmap=function(genes, type='counts', clusterType=NULL, groups=NULL, z.score=2, gradient.range.quantile=0.95, cluster.genes=FALSE, inner.clustering=FALSE, gradientPalette=NULL, v=0.8, s=1, box=TRUE, drawGroupNames=FALSE, useRaster=TRUE, smooth.span=max(1,round(nrow(counts)/1024)), ... ) {
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          cols <- clusters[[type]][[1]]
        } else {
          cols <- clusters[[type]][[clusterType]]
          if(is.null(cols)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        # use clusters information
        if(!all(rownames(counts) %in% names(groups))) { warning("provided cluster vector doesn't list groups for all of the cells")}
        cols <- as.factor(groups[match(rownames(counts),names(groups))]);
      }
      cols <- as.factor(cols);
      # make expression matrix
      if(!all(genes %in% colnames(counts))) { warning(paste("the following specified genes were not found in the data: [",paste(genes[!genes %in% colnames(counts)],collapse=" "),"], omitting",sep="")) }
      x <- intersect(genes,colnames(counts));
      if(length(x)<1) { stop("too few genes") }
      em <- as.matrix(t(counts[,x]));
      
      # renormalize rows
      if(all(sign(em)>=0)) {
        if(is.null(gradientPalette)) {
          gradientPalette <- colorRampPalette(c('gray90','red'), space = "Lab")(1024)
        }
        em <- t(apply(em,1,function(x) {
          zlim <- as.numeric(quantile(x,p=c(1-gradient.range.quantile,gradient.range.quantile)))
          if(diff(zlim)==0) {
            zlim <- as.numeric(range(x))
          }
          x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
          x <- (x-zlim[1])/(zlim[2]-zlim[1])
        }))
      } else {
        if(is.null(gradientPalette)) {
          gradientPalette <- colorRampPalette(c("blue", "grey90", "red"), space = "Lab")(1024)
        }
        em <- t(apply(em,1,function(x) {
          zlim <- c(-1,1)*as.numeric(quantile(abs(x),p=gradient.range.quantile))
          if(diff(zlim)==0) {
            zlim <- c(-1,1)*as.numeric(max(abs(x)))
          }
          x[x<zlim[1]] <- zlim[1]; x[x>zlim[2]] <- zlim[2];
          x <- (x-zlim[1])/(zlim[2]-zlim[1])
        }))
      }
      
      # cluster cell types by averages
      clclo <- 1:length(levels(cols))
      
      if(cluster.genes) {
        # cluster genes within each cluster
        clgo <- hclust(as.dist(1-cor(t(em))),method='complete')$order
      } else {
        clgo <- 1:nrow(em)
      }
      
      if(inner.clustering) {
        # cluster cells within each cluster
        clco <- tapply(1:ncol(em),cols,function(ii) {
          if(length(ii)>3) {
            ii[hclust(as.dist(1-cor(em[,ii,drop=FALSE])),method='single')$order]
          } else {
            ii
          }
          # TODO: implement smoothing span support
        })
      } else {
        clco <- tapply(1:ncol(em),cols,I)
      }
      
      cellcols <- fac2col(cols,v=v,s=s)[unlist(clco[clclo])]
      #genecols <- rev(rep(fac2col(cols,v=v,s=s,return.level.colors=T),unlist(lapply(clgo,length)[clclo])))
      bottomMargin <- 0.5;
      # reorder and potentially smooth em
      em <- em[rev(clgo),unlist(clco[clclo])];
      
      my.heatmap2(em,col=gradientPalette,Colv=NA,Rowv=NA,labCol=NA,ColSideColors=cellcols,margins=c(bottomMargin,5),ColSideColors.unit.vsize=0.05,RowSideColors.hsize=0.05,useRaster=useRaster, box=box, ...)
      bp <- cumsum(unlist(lapply(clco[clclo],length))); # cluster border positions
      abline(v=bp,col=1,lty=3)
      #abline(h=cumsum(rev(unlist(lapply(clgo[clclo],length)))),col=1,lty=3)
      if(drawGroupNames) {
        clpos <- (c(0,bp[-length(bp)])+bp)/2;
        labpos <- rev(seq(0,length(bp)+1)/(length(bp)+1)*nrow(em)); labpos <- labpos[-1]; labpos <- labpos[-length(labpos)]
        text(x=clpos,y=labpos,labels = levels(cols),cex=1)
        # par(xpd=TRUE)
        # clpos <- (c(0,bp[-length(bp)])+bp)/2;
        # labpos <- seq(0,length(bp)+1)/(length(bp)+1)*max(bp); labpos <- labpos[-1]; labpos <- labpos[-length(labpos)]
        # text(x=labpos,y=-2,labels = levels(col))
        # segments(labpos,-1,clpos,0.5,lwd=0.5)
        # par(xpd=FALSE)
      }
    },
    
    # show embedding
    plotEmbedding=function(type=NULL, embeddingType=NULL, clusterType=NULL, groups=NULL, colors=NULL, gene=NULL, plot.theme=ggplot2::theme_bw(), ...) {
      if (is.null(type)) {
        if ('counts' %in% names(embeddings)) {
          type <- 'counts'
        } else if (length(embeddings) > 0) {
          type <- names(embeddings)[1]
        } else {
          stop("first, generate an embedding")
        }
      }
      
      if(is.null(embeddings[[type]]))
        stop("first, generate embeddings for type ",type)
      
      if(is.null(embeddingType)) {
        embeddingType <- 1 # take the first one
      }
      
      emb <- embeddings[[type]][[embeddingType]]
      
      if (!is.null(gene)) {
        if (!(gene %in% colnames(counts)))
          stop("Gene '", gene, "' isn't presented in the count matrix")
        
        colors <- .self$counts[,gene]
      }
      
      if(is.null(colors) && is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          groups <- clusters[[type]][[1]]
        } else {
          groups <- clusters[[type]][[clusterType]]
          if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      }
      
      sccore::embeddingPlot(emb, groups=groups, colors=colors, plot.theme=plot.theme, ...)
    },
    # get overdispersed genes
    getOdGenes=function(n.odgenes=NULL,alpha=5e-2,use.unadjusted.pvals=FALSE) {
      if(is.null(misc[['varinfo']])) { stop("please run adjustVariance first")}
      if(is.null(n.odgenes)) { #return according to alpha
        if(use.unadjusted.pvals) {
          rownames(misc[['varinfo']])[misc[['varinfo']]$lp <= log(alpha)]
        } else {
          rownames(misc[['varinfo']])[misc[['varinfo']]$lpa <= log(alpha)]
        }
      } else { # return top n.odgenes sites
        rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
      }
    },
    
    getNormalizedExpressionMatrix=function(n.odgenes=NULL,genes=NULL) {
      "Return variance-normalized matrix for specified genes or a number of OD genes\n
      - genes explicit vector of genes to return
      - n.odgenes whether a certain number of top overdispersed genes should be used (default NULL - all significant ones); ignored if genes are passed
      return:  cell x gene matrix\n"
      if(is.null(genes)) {
        genes <- .self$getOdGenes(n.odgenes)
      }
      x <- .self$counts[,genes];
      x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
      x
    },
    
    calculatePcaReduction=function(nPcs=20, type='counts', name='PCA', use.odgenes=TRUE, n.odgenes=NULL, odgenes=NULL, center=TRUE, cells=NULL, fastpath=TRUE, maxit=100, verbose=TRUE, var.scale=(type == "counts"), ...) {
      "Calculate PCA reduction of the data\n
      - nPcs number of PCs\n
      - type dataset view to reduce (counts by default, but can specify a name of an existing reduction)\n
      - name name for the PCA reduction to be created\n
      - use.odgenes whether pre-calculated set of overdispersed genes should be used (default=TRUE)\n
      - n.odgenes whether a certain number of top overdispersed genes should be used\n
      - odgenes explicitly specify a set of genes to use for the reduction\n
      - center should data be centerred prior to PCA (default=TRUE)\n
      - cells optional subset of cells on which PCA should be ran\n
      - fastpath use C implementation for speedup\n
      - maxit maximum number of irlba iterations to use\n
      - verbose verbose\n
      - ... additional arguments forwarded to irlba::irlba\n
      return invisible PCA result (the reduction itself is saved in $reductions[[name]])"
      
      if(type=='counts') {
        x <- counts;
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      if((use.odgenes || !is.null(n.odgenes)) && is.null(odgenes)) {
        if(is.null(misc[['odgenes']] )) { stop("please run adjustVariance() first")}
        odgenes <- misc[['odgenes']];
        if(!is.null(n.odgenes)) {
          if(n.odgenes>length(odgenes)) {
            #warning("number of specified odgenes is higher than the number of the statistically significant sites, will take top ",n.odgenes,' sites')
            odgenes <- rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
          } else {
            odgenes <- odgenes[1:n.odgenes]
          }
        }
      }
      if(!is.null(odgenes)) {
        x <- x[,odgenes]
        if(verbose) message('running PCA using ',length(odgenes),' OD genes .')
      } else { #all genes?
        if(verbose) message('running PCA all ',ncol(x),' genes .')
      }
      # apply scaling if using raw counts
      if(var.scale) {
        #x <- t(t(x)*misc[['varinfo']][colnames(x),'gsf'])
        x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
      }
      if(verbose) message('.')
      
      if(!is.null(cells)) {
        # cell subset is just for PC determination
        nPcs <- min(min(length(cells),ncol(x))-1,nPcs)
        cm <- Matrix::colMeans(x[cells,])
        pcs <- irlba(x[cells,], nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
      } else {
        nPcs <- min(min(nrow(x),ncol(x))-1,nPcs)
        if(center) {
          cm <- Matrix::colMeans(x)
          pcs <- irlba(x, nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
        } else {
          pcs <- irlba(x, nv=nPcs, nu=0, right_only=FALSE,fastpath=fastpath,maxit=maxit,reorth=TRUE, ...)
        }
      }
      rownames(pcs$v) <- colnames(x);
      
      if(verbose) message('.')
      
      # adjust for centering!
      if(center) {
        pcs$center <- cm;
        pcas <- as.matrix(t(as(t(x %*% pcs$v), "dgeMatrix") - t(cm %*% pcs$v)))
      } else {
        pcas <- as.matrix(x %*% pcs$v);
      }
      misc$PCA <<- pcs;
      if(verbose) message('.')
      #pcas <- scde::winsorize.matrix(pcas,0.05)
      # # control for sequencing depth
      # if(is.null(batch)) {
      #   mx <- model.matrix(x ~ d,data=data.frame(x=1,d=depth))
      # } else {
      #   mx <- model.matrix(x ~ d*b,data=data.frame(x=1,d=depth,b=batch))
      # }
      # # TODO: how to get rid of residual depth effects in the PCA-based clustering?
      # #pcas <- t(t(colLm(pcas,mx,returnResid=TRUE))+Matrix::colMeans(pcas))
      # pcas <- colLm(pcas,mx,returnResid=TRUE)
      rownames(pcas) <- rownames(x)
      colnames(pcas) <- paste('PC',seq(ncol(pcas)),sep='')
      #pcas <- pcas[,-1]
      #pcas <- scde::winsorize.matrix(pcas,0.1)
      if(verbose) message(' done\n')
      reductions[[name]] <<- pcas;
      ## nIcs <- nPcs;
      ## a <- ica.R.def(t(pcas),nIcs,tol=1e-3,fun='logcosh',maxit=200,verbose=T,alpha=1,w.init=matrix(rnorm(nIcs*nPcs),nIcs,nPcs))
      ## reductions[['ICA']] <<- as.matrix( x %*% pcs$v %*% a);
      ## colnames(reductions[['ICA']]) <<- paste('IC',seq(ncol(reductions[['ICA']])),sep='');
      
      return(invisible(pcas))
    },
    
    # will reset odgenes to be a superset of the standard odgene selection (guided by n.odgenes or alpha), and
    # a set of recursively determined odgenes based on a given group (or a cluster info)
    expandOdGenes=function(type='counts', clusterType=NULL, groups=NULL , min.group.size=30, od.alpha=1e-1, use.odgenes=FALSE, n.odgenes=NULL, odgenes=NULL, n.odgene.multiplier=1, gam.k=10,verbose=FALSE,n.cores=.self$n.cores,min.odgenes=10,max.odgenes=Inf,take.top.odgenes=TRUE, recursive=TRUE) {
      # determine groups
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          groups <- clusters[[type]][[1]]
        } else {
          groups <- clusters[[type]][[clusterType]]
          if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        groups <- as.factor(groups[names(groups) %in% rownames(counts)]);
        groups <- droplevels(groups);
      }
      
      
      # determine initial set of odgenes
      if((use.odgenes || !is.null(n.odgenes)) && is.null(odgenes)) {
        if(is.null(misc[['varinfo']] )) { stop("please run adjustVariance() first")}
        df <- misc$varinfo
        odgenes <- rownames(df)[!is.na(df$lpa) & df$lpa<log(od.alpha)]
        #odgenes <- misc[['odgenes']];
        if(!is.null(n.odgenes)) {
          if(n.odgenes>length(odgenes)) {
            #warning("number of specified odgenes is higher than the number of the statistically significant sites, will take top ",n.odgenes,' sites')
            odgenes <- rownames(misc[['varinfo']])[(order(misc[['varinfo']]$lp,decreasing=FALSE)[1:min(ncol(counts),n.odgenes)])]
          } else {
            odgenes <- odgenes[1:n.odgenes]
          }
        }
      }
      
      # filter out small groups
      if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
      if(sum(!is.na(groups))<min.group.size) {
        warning("clustering specifies fewer cells than min.group.size")
        return(odgenes)
      }
      
      
      if(length(levels(groups))<2) {
        warning("cannot expand od genes based on a single group")
        return(odgenes)
      }
      
      # determine groups for which variance normalization will be reran
      if(recursive) {
        if(verbose) message("recursive group enumeration ...")
        # derive cluster hierarchy
        
        # use raw counts to derive clustering
        z <- misc$rawCounts;
        rowFac <- rep(-1,nrow(z)); names(rowFac) <- rownames(z);
        rowFac[match(names(groups),rownames(z))] <- as.integer(groups);
        tc <- colSumByFac(z,as.integer(rowFac))[-1,,drop=FALSE];
        rownames(tc) <- levels(groups)
        d <- 1-cor(t(log10(tc/pmax(1,Matrix::rowSums(tc))*1e3+1)))
        hc <- hclust(as.dist(d),method='average',members=unlist(tapply(groups,groups,length)))
        
        dlab <- function(l) {
          if(is.leaf(l)) {
            return(list(labels(l)))
          } else {
            return(c(list(labels(l)),dlab(l[[1]]),dlab(l[[2]])))
          }
        }
        
        # for each level in the cluster hierarchy, except for the top
        rgroups <- dlab(as.dendrogram(hc))[-1]
        rgroups <- c(list(levels(groups)),rgroups)
        if(verbose) message("done.\n");
      } else {
        rgroups <- lapply(levels(groups),I)
      }
      names(rgroups) <- unlist(lapply(rgroups,paste,collapse="+"))
      
      # run local variance normalization
      if(verbose) message("running local variance normalization ");
      # run variance normalization, determine PCs
      gpcs <- papply(rgroups,function(group) {
        cells <- names(groups)[groups %in% group]
        
        # variance normalization
        df <- .self$adjustVariance(persist=FALSE,gam.k=gam.k,verbose=FALSE,cells=cells,n.cores=1)
        #if(!is.null(n.odgenes)) {
        #  odgenes <- rownames(df)[order(df$lp,decreasing=F)[1:n.odgenes]]
        #} else {
        df <- df[!is.na(df$lp),,drop=FALSE]
        df <- df[order(df$lp,decreasing=FALSE),,drop=FALSE]
        n.od <- min(max(sum(df$lpa<log(od.alpha)),min.odgenes),max.odgenes);
        if(n.od>0) {
          odgenes <- rownames(df)[1:min(n.od*n.odgene.multiplier,nrow(df))]
        } else {
          return(NULL)
        }
        sf <- df$gsf[match(odgenes,rownames(df))];
        return(list(sf=sf,cells=cells,odgenes=odgenes))
      },n.cores=n.cores,mc.preschedule=TRUE)
      if(verbose) message(" done\n");
      odg <- unique(unlist(lapply(gpcs,function(z) z$odgenes)))
      # TODO: consider gsf?
      odgenes <- unique(c(odgenes,odg));
      misc[['odgenes']] <<- odgenes;
      return(invisible(odgenes));
    },
    
    localPcaKnn=function(nPcs=5, type='counts', clusterType=NULL, groups=NULL , k=30, b=1, a=1, min.group.size=30, name='localPCA', baseReduction='PCA', od.alpha=1e-1, n.odgenes=NULL,gam.k=10,verbose=FALSE,n.cores=.self$n.cores,min.odgenes=5,take.top.odgenes=FALSE, recursive=TRUE,euclidean=FALSE,perplexity=k,debug=FALSE,return.pca=FALSE,skip.pca=FALSE) {
      if(type=='counts') {
        x <- counts;
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      
      
      if(is.null(groups)) {
        # look up the clustering based on a specified type
        if(is.null(clusterType)) {
          # take the first one
          groups <- clusters[[type]][[1]]
        } else {
          groups <- clusters[[type]][[clusterType]]
          if(is.null(groups)) { stop("clustering ",clusterType," for type ", type," doesn't exist")}
        }
      } else {
        groups <- as.factor(groups[names(groups) %in% rownames(x)]);
        groups <- droplevels(groups);
        
      }
      if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
      if(sum(!is.na(groups))<min.group.size) { stop("clustering specifies fewer cells than min.group.size") }
      
      
      if(recursive) {
        if(verbose) message("recursive group enumeration ...")
        ## # derive cluster hierarchy
        ## rowFac <- rep(-1,nrow(x)); names(rowFac) <- rownames(x);
        ## rowFac[names(groups)] <- as.integer(groups);
        ## tc <- colSumByFac(x,as.integer(rowFac))[-1,]
        ## rownames(tc) <- levels(groups)
        ## #tc <- rbind("total"=Matrix::colSums(tc),tc)
        ## #d <- jsDist(t(((tc/pmax(1,Matrix::rowSums(tc)))))); rownames(d) <- colnames(d) <- rownames(tc)
        ## d <- 1-cor(t(tc))
        ## hc <- hclust(as.dist(d),method='ward.D')
        
        # use raw counts to derive clustering
        z <- misc$rawCounts;
        rowFac <- rep(-1,nrow(z)); names(rowFac) <- rownames(z);
        rowFac[match(names(groups),rownames(z))] <- as.integer(groups);
        tc <- colSumByFac(z,as.integer(rowFac))[-1,,drop=FALSE];
        rownames(tc) <- levels(groups)
        d <- 1-cor(t(log10(tc/pmax(1,Matrix::rowSums(tc))*1e3+1)))
        hc <- hclust(as.dist(d),method='average',members=unlist(tapply(groups,groups,length)))
        
        
        dlab <- function(l) {
          if(is.leaf(l)) {
            return(list(labels(l)))
          } else {
            return(c(list(labels(l)),dlab(l[[1]]),dlab(l[[2]])))
          }
        }
        
        # for each level in the cluster hierarchy, except for the top
        rgroups <- dlab(as.dendrogram(hc))[-1]
        rgroups <- c(list(levels(groups)),rgroups)
        if(verbose) message("done.\n");
      } else {
        rgroups <- lapply(levels(groups),I)
      }
      names(rgroups) <- unlist(lapply(rgroups,paste,collapse="+"))
      
      
      if(verbose) message("determining local PCs ");
      # run variance normalization, determine PCs
      gpcs <- papply(rgroups,function(group) {
        cells <- names(groups)[groups %in% group]
        
        # variance normalization
        df <- .self$adjustVariance(persist=FALSE,gam.k=gam.k,verbose=FALSE,cells=cells,n.cores=1)
        if(!is.null(n.odgenes)) {
          odgenes <- rownames(df)[order(df$lp,decreasing=FALSE)[1:n.odgenes]]
        } else {
          odgenes <- rownames(df)[!is.na(df$lpa) & df$lpa<log(od.alpha)]
        }
        if(length(odgenes)<min.odgenes) {
          if(take.top.odgenes) {
            odgenes <- rownames(df)[order(df$lp,decreasing=FALSE)[1:min.odgenes]]
          } else {
            return(NULL);
          }
        }
        sf <- df$gsf[match(odgenes,rownames(df))];
        
        if(return.pca && skip.pca) {
          return(list(sf=sf,cells=cells,odgenes=odgenes))
        }
        
        
        y <- t(t(x[cells,odgenes])*sf)
        cm <- Matrix::colMeans(y)
        # PCA
        pcs <- irlba(y, nv=nPcs, nu=0, center=cm, right_only=FALSE,fastpath=TRUE,reorth=TRUE)
        rownames(pcs$v) <- colnames(y);
        pcs$center <- cm;
        # row-randomize x to get a sense for the pcs
        m1 <- y;
        if(euclidean) {
          #for (i in 1:nrow(m1)) m1[i,] <- m1[i,order(runif(length(m1[i,])))]
          m1@i <- sample(m1@i)
          rpcas <- t(t(m1 %*% pcs$v) - t(pcs$center %*% pcs$v))
          pcs$rsd <- apply(rpcas,2,sd)
          pcs$trsd <- sd(dist(rpcas))
        }
        
        # sample within-cluster distances (based on main PCA)
        
        pcas <- as.matrix(t(t(t(t(x[,odgenes])*sf) %*% pcs$v) - t(pcs$center %*% pcs$v)))
        if(verbose) message(".")
        return(list(pcs=pcs,sf=sf,df=df,cells=cells,pcas=pcas,odgenes=odgenes))
      },n.cores=n.cores)
      if(verbose) message(" done\n");
      if(return.pca) return(gpcs)
      
      ivi <- unlist(lapply(gpcs,is.null))
      if(any(ivi)) { gpcs <- gpcs[!ivi]; rgroups <- rgroups[!ivi]; }
      
      if(debug) { browser() }
      
      
      # calculate cell relevance to each cluster (p_k,i matrix)
      # use global PCA distances
      if(verbose) message("calculating global distances ...");
      gcdist <- as.matrix(dist(gpcs[[1]]$pcas))
      if(verbose) message(" done.\n")
      
      # for each PCA
      # for each cell, determine p_k_i
      # for each PC
      # determine cell projections
      # subset genes
      # subtract center
      # scale, multiply
      # for each pair, determine cell distances
      # use cell distances to complete weight matrix
      # add to the w*d^2 and w matrices
      # normalize by the sqrt(sum(w))
      
      
      if(euclidean) {
        if(verbose) message("calculating local Euclidean distances .");
        dcs <- papply(gpcs,function(p) {
          pk <- rep(1,nrow(p$pcas)); names(pk) <- rownames(p$pcas);
          nci <- setdiff(rownames(gcdist),p$cells)
          if(length(nci)>0) {
            # determine within cluster sd
            scells <- sample(p$cells,min(1e3,length(p$cells)))
            cldsd <- sd(as.numeric(gcdist[scells,scells]))
            ncid <- rowMeans(gcdist[nci,scells])
            pk[nci] <- exp(-b*(ncid/cldsd)^2)
          }
          dsq <- as.matrix(dist(p$pcas)^2)
          w <- (1-exp(-a*(dsq)/(p$pcs$trsd^2))) * (pk %o% pk)
          if(verbose) message(".")
          list(dsq=dsq,w=w)
        },n.cores=n.cores)
        if(verbose) message(".")
        d <- Reduce('+',lapply(dcs,function(x) x$dsq*x$w))
        if(verbose) message(".")
        d <- sqrt(d/Reduce('+',lapply(dcs,function(x) x$w)));
        diag(d) <- 0
        if(verbose) message(" done.\n")
      } else {
        # weighted correlation
        if(verbose) message("calculating local correlation distances .");
        dcs <- papply(gpcs,function(p) {
          pk <- rep(1,nrow(p$pcas)); names(pk) <- rownames(p$pcas);
          nci <- setdiff(rownames(gcdist),p$cells)
          if(length(nci)>0) {
            # determine within cluster sd
            scells <- sample(p$cells,min(1e3,length(p$cells)))
            cldsd <- sd(as.numeric(gcdist[scells,scells]))
            ncid <- rowMeans(gcdist[nci,scells])
            pk[nci] <- exp(-b*(ncid/cldsd)^2)
          }
          x <- cov(t(p$pcas))*(ncol(p$pcas)-1)
          xc <- x / sqrt(diag(x) %o% diag(x)) # correlation
          w <- (1-exp(-a*(1-xc))) * (pk %o% pk)
          
          if(verbose) message(".")
          list(x=x,w=w)
        },n.cores=n.cores)
        if(verbose) message(".")
        # calculate sum_{k}_{w_k*v} matrix
        wm <- Reduce('+',lapply(dcs,function(z) z$w*diag(z$x)))
        d <- Reduce('+',lapply(dcs,function(z) z$x*z$w))
        d <- 1-d/sqrt(wm*t(wm))
        diag(d) <- 0
        if(verbose) message(" done.\n")
      }
      
      
      
      ## d <- dcs[[1]]$dsq
      ## d <- as.matrix(dist(r$reductions$PCA))
      ## knn <- apply(d,2,function(x) order(x,decreasing=F)[1:(k+1)])
      ## cat(".")
      ## m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=rep(1,nrow(knn)*ncol(knn)))
      ## m <- m+t(m); # symmetrize
      ## diag(m) <- 0;
      ## rownames(m) <- colnames(m) <- rownames(d)
      ## x <- list(m=m)
      ## i <- 5; cl <- rep(NA,nrow(x$m)); names(cl) <- rownames(x$m); cl[rownames(x$m)[i]] <- 1; cl[which(x$m[,i]>0)] <- 2; cl <- as.factor(cl);
      ## r$plotEmbedding(type='PCA',embeddingType='tSNE',groups=cl,alpha=0.2,min.group.size=00,mark.clusters = TRUE, mark.cluster.cex=0.8,unclassified.cell.color=adjustcolor(1,alpha=0.1))
      
      ## i <- 5; cl <- 1/(dcs[[1]]$dsq[,i]+1e-6); names(cl) <- rownames(x$m);
      ## i <- 5; cl <- 1/(d[,i]+1e-6); names(cl) <- rownames(x$m);
      ## r$plotEmbedding(type='PCA',embeddingType='tSNE',colors=cl,alpha=0.2,min.group.size=00,mark.clusters = TRUE, mark.cluster.cex=0.8,unclassified.cell.color=adjustcolor(1,alpha=0.1))
      
      # kNN
      if(verbose) message("creating kNN graph .");
      knn <- apply(d,2,function(x) order(x,decreasing=FALSE)[1:(k+1)])
      if(verbose) message(".")
      #m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=rep(1,nrow(knn)*ncol(knn)))
      m <- sparseMatrix(i=as.numeric(knn),p=c(0,(1:ncol(knn))*nrow(knn)),dims=rep(ncol(knn),2),x=d[as.integer(t(t(knn)+((1:ncol(knn))-1)*nrow(d)))])
      m <- m+t(m); # symmetrize
      diag(m) <- 0;
      rownames(m) <- colnames(m) <- rownames(d)
      if(verbose) message(".")
      g <- graph_from_adjacency_matrix(m,mode='undirected',weighted=TRUE);
      if(verbose) message(".")
      graphs[[name]] <<- g;
      if(verbose) message(" done.\n")
      
      emb <- Rtsne::Rtsne(d,is_distance=TRUE, perplexity=perplexity, num_threads=n.cores)$Y;
      rownames(emb) <- colnames(d)
      embeddings[[type]][[name]] <<- emb;
      
      # calculate cell-cell distance, considering weighting
      
      # getting total cell-cell distance
      
      return(invisible(list(d=d,m=m,gpcs=gpcs)))
    },
    
    
    # test pathway overdispersion
    # this is a compressed version of the PAGODA1 approach
    # env - pathway to gene environment
    testPathwayOverdispersion=function(setenv, 
                                       type='counts', 
                                       max.pathway.size=1e3, 
                                       min.pathway.size=10, 
                                       n.randomizations=5, 
                                       verbose=FALSE, 
                                       score.alpha=0.05,
                                       plot=FALSE, 
                                       cells=NULL,
                                       adjusted.pvalues=TRUE,
                                       z.score = qnorm(0.05/2, lower.tail = FALSE), 
                                       use.oe.scale = FALSE, return.table=FALSE,
                                       name='pathwayPCA',
                                       correlation.distance.threshold=0.2,
                                       loading.distance.threshold=0.01,
                                       top.aspects=Inf,
                                       recalculate.pca=FALSE,
                                       save.pca=TRUE) {
      nPcs <- 1;
      
      if(type=='counts') {
        x <- counts;
        # apply scaling if using raw counts
        x@x <- x@x*rep(misc[['varinfo']][colnames(x),'gsf'],diff(x@p))
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      if(!is.null(cells)) {
        x <- x[cells,]
      }
      
      proper.gene.names <- colnames(x);
      
      if(is.null(misc[['pwpca']]) || recalculate.pca) {
        if(verbose) {
          message("determining valid pathways")
        }
        
        # determine valid pathways
        gsl <- ls(envir = setenv)
        gsl.ng <- unlist(mclapply(sn(gsl), function(go) sum(unique(get(go, envir = setenv)) %in% proper.gene.names),mc.cores=n.cores,mc.preschedule=TRUE))
        gsl <- gsl[gsl.ng >= min.pathway.size & gsl.ng<= max.pathway.size]
        names(gsl) <- gsl
        
        if(verbose) {
          message("processing ", length(gsl), " valid pathways")
        }
        
        cm <- Matrix::colMeans(x)
        
        pwpca <- papply(gsl, function(sn) {
          lab <- proper.gene.names %in% get(sn, envir = setenv)
          if(sum(lab)<1) { return(NULL) }
          pcs <- irlba(x[,lab], nv=nPcs, nu=0, center=cm[lab])
          pcs$d <- pcs$d/sqrt(nrow(x))
          pcs$rotation <- pcs$v;
          pcs$v <- NULL;
          
          # get standard deviations for the random samples
          ngenes <- sum(lab)
          z <- do.call(rbind,lapply(seq_len(n.randomizations), function(i) {
            si <- sample(ncol(x), ngenes)
            pcs <- irlba(x[,si], nv=nPcs, nu=0, center=cm[si])$d
          }))
          z <- z/sqrt(nrow(x));
          
          # local normalization of each component relative to sampled PC1 sd
          avar <- pmax(0, (pcs$d^2-mean(z[, 1]^2))/sd(z[, 1]^2))
          
          if(avar>0.5) {
            # flip orientations to roughly correspond with the means
            pcs$scores <- as.matrix(t(x[,lab] %*% pcs$rotation) - as.numeric((cm[lab] %*% pcs$rotation)))
            cs <- unlist(lapply(seq_len(nrow(pcs$scores)), function(i) sign(cor(pcs$scores[i,], colMeans(t(x[, lab, drop = FALSE])*abs(pcs$rotation[, i]))))))
            pcs$scores <- pcs$scores*cs
            pcs$rotation <- pcs$rotation*cs
            rownames(pcs$rotation) <- colnames(x)[lab];
          } # don't bother otherwise - it's not significant
          return(list(xp=pcs,z=z,n=ngenes))
        }, n.cores = n.cores,mc.preschedule=TRUE)
        if(save.pca) {
          misc[['pwpca']] <<- pwpca;
        }
      } else {
        if(verbose) {
          message("re-using previous overdispersion calculations")
          pwpca <- misc[['pwpca']];
        }
      }
      
      if(verbose) {
        message("scoring pathway overdispersion signifcance")
      }
      
      # score overdispersion
      true.n.cells <- nrow(x)
      
      pagoda.effective.cells <- function(pwpca, start = NULL) {
        n.genes <- unlist(lapply(pwpca, function(x) rep(x$n, nrow(x$z))))
        var <- unlist(lapply(pwpca, function(x) x$z[, 1]))
        if(is.null(start)) { start <- true.n.cells*2 } # start with a high value
        of <- function(p, v, sp) {
          sn <- p[1]
          vfit <- (sn+sp)^2/(sn*sn+1/2) -1.2065335745820*(sn+sp)*((1/sn + 1/sp)^(1/3))/(sn*sn+1/2)
          residuals <- (v-vfit)^2
          return(sum(residuals))
        }
        x <- nlminb(objective = of, start = c(start), v = var, sp = sqrt(n.genes-1/2), lower = c(1), upper = c(true.n.cells))
        return((x$par)^2+1/2)
      }
      n.cells <- pagoda.effective.cells(pwpca)
      
      vdf <- data.frame(do.call(rbind, lapply(seq_along(pwpca), function(i) {
        vars <- as.numeric((pwpca[[i]]$xp$d))
        cbind(i = i, var = vars, n = pwpca[[i]]$n, npc = seq(1:ncol(pwpca[[i]]$xp$rotation)))
      })))
      
      # fix p-to-q mistake in qWishartSpike
      qWishartSpikeFixed <- function (q, spike, ndf = NA, pdim = NA, var = 1, beta = 1, lower.tail = TRUE, log.p = FALSE)  {
        params <- RMTstat::WishartSpikePar(spike, ndf, pdim, var, beta)
        qnorm(q, mean = params$centering, sd = params$scaling, lower.tail, log.p)
      }
      
      # add right tail approximation to ptw, which gives up quite early
      pWishartMaxFixed <- function (q, ndf, pdim, var = 1, beta = 1, lower.tail = TRUE) {
        params <- RMTstat::WishartMaxPar(ndf, pdim, var, beta)
        q.tw <- (q - params$centering)/(params$scaling)
        p <- RMTstat::ptw(q.tw, beta, lower.tail, log.p = TRUE)
        p[p == -Inf] <- pgamma((2/3)*q.tw[p == -Inf]^(3/2), 2/3, lower.tail = FALSE, log.p = TRUE) + lgamma(2/3) + log((2/3)^(1/3))
        p
      }
      
      vshift <- 0
      ev <- 0
      
      vdf$var <- vdf$var-(vshift-ev)*vdf$n
      basevar <- 1
      vdf$exp <- RMTstat::qWishartMax(0.5, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      #vdf$z <- qnorm(pWishartMax(vdf$var, n.cells, vdf$n, log.p = TRUE, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
      vdf$z <- qnorm(pWishartMaxFixed(vdf$var, n.cells, vdf$n, lower.tail = FALSE, var = basevar), lower.tail = FALSE, log.p = TRUE)
      vdf$cz <- qnorm(bh.adjust(pnorm(as.numeric(vdf$z), lower.tail = FALSE, log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE)
      vdf$ub <- RMTstat::qWishartMax(score.alpha/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      vdf$ub.stringent <- RMTstat::qWishartMax(score.alpha/nrow(vdf)/2, n.cells, vdf$n, var = basevar, lower.tail = FALSE)
      
      if(plot) {
        par(mfrow = c(1, 1), mar = c(3.5, 3.5, 1.0, 1.0), mgp = c(2, 0.65, 0))
        un <- sort(unique(vdf$n))
        on <- order(vdf$n, decreasing = FALSE)
        pccol <- colorRampPalette(c("black", "grey70"), space = "Lab")(max(vdf$npc))
        plot(vdf$n, vdf$var/vdf$n, xlab = "gene set size", ylab = "PC1 var/n", ylim = c(0, max(vdf$var/vdf$n)), col = adjustcolor(pccol[vdf$npc],alpha=0.1),pch=19)
        lines(vdf$n[on], (vdf$exp/vdf$n)[on], col = 2, lty = 1)
        lines(vdf$n[on], (vdf$ub.stringent/vdf$n)[on], col = 2, lty = 2)
      }
      
      rs <- (vshift-ev)*vdf$n
      vdf$oe <- (vdf$var+rs)/(vdf$exp+rs)
      vdf$oec <- (vdf$var+rs)/(vdf$ub+rs)
      
      df <- data.frame(name = names(pwpca)[vdf$i], npc = vdf$npc, n = vdf$n, score = vdf$oe, z = vdf$z, adj.z = vdf$cz, stringsAsFactors = FALSE)
      if(adjusted.pvalues) {
        vdf$valid <- vdf$cz  >=  z.score
      } else {
        vdf$valid <- vdf$z  >=  z.score
      }
      
      if(!any(vdf$valid)) { stop("no significantly overdispersed pathways found at z.score threshold of ",z.score) };
      
      # apply additional filtering based on >0.5 sd above the local random estimate
      vdf$valid <- vdf$valid & unlist(lapply(pwpca,function(x) !is.null(x$xp$scores)))
      vdf$name <- names(pwpca)[vdf$i];
      
      if(return.table) {
        df <- df[vdf$valid, ]
        df <- df[order(df$score, decreasing = TRUE), ]
        return(df)
      }
      if(verbose) {
        message("compiling pathway reduction")
      }
      # calculate pathway reduction matrix
      
      # return scaled patterns
      xmv <- do.call(rbind, lapply(pwpca[vdf$valid], function(x) {
        xm <- x$xp$scores
      }))
      
      if(use.oe.scale) {
        xmv <- (xmv -rowMeans(xmv))* (as.numeric(vdf$oe[vdf$valid])/sqrt(apply(xmv, 1, var)))
        vdf$sd <- as.numeric(vdf$oe)
      } else {
        # chi-squared
        xmv <- (xmv-rowMeans(xmv)) * sqrt((qchisq(pnorm(vdf$z[vdf$valid], lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells)/apply(xmv, 1, var))
        vdf$sd <- sqrt((qchisq(pnorm(vdf$z, lower.tail = FALSE, log.p = TRUE), n.cells, lower.tail = FALSE, log.p = TRUE)/n.cells))
        
      }
      rownames(xmv) <- paste("#PC", vdf$npc[vdf$valid], "# ", names(pwpca)[vdf$i[vdf$valid]], sep = "")
      rownames(vdf) <- paste("#PC", vdf$npc, "# ", vdf$name, sep = "")
      misc[['pathwayODInfo']] <<- vdf
      
      # collapse gene loading
      if(verbose) {
        message("clustering aspects based on gene loading ... ",appendLF=FALSE)
      }
      tam2 <- pagoda.reduce.loading.redundancy(list(xv=xmv,xvw=matrix(1,ncol=ncol(xmv),nrow=nrow(xmv))),pwpca,NULL,plot=FALSE,distance.threshold=loading.distance.threshold,n.cores=n.cores)
      if(verbose) {
        message(nrow(tam2$xv)," aspects remaining")
      }
      if(verbose) {
        message("clustering aspects based on pattern similarity ... ",appendLF=FALSE)
      }
      tam3 <- pagoda.reduce.redundancy(tam2,distance.threshold=correlation.distance.threshold,top=top.aspects)
      if(verbose) {
        message(nrow(tam3$xv)," aspects remaining\n")
      }
      tam2$xvw <- tam3$xvw <- NULL; # to save space
      tam3$env <- setenv;
      
      # clean up aspect names, as GO ids are meaningless
      names(tam3$cnam) <- rownames(tam3$xv) <- paste0('aspect',1:nrow(tam3$xv))
      
      misc[['pathwayOD']] <<- tam3;
      reductions[[name]] <<- tam3$xv;
      return(invisible(tam3))
    },
    
    getEmbedding=function(type='counts', embeddingType='largeVis', name=NULL, dims=2, M=1, gamma=1/M, perplexity=50, sgd_batches=NULL, diffusion.steps=0, diffusion.power=0.5, 
                          distance='pearson', n.cores = .self$n.cores, n.sgd.cores=n.cores, ... ) {
      
      if(dims<1) stop("dimensions must be >=1")
      if(type=='counts') {
        x <- counts;
      } else {
        if(!type %in% names(reductions)) { stop("reduction ",type,' not found')}
        x <- reductions[[type]]
      }
      if(is.null(name)) { 
        name <- embeddingType 
      }
      
      if(embeddingType=='largeVis') {
        edgeMat <- misc[['edgeMat']][[type]];
        if(is.null(edgeMat)) { stop(paste('KNN graph for type ',type,' not found. Please run makeKnnGraph with type=',type,sep='')) }
        if(is.null(sgd_batches)) { sgd_batches <- nrow(edgeMat)*1e3 }
        #edgeMat <- sparseMatrix(i=xn$s+1,j=xn$e+1,x=xn$rd,dims=c(nrow(x),nrow(x)))
        edgeMat <- (edgeMat + t(edgeMat))/2; # symmetrize
        #edgeMat <- sparseMatrix(i=c(xn$s,xn$e)+1,j=c(xn$e,xn$s)+1,x=c(xn$rd,xn$rd),dims=c(nrow(x),nrow(x)))
        # if(diffusion.steps>0) {
        #   Dinv <- Diagonal(nrow(edgeMat),1/colSums(edgeMat))
        #   Im <- Diagonal(nrow(edgeMat))
        #   W <- (Diagonal(nrow(edgeMat)) + edgeMat %*% Dinv)/2
        #   for(i in 1:diffusion.steps) {
        #     edgeMat <- edgeMat %*% W
        #   }
        # }
        #require(largeVis)
        #if(!is.null(seed)) { set.seed(seed) }
        if(!is.na(perplexity)) {
          wij <- buildWijMatrix(edgeMat,perplexity=perplexity,threads=n.cores)
        } else {
          wij <- edgeMat;
        }
        
        if(diffusion.steps>0) {
          Dinv <- Diagonal(nrow(wij),1/colSums(wij))
          W <- Dinv %*% wij ;
          W <- (1-diffusion.power)*Diagonal(nrow(wij)) + diffusion.power*W
          #W <- (Diagonal(nrow(wij)) + W)/2
          #W <- (Diagonal(nrow(wij)) + sign(W)*(abs(W)^(diffusion.power)))/2
          
          #W <- sign(W)*(abs(W)^diffusion.power)
          #W <- (Diagonal(nrow(wij)) + W)/2
          for(i in 1:diffusion.steps) {
            wij <- wij %*% W
          }
          #browser()
          if(!is.na(perplexity)) {
            wij <- buildWijMatrix(wij,perplexity=perplexity,threads=n.cores)
          }
          
        }
        coords <- projectKNNs(wij = wij, M = M, dim=dims, verbose = TRUE,sgd_batches = sgd_batches,gamma=gamma, seed=1, threads=n.cores, ...)
        colnames(coords) <- rownames(x);
        emb <- embeddings[[type]][[name]] <<- t(coords);
      } else if(embeddingType=='tSNE') {
        if(nrow(x)>4e4) {
          warning('Too many cells to pre-calcualte correlation distances, switching to L2. Please, consider using UMAP.');
          distance <- 'L2';
        }
        
        dup.ids <- which(duplicated(x))
        if (length(dup.ids) > 0) {
          max.vals <- abs(x[dup.ids,] * 0.01)
          x[dup.ids,] <- runif(length(x[dup.ids,]), -max.vals, max.vals)
        }
        
        if (distance=='L2') {
          if(verbose) message("running tSNE using ",n.cores," cores:\n")
          emb <- Rtsne::Rtsne(x, perplexity=perplexity, dims=dims, num_threads=n.cores, ... )$Y;
        } else {
          if(verbose) message('calculating distance ... ');
          if(verbose) message('pearson ...')
          d <- 1-cor(t(x))
          if(verbose) message("running tSNE using ",n.cores," cores:\n")
          emb <- Rtsne::Rtsne(d,is_distance=TRUE, perplexity=perplexity, dims=dims, num_threads=n.cores, ... )$Y;
        }
        rownames(emb) <- rownames(x)
        embeddings[[type]][[name]] <<- emb;
      } else if(embeddingType=='FR') {
        g <- graphs[[type]];
        if(is.null(g)){ stop(paste("generate KNN graph first (type=",type,")",sep=''))}
        emb <- layout.fruchterman.reingold(g, weights=E(g)$weight)
        rownames(emb) <- rownames(x); colnames(emb) <- c("D1","D2")
        embeddings[[type]][[name]] <<- emb;
      } else if (embeddingType == "UMAP") {
        if (!requireNamespace("uwot", quietly=TRUE))
          stop("You need to install package 'uwot' to be able to use UMAP embedding.")
        
        distance <- switch (distance, pearson = "cosine", L2 = "euclidean", distance)
        
        emb <- uwot::umap(as.matrix(x), metric=distance, verbose=verbose, n_threads=n.cores, n_sgd_threads=n.sgd.cores, n_components=dims, ...)
        rownames(emb) <- rownames(x)
        embeddings[[type]][[name]] <<- emb;
      } else if (embeddingType == "UMAP_graph") {
        g <- graphs[[type]];
        if(is.null(g)){ stop(paste("generate KNN graph first (type=",type,")",sep=''))}
        emb <- embedKnnGraphUmap(g, verbose=verbose, n_threads=n.cores, n_sgd_threads=n.sgd.cores, n_components=dims, ...)
        embeddings[[type]][[name]] <<- emb;
      } else {
        stop('unknown embeddingType ',embeddingType,' specified');
      }
      
      return(invisible(emb));
    }
  )
);

上一篇 下一篇

猜你喜欢

热点阅读