scTyper

2023-11-06  本文已影响0人  LET149

https://github.com/omicsCore/scTyper

1. 安装

# install BiocManager if necessary
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

# install devtools if necessary
BiocManager::install("devtools")
library(devtools)

# install the scTyper package
devtools::install_github("omicsCore/scTyper")

2. 使用

以下是一个脚本示例

require(infercnv)
require(ComplexHeatmap)
require(scTyper)
require(Seurat)
require(knitr)
require(kableExtra)

'#设置工作路径
setwd("/Users/let149/Desktop/Dmel/SRR9705086/Output")

#导入经Seurat处理后的数据
load(file = "/Users/let149/Desktop/Dmel/SRR9705086/RData/SRR9705086_raw_top_half.Rdata")

#为数据的meta.data添加后续处理需要的列
SRR9705086_raw_top_half@meta.data[["sample_id"]] <- "SRR9705086"  #新添加scTyper需要使用的数特征
SRR9705086_raw_top_half@meta.data[["sample.name"]] <- "SRR9705086"
SRR9705086_raw_top_half@meta.data[["tissue.type"]] <- "SRR9705086_testis"
SRR9705086_raw_top_half@meta.data[["cell.type.ref"]] <- "SRR9705086_testis"

#自建marker
#每个细胞类型的marker是一个数据框,此数据框只有一列,其中每行为一个marker基因
#所有细胞类型的marker构成一个列表

Early_spermatids <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_Early_spermatids.tsv", col.names = F)
Early_spermatids <- Early_spermatids[,1]

Late_spermatocytes <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_Late_spermatocytes.tsv", col.names = F)
Late_spermatocytes <- Late_spermatocytes[,1]

Early_spermatocytes <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_Early_spermatocytes.tsv", col.names = F)
Early_spermatocytes <- Early_spermatocytes[,1]

Hub_cells <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_Hub_cells.tsv", col.names = F)
Hub_cells <- Hub_cells[,1]

Cyst_cells <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_Cyst.tsv", col.names = F)
Cyst_cells <- Cyst_cells[,1]

Late_spermatogonia <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_Late_spermatogonia.tsv", col.names = F)
Late_spermatogonia <- Late_spermatogonia[,1]

Early_spermatogonia <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_GSC_Early_spermatogonia.tsv", col.names = F)
Early_spermatogonia <- Early_spermatogonia[,1]

Epithelial_cells <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_Epithelial_cells.tsv", col.names = F)
Epithelial_cells <- Epithelial_cells[,1]

Mature_spermatids <- read.csv(file="/Users/let149/Desktop/Dmel/SRR9705086/Marker/Marker_e-value_top9/Marker_e-value_top9_of_Mature_spermatids.tsv", col.names = F)
Mature_spermatids <- Mature_spermatids[,1]

marker_top9_list <- list(Early_spermatids=Early_spermatids, Early_spermatocytes=Early_spermatocytes, Hub_cells=Hub_cells, Cyst_cells=Cyst_cells, Late_spermatogonia=Late_spermatogonia, Early_spermatogonia=Early_spermatogonia, Epithelial_cells=Epithelial_cells, Mature_spermatids=Mature_spermatids, Late_spermatocytes=Late_spermatocytes)

#载入pheno.fn文件的路径  #此文件要遵循固定的格式,见下图
dir.pheno.fn <- "/Users/let149/Desktop/Dmel/SRR9705086/Marker/pheno.fn.csv"

#对导入后的数据进行重新聚类,当以下使用scTyper进行cluster注释时,默认被注释的cluster是当前活跃的cluster
SRR9705086_raw_top_half <- FindNeighbors(SRR9705086_raw_top_half, dims = 1:13)
SRR9705086_raw_top_half <- FindClusters(SRR9705086_raw_top_half, resolution = 9)

#运行scTyper
SRR9705086.NTP.cluster=scTyper(seurat.object=SRR9705086_raw_top_half,  #对象名
                                        marker=marker_top9_list,  #marker列表名
                                        wd = getwd(),
                                        output.name = "SRR9705086.raw.top.half_NTP_cluster_cluster57_based.on.top9.marker.output",  #输出结果目录名
                                        pheno.fn = dir.pheno.fn ,  #此文件在上面给出了形式
                                        qc = FALSE,
                                        run.cellranger=FALSE,
                                        norm.seurat=FALSE,
                                        cell.typing.method="NTP",  #可采用"NTP", "Average", "ES"三种方法之一
                                        level="cluster",  #可选择对"cluster"或者"cell"进行聚类
                                        run.inferCNV=FALSE,
                                        proj.name = "SRR9705086.scTyper",
                                        gene.ref.gtf=NULL,
                                        feature.to.test = "tissue.type",
                                        report.mode=TRUE,
                                        mc.cores = 6)

pheno.fn文件 :


图片.png

注意 : 文件名,文件格式,文件内容

3. scTyper 源码

#' @title scTyper
#' @description The main scTyper function
#' @param seurat.object Seurat object, if users have pre-processed seurat object, user have to insert seurat object as input
#' @param marker Cell markers to use in cell typing, character or List (#identifier or #StudyName or User defined gene list)
#' @param wd Working directory
#' @param output.name Output directory name
#' @param pheno.fn Phenotype file path
#' @param qc  Whether to execute FASTQC (default=FALSE)
#' @param run.cellranger whether to excute cellranger count (default=FALSE)
#' @param norm.seurat whether to normalize seurat object (default=FALSE)
#' @param cell.typing.method cell typing method, c("NTP", "ES", "Average"), (default = "NTP")
#' @param level Indicate the cell assignment level (cell or cluster)
#' @param run.inferCNV Indicate whether ‘malignant cell typing by inferCNV process run
#' @param proj.name Project name
#' @param fastqc.path FastQC program path
#' @param fastq.dir FastQC output directory
#' @param fq1.idx Index of the FASTQ file (Read 1)
#' @param fq2.idx Index of the FASTQ file (Read 2)
#' @param cellranger.path Cell Ranger program path
#' @param cellranger.ref.dir Directory of Cell Ranger reference file
#' @param percent.min.cells Cutoff to filter features containing minimum percent of cells
#' @param min.features Cutoff to filter cells containing minimum number of features
#' @param percent.mt Cutoff for filtering cells that have >n percent mitochondrial counts
#' @param vars.to.regress Variables to regress out
#' @param dims A vector of the dimensions to use in construction of the SNN graph.
#' @param resolution Value of the resolution parameter, use a value above (below) 1.0 if you want to obtain a larger (smaller) number of communities.
#' @param slot Data type of Seurat object, c("scale.data", "count.data", "data")
#' @param assay Assay of Seurat object
#' @param NTP.g.filter.method Method to filter genes in NTP
#' @param NTP.gene.filter.cutoff Cutoff to filter genes of in NTP
#' @param NTP.distance NTP distance method, a character, either c("correlation" or "cosine").
#' @param NTP.norm.method NTP normalization method, either c("none", "row.std")
#' @param gene.ref.gtf Path of GTF file including genomic location for genes
#' @param feature.to.test Column header name of the meta data in Seurat object (select the cell groups for T.test) either "tissue.type" or "cell.type"
#' @param cells.test_excluded A value indicates the cells to be excluded in T.test
#' @param cells.test_reference A value indicates the cells to use as be excluded in T.test
#' @param fc.cutoff Cutoff of fold change
#' @param cutoff.gene.cluster A cutoff P-value for filtering out the gene clusters (calculated from GO analysis)
#' @param malignant.cell.type Cell type to assign  malignant cell
#' @param report.mode Generate report file
#' @param mc.cores The number of cores to use. Must be at least one(default=1), and parallelization requires at least two cores.
#' @return Seurat object
#' @export
scTyper<- function(seurat.object=NULL,
                   marker="Puram.2017.HNSCC.TME",
                   
                   wd=getwd(),
                   output.name = "test.result",
                   pheno.fn,
                   qc = FALSE,
                   run.cellranger=FALSE,
                   norm.seurat=FALSE,
                   cell.typing.method=c("NTP", "ES", "Average"),
                   level=c("cell","cluster"),
                   run.inferCNV=TRUE,
                   proj.name = "scTyper",
                   
                   fastqc.path=NULL,
                   fastq.dir=NULL,
                   fq1.idx="_R1_001.fastq",
                   fq2.idx="_R2_001.fastq",
                   
                   cellranger.path=NULL,
                   cellranger.ref.dir=NULL,
                   
                   percent.min.cells=0.1,
                   min.features=200,
                   percent.mt=10,
                   vars.to.regress=c("nCount_RNA", "percent.mt"),
                   dims=1:10,
                   resolution=2,
                   
                   slot=c("scale.data", "count.data", "data"),
                   assay="RNA",
                   NTP.g.filter.method=c("sd", "mad", "none"),
                   NTP.gene.filter.cutoff=0.3,
                   NTP.distance=c("cosine","correlation"),
                   NTP.norm.method=c("none", "row.std"),
                   
                   gene.ref.gtf=NULL,
                   feature.to.test = c("tissue.type", "cell.type"),
                   cells.test_excluded="Epithelial",
                   cells.test_reference = "immune",
                   fc.cutoff=0.05,
                   cutoff.gene.cluster=0.05,
                   malignant.cell.type="Epithelial",
                   report.mode=TRUE,
                   mc.cores = 1){
    
    start_time <- Sys.time()
    cell.typing.method=match.arg(cell.typing.method)
    level=match.arg(level)
    slot=match.arg(slot)
    NTP.gene.filter.method=match.arg(NTP.g.filter.method)
    NTP.distance=match.arg(NTP.distance)
    NTP.norm.method=match.arg(NTP.norm.method)
    feature.to.test=match.arg(feature.to.test)
    
    library(fastqcr)
    library(Seurat)
    library(infercnv)
    library(gProfileR)
    library(rmarkdown)
    library(parallel)
    library(perm)
    library(Biobase)
    library(reshape2)
    library(limma)
    library(GenomicRanges)
    library(ggplot2)
    library(grid)
    library(gridExtra)
    library(grDevices)
    library(ComplexHeatmap)
    library(circlize)
    library(png)
    library(knitr)
    library(kableExtra)
    library(pander)
    library(colorspace)
    
    # default outdirs
    qc.dir <- file.path(wd, output.name, "00_qc")
    count.dir <- file.path(wd, output.name, "01_count")
    ntp.dir <- file.path(wd, output.name, "02_NTP")
    inferCNV.dir  <- file.path(wd, output.name, "03_inferCNV")
    rda.dir <- file.path(wd, output.name, "data")
    
    
    
    #######################
    ### start scTyper
    dir.create(file.path(wd, output.name), showWarnings = F)
    dir.create(file.path(rda.dir), showWarnings = F)
    
    pheno.df = read.csv(pheno.fn, header = T, sep = ",")
    sample.name = as.character(pheno.df$Sample_ID)
    
    if(qc==TRUE){
        fastqc.outs = fastqc(fastqc.path, fastq.dir=fastq.dir, sample.name, fq1.idx=fq1.idx, fq2.idx=fq2.idx, output.dir=qc.dir, run.cmd=TRUE, mc.cores=mc.cores)
    }
    
    if(run.cellranger==TRUE){
        count.out.dirs = CellrangerCount(cellranger.path, fastq.dir=fastq.dir, output.dir=count.dir, cellranger.ref.dir=cellranger.ref.dir, sample.name=sample.name, run.cmd=TRUE, mc.cores=mc.cores)
        setwd(wd)
        metrics_summary = make.stat_summary(count.dir = count.dir, sample.name = sample.name, pheno.df=pheno.df, output.dir = count.dir)
    }
    
    if(norm.seurat==TRUE){
        seurat = run.seurat.process(count.dir = count.dir, rda.dir = rda.dir, project = proj.name, sample.name = sample.name, metrics_summary,
                                    percent.min.cells=percent.min.cells, min.features=min.features, percent.mt=percent.mt, vars.to.regress=vars.to.regress, dims=dims,
                                    resolution=resolution)
    }
    
    if(class(seurat.object)=="Seurat") seurat=seurat.object
    
    ##cell typing
    seurat=cell.typing.seurat(seurat=seurat, marker=marker, cell.typing.method=cell.typing.method, level=level, wd=wd, slot=slot, assay=assay, ntp.dir=ntp.dir, rda.dir=rda.dir, NTP.g.filter.method=NTP.g.filter.method, NTP.gene.filter.cutoff=NTP.gene.filter.cutoff, NTP.distance=NTP.distance, NTP.norm.method=NTP.norm.method, mc.cores=mc.cores)
    
    if(run.inferCNV==TRUE){
        seurat@meta.data$tissue.type = pheno.df[match(seurat@meta.data$sample.name, pheno.df[,"Sample_ID"]),"TissueType"]
        fdata = make.seurat.fdata(seurat, rda.dir, gene.ref.gtf = gene.ref.gtf)
        
        seurat = run.inferCNV(seurat = seurat, assay=assay, fdata, pheno_info = pheno.df, output.dir = inferCNV.dir, rda.dir = rda.dir, feature.to.test=feature.to.test, cells.test_reference=cells.test_reference, cells.test_excluded=cells.test_excluded, fc.cutoff=fc.cutoff, cutoff.gene.cluster=cutoff.gene.cluster, mc.cores=mc.cores)
        
        seurat = malignant.cellTyper(seurat = seurat, rda.dir = rda.dir, cells.test_reference=cells.test_reference, malignant.cell.type=malignant.cell.type)
        
    }
    end_time <- Sys.time()
    runtime = format(end_time-start_time, format="%H")
    #report
    envList=as.list(environment())
    if(report.mode) {report(envList, qc.dir, output.dir=file.path(wd, output.name))}
    
    return(seurat)
}
上一篇 下一篇

猜你喜欢

热点阅读