SingCellaR代码实操(五):基于TARGET-Seq测序
前言
通过前面连续4期推文,Immugent从多方面介绍了SingCellaR包的强大功能,但是这些分析流程都是基于普通的scRNAseq数据。今天,Immugent就来介绍如何使用SingCellaR对TARGET-Seq测序数据进行分析。
在正式讲解今天的分析流程之前,Immugent先来介绍一下何为TARGET-Seq。所谓TARGET-Seq,就是指通过杂交或者扩增对目标区域进行富集,通过NGS结合生物信息方法对目标区域进行组装和序列分析的一种技术。Target-seq技术通过设计多重PCR引物对目标区域DNA进行特异性扩增,在单管内进行多重PCR扩增,获得特定长度的PCR产物。另外,通过双端barcode原理区分不同样本混样建库,就可以对PCR产物进行高通量测序与分析。
好啦,下面开始今天的代码实操。
代码展示
library(SingCellaR)
MPN<-new("SingCellaR")
MPN@GenesExpressionMatrixFile<-"../SingCellaR_example_datasets/TARGET_Seq/counts_HT_TARGET_GRCh38.v2.txt"
MPN@CellsMetaDataFile<-"../SingCellaR_example_datasets/TARGET_Seq/HT_TARGETseq_colData.qc_Ghr38.txt"
load_gene_expression_from_a_file(MPN,isTargetSeq = T,sep="\t")
MPN
## An object of class SingCellaR with a matrix of : 33514 genes across 3593 samples.
TargetSeq_process_cells_annotation(MPN,mito_genes_start_with = "MT-",
ERCC_genes_start_with = "ERCC-")
TargetSeq_plot_cells_annotation(MPN,type = "boxplot")
image.png
TargetSeq_filter_cells_and_genes(MPN,min_Reads = 2000,
max_percent_mito = 10,
genes_with_expressing_cells = 10,max_percent_ERCC = 50)
TargetSeq_load_cell_metadata(MPN,sep = "\t",a_column_of_the_cell_names = 1)
head(MPN@meta.data)
## Cell sampleID UMI_count detectedGenesPerCell percent_mito percent_ERCC
## 1 TR4PL21_11B 2595 834 2.466281 0
## 2 TR4PL21_11C 4859 632 5.412636 0
## 3 TR4PL21_11D 5300 670 2.264151 0
## 4 TR4PL21_11E 5251 1271 3.599314 0
## 5 TR4PL21_11F 2282 798 3.637160 0
## 6 TR4PL21_11L 20403 1801 4.170955 0
## IsPassed cell.type donor timepoint donor.type primer.mix batch plate
## 1 TRUE cd34+ IF0137 baseline patient IF0137_mix batch_1 TR4PL21
## 2 TRUE cd34+ IF0137 baseline patient IF0137_mix batch_1 TR4PL21
## 3 TRUE cd34+ IF0137 baseline patient IF0137_mix batch_1 TR4PL21
## 4 TRUE cd34+ IF0137 baseline patient IF0137_mix batch_1 TR4PL21
## 5 TRUE cd34+ IF0137 baseline patient IF0137_mix batch_1 TR4PL21
## 6 TRUE cd34+ IF0137 baseline patient IF0137_mix batch_1 TR4PL21
## pool CD45RA CD90 CD38 lineage.viability CD34 CD123
## 1 TR4PL21_Q2 1502.17 2621.87 5156.63 12043.33 29906.12 3428.90
## 2 TR4PL21_Q1 1782.70 1887.77 14743.83 16879.47 21457.04 1700.39
## 3 TR4PL21_Q2 995.36 1545.84 7552.66 4882.79 17353.63 1342.27
## 4 TR4PL21_Q1 748.31 16677.71 11011.12 14496.51 29338.79 3951.90
## 5 TR4PL21_Q2 1348.57 2874.07 19794.60 12866.39 18098.87 2719.89
## 6 TR4PL21_Q2 2213.63 2963.91 37265.15 18894.65 10691.38 649.52
## genotypesall qcgenotype genes.detected
## 1 JAK2_HET_U2AF1_HET_TET2_pI1105_HET_ASXL1_p910_HET TRUE 839
## 2 JAK2_HET_U2AF1_HET_ASXL1_p910_HET TRUE 636
## 3 JAK2_HET_U2AF1_HET TRUE 702
## 4 JAK2_HET_U2AF1_HET_ASXL1_p897_HET TRUE 1250
## 5 JAK2_HET_U2AF1_HET_TET2_pI1105_HET_ASXL1_p910_HET TRUE 812
## 6 JAK2_HOM_U2AF1_HET TRUE 1830
## input.reads unique.mapped multimap MReads lib.size unmapped ercc.count
## 1 7566 4235 551 131 2741 2780 134
## 2 17580 10089 1658 445 5561 5833 696
## 3 16977 10024 1497 296 6301 5456 554
## 4 39883 9074 1282 336 5614 29527 354
## 5 30008 4791 658 158 2450 24559 177
## 6 46909 29782 4315 1330 21983 12812 748
## pc.uniquemapped pc.multimap pc.unmapped pc.ingenes pc.ercc pc.mit.reads
## 1 0.5597409 0.07282580 0.3674333 0.6472255 0.04888727 0.017314301
## 2 0.5738908 0.09431172 0.3317975 0.5511944 0.12515735 0.025312856
## 3 0.5904459 0.08817812 0.3213760 0.6285914 0.08792255 0.017435354
## 4 0.2275155 0.03214402 0.7403405 0.6186908 0.06305664 0.008424642
## 5 0.1596574 0.02192749 0.8184151 0.5113755 0.07224490 0.005265263
## 6 0.6348888 0.09198661 0.2731246 0.7381304 0.03402629 0.028352768
## genotype.class rnaseq.qc qc.all
## 1 realclone TRUE TRUE
## 2 realclone TRUE TRUE
## 3 realclone TRUE TRUE
## 4 realclone TRUE TRUE
## 5 realclone TRUE TRUE
## 6 realclone TRUE TRUE
my.cell_info<-subset(MPN@meta.data,qc.all=="TRUE")
my.cell_filtered<-subset(my.cell_info, UMI_count > 2000 & percent_mito < 10 & detectedGenesPerCell > 500 & pc.ercc < 50)
nrow(my.cell_filtered)
## [1] 2679
my.cell_info$IsPassed[my.cell_info$Cell %in% my.cell_filtered$Cell]<-TRUE
table(my.cell_info$IsPassed)
##
## FALSE TRUE
## 119 2679
MPN@meta.data<-my.cell_info
Classification of cells base on genotypes
cell_metadata<-get_cells_annotation(MPN)
cell_metadata$genotype_class1<-"NA"
JAK2_only<-c("\\bJAK2_HET\\b","\\bJAK2_HOM\\b")
cell_metadata$genotype_class1[grep(paste(JAK2_only,collapse="|"),cell_metadata$genotypesall)]<-"JAK2_only"
WT_normal<-c("\\bWT\\b")
cell_metadata$genotype_class1[grep(paste(WT_normal,collapse="|"),cell_metadata$genotypesall)]<-"WT_normal"
WT_patient<-c("\\bJAK2_WT\\b")
cell_metadata$genotype_class1[grep(paste(WT_patient,collapse="|"),cell_metadata$genotypesall)]<-"WT_patient"
JAK2_spliceosome<-c("JAK2_HET_SRSF2_HET","JAK2_HET_U2AF1_HET","JAK2_HOM_CBL_p404_HET_SRSF2_HET","JAK2_HOM_SF3B1_HET","JAK2_HOM_SRSF2_HET",
"JAK2_HOM_U2AF1_HET")
cell_metadata$genotype_class1[grep(paste(JAK2_spliceosome,collapse="|"),cell_metadata$genotypesall)]<-"JAK2_spliceosome"
JAK2_spliceosome_epigenetic<-c("JAK2_HET_U2AF1_HET_ASXL1_p910_HET","JAK2_HET_U2AF1_HET_TET2_pI1105_HET_ASXL1_p910_HET",
"JAK2_HET_U2AF1_HET_ASXL1_p897_HET")
cell_metadata$genotype_class1[grep(paste(JAK2_spliceosome_epigenetic,collapse="|"),cell_metadata$genotypesall)]<-"JAK2_spliceosome_epigenetic"
JAK2_epigenetic<-c("JAK2_HOM_ASXL1_p644_HET","JAK2_HOM_TET2_p1612_HET")
cell_metadata$genotype_class1[grep(paste(JAK2_epigenetic,collapse="|"),cell_metadata$genotypesall)]<-"JAK2_epigenetic"
cell_metadata_for_analysis<-subset(cell_metadata,genotype_class1!="NA") #2742 cells considered for this analysis
nrow(cell_metadata_for_analysis)
## [1] 2742
MPN@meta.data<-cell_metadata_for_analysis
normalize_UMIs(MPN,use.scaled.factor = F)
get_variable_genes_by_fitting_GLM_model(MPN,mean_expr_cutoff = 1,disp_zscore_cutoff = 0.05,
quantile_genes_expr_for_fitting = 0.5,
quantile_genes_cv2_for_fitting =0.3)
remove_unwanted_genes_from_variable_gene_set(MPN,gmt.file = "../SingCellaR_example_datasets/Human_genesets/human.ribosomal-mitocondrial.genes.gmt",
removed_gene_sets=c("Ribosomal_gene","Mitocondrial_gene"))
plot_variable_genes(MPN,quantile_genes_expr_for_fitting = 0.5,quantile_genes_cv2_for_fitting =0.3)
image.png
runPCA(MPN,use.components=30,use.regressout.data = F)
runUMAP(MPN,dim_reduction_method = "pca",n.dims.use = 10,n.neighbors = 20,
uwot.metric = "euclidean")
plot_umap_label_by_a_feature_of_interest(MPN,feature = "donor",point.size = 1)
remove_unwanted_confounders(MPN,residualModelFormulaStr = "~donor+batch",
preserved_feature = "~genotype_class1")
image.png
UMAP analysis after regressing out batche and donor effects
runPCA(MPN,use.components=30,use.regressout.data = T)
runUMAP(MPN,dim_reduction_method = "pca",n.dims.use = 10,n.neighbors = 20,
uwot.metric = "euclidean")
plot_umap_label_by_a_feature_of_interest(MPN,feature = "donor",point.size = 1)
runFA2_ForceDirectedGraph(MPN,dim_reduction_method = "pca",n.dims.use = 10)
plot_forceDirectedGraph_label_by_a_feature_of_interest(MPN,feature = "genotype_class1",
background.color = "black",vertex.size = 1)
Differential gene expression analysis
WT_normal.cells<-subset(MPN@meta.data,genotype_class1=="WT_normal")
WT_normal.cell_ids<-WT_normal.cells$Cell
WT_patient.cells<-subset(MPN@meta.data,genotype_class1=="WT_patient")
WT_patient.cell_ids<-WT_patient.cells$Cell
DE.genes<-identifyDifferentialGenes(objectA = MPN, objectB = MPN, cellsA = WT_normal.cell_ids, cellsB = WT_patient.cell_ids)
head(DE.genes)
sig.genes<-subset(DE.genes,adjusted.pval < 0.05 & (ExpFractionA > 0.25 | ExpFractionB > 0.25) & abs(log2FC) > 2)
sig.genes<-sig.genes[order(abs(sig.genes$log2FC),decreasing = T),]
sig.genes
plot_violin_for_differential_genes(objectA = MPN, objectB = MPN, cellsA = WT_normal.cell_ids, cellsB = WT_patient.cell_ids,
gene_list = rownames(sig.genes)[1:5],take_log2 = T,point.size = 1,point.alpha = 0.5)
plot_heatmap_for_differential_genes(objectA = MPN, objectB = MPN, cellsA = WT_normal.cell_ids, cellsB = WT_patient.cell_ids,
gene_list = rownames(sig.genes))
image.png
image.png
preranked.genes<-identifyGSEAPrerankedGenes(objectA = MPN, objectB = MPN,
cellsA = WT_normal.cell_ids, cellsB = WT_patient.cell_ids)
GSEA.results<-Run_fGSEA_analysis(objectA = MPN, objectB = MPN, eps = 0,
cellsA = WT_normal.cell_ids, cellsB = WT_patient.cell_ids,
GSEAPrerankedGenes_info = preranked.genes,
gmt.file = "../SingCellaR_example_datasets/Human_genesets/h.all.v7.1.symbols.gmt",
plotGSEA = T) save(MPN,file="../SingCellaR_example_datasets/Psaila_et_al/SingCellaR_objects/MPN_v0.1.SingCellaR.rdata")
展望
好啦,截至到本期,有关SingCellaR包的系列推文就已经全部更新完毕。从本系列的6篇推文中,我们可以深刻感受到SingCellaR包的功能是十分全面的,有需要的小伙伴赶紧使用起来吧。除此之外,后续Immugent将会根据SingCellaR在已经发表文章中的使用情况,酌情加更一期有关SingCellaR的应用的推文,敬请期待!
单细胞测序技术发展到现在已经相当成熟了,为我们提供的数据资源也有很多,如何将这些公共数据重新利用起来解决新的科学问题将会为我们节省大量的资源。同时,类似于SingCellaR等工具的开发,极大的助力了对公共数据的再利用,欢迎大家通过后台积极的向我们推荐类似的工具,我们下期再会~~