2021-07-19 singleCellNet

2021-07-19  本文已影响0人  汪小静

相关链接

https://github.com/pcahan1/singleCellNet

https://pcahan1.github.io/singleCellNet/

0. 准备数据

#install
install.packages("devtools")
devtools::install_github("pcahan1/singleCellNet")
library(singleCellNet)

#download the data
#training metadata, https://s3.amazonaws.com/cnobjects/singleCellNet/examples/sampTab_TM_053018.rda
#training expression matrix , https://s3.amazonaws.com/cnobjects/singleCellNet/examples/expTM_Raw_053018.rda
#query metadata, https://s3.amazonaws.com/cnobjects/singleCellNet/examples/stDat_beads_mar22.rda
#query expression matrix, https://s3.amazonaws.com/cnobjects/singleCellNet/examples/6k_beadpurfied_raw.rda
#human-mouse orthologs.https://s3.amazonaws.com/cnobjects/singleCellNet/examples/human_mouse_genes_Jul_24_2018.rda

#loading training data MOUSE
stTM <- utils_loadObject(fname = "sampTab_TM_053018.rda") #metadata
expTMraw <- utils_loadObject(fname = "expTM_Raw_053018.rda") #expression matrix

#loading query data HUMAN
stQuery <- utils_loadObject(fname = "stDat_beads_mar22.rda") #metadata
expQuery <- utils_loadObject(fname = "6k_beadpurfied_raw.rda") #expression matrix

#Ortholog conversion for cross species analysis
download.file("https://s3.amazonaws.com/cnobjects/singleCellNet/examples/human_mouse_genes_Jul_24_2018.rda", "human_mouse_genes_Jul_24_2018.rda")
oTab <- utils_loadObject(fname = "human_mouse_genes_Jul_24_2018.rda")
aa = csRenameOrth(expQuery = expQuery, expTrain = expTMraw, orthTable = oTab)
#> query genes in ortholog table =  15532 
#> training genes in ortholog table and query data =  14550
expQueryOrth <- aa[['expQuery']]
expTrainOrth <- aa[['expTrain']]

#若为同一物种
commonGenes<-intersect(rownames(expTrain), rownames(expQuery))
expTrain <- expTrain[commonGenes, ]
expQuery <- expQuery[commonGenes, ]

#seurat 对象的导入
seuratfile <- extractSeurat(sc, exp_slot_name = "counts")

sampTab = seuratfile$sampTab
expDat = seuratfile$expDat

1. Training the data 建立分类器

We use the same number of cells per cell type, i.e. balanced data, to train Top-Pair Random Forest classifier. The remaining of the data or the held-out data will be used as validation data to evaluate the performance of the TP-RF classifier. Empirically we have found 50 cells to be a minimal cell number to create a classifier with good performance, however it may vary depend on the quality of your reference data, so it is really important to assess the performance of your classifier before proceeding to query your sample of interest.

stList<-splitCommon(sampTab = stTM, ncells = 50, dLevel = "newAnn")#以newAnn分类细胞
alveolar macrophage : 62 
B cell : 3134 
bladder urothelial cell : 759 
bladder_mesenchymal : 859 
cardiac muscle cell : 60 
cardiac_fibroblast : 222 
chondrocyte-like : 165 
endocardial cell : 52 
endothelial cell : 1890 
erythroblast : 152 
erythrocyte : 74 
granulocyte : 520 
hematopoietic precursor cell : 117 
hepatocyte : 882 
keratinocyte : 1203 
kidney capillary endothelial cell : 117 
kidney proximal straight tubule epithelial cell : 618 
kidney_duct_epithelial : 355 
late pro-B cell : 141 
limb_mesenchymal : 540 
luminal epithelial cell of mammary gland : 137 
lung_mammary_stromal : 2072 
macrophage : 1340 
mammary_basal_cell : 115 
monocyte : 370 
natural killer cell : 600 
neuroendocrine cell : 282 
skeletal muscle satellite cell : 190 
T cell : 1823 
tongue_basal_cell : 1726 
trachea_epithelial : 434 
trachea_mesenchymal : 3925 
stTrain<-stList[[1]]
expTrain <- expTrainOrth[,stTrain$cell]

stTest <- stList[[2]]
expTest <- expTrainOrth[,stTest$cell]

This training step includes:

#- If you increase nTopGenes and nTopGenePairs, you may get a even better classifier performance on query data!
class_info<-scn_train(stTrain = stTrain, expTrain = expTrain, nTopGenes = 10, nRand = 70, nTrees = 1000, nTopGenePairs = 25, dLevel = "newAnn", colName_samp = "cell")
#a list containing normalized expression data, classification gene list, cnPRoc

2. 评价分类器

#apply heldout data
system.time(classRes_val_all <- scn_predict(class_info[['cnProc']], expTest, nrand = 50))
#> Loaded in the cnProc
#> All Done
#>    user  system elapsed 
#>   0.208   0.012   0.220

#assessment
tm_heldoutassessment <- assess_comm(ct_scores = classRes_val_all, stTrain = stTrain, stQuery = stTest, dLevelSID = "cell", classTrain = "newAnn", classQuery = "newAnn")
plot_PRs(tm_heldoutassessment)

3. 进行分类

crPBMC <- scn_predict(class_info[['cnProc']], expQueryOrth, nrand = 50)
#需要对crPBMC过滤才能完成下一步
test = crPBMC[,colnames(crPBMC) %in% colnames(expQueryOrth)]
stQuery <- assign_cate(classRes = test, sampTab = stQuery, cThresh = 0.5) #选择classification score higher than 0.5

4. 可视化

#create labels for classification heatmap
sgrp<-as.vector(stQuery$description)
names(sgrp)<-rownames(stQuery)
grpRand<-rep("rand", 50)
names(grpRand)<-paste("rand_", 1:50, sep='')
sgrp<-append(sgrp, grpRand)

sc_hmClass(crPBMC, sgrp, max=5000, isBig=TRUE, cCol=F, font=8)

sc_violinClass(sampTab = stQuery,classRes = crPBMC, cellIDCol = "sample_name", dLevel = "description", ncol = 9)

sc_violinClass(sampTab = stQuery, classRes = crPBMC, cellIDCol = "sample_name", dLevel = "description", ncol = 9, sub_cluster = "B cell")

# attribution plot
plot_attr(sampTab = stQuery, classRes = crPBMC, nrand=50, sid="sample_name", dLevel="description")

#Visualize top pair gene expression
gpTab <- compareGenePairs(query_exp = expQueryOrth, training_exp = expTrainOrth, training_st = stTrain, classCol = "newAnn", sampleCol = "cell", RF_classifier = class_info$cnProc$classifier, numPairs = 20, trainingOnly = FALSE)
#> [1] "All Good"

sgrp<-as.vector(stQuery$prefix)
names(sgrp)<-rownames(stQuery)
train <- findAvgLabel(gpTab, stTrain = stTrain, dLevel = "newAnn")
sgrp <- append(sgrp, train)

hm_gpa_sel(gpTab, genes = class_info$cnProc$xpairs, grps = sgrp, maxPerGrp = 5)

5. 插入seurat 对象中进行可视化

sc@meta.data$category = stQuery$category
DimPlot(sc,group.by = "category",label = T)
上一篇下一篇

猜你喜欢

热点阅读