GEO单细胞测序数据生信分析复现
2021-06-26 本文已影响0人
18b79e7933ad
复现文章:Analyses of metastasis-associated genes in IDH wild-type glioma
数据来源:GSE84465
图1

数据筛选条件


除了min.cells min.features percent.mt,还有Pearson相关系数的指标(Fig S1)
根据Pearson相关系数大于0.4这个指标,去除了Tumor_BT_S1和Tumor_BT_S4两个样本,只针对剩余的6个样本分析
Fig S1

第一步:下载GSE84465_GBM_All_data.csv.gz和Metadata的SraRunTable.txt文件
第二步代码
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
a<-read.table("GSE84465_GBM_All_data.csv.gz")
head(rownames(a))
tail(rownames(a),10)
a=a[1:(nrow(a)-5),] #最后5行行名异常,剔除
b<-read.table("SraRunTable.txt",sep = ",",header = T) #样本信息
table(b$PATIENT_ID)
table(b$tissue)
table(b$tissue, b$PATIENT_ID)
new.b <- b[,c("Plate.ID","well","tissue","PATIENT_ID")]
new.b$sample <- paste0("X",b$Plate.ID,".",b$well)
head(new.b)
identical(colnames(a),new.b$sample)
pbmc <- CreateSeuratObject(counts = a)
head(pbmc@meta.data)
patient<-new.b$PATIENT_ID
pbmc <- AddMetaData(object = pbmc, metadata = patient, col.name = 'PATIENT_ID')
tissue<-new.b$tissue
pbmc <- AddMetaData(object = pbmc, metadata = tissue, col.name = 'Tissue')
sample<-paste0(tissue,"_",patient)
pbmc <- AddMetaData(object = pbmc, metadata = sample, col.name = 'Sample')
head(pbmc@meta.data)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
table(pbmc$PATIENT_ID,pbmc$Tissue)
###########################
#计算Pearson相关系数
pbmc.filt <- subset(pbmc, subset = PATIENT_ID=="BT_S6"&Tissue=="Periphery") #BT_S1 2 4 6 Tumor Periphery
dim(pbmc.filt)
plot1 <- FeatureScatter(pbmc.filt, feature1 = "nCount_RNA", feature2 = "percent.mt",group.by = "PATIENT_ID",pt.size=1.5)
plot2 <- FeatureScatter(pbmc.filt, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",group.by = "PATIENT_ID",pt.size=1.5)
plot1 + plot2
############################
selected_f <- rownames(pbmc)[Matrix::rowSums(pbmc@assays$RNA@counts > 0 ) > 3]
pbmc.filt <- subset(pbmc, subset = nFeature_RNA > 50 & nFeature_RNA < 6000 & percent.mt < 0.05 &
Sample%in%c("Periphery_BT_S1","Periphery_BT_S2","Periphery_BT_S4","Periphery_BT_S6","Tumor_BT_S2","Tumor_BT_S6"),
features = selected_f)
dim(pbmc.filt)
VlnPlot(object = pbmc.filt, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),group.by = "Sample")


#标准化
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
### 鉴定高变异基因
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 1500) #变异最大的1500个基因
#输出特征方差图
top10 <- head(VariableFeatures(pbmc), 10)
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10,repel = TRUE)
plot1+plot2

图2

#标准化
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
### 鉴定高变异基因
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 500) #变异最大的1500个基因
#输出特征方差图
top10 <- head(VariableFeatures(pbmc), 10)
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10,repel = TRUE)
plot1+plot2
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
pbmc <- ScaleData(pbmc, vars.to.regress = "percent.mt")
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
VizDimLoadings(object = pbmc, dims = 1:2, reduction = "pca",nfeatures = 20) #4个PC 20个基因
DimPlot(pbmc, reduction = "pca",group.by="Sample")
DimHeatmap(pbmc, dims = 1:2, cells = 500, balanced = TRUE,nfeatures = 30,ncol=2)
pbmc <- JackStraw(object = pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(object = pbmc, dims = 1:20)
JackStrawPlot(object = pbmc, dims = 1:20,reduction = "pca")
ElbowPlot(pbmc,reduction="pca") #根据ElbowPlot确定PC数量
##TSNE聚类分析
pbmc <- FindNeighbors(pbmc, dims = 1:20)
pbmc <- FindClusters(pbmc, resolution = 0.5)
#tSNE降维
pbmc <- RunTSNE(object = pbmc, dims = 1:20)
DimPlot(pbmc, reduction = "tsne",label = TRUE,pt.size = 1)





table(pbmc@meta.data$Tissue,pbmc@meta.data$seurat_clusters)

### 差异表达分析
# 找到每一个cluster当中的marker,并且只展示阳性的marker。
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
head(pbmc.markers, n = 10)
write.table(pbmc.markers,file = 'pbmc.markers.txt',sep = '\t',row.names=F,quote=F)
library("tidyverse")
sig.markers<- pbmc.markers %>% select(gene,everything()) %>%
subset(p_val_adj<0.05 & abs(pbmc.markers$avg_log2FC)>1)
dim(sig.markers)
write.table(sig.markers,file="sigmarkers.xls",sep="\t",row.names=F,quote=F)
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
#差异基因可视化
VlnPlot(object = pbmc, features = c("EGFR", "CCL3","AGXT2L1","DCN","GPR17","MOG","SYT1"),group.by = "seurat_clusters",pt.size = 1)

Cluster 2 3 6被定义为metastatic tumor cell,比较metastatic和non-mentastatic之间的基因表达差异
metastatic.markers <- FindMarkers(pbmc, ident.1 = c(2,3,6), ident.2 = c(0,1,4,5,7,8,9,10,11,12), min.pct = 0.25)
head(metastatic.markers,10)
sig.metastatic.markers<- metastatic.markers %>% select(colnames(metastatic.markers),everything()) %>%
subset(p_val_adj<0.05 & abs(metastatic.markers$avg_log2FC)>1)
dim(sig.metastatic.markers)
write.table(sig.metastatic.markers,file="sigmetastaticmarkers.xls",sep="\t",row.names=F,quote=F)