用MCP法计算TCGA样本中的免疫浸润
2020-11-16 本文已影响0人
碌碌无为的杰少
下载TCGA FPKM数据
rm(list = ls())
# 安装加载需要的R包
# install.packages("pacman", repos = 'https://mirror.lzu.edu.cn/CRAN/')
library(pacman)
p_load(TCGAbiolinks, tidyverse, magrittr, data.table, biomaRt)
library(dplyr)
# 设置工作目录
projectPath = "C:/Users/81494/Desktop/changai fpkm" # 替换为真实的工作目录
dataPath = paste(projectPath, "Data", sep = "/")
if(!dir.exists(dataPath)) dir.create(dataPath) # 新建一个Data文件夹,用于存放从TCGA上下载的数据
setwd(dataPath)
library(dplyr)
#### 使用TCGAbiolinks下载Counts和FPKM-UQ数据 ----
#https://www.bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html
for(data_type in c("HTSeq - Counts", "HTSeq - FPKM")){
repeat{
query = try(GDCquery(project = "TCGA-COAD", legacy = FALSE, experimental.strategy = "RNA-Seq", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = data_type), silent = TRUE)
if(class(query) != "try-error") {
break
}else{
print("Try to connect to GDC...")
}
}
GDCdownload(query, files.per.chunk = 100) # 分批下载
# 数据预处理
dataAssy = GDCprepare(query, summarizedExperiment = F)
# # 保存变量,以备后用
# filename = paste0("TCGA-LIHC.RNA.query.", str_match(data_type, "- ([^\\s]*)$")[,2], ".RData")
# save(query, dataAssy, file = filename)
# load(filename)
# 数据整理
exp_data = dataAssy %>%
filter(str_detect(X1, "^ENSG")) %>% # 提取ENSG*注释的基因
mutate(X1 = gsub("\\..*", "", X1)) %>% # 删除版本号
rename(ensembl_gene_id = X1) %>% # "X1"重命名为"ensembl_gene_id"
set_colnames(str_replace(colnames(.), "(-\\w*){3}$", "")) # 修改TCGA样本名
# 上两行等同于:
# set_colnames(c("ensembl_gene_id", str_match(colnames(.)[-1], "(TCGA-[^-]*-[^-]*-[^-]*)")[,2])) # 修改列名
exp_data[1:3,1:3]
# 输出
outfile = paste0("COAD_Portal_RNA_", str_match(data_type, "- ([^\\s]*)$")[,2], ".txt")
fwrite(exp_data, outfile, row.names = F, sep = "\t", quote = F)
下载TCGA临床数据
XENA官网下载整合好的数据
p_load(data.table, tidyverse, magrittr)
clinic_data = read.table("COAD_clinicalMatrix", header = T, sep = "\t")
surv_data = read.table("COAD_survival.txt", header = T, sep = "\t")
clinic_data = merge(clinic_data, surv_data, by.x = "sampleID", by.y = "sample")
rownames(clinic_data) <- clinic_data$sampleID
clinic_data=clinic_data[,c('sampleID','CDE_ID_3226963','MSI_updated_Oct62011','loss_expression_of_mismatch_repair_proteins_by_ihc','loss_expression_of_mismatch_repair_proteins_by_ihc_result','microsatellite_instability','pathologic_M','pathologic_N','pathologic_T','pathologic_stage','OS','OS.time','DSS','DSS.time','PFI','PFI.time')]
整理表达矩阵
exp_FPKM = fread("COAD_Portal_RNA_FPKM.txt", h = T, sep = "\t", check.names = F) %>% column_to_rownames("ensembl_gene_id")
table(!duplicated(colnames(exp_FPKM)))
exp_FPKM = exp_FPKM[,!duplicated(colnames(exp_FPKM))]#去重复测序两次的标本
exp_FPKM_T=exp_FPKM[,str_sub(colnames(exp_FPKM),14,16)=="01A"]#肿瘤样本
colnames(exp_FPKM_T) <- str_sub(colnames(exp_FPKM_T),1,15)
dd1 <- exp_FPKM_T[1:20,1:20]
提取mRNA和gene名转换
load("anno.Rdata")
#step4:表达矩阵拆分和注释
mRNA_anno <- mRNA_anno %>%
tidyr::separate(gene_id,into = c("gene_id","drop"),sep="\\.")
sum(rownames(exp_FPKM_T) %in% mRNA_anno$gene_id)
mRNA_exp = exp_FPKM_T[rownames(exp_FPKM_T) %in% mRNA_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp_FPKM_T))
x = dplyr::inner_join(tmp,mRNA_anno,by = "gene_id")
#inner_join不改变顺序,可以确认一下
table(!duplicated(x$gene_name))
#行名不允许重复,因此一个ensambelid对应多个symbol的需要去掉。
mRNA_exp = mRNA_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(mRNA_exp) = x$gene_name
mRNA_exp = na.omit(mRNA_exp)
表达矩阵和临床数据融合
table(colnames(mRNA_exp) %in% clinic_data$sample)
index <- intersect(colnames(mRNA_exp),clinic_data$sample)
mRNA_exp1 = mRNA_exp[,index]
clinic_data1 <- clinic_data[index,]
write.csv(clinic_data1,'clinic_data2.csv')
手工整理临床数据
在csv中剔除生存时间小于90天和NA的,和存活时间大于3000天的病例
表达矩阵和临床数据再融合
clinic_data2 <- read.csv('clinic_data2.csv',header = T)
rownames(clinic_data2) <- clinic_data2$sampleID
table(colnames(mRNA_exp) %in% clinic_data2$sampleID)
index1 <- intersect(colnames(mRNA_exp),clinic_data2$sampleID)
mRNA_exp2 = mRNA_exp[,index1]
clinic_data2 <- clinic_data2[index1,]
MCP法
install.packages("MCPcounter-master/Source/",repos=NULL,type="source")
library(MCPcounter)
genes <- data.table::fread("genes.txt",data.table = F)
results<- MCPcounter.estimate(mRNA_exp2,
featuresType= "HUGO_symbols",
probesets=probesets,
genes=genes)
results2 <- results[1:9,]#去除最后一行fbro细胞的行
library(pheatmap)
pheatmap(results2, #热图的数据
cluster_rows = F,#行聚类
cluster_cols = T,#列聚类,可以看出样本之间的区分度
##annotation_col =annotation_col, #标注样本分类
annotation_legend=TRUE, # 显示注释
show_rownames = T,
show_colnames = F,# 显示行名
scale = "row", #以行来标准化
color =colorRampPalette(c("blue", "white","red"))(100),#调色
#filename = "heatmap_F.pdf",#是否保存
图片.png
生存分析
results1 <- as.data.frame(t(results))
Last <- cbind(results1,clinic_data2)
Last$group = ifelse(Last$`B lineage` > median(Last$`B lineage`),'high','low')
library(survival)
library(survminer)
sfit1=survfit(Surv(OS.time, OS)~group, data=Last)
ggsurvplot(sfit1,pval =TRUE, data = Last,risk.table = TRUE)
sfit1=survfit(Surv(DSS.time, DSS)~group, data=Last)
ggsurvplot(sfit1,pval =TRUE, data = Last, risk.table = TRUE)
#surv.cut = surv_cutpoint(Last, time = "OS.time", event = "OS", #variables = "B lineage", minprop = 0.3)
# 最优分类值
#opticut = surv.cut$cutpoint$cutpoint
#opticut
图片.png