TCGA数据挖掘生信医学猴哥

用MCP法计算TCGA样本中的免疫浸润

2020-11-16  本文已影响0人  碌碌无为的杰少

下载TCGA FPKM数据

rm(list = ls())

# 安装加载需要的R包
# install.packages("pacman", repos = 'https://mirror.lzu.edu.cn/CRAN/')
library(pacman)
p_load(TCGAbiolinks, tidyverse, magrittr, data.table, biomaRt)
library(dplyr)
# 设置工作目录
projectPath = "C:/Users/81494/Desktop/changai fpkm"     # 替换为真实的工作目录
dataPath = paste(projectPath, "Data", sep = "/")
if(!dir.exists(dataPath)) dir.create(dataPath)  # 新建一个Data文件夹,用于存放从TCGA上下载的数据
setwd(dataPath)
library(dplyr)

#### 使用TCGAbiolinks下载Counts和FPKM-UQ数据 ----
#https://www.bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html
for(data_type in c("HTSeq - Counts", "HTSeq - FPKM")){
    
    repeat{
        query = try(GDCquery(project = "TCGA-COAD", legacy = FALSE, experimental.strategy = "RNA-Seq", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = data_type), silent = TRUE)
        if(class(query) != "try-error") {
            break
        }else{
            print("Try to connect to GDC...")
        }
    }
    GDCdownload(query, files.per.chunk = 100)   # 分批下载

    # 数据预处理
    dataAssy = GDCprepare(query, summarizedExperiment = F)
    
    # # 保存变量,以备后用
    # filename = paste0("TCGA-LIHC.RNA.query.", str_match(data_type, "- ([^\\s]*)$")[,2], ".RData")
    # save(query, dataAssy, file = filename)
    # load(filename)
    
    # 数据整理
    exp_data = dataAssy %>% 
        filter(str_detect(X1, "^ENSG")) %>% # 提取ENSG*注释的基因
        mutate(X1 = gsub("\\..*", "", X1)) %>% # 删除版本号
        rename(ensembl_gene_id = X1) %>% # "X1"重命名为"ensembl_gene_id"
        set_colnames(str_replace(colnames(.), "(-\\w*){3}$", "")) # 修改TCGA样本名
        # 上两行等同于:
        # set_colnames(c("ensembl_gene_id", str_match(colnames(.)[-1], "(TCGA-[^-]*-[^-]*-[^-]*)")[,2])) # 修改列名
    exp_data[1:3,1:3]
    
   
    
    # 输出
    outfile = paste0("COAD_Portal_RNA_", str_match(data_type, "- ([^\\s]*)$")[,2], ".txt")
    fwrite(exp_data, outfile, row.names = F, sep = "\t", quote = F)
    

下载TCGA临床数据

XENA官网下载整合好的数据

p_load(data.table, tidyverse, magrittr)
clinic_data = read.table("COAD_clinicalMatrix", header = T, sep = "\t")
surv_data = read.table("COAD_survival.txt", header = T, sep = "\t")
clinic_data = merge(clinic_data, surv_data, by.x = "sampleID", by.y = "sample")
rownames(clinic_data) <- clinic_data$sampleID
clinic_data=clinic_data[,c('sampleID','CDE_ID_3226963','MSI_updated_Oct62011','loss_expression_of_mismatch_repair_proteins_by_ihc','loss_expression_of_mismatch_repair_proteins_by_ihc_result','microsatellite_instability','pathologic_M','pathologic_N','pathologic_T','pathologic_stage','OS','OS.time','DSS','DSS.time','PFI','PFI.time')]

整理表达矩阵

exp_FPKM = fread("COAD_Portal_RNA_FPKM.txt", h = T, sep = "\t", check.names = F) %>% column_to_rownames("ensembl_gene_id")
table(!duplicated(colnames(exp_FPKM)))
exp_FPKM = exp_FPKM[,!duplicated(colnames(exp_FPKM))]#去重复测序两次的标本
exp_FPKM_T=exp_FPKM[,str_sub(colnames(exp_FPKM),14,16)=="01A"]#肿瘤样本
colnames(exp_FPKM_T) <- str_sub(colnames(exp_FPKM_T),1,15)
dd1 <- exp_FPKM_T[1:20,1:20]

提取mRNA和gene名转换

load("anno.Rdata")
#step4:表达矩阵拆分和注释
mRNA_anno <- mRNA_anno %>% 
    tidyr::separate(gene_id,into = c("gene_id","drop"),sep="\\.")

sum(rownames(exp_FPKM_T) %in% mRNA_anno$gene_id)
mRNA_exp = exp_FPKM_T[rownames(exp_FPKM_T) %in% mRNA_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp_FPKM_T))
x = dplyr::inner_join(tmp,mRNA_anno,by = "gene_id")
#inner_join不改变顺序,可以确认一下
table(!duplicated(x$gene_name))
#行名不允许重复,因此一个ensambelid对应多个symbol的需要去掉。
mRNA_exp = mRNA_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(mRNA_exp) = x$gene_name
mRNA_exp = na.omit(mRNA_exp)

表达矩阵和临床数据融合

table(colnames(mRNA_exp) %in% clinic_data$sample)

index <- intersect(colnames(mRNA_exp),clinic_data$sample)

mRNA_exp1 = mRNA_exp[,index]

clinic_data1 <- clinic_data[index,]

write.csv(clinic_data1,'clinic_data2.csv') 

手工整理临床数据

在csv中剔除生存时间小于90天和NA的,和存活时间大于3000天的病例

表达矩阵和临床数据再融合

clinic_data2 <- read.csv('clinic_data2.csv',header = T)
rownames(clinic_data2) <- clinic_data2$sampleID
table(colnames(mRNA_exp) %in% clinic_data2$sampleID)
index1 <- intersect(colnames(mRNA_exp),clinic_data2$sampleID)
mRNA_exp2 = mRNA_exp[,index1]
clinic_data2 <- clinic_data2[index1,]

MCP法

install.packages("MCPcounter-master/Source/",repos=NULL,type="source")
library(MCPcounter)
genes <- data.table::fread("genes.txt",data.table = F)
results<- MCPcounter.estimate(mRNA_exp2,
                              featuresType= "HUGO_symbols",
                              probesets=probesets,
                              genes=genes)
results2 <- results[1:9,]#去除最后一行fbro细胞的行
library(pheatmap)
pheatmap(results2, #热图的数据
         cluster_rows = F,#行聚类
         cluster_cols = T,#列聚类,可以看出样本之间的区分度
         ##annotation_col =annotation_col, #标注样本分类
         annotation_legend=TRUE, # 显示注释
         show_rownames = T,
         show_colnames = F,# 显示行名
         scale = "row", #以行来标准化
         color =colorRampPalette(c("blue", "white","red"))(100),#调色
         #filename = "heatmap_F.pdf",#是否保存
图片.png

生存分析

results1 <- as.data.frame(t(results))
Last <- cbind(results1,clinic_data2)
Last$group = ifelse(Last$`B lineage` > median(Last$`B lineage`),'high','low')
library(survival)
library(survminer)
sfit1=survfit(Surv(OS.time, OS)~group, data=Last)
ggsurvplot(sfit1,pval =TRUE, data = Last,risk.table = TRUE)
sfit1=survfit(Surv(DSS.time, DSS)~group, data=Last)
ggsurvplot(sfit1,pval =TRUE, data = Last, risk.table = TRUE)
#surv.cut = surv_cutpoint(Last, time = "OS.time", event = "OS", #variables = "B lineage", minprop = 0.3)
# 最优分类值
#opticut = surv.cut$cutpoint$cutpoint 
#opticut
图片.png
上一篇下一篇

猜你喜欢

热点阅读