TCGA_批量基因_生存分析_流程搭建
2022-01-11 本文已影响0人
小贝学生信
前期基础:
[R]TCGAbiolinks包:数据准备--query、download、prepare - 简书 (jianshu.com)
R—生存分析—Kaplan-Meier + Log-rank test + Cox回归 - 简书 (jianshu.com)
step1:下载TCGA表达数据
library(TCGAbiolinks)
projects = TCGAbiolinks:::getGDCprojects()$project_id
grep("TCGA", projects, value = T)
TCGAbiolinks:::getProjectSummary("TCGA-LIHC")$data_categories
query <- GDCquery(project = c("TCGA-LIHC"),
legacy = FALSE, #default(GDC harmonized database)
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - FPKM")
dim(getResults(query))
# [1] 424 29
TP = TCGAquery_SampleTypes(getResults(query)$sample.submitter_id,"TP")
NT = TCGAquery_SampleTypes(getResults(query)$sample.submitter_id,"NT")
query <- GDCquery(project = c("TCGA-LIHC"),
legacy = FALSE, #default(GDC harmonized database)
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - FPKM",
barcode = c(TP, NT))
dim(getResults(query))
# [1] 421 29
GDCdownload(query, files.per.chunk = 10)
data <- GDCprepare(query, save = T, save.filename = "LIHC_RNAseq_FPKM.rda")
- 进一步整理信息
#样本(临床)信息
colData(data) = colData(data)[,"ID",drop=F]
# 筛选 01 TP primary tumor
tp_index = which(stringr::str_detect(colData(data)$ID,
"-01"))
data_tp = data[,tp_index]
assay(data_tp)[1:4,1:4]
# TCGA-DD-AAVS-01A-11R-A41C-07 TCGA-DD-AAE3-01A-11R-A41C-07
# ENSG00000000003 32.681039 39.248105
# ENSG00000000005 0.000000 0.000000
# ENSG00000000419 21.935276 33.647190
# ENSG00000000457 3.062907 0.844674
# TCGA-DD-A4NS-01A-11R-A311-07 TCGA-5R-AA1D-01A-11R-A38B-07
# ENSG00000000003 12.58508419 17.62580911
# ENSG00000000005 0.01225347 0.02573094
# ENSG00000000419 17.12927554 17.10958452
# ENSG00000000457 2.68276766 1.22480857
step2:结合ucsc整理好的生存信息
#可根据选择癌症类型,修改下面的链接地址
surdat_url="https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/survival%2FLIHC_survival.txt"
# ucsc_sur = data.table::fread(surdat_url)
ucsc_sur = data.table::fread("ucsc/Curated_survival_data.txt",
data.table = F)
head(ucsc_sur)
colData(data_tp) = dplyr::left_join(as.data.frame(colData(data_tp)),
ucsc_sur, by=c("ID"="sample")) %>%
DataFrame(., row.names = colnames(data_tp))
head(colData(data_tp))
# DataFrame with 6 rows and 11 columns
# ID X_PATIENT OS OS.time DSS DSS.time
# <character> <character> <integer> <integer> <integer> <integer>
# TCGA-DD-AAVS-01A-11R-A41C-07 TCGA-DD-AAVS-01 TCGA-DD-AAVS 0 1823 0 1823
# TCGA-DD-AAE3-01A-11R-A41C-07 TCGA-DD-AAE3-01 TCGA-DD-AAE3 0 566 0 566
# TCGA-DD-A4NS-01A-11R-A311-07 TCGA-DD-A4NS-01 TCGA-DD-A4NS 1 2456 1 2456
# TCGA-5R-AA1D-01A-11R-A38B-07 TCGA-5R-AA1D-01 TCGA-5R-AA1D 0 449 0 449
# TCGA-BW-A5NO-01A-11R-A27V-07 TCGA-BW-A5NO-01 TCGA-BW-A5NO 0 20 0 20
# TCGA-DD-AADL-01A-11R-A41C-07 TCGA-DD-AADL-01 TCGA-DD-AADL 0 636 0 636
# DFI DFI.time PFI PFI.time Redaction
# <integer> <integer> <integer> <integer> <character>
# TCGA-DD-AAVS-01A-11R-A41C-07 0 1823 0 1823
# TCGA-DD-AAE3-01A-11R-A41C-07 0 566 0 566
# TCGA-DD-A4NS-01A-11R-A311-07 1 893 1 893
# TCGA-5R-AA1D-01A-11R-A38B-07 0 449 0 449
# TCGA-BW-A5NO-01A-11R-A27V-07 NA NA 0 20
# TCGA-DD-AADL-01A-11R-A41C-07 0 636 0 636
sur_clinical = colData(data_tp) %>% as.data.frame() %>%
dplyr::select(OS, OS.time) %>%
dplyr::mutate(time = OS.time/365) %>%
dplyr::rename(status = OS) %>%
dplyr::select(status, time)
head(sur_clinical)
# status time
# TCGA-DD-AAVS-01A-11R-A41C-07 0 4.99452055
# TCGA-DD-AAE3-01A-11R-A41C-07 0 1.55068493
# TCGA-DD-A4NS-01A-11R-A311-07 1 6.72876712
# TCGA-5R-AA1D-01A-11R-A38B-07 0 1.23013699
# TCGA-BW-A5NO-01A-11R-A27V-07 0 0.05479452
# TCGA-DD-AADL-01A-11R-A41C-07 0 1.74246575
step3:批量生存分析
exp_t = assay(data_tp) %>% t()
identical(rownames(sur_clinical), rownames(exp_t))
# [1] TRUE
library(survival)
gene2sur = colnames(exp_t)[1:100]
gene_sur_list = lapply(seq(gene2sur), function(i){
# i = 1
print(paste0(i, " / ", length(gene2sur)))
gene_sle = gene2sur[i]
group_info = ifelse(exp_t[,gene_sle] > median(exp_t[,gene_sle]),
"high", "low")
#表达值全为0的情况,跳过
if(length(unique(group_info))==1) return(NULL)
surdata = sur_clinical %>%
dplyr::select(status, time) %>%
dplyr::mutate(group = group_info)
surv_diff <- survdiff(Surv(time, status) ~ group, data = surdata)
p.val = 1 - pchisq(surv_diff$chisq, length(surv_diff$n) - 1)
return(c(p.val, median(exp_t[,gene_sle])))
})
names(gene_sur_list) = gene2sur
gene_sur = do.call(rbind, gene_sur_list) %>% as.data.frame()
colnames(gene_sur) = c("pval","median_Exp")
head(gene_sur)
# pval median_Exp
# ENSG00000000003 0.827913157 20.13245532
# ENSG00000000005 0.491871470 0.01302639
# ENSG00000000419 0.001291037 20.46595268
# ENSG00000000457 0.680007018 1.97177656
# ENSG00000000460 0.024186393 0.86061452
# ENSG00000000938 0.245993727 0.97951132