TCGA_批量基因_生存分析_流程搭建

2022-01-11  本文已影响0人  小贝学生信

前期基础:
[R]TCGAbiolinks包:数据准备--query、download、prepare - 简书 (jianshu.com)
R—生存分析—Kaplan-Meier + Log-rank test + Cox回归 - 简书 (jianshu.com)

step1:下载TCGA表达数据

library(TCGAbiolinks)
projects = TCGAbiolinks:::getGDCprojects()$project_id
grep("TCGA", projects, value = T)
TCGAbiolinks:::getProjectSummary("TCGA-LIHC")$data_categories
query <- GDCquery(project = c("TCGA-LIHC"),
                  legacy = FALSE, #default(GDC harmonized database)
                  data.category = "Transcriptome Profiling",
                  data.type = "Gene Expression Quantification",
                  workflow.type = "HTSeq - FPKM")
dim(getResults(query))
# [1] 424  29

TP = TCGAquery_SampleTypes(getResults(query)$sample.submitter_id,"TP")
NT = TCGAquery_SampleTypes(getResults(query)$sample.submitter_id,"NT")
query <- GDCquery(project = c("TCGA-LIHC"),
                  legacy = FALSE, #default(GDC harmonized database)
                  data.category = "Transcriptome Profiling",
                  data.type = "Gene Expression Quantification",
                  workflow.type = "HTSeq - FPKM",
                  barcode = c(TP, NT))
dim(getResults(query))
# [1] 421  29
GDCdownload(query, files.per.chunk = 10)
data <- GDCprepare(query, save = T, save.filename = "LIHC_RNAseq_FPKM.rda")
#样本(临床)信息
colData(data) = colData(data)[,"ID",drop=F]
# 筛选 01 TP primary tumor
tp_index = which(stringr::str_detect(colData(data)$ID,
                                     "-01"))
data_tp = data[,tp_index]
assay(data_tp)[1:4,1:4]
#                 TCGA-DD-AAVS-01A-11R-A41C-07 TCGA-DD-AAE3-01A-11R-A41C-07
# ENSG00000000003                    32.681039                    39.248105
# ENSG00000000005                     0.000000                     0.000000
# ENSG00000000419                    21.935276                    33.647190
# ENSG00000000457                     3.062907                     0.844674
#                 TCGA-DD-A4NS-01A-11R-A311-07 TCGA-5R-AA1D-01A-11R-A38B-07
# ENSG00000000003                  12.58508419                  17.62580911
# ENSG00000000005                   0.01225347                   0.02573094
# ENSG00000000419                  17.12927554                  17.10958452
# ENSG00000000457                   2.68276766                   1.22480857

step2:结合ucsc整理好的生存信息

#可根据选择癌症类型,修改下面的链接地址
surdat_url="https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/survival%2FLIHC_survival.txt"
# ucsc_sur = data.table::fread(surdat_url)
ucsc_sur = data.table::fread("ucsc/Curated_survival_data.txt",
                             data.table = F)
head(ucsc_sur)
colData(data_tp) = dplyr::left_join(as.data.frame(colData(data_tp)), 
                 ucsc_sur, by=c("ID"="sample")) %>% 
  DataFrame(., row.names = colnames(data_tp))

head(colData(data_tp))
# DataFrame with 6 rows and 11 columns
# ID    X_PATIENT        OS   OS.time       DSS  DSS.time
# <character>  <character> <integer> <integer> <integer> <integer>
# TCGA-DD-AAVS-01A-11R-A41C-07 TCGA-DD-AAVS-01 TCGA-DD-AAVS         0      1823         0      1823
# TCGA-DD-AAE3-01A-11R-A41C-07 TCGA-DD-AAE3-01 TCGA-DD-AAE3         0       566         0       566
# TCGA-DD-A4NS-01A-11R-A311-07 TCGA-DD-A4NS-01 TCGA-DD-A4NS         1      2456         1      2456
# TCGA-5R-AA1D-01A-11R-A38B-07 TCGA-5R-AA1D-01 TCGA-5R-AA1D         0       449         0       449
# TCGA-BW-A5NO-01A-11R-A27V-07 TCGA-BW-A5NO-01 TCGA-BW-A5NO         0        20         0        20
# TCGA-DD-AADL-01A-11R-A41C-07 TCGA-DD-AADL-01 TCGA-DD-AADL         0       636         0       636
# DFI  DFI.time       PFI  PFI.time   Redaction
# <integer> <integer> <integer> <integer> <character>
# TCGA-DD-AAVS-01A-11R-A41C-07         0      1823         0      1823            
# TCGA-DD-AAE3-01A-11R-A41C-07         0       566         0       566            
# TCGA-DD-A4NS-01A-11R-A311-07         1       893         1       893            
# TCGA-5R-AA1D-01A-11R-A38B-07         0       449         0       449            
# TCGA-BW-A5NO-01A-11R-A27V-07        NA        NA         0        20            
# TCGA-DD-AADL-01A-11R-A41C-07         0       636         0       636

sur_clinical = colData(data_tp) %>% as.data.frame() %>% 
  dplyr::select(OS, OS.time) %>% 
  dplyr::mutate(time = OS.time/365) %>% 
  dplyr::rename(status = OS) %>% 
  dplyr::select(status, time)
head(sur_clinical)
#                              status       time
# TCGA-DD-AAVS-01A-11R-A41C-07      0 4.99452055
# TCGA-DD-AAE3-01A-11R-A41C-07      0 1.55068493
# TCGA-DD-A4NS-01A-11R-A311-07      1 6.72876712
# TCGA-5R-AA1D-01A-11R-A38B-07      0 1.23013699
# TCGA-BW-A5NO-01A-11R-A27V-07      0 0.05479452
# TCGA-DD-AADL-01A-11R-A41C-07      0 1.74246575

step3:批量生存分析

exp_t = assay(data_tp) %>% t()
identical(rownames(sur_clinical), rownames(exp_t))
# [1] TRUE

library(survival)
gene2sur = colnames(exp_t)[1:100]
gene_sur_list = lapply(seq(gene2sur), function(i){
  # i = 1
  print(paste0(i, " / ", length(gene2sur)))
  gene_sle = gene2sur[i]
  group_info = ifelse(exp_t[,gene_sle] > median(exp_t[,gene_sle]),
                      "high", "low")
  #表达值全为0的情况,跳过
  if(length(unique(group_info))==1) return(NULL)
  surdata = sur_clinical %>% 
    dplyr::select(status, time) %>% 
    dplyr::mutate(group = group_info)
  surv_diff <- survdiff(Surv(time, status) ~ group, data = surdata)
  p.val = 1 - pchisq(surv_diff$chisq, length(surv_diff$n) - 1)
  return(c(p.val, median(exp_t[,gene_sle])))
})
names(gene_sur_list) = gene2sur
gene_sur = do.call(rbind, gene_sur_list) %>% as.data.frame()
colnames(gene_sur) = c("pval","median_Exp")
head(gene_sur)
#                        pval  median_Exp
# ENSG00000000003 0.827913157 20.13245532
# ENSG00000000005 0.491871470  0.01302639
# ENSG00000000419 0.001291037 20.46595268
# ENSG00000000457 0.680007018  1.97177656
# ENSG00000000460 0.024186393  0.86061452
# ENSG00000000938 0.245993727  0.97951132
上一篇 下一篇

猜你喜欢

热点阅读