TCGA数据挖掘生信分析流程非肿瘤

TCGA文章复现——LncRNA-CRC预后

2020-04-30  本文已影响0人  猪莎爱学习

Title:An Integrated Three-Long Non-coding RNA Signature Predicts Prognosis in Colorectal Cancer Patients

文章思路

本文的内容是用GDC下载并整理表达矩阵和临床信息数据。

1.从网页选择数据,下载manifest文件

数据存放网站:https://portal.gdc.cancer.gov/
在Repository勾选自己需要的case和file类型。以CHOL为例:
case-Project选择TCGA-CHOL。
file-选择如图:


左右分别是mrna 和clinical的样本选择截图。选好后,点击右侧manifest键下载对应的清单文件。

2.使用gdc-client工具下载

注意
将gdc-client(mac)或gdc-client.exe(windows)放在工作目录下;
将manifest文件放在工作目录下。

options(stringsAsFactors = F)
cancer_type="TCGA-COAD"
if(!dir.exists("clinical"))dir.create("clinical")
if(!dir.exists("mrna"))dir.create("mrna")
dir()
#下面两行命令在terminal完成
#./gdc-client download -m  gdc_manifest.clinical.txt -d clinical
#./gdc-client download -m gdc_manifest.COAD_RNAseq.txt -d mrna

length(dir("./clinical/"))  #结果是459.说明有459个病人
length(dir("./mrna/"))    #结果是521,说明有521个样本

可以看到,下载的文件是按样本存放的,我们需要得到的是表格,需要将他们批量读入R语言并整理。

3.整理临床信息

library(XML)
result <- xmlParse("./clinical/007b27c5-d36e-4bec-9321-b03b773fd3b8/nationwidechildrens.org_clinical.TCGA-A6-6651.xml")
rootnode <- xmlRoot(result)
rootsize <- xmlSize(rootnode)
print(rootnode[1])
#print(rootnode[2])
xmldataframe <- xmlToDataFrame(rootnode[2])
head(t(xmlToDataFrame(rootnode[2])))

xmls = dir("clinical/",pattern = "*.xml$",recursive = T)

td = function(x){
  result <- xmlParse(file.path("clinical/",x))
  rootnode <- xmlRoot(result)
  xmldataframe <- xmlToDataFrame(rootnode[2])
  return(t(xmldataframe))
}

cl = lapply(xmls,td)
cl_df <- t(do.call(cbind,cl))
cl_df[1:3,1:3]
clinical = data.frame(cl_df)
clinical[1:4,1:4]

4.整理表达矩阵

探索数据:先任选两个counts文件读取,并观察geneid的顺序是否一致。

options(stringsAsFactors = F)
x = read.table("mrna/02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz")
x2 = read.table("mrna/0826add5-571b-41be-b186-936959fc9d79/dfeb2d54-28da-4521-b488-bdac246bb59b.htseq.counts.gz")
identical(x$V1,x2$V1)

由此可知,他们的geneid顺序是一致的,可以直接cbind,不会导致顺序错乱。

批量读取所有的counts.gz文件。

count_files = dir("mrna/",pattern = "*.htseq.counts.gz$",recursive = T)

ex = function(x){
  result <- read.table(file.path("mrna/",x),row.names = 1,sep = "\t")
  return(result)
}
head(ex("02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz"))

exp = lapply(count_files,ex)
exp <- do.call(cbind,exp)
dim(exp)    #60488  521  有6万多行,521个样本
exp[1:4,1:4]
发现问题:没有列名

发现问题:这样产生出来的表达矩阵没有列名。
解决办法:找到一个文件名与样本ID一一对应的文件。cart-json文件。

meta <- jsonlite::fromJSON("metadata.cart.2020-04-23.json")
colnames(meta)
ids <- meta$associated_entities;class(ids)
ids[[1]]
class(ids[[1]][,3])   #并不是每一个数据都是第3列,需要自己手动检查
#不过也可以用str_detect(ids[[1]],"TCGA")   返回值为TRUE就是我们要的列

可以看到,meta$associated_entities是个列表,这个列表里包含数据框,数据框的第一列内容就是tcga样本id。

注意,换了数据需要自己探索存放在哪一列。不一定是完全一样的,需要确认清楚。

ID = sapply(ids,function(x){x[,3]})
file2id = data.frame(file_name = meta$file_name,
                     ID = ID)

文件名与TCGA样本ID的对应关系已经得到,接下来是将其添加到表达矩阵中,成为行名。需要找到读取文件的顺序,一一对应修改。

head(file2id$file_name)
head(count_files)
count_files2 = stringr::str_split(count_files,"/",simplify = T)[,2]
count_files2[1] %in% file2id$file_name

count_files2的顺序就是列名的顺序,根据它来调整file2id的顺序。此处需要再次理解一下match函数。

file2id = file2id[match(count_files2,file2id$file_name),]
colnames(exp) = file2id$ID
exp[1:4,1:4]

表达矩阵整理完成,需要过滤一下那些在很多样本里表达量都为0的基因。过滤标准不唯一。

dim(exp)
exp = exp[apply(exp, 1, function(x) sum(x > 1) > 41), ]
dim(exp)
exp[1:4,1:4]

分组信息

根据样本ID的第14-15位,给样本分组(tumor和normal)

table(stringr::str_sub(colnames(exp),14,15))
group_list = ifelse(as.numeric(substr(colnames(exp),14,15)) < 10,'tumor','normal')
group_list = factor(group_list,levels = c("normal","tumor"))
table(group_list)    #normal:41   tumor:480
save(exp,clinical,group_list,cancer_type,file = paste0(cancer_type,"gdc.Rdata"))

接下来就是差异分析,但是差异分析之前要先把lncRNA挑出来,然后还有配对样本挑出来,是非常难的两步!!(小洁老师说的)

01.需求:文中作者把配对数据以及LncRNA都挑出来了

TCGA的RNA-seq数据使用的geneid是ensembl id,里面不仅有mRNA,也有非编码基因和其他类型。

所以,如何从TCGA得到的表达矩阵中分别提取出mRNA和lncRNA的表达量呢?

02.思路

1.找到TCGA数据对应的参考基因组版本。

2.下载该版本的参考基因组文件,找到mRNA和lncRNA对应的ensembl id

3.在表达矩阵中筛选。

03.动起来

1.找参考基因组版本

gdc首页的support

about the data - GDC Reference Files

可以看到,使用的参考基因组版本是genecode的v22。(版本很多,这个是14年的版本了)

2.找区分类型的列

在gtf文件里并不是直接分出了lncRNA,需要找gtf文件里对biotype的说明,不看不知道,一看发现这是一个很长的表格。

其中对lncRNA的说明是:

Generic long non-coding RNA biotype that replaced the following biotypes: 3prime_overlapping_ncRNA, antisense, bidirectional_promoter_lncRNA, lincRNA, macro_lncRNA, non_coding, processed_transcript, sense_intronic and sense_overlapping.

所以需要将genetype里这些类型对应的行挑出来,就是lncRNA了。
然后与表达矩阵行名进行匹配替换,就可以分别得到mRNA和lncRNA的矩阵了。

全部样本的表达矩阵,供生存分析用

#step1:读取并探索gtf文件----
options(stringsAsFactors = F)
if(!file.exists("anno.Rdata")){
  #BiocManager::install("rtracklayer")
  library(rtracklayer)
  x = rtracklayer::import("gencode.v22.annotation.gtf")
  class(x)
  x2 = as.data.frame(x);dim(x2)
  colnames(x2)
  tj = as.data.frame(table(x2$type));tj
  
  tj2 = as.data.frame(table(x2$gene_type));tj2
  #step2:先筛选出gene对应的行
  nrow(x2);x2 = x2[x2$type=="gene",];nrow(x2)
  tj3 = as.data.frame(table(x2$gene_type));tj3
  #step3:提取lnc和mRNA及其对应的ensambelid和symbol
  lnc_bype = c("3prime_overlapping_ncRNA", "antisense", "bidirectional_promoter_lncRNA", "lincRNA", "macro_lncRNA", "non_coding", "processed_transcript", "sense_intronic" , "sense_overlapping")
  table(x2$gene_type %in% lnc_bype)
  table(x2$gene_type == "protein_coding")
  lnc_anno = x2[x2$gene_type %in% lnc_bype,c("gene_name","gene_id","gene_type")]
  mRNA_anno = x2[x2$gene_type == "protein_coding",c("gene_name","gene_id","gene_type")]
  save(lnc_anno,mRNA_anno,file = "anno.Rdata")
}
load("anno.Rdata")
#step4:表达矩阵拆分和注释
load("TCGA-COADgdc.Rdata")
table(rownames(exp) %in% mRNA_anno$gene_id)
mRNA_exp = exp[rownames(exp) %in% mRNA_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp))
x = dplyr::inner_join(tmp,mRNA_anno,by = "gene_id")
#inner_join不改变顺序,可以确认一下
identical(tmp$gene_id,rownames(exp))
table(!duplicated(x$gene_name))
mRNA_exp = mRNA_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(mRNA_exp) = x$gene_name
mRNA_exp[1:4,1:4]   #这里的mRNA是所有样本521个样本的mRNA
mRNA_exp = na.omit(mRNA_exp)
#同样办法拆出lncRNA:也是所有样本的(521个样本)
lnc_exp = exp[rownames(exp) %in% lnc_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp))
x = dplyr::inner_join(tmp,lnc_anno,by = "gene_id")
identical(tmp$gene_id,rownames(exp))
table(!duplicated(x$gene_name))
lnc_exp = lnc_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(lnc_exp) = x$gene_name
lnc_exp[1:4,1:4]
lnc_exp = na.omit(lnc_exp)

配对样本的lnc、mRNA 表达矩阵,供差异分析用

挑选样本,是选列,与行(基因)无关

1.选出与normal样本匹配的tumor样本,合并为新的表达矩阵

#挑选出来normal的,然后匹配样本名
exp2 = rbind(mRNA_exp,lnc_exp) #mRNA和lncRNA合并,不改变顺序
identical(rownames(mRNA_exp),head(rownames(exp2),nrow(mRNA_exp)))
table(stringr::str_sub(colnames(exp2),14,15)) #正常样本个数?41个大于10的那就是41个正常样本
library(stringr)
exp2_nor = exp2[,str_sub(colnames(exp2),14,15)==11]
exp2_tum = exp2[,str_sub(colnames(exp2),14,15)!=11]
patient = str_sub(colnames(exp2_nor),1,12) #有配对样本的病人id
table(str_sub(colnames(exp2_tum),1,12) %in% str_sub(patient)) 
# 看看肿瘤样本有重复吗:
exp2_tum = exp2_tum[,str_sub(colnames(exp2_tum),1,12) %in% str_sub(patient)]
exp2_tum = exp2_tum[,str_sort(colnames(exp2_tum))] #str_sort是排序
exp2_tum = exp2_tum[,!duplicated(str_sub(colnames(exp2_tum),1,12) )] #排序去重,这样样本有01A和01B可选时,就会留下01A。

#让正常样本顺序与肿瘤样本顺序一致,这样才方便写配对向量
tmp = match(str_sub(colnames(exp2_tum),1,12),str_sub(colnames(exp2_nor),1,12))
exp2_nor = exp2_nor[,tmp]
identical(str_sub(colnames(exp2_tum),1,12),str_sub(colnames(exp2_nor),1,12))
exp2_small = cbind(exp2_nor,exp2_tum)

拆分回lnc和mRNA

mRNA_exp_small = head(exp2_small,nrow(mRNA_exp))
lnc_exp_small = tail(exp2_small,nrow(lnc_exp))

得到了几个表达矩阵,捋一下:6个矩阵

dim(exp) #原始ensambel矩阵   35267  521
dim(exp2) #lncRNA和mRNA矩阵合并得到的symbol矩阵   26682   521
dim(lnc_exp) #lncRNA   8714   521
dim(mRNA_exp) #mRNA     17968   521
dim(lnc_exp_small) #配对的lncRNA矩阵   8714   82
dim(mRNA_exp_small)#配对的lncRNA矩阵   17968  82
save(lnc_exp,mRNA_exp,file = paste0(cancer_type,"lmexp.Rdata"))
save(lnc_exp_small,mRNA_exp_small,file = paste0(cancer_type,"lmexp_small.Rdata"))

然后就是差异分析了:

if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(edgeR))BiocManager::install('edgeR')

1.lnc_RNA-edgeR 配对差异分析

rm(list = ls())
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp_small.Rdata")
group_list = rep(c("normal","tumor"),each = ncol(lnc_exp_small)/2)
pairinfo = rep(1:(ncol(lnc_exp_small)/2),times = 2)
table(group_list)
library(edgeR)

dge <- DGEList(counts=lnc_exp_small,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 

design <- model.matrix(~pairinfo+group_list)
dge <- estimateDisp(dge,design)

fit <- glmQLFit(dge, design)
fit2 <- glmQLFTest(fit)
result = topTags(fit2)

DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1.5
DEG$change = as.factor(
  ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
         ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
head(DEG)
table(DEG$change)
lncRNA_DEG <- DEG
cg = rownames(DEG)[DEG$change!="NOT"]
cgexp = lnc_exp_small[cg,]
table(head(DEG[cg,"change"],580))

if(!require(tinyplanet))devtools::install_local("tinyplanet-master.zip",upgrade = F)
library(ggplot2)
library(tinyplanet)
lnc_exp_small[1:4,1:4]
#PCA
dat = log(lnc_exp_small+1)
pca.plot = draw_pca(dat,group_list);pca.plot
#热图火山图
x = lnc_exp_small[cg,]
library(tinyplanet)
h = draw_heatmap(x,group_list,scale_before = T)
v = draw_volcano(lncRNA_DEG,logFC_cutoff = 1.5,pkg = 2)
save(cg,file = paste0(cancer_type,"lnccg.Rdata"))

2.mRNA-edgeR 配对差异分析 可以搜索edgeR paired 网上有代码写配对怎么分析

rm(list = ls())
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp_small.Rdata")
group_list = rep(c("normal","tumor"),each = ncol(mRNA_exp_small)/2)
pairinfo = rep(1:41,times = 2)
table(group_list)
library(edgeR)

dge <- DGEList(counts=mRNA_exp_small,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 

design <- model.matrix(~pairinfo+group_list)
dge <- estimateDisp(dge,design)

fit <- glmQLFit(dge, design)
fit2 <- glmQLFTest(fit)
result = topTags(fit2)

DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1.5
DEG$change = as.factor(
  ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
         ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
head(DEG)
table(DEG$change)
lncRNA_DEG <- DEG
cg = rownames(DEG)[DEG$change!="NOT"]
cgexp = mRNA_exp_small[cg,]
table(head(DEG[cg,"change"],580))

if(!require(tinyplanet))devtools::install_local("tinyplanet-master.zip",upgrade = F)
library(ggplot2)
library(tinyplanet)
mRNA_exp_small[1:4,1:4]
#PCA
dat = log(mRNA_exp_small+1)
pca.plot = draw_pca(dat,group_list);pca.plot
#热图火山图
x = mRNA_exp_small[cg,]
library(tinyplanet)
h = draw_heatmap(x,group_list,scale_before = T)
v = draw_volcano(lncRNA_DEG,logFC_cutoff = 1.5,pkg = 2)
save(cg,file = paste0(cancer_type,"mRNAcg.Rdata"))

以上代码出的图在下面:


LncRNA PCA
LncRNA 热图
mRNA PCA
mRNA 热图

下面是生存分析:

生存分析只需要tumor数据,不要normal,将其去掉,新表达矩阵数据命名为exprSet;

clinical信息需要进一步整理,成为生存分析需要的格式,新临床信息数据命名为meta。

由于不同癌症的临床信息表格组织形式不同,这里的代码需要根据实际情况修改。

rm(list=ls())
options(stringsAsFactors = F)
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp.Rdata")
load("TCGA-COADlnccg.Rdata")
library(stringr)

#clinical通常有几十列,选出其中有用的几列。
tmp = data.frame(colnames(clinical))
clinical = clinical[,c(
  'bcr_patient_barcode',
  'vital_status',
  'days_to_death',
  'days_to_last_followup',
  'race_list',
  'age_at_initial_pathologic_diagnosis',
  'gender' ,
  'stage_event'
)]
#其实days_to_last_followup应该是找age_at_initial_pathologic_diagnosis,这表格里没有,用days_to_birth计算一下年龄,暂且替代。
dim(clinical)
#rownames(clinical) <- NULL
rownames(clinical) <- clinical$bcr_patient_barcode
clinical[1:4,1:4]

meta = clinical[clinical$days_to_death != 0,]
#简化meta的列名
colnames(meta)=c('ID','event','death','last_followup','race','age','gender','stage')
#表达矩阵处理
exprSet=lnc_exp[cg,group_list=='tumor']
table(str_sub(colnames(exprSet),1,12) %in% meta$ID)
exprSet = exprSet[,str_sub(colnames(exprSet),1,12) %in% meta$ID]
exprSet = exprSet[,str_sort(colnames(exprSet))]
exprSet = exprSet[,!duplicated(str_sub(colnames(exprSet),1,12) )]

#调整meta的ID顺序与exprSet列名一致
meta=meta[match(str_sub(colnames(exprSet),1,12),meta$ID),]
all(meta$ID==str_sub(colnames(exprSet),1,12))

#整理生存分析的输入数据----

#1.1由随访时间和死亡时间计算生存时间(月)
is.empty.chr = function(x){
  ifelse(stringr::str_length(x)==0,T,F)
}
is.empty.chr(meta[2,3])
meta[,3][is.empty.chr(meta[,3])]=0
meta[,4][is.empty.chr(meta[,4])]=0
meta$time=(as.numeric(meta[,3])+as.numeric(meta[,4]))/30

#1.2 根据生死定义event,活着是0,死的是1
meta$event=ifelse(meta$event=='Alive',0,1)

#1.3 年龄和年龄分组
meta$age=as.integer(meta$age)
meta$age_group=ifelse(meta$age>median(meta$age),'older','younger')

#1.4 stage 
library(stringr) 
meta$stage
tmp = str_split(meta$stage,' ',simplify = T)[,2]
str_count(tmp,"T")
str_locate(tmp,"T")[,1]
tmp = str_sub(tmp,1,str_locate(tmp,"T")[,1]-1)
table(tmp)
table(is.na(tmp))
meta$stage = tmp

#1.5 race
table(meta$race)
save(meta,exprSet,group_list,cancer_type,file = paste0(cancer_type,"sur_model.Rdata"))
整理好的生存数据

下面就是生存分析的可视化:logrank批量生存分析和cox批量生存分析

1.logrank批量生存分析

logrankfile = paste0(cancer_type,"log_rank_p.Rdata")
if(!file.exists(logrankfile)){
    mySurv=with(meta,Surv(time, event))
  log_rank_p <- apply(exprSet , 1 , function(gene){
    # gene=exprSet[1,]
    meta$group=ifelse(gene>median(gene),'high','low')  
    data.survdiff=survdiff(mySurv~group,data=meta)
    p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
    return(p.val)
  })
  log_rank_p=sort(log_rank_p)
  save(log_rank_p,file = logrankfile)
}
load(logrankfile)
table(log_rank_p<0.01) 
table(log_rank_p<0.05) 

2.cox批量生存分析

coxfile = paste0(cancer_type,"cox.Rdata")
if(!file.exists(coxfile)){
  mySurv=with(meta,Surv(time, event))
  cox_results <-apply(exprSet , 1 , function(gene){
  # gene= exprSet[1,]
  group=ifelse(gene>median(gene),'high','low') 
  survival_dat <- data.frame(group=group,stage=meta$stage,age=meta$age,
                             gender=meta$gender,
                             stringsAsFactors = F)
  m=coxph(mySurv ~ gender + age + stage + group, data =  survival_dat)
  
  beta <- coef(m)
  se <- sqrt(diag(vcov(m)))
  HR <- exp(beta)
  HRse <- HR * se
  
  #summary(m)
  tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
                     HR = HR, HRse = HRse,
                     HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
                     HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
                     HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
  return(tmp['grouplow',])
  
})
cox_results=t(cox_results)
save(cox_results,file = coxfile)
}
load(coxfile)
table(cox_results[,4]<0.01)
table(cox_results[,4]<0.05)

lr = names(log_rank_p)[log_rank_p<0.05]
cox = rownames(cox_results)[cox_results[,4]<0.05]
length(intersect(lr,cox))
save(cox,lr,file = paste0(cancer_type,"cox_lr.Rdata"))

然后是Lasso回归

1.准备输入数据

load("TCGA-COADsur_model.Rdata")
#load("TCGA-CHOLsur_model.Rdata")
load("TCGA-COADcox_lr.Rdata")
exprSet = exprSet[cox,]

2.构建lasso回归模型

输入数据是表达矩阵(仅含tumor样本)和对应的生死。

x=t(log2(exprSet+1))
y=meta$event
library(glmnet)
model_lasso <- glmnet(x, y,nlambda=10, alpha=1)
print(model_lasso)

这里是举例子,所以只计算了10个λ值,解释一下输出结果三列的意思。

解释的残差百分比越高越好,但是构建模型使用的基因的数量也不能太多,需要取一个折中值。

2.1挑选合适的λ值

计算1000个,画图,筛选表现最好的λ值

set.seed(13098)
cv_fit <- cv.glmnet(x=x, y=y, nlambda = 1000,alpha = 1)
plot(cv_fit)

两条虚线分别指示了两个特殊的λ值,一个是lambda.min,一个是lambda.1se,这两个值之间的lambda都认为是合适的。lambda.1se构建的模型最简单,即使用的基因数量少,而lambda.min则准确率更高一点,使用的基因数量更多一点。

2.2 用这两个λ值重新建模

model_lasso_min <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.min)
model_lasso_1se <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)

这两个值体现在参数lambda上。有了模型,可以将筛选的基因挑出来了。所有基因存放于模型的子集beta中,用到的基因有一个s0值,没用的基因只记录了“.”,所以可以用下面代码挑出用到的基因。

head(model_lasso_min$beta)
choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
length(choose_gene_min)
length(choose_gene_1se)

load("TCGA-COADcox_lr.Rdata")
for_cox = intersect(choose_gene_min,lr)
save(for_cox,choose_gene_min,file = "for_cox.Rdata")

3.模型预测和评估

3.1自己预测自己

newx参数是预测对象。输出结果lasso.prob是一个矩阵,第一列是min的预测结果,第二列是1se的预测结果,预测结果是概率,或者说百分比,不是绝对的0和1。

将每个样本的生死和预测结果放在一起,直接cbind即可。

lasso.prob <- predict(cv_fit, newx=x , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(y ,lasso.prob)
head(re)

3.2 箱线图

对预测结果进行可视化。以实际的生死作为分组,画箱线图整体上查看预测结果。

re=as.data.frame(re)
colnames(re)=c('event','prob_min','prob_1se')
re$event=as.factor(re$event)
library(ggpubr) 
p1 = ggboxplot(re, x = "event", y = "prob_min",
               color = "event", palette = "jco",
               add = "jitter")+ stat_compare_means()
p2 = ggboxplot(re, x = "event", y = "prob_1se",
          color = "event", palette = "jco",
          add = "jitter")+ stat_compare_means()
library(patchwork)
p1+p2

可以看到,真实结果是死(0)的样本,预测的值就小一点(靠近0),真实结果是活着(1)的样本,预测的值就大一点(靠近1),整体上趋势是对的,但不是完全准确,模型是可用的。

对比两个λ值构建的模型,差别不大,1se的预测值准确一点。

3.3 ROC曲线

计算AUC取值范围在0.5-1之间,越接近于1越好。可以根据预测结果绘制ROC曲线。

library(ROCR)
library(caret)
# 自己预测自己
#min
pred_min <- prediction(re[,2], re[,1])
auc_min = performance(pred_min,"auc")@y.values[[1]]
perf_min <- performance(pred_min,"tpr","fpr")
plot(perf_min,colorize=FALSE, col="blue") 
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_min,3)))
#1se
pred_1se <- prediction(re[,3], re[,1])
auc_1se = performance(pred_1se,"auc")@y.values[[1]]
perf_1se <- performance(pred_1se,"tpr","fpr")
plot(perf_1se,colorize=FALSE, col="red") 
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_1se,3)))
plot(perf_min,colorize=FALSE, col="blue") 
plot(perf_1se,colorize=FALSE, col="red",add = T) 
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.3, labels = paste0("AUC = ",round(auc_min,3)),col = "blue")
text(0.8,0.2, labels = paste0("AUC = ",round(auc_1se,3)),col = "red")

-还想用ggplot2画的更好看一点

tpr_min = performance(pred_min,"tpr")@y.values[[1]]
tpr_1se = performance(pred_1se,"tpr")@y.values[[1]]
dat = data.frame(tpr_min = perf_min@y.values[[1]],
                 fpr_min = perf_min@x.values[[1]],
                 tpr_1se = perf_1se@y.values[[1]],
                 fpr_1se = perf_1se@x.values[[1]])
library(ggplot2)
ggplot() + 
  geom_line(data = dat,aes(x = fpr_min, y = tpr_min),color = "blue") + 
  geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
  theme_bw()+
  annotate("text",x = .75, y = .25,
           label = paste("AUC of min = ",round(auc_min,2)),color = "blue")+
  scale_x_continuous(name  = "fpr")+
  scale_y_continuous(name = "tpr")


ggplot() + 
  geom_line(data = dat,aes(x = fpr_min, y = tpr_min),color = "blue") + 
  geom_line(data = dat,aes(x = fpr_1se, y = tpr_1se),color = "red")+
  geom_line(aes(x=c(0,1),y=c(0,1)),color = "grey")+
  theme_bw()+
  annotate("text",x = .75, y = .25,
           label = paste("AUC of min = ",round(auc_min,2)),color = "blue")+
  annotate("text",x = .75, y = .15,label = paste("AUC of 1se = ",round(auc_1se,2)),color = "red")+
  scale_x_continuous(name  = "fpr")+
  scale_y_continuous(name = "tpr")

然后是cox-风险森林图

1.准备输入数据

load("TCGA-COADsur_model.Rdata")
load("for_cox.Rdata")
library(stringr)

2.挑选感兴趣的基因构建coxph模型

出自文章Integrated genomic analysis identifies subclasses and prognosis signatures of kidney cancer中,五个miRNA是miR-21,miR-143,miR-10b,miR-192,miR-183

将他们从表达矩阵中取出来,就得到了5个基因在522个肿瘤样本中的表达量,可作为列添加在meta表噶的后面,组成的数据框赋值给dat。

e=t(exprSet[for_cox,])
e=log2(e+1)
colnames(e)=str_replace(colnames(e),"-","_")
dat=cbind(meta,e)
dat$gender=factor(dat$gender)
dat$stage=factor(dat$stage)
colnames(dat)

用survival::coxph()函数构建模型

library(survival)
library(survminer)
#s=Surv(time, event) ~ gender + stage + age + RP5_884M6.1+RP11_148K1.12+RP11_108K3.2+RP1_79C4.4+AC004510.3

# s = as.formula(paste("Surv(time, event) ~ ",
#                      paste(c(colnames(meta)[c(7,8,6)],
#                            colnames(e)),
#                            collapse = "+")
#                      )
#                )

#s=Surv(time, event) ~ RP5_884M6.1+RP11_148K1.12+RP11_108K3.2+RP1_79C4.4+AC004510.3
s = as.formula(paste("Surv(time, event) ~ ",paste(colnames(e),collapse = "+")))
model <- coxph(s, data = dat )
# 去掉不显著的基因
summary(model)$concordance
summary(model)
tmp = summary(model)$coefficients[,5]
re = names(tmp[tmp<0.05])

#s=Surv(time, event) ~ RP5_884M6.1+RP1_79C4.4+AC004510.3
s = as.formula(paste("Surv(time, event) ~ ",paste(re,collapse = "+")))

model <- coxph(s, data = dat )
summary(model)

3.模型可视化-森林图

options(scipen=1)
ggforest(model, data =dat, 
         main = "Hazard ratio", 
         cpositions = c(0.10, 0.22, 0.4), 
         fontsize = 1.0, 
         refLabel = "1", noDigits = 4)

4.模型预测

fp <- predict(model,type = "risk")
boxplot(fp)
library(Hmisc)
options(scipen=200)
with(dat,rcorr.cens(fp,Surv(time, event)))
# 若要找到最佳模型,我们可以进行变量选择,可以采用逐步回归法进行分析

这里只是举个栗子,自己预测自己的C-index是1-with(dat,rcorr.cens(fp,Surv(time, event)))[[1]],实战应该是拿另一个数据集来预测,或者将一个数据集分两半,一半构建模型,一半验证,可以使用机器学习的R包切割数据。

C-index用于计算生存分析中的COX模型预测值与真实之间的区分度(discrimination),也称为Harrell's concordanceindex。C-index在0.5-1之间。0.5为完全不一致,说明该模型没有预测作用,1为完全一致,说明该模型预测结果与实际完全一致。

5.划分高低风险并画生存分析图

names(fp) = rownames(dat)
ri = ifelse(fp<median(fp),"lowrisk","highrisk")
names(ri) = names(fp)
ri = factor(ri)

sfit <- survfit(Surv(time, event)~ri, data=meta)
ggsurvplot(sfit, conf.int=F, pval=TRUE)

ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"),
           risk.table =TRUE,pval =TRUE,
           conf.int =TRUE,xlab ="Time in months", 
           ggtheme =theme_light(), 
           ncensor.plot = TRUE)
fp_dat=data.frame(patientid=1:length(fp),fp=as.numeric(sort(fp)))
sur_dat=data.frame(patientid=1:length(fp),
                   time=meta[names(sort(fp)),'time'] ,
                   event=meta[names(sort(fp )),'event']  ) 
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),11:ncol(dat)]
###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+geom_point()+theme_bw();p1
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+geom_point(aes(col=event))+theme_bw();p2
#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat[,re]))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
p3=pheatmap::pheatmap(tmp,
                      col= mycolors,
                      show_colnames = F,
                      cluster_cols = F)
#拼图实现三图联动
library(ggplotify)
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7),NA),
             c(rep(2,7)),
             c(rep(3,7))) #布局矩阵
grid.arrange(grobs = plots, 
             layout_matrix = lay1,
             heigths = c(1, 2,3))

按照预测值划分高低风险,加上注释

fp_dat=data.frame(patientid=1:length(fp),
                  fp=as.numeric(sort(fp)),
                  ri= ri[order(fp)])
sur_dat=data.frame(patientid=1:length(fp),
                   time=meta[names(sort(fp)),'time'] ,
                   event=meta[names(sort(fp )),'event']  ) 
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),11:ncol(dat)]

###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+
  geom_point(aes(color = ri))+
  scale_color_manual(values = c("red","blue"))+
  theme_bw();p1
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+
  geom_point(aes(col=event))+
  scale_color_manual(values = c("red","blue"))+
  theme_bw();p2
#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat[,re]))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
annotation_col = data.frame(group = ri,
                            row.names = names(ri))
ann_colors = list(
    group = c(lowrisk="blue", highrisk="red")
)

p3=pheatmap::pheatmap(tmp,
                      col= mycolors,
                      show_colnames = F,
                      cluster_cols = F,
                      annotation_col = annotation_col,
                      annotation_colors =ann_colors,
                      annotation_legend = F)
#拼图实现三图联动
library(ggplotify)
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7)),
             c(rep(2,7)),
             c(rep(3,7))) #布局矩阵
grid.arrange(grobs = plots, 
             layout_matrix = lay1,
             heigths = c(1,2,3))
三图联动

最后整理一下文章思路:

思路:由小洁老师整理
上一篇下一篇

猜你喜欢

热点阅读