TCGA数据挖掘task参考TCGA-流程

TCGA学习04:建模预测-lasso回归

2020-05-08  本文已影响0人  小贝学生信

法2:lasso回归

1、挑选合适模型
# Load the previously saved workspace; the code below uses the objects
# `exp_tumor` and `meta` from it.
load("tosur.RData")
exprSet <- exp_tumor  # reuse the variable name from the instructor's code
# Abort if the expression columns and the clinical rows are not in the same
# order (compared on the first 12 characters of each ID); otherwise x and y
# would be silently misaligned.
stopifnot(identical(substring(colnames(exprSet), 1, 12),
                    substring(rownames(meta), 1, 12)))
# Transpose so that rows are samples and columns are genes
x <- t(log2(exprSet + 1))
y <- meta$event
library(glmnet)
# NOTE(review): no `family` is passed, so glmnet uses its default
# ("gaussian"); if `event` is a 0/1 indicator, confirm whether
# family = "binomial" is what is intended.
model_lasso <- glmnet(x, y, nlambda = 10, alpha = 1)
print(model_lasso)

1、 nlambda参数表示沿着λ路径拟合nlambda个模型,数量多一些有助于更准确地挑出最合适的模型,默认值为100;alpha=1表示lasso回归(alpha=0则为岭回归)。
2、如下图返回的三列值,即为所建的10个模型的情况

(图:print(model_lasso) 的输出结果)

一般选取%Dev值高,同时Df自由度较小的值,可如下作图选取合适的模型

# Fix the RNG seed first: cv.glmnet assigns samples to folds at random, so
# without a seed lambda.min / lambda.1se can change from run to run.
set.seed(12345679)
# Use a dense path of 1000 lambda values for a finer search
cv_fit <- cv.glmnet(x = x, y = y, nlambda = 1000, alpha = 1)
plot(cv_fit)
(图:plot(cv_fit) 的交叉验证曲线)

如上图两条虚线分别指示了两个特殊的λ值,一个是lambda.min,一个是lambda.1se,这两个值之间的lambda都认为是合适的。lambda.1se构建的模型最简单,即使用的基因数量少,而lambda.min则准确率更高一点,使用的基因数量更多一点。


lambda.1se与lambda.min参数
2、确定合适模型
# Refit one lasso model at each of the two cross-validated lambda values
model_lasso_min <- glmnet(
  x = x, y = y,
  alpha = 1, lambda = cv_fit$lambda.min
)
model_lasso_1se <- glmnet(
  x = x, y = y,
  alpha = 1, lambda = cv_fit$lambda.1se
)
# Keep the names of the genes whose coefficient is non-zero in each model
choose_gene_min <- rownames(model_lasso_min$beta)[
  which(as.numeric(model_lasso_min$beta) != 0)
]
choose_gene_1se <- rownames(model_lasso_1se$beta)[
  which(as.numeric(model_lasso_1se$beta) != 0)
]
length(choose_gene_min)  # 70 genes in the author's run
length(choose_gene_1se)  # 40 genes in the author's run
3、看看模型预测效果如何
# Predict on the same matrix the model was fitted on, at both lambdas.
# The result has one row per sample and one column per requested lambda
# (lambda.min first, lambda.1se second).
lambda_pair <- c(cv_fit$lambda.min, cv_fit$lambda.1se)
lasso.prob <- predict(cv_fit, newx = x, s = lambda_pair)

# Put the observed outcome next to the two prediction columns
re <- cbind(y, lasso.prob)
head(re)
(图:head(re) 的输出结果)
结果可视化-1、箱线图
# Convert the matrix to a data frame and give the columns descriptive names
re <- setNames(as.data.frame(re), c("event", "prob_min", "prob_1se"))
# The outcome must be a factor so it can serve as the grouping variable
re$event <- as.factor(re$event)
library(ggpubr)
# Box plot of the lambda.min prediction grouped by event, with jittered
# points and the comparison added by stat_compare_means()
p1 <- ggboxplot(
  re,
  x = "event", y = "prob_min",
  color = "event", palette = "jco", add = "jitter"
) +
  stat_compare_means()
p1
结果可视化-2、ROC曲线

ROC曲线是评价模型时很重要的一个概念

但是,我们还要注意一个问题

如下利用R包绘制

library(ROCR)
# Pair the lambda.min predictions with the observed event labels
pred_min <- prediction(re[["prob_min"]], re[["event"]])
# ROC coordinates: true-positive rate against false-positive rate
perf_min <- performance(pred_min, "tpr", "fpr")
# Area under the ROC curve
auc_min <- performance(pred_min, "auc")@y.values[[1]]
# Draw the ROC curve
plot(perf_min, colorize = FALSE, col = "blue")
# Reference diagonal y = x
segments(0, 0, 1, 1, col = "gray", lty = 4)
# Annotate the plot with the AUC value
text(0.8, 0.2, labels = paste0("AUC = ", round(auc_min, 3)))
# Self-contained recap: refit both models, predict on the training matrix and
# compare the ROC curves of the lambda.min and lambda.1se predictions.
model_lasso_min <- glmnet(x = x, y = y, alpha = 1, lambda = cv_fit$lambda.min)
model_lasso_1se <- glmnet(x = x, y = y, alpha = 1, lambda = cv_fit$lambda.1se)
lasso.prob <- predict(cv_fit, newx = x,
                      s = c(cv_fit$lambda.min, cv_fit$lambda.1se))
re <- cbind(y, lasso.prob)
re <- as.data.frame(re)
colnames(re) <- c("event", "prob_min", "prob_1se")
re$event <- as.factor(re$event)

# ROC coordinates and AUC for the lambda.min predictions (column 2)
pred_min <- prediction(re[, 2], re[, 1])
perf_min <- performance(pred_min, "tpr", "fpr")
auc_min <- performance(pred_min, "auc")@y.values[[1]]

# ROC coordinates and AUC for the lambda.1se predictions (column 3)
pred_1se <- prediction(re[, 3], re[, 1])
perf_1se <- performance(pred_1se, "tpr", "fpr")
auc_1se <- performance(pred_1se, "auc")@y.values[[1]]

plot(perf_min, colorize = FALSE, col = "blue")
# Overlay the second curve on the same axes; TRUE is spelled out because `T`
# is an ordinary variable that can be reassigned.
plot(perf_1se, colorize = FALSE, col = "red", add = TRUE)
# Reference diagonal y = x
lines(c(0, 1), c(0, 1), col = "gray", lty = 4)
text(0.8, 0.3, labels = paste0("AUC.min = ", round(auc_min, 3)), col = "blue")
# Label corrected from "AUC.lse= " to match the lambda.1se naming
text(0.8, 0.2, labels = paste0("AUC.1se = ", round(auc_1se, 3)), col = "red")
比较两模型ROC曲线
4、训练集与验证集

一般建模分析时,会将数据分为两部分:一部分为训练集,一部分为验证集。先根据训练集建模,再用该模型预测验证集,看看效果如何,这样比较客观。过程基本同上,只是一开始划分数据时要注意,一般按7:3或5:5划分。

# Reload the saved workspace; `exp_tumor` and `meta` are used below.
load("tosur.RData")
exprSet <- exp_tumor
library(caret)
set.seed(12345679)
# Pass the outcome as a factor so that the 70/30 split is sampled within each
# event class. For a numeric outcome createDataPartition bins by percentiles
# instead, which does not balance a 0/1 indicator between the two sets.
# NOTE(review): assumes meta$event is a two-level indicator -- confirm.
sam <- createDataPartition(as.factor(meta$event), p = 0.7, list = FALSE)
head(sam)

# Columns of exprSet and rows of meta are indexed by the same positions
train <- exprSet[, sam]
test <- exprSet[, -sam]
train_meta <- meta[sam, ]
test_meta <- meta[-sam, ]

# Build the design matrix (samples in rows) and the outcome from the
# training portion only
x <- t(log2(train + 1))
y <- train_meta$event
cv_fit <- cv.glmnet(x = x, y = y, nlambda = 1000, alpha = 1)
plot(cv_fit)
model_lasso_min <- glmnet(
  x = x, y = y,
  alpha = 1, lambda = cv_fit$lambda.min
)
model_lasso_1se <- glmnet(
  x = x, y = y,
  alpha = 1, lambda = cv_fit$lambda.1se
)
# Unlike the earlier in-sample prediction, newx is now the held-out test
# matrix, transformed the same way as the training matrix
x_test <- t(log2(test + 1))
lasso.prob <- predict(
  cv_fit,
  newx = x_test,
  s = c(cv_fit$lambda.min, cv_fit$lambda.1se)
)
# Observed test outcome next to the two prediction columns
re <- cbind(test_meta$event, lasso.prob)
head(re)
re <- setNames(as.data.frame(re), c("event", "prob_min", "prob_1se"))
re$event <- as.factor(re$event)
library(ggpubr)
library(patchwork)
# Box plot of one prediction column of `re` grouped by event, with jittered
# points and the comparison added by stat_compare_means()
box_by_event <- function(score_col) {
  ggboxplot(
    re,
    x = "event", y = score_col,
    color = "event", palette = "jco", add = "jitter"
  ) +
    stat_compare_means()
}
p1 <- box_by_event("prob_min")
p2 <- box_by_event("prob_1se")
# patchwork places the two plots side by side
p1 + p2

library(ROCR)
# lambda.min predictions (column 2) against the observed event (column 1)
pred_min <- prediction(re[, 2], re[, 1])
auc_min <- performance(pred_min, "auc")@y.values[[1]]
perf_min <- performance(pred_min, "tpr", "fpr")
# lambda.1se predictions (column 3)
pred_1se <- prediction(re[, 3], re[, 1])
auc_1se <- performance(pred_1se, "auc")@y.values[[1]]
perf_1se <- performance(pred_1se, "tpr", "fpr")

# One data frame per curve. The two ROC curves need not have the same number
# of points (e.g. when one set of predictions contains ties), and a single
# data.frame() built from all four vectors would then fail with
# "arguments imply differing number of rows".
dat_min <- data.frame(fpr = perf_min@x.values[[1]],
                      tpr = perf_min@y.values[[1]])
dat_1se <- data.frame(fpr = perf_1se@x.values[[1]],
                      tpr = perf_1se@y.values[[1]])

ggplot() +
  geom_line(data = dat_min, aes(x = fpr, y = tpr), color = "blue") +
  geom_line(data = dat_1se, aes(x = fpr, y = tpr), color = "red") +
  # Reference diagonal y = x
  geom_line(aes(x = c(0, 1), y = c(0, 1)), color = "grey") +
  theme_bw() +
  annotate("text", x = .75, y = .25,
           label = paste("AUC of min = ", round(auc_min, 2)), color = "blue") +
  annotate("text", x = .75, y = .15,
           label = paste("AUC of 1se = ", round(auc_1se, 2)), color = "red") +
  scale_x_continuous(name = "fpr") +
  scale_y_continuous(name = "tpr")

参考资料
1、TCGA数据分析流程梳理总结(含目录) - 简书
2、ROC曲线-阈值评价标准_人工智能_Rachel Zhang的专栏-CSDN博客
3、ROC曲线和PR曲线 - 简书
4、机器学习基础(1)- ROC曲线理解 - 简书
5、【r<-ROC|包】分析与可视化ROC——plotROC、pROC - 简书

上一篇下一篇

猜你喜欢

热点阅读