R语言 生信生物信息学R语言源码

TCGA-CRC

2020-08-13  本文已影响0人  数据控的迷妹

下载数据

TCGA官网下载TCGA-COAD projiect的HTSeq-counts 和clinical的manifest文件。
然后在电脑终端用gdc-client下载数据
得到clinical文件459个,mrna文件521个。

整理临床数据

setwd("D:\\COAD")
library(XML)
result <- xmlParse("./clinical/007b27c5-d36e-4bec-9321-b03b773fd3b8/nationwidechildrens.org_clinical.TCGA-A6-6651.xml")
rootnode <- xmlRoot(result)
rootsize <- xmlSize(rootnode)
print(rootnode[1])
#print(rootnode[2])
xmldataframe <- xmlToDataFrame(rootnode[2])
head(t(xmlToDataFrame(rootnode[2])))
 #                 [,1]                  
#additional_studies ""                    
#tumor_tissue_site  "Colon"               
#histological_type  "Colon Adenocarcinoma"
#other_dx           "No"                  
#gender             "FEMALE"              
#vital_status       "Alive"

xmls = dir("clinical/",pattern = "*.xml$",recursive = T)
td = function(x){
  result <- xmlParse(file.path("clinical/",x))
  rootnode <- xmlRoot(result)
  xmldataframe <- xmlToDataFrame(rootnode[2])
  return(t(xmldataframe))
}

cl = lapply(xmls,td)
cl_df <- t(do.call(cbind,cl))
cl_df[1:3,1:3]
#     additional_studies tumor_tissue_site histological_type              
#[1,] ""                 "Colon"           "Colon Adenocarcinoma"         
#[2,] ""                 "Colon"           "Colon Adenocarcinoma"         
#[3,] ""                 "Colon"           "Colon Mucinous Adenocarcinoma"
clinical = data.frame(cl_df)
clinical[1:4,1:4]
#  additional_studies tumor_tissue_site             histological_type other_dx
#1                                Colon          Colon Adenocarcinoma       No
#2                                Colon          Colon Adenocarcinoma       No
#3                                Colon Colon Mucinous Adenocarcinoma      Yes
#4                                Colon          Colon Adenocarcinoma       No

整理表达矩阵

探索数据:任选两个counts文件读取,观察geneid的顺序是否一致

x = read.table("mrna/02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz")
x2 = read.table("mrna/0826add5-571b-41be-b186-936959fc9d79/dfeb2d54-28da-4521-b488-bdac246bb59b.htseq.counts.gz")
identical(x$V1,x2$V1)
#[1] TRUE

geneid顺序是一致的,可以直接cbind,不会导致顺序错乱。
批量读取所有的counts.gz文件。

count_files = dir("mrna/",pattern = "*.htseq.counts.gz$",recursive = T)

ex = function(x){
  result <- read.table(file.path("mrna/",x),row.names = 1,sep = "\t")
  return(result)
}
head(ex("02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz"))
#                    V2
#ENSG00000000003.13 8357
#ENSG00000000005.5    37
#ENSG00000000419.11 2278
#ENSG00000000457.12  835
#ENSG00000000460.15  697
#ENSG00000000938.11  687
exp = lapply(count_files,ex)
exp <- do.call(cbind,exp)
dim(exp)  
 #60488  521  有6万多行,521个样本
exp[1:4,1:4]
#                   V2 V2.1 V2.2 V2.3
#ENSG00000000003.13 7501 5080 4721 8357
#ENSG00000000005.5    29    6   58   37
#ENSG00000000419.11 2108 1581 2203 2278
#ENSG00000000457.12  411  796 1001  835
没有列名的数据框

这个数据框没有列名,需从TCGA官网上下载JSON文件(mrn对应的json文件!!!),将文件名与样本配对

#加入列名
meta <- jsonlite::fromJSON("metadata.cart.2020-08-09.json")
colnames(meta)
#[1] "md5sum"              "associated_entities" "file_size"           "annotations"         "access"              "data_format"         "data_category"      
# [8] "data_type"           "file_id"             "file_name"           "submitter_id"        "state"   
 ids <- meta$associated_entities;class(ids)
#[1] "list"
ids[[1]]
#                             entity_id entity_submitter_id entity_type                              case_id
#1 381cf7f3-cd7b-4660-8175-ff8062f7bf78        TCGA-AA-A00L        case 381cf7f3-cd7b-4660-8175-ff8062f7bf78
 class(ids[[1]][,2])   #并不是每一个数据都是第2列,需要自己手动检查
#[1] "character"

meta$associated_entities是个列表,这个列表里包含数据框,数据框的第一列内容就是tcga样本id。

ID = sapply(ids,function(x){x[,2]})#同样与上面对应
file2id = data.frame(file_name = meta$file_name,
                     ID = ID)

文件名与TCGA样本ID的对应关系已经得到,接下来是将其添加到表达矩阵中,成为行名。需要找到读取文件的顺序,一 一对应修改。

head(file2id$file_name)
#[1] "a8e396d2-9528-4d8d-8e22-f56be5397cc1.htseq.counts.gz" "8f9232c6-4ae9-4e3f-af06-335f107b09bc.htseq.counts.gz"
#[3] "dd9a3ab6-6c93-4b47-9b4e-62e56767054b.htseq.counts.gz" "31cf7af3-5576-4201-912d-5948fa2c0296.htseq.counts.gz"
#[5] "06c8dfda-b057-47cb-b435-9c2739694f6a.htseq.counts.gz" "d753b828-373f-4ebc-a300-598e0c6e79ba.htseq.counts.gz"
head(count_files)
#[1] "0096f6b3-94f5-49d9-aa43-66d9ab6b2b5c/17510ea6-0c97-4bd0-94a4-63dd76e5b7c7.htseq.counts.gz"
#[2] "01f5a228-a2cf-444f-8102-a9d4a2d9a27d/7324b2d6-dd30-4f99-a101-7c3d497bbcb8.htseq.counts.gz"
#[3] "02566c4e-e3d2-43eb-aca3-754c1bd5bd5f/1e50d68e-ac09-42ad-9678-bc0598328037.htseq.counts.gz"
#[4] "02734d4d-fc8f-4ef7-ac82-1b4d7184cc5e/28004569-048d-4f8c-99aa-7a8c69a98dcc.htseq.counts.gz"
#[5] "029ffdfa-c2ff-439e-be14-a7b68342fbdb/1a587b92-0a11-47f0-83c2-77a2b6de1088.htseq.counts.gz"
#[6] "02d4cebf-e7f6-4c13-aca1-af16fd6d69cb/2f6070c7-99d0-49bb-83ca-33c032650cde.htseq.counts.gz"
count_files2 = stringr::str_split(count_files,"/",simplify = T)[,2]
count_files2[1] %in% file2id$file_name
#[1] TRUE

count_files2的顺序就是列名的顺序,根据它来调整file2id的顺序。match函数需要了解一下(我懂意思,但我写不出来。)

file2id = file2id[match(count_files2,file2id$file_name),]
colnames(exp) = file2id$ID
exp[1:4,1:4]
#                   TCGA-AA-3979-01A-01R-1022-07 TCGA-AA-3655-01A-02R-1723-07 TCGA-D5-6535-01A-11R-1723-07 TCGA-CK-4950-01A-01R-1723-07
#ENSG00000000003.13                         7501                         5080                         4721                         8357
#ENSG00000000005.5                            29                            6                           58                           37
#ENSG00000000419.11                         2108                         1581                         2203                         2278
#ENSG00000000457.12                          411                          796                         1001                          835

过滤低表达样本

dim(exp)
#[1] 60488   521
exp = exp[apply(exp, 1, function(x) sum(x > 1) > 41), ]
dim(exp)
#[1] 35267   521
exp[1:4,1:4]
 #                  TCGA-AA-3979-01A-01R-1022-07 TCGA-AA-3655-01A-02R-1723-07 TCGA-D5-6535-01A-11R-1723-07 TCGA-CK-4950-01A-01R-1723-07
#ENSG00000000003.13                         7501                         5080                         4721                         8357
#ENSG00000000005.5                            29                            6                           58                           37
#ENSG00000000419.11                         2108                         1581                         2203                         2278
#ENSG00000000457.12                          411                          796                         1001                          835

分组,根据样本ID的第14-15位,给样本分组(tumor和normal),<10是肿瘤,其他是癌旁组织

table(stringr::str_sub(colnames(exp),14,15))
# 01  02  06  11 
#478   1   1  41 
 group_list = ifelse(as.numeric(substr(colnames(exp),14,15)) < 10,'tumor','normal')
 group_list = factor(group_list,levels = c("normal","tumor"))
 table(group_list)    
#group_list
#normal  tumor 
#    41    480 
save(exp,clinical,group_list,cancer_type,file = paste0(cancer_type,"gdc.Rdata"))

接下来从TCGA得到的表达矩阵中分别提取出mRNA和lncRNA矩阵。

大概路线:1.找到TCGA数据对应的参考基因组版本,2.下载该版本的参考基因组文件,找到mRNA和lncRNA对应的ensembl id 3.在表达矩阵中筛选。
下载gencode.v22.annotation.gtf.gz后解压

#step1:读取并探索gtf文件----
options(stringsAsFactors = F)
if(!file.exists("anno.Rdata"))
  BiocManager::install("rtracklayer")
  library(rtracklayer)
  x = rtracklayer::import("gencode.v22.annotation.gtf")
  class(x)
#[1] "GRanges"
#attr(,"package")
#[1] "GenomicRanges
 x2 = as.data.frame(x);dim(x2)
#[1] 2563671      27
  colnames(x2)
#  [1] "seqnames"                 "start"                    "end"                      "width"                    "strand"                  
# [6] "source"                   "type"                     "score"                    "phase"                    "gene_id"                 
#[11] "gene_type"                "gene_status"              "gene_name"                "level"                    "havana_gene"             
#[16] "transcript_id"            "transcript_type"          "transcript_status"        "transcript_name"          "tag"                     
#[21] "transcript_support_level" "havana_transcript"        "exon_number"              "exon_id"                  "ont"   #[26] "protein_id"               "ccdsid"                    
  tj = as.data.frame(table(x2$type));tj
#            Var1    Freq
#1           gene   60483
#2     transcript  198442
#3           exon 1172082
#4            CDS  699443
#5    start_codon   82228
#6     stop_codon   74337
#7            UTR  276542
#8 Selenocysteine     114
 tj2 = as.data.frame(table(x2$gene_type));tj2

#step2:先筛选出gene对应的行
  nrow(x2);x2 = x2[x2$type=="gene",];nrow(x2)
#[1] 2563671
#[1] 60483
  tj3 = as.data.frame(table(x2$gene_type));tj3

#step3:提取lnc和mRNA及其对应的ensambelid和symbol
  lnc_bype = c("3prime_overlapping_ncRNA", "antisense", "bidirectional_promoter_lncRNA", "lincRNA", "macro_lncRNA", "non_coding", "processed_transcript", "sense_intronic" , "sense_overlapping")
  table(x2$gene_type %in% lnc_bype)
#FALSE  TRUE 
#45657 14826
table(x2$gene_type == "protein_coding")
#FALSE  TRUE 
#40669 19814  
lnc_anno = x2[x2$gene_type %in% lnc_bype,c("gene_name","gene_id","gene_type")]
  mRNA_anno = x2[x2$gene_type == "protein_coding",c("gene_name","gene_id","gene_type")]
  save(lnc_anno,mRNA_anno,file = "anno.Rdata")
load("anno.Rdata")
#step4:表达矩阵拆分和注释
load("TCGA-COADgdc.Rdata")
table(rownames(exp) %in% mRNA_anno$gene_id)

#FALSE  TRUE 
#17263 18004 
mRNA_exp = exp[rownames(exp) %in% mRNA_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp))
x = dplyr::inner_join(tmp,mRNA_anno,by = "gene_id")
#inner_join不改变顺序,可以确认一下
identical(tmp$gene_id,rownames(exp))
#[1] TRUE
table(!duplicated(x$gene_name))

#FALSE  TRUE 
#   36 17968 
mRNA_exp = mRNA_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(mRNA_exp) = x$gene_name
mRNA_exp[1:4,1:4]   #这里的mRNA是所有样本521个样本的mRNA
  #     TCGA-AA-3979-01A-01R-1022-07 TCGA-AA-3655-01A-02R-1723-07 TCGA-D5-6535-01A-11R-1723-07 TCGA-CK-4950-01A-01R-1723-07
#TSPAN6                         7501                         5080                         4721                         8357
#TNMD                             29                            6                           58                           37
#DPM1                           2108                         1581                         2203                         2278
#SCYL3                           411                          796                         1001                          835
mRNA_exp = na.omit(mRNA_exp)
#同样办法拆出lncRNA:也是所有样本的(521个样本)
lnc_exp = exp[rownames(exp) %in% lnc_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp))
x = dplyr::inner_join(tmp,lnc_anno,by = "gene_id")
identical(tmp$gene_id,rownames(exp))
#[1] TRUE
table(!duplicated(x$gene_name))
#FALSE  TRUE 
#    4  8714 
lnc_exp = lnc_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(lnc_exp) = x$gene_name
lnc_exp[1:4,1:4]
#                TCGA-AA-3979-01A-01R-1022-07 TCGA-AA-3655-01A-02R-1723-07 TCGA-D5-6535-01A-11R-1723-07 TCGA-CK-4950-01A-01R-1723-07
#LINC01587                                  0                            0                            1                            0
#AC000111.6                                 1                           18                            9                            2
#XXbac-B461K10.4                            3                            3                            9                            2
#IGF2-AS                                    1                            2                            5                            3
lnc_exp = na.omit(lnc_exp)

配对样本的lnc、mRNA 表达矩阵,供差异分析用

挑选样本,是选列,与行(基因)无关

1.选出与normal样本匹配的tumor样本,合并为新的表达矩阵

 exp2 = rbind(mRNA_exp,lnc_exp) #mRNA和lncRNA合并,不改变顺序
identical(rownames(mRNA_exp),head(rownames(exp2),nrow(mRNA_exp)))
#[1] TRUE
table(stringr::str_sub(colnames(exp2),14,15)) #正常样本个数?41个大于10的那就是41个正常样本
# 01  02  06  11 
#478   1   1  41 
library(stringr)
exp2_nor = exp2[,str_sub(colnames(exp2),14,15)==11]
exp2_tum = exp2[,str_sub(colnames(exp2),14,15)!=11]
patient = str_sub(colnames(exp2_nor),1,12) #有配对样本的病人id
table(str_sub(colnames(exp2_tum),1,12) %in% str_sub(patient)) 
#FALSE  TRUE 
  #434    46 
# 看看肿瘤样本有重复吗:
exp2_tum = exp2_tum[,str_sub(colnames(exp2_tum),1,12) %in% str_sub(patient)]
exp2_tum = exp2_tum[,str_sort(colnames(exp2_tum))] #str_sort是排序
exp2_tum = exp2_tum[,!duplicated(str_sub(colnames(exp2_tum),1,12) )] #排序去重,这样样本有01A和01B可选时,就会留下01A。
#让正常样本顺序与肿瘤样本顺序一致,这样才方便写配对向量
tmp = match(str_sub(colnames(exp2_tum),1,12),str_sub(colnames(exp2_nor),1,12))
exp2_nor = exp2_nor[,tmp]
identical(str_sub(colnames(exp2_tum),1,12),str_sub(colnames(exp2_nor),1,12))
#TRUE
exp2_small = cbind(exp2_nor,exp2_tum)

拆分回lnc和mRNA

mRNA_exp_small = head(exp2_small,nrow(mRNA_exp))
lnc_exp_small = tail(exp2_small,nrow(lnc_exp))

总共得到了6个表达矩阵

dim(exp) #原始ensambel矩阵   35267  521
dim(exp2) #lncRNA和mRNA矩阵合并得到的symbol矩阵   26682   521
dim(lnc_exp) #lncRNA   8714   521
dim(mRNA_exp) #mRNA     17968   521
dim(lnc_exp_small) #配对的lncRNA矩阵   8714   82
dim(mRNA_exp_small)#配对的mRNA矩阵   17968  82
save(lnc_exp,mRNA_exp,file = paste0(cancer_type,"lmexp.Rdata"))
save(lnc_exp_small,mRNA_exp_small,file = paste0(cancer_type,"lmexp_small.Rdata"))

终于来到差异分析了!

if(!require(stringr))install.packages('stringr')
if(!require(ggplotify))install.packages("ggplotify")
if(!require(patchwork))install.packages("patchwork")
if(!require(cowplot))install.packages("cowplot")
if(!require(edgeR))BiocManager::install('edgeR')

用edgeR 进行lnc_RNA差异分析

rm(list = ls())
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp_small.Rdata")
group_list = rep(c("normal","tumor"),each = ncol(lnc_exp_small)/2)
pairinfo = rep(1:(ncol(lnc_exp_small)/2),times = 2)
table(group_list)
#group_list
#normal  tumor 
#    41     41 
library(edgeR)
dge <- DGEList(counts=lnc_exp_small,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 
design <- model.matrix(~pairinfo+group_list)
dge <- estimateDisp(dge,design)

fit <- glmQLFit(dge, design)
fit2 <- glmQLFTest(fit)
result = topTags(fit2)

DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1.5
DEG$change = as.factor(
  ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
         ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
 head(DEG)
#                 logFC    logCPM        F       PValue          FDR change
#ELFN1-AS1     5.176298  9.166588 356.4789 3.946428e-33 2.711619e-29     UP
#RP11-474D1.3  9.110774 10.067577 352.0110 6.223591e-33 2.711619e-29     UP
#RP11-138J23.1 7.056960  5.148157 334.7920 3.764552e-32 1.093477e-28     UP
#MAFG-AS1      2.853802  8.371429 320.2209 1.829845e-31 3.986317e-28     UP
#RP11-474D1.4  7.542251  5.577248 271.9532 5.318818e-29 9.269636e-26     UP
#PVT1          2.464724  9.594802 267.0938 9.814251e-29 1.425356e-25     UP
table(DEG$change)
#DOWN  NOT   UP 
# 685 7022 1007 
lncRNA_DEG <- DEG
cg = rownames(DEG)[DEG$change!="NOT"]
cgexp = lnc_exp_small[cg,]
table(head(DEG[cg,"change"],580))

#DOWN  NOT   UP 
# 265    0  315 
if(!require(devtools))install.packages("devtools")
library("devtools")
if(!require(tinyplanet))devtools::install_github("xjsun1221/tinyplanet",upgrade = F)#小洁老师的R包,需从github上面安装
library("tinyplanet")
library("ggplot2")
lnc_exp_small[1:4,1:4]
#                TCGA-A6-2671-11A-01R-A32Z-07 TCGA-A6-2675-11A-01R-1723-07 TCGA-A6-2678-11A-01R-A32Z-07 TCGA-A6-2679-11A-01R-A32Z-07
#LINC01587                                  0                            1                            3                            0
#AC000111.6                                 1                            2                            4                            1
#XXbac-B461K10.4                            1                           20                            3                            4
#IGF2-AS                                    0                            2                            2                            0
#PCA
dat = log(lnc_exp_small+1)
pca.plot = draw_pca(dat,group_list);pca.plot
#热图火山图
x = lnc_exp_small[cg,]
library(tinyplanet)
h = draw_heatmap(x,group_list,scale_before = T)
v = draw_volcano(lncRNA_DEG,logFC_cutoff = 1.5,pkg = 2)
save(cg,file = paste0(cancer_type,"lnccg.Rdata"))
lncrna PCA
lncRNA 热图

接下来是mRNA的差异分析

rm(list = ls())
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp_small.Rdata")
group_list = rep(c("normal","tumor"),each = ncol(mRNA_exp_small)/2)
pairinfo = rep(1:41,times = 2)
table(group_list)
#group_list
#normal  tumor 
#    41     41 

library(edgeR)
dge <- DGEList(counts=mRNA_exp_small,group=group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge) 
design <- model.matrix(~pairinfo+group_list)
dge <- estimateDisp(dge,design)

fit <- glmQLFit(dge, design)
fit2 <- glmQLFTest(fit)
result = topTags(fit2)

DEG=topTags(fit2, n=nrow(exp))
DEG=as.data.frame(DEG)
#logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1.5
DEG$change = as.factor(
  ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
         ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
)
 head(DEG)
#         logFC   logCPM        F       PValue          FDR change
#AJUBA 3.033725 3.826374 615.1156 3.157165e-40 3.984587e-36     UP
#KRT80 6.801211 4.460591 609.4493 4.435203e-40 3.984587e-36     UP
#ETV4  5.383569 5.963017 560.2076 9.619414e-39 5.761388e-35     UP
#CEMIP 5.292491 6.300332 546.5535 2.354213e-38 1.057512e-34     UP
#ESM1  5.521886 1.697701 478.7325 2.734995e-36 9.828479e-33     UP
#CLDN1 4.649788 5.536313 450.9748 2.265810e-35 6.785344e-32     UP
table(DEG$change)
# DOWN   NOT    UP 
# 1611 14826  1531 
lncRNA_DEG <- DEG
cg = rownames(DEG)[DEG$change!="NOT"]
cgexp = mRNA_exp_small[cg,]
table(head(DEG[cg,"change"],580))
#DOWN  NOT   UP 
 #272    0  308 

library("ggplot2")
library("tinyplanet")
mRNA_exp_small[1:4,1:4]
#       TCGA-A6-2671-11A-01R-A32Z-07 TCGA-A6-2675-11A-01R-1723-07 TCGA-A6-2678-11A-01R-A32Z-07 TCGA-A6-2679-11A-01R-A32Z-07
#TSPAN6                         5884                         2731                         7881                         4593
#TNMD                             28                           15                           61                           52
#DPM1                           1355                          912                         1453                         1148
#SCYL3                           895                          527                         1141                          825
#PCA
dat = log(mRNA_exp_small+1)
pca.plot = draw_pca(dat,group_list);pca.plot
#热图火山图
x = mRNA_exp_small[cg,]
library(tinyplanet)
h = draw_heatmap(x,group_list,scale_before = T)
v = draw_volcano(lncRNA_DEG,logFC_cutoff = 1.5,pkg = 2)
save(cg,file = paste0(cancer_type,"mRNAcg.Rdata"))
mRNA- PCA
mRNA 热图

火山图没画出来,所以我接着mrna的尝试了一下

#版本一
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                    '\nThe number of up gene is ',nrow(lncRNA_DEG[lncRNA_DEG$change =='UP',]) ,
                    '\nThe number of down gene is ',nrow(lncRNA_DEG[lncRNA_DEG$change =='DOWN',])
)

g = ggplot(data=lncRNA_DEG, 
           aes(x=logFC, y=-log10(PValue), 
               color=change)) +
  geom_point(alpha=0.4, size=1.75) +
  theme_set(theme_set(theme_bw(base_size=20)))+
  xlab("log2 fold change") + ylab("-log10 p-value") +
  ggtitle( this_tile ) + theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red')) ## corresponding to the levels(res$change)

print(g)
ggsave(g,filename = paste0(3,'_volcano.png'))
dev.off()
版本二

#重点关注的基因
lncRNA_DEG$sign <- ifelse(lncRNA_DEG$PValue < 0.001 & abs(lncRNA_DEG$logFC) > 8,#可以改
                     rownames(lncRNA_DEG),
                     NA)
table(lncRNA_DEG$sign)

ggplot(data= lncRNA_DEG, aes(x = logFC, y = -log10(PValue), color = change)) +
  geom_point(alpha=0.8, size = 1) +
  theme_bw(base_size = 15) +
  theme(plot.title=element_text(hjust=0.5),   #  标题居中
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank()) + # 网格线设置为空白
  geom_hline(yintercept=2 ,linetype=4) +
  geom_vline(xintercept=c(-2,2) ,linetype=4 ) +
  scale_color_manual(name = "", 
                     values = c("red", "green", "black"),
                     limits = c("UP", "DOWN", "NOT")) +
  geom_label_repel(aes(label=sign), # 防止标签过多重叠
                   fontface="bold",
                   color="grey50",
                   box.padding=unit(0.35, "lines"),  # 文本框周边填充
                   point.padding=unit(0.5, "lines"), # 点周边填充
                   segment.colour = "grey50", # 连接点与标签的线段的颜色
                   force = T) + 
  labs(title = 'COAD DEG volcano')
#若不想标出基因就直接这样
ggplot(data= lncRNA_DEG, aes(x = logFC, y = -log10(PValue), color = change)) +
  geom_point(alpha=0.8, size = 1) +
  theme_bw(base_size = 15) +
  theme(plot.title=element_text(hjust=0.5),   #  标题居中
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank()) + # 网格线设置为空白
  geom_hline(yintercept=2 ,linetype=4) +
  geom_vline(xintercept=c(-2,2) ,linetype=4 ) +
  scale_color_manual(name = "", 
                     values = c("red", "green", "black"),
                     limits = c("UP", "DOWN", "NOT")) +
  labs(title = 'COAD DEG volcano')
版本二mRNA火山图

轮到生存分析了

生存分析只需要tumor数据,将normal去掉,新表达矩阵数据命名为exprSet;
clinical信息需要进一步整理,成为生存分析需要的格式,新临床信息数据命名为meta。
由于不同癌症的临床信息表格组织形式不同,这里的代码需要根据实际情况修改。

rm(list=ls())
options(stringsAsFactors = F)
load("TCGA-COADgdc.Rdata")
load("TCGA-COADlmexp.Rdata")
load("TCGA-COADlnccg.Rdata")
library(stringr)

#clinical通常有几十列,选出其中有用的几列。
tmp = data.frame(colnames(clinical))
clinical = clinical[,c(
  'bcr_patient_barcode',
  'vital_status',
  'days_to_death',
  'days_to_last_followup',
  'race_list',
  'age_at_initial_pathologic_diagnosis',
  'gender' ,
  'stage_event'
)]
dim(clinical)
#[1] 459   8
#rownames(clinical) <- NULL
rownames(clinical) <- clinical$bcr_patient_barcode
clinical[1:4,1:4]
#                  bcr_patient_barcode vital_status days_to_death days_to_last_followup
#TCGA-AA-3518        TCGA-AA-3518        Alive                                  31
#TCGA-A6-6651        TCGA-A6-6651        Alive                                 249
#TCGA-DM-A285        TCGA-DM-A285         Dead                      179                      
#TCGA-AA-3495        TCGA-AA-3495        Alive                                  31
meta = clinical[clinical$days_to_death != 0,]

#简化meta的列名
colnames(meta)=c('ID','event','death','last_followup','race','age','gender','stage')

#表达矩阵处理
exprSet=lnc_exp[cg,group_list=='tumor']
table(str_sub(colnames(exprSet),1,12) %in% meta$ID)
#FALSE  TRUE 
#    7   473 
exprSet = exprSet[,str_sub(colnames(exprSet),1,12) %in% meta$ID]
exprSet = exprSet[,str_sort(colnames(exprSet))]
exprSet = exprSet[,!duplicated(str_sub(colnames(exprSet),1,12) )]

#调整meta的ID顺序与exprSet列名一致
meta=meta[match(str_sub(colnames(exprSet),1,12),meta$ID),]
all(meta$ID==str_sub(colnames(exprSet),1,12))
#[1] TRUE
#整理生存分析的输入数据----

#1.1由随访时间和死亡时间计算生存时间(月)
is.empty.chr = function(x){
  ifelse(stringr::str_length(x)==0,T,F)
}
is.empty.chr(meta[2,3])
meta[,3][is.empty.chr(meta[,3])]=0
meta[,4][is.empty.chr(meta[,4])]=0
meta$time=(as.numeric(meta[,3])+as.numeric(meta[,4]))/30
#1.2 根据生死定义event,活着是0,死的是1
meta$event=ifelse(meta$event=='Alive',0,1)
#1.3 年龄和年龄分组
meta$age=as.integer(meta$age)
meta$age_group=ifelse(meta$age>median(meta$age),'older','younger')

#1.4 stage 
library(stringr) 
meta$stage
tmp = str_split(meta$stage,' ',simplify = T)[,2]
str_count(tmp,"T")
str_locate(tmp,"T")[,1]
tmp = str_sub(tmp,1,str_locate(tmp,"T")[,1]-1)
table(tmp)
table(is.na(tmp))
#FALSE  TRUE 
#  438    11 
meta$stage = tmp

#1.5 race
table(meta$race)
save(meta,exprSet,group_list,cancer_type,file = paste0(cancer_type,"sur_model.Rdata"))
image.png

下面就是生存分析的可视化:logrank批量生存分析和cox批量生存分析

1.logrank批量生存分析

对每个基因(lncRNA)求了p值,结果结果是10个基因的p<0.01,72个基因的p<0.05。

logrankfile = paste0(cancer_type,"log_rank_p.Rdata")
library(survival)
library(survminer)
if(!file.exists(logrankfile)){
  mySurv=with(meta,Surv(time, event))
  log_rank_p <- apply(exprSet , 1 , function(gene){
    # gene=exprSet[1,]
    meta$group=ifelse(gene>median(gene),'high','low')  
    data.survdiff=survdiff(mySurv~group,data=meta)
    p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1)
    return(p.val)
  })
  log_rank_p=sort(log_rank_p)
  save(log_rank_p,file = logrankfile)
}
load(logrankfile)
table(log_rank_p<0.01) 

#FALSE  TRUE 
# 1682    10 
table(log_rank_p<0.05) 
#FALSE  TRUE 
# 1620    72 
2.cox批量生存分析
coxfile = paste0(cancer_type,"cox.Rdata")
if(!file.exists(coxfile)){
  mySurv=with(meta,Surv(time, event))
  cox_results <-apply(exprSet , 1 , function(gene){
    # gene= exprSet[1,]
    group=ifelse(gene>median(gene),'high','low') 
    survival_dat <- data.frame(group=group,stage=meta$stage,age=meta$age,
                               gender=meta$gender,
                               stringsAsFactors = F)
    m=coxph(mySurv ~ gender + age + stage + group, data =  survival_dat)
    
    beta <- coef(m)
    se <- sqrt(diag(vcov(m)))
    HR <- exp(beta)
    HRse <- HR * se
    
    #summary(m)
    tmp <- round(cbind(coef = beta, se = se, z = beta/se, p = 1 - pchisq((beta/se)^2, 1),
                       HR = HR, HRse = HRse,
                       HRz = (HR - 1) / HRse, HRp = 1 - pchisq(((HR - 1)/HRse)^2, 1),
                       HRCILL = exp(beta - qnorm(.975, 0, 1) * se),
                       HRCIUL = exp(beta + qnorm(.975, 0, 1) * se)), 3)
    return(tmp['grouplow',])
    
  })
  cox_results=t(cox_results)
  save(cox_results,file = coxfile)
}
load(coxfile)
table(cox_results[,4]<0.01)
#FALSE  TRUE 
 #1675    17
table(cox_results[,4]<0.05)
#FALSE  TRUE 
# 1611    81
lr = names(log_rank_p)[log_rank_p<0.05]
cox = rownames(cox_results)[cox_results[,4]<0.05]
length(intersect(lr,cox))
save(cox,lr,file = paste0(cancer_type,"cox_lr.Rdata"))

结果:17个基因的p<0.01,81个基因的p<0.05。

Lasso回归

1.载入数据
load("TCGA-COADsur_model.Rdata")
#load("TCGA-CHOLsur_model.Rdata")
load("TCGA-COADcox_lr.Rdata")
exprSet = exprSet[cox,]
2.构建lasso回归模型

lasso回归的输入数据是表达矩阵(仅含tumor样本)和对应的生死。

x=t(log2(exprSet+1))
y=meta$event
install.packages("glmnet")
library("glmnet")
model_lasso <- glmnet(x, y,nlambda=10, alpha=1)
print(model_lasso)
#:  glmnet(x = x, y = y, alpha = 1, nlambda = 10) 

#   Df  %Dev   Lambda
#1   0  0.00 0.073980
#2  18 12.22 0.026590
#3  39 25.53 0.009555
#4  61 30.80 0.003434
#5  76 32.15 0.001234
#6  81 32.42 0.000444
#7  81 32.45 0.000159
#8  81 32.46 0.000057
#9  81 32.46 0.000021
#10 81 32.46 0.000007

这里是举例子,所以只计算了10个λ值,输出结果三列的含义如下:

2.1挑选合适的λ值

计算1000个,画图,筛选表现最好的λ值

set.seed(13098)
cv_fit <- cv.glmnet(x=x, y=y, nlambda = 1000,alpha = 1)
plot(cv_fit)

纵坐标市MSE(均方误差),两条虚线分别指示了两个特殊的λ值,一个是lambda.min,一个是lambda.1se,这两个值之间的lambda都认为是合适的。lambda.1se构建的模型最简单,即使用的基因数量少,而lambda.min则准确率更高一点,使用的基因数量更多一点。

2.2 用这两个λ值重新建模
model_lasso_min <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.min)
model_lasso_1se <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)

这两个值体现在参数lambda上。有了模型,可以将筛选的基因挑出来了。所有基因存放于模型的子集beta中,用到的基因有一个s0值,没用的基因只记录了“.”,所以可以用下面代码挑出用到的基因。

head(model_lasso_min$beta)
#6 x 1 sparse Matrix of class "dgCMatrix"
#                       s0
#RP11-474D1.3  .          
#RP5-884M6.1   0.010229910
#RP11-167H9.4  .          
#AC007182.6    0.004850337
#RP11-278H7.4  .          
#RP11-109M17.2 .          
 choose_gene_min=rownames(model_lasso_min$beta)[as.numeric(model_lasso_min$beta)!=0]
choose_gene_1se=rownames(model_lasso_1se$beta)[as.numeric(model_lasso_1se$beta)!=0]
length(choose_gene_min)
#[1] 34
length(choose_gene_1se)
#[1] 1
load("TCGA-COADcox_lr.Rdata")
for_cox = intersect(choose_gene_min,lr)
save(for_cox,choose_gene_min,file = "for_cox.Rdata")

3.模型预测和评估

3.1自己预测自己

newx参数是预测对象。输出结果lasso.prob是一个矩阵,第一列是min的预测结果,第二列是1se的预测结果,预测结果是概率,或者说百分比,不是绝对的0和1。
将每个样本的生死和预测结果放在一起,直接cbind即可。

lasso.prob <- predict(cv_fit, newx=x , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(y ,lasso.prob)
head(re)
#                             y          1         2
#TCGA-3L-AA1B-01A-11R-A37K-07 0 0.10521478 0.1158129
#TCGA-4N-A93T-01A-11R-A37K-07 0 0.38231417 0.1158129
#TCGA-4T-AA8H-01A-11R-A41B-07 0 0.09498643 0.1158129
#TCGA-5M-AAT4-01A-11R-A41B-07 1 0.37359793 0.1158129
#TCGA-5M-AAT6-01A-11R-A41B-07 1 0.50234844 0.1158129
#TCGA-5M-AATE-01A-11R-A41B-07 0 0.24833076 0.1158129
3.2 箱线图

对预测结果进行可视化。以实际的生死作为分组,画箱线图整体上查看预测结果。

re=as.data.frame(re)
colnames(re)=c('event','prob_min','prob_1se')
re$event=as.factor(re$event)
library(ggpubr) 
p1 = ggboxplot(re, x = "event", y = "prob_min",
               color = "event", palette = "jco",
               add = "jitter")+ stat_compare_means()
p2 = ggboxplot(re, x = "event", y = "prob_1se",
               color = "event", palette = "jco",
               add = "jitter")+ stat_compare_means()
library(patchwork)
p1+p2
箱线图

可以看到,真实结果是死(0)的样本,预测的值就小一点(靠近0),真实结果是活着(1)的样本,预测的值就大一点(靠近1),整体上趋势是对的,但不是完全准确,模型是可用的。
对比两个λ值构建的模型,1se的箱线图有点奇怪,后面再探究了

3.3 ROC曲线

计算AUC取值范围在0.5-1之间,越接近于1越好。可以根据预测结果绘制ROC曲线。

install.packages("ROCR")
install.packages("caret")
library(ROCR)
library(caret)
# 自己预测自己
#min
pred_min <- prediction(re[,2], re[,1])
auc_min = performance(pred_min,"auc")@y.values[[1]]
perf_min <- performance(pred_min,"tpr","fpr")
plot(perf_min,colorize=FALSE, col="blue") 
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_min,3)))
min-ROC
#1se
pred_1se <- prediction(re[,3], re[,1])
auc_1se = performance(pred_1se,"auc")@y.values[[1]]
perf_1se <- performance(pred_1se,"tpr","fpr")
plot(perf_1se,colorize=FALSE, col="red") 
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
text(0.8,0.2, labels = paste0("AUC = ",round(auc_1se,3)))
1se-ROC

为什么一样的数据做出的图不一样?

cox-风险森林图

1.载入数据

load("TCGA-COADsur_model.Rdata")
load("for_cox.Rdata")
library(stringr)

1.挑选感兴趣的基因构建coxph模型

e=t(exprSet[for_cox,])
e=log2(e+1)
colnames(e)=str_replace(colnames(e),"-","_")
dat=cbind(meta,e)
dat$gender=factor(dat$gender)
dat$stage=factor(dat$stage)
colnames(dat)
#[1] "ID"            "event"         "death"         "last_followup" "race"          "age"           "gender"        "stage"         "time"         
#[10] "age_group"     "RP5_884M6.1"   "AC007182.6"    "CTD_2023N9.1"  "VAC14_AS1"     "RP11_148K1.12" "RP11_742B18.1" "RP11_256I9.2"  "AC079612.1"   
#[19] "RP11_190J1.3"  "LINC00402"     "THRB_AS1"      "CTD_2147F2.2"  "WT1_AS"        "AC007392.4"    "RP11_309M7.1"  "RP11_413M3.4"  "RP11_334E6.3" 
#[28] "WASIR1"        "MAFA_AS1"      "CTD_2013N17.4" "MACROD2_IT1"

用survival::coxph()函数构建模型

library(survival)
library(survminer)
s = as.formula(paste("Surv(time, event) ~ ",paste(colnames(e),collapse = "+")))
model <- coxph(s, data = dat )
# 去掉不显著的基因
summary(model)$concordance
#        C      se(C) 
#0.85125719 0.02311193 
 summary(model)
#Call:
#coxph(formula = s, data = dat)

  #n= 449, number of events= 52 

 #                  coef exp(coef)  se(coef)      z Pr(>|z|)    
#RP5_884M6.1    0.112751  1.119353  0.094305  1.196 0.231854    
#AC007182.6     0.127995  1.136548  0.099580  1.285 0.198672    
#CTD_2023N9.1  -0.044700  0.956285  0.527983 -0.085 0.932531    
#VAC14_AS1     -0.185046  0.831066  0.130355 -1.420 0.155738    
#RP11_148K1.12  0.361166  1.435002  0.186106  1.941 0.052301 .  
#RP11_742B18.1  0.108267  1.114345  0.094063  1.151 0.249728    
#RP11_256I9.2   0.088944  1.093019  0.176557  0.504 0.614423    
#AC079612.1    -0.146865  0.863410  0.179227 -0.819 0.412538    
#RP11_190J1.3   0.006628  1.006650  0.097532  0.068 0.945817    
#LINC00402     -0.217686  0.804378  0.128876 -1.689 0.091197 .  
#THRB_AS1      -0.400994  0.669654  0.130617 -3.070 0.002141 ** 
#CTD_2147F2.2  -0.067850  0.934400  0.108002 -0.628 0.529851    
#WT1_AS         0.014672  1.014780  0.081921  0.179 0.857863    
#AC007392.4    -0.591876  0.553288  0.384834 -1.538 0.124048    
#RP11_309M7.1   0.324670  1.383574  0.183396  1.770 0.076674 .  
#RP11_413M3.4  -0.648309  0.522929  0.195152 -3.322 0.000894 ***
#RP11_334E6.3   0.521098  1.683876  0.198750  2.622 0.008745 ** 
#WASIR1        -0.115116  0.891263  0.127139 -0.905 0.365234    
#MAFA_AS1       0.279864  1.322949  0.170886  1.638 0.101481    
#CTD_2013N17.4  0.319699  1.376714  0.238104  1.343 0.179373    
#MACROD2_IT1    0.840480  2.317479  0.269586  3.118 0.001823 ** 
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

#              exp(coef) exp(-coef) lower .95 upper .95
#RP5_884M6.1      1.1194     0.8934    0.9305    1.3466
#AC007182.6       1.1365     0.8799    0.9350    1.3815
#CTD_2023N9.1     0.9563     1.0457    0.3398    2.6916
#VAC14_AS1        0.8311     1.2033    0.6437    1.0730
#RP11_148K1.12    1.4350     0.6969    0.9964    2.0666
#RP11_742B18.1    1.1143     0.8974    0.9267    1.3399
#RP11_256I9.2     1.0930     0.9149    0.7733    1.5449
#AC079612.1       0.8634     1.1582    0.6077    1.2268
#RP11_190J1.3     1.0067     0.9934    0.8315    1.2187
#LINC00402        0.8044     1.2432    0.6248    1.0355
#THRB_AS1         0.6697     1.4933    0.5184    0.8650
#CTD_2147F2.2     0.9344     1.0702    0.7561    1.1547
#WT1_AS           1.0148     0.9854    0.8643    1.1915
#AC007392.4       0.5533     1.8074    0.2602    1.1763
#RP11_309M7.1     1.3836     0.7228    0.9658    1.9820
#RP11_413M3.4     0.5229     1.9123    0.3567    0.7666
#RP11_334E6.3     1.6839     0.5939    1.1406    2.4859
#WASIR1           0.8913     1.1220    0.6947    1.1435
#MAFA_AS1         1.3229     0.7559    0.9464    1.8493
#CTD_2013N17.4    1.3767     0.7264    0.8633    2.1954
#MACROD2_IT1      2.3175     0.4315    1.3663    3.9309

#Concordance= 0.851  (se = 0.023 )
#Likelihood ratio test= 69.43  on 21 df,   p=4e-07
#Wald test            = 59.6  on 21 df,   p=1e-05
#Score (logrank) test = 73.72  on 21 df,   p=9e-08
tmp = summary(model)$coefficients[,5]
re = names(tmp[tmp<0.05])

#s=Surv(time, event) ~ RP5_884M6.1+RP1_79C4.4+AC004510.3
s = as.formula(paste("Surv(time, event) ~ ",paste(re,collapse = "+")))

model <- coxph(s, data = dat )
summary(model)
#Call:
#coxph(formula = s, data = dat)

  #n= 449, number of events= 52 

#                coef exp(coef) se(coef)      z Pr(>|z|)    
#THRB_AS1     -0.2388    0.7875   0.1080 -2.211 0.027028 *  
#RP11_413M3.4 -0.4353    0.6471   0.1524 -2.856 0.004284 ** 
#RP11_334E6.3  0.5853    1.7956   0.1758  3.329 0.000871 ***
#MACROD2_IT1   0.5153    1.6741   0.2090  2.466 0.013674 *  
#---
#Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

#             exp(coef) exp(-coef) lower .95 upper .95
#THRB_AS1        0.7875     1.2698    0.6373    0.9732
#RP11_413M3.4    0.6471     1.5454    0.4800    0.8723
#RP11_334E6.3    1.7956     0.5569    1.2722    2.5343
#MACROD2_IT1     1.6741     0.5973    1.1115    2.5216

#Concordance= 0.72  (se = 0.05 )
#Likelihood ratio test= 23.4  on 4 df,   p=1e-04
#Wald test            = 26.56  on 4 df,   p=2e-05
#Score (logrank) test = 27.44  on 4 df,   p=2e-05
3.模型可视化-森林图
options(scipen=1)
ggforest(model, data =dat, 
         main = "Hazard ratio", 
         cpositions = c(0.10, 0.22, 0.4), 
         fontsize = 1.0, 
         refLabel = "1", noDigits = 4)
森林图

可以继续美化森林图

4.模型预测
#预测模型
fp <- predict(model,type = "risk")
boxplot(fp)
library(Hmisc)
options(scipen=200)
with(dat,rcorr.cens(fp,Surv(time, event))
 #  C Index             Dxy            S.D.               n         missing      uncensored  Relevant Pairs      Concordant       Uncertain 
#     0.27953650     -0.44092699      0.09997376    449.00000000      0.00000000     52.00000000  13204.00000000   3691.00000000 187936.00000000 
# 若要找到最佳模型,我们可以进行变量选择,可以采用逐步回归法进行分析
botplot

C-index用于计算生存分析中的COX模型预测值与真实之间的区分度(discrimination),也称为Harrell's concordanceindex。C-index在0.5-1之间。0.5为完全不一致,说明该模型没有预测作用,1为完全一致,说明该模型预测结果与实际完全一致。

5.划分高低风险并画生存分析图
names(fp) = rownames(dat)
ri = ifelse(fp<median(fp),"lowrisk","highrisk")
names(ri) = names(fp)
ri = factor(ri)

sfit <- survfit(Surv(time, event)~ri, data=meta)
ggsurvplot(sfit, conf.int=F, pval=TRUE)

ggsurvplot(sfit,palette = c("#E7B800", "#2E9FDF"),
           risk.table =TRUE,pval =TRUE,
           conf.int =TRUE,xlab ="Time in months", 
           ggtheme =theme_light(), 
           ncensor.plot = TRUE)
image.png
fp_dat=data.frame(patientid=1:length(fp),fp=as.numeric(sort(fp)))
sur_dat=data.frame(patientid=1:length(fp),
                   time=meta[names(sort(fp)),'time'] ,
                   event=meta[names(sort(fp )),'event']  ) 
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),11:ncol(dat)]
###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+geom_point()+theme_bw();p1
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+geom_point(aes(col=event))+theme_bw();p2
#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat[,re]))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
p3=pheatmap::pheatmap(tmp,
                      col= mycolors,
                      show_colnames = F,
                      cluster_cols = F)
#拼图实现三图联动
library(ggplotify)
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7),NA),
             c(rep(2,7)),
             c(rep(3,7))) #布局矩阵
grid.arrange(grobs = plots, 
             layout_matrix = lay1,
             heigths = c(1, 2,3))
image.png
按照预测值划分高低风险,加上注释
fp_dat=data.frame(patientid=1:length(fp),
                  fp=as.numeric(sort(fp)),
                  ri= ri[order(fp)])
sur_dat=data.frame(patientid=1:length(fp),
                   time=meta[names(sort(fp)),'time'] ,
                   event=meta[names(sort(fp )),'event']  ) 
sur_dat$event=ifelse(sur_dat$event==0,'alive','death')
sur_dat$event=factor(sur_dat$event,levels = c("death","alive"))
exp_dat=dat[names(sort(fp)),11:ncol(dat)]

###第一个图----
p1=ggplot(fp_dat,aes(x=patientid,y=fp))+
  geom_point(aes(color = ri))+
  scale_color_manual(values = c("red","blue"))+
  theme_bw();p1
#第二个图
p2=ggplot(sur_dat,aes(x=patientid,y=time))+
  geom_point(aes(col=event))+
  scale_color_manual(values = c("red","blue"))+
  theme_bw();p2
#第三个图
mycolors <- colorRampPalette(c("black", "green", "red"), bias = 1.2)(100)
tmp=t(scale(exp_dat[,re]))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
#p3=pheatmap(tmp,col= mycolors,show_colnames = F,cluster_cols = T)
annotation_col = data.frame(group = ri,
                            row.names = names(ri))
ann_colors = list(
  group = c(lowrisk="blue", highrisk="red")
)

p3=pheatmap::pheatmap(tmp,
                      col= mycolors,
                      show_colnames = F,
                      cluster_cols = F,
                      annotation_col = annotation_col,
                      annotation_colors =ann_colors,
                      annotation_legend = F)
#拼图实现三图联动
library(ggplotify)
plots = list(p1,p2,as.ggplot(as.grob(p3)))
library(gridExtra)
lay1 = rbind(c(rep(1,7)),
             c(rep(2,7)),
             c(rep(3,7))) #布局矩阵
grid.arrange(grobs = plots, 
             layout_matrix = lay1,
             heigths = c(1,2,3))

image.png

参考文章:1.TCGA文章复现——LncRNA-CRC预后
2.[小洁老师的TCGA系列文章](https://www.jianshu.com/u/c93ac360691a

上一篇下一篇

猜你喜欢

热点阅读