生物信息学R语言源码Science相关 杂数据整理

TCGA数据库中既含有TNBC又含有正常组织的病人,挑选出来跑流

2019-11-21  本文已影响0人  小梦游仙境

寻找TCGA数据库中既含有TNBC组织又含有正常组织的病人,把数据下载下来,跑流程。总共有十个病人。代码过程如下。

step1

##########################################################################从这开始
rm(list = ls())
expr<-read.csv('TCGA-BRCA.htseq_counts .tsv',sep = '\t')
save(expr,file = 'TCGA-BRCA.htseq_counts .Rdata')
load('TCGA-BRCA.htseq_counts .Rdata')
colnames(expr)<-gsub('.','-',colnames(expr),fixed = T)
colnames(expr)
phe<-read.csv('TCGA-BRCA.GDC_phenotype.tsv.gz',sep = '\t')
colnames(phe)
phe_tnbc<-phe[,c(1,20,25,67)]
er<-phe_tnbc[phe$breast_carcinoma_estrogen_receptor_status=='Negative',]
pr<-er[er$breast_carcinoma_progesterone_receptor_status=='Negative',]
her2<-pr[pr$lab_proc_her2_neu_immunohistochemistry_receptor_status=='Negative',]
phe_new<-her2
rownames(phe_new)<-phe_new$submitter_id.samples
phe_new<-phe_new[,-1]
a<- which(duplicated(substr(rownames(phe_new),1,12)))
b<-which(duplicated(substr(rownames(phe_new),1,12)))-1
b<-as.integer(b)
normal<-phe_new[a,]
tumor<-phe_new[b,]
nt<-rbind(normal,tumor)
rownames(nt)
nt <- nt[-c(12,28),]
expr_phe<-BRC_TNBC[,match(rownames(nt),colnames(BRC_TNBC))]
dim(expr_phe)
expr_phe<-expr_phe[,-c(3,4,7,10,12,18,19,22,25,27)]
nt<-nt[match(colnames(expr_phe),rownames(nt)),]
dim(nt)

save(expr_phe,nt,phe,file = 'step1.Rdata')

step2

options("repos"="https://mirrors.ustc.edu.cn/CRAN/")
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
rm(list = ls())
load('step1.Rdata')
library("FactoMineR")
library("factoextra") 

expr_phe[1:4,1:4]
expr_phe<-log(expr_phe)
x<-expr_phe
x[apply(is.nan(x) | is.infinite(x), FUN = any, 1), ] =0
dim(x)

exprSet<-x
exprSet=exprSet[apply(exprSet,1,function(x) sum(x>0))>=20,]
dim(exprSet)
group_list<-c(rep('tumor',10),rep('normal',10))
df.pca <- PCA(t(exprSet), graph = FALSE) #graph=FALSE不出图
fviz_pca_ind(df.pca,
             geom.ind = "point",
             col.ind = group_list, 
             addEllipses = TRUE, #不加圈圈起来因为样本少
             legend.title = "Groups"
)

save(exprSet,group_list,file = 'step2-exprSet_group_list.Rdata')

image-20190827090053754

step3

rm(list = ls())  ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load('step2-exprSet_group_list.Rdata.Rdata')

###构建实验设计矩阵
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)
library(limma)
####构建比较矩阵,设置用来对比的组别
contrast.matrix<-makeContrasts(contrasts=c('tumor-normal'),levels = design)
######limma三部曲,只需要归一化后的数据、实验矩阵、比较矩阵的输入
fit <- lmFit(exprSet,design)
fit1 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit1)
####上面得到了p值等统计的结果,topTable对p值校验,对基因排序
tT <- topTable(fit2, adjust="fdr", number=nrow(fit2))
dim(tT)
####提取出我们需要的指标
tT <- subset(tT, select=c('adj.P.Val',"P.Value","logFC"))
save(tT,file='step3-DEG.Rdata')

step4

rm(list = ls())  ## 魔幻操作,一键清空~
options(stringsAsFactors = F)
load('step2-exprSet_group_list.Rdata')
load("step3-DEG.Rdata")
####主要用两个值,logFC和p.value
####可参考进行logFC_cutoff的计算
logFC_cutoff <- 2
###logFC_cutoff也可自行根据需求设置
####依据logFC和adj.P.val为火山图的颜色分组做准备
###
tT$change = ifelse(tT$P.Value < 0.05 & abs(tT$logFC) > 2,
                   ifelse(tT$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
####记录上调、下调基因数目,作为title的内容
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
                    '\nThe number of up gene is ',nrow(tT[tT$change =='UP',]) ,
                    '\nThe number of down gene is ',nrow(tT[tT$change =='DOWN',]))

library(ggplot2)
g = ggplot(data=tT, 
           aes(x=logFC, y=-log10(P.Value), color=change)) +
  geom_point(alpha=0.4, size=1.75) + #出点状图
  theme_set(theme_set(theme_bw(base_size=20)))+ 
  xlab("log2 fold change") + ylab("-log10 p-value") + 
  ggtitle( this_tile ) + 
  theme(plot.title = element_text(size=15,hjust = 0.5))+
  scale_colour_manual(values = c('blue','black','red'))+
  annotate('text',x=tT$logFC[tT$logFC>5],y=-log10(tT$P.Value[tT$logFC>5]),label=tT$SYMBOL[tT$logFC>5])
print(g)
ggsave(g,filename = 'volcano.png')
save(tT,file='step4-DEG_change.Rdata')

image-20190827090207306

step5

rm(list=ls())
load('step1.Rdata')
load('step2-exprSet_group_list.Rdata')
load('step3-DEG.Rdata')
load('step4-DEG_change.Rdata')
######pheatmap 这是取前1000个gene进行展示,可根据自己的需求进行调整
choose_gene<-rownames(tT)[1:1000]
#####用normalize后的数据进行展示
data_matrix <-exprSet[choose_gene,]
#####热图更详细的了解https://www.jianshu.com/p/0e5ce59fa79e
#####scale对数据处理,求得的结果为z-score,即数据与均值之间差几个标准差 
####t是对数据做转置处理,为了适应scale的要求
data_matrix=t(scale(t(data_matrix)))
#####查看scale处理后数据的范围
fivenum(data_matrix)
####下面的代码被我注释掉了,在这个数据中用不到,但其他数据中可能会用到;
####目的是避免出现极大极小值影响可视化的效果
data_matrix[data_matrix>1.2]=1.2
data_matrix[data_matrix< -1.2] = -1.2
library(pheatmap)
pheatmap(data_matrix)
####调整下颜色,使正负值颜色的对比会更加鲜明
require(RColorBrewer)
bk = c(seq(-1.2,1.2, length=100))
annotation_col = data.frame(Group = rep(c("tumor", "normal"), c(10,10)))
rownames(annotation_col)<-colnames(exprSet)
####annotation_col和annotation_row的格式需为数据框
####breaks参数用于值匹配颜色
####看下,color和breaks的对应,进行理解
pheatmap(data_matrix,
         breaks=bk,
         show_rownames = F,
         annotation_col = annotation_col,
         color = colorRampPalette(c("navy", "white", "firebrick3"))(100),
         filename = 'heatmap.png')
####可以调整的内容有是否聚类、是否分组、是否显示行和列的内容等

image-20190827090315251

step6

rm(list = ls())
load('step1.Rdata')
load('step2-exprSet_group_list.Rdata')
load('step3-DEG.Rdata')
load('step4-DEG_change.Rdata')
BiocManager::install('EnsDb.Hsapiens.v75')
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
edb
columns(edb)
keytypes(edb)
keys <- keys(edb, keytype="GENEID")
## Get the data
gene2sym<-select(edb, keys=keys, 
                 columns=c("SYMBOL","ENTREZID"),
                 keytype="GENEID")
tT$GENEID<-rownames(tT)
tmp<-merge(tT,gene2sym,by='GENEID')
save(tmp,file = 'tT-tmp.Rdata')
library(clusterProfiler)
gene_up= as.character(tmp[tmp$change == 'UP','ENTREZID']) 
gene_down=as.character(tmp[tmp$change == 'DOWN','ENTREZID']) 
gene_diff=c(gene_up,gene_down)
gene_all = as.character(tmp$ENTREZID)
###############################KEGG富集分析#####################################################
## 1.KEGG pathway analysis
#上调、下调、差异、所有基因
#clusterProfiler作kegg富集分析:
###我们在意的是表格里的pvalue,对应的通路或term是否显著
kk.up<- enrichKEGG(gene         = gene_up,
                   organism     = 'hsa', #hsa指的是人的
                   universe     = gene_all, #universe是背景基因
                   pvalueCutoff = 0.1)
kk.down <- enrichKEGG(gene         =  gene_down,
                      organism     = 'hsa',
                      universe     = gene_all,
                      pvalueCutoff = 0.1)

#可视化展示-Barplot
barplot(kk.down,drop=T,showCategory = 12,title = 'bar_down')
barplot(kk.up,drop=T,showCategory = 12,title='bar_up')
dotplot(kk.down,showCategory = 12,title = 'dot_down')
dotplot(kk.up,showCategory = 12,title='dot_up')


library(org.Hs.eg.db)
ego_CC <- enrichGO(gene = gene_diff,
                   OrgDb= org.Hs.eg.db,
                   universe = gene_all,
                   ont = "CC",
                   pAdjustMethod = "BH")
#生物过程
ego_BP <- enrichGO(gene = gene_diff,
                   OrgDb= org.Hs.eg.db,
                   ont = "BP",
                   pAdjustMethod = "BH")
#分子功能:
ego_MF <- enrichGO(gene = gene_diff,
                   OrgDb= org.Hs.eg.db,
                   ont = "MF",
                   pAdjustMethod = "BH")
#可视化展示-Barplot
barplot(ego_CC, showCategory=12,title="GO Analysis - Cellular Component")
barplot(ego_BP, showCategory=12,title="Go Analysis - Biological Process")
barplot(ego_MF, showCategory=12,title="Go Analysis - Molecular function")
#可视化展示-Dotplot
dotplot(ego_CC,showCategory = 12,title="dot_CC")
dotplot(ego_BP,showCategory = 12,title="dot_BP")
dotplot(ego_MF,showCategory = 12,title="dot_MF")
save(kk.up,kk.down,ego_BP,ego_CC,ego_MF,file='annotation.Rdata')
kegg-down kegg-up image

step7

rm(list = ls())
load('step1.Rdata')
load('step2-exprSet_group_list.Rdata')
load('step3-DEG.Rdata')
load('step4-DEG_change.Rdata')
phe2<-phe[,c(1,151,158:160,167:170)]
#phe2[is.na(phe2)]<-0
colnames(phe2)
#phe2$days=sum(phe2$days_to_death.diagnoses+phe2$days_to_last_follow_up.diagnoses)
phe2$days=paste(phe2$days_to_death.diagnoses,phe2$days_to_last_follow_up.diagnoses,sep = '')
phe2$days=sub('NA','',phe2$days)
phe2<-phe2[match(rownames(nt),phe2$submitter_id.samples),]
phe2<-phe2[,c(1,7,8,10)]
phe2$vital_status.diagnoses=ifelse(phe2$vital_status.diagnoses=='dead',1,0)
phe2$time=as.numeric(phe2$days)/30
rownames(phe2)<-phe2$submitter_id.samples
phe2<-phe2[,-1]
load('tT-tmp.Rdata')

identical(colnames(exprSet),rownames(phe2))#判断exprSet和phe2的顺序是否一致,true为一致
dim(exprSet)
dim(phe2)

library(survival)
library(survminer)
phe2$group=ifelse(exprSet[1,]>median(exprSet[1,]),'high','low')
fit.surv <-Surv(phe2$time,as.numeric(phe2$vital_status.diagnoses))
km_2<-survfit(fit.surv~group,data=phe2)
surv_pvalue(km_2)
ggsurvplot (km_2,pval=TRUE,title='tnbc1')

phe2$group=ifelse(upgene_exprSet[1,]>median(upgene_exprSet[1,]),'high','low')
fit.surv <-Surv(phe2$time,as.numeric(phe2$vital_status.diagnoses))
km_2<-survfit(fit.surv~group,data=phe2)
surv_pvalue(km_2)
ggsurvplot (km_2,pval=TRUE,title='upgene_exprSet')
image-20190827091004551 image-20190827092001619

最后友情宣传生信技能树

上一篇 下一篇

猜你喜欢

热点阅读