高阶差异分析TCGA分析以及转录因子可视化-workflow01

2023-08-24  本文已影响0人  一车小面包人

背景:TCGA(The Cancer Genome Atlas Program癌症基因组图谱计划)是一个肿瘤样本数据库,里面有多种疾病的样本以及基因表达量矩阵等信息,下载特定疾病的样本数据可以进行差异分析,例如寻找正常样本和癌症样本之间的基因表达差异

metadata <- jsonlite::fromJSON("metadata.cart.2023-08-21.json") #'加载之前下载的json文件
library(dplyr)
metadata_id <- metadata %>%
  dplyr::select(c(file_name,associated_entities))
## 提取对应的文件
naid_df <- data.frame()
for (i in 1:nrow(metadata)){
  naid_df[i,1] <- metadata_id$file_name[i]
  naid_df[i,2] <- metadata_id$associated_entities[i][[1]]$entity_submitter_id
}

colnames(naid_df) <- c("filename","TCGA_id") #这里是根据json文件信息得到了01.merge_test路径下样本信息命名的文件名字与样本名字的对应关系

myfread <- function(files){
  data = data.table::fread(paste0("01.merge_test/files/",files))
  data = data[-seq(1,4),]
  data = data$unstranded
}
files <- dir("01.merge_test/files")
f <- lapply(files,myfread)
expr_df <- as.data.frame(do.call(cbind,f))
rownames(naid_df) <- naid_df[,1]
naid_df <- naid_df[files,]
colnames(expr_df) <- naid_df$TCGA_id
gene_id <- data.table::fread(paste0("01.merge_test/files/",files[1]))$gene_id
gene_id <- gene_id[-seq(1,4)]
expr_df <- cbind(gene_id=gene_id,expr_df)
gene_name<- data.table::fread(paste0("01.merge_test/files/",files[1]))$gene_name
gene_name <- gene_name[-seq(1,4)]
expr_df <- cbind(gene_name=gene_name,expr_df)
#提取gene_type
gene_type<- data.table::fread(paste0("01.merge/files/",files[1]))$gene_type
gene_type <- gene_type[-seq(1,4)]
expr_df <- cbind(gene_type=gene_type,expr_df)
write.table(expr_df,"expr_df.tsv",row.names=F,col.names=T,quote=F)#sep="\t"
#提取mRNA(lncRNA也是同样的提取方式)
exprset<-expr_df[expr_df$gene_type=="protein_coding",]
write.table(exprset,"mRNA_expr_df.tsv",row.names=F,col.names=T,quote=F)
#lncRNA and lncRNA中的癌症样本
exprset.lnc<-expr_df[expr_df$gene_type=="lncRNA",]
write.table(exprset.lnc,"lncRNA_expr_df.tsv",row.names=F,col.names=T,quote=F)
sample.list<-ifelse(substring(colnames(exprset.lnc)[-1],14,15)=="11","normal",colnames(exprset.lnc)[-1])
sample.list<-sample.list[which(sample.list!="normal")]
exprset.lnc<-exprset.lnc[,sample.list]
write.table(exprset.lnc,"lncRNA_expr_df_cancer.tsv",row.names=F,col.names=T,quote=F)
#只保留基因名
exprset<- exprset[,-c(1,3)]
write.table(exprset,"mRNA_genename_expr_df.tsv",row.names=F,col.names=T,quote=F)

至此,表达矩阵构建完成

library(edgeR)
foldChange=1
padj=0.05
exprset=read.table("mRNA_genename_expr_df.tsv",header=T,check.names=F) #check.names=F防止将列名中的-替换为.
metadata <- data.frame(TCGA_id =colnames(exprset)[-1]) #提取表达矩阵的列名为数据框
table(substring(metadata$TCGA_id,14,15)) #统计第4位置开头的01信息(01-09是癌症 10-19是正常 20-29是癌旁
sample.list<-ifelse(substring(metadata$TCGA_id,14,15)=="11","normal",metadata$TCGA_id)
sample.list<-sample.list[which(sample.list!="normal")] #'得到所有癌症患者的行名
sample <- ifelse(substring(metadata$TCGA_id,14,15)=="11","normal","cancer")
metadata$sample <- as.factor(sample)
#library(limma)
mycounts <- as.data.frame(avereps(exprset[,-1],ID = exprset$gene_name)) #'去除重复的genes

#' 方法一 使用edgeR
dimnames=list(rownames(mycounts),colnames(mycounts))
data=matrix(as.numeric(as.matrix(mycounts)),nrow=nrow(mycounts),dimnames=dimnames)
data=data[rowMeans(data)>1,] #此处有问题 data= data %>% filter(rowMeans(data)>1)不适用于matrix
data=avereps(data)
design <- model.matrix(~metadata$sample)
y <- DGEList(counts=data,group=as.vector(metadata$sample)) #metadata$sample是factor
y <- calcNormFactors(y) #归一化
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y) #先计算公共离散度,再计算tagwise离散度
et <- exactTest(y,pair = c("normal","cancer")) #计算差异基因
topTags(et)
ordered_tags <- topTags(et, n=100000)
#保存标准化后的data
allDiff=ordered_tags$table
allDiff=allDiff[is.na(allDiff$FDR)==FALSE,]
diff=allDiff
newData=y$pseudo.counts
write.table(diff,file="edgerOut.xls",sep="\t",quote=F) #原始的差异table
diffSig = diff[(diff$FDR < padj & (diff$logFC>foldChange | diff$logFC<(-foldChange))),] #logfc绝对值>1 fdr<0.05
write.table(diffSig, file="diffSig.xls",sep="\t",quote=F)
diffUp = diff[(diff$FDR < padj & (diff$logFC>foldChange)),] #logfc>1 癌症相对于正常上调
write.table(diffUp, file="up.xls",sep="\t",quote=F)
diffDown = diff[(diff$FDR < padj & (diff$logFC<(-foldChange))),] #logfc<-1 癌症相对于正常下调
write.table(diffDown, file="down.xls",sep="\t",quote=F)
normalizeExp=rbind(id=colnames(newData),newData) #标准化后的data
write.table(normalizeExp,file="normalizeExp.txt",sep="\t",quote=F,col.names=F)
diffExp=rbind(id=colnames(newData),newData[rownames(diffSig),]) #差异显著的标准化data
write.table(diffExp,file="diffmRNAExp.txt",sep="\t",quote=F,col.names=F)
#'样本筛选 只保留癌症患者的样本
normalizeExp.cancer<-normalizeExp[,sample.list]
write.table(normalizeExp,file="normalizeExp_cancer.txt",sep="\t",quote=F,col.names=F)
diffExp.cancer<-diffExp[,sample.list]
write.table(diffExp.cancer,file="diffExp_cancer.txt",sep="\t",quote=F,col.names=F)

至此,差异分析完成

DAVID.png

承接上文,在DAVID选择基因功能注释,上传前面步骤得出的差异基因集,物种选择人类,将转录因子预测结果下载TF.txt并上次到服务器分析路径下


TF_txt.png

上图TF.txt文件中Term列是预测的转录因子,Genes列是转录因子调控的差异基因,一般只需要用到第二、三、五、倒数一、二、三列。导数一、二、三列是检验,值越小越可靠,一般小于0.05进行筛选,保留最后的转录因子信息为TF.final.txt


TF_final_txt.png
由于我使用Cytoscape软件进行可视化,因此构建软件的输入文件,R脚本如下:
#'生产network.txt(三列,第一列是转录因子,第二列是转录因子调控的基因 第三列是转录关系)  gene.txt(一列,包含没有TF的基因)type.txt(两列,第一列是基因,第二列是上调或者下调或者转录因子关系)
library(dplyr)
#library(Seurat)
TF<-read.table("./TF.txt",sep="\t",header=T)
TF.final<-subset(TF,TF$FDR<0.05)
rownames(TF.final)<-TF.final$Term
TFBS<-c()
Gene<-c()
relationship<-c()
n=1
for(i in rownames(TF.final)){
        target.genes<-as.vector(unlist(strsplit(TF.final[i,"Genes"],",")))
        for(j in target.genes){
                TFBS[n]<-i
                Gene[n]<-j
                relationship[n]<-TF.final[i,"Category"]
                n<-n+1
        }
}
Gene<-as.vector(unlist(lapply(Gene,function(e){gsub(" ","",e)})))
network.table<-data.frame(TFBS=TFBS,Gene=Gene,relationship=relationship)
#write.table(network.table,"network.txt",col.names=T,row.names=F,quote=F) #'空格问题,第二列Gene中每个值多了一个空格

logFC<-read.table("./logFC.txt",header=F)
colnames(logFC)<-c("Gene","logFC")
Type <- ifelse(logFC$logFC>0,"UP","DOWN")
logFC<-cbind(logFC,Type)
TF.final<-cbind(TF.final,Type=rep("TF",nrow(TF.final)))
TF.sub<-TF.final[,c("Term","Type")]
colnames(TF.sub)<-c("Gene","Type")
type.table<-rbind(logFC[,c("Gene","Type")],TF.sub)
inter.target<-intersect(logFC$Gene,network.table$Gene)
type.table<-type.table[type.table$Gene%in%inter.target,]
network.table<-network.table[network.table$Gene%in%inter.target,]
write.table(type.table,"type.txt",row.names=F,col.names=T,quote=F)
write.table(network.table,"network.txt",col.names=T,row.names=F,quote=F)

if(length(logFC$Gene)==length(unique(logFC$Gene))&&length(intersect(unique(logFC$Gene),TF.sub$Gene))==0){
gene.table<-data.frame(gene=inter.target)
write.table(gene.table,"gene.txt",row.names=F,col.names=F,quote=F)}else{
        print("error")
}
network_txt.png
gene_txt.png
type_txt.png
上一篇 下一篇

猜你喜欢

热点阅读