RNA-seq后续count处理

2023-04-10 本文已影响0人 pudding815

---------------------------------加载R包-------------------------
library(DESeq2)
library(pheatmap)
library(corrplot)
library(EnhancedVolcano)
library(ggplot2)
library(ggsci)
library(Vennerable)
library(rio)
library(matrixStats)
library(BiocGenerics)
library(biomaRt)
library(limma)
library(dplyr)
library(tidyverse)
library(tibble)

----------------------count ------------------------------------------------
count <- read.table('C:/High-throughput sequencing/LH0-4-12h/6sample_count.txt',header = T,sep = '\t')
count <- count[,-c(2:5)]
meta <- count[,1:2]
rownames(count) <- count$Geneid
count <- count[,-c(1:2)]
colnames(count) <- c("LH0h_1","LH0h_2","LH4h_1","LH4h_2","LH12h_1","LH12h_2")


---------------------gene information---------------------------------
gene.info <- read.table('mouse.ref102.gene.type.gtf',header = F,sep = '\t')
gene.info<-gene.info[,c(5:7)]
colnames(gene.info) <- c('geneid','symbol','biotype')
colnames(meta) <- c('geneid','length')
meta <- meta %>% left_join(y = gene.info,by = 'geneid')

-------------------------get TPM--------------------------------
kb <- meta$Length / 1000
fpkm <- count / kb
tpm <- t(t(fpkm)/colSums(fpkm) * 1000000)
tpm <- round(tpm,2) %>% as.data.frame()

----------------clean data-----------------------------
pick.genes <- unique(c(rownames(tpm[matrixStats::rowMedians(as.matrix(tpm[,1:6]), cols = 1:2) > 0.5, ]),
                       rownames(tpm[matrixStats::rowMedians(as.matrix(tpm[,1:6]), cols = 3:4) > 0.5, ]),
                       rownames(tpm[matrixStats::rowMedians(as.matrix(tpm[,1:6]), cols = 5:6) > 0.5, ])
                       ))

count.clean <- count[pick.genes,]
fpkm.clean <- fpkm[pick.genes,]
tpm.clean <- tpm[pick.genes,]
count.clean <- rownames_to_column(count.clean,var = "row_names")
colnames(count.clean) <- c("geneid","LH0h_1","LH0h_2","LH4h_1","LH4h_2","LH12h_1","LH12h_2")
fpkm.clean <- rownames_to_column(fpkm.clean,var = "row_names")
colnames(fpkm.clean) <- c("geneid","LH0h_1","LH0h_2","LH4h_1","LH4h_2","LH12h_1","LH12h_2")
tpm.clean <- rownames_to_column(tpm.clean,var = "row_names")
colnames(tpm.clean) <- c("geneid","LH0h_1","LH0h_2","LH4h_1","LH4h_2","LH12h_1","LH12h_2")

count.clean <- count.clean %>% left_join(y = meta,by = 'geneid')
fpkm.clean <- fpkm.clean %>% left_join(y = meta,by = 'geneid')
tpm.clean <- tpm.clean %>% left_join(y = meta,by = 'geneid')

count.clean.pcg <- count.clean[count.clean$biotype == "protein_coding",]
fpkm.clean.pcg  <- fpkm.clean[fpkm.clean$biotype == "protein_coding",]
tpm.clean.pcg   <- tpm.clean[tpm.clean$biotype == "protein_coding",]

有重复基因，根据表达量删除低的
which(count.clean.pcg$symbol=="St6galnac2")

count.clean.pcg <- count.clean.pcg[-c(2719,6414,10370,14222),]
fpkm.clean.pcg  <- fpkm.clean.pcg[-c(2719,6414,10370,14222),]
tpm.clean.pcg   <-tpm.clean.pcg[-c(2719,6414,10370,14222),]

rownames(count.clean.pcg) <- count.clean.pcg$symbol
rownames(fpkm.clean.pcg)  <- fpkm.clean.pcg$symbol
rownames(tpm.clean.pcg)   <- tpm.clean.pcg$symbol

count.clean.pcg <- count.clean.pcg[,2:7]
fpkm.clean.pcg  <- fpkm.clean.pcg[,2:7]
tpm.clean.pcg   <- tpm.clean.pcg[,2:7]
write.csv(count.clean.pcg,'count_clean_pcg.csv')
write.csv(fpkm.clean.pcg,'fpkm_clean_pcg.csv')
write.csv(tpm.clean.pcg,'tpm_clean_pcg.csv')

差异分析

------------------------- DEG analysis-----------------------
padj_cutoff <- 0.05
group_list = c(rep('h0',2),rep("h4",2))

colData = data.frame(row.names = colnames(count.clean.pcg[,1:4]),group_list = group_list)
dds <- DESeqDataSetFromMatrix(countData = count.clean.pcg[,1:4],colData = colData,design = ~group_list)
rld <- rlog(dds, blind=TRUE)
dds <- DESeq(dds)
plotDispEsts(dds)
RES <- results(dds, contrast = c('group_list','h4','h0'),name = 'h4_vs_h0',alpha = 0.05) 前处后对
res_tbl <- RES %>%data.frame() %>%rownames_to_column(var="geneid") %>%as_tibble() %>%arrange(padj)
res_tbl<-res_tbl %>% left_join(y = gene.info,by = 'geneid')
write.csv(res_tbl,"h4_vs_h0_DEseq2_res.csv")

---------------log2FC pvaule------
sig_res <- dplyr::filter(res_tbl, padj < padj_cutoff) %>%
  dplyr::arrange(padj)
write.csv(sig_res,"h4_vs_h0_sig_res0.05.csv")
pvalue<-sig_res$pvalue<0.05
up<-sig_res$log2FoldChange>2
down<-sig_res$log2FoldChange<(-2)
sig_res$change<-ifelse(pvalue&up,"up",ifelse(pvalue&down,"down","none"))
write.csv(sig_res,"h4_vs_h0_DEG_p0.05_FC2.csv")

功能分析

-----------------------enrichment-method2: clusterprofiler---------------------------
library(clusterProfiler)
library(org.Mm.eg.db)
ms.up <- factor(na.omit(sig_res[sig_res$change == 'up',]$geneid))
ms.up <- bitr(ms.up,fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Mm.eg.db)
ms.down <- factor(na.omit(sig_res[sig_res$change == 'up',]$geneid ))
ms.down <- bitr(ms.down,fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Mm.eg.db)
bg.genes <- bitr(sig_res$geneid ,fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Mm.eg.db )


ms.up.go = enrichGO(  gene = as.character(ms.up$ENTREZID),
                      universe = as.character(bg.genes$ENTREZID),
                      OrgDb = org.Mm.eg.db,
                      ont = 'BP',
                      pAdjustMethod = 'BH',
                      pvalueCutoff = 0.05,
                      qvalueCutoff = 0.1,
                      readable = TRUE)
dotplot(ms.up.go,showCategory = 20)
up.go_result_BP <- as.data.frame(ms.up.go)
write.table(up.go_result_BP, "up.go_result_BP.xls", quote = F, sep = "\t", col.names = T, row.names = F)

RNA-seq后续count处理

差异分析

功能分析

猜你喜欢

热点阅读