走进转录组转录组分析

转录组分析--FPKM与TPM

2022-06-26  本文已影响0人  千万别加香菜
读取文件(featureCounts 生成的 raw counts 文件)
# Session setup and data import ----
# The global workspace is deliberately NOT wiped with rm(list = ls()): doing so
# silently deletes the caller's objects without resetting loaded packages or
# options. Start a fresh R session instead if a clean state is required.
options(stringsAsFactors = FALSE)
library(tidyverse)
# tidyverse attaches ggplot2, stringr, dplyr, tidyr, readr, purrr, tibble, forcats
library(data.table) # provides fread(), a fast multi-threaded file reader
# Read the featureCounts table. header = TRUE takes the column names from the
# header row; data.table = FALSE returns a plain data.frame.
a1 <- fread("all.featurecounts.txt", header = TRUE, data.table = FALSE)

counts矩阵的构建

# Build the counts matrix ----
# The script treats columns 1-6 of the table as annotation and every column
# from the 7th onward as per-sample counts.
stopifnot(ncol(a1) >= 7) # otherwise 7:ncol(a1) would count backwards
# drop = FALSE keeps a data.frame even when there is only one sample column.
counts <- a1[, 7:ncol(a1), drop = FALSE]
rownames(counts) <- a1$Geneid # gene IDs become the row names

# Pull the Geneid and Length columns out of the same table; Length is used
# below as the per-gene length ("efflen") for normalisation.
geneid_efflen <- subset(a1, select = c("Geneid", "Length"))
colnames(geneid_efflen) <- c("geneid", "efflen")
geneid_efflen_fc <- geneid_efflen # copy kept for later comparison

# Look up the length of every gene in `counts`, in the row order of `counts`.
dim(geneid_efflen)
efflen <- geneid_efflen[
  match(rownames(counts), geneid_efflen$geneid),
  "efflen"
]

FPKM/RPKM (Fragments/Reads Per Kilobase of transcript per Million mapped fragments/reads):每千碱基转录本长度、每百万比对 fragments/reads 中的 fragments/reads 数

# RPKM (single-end reads) and FPKM (paired-end fragments) use the same formula.
#
# Convert the raw counts of one sample to FPKM.
#
# Args:
#   count:     numeric vector of raw counts, one element per gene. Required;
#              the previous self-referential default `count = count` could
#              never be evaluated and has been removed.
#   efflength: gene lengths in bases, either a single value or one per element
#              of `count`. Defaults to the variable `efflen` looked up in the
#              enclosing (global) environment.
#
# Returns:
#   Numeric vector of FPKM values, same length as `count`.
counts2FPKM <- function(count, efflength = efflen) {
  # Guard against silent recycling of a mismatched length vector.
  if (!(length(efflength) %in% c(1L, length(count)))) {
    stop(
      "`efflength` must have length 1 or the same length as `count`",
      call. = FALSE
    )
  }
  per_million <- sum(count) / 1e6 # "per million" scaling factor (depth)
  fpm <- count / per_million      # fragments per million
  fpm / (efflength / 1000)        # divide by length in kilobases
}
# Apply counts2FPKM to every sample column of `counts` (apply() over margin 2).
FPKM <- as.data.frame(apply(counts, 2, counts2FPKM))

# Rename the sample columns. The number of names is checked first: assigning a
# vector of the wrong length would leave NA column names or fail with an
# obscure error.
fpkm_samples <- c(
  "Simmental_1", "Simmental_2", "Simmental_3",
  "Wagyu_1", "Wagyu_2", "Wagyu_3"
)
if (length(fpkm_samples) != ncol(FPKM)) {
  stop(
    "Expected ", length(fpkm_samples), " sample columns but found ",
    ncol(FPKM), "; update the sample names to match the counts table",
    call. = FALSE
  )
}
colnames(FPKM) <- fpkm_samples

# Keep only genes (rows) whose FPKM summed across all samples is at least 1.
FPKM <- FPKM[rowSums(FPKM) >= 1, , drop = FALSE]
colSums(FPKM)

当前推荐使用 TPM 进行相关性分析、PCA分析等。TPM (Transcripts Per Million):先按基因长度(每千碱基)标准化,再按每百万进行缩放,因此每个样本的 TPM 总和均为 1e6

# Convert the raw counts of one sample to TPM.
#
# Length normalisation is applied first and depth normalisation second, so the
# returned values of a sample sum to 1e6.
#
# Args:
#   count:     numeric vector of raw counts, one element per gene. Required;
#              the previous self-referential default `count = count` could
#              never be evaluated and has been removed.
#   efflength: gene lengths in bases, either a single value or one per element
#              of `count`. Defaults to the variable `efflen` looked up in the
#              enclosing (global) environment.
#
# Returns:
#   Numeric vector of TPM values, same length as `count`.
counts2TPM <- function(count, efflength = efflen) {
  # Guard against silent recycling of a mismatched length vector.
  if (!(length(efflength) %in% c(1L, length(count)))) {
    stop(
      "`efflength` must have length 1 or the same length as `count`",
      call. = FALSE
    )
  }
  rpk <- count / (efflength / 1000) # reads per kilobase (length)
  per_million <- sum(rpk) / 1e6     # "per million" scaling factor (depth)
  rpk / per_million
}
# Apply counts2TPM to every sample column of `counts` (apply() over margin 2).
TPM <- as.data.frame(apply(counts, 2, counts2TPM))

# Rename the sample columns. The number of names is checked first: assigning a
# vector of the wrong length would leave NA column names or fail with an
# obscure error.
tpm_samples <- c(
  "Zebu_1", "Zebu_2", "Zebu_3", "Zebu_4", "Zebu_5",
  "Holstein_1", "Holstein_2", "Holstein_3", "Holstein_4", "Holstein_5"
)
if (length(tpm_samples) != ncol(TPM)) {
  stop(
    "Expected ", length(tpm_samples), " sample columns but found ",
    ncol(TPM), "; update the sample names to match the counts table",
    call. = FALSE
  )
}
colnames(TPM) <- tpm_samples

# Drop genes (rows) whose TPM is zero in every sample.
TPM <- TPM[rowSums(TPM) > 0, , drop = FALSE]
colSums(TPM)
上一篇下一篇

猜你喜欢

热点阅读