Hisat2+Stringtie+DESeq2 workflow

2020-01-18 本文已影响0人 547可是贼帅的547

前言

因为以前的分析流程要在Shell 和 R 之间切换，然后可控性还差，有点烦，正好最近有点数据要分析，干脆就重新建立一个 all in one 的流程，方便以后使用。

运行环境

Ubuntu 18.04
R 3.6.1

用到的软件

fastp
bowtie2
hisat2
stringtie
DESeq2

Refference data

rRNA index:用bowtie2去除fastq文件中的rRNA
tRNA index:http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa
Hisat2 index:https://cloud.biohpc.swmed.edu/index.php/s/grch38_tran/download
GRCh38.96.gtf:ftp://ftp.ensembl.org/pub/release-96/gtf/homo_sapiens/Homo_sapiens.GRCh38.96.gtf.gz
Homo_sapiens.GRCh38.dna.primary_assembly.fa:ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.gz

代码

############## Functions #####################
#Hisat2 参数
grch38_tran_index = "/home/zhou/RNASEQ/index/grch38_tran/genome_tran"
rRNA_index = "/home/zhou/RNASEQ/index/rRNA_index/human_rRNA"
tRNA_index = "/home/zhou/RNASEQ/index/tRNA_index_hg38/hg38_tRNA"
GRCh38.96.gtf = "/home/zhou/RNASEQ/index/grch38_tran/Homo_sapiens.GRCh38.96.gtf"
fasta = "/home/zhou/RNASEQ/index/grch38_tran/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

#fastp preprocess
##Single version
fastp_SE <- function(input_fastq = input_fastq) {
  fastp_PATH = "/home/zhou/anaconda2/bin/fastp"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_fastpedited.fastq.gz")
  fastp_cmd = paste(fastp_PATH, "-i", input_fastq, "-o", output_fastq, "-w 16", "-z 9")
  system(fastp_cmd,wait = T)
  return(output_fastq)
}
##Paired version
fastp_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R) {
  fastp_PATH = "/home/zhou/anaconda2/bin/fastp"
  output_fastq_F = paste0(gsub(pattern = "\\.fq\\.gz",replacement = "",x = input_fastq_F),"_fastpedited.fastq.gz")
  output_fastq_R = paste0(gsub(pattern = "\\.fq\\.gz",replacement = "",x = input_fastq_R),"_fastpedited.fastq.gz")
  fastp_cmd = paste(fastp_PATH, "-i", input_fastq_F, "-o", output_fastq_F, "-I", input_fastq_R, "-O", output_fastq_R, "-w 16", "-z 9")
  system(fastp_cmd,wait = T)
  return(c(output_fastq_F,output_fastq_R))
}

#rRNA remove
##Single version
remove_rRNA_SE <- function(input_fastq = input_fastq, index = index) {
  bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_rmrRNA.fastq.gz")
  bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-gz", output_fastq, "-U", input_fastq, "-p 20 -S tmp.sam")
  system(bowtie2_cmd,wait = T)
  system("rm tmp.sam")
  return(output_fastq)
}

##Paired version
remove_rRNA_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R, index = index) {
  bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmrRNA.fastq.gz")
  output_fastq_F = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.1\\.gz",x = output_fastq)
  output_fastq_R = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.2\\.gz",x = output_fastq)
  output_fastq_F_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmrRNA.fastq.gz")
  output_fastq_R_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_R),"_rmrRNA.fastq.gz")
  bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-conc-gz", output_fastq, "-1", input_fastq_F, "-2", input_fastq_R, "-p 20 -S tmp.sam")
  rename_cmd_F = paste("mv", output_fastq_F, output_fastq_F_reset)
  rename_cmd_R = paste("mv", output_fastq_R, output_fastq_R_reset)
  system(bowtie2_cmd,wait = T)
  system("rm tmp.sam",wait = T)
  system(rename_cmd_F,wait = T)
  system(rename_cmd_R,wait = T)
  return(c(output_fastq_F_reset,output_fastq_R_reset))
}
#tRNA remove
##Single version
remove_tRNA_SE <- function(input_fastq = input_fastq, index = index) {
  bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq),"_rmtRNA.fastq.gz")
  bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-gz", output_fastq, "-U", input_fastq, "-p 20 -S tmp.sam")
  system(bowtie2_cmd,wait = T)
  system("rm tmp.sam")
  return(output_fastq)
}

##Paired version
remove_tRNA_PE <- function(input_fastq_F = input_fastq_F, input_fastq_R = input_fastq_R, index = index) {
  bowtie2_PATH = "/home/zhou/anaconda2/bin/bowtie2"
  output_fastq = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmtRNA.fastq.gz")
  output_fastq_F = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.1\\.gz",x = output_fastq)
  output_fastq_R = gsub(pattern = "\\.fastq\\.gz",replacement = "\\.fastq\\.2\\.gz",x = output_fastq)
  output_fastq_F_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_F),"_rmtRNA.fastq.gz")
  output_fastq_R_reset = paste0(gsub(pattern = "\\.fastq\\.gz",replacement = "",x = input_fastq_R),"_rmtRNA.fastq.gz")
  bowtie2_cmd = paste(bowtie2_PATH, "-x", index, "--un-conc-gz", output_fastq, "-1", input_fastq_F, "-2", input_fastq_R, "-p 20 -S tmp.sam")
  rename_cmd_F = paste("mv", output_fastq_F, output_fastq_F_reset)
  rename_cmd_R = paste("mv", output_fastq_R, output_fastq_R_reset)
  system(bowtie2_cmd,wait = T)
  system("rm tmp.sam",wait = T)
  system(rename_cmd_F,wait = T)
  system(rename_cmd_R,wait = T)
  return(c(output_fastq_F_reset,output_fastq_R_reset))
}
#hisat2 
##Single version
hisat2_SE <- function(index = index, input_fastq = input_fastq, output_prefix = output_prefix) {
  bam_filename = paste0(output_prefix,".bam")
  hisat2_PATH = "/home/zhou/bin/hisat2"
  hisat2_cmd = paste(hisat2_PATH, "--dta -x",index, "-p 20 -U", input_fastq, " | samtools sort -@ 20 -o", bam_filename)
  system(hisat2_cmd,wait = T)
  return(bam_filename)
}

##Paired version
hisat2_PE <- function(index = index, input_fastq_F = input_fastq_F,input_fastq_R = input_fastq_R, output_prefix = output_prefix) {
  bam_filename = paste0(output_prefix,".bam")
  hisat2_PATH = "/home/zhou/bin/hisat2"
  hisat2_cmd = paste(hisat2_PATH, "--dta -x",index, "-p 20 -1", input_fastq_F, "-2", input_fastq_R, " | samtools sort -@ 20 -o", bam_filename)
  system(hisat2_cmd,wait = T)
  return(bam_filename)
}

#stringtie
stringtie <- function(input_bam = input_bam, gtf = gtf) {
  output_prefix = gsub(pattern = "\\.bam",replacement = "",x = input_bam)
  tab_filename = paste0(output_prefix,".tab")
  gtf_filename = paste0(output_prefix,".gtf")
  stringtie_PATH = "/home/zhou/bin/stringtie"
  stringtie_cmd = paste(stringtie_PATH,"-p 20 -G", gtf, "-o", gtf_filename, "-e -l", output_prefix, input_bam)
  system(stringtie_cmd,wait = T)
  return(gtf_filename)
}

#generate gene\transcript count matrix
run.prepDE.py <- function(inputfile = inputfile) {
  run.cmd = paste("python ~/Software/prepDE.py -i",inputfile)
  system(run.cmd,wait = T)
}
################ test data ############################
setwd("~/ribosome/mTOR_Liver_cell/testdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1_test.fq.gz|_R2_test.fq.gz",replacement = ""))

for (i in sample_pattern) {
  print(i)
  ### fastp preprocess
  fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1_test.fq.gz"),input_fastq_R = paste0(i,"_R2_test.fq.gz"))
  ### remove rRNA
  rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
  ### remove tRNA
  rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
  ### hisat2 mapping
  hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
  ### stringtie assemble
  stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}


################ real data ############################
#rnaseq
setwd("~/ribosome/mTOR_Liver_cell/RNA-Rawdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1.fq.gz|_R2.fq.gz",replacement = ""))

for (i in sample_pattern) {
  print(i)
  ### fastp preprocess
  fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1.fq.gz"),input_fastq_R = paste0(i,"_R2.fq.gz"))
  ### remove rRNA
  rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
  ### remove tRNA
  rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
  ### hisat2 mapping
  hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
  ### stringtie assemble
  stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}

## generate counts matrix
mergelist = data.frame(sample_pattern,paste0("./",sample_pattern,".gtf"))
write.table(mergelist,file = "mergelist", sep = "\t",row.names = F,col.names = F,quote = F)
run.prepDE.py(inputfile = "mergelist")

## DEA
### load count_matrix
gene_matrix = read.csv(file = "gene_count_matrix.csv",header = T,row.names = 1)

library(DESeq2)
library(BiocParallel)
register(MulticoreParam(24))

database = gene_matrix
condition <- factor(c(rep("T",3),rep("UT",3)))
coldata <- data.frame(row.names = colnames(database), condition)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
dds <- DESeq(dds,parallel = T)
res <- results(dds)
summary(res) 
table(res$padj<0.1)
res <- res[order(res$padj),]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
total.RNA.signresdata<-na.omit(resdata[resdata$padj<0.1 & abs(resdata$log2FoldChange) > 1,])

#riboseq
setwd("~/ribosome/mTOR_Liver_cell/RIBO-Rawdata/")
sample_pattern = unique(gsub(x = list.files(path = "./"), pattern = "_R1.fq.gz|_R2.fq.gz",replacement = ""))

for (i in sample_pattern) {
  print(i)
  ### fastp preprocess
  fastp.fastq = fastp_PE(input_fastq_F = paste0(i,"_R1.fq.gz"),input_fastq_R = paste0(i,"_R2.fq.gz"))
  ### remove rRNA
  rmrRNA.fastq = remove_rRNA_PE(input_fastq_F = fastp.fastq[1],input_fastq_R = fastp.fastq[2],index = rRNA_index)
  ### remove tRNA
  rmtRNA.fastq = remove_tRNA_PE(input_fastq_F = rmrRNA.fastq[1],input_fastq_R = rmrRNA.fastq[2],index = tRNA_index)
  ### hisat2 mapping
  hisat2.bam = hisat2_PE(index = grch38_tran_index,input_fastq_F = rmtRNA.fastq[1],input_fastq_R = rmtRNA.fastq[2],output_prefix = i )
  ### stringtie assemble
  stringtie(input_bam = hisat2.bam,gtf = GRCh38.96.gtf)
}

## generate counts matrix
mergelist = data.frame(sample_pattern,paste0("./",sample_pattern,".gtf"))
write.table(mergelist,file = "mergelist", sep = "\t",row.names = F,col.names = F,quote = F)
run.prepDE.py(inputfile = "mergelist")

## DEA
### load count_matrix
gene_matrix = read.csv(file = "gene_count_matrix.csv",header = T,row.names = 1)

library(DESeq2)
library(BiocParallel)
register(MulticoreParam(24))

database = gene_matrix
condition <- factor(c(rep("T",3),rep("UT",3)))
coldata <- data.frame(row.names = colnames(database), condition)
dds <- DESeqDataSetFromMatrix(countData=database, colData=coldata, design=~condition)
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
dds <- DESeq(dds,parallel = T)
res <- results(dds)
summary(res) 
table(res$padj<0.1)
res <- res[order(res$padj),]
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)),by="row.names",sort=FALSE)
Ribo.RNA.signresdata<-na.omit(resdata[resdata$padj<0.1 & abs(resdata$log2FoldChange) > 1,])

结语

把所有功能都模块化了，真舒服啊！
此外，有些function可以增加参数来进一步增加自由度，比如说线程数，我懒得写了，其实可以慢慢完善的，以后看心情把，毕业季实在太忙了。。。

Hisat2+Stringtie+DESeq2 workflow

前言

运行环境

用到的软件

Refference data

代码

结语

猜你喜欢

热点阅读