Transcript-level expression anal
2021-03-10 本文已影响0人
宗肃書

今天要分享的一篇文章是转录组分析的一篇方法文章
介绍:TopHat2和Cufflinks是之前常用来作转录组分析的两个工具,它们能够完成这几个任务:
(i)
alignment of the reads to the genome;
(ii)
assembly of the alignments into full-length transcripts;
(iii)
quantification of the expression levels of each gene and transcript;
(iv)
calculation of the differences in expression for all genes among the different experimental conditions.
但是最近新推出的软件StringTie和HISAT在完成这些任务的同时可以以更快的速度运行。并可以估算所有基因和转录本的表达水平。流程图如下:

这里注意的要点是:定量时使用的注释文件为stringtie --merge 合并所有样本转录本后的文件
为什么要用stringtie merge 命令?
This step is required because transcripts in some of the samples might be only partially covered by reads, and as a consequence only partial versions of them will be assembled in the initial StringTie run. The merge step creates a set of transcripts that is consistent across all samples, so that the transcripts can be compared in subsequent steps.

该流程的局限性:
- 不能处理三代测序数据;
- 除了stringtie、cufflinks、RESM三个软件,其他组装方法输出的结果与Ballgown不兼容,Ballgown的样本量至少为三个。
- Install the Ballgown packages and other dependencies in R as follows:
>install.packages(″devtools″,repos=″http://cran.us.r-project.org″)
>source(″http://www.bioconductor.org/biocLite.R″)
>biocLite(c(″alyssafrazee/RSkittleBrewer″,″ballgown″, ″genefilter″,″dplyr″,″devtools″))
#!/usr/bin/env Rscript
# run this in the output directory for rnaseq_pipeline.sh
# passing the pheno data csv file as the only argument
args = commandArgs(trailingOnly=TRUE)
if (length(args)==0) {
# assume no output directory argument was given to rnaseq_pipeline.sh
pheno_data_file <- paste0(getwd(), "/chrX_data/geuvadis_phenodata.csv")
} else {
pheno_data_file <- args[1]
}
library(ballgown)
library(RSkittleBrewer)
library(genefilter)
library(dplyr)
library(devtools)
## Read phenotype sample data
pheno_data <- read.csv(pheno_data_file)
## Read in expression data
bg_chrX <- ballgown(dataDir = "ballgown", samplePattern="ERR", pData=pheno_data)
## Filter low abundance genes
bg_chrX_filt <- subset(bg_chrX, "rowVars(texpr(bg_chrX)) > 1", genomesubset=TRUE)
## DE by transcript
results_transcripts <- stattest(bg_chrX_filt, feature='transcript', covariate='sex',
adjustvars=c('population'), getFC=TRUE, meas='FPKM')
## DE by gene
results_genes <- stattest(bg_chrX_filt, feature='gene', covariate='sex',
adjustvars=c('population'), getFC=TRUE, meas='FPKM')
## Add gene name
results_transcripts <- data.frame(geneNames=ballgown::geneNames(bg_chrX_filt),
geneIDs=ballgown::geneIDs(bg_chrX_filt), results_transcripts)
## Sort results from smallest p-value
results_transcripts <- arrange(results_transcripts, pval)
results_genes <- arrange(results_genes, pval)
## Write results to CSV
write.csv(results_transcripts, "chrX_transcripts_results.csv", row.names=FALSE)
write.csv(results_genes, "chrX_genes_results.csv", row.names=FALSE)
## Filter for genes with q-val <0.05
subset(results_transcripts, results_transcripts$qval <=0.05)
subset(results_genes, results_genes$qval <=0.05)
## Plotting setup
#tropical <- c('darkorange', 'dodgerblue', 'hotpink', 'limegreen', 'yellow')
#palette(tropical)
## Plotting gene abundance distribution
#fpkm <- texpr(bg_chrX, meas='FPKM')
#fpkm <- log2(fpkm +1)
#boxplot(fpkm, col=as.numeric(pheno_data$sex), las=2,ylab='log2(FPKM+1)')
## Plot individual transcripts
#ballgown::transcriptNames(bg_chrX)[12]
#plot(fpkm[12,] ~ pheno_data$sex, border=c(1,2),
# main=paste(ballgown::geneNames(bg_chrX)[12], ' : ',ballgown::transcriptNames(bg_chrX)[12]),
# pch=19, xlab="Sex", ylab='log2(FPKM+1)')
#points(fpkm[12,] ~ jitter(as.numeric(pheno_data$sex)), col=as.numeric(pheno_data$sex))
## Plot gene of transcript 1729
#plotTranscripts(ballgown::geneIDs(bg_chrX)[1729], bg_chrX,
# main=c('Gene XIST in sample ERR188234'), sample=c('ERR188234'))
## Plot average expression
#plotMeans(ballgown::geneIDs(bg_chrX)[203], bg_chrX_filt, groupvar="sex", legend=FALSE)
其余后续流程包含详细的命令运行参数,可自行查看
#!/usr/bin/env bash
usage() {
NAME=$(basename $0)
cat <<EOF
Usage:
${NAME} [output_dir]
Wrapper script for HISAT2/StringTie RNA-Seq analysis protocol.
In order to configure the pipeline options (input/output files etc.)
please copy and edit a file rnaseq_pipeline.config.sh which must be
placed in the current (working) directory where this script is being launched.
Output directories "hisat2" and "ballgown" will be created in the
current working directory or, if provided, in the given <output_dir>
(which will be created if it does not exist).
EOF
}
OUTDIR="."
if [[ "$1" ]]; then
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
usage
exit 1
fi
OUTDIR=$1
fi
## load variables
if [[ ! -f ./rnaseq_pipeline.config.sh ]]; then
usage
echo "Error: configuration file (rnaseq_pipeline.config.sh) missing!"
exit 1
fi
source ./rnaseq_pipeline.config.sh
WRKDIR=$(pwd -P)
errprog=""
if [[ ! -x $SAMTOOLS ]]; then
errprog="samtools"
fi
if [[ ! -x $HISAT2 ]]; then
errprog="hisat2"
fi
if [[ ! -x $STRINGTIE ]]; then
errprog="stringtie"
fi
if [[ "$errprog" ]]; then
echo "ERROR: $errprog program not found, please edit the configuration script."
exit 1
fi
if [[ ! -f rnaseq_ballgown.R ]]; then
echo "ERROR: R script rnaseq_ballgown.R not found in current directory!"
exit 1
fi
#determine samtools version
newsamtools=$( ($SAMTOOLS 2>&1) | grep 'Version: 1\.')
set -e
#set -x
if [[ $OUTDIR != "." ]]; then
mkdir -p $OUTDIR
cd $OUTDIR
fi
SCRIPTARGS="$@"
ALIGNLOC=./hisat2
BALLGOWNLOC=./ballgown
LOGFILE=./run.log
for d in "$TEMPLOC" "$ALIGNLOC" "$BALLGOWNLOC" ; do
if [ ! -d $d ]; then
mkdir -p $d
fi
done
# main script block
pipeline() {
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> START: " $0 $SCRIPTARGS
for ((i=0; i<=${#reads1[@]}-1; i++ )); do
sample="${reads1[$i]%%.*}"
sample="${sample%_*}"
stime=`date +"%Y-%m-%d %H:%M:%S"`
echo "[$stime] Processing sample: $sample"
echo [$stime] " * Alignment of reads to genome (HISAT2)"
$HISAT2 -p $NUMCPUS --dta -x ${GENOMEIDX} \
-1 ${FASTQLOC}/${reads1[$i]} \
-2 ${FASTQLOC}/${reads2[$i]} \
-S ${TEMPLOC}/${sample}.sam 2>${ALIGNLOC}/${sample}.alnstats
echo [`date +"%Y-%m-%d %H:%M:%S"`] " * Alignments conversion (SAMTools)"
if [[ "$newsamtools" ]]; then
$SAMTOOLS view -S -b ${TEMPLOC}/${sample}.sam | \
$SAMTOOLS sort -@ $NUMCPUS -o ${ALIGNLOC}/${sample}.bam -
else
$SAMTOOLS view -S -b ${TEMPLOC}/${sample}.sam | \
$SAMTOOLS sort -@ $NUMCPUS - ${ALIGNLOC}/${sample}
fi
#$SAMTOOLS index ${ALIGNLOC}/${sample}.bam
#$SAMTOOLS flagstat ${ALIGNLOC}/${sample}.bam
#echo "..removing intermediate files"
rm ${TEMPLOC}/${sample}.sam
#rm ${TEMPLOC}/${sample}.unsorted.bam
echo [`date +"%Y-%m-%d %H:%M:%S"`] " * Assemble transcripts (StringTie)"
$STRINGTIE -p $NUMCPUS -G ${GTFFILE} -o ${ALIGNLOC}/${sample}.gtf \
-l ${sample} ${ALIGNLOC}/${sample}.bam
done
## merge transcript file
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Merge all transcripts (StringTie)"
ls -1 ${ALIGNLOC}/*.gtf > ${ALIGNLOC}/mergelist.txt
$STRINGTIE --merge -p $NUMCPUS -G ${GTFFILE} \
-o ${BALLGOWNLOC}/stringtie_merged.gtf ${ALIGNLOC}/mergelist.txt