生信R语言学习转录组数据分析

Transcript-level expression anal

2021-03-10  本文已影响0人  宗肃書
image.png
今天要分享的一篇文章是转录组分析的一篇方法文章
介绍:TopHat2和Cufflinks是之前常用来作转录组分析的两个工具,它们能够完成这几个任务:
(i) alignment of the reads to the genome;
(ii) assembly of the alignments into full-length transcripts;
(iii) quantification of the expression levels of each gene and transcript;
(iv) calculation of the differences in expression for all genes among the different experimental conditions.
但是最近新推出的软件StringTie和HISAT在完成这些任务的同时可以以更快的速度运行。并可以估算所有基因和转录本的表达水平。流程图如下:
image.png

这里注意的要点是:定量时使用的注释文件为stringtie --merge 合并所有样本转录本后的文件
为什么要用stringtie merge 命令?
This step is required because transcripts in some of the samples might be only partially covered by reads, and as a consequence only partial versions of them will be assembled in the initial StringTie run. The merge step creates a set of transcripts that is consistent across all samples, so that the transcripts can be compared in subsequent steps.

image.png

该流程的局限性:

  1. 不能处理三代测序数据;
  2. 除了stringtie、cufflinks、RESM三个软件,其他组装方法输出的结果与Ballgown不兼容,Ballgown的样本量至少为三个。
>install.packages(″devtools″,repos=″http://cran.us.r-project.org″)

>source(″http://www.bioconductor.org/biocLite.R″)

>biocLite(c(″alyssafrazee/RSkittleBrewer″,″ballgown″, ″genefilter″,″dplyr″,″devtools″))
#!/usr/bin/env Rscript
# run this in the output directory for rnaseq_pipeline.sh
# passing the pheno data csv file as the only argument 
args = commandArgs(trailingOnly=TRUE)
if (length(args)==0) {
# assume no output directory argument was given to rnaseq_pipeline.sh
  pheno_data_file <- paste0(getwd(), "/chrX_data/geuvadis_phenodata.csv")
} else {
  pheno_data_file <- args[1]
}

library(ballgown)
library(RSkittleBrewer)
library(genefilter)
library(dplyr)
library(devtools)

## Read phenotype sample data
pheno_data <- read.csv(pheno_data_file)

## Read in expression data
bg_chrX <- ballgown(dataDir = "ballgown", samplePattern="ERR", pData=pheno_data)

## Filter low abundance genes
bg_chrX_filt <- subset(bg_chrX, "rowVars(texpr(bg_chrX)) > 1", genomesubset=TRUE)

## DE by transcript
results_transcripts <-  stattest(bg_chrX_filt, feature='transcript', covariate='sex', 
         adjustvars=c('population'), getFC=TRUE, meas='FPKM')

## DE by gene
results_genes <-  stattest(bg_chrX_filt, feature='gene', covariate='sex', 
         adjustvars=c('population'), getFC=TRUE, meas='FPKM')

## Add gene name
results_transcripts <- data.frame(geneNames=ballgown::geneNames(bg_chrX_filt),
          geneIDs=ballgown::geneIDs(bg_chrX_filt), results_transcripts)

## Sort results from smallest p-value
results_transcripts <- arrange(results_transcripts, pval)
results_genes <-  arrange(results_genes, pval)

## Write results to CSV
write.csv(results_transcripts, "chrX_transcripts_results.csv", row.names=FALSE)
write.csv(results_genes, "chrX_genes_results.csv", row.names=FALSE)

## Filter for genes with q-val <0.05
subset(results_transcripts, results_transcripts$qval <=0.05)
subset(results_genes, results_genes$qval <=0.05)

## Plotting setup
#tropical <- c('darkorange', 'dodgerblue', 'hotpink', 'limegreen', 'yellow')
#palette(tropical)

## Plotting gene abundance distribution
#fpkm <- texpr(bg_chrX, meas='FPKM')
#fpkm <- log2(fpkm +1)
#boxplot(fpkm, col=as.numeric(pheno_data$sex), las=2,ylab='log2(FPKM+1)')

## Plot individual transcripts
#ballgown::transcriptNames(bg_chrX)[12]
#plot(fpkm[12,] ~ pheno_data$sex, border=c(1,2),
#     main=paste(ballgown::geneNames(bg_chrX)[12], ' : ',ballgown::transcriptNames(bg_chrX)[12]),
#     pch=19, xlab="Sex", ylab='log2(FPKM+1)')
#points(fpkm[12,] ~ jitter(as.numeric(pheno_data$sex)), col=as.numeric(pheno_data$sex))

## Plot gene of transcript 1729
#plotTranscripts(ballgown::geneIDs(bg_chrX)[1729], bg_chrX,
#                main=c('Gene XIST in sample ERR188234'), sample=c('ERR188234'))

## Plot average expression
#plotMeans(ballgown::geneIDs(bg_chrX)[203], bg_chrX_filt, groupvar="sex", legend=FALSE)


其余后续流程包含详细的命令运行参数,可自行查看

#!/usr/bin/env bash
usage() {
    NAME=$(basename $0)
    cat <<EOF
Usage:
  ${NAME} [output_dir]
Wrapper script for HISAT2/StringTie RNA-Seq analysis protocol.
In order to configure the pipeline options (input/output files etc.)
please copy and edit a file rnaseq_pipeline.config.sh which must be
placed in the current (working) directory where this script is being launched.

Output directories "hisat2" and "ballgown" will be created in the 
current working directory or, if provided, in the given <output_dir>
(which will be created if it does not exist).

EOF
}

OUTDIR="."
if [[ "$1" ]]; then
 if [[ "$1" == "-h" || "$1" == "--help" ]]; then
  usage
  exit 1
 fi
 OUTDIR=$1
fi

## load variables
if [[ ! -f ./rnaseq_pipeline.config.sh ]]; then
 usage
 echo "Error: configuration file (rnaseq_pipeline.config.sh) missing!"
 exit 1
fi

source ./rnaseq_pipeline.config.sh
WRKDIR=$(pwd -P)
errprog=""
if [[ ! -x $SAMTOOLS ]]; then
    errprog="samtools"
fi
if [[ ! -x $HISAT2 ]]; then
    errprog="hisat2"
fi
if [[ ! -x $STRINGTIE ]]; then
    errprog="stringtie"
fi

if [[ "$errprog" ]]; then    
  echo "ERROR: $errprog program not found, please edit the configuration script."
  exit 1
fi

if [[ ! -f rnaseq_ballgown.R ]]; then
   echo "ERROR: R script rnaseq_ballgown.R not found in current directory!"
   exit 1
fi

#determine samtools version
newsamtools=$( ($SAMTOOLS 2>&1) | grep 'Version: 1\.')

set -e
#set -x

if [[ $OUTDIR != "." ]]; then
  mkdir -p $OUTDIR
  cd $OUTDIR
fi

SCRIPTARGS="$@"
ALIGNLOC=./hisat2
BALLGOWNLOC=./ballgown

LOGFILE=./run.log

for d in "$TEMPLOC" "$ALIGNLOC" "$BALLGOWNLOC" ; do
 if [ ! -d $d ]; then
    mkdir -p $d
 fi
done

# main script block
pipeline() {

echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> START: " $0 $SCRIPTARGS

for ((i=0; i<=${#reads1[@]}-1; i++ )); do
    sample="${reads1[$i]%%.*}"
    sample="${sample%_*}"
    stime=`date +"%Y-%m-%d %H:%M:%S"`
    echo "[$stime] Processing sample: $sample"
    echo [$stime] "   * Alignment of reads to genome (HISAT2)"
    $HISAT2 -p $NUMCPUS --dta -x ${GENOMEIDX} \
     -1 ${FASTQLOC}/${reads1[$i]} \
     -2 ${FASTQLOC}/${reads2[$i]} \
     -S ${TEMPLOC}/${sample}.sam 2>${ALIGNLOC}/${sample}.alnstats
    echo [`date +"%Y-%m-%d %H:%M:%S"`] "   * Alignments conversion (SAMTools)"
    if [[ "$newsamtools" ]]; then
     $SAMTOOLS view -S -b ${TEMPLOC}/${sample}.sam | \
      $SAMTOOLS sort -@ $NUMCPUS -o ${ALIGNLOC}/${sample}.bam -
    else
     $SAMTOOLS view -S -b ${TEMPLOC}/${sample}.sam | \
      $SAMTOOLS sort -@ $NUMCPUS - ${ALIGNLOC}/${sample}
    fi
    #$SAMTOOLS index ${ALIGNLOC}/${sample}.bam
    #$SAMTOOLS flagstat ${ALIGNLOC}/${sample}.bam
    
    #echo "..removing intermediate files"
    rm ${TEMPLOC}/${sample}.sam
    #rm ${TEMPLOC}/${sample}.unsorted.bam

    echo [`date +"%Y-%m-%d %H:%M:%S"`] "   * Assemble transcripts (StringTie)"
    $STRINGTIE -p $NUMCPUS -G ${GTFFILE} -o ${ALIGNLOC}/${sample}.gtf \
     -l ${sample} ${ALIGNLOC}/${sample}.bam
done

## merge transcript file
echo [`date +"%Y-%m-%d %H:%M:%S"`] "#> Merge all transcripts (StringTie)"
ls -1 ${ALIGNLOC}/*.gtf > ${ALIGNLOC}/mergelist.txt

$STRINGTIE --merge -p $NUMCPUS -G  ${GTFFILE} \
    -o ${BALLGOWNLOC}/stringtie_merged.gtf ${ALIGNLOC}/mergelist.txt
上一篇 下一篇

猜你喜欢

热点阅读