【miRNA-Seq 实战】二、准备分析数据，差异分析以及靶向基

2022-08-29 本文已影响0人佳奥

这里是佳奥！了解了大致流程，我们开始正式的分析实战。

本次实战的数据来源是这篇文章：EBV-miR-BART8-3p induces epithelial-mesenchymal transition and promotes metastasis of nasopharyngeal carcinoma cells through activating NF-κB and Erk1/2 pathways

参考推文：

http://www.bio-info-trainee.com/6997.html

1 miRNA-Seq环境搭建

参考序列文件

在前一篇我们已经准备好了MIRBASE的miRNA参考序列文件。

mkdir -p ~/refer/miRNA
cd ~/refer/miRNA

wget https://www.mirbase.org/ftp/CURRENT/hairpin.fa.gz ##　38589　reads
wget https://www.mirbase.org/ftp/CURRENT/mature.fa.gz ## 48885 reads 
wget https://www.mirbase.org/ftp/CURRENT/genomes/hsa.gff3

gzip -d hairpin.fa.gz
gzip -d mature.fa.gz

grep sapiens mature.fa |wc -l # 2656 
grep sapiens hairpin.fa |wc # 1917

## Homo sapiens
perl -alne '{if(/^>/){if(/Homo/){$tmp=1}else{$tmp=0}};next if $tmp!=1;s/U/T/g if !/>/;print }' hairpin.fa > hairpin.human.fa
perl -alne '{if(/^>/){if(/Homo/){$tmp=1}else{$tmp=0}};next if $tmp!=1;s/U/T/g if !/>/;print }' mature.fa > mature.human.fa
# 这里值得一提的是miRBase数据库下载的序列，居然都是用U表示的，也就是说就是miRNA序列，而不是转录成该miRNA的基因序列，而我们测序的都是基因序列。

配置conda

conda create -n mirna
conda activate mirna
conda install -y -c bioconda hisat2 bowtie samtools fastp bowtie2 fastx_toolkit fastqc multiqc

bowtie构建索引

##当前refer目录下的文件
(mirna) root 11:02:43 /home/kaoku/refer/miRNA
$ ls -lh
总用量 11M
hairpin.fa
hairpin.human.fa
hsa.gff3
mature.fa
mature.human.fa

bowtie-build mature.human.fa hsa-mature-bowtie-index
bowtie-build hairpin.human.fa hsa-hairpin-bowtie-index

##得到文件
hsa-hairpin-bowtie-index.1.ebwt
hsa-hairpin-bowtie-index.2.ebwt
hsa-hairpin-bowtie-index.3.ebwt
hsa-hairpin-bowtie-index.4.ebwt
hsa-hairpin-bowtie-index.rev.1.ebwt
hsa-hairpin-bowtie-index.rev.2.ebwt
hsa-mature-bowtie-index.1.ebwt
hsa-mature-bowtie-index.2.ebwt
hsa-mature-bowtie-index.3.ebwt
hsa-mature-bowtie-index.4.ebwt
hsa-mature-bowtie-index.rev.1.ebwt
hsa-mature-bowtie-index.rev.2.ebwt

2 文章数据下载

下载SRA文件

vdb-config -i 
##进入配置文件后按c,在location of user-repository设置下载好的文件的位置,设置完成后按s保存x退出界面

##添加到环境
export PATH="$PATH:/home/kaoku/biosoft/sratoolkit/sratoolkit.3.0.0-ubuntu64/bin"

##下载第一个项目p1的SRA文件
SRR1542714
SRR1542715
SRR1542716
SRR1542717
SRR1542718
SRR1542719

cat srr.list | while read id; do ( prefetch $id & ); done

mv SRR* /home/kaoku/project/miRNA/p1/

##第二个项目p2的SRA文件
SRR7707744
SRR7707745
SRR7707746
SRR7707747
SRR7707748
SRR7707749
SRR7707750
SRR7707751
SRR7707752
SRR7707753
SRR7707754

cat srr.list | while read id; do ( prefetch $id & ); done

mv SRR* /home/kaoku/project/miRNA/p2/

转fastq文件

##p1目录
echo {14..19} |sed 's/ /\n/g' |while read id; \
do ( fastq-dump -O ./ --gzip --split-3 SRR15427${id} & ) ; \
done

##p2目录
echo {44..54} |sed 's/ /\n/g' |while read id; \
do ( fastq-dump -O ./ --gzip --split-3 SRR77077${id} & ) ; \
done

质控和清洗测序数据

##新建qc目录，进行质量控制
ls ../raw_fq/*gz | xargs fastqc -t 10 -o  ./
multiqc ./

##清洗主要是fastx_toolkit修剪
ls *gz |while read id
do
echo $id 
# fastqc $id 
zcat $id| fastq_quality_filter -v -q 20 -p 80 -Q33 -i - -o tmp ;
fastx_trimmer -v -f 1 -l 27 -m 15 -i tmp -Q33 -z -o ${id%%.*}_clean.fq.gz ;
# fastqc ${id%%.*}_clean.fq.gz 
done

##会进行两步操作，质量控制并修剪长度
ls *gz |while read id
do
echo $id 
zcat $id| fastq_quality_filter -v -q 20 -p 80 -Q33 -i - -o tmp ;
fastx_trimmer -v -f 1 -l 27 -m 15 -i tmp -Q33 -z -o ${id%%.*}_clean.fq.gz ;
done

#fastq_quality_filter和fastx_trimmer两个命令，都是来自于fastx_toolkit软件包：

#fastq_quality_filter 代表根据质量过滤序列
#fastx_trimmer 代表缩短FASTQ或FASTQ文件中的读数，根据质量修剪（剪切）序列。

##文件大小变化情况
-rw-r--r-- 1 root root  39M  8月 29 16:06 SRR1542714_clean.fq.gz
-rw-r--r-- 1 root root  50M  8月 29 14:50 SRR1542714.fastq.gz

第一种比对策略：bowtie比对 miRBaes数据库下载的序列 +定量

##同时比对mature与hairpin，注意hsa-mature-bowtie-index是文件名前缀，不是目录名
mature=/home/kaoku/refer/miRNA/hsa-mature-bowtie-index
hairpin=/home/kaoku/refer/miRNA/hsa-hairpin-bowtie-index

ls *_clean.fq.gz |while read id
do
echo $id 
bowtie -n 0 -m1 --best --strata $mature $id -S ${id}_matrue.sam 
bowtie -n 0 -m1 --best --strata $hairpin $id -S ${id}_hairpin.sam 
done

##压缩.sam成.bam并删除.sam
ls *.sam|while read id ;do (samtools sort -O bam -@ 5 -o $(basename ${id} ".sam").bam ${id});done
rm *.sam

-n 0 不允许错配，但是结果来看比对率很低，修改成允许一个错配的话：

ls *_clean.fq.gz |while read id
do
echo $id 
bowtie -n 1 -m1 --best --strata $mature $id -S ${id}_matrue.n1.sam 
bowtie -n 1 -m1 --best --strata $hairpin $id -S ${id}_hairpin.n1.sam 
done

随后得到了.bam文件

(mirna) root 16:46:10 /home/kaoku/project/miRNA/p1/align
$ ls
SRR1542714_clean.fq.gz_hairpin.bam  SRR1542715_clean.fq.gz_matrue.bam   SRR1542717_clean.fq.gz_hairpin.bam  SRR1542718_clean.fq.gz_matrue.bam
SRR1542714_clean.fq.gz_matrue.bam   SRR1542716_clean.fq.gz_hairpin.bam  SRR1542717_clean.fq.gz_matrue.bam   SRR1542719_clean.fq.gz_hairpin.bam
SRR1542715_clean.fq.gz_hairpin.bam  SRR1542716_clean.fq.gz_matrue.bam   SRR1542718_clean.fq.gz_hairpin.bam  SRR1542719_clean.fq.gz_matrue.bam

##查看每个基因表达量
samtools view .bam | cut -f 3 | sort | uniq -c

批量走 samtools idxstats 得到表达量矩阵：

ls *.bam | xargs -i samtools index {}
ls *.bam | while read id ;do (samtools idxstats ${id} > ${id}.txt );done
# samtools view matrue.bam |cut -f 3 |sort |uniq -c

第二种比对策略：bowtie2比对参考基因组+定量

下载hg38参考基因组：

wget -c https://genome-idx.s3.amazonaws.com/bt/GRCh38_noalt_as.zip

比对参考基因组

##查看一下index目录
(mirna) root 19:22:44 /home/kaoku/refer/miRNA/hg38
$ ls
GRCh38_noalt_as.zip  hg38.1.bt2  hg38.2.bt2  hg38.3.bt2  hg38.4.bt2  hg38.rev.1.bt2  hg38.rev.2.bt2

index=/home/kaoku/refer/miRNA/hg38/hg38

ls *_clean.fq.gz |while read id
do
echo $id 
bowtie2 -p 4 -x $index -U $id | samtools sort -@ 4 -o ${id}_genome.bam -
done

##比对结果
SRR1542714_clean.fq.gz_genome.bam  SRR1542716_clean.fq.gz_genome.bam  SRR1542718_clean.fq.gz_genome.bam
SRR1542715_clean.fq.gz_genome.bam  SRR1542717_clean.fq.gz_genome.bam  SRR1542719_clean.fq.gz_genome.bam

定量分析

##确保安装subread

gtf=/home/kaoku/refer/miRNA/hsa.gff3

featureCounts -T 4 -F gff -M -t miRNA -g Name -a $gtf -o all.counts.mature.txt *genome* 1>counts.mature.log 2>&1

featureCounts -T 4 -F gff -M -t miRNA_primary_transcript -g Name -a $gtf -o all.counts.hairpin.txt *genome* 1>counts.hairpin.log 2>&1

##结果
all.counts.hairpin.txt
all.counts.hairpin.txt.summary
all.counts.mature.txt 
all.counts.mature.txt.summary
counts.hairpin.log
counts.mature.log

3 比较两种定量分析方法差异

比较miRBaes数据库下载的序列和参考基因组的两组表达矩阵。

QQ截图20220829203444.png

hairpin<-read.table('all.counts.hairpin.txt',header = T)
m1<-hairpin[,7:12]
rownames(m1)<-hairpin[,1]

paste *_clean.fq.gz_hairpin.bam.txt | cut -f 1,3,7,11,15,19,23 > hairpin.txt

m2<-read.table('hairpin.txt',header = F,row.names = 1)
ids<-intersect(rownames(m1),rownames(m2))

##构建merge matrix
overM<-cbind(m1[ids,],m2[ids,])
##LOG转换后的COUNTS矩阵
pheatmap::pheatmap(cor(log2(overM+1)))

QQ截图20220829211943.png

##取CPM后的矩阵
cpmM<-log(edgeR::cpm(overM)+1)
pheatmap::pheatmap(cor(cpmM))

##选取TOP500的MAD基因
highM=cpmM[names(tail(sort(apply(cpmM,1,mad)),500)),]
pheatmap::pheatmap(cor(highM))

##独立选取各自的top500的mad基因，再去交集
hg1=names(tail(sort(apply(cpmM[,1:6],1,mad)),500))
hg2=names(tail(sort(apply(cpmM[,7:12],1,mad)),500))
hg=intersect(hg1,hg2)
hgM=cpmM[hg,]
pheatmap::pheatmap(cor(hgM))

QQ截图20220829212715.png

流程对于定量分析结果影响很大。

##进一步探索两种方法对于比对数的差异
colnames(overM)=NULL

换读取另一个

mature<-read.table('all.counts.mature.txt',header = T)
m1<-mature[,7:12]
rownames(m1)<-mature[,1]
cpmM<-log(edgeR::cpm(overM)+1)
pheatmap::pheatmap(cor(cpmM))

m2<-read.table('mature.txt',header = F,row.names = 1)

ids<-intersect(rownames(m1),rownames(m2))
overM<-cbind(m1[ids,],m2[ids,])
pheatmap::pheatmap(cor(log2(overM+1)))

highM=cpmM[names(tail(sort(apply(cpmM,1,mad)),500)),]
pheatmap::pheatmap(cor(highM))

hg1=names(tail(sort(apply(cpmM[,1:6],1,mad)),500))
hg2=names(tail(sort(apply(cpmM[,7:12],1,mad)),500))
hg=intersect(hg1,hg2)
hgM=cpmM[hg,]
pheatmap::pheatmap(cor(hgM))

colnames(overM)=NULL

paste *_clean.fq.gz_matrue.bam.txt | cut -f 1,3,7,11,15,19,23 > mature.txt

QQ截图20220829213919.png

4 DEG差异分析

##从mature矩阵开始
rm(list = ls())
options(stringsAsFactors = F)
mature<-read.table('all.counts.mature.txt',header = T)
m1<-mature[,7:12]
rownames(m1)<-mature[,1]

##判断分组
pheatmap::pheatmap(cor(m1))

gl<-rep(c('control','ET1'),3)

##DEG分析
group_list<-gl
exprSet<-m1

suppressMessages(library(DESeq2)) 
(colData <- data.frame(row.names=colnames(exprSet), group_list=group_list) )
dds <- DESeqDataSetFromMatrix(countData = exprSet,
                              colData = colData,
                              design = ~ group_list)
dds <- DESeq(dds)

res <- results(dds, 
               contrast=c("group_list","ET1","control"))
resOrdered <- res[order(res$padj),]
head(resOrdered)
DEG=as.data.frame(resOrdered)

DEG <- na.omit(DEG)

##判断上下调基因
df=DEG
colnames(df)
df$V<- -log10(df$pvalue) 
library(ggpubr)
ggscatter(df, x = "log2FoldChange", y = "V", size=0.5)

df$g<-ifelse(df$pvalue>0.01, 'stable',
             ifelse( df$log2FoldChange > 2 , 'up',
                    ifelse( df$log2FoldChange < - 2, 'down', 'stable')))  
table(df$g)

down stable     up 
 43    492     35

5 靶向基因预测