转录组从下机数据到GO、kegg、GSEA
2019-03-25 本文已影响12人
chaimol
所有的命令粘贴于此,用于快速完成分析任务。具体软件参数,请参见各软件的官方文档。
#!/bin/bash
# Stage the sequencing reads, run FastQC, then trim every paired-end sample
# with Trimmomatic.
#
# Copy the delivered reads into ./data so the original files under
# ~/disk/lyb/ are never modified. The target directory is created first:
# without it, `cp` would write every read file onto a single file named "data".
mkdir -p ./data
# -print0 / -0 keeps file names containing whitespace intact; -I{} replaces
# the deprecated `xargs -i`.
find ./Cleandata -name '*fq.gz' -print0 | xargs -0 -I{} cp {} ./data
# Everything below is meant to be run from ~/disk/lyb/data/
# 1. Quality control
fastqc *.fq.gz -t 8
# Input files are named <sample><bg1> / <sample><bg2>.
bg1='RNA_R1.fq.gz'
bg2='RNA_R2.fq.gz'
# Sample prefixes; later pipeline stages reuse this array.
bef=(NS-1 NS-2 NS-3 WT-1 WT-2 WT-3)
# Iterate over the array length instead of a hard-coded 6 so that adding or
# removing a sample only requires editing `bef`.
for ((i = 0; i < ${#bef[@]}; i++)); do
    inA1="${bef[$i]}$bg1"
    inA2="${bef[$i]}$bg2"
    out1="${bef[$i]}paired-R1.fq.gz"
    out2="${bef[$i]}paired-R2.fq.gz"
    unpaired1="${bef[$i]}unpaired-R1.fq.gz"
    unpaired2="${bef[$i]}unpaired-R2.fq.gz"
    # Output order expected by Trimmomatic PE:
    # R1 paired, R1 unpaired, R2 paired, R2 unpaired.
    # TruSeq3-PE.fa is given as a relative path, so it must be present in the
    # current directory.
    java -jar /home/guo/tool/Trimmomatic-0.38/trimmomatic-0.38.jar PE -threads 12 -phred33 \
        "$inA1" "$inA2" "$out1" "$unpaired1" "$out2" "$unpaired2" \
        ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
    # Log the file names that were just processed.
    echo "$inA1,$inA2,$out1,$out2,$unpaired1,$unpaired2"
done
# Build the HISAT2 index and align the trimmed reads.
# The index-building commands are meant to be run from /disks/backup/chaim/maize/
#
# Exon and splice-site tables derived from the annotation; both are passed to
# hisat2-build below.
/home/chaim/disk/soft/hisat2/extract_exons.py Zea_mays.B73_RefGen_v4.42.gtf > genome.exon
/home/chaim/disk/soft/hisat2/extract_splice_sites.py Zea_mays.B73_RefGen_v4.42.gtf > genome.ss
# The VCF extractor takes <genome.fa> <vcf> <output prefix> and writes
# <prefix>.snp and <prefix>.haplotype itself; it does not print to stdout, so
# redirecting its output only produced an empty genome.snp.
/home/chaim/disk/soft/hisat2/hisat2_extract_snps_haplotypes_VCF.py Zea_mays.B73_RefGen_v4.42.fa zea_mays.vcf genome
# NOTE(review): genome.snp is not passed to hisat2-build (--snp/--haplotype),
# so the index below is not SNP-aware — confirm whether that is intended.
# 2.1 Build the index
# Run in the foreground: the alignment step below needs the finished index,
# and backgrounding the build made the two steps race.
hisat2-build -p 8 Zea_mays.B73_RefGen_v4.42.fa --ss genome.ss --exon genome.exon genome_tran
# 2.2 Align each sample's paired reads
# (read files are given as relative paths, i.e. looked up in the current directory)
for ((i = 0; i < ${#bef[@]}; i++)); do
    out1="${bef[$i]}paired-R1.fq.gz"
    out2="${bef[$i]}paired-R2.fq.gz"
    hisat2 -x /disks/backup/chaim/maize/genome_tran -p 16 -1 "$out1" -2 "$out2" \
        -S "${bef[$i]}.map.sam" --dta-cufflinks --novel-splicesite-outfile "${bef[$i]}.nsplice"
done
# Step 3: use samtools to sort each sample's SAM file and write it as BAM
# (6 samples in total); samtools' stderr goes to <sample>samtool_out.
i=0
while ((i < 6)); do
    sample=${bef[$i]}
    samtools sort -@ 8 -o "${sample}.map.bam" "${sample}.map.sam" 2>"${sample}samtool_out"
    i=$((i + 1))
done
# Step 4: assembly with StringTie (three rounds)
# Round 1: assemble transcripts for each sample against the reference
# annotation. The samples run in parallel as background jobs.
for ((i = 0; i < ${#bef[@]}; i++)); do
    stringtie "${bef[$i]}.map.bam" -G /disks/backup/chaim/maize/Zea_mays.B73_RefGen_v4.42.gtf -p 8 -o "${bef[$i]}.gtf" &
done
# Block until every per-sample assembly has finished; the merge below reads
# their output files.
wait
# Round 2: merge the per-sample assemblies into one annotation.
# Run in the foreground because round 3 needs the finished merged.gtf.
stringtie --merge -G /disks/backup/chaim/maize/Zea_mays.B73_RefGen_v4.42.gtf -p 8 -o merged.gtf NS-1.gtf NS-2.gtf NS-3.gtf WT-1.gtf WT-2.gtf WT-3.gtf 2>stringtie_merge
# Round 3: estimate expression abundance for each sample, using merged.gtf as
# the reference. This has to run inside a loop: outside one, ${bef[$i]}
# expands to an empty string and no sample is processed.
for ((i = 0; i < ${#bef[@]}; i++)); do
    stringtie "${bef[$i]}.map.bam" -G merged.gtf -p 8 -b "${bef[$i]}_out" -e -o "${bef[$i]}-st.gtf" &
done
wait
# Step 5: generate the count-matrix CSV files
# (path of the prepDE.py script)
# NOTE(review): `gtf2` is not created anywhere in this script — confirm it is
# the directory or sample list that points prepDE.py at the *-st.gtf files.
python2.7 /disks/backup/chaim/soft/prepDE.py -i gtf2
# Step 6: differential expression analysis with DESeq2
# The biocLite.R installer has been retired; install through BiocManager,
# which the rest of this file already uses. Only install what is missing.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("DESeq2", quietly = TRUE)) {
  BiocManager::install("DESeq2")
}
# Load packages
library(tidyverse)
library(DESeq2)
library(ggplot2)
# Import data
#setwd("/home/chaim/disk/lyb/data/")
#setwd("/mnt/d/RNA-seq/")
setwd("D:/RNA-seq/")
# Count matrix: one row per gene (row names taken from the gene_id column),
# one column per sample.
countData <- as.matrix(read.csv("gene_count_matrix.csv", row.names = "gene_id"))
# Hard-coded design: the first three columns are NS, the last three are WT.
# NOTE(review): this relies on the column order of gene_count_matrix.csv —
# confirm it is NS-1..3 followed by WT-1..3.
condition <- factor(c(rep("NS", 3), rep("WT", 3)), levels = c("NS", "WT"))
# Fail early with a clear message if the matrix does not hold exactly the
# samples the design above describes.
stopifnot(ncol(countData) == length(condition))
colData <- data.frame(row.names = colnames(countData), condition)
dds <- DESeqDataSetFromMatrix(
  countData = countData,
  colData = colData,
  design = ~ condition
)
dds <- DESeq(dds)
# Overall results, ordered by raw p-value
res <- results(dds)
res <- res[order(res$pvalue), ]
summary(res)
write.csv(as.data.frame(res), file = "All_results.csv")
table(res$padj < 0.05)
# Extract the differentially expressed genes (DEGs):
# adjusted p-value < 0.05 and |log2 fold change| > 1.
# NOTE(review): "NS" is the first factor level and therefore the reference,
# so fold changes are WT relative to NS — confirm this matches the
# "treat_vs_control" file name below.
diff_gene_deseq2 <- subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
dim(diff_gene_deseq2)
write.csv(as.data.frame(diff_gene_deseq2), file = "DEG_treat_vs_control.csv")
# ---- Disabled code, kept for reference: volcano plot and biomaRt annotation ----
# Volcano plot of the DESeq2 results (log2 fold change vs -log10 adjusted p-value):
# resdata <- res
# threshold <- as.factor(ifelse(resdata$padj < 0.001 & abs(resdata$log2FoldChange) >= 2 ,ifelse(resdata$log2FoldChange >= 2 ,'Up','Down'),'Not'))
# ggplot(resdata,aes(x=log2FoldChange,y=-log10(padj),colour=threshold)) + xlab("log2(Fold Change)")+ylab("-log10(qvalue)") + geom_point(size = 0.5,alpha=1) + ylim(0,200) + xlim(-12,12) + scale_color_manual(values=c("green","grey", "red"))
# # Install the biomaRt package
# source("http://bioconductor.org/biocLite.R")
# biocLite("biomaRt")
# install.packages('DT')
# # Annotate the differentially expressed genes with biomaRt
# library("biomaRt")
# listMarts()
#
# ensembl=useMart("ENSEMBL_MART_ENSEMBL")
# all_datasets <- listDatasets(ensembl)
# library(DT)
# datatable(all_datasets,options = list(searching=FALSE,pageLength=5,lengthMenu=c(5,10,15,20)))
# Install clusterProfiler and DOSE, used for GO/KEGG enrichment and GSEA.
# The biocLite.R installer has been retired, so everything goes through
# BiocManager; the bootstrap is done once instead of being repeated.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
for (pkg in c("clusterProfiler", "DOSE")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg)
  }
}
# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(DOSE)
library(DO.db)
library(clusterProfiler)
BiocManager::install("S4Vectors", version = "3.8")
# Install AnnotationHub
BiocManager::install("AnnotationHub", version = "3.8")
library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "zea mays")
# Annotation database for maize, fetched by its AnnotationHub record id.
maize <- hub[["AH66225"]]
length(keys(maize))
columns(maize)
# Try an ID conversion on the first key.
bitr(keys(maize)[1], "GID", c("ACCNUM", "ENTREZID", "UNIGENE"), maize)
# Leftover fragment of a column list from the original notes. It is kept as a
# comment because a bare, comma-terminated list of strings is a syntax error:
#   "ALIAS","EVIDENCE","EVIDENCELL",
# GO enrichment analysis
# using enrichGO
sample_genes <- keys(maize)
# NOTE(review): this overwrites the DESeq2 results table that is also named
# `res`; the DEG subset was extracted earlier, so the code below is unaffected.
res <- enrichGO(sample_genes, OrgDb = maize, pvalueCutoff = 1, qvalueCutoff = 1)
ego <- enrichGO(
  gene = row.names(diff_gene_deseq2),
  OrgDb = maize,
  keyType = "GENENAME",
  ont = "MF"
)
ensids <- c("Zm00001d011037", "Zm00001d035600", "Zm00001d035599")
cols <- c("SYMBOL", "GO")
# Namespace-qualified because tidyverse is loaded in this session and
# dplyr::select() would fail on an annotation database object.
AnnotationDbi::select(maize, keys = ensids, columns = cols, keytype = "GENENAME")
# Dot (bubble) plot
dotplot(ego, font.size = 5)
# Network plot
# NOTE(review): newer DOSE/clusterProfiler releases appear to replace
# enrichMap() with emapplot() from enrichplot — switch if this call is not found.
enrichMap(ego, vertex.label.cex = 1.2, layout = igraph::layout.kamada.kawai)
# Extra packages needed for the GO graph
for (pkg in c("topGO", "Rgraphviz")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    BiocManager::install(pkg)
  }
}
plotGOgraph(ego)
# GSEA analysis with gseGO
# (shell) Select the rows of genome_table.txt whose first field starts with
# "gene" and write their 3rd and 12th columns to the file gene_id.
# The pattern is anchored with ^ so that it matches "starts with gene" rather
# than "gene" anywhere in the field; awk reads the file directly, so no `cat`
# is needed.
awk '$1 ~ /^gene/ {print $3, $12}' genome_table.txt > gene_id