RNAseq教程(3.2)

2021-01-04  本文已影响0人  周小钊

目录

1.Module 1 - Introduction to RNA sequencing

  1. Installation
  2. Reference Genomes
  3. Annotations
  4. Indexing
  5. RNA-seq Data
  6. Pre-Alignment QC

2.Module 2 - RNA-seq Alignment and Visualization

  1. Adapter Trim
  2. Alignment
  3. IGV
  4. Alignment Visualization
  5. Alignment QC

3.Module 3 - Expression and Differential Expression

  1. Expression
  2. Differential Expression
  3. DE Visualization
  4. Kallisto for Reference-Free Abundance Estimation

4.Module 4 - Isoform Discovery and Alternative Expression

  1. Reference Guided Transcript Assembly
  2. de novo Transcript Assembly
  3. Transcript Assembly Merge
  4. Differential Splicing
  5. Splicing Visualization

5.Module 5 - De novo transcript reconstruction

  1. De novo RNA-Seq Assembly and Analysis Using Trinity

6.Module 6 - Functional Annotation of Transcripts

  1. Functional Annotation of Assembled Transcripts Using Trinotate

3.2 Differential Expression

Ballgown

用Ballgown比较肿瘤和正常情况。详情请参考手册:

https://www.bioconductor.org/packages/release/bioc/html/ballgown.html `

使用所有复制,对已知(仅参考模式)转录本进行UHR与HBR比较:

首先创建一个文件,列出我们的6个表达式文件,然后查看该文件,检查这些结果:

printf "\"ids\",\"type\",\"path\"\n\"UHR_Rep1\",\"UHR\",\"UHR_Rep1\"\n\"UHR_Rep2\",\"UHR\",\"UHR_Rep2\"\n\"UHR_Rep3\",\"UHR\",\"UHR_Rep3\"\n\"HBR_Rep1\",\"HBR\",\"HBR_Rep1\"\n\"HBR_Rep2\",\"HBR\",\"HBR_Rep2\"\n\"HBR_Rep3\",\"HBR\",\"HBR_Rep3\"\n" > UHR_vs_HBR.csv
cat UHR_vs_HBR.csv

R
#Jason Walker, jason.walker[AT]wustl.edu
#Malachi Griffith, mgriffit[AT]wustl.edu
#Obi Griffith, obigriffith[AT]wustl.edu
#The Genome McDonnell Institute, Washington University School of Medicine

#R tutorial for Informatics for RNA-sequence Analysis workshops

library(ballgown)
library(genefilter)
library(dplyr)
library(devtools)

# Load phenotype data from a file we saved in the current working directory
pheno_data = read.csv("UHR_vs_HBR.csv")

# Load ballgown data structure and save it to a variable "bg"
bg = ballgown(samples=as.vector(pheno_data$path), pData=pheno_data)

# Display a description of this object
bg

# Load all attributes including gene name
bg_table = texpr(bg, 'all')
bg_gene_names = unique(bg_table[, 9:10])

# Save the ballgown object to a file for later use
save(bg, file='bg.rda')

# Perform differential expression (DE) analysis with no filtering
results_transcripts = stattest(bg, feature="transcript", covariate="type", getFC=TRUE, meas="FPKM")
results_genes = stattest(bg, feature="gene", covariate="type", getFC=TRUE, meas="FPKM")
results_genes = merge(results_genes, bg_gene_names, by.x=c("id"), by.y=c("gene_id"))

# Save a tab delimited file for both the transcript and gene results
write.table(results_transcripts, "UHR_vs_HBR_transcript_results.tsv", sep="\t", quote=FALSE, row.names = FALSE)
write.table(results_genes, "UHR_vs_HBR_gene_results.tsv", sep="\t", quote=FALSE, row.names = FALSE)

# Filter low-abundance genes. Here we remove all transcripts with a variance across the samples of less than one
bg_filt = subset (bg,"rowVars(texpr(bg)) > 1", genomesubset=TRUE)

# Load all attributes including gene name
bg_filt_table = texpr(bg_filt , 'all')
bg_filt_gene_names = unique(bg_filt_table[, 9:10])

# Perform DE analysis now using the filtered data
results_transcripts = stattest(bg_filt, feature="transcript", covariate="type", getFC=TRUE, meas="FPKM")
results_genes = stattest(bg_filt, feature="gene", covariate="type", getFC=TRUE, meas="FPKM")
results_genes = merge(results_genes, bg_filt_gene_names, by.x=c("id"), by.y=c("gene_id"))

# Output the filtered list of genes and transcripts and save to tab delimited files
write.table(results_transcripts, "UHR_vs_HBR_transcript_results_filtered.tsv", sep="\t", quote=FALSE, row.names = FALSE)
write.table(results_genes, "UHR_vs_HBR_gene_results_filtered.tsv", sep="\t", quote=FALSE, row.names = FALSE)

# Identify the significant genes with p-value < 0.05
sig_transcripts = subset(results_transcripts, results_transcripts$pval<0.05)
sig_genes = subset(results_genes, results_genes$pval<0.05)

# Output the signifant gene results to a pair of tab delimited files
write.table(sig_transcripts, "UHR_vs_HBR_transcript_results_sig.tsv", sep="\t", quote=FALSE, row.names = FALSE)
write.table(sig_genes, "UHR_vs_HBR_gene_results_sig.tsv", sep="\t", quote=FALSE, row.names = FALSE)

# Exit the R session
quit(save="no")

运行结束后查看结果

head UHR_vs_HBR_gene_results.tsv

这条染色体上有多少个基因?

grep -v feature UHR_vs_HBR_gene_results.tsv | wc -l

在UHR或HBR中有多少通过过滤?

grep -v feature UHR_vs_HBR_gene_results_filtered.tsv | wc -l

在这条染色体上发现了多少差异表达基因(p值< 0.05)?

grep -v feature UHR_vs_HBR_gene_results_sig.tsv | wc -l

展示前20个DE基因。看看IGV中的一些基因——它们有意义吗?

grep -v feature UHR_vs_HBR_gene_results_sig.tsv | sort -rnk 3 | head -n 20 #Higher abundance in UHR
grep -v feature UHR_vs_HBR_gene_results_sig.tsv | sort -nk 3 | head -n 20 #Higher abundance in HBR

将P<0.05的所有基因保存到一个新文件中。

grep -v feature UHR_vs_HBR_gene_results_sig.tsv | cut -f 6 | sed 's/\"//g' > DE_genes.txt
head DE_genes.txt

edgeR

mkdir -p de/htseq_counts
cd de/htseq_counts
perl -ne 'if ($_ =~ /gene_id\s\"(ENSG\S+)\"\;/) { $id = $1; $name = undef; if ($_ =~ /gene_name\s\"(\S+)"\;/) { $name = $1; }; }; if ($id && $name) {print "$id\t$name\n";} if ($_=~/gene_id\s\"(ERCC\S+)\"/){print "$1\t$1\n";}' $RNA_REF_GTF | sort | uniq > ENSG_ID2Name.txt
head ENSG_ID2Name.txt
cut -f 1 ENSG_ID2Name.txt | sort | uniq | wc
cut -f 2 ENSG_ID2Name.txt | sort | uniq | wc
cut -f 2 ENSG_ID2Name.txt | sort | uniq -c | sort -r | head
#Malachi Griffith, mgriffit[AT]wustl.edu
#Obi Griffith, obigriffith[AT]wustl.edu
#The McDonnell Genome Institute, Washington University School of Medicine

#R tutorial for Informatics for RNA-sequence Analysis workshops

#######################
# Loading Data into R #
#######################

#Set working directory where output will go
working_dir = "path_to/de/htseq_counts"
setwd(working_dir)

#Read in gene mapping
mapping=read.table("~/workspace/rnaseq/de/htseq_counts/ENSG_ID2Name.txt", header=FALSE, stringsAsFactors=FALSE, row.names=1)

# Read in count matrix
rawdata=read.table("~/workspace/rnaseq/expression/htseq_counts/gene_read_counts_table_all_final.tsv", header=TRUE, stringsAsFactors=FALSE, row.names=1)

# Check dimensions
dim(rawdata)

# Require at least 25% of samples to have count > 25
quant <- apply(rawdata,1,quantile,0.75)
keep <- which((quant >= 25) == 1)
rawdata <- rawdata[keep,]
dim(rawdata)

#################
# Running edgeR #
#################

# load edgeR
library('edgeR')

# make class labels
class <- factor( c( rep("UHR",3), rep("HBR",3) ))

# Get common gene names
genes=rownames(rawdata)
gene_names=mapping[genes,1]


# Make DGEList object
y <- DGEList(counts=rawdata, genes=genes, group=class)
nrow(y)

# TMM Normalization
y <- calcNormFactors(y)

# Estimate dispersion
y <- estimateCommonDisp(y, verbose=TRUE)
y <- estimateTagwiseDisp(y)

# Differential expression test
et <- exactTest(y)

# Print top genes
topTags(et)

# Print number of up/down significant genes at FDR = 0.05  significance level
summary(de <- decideTestsDGE(et, p=.05))
detags <- rownames(y)[as.logical(de)]


# Output DE genes
# Matrix of significantly DE genes
mat <- cbind(
 genes,gene_names,
 sprintf('%0.3f',log10(et$table$PValue)),
 sprintf('%0.3f',et$table$logFC)
)[as.logical(de),]
colnames(mat) <- c("Gene", "Gene_Name", "Log10_Pvalue", "Log_fold_change")

# Order by log fold change
o <- order(et$table$logFC[as.logical(de)],decreasing=TRUE)
mat <- mat[o,]

# Save table
write.table(mat, file="DE_genes.txt", quote=FALSE, row.names=FALSE, sep="\t")

#To exit R type the following
quit(save="no")

与其他结果比较

cat DE_genes.txt
cat /de/htseq_counts/DE_genes.txt
cut -f 1 DE_genes.txt | sort  > ballgown_DE_gene_symbols.txt
cut -f 2 de/htseq_counts/DE_genes.txt | sort > htseq_counts_edgeR_DE_gene_symbols.txt
上一篇 下一篇

猜你喜欢

热点阅读