基因组坐标区间操作

【R - bioconducter练习】

2019-03-25  本文已影响33人  caokai001

Before start, Make sure following packages are already installed:
BSgenome, GenomicFeatures, GenomicRanges, BSgenome.Hsapiens.UCSC.hg19

library(BSgenome)
available.genomes()
library(BSgenome.Hsapiens.UCSC.hg19)

#####################################################

1. explore sequence composition of fly genome

#####################################################

### get sequence for chromosome 1
Seq= Hsapiens [["chr1"]]
Seq # shows some summaries

## get GC content
bases=alphabetFrequency(Seq,baseOnly=TRUE)
bases[1:4]
ntotBases=sum(bases[1:4])
baseFreq=bases[1:4]/ntotBases
GCcontent=baseFreq["C"]+baseFreq["G"]

## look at CG dinucleotide content
cg=matchPattern("CG", Seq)
ncg=length(cg)

## compute the observed to expected ratio
ncg/(baseFreq["C"]*baseFreq["G"]*ntotBases) ## this shows CG rarely stay together.

## compare to the observed to expected ratio of TG
tg=matchPattern("TG", Seq)
ntg=length(tg)
ntg/(baseFreq["T"]*baseFreq["G"]*ntotBases) ## this shows TG presented more than expected.

######### look at GC content and CG dinucleotide distribution in 1000 bp windows in whole genome.
## Note I'm only doing it in chr1 now.
ss=seq(1, length(Seq), by=1000)
ss=ss[-length(ss)] ## remove the last one
Seq.set=DNAStringSet(Seq, start=ss, end=ss+999)
ff=alphabetFrequency(Seq.set, baseOnly=TRUE)
pCG=(ff[,"C"]+ff[,"G"])/rowSums(ff)
hist(pCG[pCG>0],100)

## CG occurance
Seq.set=DNAStringSet(Seq, start=ss, end=ss+999)
nCG=vcountPattern("CG", Seq.set)
obsExp=nCG*1000/(ff[,"C"]*ff[,"G"])
mean(obsExp,na.rm=TRUE)
hist(obsExp,100) ## see a long tail, those are CpG islands

#####################################################################

2. Explore GC content at gene TSS (transcriptional starting site),

#####################################################################

## and their overlaps with CpG island.
## first obtain genes from UCSC using GenomicFeatures functions
library(GenomicFeatures)
txdb=makeTxDbFromUCSC(genom="hg19",tablename="refGene") ## this is slow, take a few minutes.
genes.hg19 = genes(txdb)

## get transcriptional start site.
## Note that txEnd is the start site for genes on "-" strand
tss=start(genes.hg19)
idx=which(strand(genes.hg19)=="-")
tss[idx]=end(genes.hg19)[idx]

## create GRanges object for the TSS, remove random and hap chromosomes
allchrs = as.character(seqnames(genes.hg19))
idx=c(grep("random", allchrs),grep("hap", allchrs))
## create ranges with TSS +/- 500 bp
TSS=GRanges(seqnames=Rle(allchrs[-idx]), ranges=IRanges(tss[-idx]-500,tss[-idx]+500))

## get GC contents of TSS for genes on chromosome 1
idx.chr1=seqnames(TSS)=="chr1"
Seq.set=DNAStringSet(Seq, start=start(TSS[idx.chr1]), end=end(TSS[idx.chr1]))
ff=alphabetFrequency(Seq.set, baseOnly=TRUE)
pCG.TSS=(ff[,"C"]+ff[,"G"])/rowSums(ff)
hist(pCG.TSS,50)

## compare with genome wide distribution of GC content
d1=density(pCG[pCG>0])
d2=density(pCG.TSS)
plot(d1, lwd=2)
lines(d2, col="red",lwd=2)
legend("topright", legend=c("Genome", "TSS"), lwd=2, col=c("black", "red"))
image.png
## CG dinucleotide around TSS, compute obs/Exp.
nCG=vcountPattern("CG", Seq.set)
obsExp.TSS=nCG*1000/(ff[,"C"]*ff[,"G"])
hist(obsExp.TSS, 50) ## bimodal distribution.

## compare with the genome wide one:
d1=density(obsExp[!is.na(obsExp)])
d2=density(obsExp.TSS)
plot(d1, lwd=2, xlim=c(0,1.5), main="Observed to expected CG ratio")
lines(d2, col="red",lwd=2)
legend("topright", legend=c("Genome", "TSS"), lwd=2, col=c("black", "red"))
image.png

OPTIONAL(选做题)

#####################################################################

3. Overlaps of CpG island and TSS

#####################################################################

Read in CpG island file, create GRanges object and

compute the percentage of TSS covered by CGI.

This part is to be finished by students.

上一篇下一篇

猜你喜欢

热点阅读