ggplot集锦

Txdb创建和读取

2021-08-18  本文已影响0人  日月其除

做chip-seq时候,使用chipseeker对找到的peak进行注释
需要加载相应的txdb文件


参考网站:
http://www.360doc.com/content/18/0402/01/33459258_742149876.shtml
https://mp.weixin.qq.com/s/_OPzvaEAbiMolCA2mqJXLw


方法1

使用gtf构建

library(GenomicFeatrues)
spombe <- makeTxDbFromGFF("test.gff3")
#从ucsc中下载,但是我没跑通
hg19.refseq.db <- makeTxDbFromUCSC(genome="hg19", table="refGene")
image.png

方法2

从Bioconductor中直接下载,Bioconductor提供了30个TxDb包

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
> hg38.refseq.db = TxDb.Hsapiens.UCSC.hg38.knownGene
> hg38.refseq.db 
TxDb object:
# Db type: TxDb
# Supporting package: GenomicFeatures
# Data source: UCSC
# Genome: hg38
# Organism: Homo sapiens
# Taxonomy ID: 9606
# UCSC Table: knownGene
# UCSC Track: GENCODE V36
# Resource URL: http://genome.ucsc.edu/
# Type of Gene ID: Entrez Gene ID
# Full dataset: yes
# miRBase build ID: NA
# Nb of transcripts: 232184
# Db created by: GenomicFeatures package from Bioconductor
# Creation time: 2021-04-28 16:32:07 +0000 (Wed, 28 Apr 2021)
# GenomicFeatures version at creation time: 1.41.3
# RSQLite version at creation time: 2.2.6
# DBSCHEMAVERSION: 1.2

补充一些txbd文件的操作

参考网站:http://www.360doc.com/content/19/1202/12/50736008_876886520.shtml

> library(TxDb.Hsapiens.UCSC.hg19.knownGene)
> Txdb = TxDb.Hsapiens.UCSC.hg19.knownGene
> seqinfo(Txdb)
Seqinfo object with 93 sequences (1 circular) from hg19 genome:
  seqnames       seqlengths isCircular genome
  chr1            249250621       <NA>   hg19
  chr2            243199373       <NA>   hg19
  chr3            198022430       <NA>   hg19
  chr4            191154276       <NA>   hg19
  chr5            180915260       <NA>   hg19
  ...                   ...        ...    ...
  chrUn_gl000245      36651       <NA>   hg19
  chrUn_gl000246      38154       <NA>   hg19
  chrUn_gl000247      36422       <NA>   hg19
  chrUn_gl000248      39786       <NA>   hg19
  chrUn_gl000249      38502       <NA>   hg19
> columns(Txdb)
 [1] "CDSCHROM"   "CDSEND"     "CDSID"     
 [4] "CDSNAME"    "CDSSTART"   "CDSSTRAND" 
 [7] "EXONCHROM"  "EXONEND"    "EXONID"    
[10] "EXONNAME"   "EXONRANK"   "EXONSTART" 
[13] "EXONSTRAND" "GENEID"     "TXCHROM"   
[16] "TXEND"      "TXID"       "TXNAME"    
[19] "TXSTART"    "TXSTRAND"   "TXTYPE" 
> seqlevels(Txdb) %>% head(10)
 [1] "chr1"  "chr2"  "chr3"  "chr4"  "chr5" 
 [6] "chr6"  "chr7"  "chr8"  "chr9"  "chr10"

> transcripts(Txdb) #转录本信息
GRanges object with 82960 ranges and 2 metadata columns:
                seqnames        ranges strand |
                   <Rle>     <IRanges>  <Rle> |
      [1]           chr1   11874-14409      + |
      [2]           chr1   11874-14409      + |
      [3]           chr1   11874-14409      + |
      [4]           chr1   69091-70008      + |
      [5]           chr1 321084-321115      + |
      ...            ...           ...    ... .
  [82956] chrUn_gl000237        1-2686      - |
  [82957] chrUn_gl000241   20433-36875      - |
  [82958] chrUn_gl000243   11501-11530      + |
  [82959] chrUn_gl000243   13608-13637      + |
  [82960] chrUn_gl000247     5787-5816      - |
              tx_id     tx_name
          <integer> <character>
      [1]         1  uc001aaa.3
      [2]         2  uc010nxq.1
      [3]         3  uc010nxr.1
      [4]         4  uc001aal.1
      [5]         5  uc001aaq.2
      ...       ...         ...
  [82956]     82956  uc011mgu.1
  [82957]     82957  uc011mgv.2
  [82958]     82958  uc011mgw.1
  [82959]     82959  uc022brq.1
  [82960]     82960  uc022brr.1
  -------
  seqinfo: 93 sequences (1 circular) from hg19 genome

> promoter =  promoters(Txdb, upstream = 1000, downstream = 1000); promoter
Warning message:
In valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE) :
  GRanges object contains 1 out-of-bound range
  located on sequence chrUn_gl000223. Note
  that ranges located on a sequence whose
  length is unknown (NA) or on a circular
  sequence are not considered out-of-bound
  (use seqlengths() and isCircular() to get
  the lengths and circularity flags of the
  underlying sequences). You can use trim() to
  trim these ranges. See
  ?`trim,GenomicRanges-method` for more
  information.
GRanges object with 82960 ranges and 2 metadata columns:
                   seqnames        ranges strand
                      <Rle>     <IRanges>  <Rle>
  uc001aaa.3           chr1   10874-12873      +
  uc010nxq.1           chr1   10874-12873      +
  uc010nxr.1           chr1   10874-12873      +
  uc001aal.1           chr1   68091-70090      +
  uc001aaq.2           chr1 320084-322083      +
         ...            ...           ...    ...
  uc011mgu.1 chrUn_gl000237     1687-3686      -
  uc011mgv.2 chrUn_gl000241   35876-37875      -
  uc011mgw.1 chrUn_gl000243   10501-12500      +
  uc022brq.1 chrUn_gl000243   12608-14607      +
  uc022brr.1 chrUn_gl000247     4817-6816      -
             |     tx_id     tx_name
             | <integer> <character>
  uc001aaa.3 |         1  uc001aaa.3
  uc010nxq.1 |         2  uc010nxq.1
  uc010nxr.1 |         3  uc010nxr.1
  uc001aal.1 |         4  uc001aal.1
  uc001aaq.2 |         5  uc001aaq.2
         ... .       ...         ...
  uc011mgu.1 |     82956  uc011mgu.1
  uc011mgv.2 |     82957  uc011mgv.2
  uc011mgw.1 |     82958  uc011mgw.1
  uc022brq.1 |     82959  uc022brq.1
  uc022brr.1 |     82960  uc022brr.1
  -------
  seqinfo: 93 sequences (1 circular) from hg19 genome

> exons(Txdb)
GRanges object with 289969 ranges and 1 metadata column:
                 seqnames      ranges strand |
                    <Rle>   <IRanges>  <Rle> |
       [1]           chr1 11874-12227      + |
       [2]           chr1 12595-12721      + |
       [3]           chr1 12613-12721      + |
       [4]           chr1 12646-12697      + |
       [5]           chr1 13221-14409      + |
       ...            ...         ...    ... .
  [289965] chrUn_gl000241 35706-35859      - |
  [289966] chrUn_gl000241 36711-36875      - |
  [289967] chrUn_gl000243 11501-11530      + |
  [289968] chrUn_gl000243 13608-13637      + |
  [289969] chrUn_gl000247   5787-5816      - |
             exon_id
           <integer>
       [1]         1
       [2]         2
       [3]         3
       [4]         4
       [5]         5
       ...       ...
  [289965]    289965
  [289966]    289966
  [289967]    289967
  [289968]    289968
  [289969]    289969
  -------
  seqinfo: 93 sequences (1 circular) from hg19 genome

同理:

cds(Txdb) #获得cds 区域
transcriptBy(Txdb) #基于基因的转录本分Granges list
exonsBy(Txdb) #基于基因的exons分Granges list

上一篇 下一篇

猜你喜欢

热点阅读