GFF或GTF格式转bed
2022-09-07 本文已影响0人
JeremyL
# 1. gff2bed和gtf2bed
首先,gff2bed 和 gtf2bed 都是 BEDOPS 提供的程序,所以使用之前需要先安装 BEDOPS。
## Linux平台安装BEDOPS
$ git clone https://github.com/bedops/bedops.git
$ cd bedops
$ make
$ make install
`make install` 会把编译好的可执行文件放到源码目录下的 bin/ 中;接着把它们复制到环境变量 PATH 包含的目录(写入 /usr/local/bin 可能需要 sudo 权限):
$ cp bin/* /usr/local/bin
## 使用
示例所用的 GFF 文件和 GTF 文件均来自 GENCODE。
GFF文件:gencode.v19.annotation.gff3
GTF文件:gencode.v19.annotation.gtf
### GFF格式
gff2bed <gencode.v19.annotation.gff3 > test.bed
convert2bed -i gff -o bed <gencode.v19.annotation.gff3 > test.bed
- 文件内容查看
gencode.v19.annotation.gff3
##gff-version 3
#description: evidence-based annotation of the human genome (GRCh37), version 19 (Ensembl 74)
#provider: GENCODE
#contact: gencode@sanger.ac.uk
#format: gff3
#date: 2014-09-18
##sequence-region chr1 1 249250621
chr1 HAVANA gene 11869 14412 . + . ID=ENSG00000223972.4;gene_id=ENSG00000223972.4;transcript_id=ENSG0000
0223972.4;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=pseudogene;transcript_status=KNOWN;transcript_name
=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2
chr1 HAVANA transcript 11869 14409 . + . ID=ENST00000456328.2;Parent=ENSG00000223972.4;gene_id=ENSG000
00223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcr
ipt;transcript_status=KNOWN;transcript_name=DDX11L1-002;level=2;havana_gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751
.1;tag=basic
chr1 HAVANA exon 11869 12227 . + . ID=exon:ENST00000456328.2:1;Parent=ENST00000456328.2;gene_id=ENSG0000
0223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcri
pt;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=1;exon_id=ENSE00002234944.1;level=2;havana_gene=OTTHUMG00000000961
.2;havana_transcript=OTTHUMT00000362751.1;tag=basic
chr1 HAVANA exon 12613 12721 . + . ID=exon:ENST00000456328.2:2;Parent=ENST00000456328.2;gene_id=ENSG0000
0223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcri
pt;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=2;exon_id=ENSE00003582793.1;level=2;havana_gene=OTTHUMG00000000961
.2;havana_transcript=OTTHUMT00000362751.1;tag=basic
test.bed(注意:BED 的起始坐标是 0-based,所以起始位置比 GFF 中的值小 1,例如 11869 变为 11868;输出记录已按坐标排序)
chr1 11868 12227 ENSG00000223972.4 . + HAVANA exon . ID=exon:ENST00000456328.2:1;Parent=ENST000004
56328.2;gene_id=ENSG00000223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript
_type=processed_transcript;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=1;exon_id=ENSE00002234944.1;level=2;havana
_gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751.1;tag=basic
chr1 11868 14409 ENSG00000223972.4 . + HAVANA transcript . ID=ENST00000456328.2;Parent=ENSG00000
223972.4;gene_id=ENSG00000223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcrip
t_type=processed_transcript;transcript_status=KNOWN;transcript_name=DDX11L1-002;level=2;havana_gene=OTTHUMG00000000961.2;havana_trans
cript=OTTHUMT00000362751.1;tag=basic
chr1 11868 14412 ENSG00000223972.4 . + HAVANA gene . ID=ENSG00000223972.4;gene_id=ENSG00000223972.
4;transcript_id=ENSG00000223972.4;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=pseudogene;transcript_stat
us=KNOWN;transcript_name=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2
chr1 11871 12227 ENSG00000223972.4 . + ENSEMBL exon . ID=exon:ENST00000515242.2:1;Parent=ENST000005
15242.2;gene_id=ENSG00000223972.4;transcript_id=ENST00000515242.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript
_type=transcribed_unprocessed_pseudogene;transcript_status=KNOWN;transcript_name=DDX11L1-201;exon_number=1;exon_id=ENSE00002234632.1;
level=3;havana_gene=OTTHUMG00000000961.2
chr1 11871 14412 ENSG00000223972.4 . + ENSEMBL transcript . ID=ENST00000515242.2;Parent=ENSG00000
223972.4;gene_id=ENSG00000223972.4;transcript_id=ENST00000515242.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcrip
t_type=transcribed_unprocessed_pseudogene;transcript_status=KNOWN;transcript_name=DDX11L1-201;level=3;havana_gene=OTTHUMG00000000961.
2
### GTF 格式
gtf2bed <gencode.v19.annotation.gtf > test.bed
convert2bed -i gtf -o bed <gencode.v19.annotation.gtf > test.bed
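# 2. 自己写的shell命令
注意:下面的 awk 命令直接输出 GTF/GFF 中 1-based 的起始坐标($4),没有像 gff2bed/gtf2bed 那样减 1;如果需要严格符合 BED 规范的结果,应把 $4 改为 $4-1。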
## GTF
cat gencode.v19.annotation.gtf | awk -F '[\t *;]' '/^chr/{if($3=="transcript"){print $1,$4,$5,$10,$13,$22,$7,$3}}' OFS="\t" >test.bed
cat gencode.v19.annotation.gtf |sed 's/;//' | awk -F '[\t *]' '/^chr/{if($3=="transcript"){print $1,$4,$5,$10,$12,$21,$7,$3}}' OFS="\t" >test.bed
## GFF
cat gencode.v19.annotation.gff3 | awk -F '[\t;]' '/^chr/{if($3=="exon"){print $1,$4,$5,$9,$11,$12,$15,$7,$3}}' OFS="\t" | sed -e 's/ID=//' -e 's/gene_id=//' -e 's/transcript_id=//' -e 's/gene_name=//' >test.bed