生物信息学分析生信基础知识

GFF或GTF格式转bed

2022-09-07  本文已影响0人  JeremyL

# 1. gff2bed和gtf2bed

首先,gff2bed和gtf2bed都是BEDOPS提供的程序,所以使用之前需要先安装BEDOPS。

## Linux平台安装BEDOPS

$ git clone https://github.com/bedops/bedops.git
$ cd bedops
$ make
$ make install

将编译生成的可执行文件复制到PATH环境变量所包含的目录中(写入/usr/local/bin通常需要root权限):

$ cp bin/* /usr/local/bin

## 使用

本文使用的GFF文件和GTF文件均来自GENCODE(v19,对应GRCh37)。
GFF文件:gencode.v19.annotation.gff3
GTF文件:gencode.v19.annotation.gtf

### GFF格式

以下两条命令等价(gff2bed是对convert2bed的封装),任选其一即可:

gff2bed <gencode.v19.annotation.gff3 > test.bed
convert2bed -i gff -o bed <gencode.v19.annotation.gff3 > test.bed

输入文件gencode.v19.annotation.gff3的前几行:

##gff-version 3
#description: evidence-based annotation of the human genome (GRCh37), version 19 (Ensembl 74)
#provider: GENCODE
#contact: gencode@sanger.ac.uk
#format: gff3
#date: 2014-09-18
##sequence-region chr1 1 249250621
chr1    HAVANA  gene    11869   14412   .       +       .       ID=ENSG00000223972.4;gene_id=ENSG00000223972.4;transcript_id=ENSG0000
0223972.4;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=pseudogene;transcript_status=KNOWN;transcript_name
=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2
chr1    HAVANA  transcript      11869   14409   .       +       .       ID=ENST00000456328.2;Parent=ENSG00000223972.4;gene_id=ENSG000
00223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcr
ipt;transcript_status=KNOWN;transcript_name=DDX11L1-002;level=2;havana_gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751
.1;tag=basic
chr1    HAVANA  exon    11869   12227   .       +       .       ID=exon:ENST00000456328.2:1;Parent=ENST00000456328.2;gene_id=ENSG0000
0223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcri
pt;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=1;exon_id=ENSE00002234944.1;level=2;havana_gene=OTTHUMG00000000961
.2;havana_transcript=OTTHUMT00000362751.1;tag=basic
chr1    HAVANA  exon    12613   12721   .       +       .       ID=exon:ENST00000456328.2:2;Parent=ENST00000456328.2;gene_id=ENSG0000
0223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcri
pt;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=2;exon_id=ENSE00003582793.1;level=2;havana_gene=OTTHUMG00000000961
.2;havana_transcript=OTTHUMT00000362751.1;tag=basic

输出文件test.bed的前几行(起始坐标已由GFF的1-based转换为BED的0-based,例如11869变为11868):

chr1    11868   12227   ENSG00000223972.4       .       +       HAVANA  exon    .       ID=exon:ENST00000456328.2:1;Parent=ENST000004
56328.2;gene_id=ENSG00000223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript
_type=processed_transcript;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=1;exon_id=ENSE00002234944.1;level=2;havana
_gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751.1;tag=basic
chr1    11868   14409   ENSG00000223972.4       .       +       HAVANA  transcript      .       ID=ENST00000456328.2;Parent=ENSG00000
223972.4;gene_id=ENSG00000223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcrip
t_type=processed_transcript;transcript_status=KNOWN;transcript_name=DDX11L1-002;level=2;havana_gene=OTTHUMG00000000961.2;havana_trans
cript=OTTHUMT00000362751.1;tag=basic
chr1    11868   14412   ENSG00000223972.4       .       +       HAVANA  gene    .       ID=ENSG00000223972.4;gene_id=ENSG00000223972.
4;transcript_id=ENSG00000223972.4;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=pseudogene;transcript_stat
us=KNOWN;transcript_name=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2
chr1    11871   12227   ENSG00000223972.4       .       +       ENSEMBL exon    .       ID=exon:ENST00000515242.2:1;Parent=ENST000005
15242.2;gene_id=ENSG00000223972.4;transcript_id=ENST00000515242.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript
_type=transcribed_unprocessed_pseudogene;transcript_status=KNOWN;transcript_name=DDX11L1-201;exon_number=1;exon_id=ENSE00002234632.1;
level=3;havana_gene=OTTHUMG00000000961.2
chr1    11871   14412   ENSG00000223972.4       .       +       ENSEMBL transcript      .       ID=ENST00000515242.2;Parent=ENSG00000
223972.4;gene_id=ENSG00000223972.4;transcript_id=ENST00000515242.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcrip
t_type=transcribed_unprocessed_pseudogene;transcript_status=KNOWN;transcript_name=DDX11L1-201;level=3;havana_gene=OTTHUMG00000000961.
2

### GTF 格式

gtf2bed <gencode.v19.annotation.gtf > test.bed
convert2bed -i gtf -o bed <gencode.v19.annotation.gtf > test.bed

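# 2. 自己写的shell命令

注意:下面的awk命令直接输出GTF/GFF中的起始坐标($4,1-based),而BED格式的起始坐标是0-based(上文gff2bed的输出中11869被转换为11868)。如果需要与gff2bed/gtf2bed一致的坐标,应输出$4-1。另外,这些命令输出的列顺序是自定义的,并非标准BED的name、score、strand顺序。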

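## GTF

注意:GTF的属性值带有双引号,下面的命令会原样输出(如"ENSG00000223972.4")。第二条命令中的sed 's/;//'只会删除每行的第一个分号;按GENCODE v19 GTF的属性顺序推算,此时$12会带有结尾的分号,$21取到的是属性名transcript_status而不是gene_name的值(gene_name的值在$18)。建议运行后先用head检查输出,必要时调整列号。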

cat gencode.v19.annotation.gtf | awk -F '[\t *;]' '/^chr/{if($3=="transcript"){print $1,$4,$5,$10,$13,$22,$7,$3}}' OFS="\t" >test.bed

cat gencode.v19.annotation.gtf |sed 's/;//' | awk -F '[\t *]' '/^chr/{if($3=="transcript"){print $1,$4,$5,$10,$12,$21,$7,$3}}' OFS="\t" >test.bed

## GFF

cat gencode.v19.annotation.gff3 | awk -F '[\t;]' '/^chr/{if($3=="exon"){print $1,$4,$5,$9,$11,$12,$15,$7,$3}}' OFS="\t" | sed -e 's/ID=//' -e 's/gene_id=//' -e 's/transcript_id=//' -e 's/gene_name=//' >test.bed
上一篇下一篇

猜你喜欢

热点阅读