一起生信啦啦啦

从ensembl提取intron序列 更新中

2018-10-23  本文已影响4人  苏牧传媒

从gff3提取intron的坐标信息:

# 提取mRNA和exon信息:

awk '{if ($3 == "mRNA" ) print $0}' Homo_sapiens.GRCh38.94.gff3 > mRNA.gff3

awk '{if ($3 == "exon" ) print $0}' Homo_sapiens.GRCh38.94.gff3 > exon.gff3

# 分别提取正负链的信息到bed:

awk 'BEGIN{FS="\t|:|;|=";OFS="\t"} {if($7=="+") print "chr"$1,$4,$5,$14,$16,$7,$11}' mRNA.gff3 | awk -F '\t' -v OFS='\t' 'gsub(/-[0-9]+/,"",$5)' > mRNA.+.bed

awk 'BEGIN{FS="\t|:|;|=";OFS="\t"} {if($7=="-") print "chr"$1,$4,$5,$14,$16,$7,$11}' mRNA.gff3 | awk -F '\t' -v OFS='\t' 'gsub(/-[0-9]+/,"",$5)' > mRNA.-.bed

awk 'BEGIN{FS="\t|:|;|=";OFS="\t"} {if($7=="+") print "chr"$1,$4,$5,$11,"rank="$23,$7}' exon.gff3 > exon.+.bed

awk 'BEGIN{FS="\t|:|;|=";OFS="\t"} {if($7=="-") print "chr"$1,$4,$5,$11,"rank="$23,$7}' exon.gff3 > exon.-.bed

# 取mRNA的exon补集:

bedtools subtract -a mRNA.+.bed -b exon.+.bed > intron.+.bed

bedtools subtract -a mRNA.-.bed -b exon.-.bed > intron.-.bed

# 取uniq的内含子坐标:

cat intron.+.bed intron.-.bed | sort -k1,1 -k2,2n | awk '{if($2!=start || $3!=end)print; start=$2;end=$3}'> intron.bed

# 构建成dict文件:

awk -v OFS='\t' '{if($6=="+") print $7,"intron",$1":"$2".."$3,$1,$2,$3,$6,".","intron:xxx",$4,$4","$7,$3-$2; else print $7,"intron",$1":complement("$2".."$3")",$1,$3,$2,$6,".","intron:xxx",$4,$4","$7,$3-$2}' intron.bed | awk -v OFS='\t' 'gsub(/chr/,"",$3)' > intron.dict

上一篇下一篇

猜你喜欢

热点阅读