Linux -- sed / awk 杂学

2020-11-12  本文已影响0人  生信摆渡

删除文件中含特定字符串的行:

# 删除含"MIR"的行,直接修改原文件(原地编辑,不保留备份)
# sed -i '/MIR/d' test.txt

# 删除含"MIR"的行,但不改变文件本身,操作之后的结果可在终端显示或重定向
sed -e '/MIR/d'  test.txt > test.refined 

# 删除 a.txt 中含字符串"MIR"或"LNC"的行,将结果保存到 test.refined
sed '/MIR/d;/LNC/d' a.txt > test.refined

保留表格中某列是特定值的行

如下所示,我只想保留转录本的注释信息,即第三列为 transcript 的行

zcat hg19.ensGene.gtf.gz | head
chr1    ensGene transcript  11869   14409   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000456328";  gene_name "ENSG00000223972";
chr1    ensGene exon    11869   12227   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "1"; exon_id "ENST00000456328.1"; gene_name "ENSG00000223972";
chr1    ensGene exon    12613   12721   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "2"; exon_id "ENST00000456328.2"; gene_name "ENSG00000223972";
chr1    ensGene exon    13221   14409   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; exon_number "3"; exon_id "ENST00000456328.3"; gene_name "ENSG00000223972";
chr1    ensGene transcript  11872   14412   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000515242";  gene_name "ENSG00000223972";
chr1    ensGene exon    11872   12227   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; exon_number "1"; exon_id "ENST00000515242.1"; gene_name "ENSG00000223972";
chr1    ensGene exon    12613   12721   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; exon_number "2"; exon_id "ENST00000515242.2"; gene_name "ENSG00000223972";
chr1    ensGene exon    13225   14412   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000515242"; exon_number "3"; exon_id "ENST00000515242.3"; gene_name "ENSG00000223972";
chr1    ensGene transcript  11874   14409   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000518655";  gene_name "ENSG00000223972";
chr1    ensGene exon    11874   12227   .   +   .   gene_id "ENSG00000223972"; transcript_id "ENST00000518655"; exon_number "1"; exon_id "ENST00000518655.1"; gene_name "ENSG00000223972";

awk '{if($3~/^transcript$/)print}' hg19.refGene.gtf > genesAnno.gtf

cat genesAnno.gtf | head
chr1    refGene transcript  11869   14362   .   +   .   gene_id "LOC102725121"; transcript_id "NR_148357";  gene_name "LOC102725121";
chr1    refGene transcript  11874   14409   .   +   .   gene_id "DDX11L1"; transcript_id "NR_046018";  gene_name "DDX11L1";
chr22   refGene transcript  24666799    24813706    .   +   .   gene_id "SPECC1L"; transcript_id "NM_015330";  gene_name "SPECC1L";
chr1    refGene transcript  17369   17436   .   -   .   gene_id "MIR6859-1"; transcript_id "NR_106918";  gene_name "MIR6859-1";
chr1    refGene transcript  17369   17436   .   -   .   gene_id "MIR6859-2"; transcript_id "NR_107062";  gene_name "MIR6859-2";
chr1    refGene transcript  17369   17436   .   -   .   gene_id "MIR6859-3"; transcript_id "NR_107063";  gene_name "MIR6859-3";
chr1    refGene transcript  17369   17436   .   -   .   gene_id "MIR6859-4"; transcript_id "NR_128720";  gene_name "MIR6859-4";
chr1    refGene transcript  30366   30503   .   +   .   gene_id "MIR1302-2"; transcript_id "NR_036051";  gene_name "MIR1302-2";
chr1    refGene transcript  30366   30503   .   +   .   gene_id "MIR1302-9"; transcript_id "NR_036266";  gene_name "MIR1302-9";
chr1    refGene transcript  30366   30503   .   +   .   gene_id "MIR1302-10"; transcript_id "NR_036267";  gene_name "MIR1302-10";

cut

sort

上一篇 下一篇

猜你喜欢

热点阅读