Awk文本处理(grep, sed和awk等)Shell和命令

awk学习笔记

2019-03-19  本文已影响132人  ShawnMagic

视频链接

无意间在B站看到一个up做的awk入门教程,思路清晰,没有废话,所以花了一个多小时学习了下,分了3集:

老年人健忘..还是做个笔记吧,下面是视频链接:
awk入门教程-upload by 正月点灯笼

基础

shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print NR "\t" $1 "\t" $2 "\t" $3}' tmp1.xls 
1   transcript_id   Bt_F_FPKM   J_F_FPKM
2   Gh_A01G0001 0.424353    0.580204666667
3   Gh_A01G0002 1.372276    1.12152666667
4   Gh_A01G0003 2.755143    2.54099033333
5   Gh_A01G0004 30.8250546667   29.1618696667
6   Gh_A01G0005 8.28325366667   10.730383
7   Gh_A01G0006 13.7739286667   11.6380556667
8   Gh_A01G0007 3.910698    4.715743
9   Gh_A01G0008 0.581012    1.11709433333
10  Gh_A01G0009 6.04281033333   7.47224133333

shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print NR, $0}' tmp1.xls
1 transcript_id Bt_F_FPKM   J_F_FPKM    BtJ_F_FPKM
2 Gh_A01G0001   0.424353    0.580204666667  0.48476
3 Gh_A01G0002   1.372276    1.12152666667   1.22913966667
4 Gh_A01G0003   2.755143    2.54099033333   2.96335666667
5 Gh_A01G0004   30.8250546667   29.1618696667   29.7063626667
6 Gh_A01G0005   8.28325366667   10.730383   10.7914463333
7 Gh_A01G0006   13.7739286667   11.6380556667   13.528866
8 Gh_A01G0007   3.910698    4.715743    4.198037
9 Gh_A01G0008   0.581012    1.11709433333   0.942497666667
10 Gh_A01G0009  6.04281033333   7.47224133333   6.05517166667

shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print NF "\t" $0}' tmp1.xls
4   transcript_id   Bt_F_FPKM   J_F_FPKM    BtJ_F_FPKM
4   Gh_A01G0001 0.424353    0.580204666667  0.48476
4   Gh_A01G0002 1.372276    1.12152666667   1.22913966667
4   Gh_A01G0003 2.755143    2.54099033333   2.96335666667
4   Gh_A01G0004 30.8250546667   29.1618696667   29.7063626667
4   Gh_A01G0005 8.28325366667   10.730383   10.7914463333
4   Gh_A01G0006 13.7739286667   11.6380556667   13.528866
4   Gh_A01G0007 3.910698    4.715743    4.198037
4   Gh_A01G0008 0.581012    1.11709433333   0.942497666667
4   Gh_A01G0009 6.04281033333   7.47224133333   6.05517166667

shawnwx@DrdeMacBook-Pro Kmeans$ awk '$1 == "Gh_A01G0007"{print}' tmp1.xls 
Gh_A01G0007 3.910698    4.715743    4.198037
# 加上双引号说明Gh_A01G0007是一个字符串而非变量(不加引号会被awk当作变量名)

内部变量

NR(当前行号) NF(当前行的字段数) FS(输入字段分隔符)

# 限定NR == 7打印第7行
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'NR == 7{print}' tmp1.xls 
Gh_A01G0006 13.7739286667   11.6380556667   13.528866
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'NF == 4{print}' tmp1.xls 
# 限定NF == 4,打印字段(field)数为4的行
transcript_id   Bt_F_FPKM   J_F_FPKM    BtJ_F_FPKM
Gh_A01G0001 0.424353    0.580204666667  0.48476
Gh_A01G0002 1.372276    1.12152666667   1.22913966667
Gh_A01G0003 2.755143    2.54099033333   2.96335666667
Gh_A01G0004 30.8250546667   29.1618696667   29.7063626667
Gh_A01G0005 8.28325366667   10.730383   10.7914463333
Gh_A01G0006 13.7739286667   11.6380556667   13.528866
Gh_A01G0007 3.910698    4.715743    4.198037
Gh_A01G0008 0.581012    1.11709433333   0.942497666667
Gh_A01G0009 6.04281033333   7.47224133333   6.05517166667

# 如果不加文件名,awk会从标准输入(stdin)读取:在终端里每输入一行,awk就按照给定的命令处理并输出这一行(按Ctrl+D结束输入)
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print $1, $2}'
hello world
hello world
## awk默认的分隔符为空白字符(一个或多个连续的空格或制表符)
hello, world
hello, world
hello,world 123 456
hello,world 123
# ===============定义全局变量================
## 用BEGIN开始,在BEGIN{}里面用FS定义分隔符为,
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'BEGIN{FS=","} {print $1, $2}'
hello world 123 456
hello world 123 456 
## 由于input中用空格分割的,但是刚才定义了分割符是,所以这里把所有的都看成一列
hello,world,123,456
hello world
## awk的输入分隔符和输出分隔符不同,虽然这里修改了默认的输入分隔符为,但是输出分割符仍旧是空格。
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'BEGIN{OFS=","} {print $1, $2}'
hello world 123 456
hello,world
awk 'BEGIN{FS=","; OFS=","} {print $1, $2}'
hello,world,123,456
hello,world
shawnwx@DrdeMacBook-Pro Kmeans$ awk 'BEGIN{FS=","; OFS="\t"} {print $1, $2}'
hello,world,123,456
hello   world
# ===============FILENAME==================
## 如果一个awk后面接两个file,会把file2接在file1后面依次处理,NR也连续累加,无法区分;这时候加上FILENAME就能看出从第几行开始是file2
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print NR, FILENAME, $0}' tmp1.xls tmp2.xls 
1 tmp1.xls transcript_id    Bt_F_FPKM   J_F_FPKM    BtJ_F_FPKM
2 tmp1.xls Gh_A01G0001  0.424353    0.580204666667  0.48476
3 tmp1.xls Gh_A01G0002  1.372276    1.12152666667   1.22913966667
4 tmp1.xls Gh_A01G0003  2.755143    2.54099033333   2.96335666667
5 tmp1.xls Gh_A01G0004  30.8250546667   29.1618696667   29.7063626667
6 tmp1.xls Gh_A01G0005  8.28325366667   10.730383   10.7914463333
7 tmp1.xls Gh_A01G0006  13.7739286667   11.6380556667   13.528866
8 tmp1.xls Gh_A01G0007  3.910698    4.715743    4.198037
9 tmp1.xls Gh_A01G0008  0.581012    1.11709433333   0.942497666667
10 tmp1.xls Gh_A01G0009 6.04281033333   7.47224133333   6.05517166667
11 tmp2.xls Gh_Sca277334G01 0   0.0122663333333 0
12 tmp2.xls Gh_Sca278127G01 0   0   0
13 tmp2.xls Gh_Sca278164G01 0   0   0
14 tmp2.xls Gh_Sca280882G01 0   0   0
15 tmp2.xls Gh_Sca283304G01 0.110361333333  0.050783    0.0192516666667
16 tmp2.xls Gh_Sca284875G01 0   0   0
17 tmp2.xls Gh_Sca286293G01 0.445272333333  0.442850333333  0.595937
18 tmp2.xls Gh_Sca286786G01 0   0   0
19 tmp2.xls Gh_Sca287394G01 0   0   0
20 tmp2.xls Gh_Sca288207G01 0   0   0
# ================隐藏某列==============
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{$2="xxx";print $0}' tmp1.xls 
transcript_id xxx J_F_FPKM BtJ_F_FPKM
Gh_A01G0001 xxx 0.580204666667 0.48476
Gh_A01G0002 xxx 1.12152666667 1.22913966667
Gh_A01G0003 xxx 2.54099033333 2.96335666667
Gh_A01G0004 xxx 29.1618696667 29.7063626667
Gh_A01G0005 xxx 10.730383 10.7914463333
Gh_A01G0006 xxx 11.6380556667 13.528866
Gh_A01G0007 xxx 4.715743 4.198037
Gh_A01G0008 xxx 1.11709433333 0.942497666667
Gh_A01G0009 xxx 7.47224133333 6.05517166667
# ===============打印文本最后一列=========
# 有些情况下某些列里的字符有空格,而空格又是awk默认的分隔符,或者文本各行的列数不一致,这时要打印最后一列就不能用"print $固定列号";NF是当前行的字段数,所以$NF总是最后一列,用print $NF即可
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print $NF}' tmp1.xls 
BtJ_F_FPKM
0.48476
1.22913966667
2.96335666667
29.7063626667
10.7914463333
13.528866
4.198037
0.942497666667
6.05517166667
# 同理,打印倒数第二列
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{print $(NF-1)}' tmp1.xls 
J_F_FPKM
0.580204666667
1.12152666667
2.54099033333
29.1618696667
10.730383
11.6380556667
4.715743
1.11709433333
7.47224133333
# =============自定义变量=================================
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a + b}'

4
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a b}'

13
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a - b}'

-2
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a * b}'

3
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a / b}'

0.333333
# 取余
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=3; print a % b}'


1
shawnwx@DrdeMacBook-Pro Kmeans$ awk '{a=1; b=2; c=3; print a b+3}'

15
# 加法的优先级高于字符串拼接,所以 a b+3 相当于 a (b+3),即 "1" 拼接 "5",得到15

第三讲 Regular Expression 正则表达式

  1. 书写正则表达式用//
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/abc/{print $0}' tmp3.txt 
abc
xxabc
xxabcxx
  2. /a.c/ 表示字母a,后接任意一个字符,再接字母c
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/a.c/{print $0}' tmp3.txt 
abc
xxabc
xxabcxx
# .只能代表一个字符,a和c之间如果多于一个字符就匹配不到
shawnwx@DrdeMacBook-Pro Kmeans$ cat tmp3.txt 
abc
xxabc
xxabcxx
a bc
a b c 
ab c
  3. /a\.c/ 反斜杠\表示转义字符,\.表示字面意义的点号,这样表示精确查找a.c
shawnwx@DrdeMacBook-Pro Kmeans$ vim tmp3.txt 
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/a\.c/{print $0}' tmp3.txt 
a.c
# 如果要匹配 /、\、?、. 等在awk正则表达式中有特殊含义的字符本身,都需要在前面加上\进行转义
/a\/c/
/a/c/
/a\\c/
/a\c/
/a\?c/
/a?c/
  4. ^ 和 $:^匹配行首,$匹配行尾
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/^abc/{print $0}' tmp3.txt 
abc
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/abc$/{print $0}' tmp3.txt 
abc
xxabc
  5. []:匹配方括号内列出的任意一个字符
shawnwx@DrdeMacBook-Pro Kmeans$ awk '/a[bdf]c/{print $0}' tmp3.txt 
abc
xxabc
xxabcxx
  6. *和+:*表示前一项重复0次或多次,+表示前一项重复1次或多次
  7. ?:表示前一项出现0次或1次
  8. {}:指定前一项重复的次数
上一篇 下一篇

猜你喜欢

热点阅读