awk

awk 是一个文本处理工具：每次读取一行（一条记录），并将其分割成若干字段（默认以空白字符，即空格或制表符分割），然后依次放入变量 $1, $2, $3, ..., $NF 中，而 $0 表示整行内容；action 用来对这些数据进行处理。

# program
[pattern] { action }
# 这里的 pattern 可以省略，也可以是正则，也可以是以下将要讲到的 BEGIN, END 等。
# action 也可以省略，此时默认打印整行（$0）；但 pattern 和 action 不能同时省略
$ awk '/regexp/' input_file      # 省略了 action，其等价于 awk '/regexp/{print $0}' input_file
$ awk '{print $1}' input_file    # 省略了pattern

# mail_list 用于 awk 分析使用
Amelia       555-5553     amelia.zodiacusque@gmail.com    F
Anthony      555-3412     anthony.asserturo@hotmail.com   A
Becky        555-7685     becky.algebrarum@gmail.com      A
Bill         555-1675     bill.drowning@hotmail.com       A
Broderick    555-0542     broderick.aliquotiens@yahoo.com R
Camilla      555-2912     camilla.infusarum@skynet.be     R
Fabius       555-1234     fabius.undevicesimus@ucb.edu    F
Julie        555-6699     julie.perscrutabor@skeeve.com   F
Martin       555-6480     martin.codicibus@hotmail.com    A
Samuel       555-3430     samuel.lanceolis@shu.edu        A
Jean-Paul    555-2127     jean-paul.campanorum@nyu.edu     R

# inventory-shipped
Jan  13  25  15 115
Feb  15  32  24 226
Mar  15  24  34 228
Apr  31  52  63 420
May  16  34  29 208
Jun  31  42  75 492
Jul  24  34  67 436
Aug  15  34  47 316
Sep  13  55  37 277
Oct  29  54  68 525
Nov  20  87  82 577
Dec  17  35  61 401

Jan  21  36  64 620
Feb  26  58  80 652
Mar  24  75  70 495
Apr  21  70  74 514

执行方式

awk [options] -f progfile [--] file …
awk [options] [--] 'program' file …

# -- 表示参数 options 的结束

# eg
$ awk '{print $1}' mail_list 
Amelia
Anthony
Becky
...

# 将处理代码写入 program_file 中，用 -f 方式加载
awk -f program_file input_file1 input_file2 ...

还可以写成脚本方式

#!/bin/awk -f

BEGIN { action }    # 文本开始时执行一次

pattern { action }     # 文本处理主要在这里

END { action }     # 结束时执行一次

先来进行一些简单的应用

$ awk '/li/ { print $0 }' mail_list
-| Amelia       555-5553     amelia.zodiacusque@gmail.com    F
-| Broderick    555-0542     broderick.aliquotiens@yahoo.com R
-| Julie        555-6699     julie.perscrutabor@skeeve.com   F
-| Samuel       555-3430     samuel.lanceolis@shu.edu        A

$ awk 'NF > 0' mail_list    
# NF 是当前行的字段数量；NF > 0 表示打印所有非空行

$ awk 'BEGIN { for (i = 1; i <= 7; i++) print int(101 * rand()) }'   
# 随机 0-100 的 7 个数

$ ls -l | awk '{ x += $5 } END{ print "total bytes:" x }'   
 # 统计当前目录文件总大小

$ awk -F: '{ print $1 }' /etc/passwd | sort    
# -F 指定分割符

$ awk 'END { print NR }' mail_list    
# NR 是行数，每读入一行都会递增，结束时就是整个文本的行数了

$ awk 'NR % 2 == 0' mail_list   
# 打印所有偶数行

包含其它文件 @

## test1 ##
BEGIN { print "test1" }

## test2 ##
@include "test1"
BEGIN { print "test2" }

正则表达式比较 ~ 和 !~

exp ~ /regexp/ # 表示 exp 和 regexp 匹配的话返回真
exp !~ /regexp/ # 表示 exp 和 regexp 不匹配时返回真

$ awk '$1 ~ /J/' inventory-shipped
-| Jan  13  25  15 115
-| Jun  31  42  75 492
-| Jul  24  34  67 436
-| Jan  21  36  64 620

$ awk '$1 !~ /J/' inventory-shipped
-| Feb  15  32  24 226
-| Mar  15  24  34 228
-| Apr  31  52  63 420
-| May  16  34  29 208
…
 
以上两条命令分别等价于
awk '{ if ($1 ~ /J/) print $0 }' inventory-shipped
awk '{ if ($1 !~ /J/) print $0 }' inventory-shipped

忽略大小写

x = "aB"
if (x ~ /ab/) …   # this test will fail

IGNORECASE = 1
if (x ~ /ab/) …   # now it will succeed

Record Splitting

RS（记录分隔符）默认是换行符，通过设置该值可以改变每条记录（行）的结束位置

$ awk 'BEGIN { RS = "u" }  { print $0 }' mail_list
-| Amelia       555-5553     amelia.zodiac
-| sq
-| e@gmail.com    F
-| Anthony      555-3412     anthony.assert
-| ro@hotmail.com   A
-| Becky        555-7685     becky.algebrar
-| m@gmail.com      A
-| Bill         555-1675     bill.drowning@hotmail.com       A
-| Broderick    555-0542     broderick.aliq
...

# RS 可以设置为正则表达
# RT 为 RS 每次匹配到的内容
$ echo record 1 AAAA record 2 BBBB record 3 |
> gawk 'BEGIN { RS = "\n|( *[[:upper:]]+ *)" }
>             { print "Record =", $0,"and RT = [" RT "]" }'
-| Record = record 1 and RT = [ AAAA ]
-| Record = record 2 and RT = [ BBBB ]
-| Record = record 3 and RT = [
-| ]

# 类似的还有 FS, 域分割符，默认是空格，也可以使用正则
$ echo "John Q. Smith, 29 Oak St., Walamazoo, MI 42139" | awk 'BEGIN {FS=","};{print $2}'
-|  29 Oak St.

使用外部命令，这里使用 date 作为示范，现在有一个文件的格式如下：

temp_time
1,2017-10-18 03:44:59,2017-10-18 03:46:05
2,2017-10-18 03:48:28,2017-10-18 03:48:47
3,2017-10-18 03:49:23,2017-10-18 03:56:24
4,2017-10-18 03:55:16,2017-10-18 03:56:56
5,2017-10-18 03:58:08,2017-10-18 03:59:16
6,2017-10-18 03:58:32,2017-10-18 03:59:00
7,2017-10-18 03:59:55,2017-10-18 04:01:13

现在计算两个时间的差，可以写成以下的格式：

gawk -F ',' '{
cmd_f="date +%s -d \""$2"\""; 
cmd_t="date +%s -d \""$3"\""; 
cmd_f|getline a; cmd_t|getline b; 
print b - a;
}' ./temp_time

Linux命令学习-awk

awk