假设我们有这样一个待处理的文件"grade.txt":
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
P.Bunny 02/99 48 Yellow 12 35 28
J.Troll 07/99 4842 Brown-3 12 26 26
L.Tansley 05/99 4712 Brown-2 12 30 28
#打印整个文件
zhuyupeng@zhuyupeng-PC ~
$ awk '{print $0}' grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
P.Bunny 02/99 48 Yellow 12 35 28
J.Troll 07/99 4842 Brown-3 12 26 26
L.Tansley 05/99 4712 Brown-2 12 30 28
#打印第一和第四个域
zhuyupeng@zhuyupeng-PC ~
$ awk '{print $1,$4}' grade.txt
M.Tansley Green
J.Lulu green
P.Bunny Yellow
J.Troll Brown-3
L.Tansley Brown-2
#打印表头
zhuyupeng@zhuyupeng-PC ~
$ awk 'BEGIN {print "Name Belt\n---------------------------"}
> {print $1"\t"$4}' grade.txt
Name Belt
---------------------------
M.Tansley Green
J.Lulu green
P.Bunny Yellow
J.Troll Brown-3
L.Tansley Brown-2
正则表达式相关:
为使一域号匹配正则表达式,使用符号‘~’后紧跟正则表达式,也可以用 if 语句。awk中if后面的条件用()括起来。
#下面代码打印$4 包含 Brown 的行
zhuyupeng@zhuyupeng-PC ~
$ awk '$4~/Brown/ {print $0}' grade.txt
J.Troll 07/99 4842 Brown-3 12 26 26
L.Tansley 05/99 4712 Brown-2 12 30 28
#非精确匹配
zhuyupeng@zhuyupeng-PC ~
$ awk '$3 ~/48/ {print $0}' grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
P.Bunny 02/99 48 Yellow 12 35 28
J.Troll 07/99 4842 Brown-3 12 26 26
#精确匹配
zhuyupeng@zhuyupeng-PC ~
$ awk '$3=="48" {print $0}' grade.txt
P.Bunny 02/99 48 Yellow 12 35 28
#不匹配 使用 ‘!~’
zhuyupeng@zhuyupeng-PC ~
$ awk '$0 !~ /Brown/' grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
P.Bunny 02/99 48 Yellow 12 35 28
zhuyupeng@zhuyupeng-PC ~
$ awk '$4 != "Brown-2" {print $0}' grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
P.Bunny 02/99 48 Yellow 12 35 28
J.Troll 07/99 4842 Brown-3 12 26 26
#小于(注意:双引号内的$1不会被awk展开,所以输出中原样出现“$1”)
zhuyupeng@zhuyupeng-PC ~
$ awk '$6 < $7 {print $0 "$1 Try better at the next comp"}' grade.txt
M.Tansley 05/99 48311 Green 8 40 44$1 Try better at the next comp
J.Lulu 06/99 48317 green 9 24 26$1 Try better at the next comp
#设置大小写
zhuyupeng@zhuyupeng-PC ~
$ awk '/[Gg]reen/' grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
#匹配第一个域的第四个字符是‘a’(‘^...’先匹配行首的任意三个字符)
zhuyupeng@zhuyupeng-PC ~
$ awk '$1 ~/^...a/' grade.txt
M.Tansley 05/99 48311 Green 8 40 44
L.Tansley 05/99 4712 Brown-2 12 30 28
#'或'匹配,使用 ‘|’ ,需使用括号括起来
zhuyupeng@zhuyupeng-PC ~
$ awk '$0 ~/(Yellow|Brown)/' grade.txt
P.Bunny 02/99 48 Yellow 12 35 28
J.Troll 07/99 4842 Brown-3 12 26 26
L.Tansley 05/99 4712 Brown-2 12 30 28
先来总结一下awk内置变量:
ARGC 命令行参数个数
ARGV 命令行参数数组
ENVIRON 存放系统环境变量的关联数组
FILENAME awk浏览文件名
FNR 浏览文件的记录数
FS 设置输入域分隔符,等价于命令行-F选项
NF 浏览记录的域个数
NR 已读的记录数
OFS 输出域分隔符
ORS 输出记录分隔符
RS 控制记录分隔符
zhuyupeng@zhuyupeng-PC ~
$ awk '{print NF,NR,$0} END {print FILENAME}' grade.txt
7 1 M.Tansley 05/99 48311 Green 8 40 44
7 2 J.Lulu 06/99 48317 green 9 24 26
7 3 P.Bunny 02/99 48 Yellow 12 35 28
7 4 J.Troll 07/99 4842 Brown-3 12 26 26
7 5 L.Tansley 05/99 4712 Brown-2 12 30 28
grade.txt
#使用 -F 参数指定分隔符
zhuyupeng@zhuyupeng-PC ~
$ echo $PWD
/home/zhuyupeng
zhuyupeng@zhuyupeng-PC ~
$ echo $PWD | awk -F/ '{print $NF"\t"NF}'
zhuyupeng 3
#设置变量名,将27 赋值给变量BASELINE
zhuyupeng@zhuyupeng-PC ~
$ awk 'BEGIN {BASELINE="27"} $6<BASELINE {print $0}' grade.txt
J.Lulu 06/99 48317 green 9 24 26
J.Troll 07/99 4842 Brown-3 12 26 26
#修改数值域取值,注意‘{}’
zhuyupeng@zhuyupeng-PC ~
$ awk '{if($1=="M.Tansley") $6=$6-1; print $1,$6,$7}' grade.txt
M.Tansley 39 44
J.Lulu 24 26
P.Bunny 35 28
J.Troll 26 26
L.Tansley 30 28
#修改文本域取值
zhuyupeng@zhuyupeng-PC ~
$ awk '{if($1=="J.Troll") $1="J.L.Troll"; print $1}' grade.txt
M.Tansley
J.Lulu
P.Bunny
J.L.Troll
L.Tansley
#创建新的输出域,这里新的输出域为 diff
zhuyupeng@zhuyupeng-PC ~
$ awk 'BEGIN {print "Name \t Difference"} {if($6<$7) {diff=$7-$6; print $1,diff}}' grade.txt
Name Difference
M.Tansley 4
J.Lulu 2
#统计某一个域的和,使用‘+=’ 下面的例子统计第六个域的和
zhuyupeng@zhuyupeng-PC ~
$ awk '(tot+=$6); END{print "Club student total points: " tot}' grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
P.Bunny 02/99 48 Yellow 12 35 28
J.Troll 07/99 4842 Brown-3 12 26 26
L.Tansley 05/99 4712 Brown-2 12 30 28
Club student total points: 155
#注意区别,加‘{}’则不打印文件
zhuyupeng@zhuyupeng-PC ~
$ awk '{(tot+=$6)}; END{print "Club student total points: " tot}' grade.txt
Club student total points: 155
awk 内置字符串函数
gsub(r,s) 在整个$0中用s替代r
gsub(r,s,t) 在整个t中使用s替代r
index(s,t) 返回s中字符串t第一次出现的位置
length(s) 返回s长度
match(s,r) 测试s是否包含匹配r的字符串
split(s,a,fs) 在fs上将s分成序列a
sprintf(fmt,exp) 返回经fmt格式化后的exp
sub(r,s) 用s替代$0中最左边最长的匹配r的子串
substr(s,p) 返回字符串s中从p开始的后缀部分
substr(s,p,n) 返回字符串s中从p开始长度为n的子串
#替换,目标串使用正则表达式格式‘//’
zhuyupeng@zhuyupeng-PC ~
$ awk 'gsub(/4842/,4899) {print $0}' grade.txt
J.Troll 07/99 4899 Brown-3 12 26 26
#查询字符串第一次出现的位置,注意使用BEGIN,否则每一行都会打印,字符串使用引号括起来
zhuyupeng@zhuyupeng-PC ~
$ awk 'BEGIN{print index("Bunny","ny")}' grade.txt
4
#长度
zhuyupeng@zhuyupeng-PC ~
$ awk '$1=="J.Troll" {print length($1)" "$1}' grade.txt
7 J.Troll
#match 使用: 找不到返回0,找到返回模式串在匹配串中的位置,
#注:单独使用 加BEGIN
zhuyupeng@zhuyupeng-PC ~
$ awk 'BEGIN {print match("ANCD",/d/)}'
0
#以下两种模式都正确
zhuyupeng@zhuyupeng-PC ~
$ awk '$1=="J.Lulu" {print match($1,"u")}' grade.txt
4
zhuyupeng@zhuyupeng-PC ~
$ awk '$1=="J.Lulu" {print match($1,/u/)}' grade.txt
4
#split 返回字符串数组元素个数
zhuyupeng@zhuyupeng-PC ~
$ awk 'BEGIN {print split("123#456#789",myarray,"#");print myarray[1],myarray[2],myarray[3]}'
3
123 456 789
#sub,发现并替换模式的第一个位置
zhuyupeng@zhuyupeng-PC ~
$ awk '$1=="J.Troll" {sub(26,29,$0)} {print $0}' grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
P.Bunny 02/99 48 Yellow 12 35 28
J.Troll 07/99 4842 Brown-3 12 29 26
L.Tansley 05/99 4712 Brown-2 12 30 28
#substr,返回字符串指定范围内的子串
zhuyupeng@zhuyupeng-PC ~
$ awk '$1=="L.Tansley" {print substr($1,1,5)}' grade.txt
L.Tan
#使用substr返回指定位置开始的后缀部分,范围只给了一个参数,注意和上一个例子相对比
zhuyupeng@zhuyupeng-PC ~
$ awk '{print substr($1,3)}' grade.txt
Tansley
Lulu
Bunny
Troll
Tansley
#从shell中向awk传递字符串,通过 echo 加管道的方式(注意:第二个例子中awk里的$STR并不是shell变量;STR在awk中是未初始化的变量,其值为0,因此$STR等价于$0)
zhuyupeng@zhuyupeng-PC ~
$ echo "Test" | awk '{print length($0)}'
4
zhuyupeng@zhuyupeng-PC ~
$ STR="mydoc.txt"
zhuyupeng@zhuyupeng-PC ~
$ echo $STR | awk '{print substr($STR,7)}'
txt
awk 使用printf
#printf使用类似于C语言
#字符转换
zhuyupeng@zhuyupeng-PC ~
$ echo "65" | awk '{printf "%c\n",$0}'
A
zhuyupeng@zhuyupeng-PC ~
$ echo "99" | awk '{printf "%f\n",$0}'
99.000000
#格式化输出
#打印名字,左对齐,使用‘-’
zhuyupeng@zhuyupeng-PC ~
$ awk '{printf "%-15s %s\n",$1,$3}' grade.txt
M.Tansley 48311
J.Lulu 48317
P.Bunny 48
J.Troll 4842
L.Tansley 4712
#向awk传入参数
zhuyupeng@zhuyupeng-PC ~
$ awk '{if ($5 < AGE) print $0}' AGE=10 grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
zhuyupeng@zhuyupeng-PC ~
$ df -k
文件系统 1K-块 已用 可用 已用% 挂载点
D:/Program Files/bin 76155900 70397660 5758240 93% /usr/bin
D:/Program Files/lib 76155900 70397660 5758240 93% /usr/lib
D:/Program Files 76155900 70397660 5758240 93% /
C: 40857596 32552996 8304600 80% /cygdrive/c
D: 76155900 70397660 5758240 93% /cygdrive/d
zhuyupeng@zhuyupeng-PC ~
$ df -k | awk '($4 ~/^[0-9]/) {if($4 > TRIGGER) print $6"\t"$4}' TRIGGER=80000
93% 70397660
93% 70397660
93% 70397660
/cygdrive/c 8304600
/cygdrive/d 5758240
#awk脚本
下面的脚本是将该命令翻译成为一个完整脚本的形式:awk '(tot+=$6); END{print "Club student total points: " tot}' grade.txt
#!/bin/awk -f
# stu_tot.awk
# to call: stu_tot.awk grade.txt
# Prints a two-line header, echoes the input records while summing field 6
# (points gained), then prints the total and the per-record average.

# print a header first
BEGIN {
    print "Student Date Member No. Grade Age Points Max"
    print "Name Joined Gained Point Available"
    print "==================================================================="
}

# let's add the scores of points gained
# This is a pattern with no action: the assignment's value is the pattern,
# so the record is printed whenever the running total is non-zero. It is
# kept in this form to mirror the one-liner it was translated from.
# NOTE: a record is not echoed while the running total is still 0.
(tot += $6)

# finished processing
END {
    print "Club student total points :" tot
    # Guard the average: with no input records NR is 0 and tot/NR would be
    # a fatal division by zero.
    print "Average Club Student points:" (NR > 0 ? tot / NR : 0)
}
#脚本运行是通过secureCRT 登陆远程的服务器运行的,控制台略有不同
[chen@localhost zyp]$ ./stu_tot.awk grade.txt
Student Date Member No. Grade Age Points Max
Name Joined Gained Point Available
===================================================================
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
P.Bunny 02/99 48 Yellow 12 35 28
J.Troll 07/99 4842 Brown-3 12 26 26
L.Tansley 05/99 4712 Brown-2 12 30 28
Club student total points :155
Average Club Student points:31
#一个文件中如果有相同的行连续出现就只打印一次
strip.awk:
#!/bin/awk -f
#strip.awk (this header originally named the script error_strip.awk)
#to call: strip.awk <filename>
#Collapses each run of consecutive "ERROR*" lines into a single "ERROR*"
#line; every other record is printed unchanged.
BEGIN{
#error_line holds the previously seen record; there is none yet
error_line=""
}
#Main rule: runs once for every input record.
{
#Skip this record when both it and the previous record are exactly "ERROR*"
if ($0 == "ERROR*" && error_line == "ERROR*")
next;
#Remember the record just seen, then print it
error_line = $0;
print
}
strip.txt:
INVALID LCSD 98GJ23
ERROR*
ERROR*
CAUTION LPSS ERROR ON ACC NO.
ERROR*
ERROR*
ERROR*
ERROR*
ERROR*
PASS FILED INVALID ON GHSI
ERROR*
CUTION LPSS ERROR ON ACC NO.
ERROR*
ERROR*
[chen@localhost zyp]$ ./strip.awk strip.txt
INVALID LCSD 98GJ23
ERROR*
CAUTION LPSS ERROR ON ACC NO.
ERROR*
PASS FILED INVALID ON GHSI
ERROR*
CUTION LPSS ERROR ON ACC NO.
ERROR*
#在awk中使用FS变量指定分隔符的时候,FS一定要放在BEGIN部分
#!/bin/awk -f
#passwd.awk
#to call :passwd.awk /etc/passwd
#Prints the first and fifth colon-separated fields of every record.
BEGIN{
#Set the input field separator before any record is read
FS=":"
}
#"\t" is passed as its own print argument, so the output field separator
#is emitted on both sides of the tab
{ print $1,"\t",$5} #field 1 is the account name, field 5 is the account owner
[chen@localhost zyp]$ ./passwd.awk /etc/passwd
root root
bin bin
daemon daemon
adm adm
lp lp
sync sync
shutdown shutdown
halt halt
mail mail
uucp uucp
operator operator
games games
gopher gopher
ftp FTP User
nobody Nobody
...
#向AWK脚本传递参数
age.awk:
#!/bin/awk -f
#name: age.awk
#to call : age.awk AGE=n grade.txt
#Prints every record whose fifth field is lower than the AGE value
#supplied on the command line.
{
#AGE is never assigned in this script; it comes from the AGE=n operand
if ( $5 < AGE )
print $0
}
grade.txt:(前面已经给出)
[chen@localhost zyp]$ ./age.awk AGE=10 grade.txt
M.Tansley 05/99 48311 Green 8 40 44
J.Lulu 06/99 48317 green 9 24 26
#awk 数组,awk数组是类似于一个键值对,既可以使用数字做下标,也可以使用字符串做下标
前面介绍过split函数,并使用了一个例子:
$awk 'BEGIN {print split("123#456#789",myarray,"#")}'
3
上面例子中,split返回数组myarray下标数,实际上myarray数组为:
myarray[1]="123"
myarray[2]="456"
myarray[3]="789"
数组使用前不必定义,也不必指定数组元素个数。经常使用循环来访问数组,一般这样使用循环:
for(element in array ) print array[element]
#下面脚本先将"123#456#789" 使用split拆分,再循环打印各个数组元素
#!/bin/awk -f
#name: arraytest.awk
#Splits a "#"-separated string into an array and prints each element.
BEGIN{
record="123#456#789";
#split() stores the pieces between the "#" separators in myarray[1], myarray[2], myarray[3]
split(record,myarray,"#")
}
#The printing happens in END, so awk must first finish reading its input
END{
#NOTE: awk does not guarantee the order in which "for (i in array)" visits the indices
for ( i in myarray )
{
print myarray[i]
}
}
#要运行脚本 需要使用/dev/null作为输入文件
[chen@localhost zyp]$ ./arraytest.awk /dev/null
123
456
789
grade_student.txt:
Yellow#Junior
Orange#Senior
Yellow#Junior
Purple#Junior
Brown-2#Junior
White#Senior
Orange#Senior
Red#Junior
Brown-2#Senior
Yellow#Senior
Red#Junior
Blue#Senior
Green#Senior
Purple#Junior
White#Junior
belts.awk:
#!/bin/awk -f
#name: belts.awk
#to call: belts.awk grade_student.txt
#Loops through a file of "belt#student-type" records and counts how many
#belts we have in (Yellow, Orange, Red).
#Also counts how many Senior and Junior students we have.
#
#start of BEGIN
#set FS and load the arrays with the keys we want to count
BEGIN{
FS="#"
#load the belt colours we are interested in only;
#merely referencing an element creates it, with an empty value
belt["Yellow"]
belt["Orange"]
belt["Red"]
#load the student type
student["Junior"]
student["Senior"]
#end of BEGIN
}
#loop thru array that holds the belt colours against field-1
#if we have a match,keep a running total
{ for (colour in belt)
{
if ($1==colour)
belt[colour]++
}
}
#loop thru array that holds the student type against
#field-2 if we have a match, keep a running total
{ for(senior_or_junior in student)
{
if($2 == senior_or_junior)
student[senior_or_junior]++
}
}
#finished processing so print out the matches..for each array
#NOTE: the order of "for (key in array)" is not guaranteed, and a key that
#never matched still holds its empty initial value, so its count prints as blank
END{ for(colour in belt)
print "The club has",belt[colour],colour,"Belts"
for(senior_or_junior in student)
print "The club has",student[senior_or_junior]\
, senior_or_junior, "students"
}
##
##
脚本的作用:
1.统计Yellow、Orange和Red级别的人各是多少
2.俱乐部中有多少成年(Senior)和未成年人(Junior)
#
[chen@localhost ~]$ ./belts.awk grade_student.txt
The club has 2 Red Belts
The club has 2 Orange Belts
The club has 3 Yellow Belts
The club has 7 Senior students
The club has 8 Junior students