1 简单解析版
1 需求:
去除日志中字段长度小于等于 11 的日志。
2 输入数据
194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
183.49.46.228 - - [18/Sep/2013:06:49:23 +0000] "-" 400 0 "-" "-"
163.177.71.12 - - [18/Sep/2013:06:49:33 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
163.177.71.12 - - [18/Sep/2013:06:49:36 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:49:42 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:49:45 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
60.208.6.156 - - [18/Sep/2013:06:49:48 +0000] "GET /wp-content/uploads/2013/07/rcassandra.png HTTP/1.0" 200 185524 "http://cos.name/category/software/packages/" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
222.68.172.190 - - [18/Sep/2013:06:49:57 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/A00n" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
222.68.172.190 - - [18/Sep/2013:06:50:08 +0000] "-" 400 0 "-" "-"
183.195.232.138 - - [18/Sep/2013:06:50:16 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
183.195.232.138 - - [18/Sep/2013:06:50:16 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
66.249.66.84 - - [18/Sep/2013:06:50:28 +0000] "GET /page/6/ HTTP/1.1" 200 27777 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
221.130.41.168 - - [18/Sep/2013:06:50:37 +0000] "GET /feed/ HTTP/1.1" 304 0 "-" "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36"
157.55.35.40 - - [18/Sep/2013:06:51:13 +0000] "GET /robots.txt HTTP/1.1" 200 150 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
50.116.27.194 - - [18/Sep/2013:06:51:35 +0000] "POST /wp-cron.php?doing_wp_cron=1379487095.2510800361633300781250 HTTP/1.0" 200 0 "-" "WordPress/3.6; http://blog.fens.me"
58.215.204.118 - - [18/Sep/2013:06:51:35 +0000] "GET /nodejs-socketio-chat/ HTTP/1.1" 200 10818 "http://www.google.com/url?sa=t&rct=j&q=nodejs%20%E5%BC%82%E6%AD%A5%E5%B9%BF%E6%92%AD&source=web&cd=1&cad=rja&ved=0CCgQFjAA&url=%68%74%74%70%3a%2f%2f%62%6c%6f%67%2e%66%65%6e%73%2e%6d%65%2f%6e%6f%64%65%6a%73%2d%73%6f%63%6b%65%74%69%6f%2d%63%68%61%74%2f&ei=rko5UrylAefOiAe7_IGQBw&usg=AFQjCNG6YWoZsJ_bSj8kTnMHcH51hYQkAA&bvm=bv.52288139,d.aGc" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:51:36 +0000] "GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1" 304 0 "http://blog.fens.me/nodejs-socketio-chat/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:51:35 +0000] "GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1" 304 0 "http://blog.fens.me/nodejs-socketio-chat/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:51:36 +0000] "GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1" 304 0 "http://blog.fens.me/nodejs-socketio-chat/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:51:36 +0000] "GET /wp-content/uploads/2013/08/chat.png HTTP/1.1" 200 48968 "http://blog.fens.me/nodejs-socketio-chat/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:51:36 +0000] "GET /wp-content/uploads/2013/08/chat2.png HTTP/1.1" 200 59852 "http://blog.fens.me/nodejs-socketio-chat/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:51:37 +0000] "GET /wp-content/uploads/2013/08/socketio.png HTTP/1.1" 200 80493 "http://blog.fens.me/nodejs-socketio-chat/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.248.178.212 - - [18/Sep/2013:06:51:37 +0000] "GET /nodejs-grunt-intro/ HTTP/1.1" 200 51770 "http://blog.fens.me/series-nodejs/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)"
58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] "GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1" 200 7200 "http://blog.fens.me/nodejs-grunt-intro/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)"
58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] "GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1" 200 786 "http://blog.fens.me/nodejs-grunt-intro/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)"
58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] "GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1" 200 45307 "http://blog.fens.me/nodejs-grunt-intro/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)"
58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] "GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1" 200 93128 "http://blog.fens.me/nodejs-grunt-intro/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)"
58.248.178.212 - - [18/Sep/2013:06:51:40 +0000] "GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1" 200 786 "http://blog.fens.me/nodejs-grunt-intro/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)"
58.215.204.118 - - [18/Sep/2013:06:51:41 +0000] "-" 400 0 "-" "-"
58.215.204.118 - - [18/Sep/2013:06:51:41 +0000] "-" 400 0 "-" "-"
58.215.204.118 - - [18/Sep/2013:06:51:41 +0000] "-" 400 0 "-" "-"
157.55.35.40 - - [18/Sep/2013:06:51:43 +0000] "GET /rhadoop-java-basic/ HTTP/1.1" 200 26780 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
58.248.178.212 - - [18/Sep/2013:06:51:43 +0000] "GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1" 200 7200 "http://blog.fens.me/nodejs-grunt-intro/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)"
58.248.178.212 - - [18/Sep/2013:06:51:45 +0000] "GET /wp-content/uploads/2013/08/grunt.png HTTP/1.1" 200 199040 "http://blog.fens.me/nodejs-grunt-intro/" "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; MDDR; InfoPath.2; .NET4.0C)"
58.215.204.118 - - [18/Sep/2013:06:52:26 +0000] "GET /nodejs-async/ HTTP/1.1" 200 12647 "http://blog.fens.me/nodejs-socketio-chat/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:52:27 +0000] "GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1" 304 0 "http://blog.fens.me/nodejs-async/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:52:27 +0000] "GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1" 304 0 "http://blog.fens.me/nodejs-async/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
58.215.204.118 - - [18/Sep/2013:06:52:27 +0000] "GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1" 304 0 "http://blog.fens.me/nodejs-async/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
163.177.71.12 - - [18/Sep/2013:06:52:29 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
58.215.204.118 - - [18/Sep/2013:06:52:29 +0000] "GET /nodejs-async/?cf_action=sync_comments&post_id=2357 HTTP/1.1" 200 48 "http://blog.fens.me/nodejs-async/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
163.177.71.12 - - [18/Sep/2013:06:52:32 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
58.215.204.118 - - [18/Sep/2013:06:52:32 +0000] "-" 400 0 "-" "-"
58.215.204.118 - - [18/Sep/2013:06:52:33 +0000] "-" 400 0 "-" "-"
58.215.204.118 - - [18/Sep/2013:06:52:33 +0000] "-" 400 0 "-" "-"
101.226.68.137 - - [18/Sep/2013:06:52:36 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:52:39 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
183.195.232.138 - - [18/Sep/2013:06:53:12 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
183.195.232.138 - - [18/Sep/2013:06:53:12 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
222.66.59.174 - - [18/Sep/2013:06:53:30 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/A00n" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0"
222.66.59.174 - - [18/Sep/2013:06:53:36 +0000] "-" 400 0 "-" "-"
216.24.201.254 - - [18/Sep/2013:06:53:57 +0000] "GET /feed/ HTTP/1.1" 200 33867 "-" "Mozilla/5.0 (Ubuntu; X11; Linux x86_64; rv:8.0) Gecko/20100101 Firefox/8.0"
58.215.204.118 - - [18/Sep/2013:06:54:48 +0000] "GET /r-json-rjson/ HTTP/1.1" 200 11500 "http://blog.fens.me/nodejs-async/" "Mozilla/5.0 (Windows NT 5.1; rv:23.0) Gecko/20100101 Firefox/23.0"
61.135.216.105 - - [18/Sep/2013:06:54:51 +0000] "GET /comments/feed/ HTTP/1.1" 304 0 "-" "Mozilla/5.0 (compatible;YoudaoFeedFetcher/1.0;http://www.youdao.com/help/reader/faq/topic006/;1 subscribers;)"
222.66.59.174 - - [18/Sep/2013:06:55:19 +0000] "-" 400 0 "-" "-"
163.177.71.12 - - [18/Sep/2013:06:55:24 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
163.177.71.12 - - [18/Sep/2013:06:55:27 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:55:30 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:55:33 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
222.66.59.174 - - [18/Sep/2013:06:55:51 +0000] "GET /images/my.jpg HTTP/1.1" 200 19939 "http://www.angularjs.cn/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)"
183.195.232.138 - - [18/Sep/2013:06:56:07 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
183.195.232.138 - - [18/Sep/2013:06:56:07 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /vps-ip-dns/ HTTP/1.1" 200 11403 "http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=6&cad=rja&ved=0CHIQFjAF&url=http%3A%2F%2Fblog.fens.me%2Fvps-ip-dns%2F&ei=j045UrP5AYX22AXsg4G4DQ&usg=AFQjCNGsJfLMNZnwWXNpTSUl6SOEzfF6tg&sig2=YY1oxEybUL7wx3IrVIMfHA&bvm=bv.52288139,d.b2I" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/style.css HTTP/1.1" 200 7554 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-includes/js/jquery/jquery.js?ver=1.10.2 HTTP/1.1" 200 32851 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.2.1 HTTP/1.1" 200 7200 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-includes/js/comment-reply.min.js?ver=3.6 HTTP/1.1" 200 786 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/js/jquery.cycle.all.min.js HTTP/1.1" 200 7784 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/js/load.js HTTP/1.1" 200 715 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/functions/css/shortcodes.css HTTP/1.1" 200 2899 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/functions/js/shortcode.js HTTP/1.1" 200 333 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/uploads/2013/06/%E5%8A%A8%E6%80%81IP%E8%A7%A3%E6%9E%90.jpg HTTP/1.1" 200 36779 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/uploads/2013/06/wtmart.png HTTP/1.1" 200 26105 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/uploads/2013/06/linux-dns.png HTTP/1.1" 200 14841 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/uploads/2013/06/win-dns.png HTTP/1.1" 200 30694 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/uploads/2013/06/win-ssh.png HTTP/1.1" 200 17524 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-includes/images/smilies/icon_smile.gif HTTP/1.1" 200 174 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/images/natty-logo.png HTTP/1.1" 200 1438 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/uploads/2013/05/favicon.ico HTTP/1.1" 200 1150 "-" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /js/baidu.js HTTP/1.1" 200 249 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /js/google.js HTTP/1.1" 200 475 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/images/slide-bg.png HTTP/1.1" 200 934 "http://blog.fens.me/wp-content/themes/silesia/style.css" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/images/sprites/post-type.png HTTP/1.1" 200 2009 "http://blog.fens.me/wp-content/themes/silesia/style.css" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:41 +0000] "GET /wp-content/themes/silesia/images/ico-twitter.png HTTP/1.1" 200 2128 "http://blog.fens.me/wp-content/themes/silesia/style.css" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:41 +0000] "GET /wp-content/themes/silesia/images/ico-meta.gif HTTP/1.1" 200 73 "http://blog.fens.me/wp-content/themes/silesia/style.css" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/images/crubms-div.png HTTP/1.1" 200 1255 "http://blog.fens.me/wp-content/themes/silesia/style.css" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/images/bullets/5.gif HTTP/1.1" 200 62 "http://blog.fens.me/wp-content/themes/silesia/style.css" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:40 +0000] "GET /wp-content/themes/silesia/images/home-ico.png HTTP/1.1" 200 1103 "http://blog.fens.me/wp-content/themes/silesia/style.css" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:42 +0000] "GET /vps-ip-dns/?cf_action=sync_comments&post_id=499 HTTP/1.1" 200 48 "http://blog.fens.me/vps-ip-dns/" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
71.96.108.116 - - [18/Sep/2013:06:56:46 +0000] "-" 400 0 "-" "-"
71.96.108.116 - - [18/Sep/2013:06:56:46 +0000] "-" 400 0 "-" "-"
3 实现代码:
1)编写 LogMapper
package com.da.weblog;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class LogMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
private Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper.Context context)
throws IOException, InterruptedException {
// 1 获取 1 行数据
String line = value.toString();
// 2 解析日志
boolean result = parseLog(line, context);
// 3 日志不合法退出
if (!result) {
return;
}
// 4 设置 key
k.set(line);
// 5 写出数据
context.write(k, NullWritable.get());
}
// 解析日志
private boolean parseLog(String line, Mapper.Context context) {
// 切割
String[] fields = line.split(" ");
if (fields.length > 11) {
// 系统计数器
context.getCounter("LogMapper", "parseLog_true").increment(1);
return true;
} else {
context.getCounter("LogMapper", "parseLog_false").increment(1);
return false;
}
}
}
2)编写 LogDriver
package com.da.weblog;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LogDriver {
public static void main(String[] args) throws Exception {
args = new String[] { "e:/mrinput", "e:/mrout" };
// 1 获取 job 信息
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 2 加载 jar 包
job.setJarByClass(LogDriver.class);
// 3 关联 map
job.setMapperClass(LogMapper.class);
// 4 设置最终输出类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// 设置 reducetask 个数为 0
job.setNumReduceTasks(0);
// 5 设置输入和输出路径
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// 6 提交
job.waitForCompletion(true);
}
}
2 复杂解析版
1 需求:
对 web 访问日志中的各字段识别切分
去除日志中不合法的记录
根据统计需求,生成各类访问请求过滤数据
2 输入数据:
同上
3 实现代码:
1) 定义一个 bean,用来记录日志数据中的各数据字段
package com.da.weblog2;
public class LogBean {
private String remote_addr;// 记录客户端的 ip 地址
private String remote_user;// 记录客户端用户名称,忽略属性"-"
private String time_local;// 记录访问时间与时区
private String request;// 记录请求的 url 与 http 协议
private String status;// 记录请求状态;成功是 200
private String body_bytes_sent;// 记录发送给客户端文件主体内容大小
private String http_referer;// 用来记录从那个页面链接访问过来的
private String http_user_agent;// 记录客户浏览器的相关信息
private boolean valid = true;// 判断数据是否合法
public String getRemote_addr() {
return remote_addr;
}
public void setRemote_addr(String remote_addr) {
this.remote_addr = remote_addr;
}
public String getRemote_user() {
return remote_user;
}
public void setRemote_user(String remote_user) {
this.remote_user = remote_user;
}
public String getTime_local() {
return time_local;
}
public void setTime_local(String time_local) {
this.time_local = time_local;
}
public String getRequest() {
return request;
}
public void setRequest(String request) {
this.request = request;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getBody_bytes_sent() {
return body_bytes_sent;
}
public void setBody_bytes_sent(String body_bytes_sent) {
this.body_bytes_sent = body_bytes_sent;
}
public String getHttp_referer() {
return http_referer;
}
public void setHttp_referer(String http_referer) {
this.http_referer = http_referer;
}
public String getHttp_user_agent() {
return http_user_agent;
}
public void setHttp_user_agent(String http_user_agent) {
this.http_user_agent = http_user_agent;
}
public boolean isValid() {
return valid;
}
public void setValid(boolean valid) {
this.valid = valid;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(this.valid);
sb.append("\001").append(this.remote_addr);
sb.append("\001").append(this.remote_user);
sb.append("\001").append(this.time_local);
sb.append("\001").append(this.request);
sb.append("\001").append(this.status);
sb.append("\001").append(this.body_bytes_sent);
sb.append("\001").append(this.http_referer);
sb.append("\001").append(this.http_user_agent);
return sb.toString();
}
}
2) 编写 LogMapper 程序
package com.da.weblog2;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class LogMapper extends Mapper {
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// 1 获取 1 行
String line = value.toString();
// 2 解析日志是否合法
LogBean bean = pressLog(line);
if (!bean.isValid()) {
return;
}
k.set(bean.toString());
// 3 输出
context.write(k, NullWritable.get());
}
// 解析日志
private LogBean pressLog(String line) {
LogBean logBean = new LogBean();
// 1 截取
String[] fields = line.split(" ");
if (fields.length > 11) {
// 2 封装数据
logBean.setRemote_addr(fields[0]);
logBean.setRemote_user(fields[1]);
logBean.setTime_local(fields[3].substring(1));
logBean.setRequest(fields[6]);
logBean.setStatus(fields[8]);
logBean.setBody_bytes_sent(fields[9]);
logBean.setHttp_referer(fields[10]);
if (fields.length > 12) {
logBean.setHttp_user_agent(fields[11] + " " + fields[12]);
} else {
logBean.setHttp_user_agent(fields[11]);
}
// 大于 400, HTTP 错误
if (Integer.parseInt(logBean.getStatus()) >= 400) {
logBean.setValid(false);
}
} else {
logBean.setValid(false);
}
return logBean;
}
}
3)编写 LogDriver 程序
package com.da.weblog2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LogDriver {
public static void main(String[] args) throws Exception {
args = new String[] { "e:/mrinput", "e:/mrout" };
// 1 获取 job 信息
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 2 加载 jar 包
job.setJarByClass(LogDriver.class);
// 3 关联 map
job.setMapperClass(LogMapper.class);
// 4 设置最终输出类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// 5 设置输入和输出路径
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// 6 提交
job.waitForCompletion(true);
}
}