采用 Spark RDD 清洗 Apache 访问日志(Java 版)

日志格式:

8.35.201.164 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/pn.png HTTP/1.1" 200 592
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=56212&size=middle HTTP/1.1" 301 -
27.19.74.143 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/uploadbutton_small.png HTTP/1.1" 200 690
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/fastreply.gif HTTP/1.1" 200 608
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=21212&size=middle HTTP/1.1" 301 -
8.35.201.144 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=28823&size=middle HTTP/1.1" 301 -
8.35.201.161 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/taobao.gif HTTP/1.1" 200 1021
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/data/avatar/000/02/93/31_avatar_middle.jpg HTTP/1.1" 200 6519
8.35.201.163 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/security.png HTTP/1.1" 200 2203
8.35.201.165 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/avatar.php?uid=36174&size=middle HTTP/1.1" 301 -
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /static/image/common/pn_post.png HTTP/1.1" 200 3309
8.35.201.164 - - [30/May/2013:17:38:22 +0800] "GET /uc_server/data/avatar/000/05/72/32_avatar_middle.jpg HTTP/1.1" 200 5333
8.35.201.144 - - [30/May/2013:17:38:22 +0800] "GET /static/image/common/icon_quote_e.gif HTTP/1.1" 200 287
8.35.201.161 - - [30/May/2013:17:38:22 +0800] "GET /uc_server/avatar.php?uid=27067&size=small HTTP/1.1" 301 -
8.35.201.160 - - [30/May/2013:17:38:21 +0800] "GET /uc_server/data/avatar/000/05/36/35_avatar_middle.jpg HTTP/1.1" 200 10087
8.35.201.165 - - [30/May/2013:17:38:22 +0800] "GET /data/attachment/common/c5/common_13_usergroup_icon.jpg HTTP/1.1" 200 3462
8.35.201.160 - - [30/May/2013:17:38:22 +0800] "GET /static/image/magic/bump.small.gif HTTP/1.1" 200 1052
8.35.201.165 - - [30/May/2013:17:38:22 +0800] "GET /static/image/common/arw.gif HTTP/1.1" 200 940

 

相关解析代码如下:

package com.nyist.hdl.controller;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

import com.nyist.hdl.util.ParselogsUtil;

import scala.Tuple3;

/**
 * 清洗tomcat日志
 * 
 * @author zhangchenguang
 *
 */
public class CleanLog {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("Spark_public");
				//.setMaster("local");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// List data = Arrays.asList("hello","hello world","hello
		// you","hello me","you and me");
		// JavaRDD distData = sc.parallelize(data);
		JavaRDD distData = sc.textFile("file:///Users/zhangchenguang/Desktop/log.txt");

		JavaRDD> result = distData.map(new Function>() {

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;
			
			@Override
			public Tuple3 call(String value) throws Exception {
				
				String[] sub = ParselogsUtil.parseString(value);
				if (sub[2].startsWith("GET /static") || sub[2].startsWith("GET /uc_server"))
					return null;// 对于静态的记录直接过滤掉,不进行任何处理
				
				if (sub[2].startsWith("GET /")) {
					sub[2] = sub[2].substring("GET /".length());
				}
				if (sub[2].startsWith("POST /")) {
					sub[2] = sub[2].substring("POST /".length());
				} // 过滤掉了开头和结尾的标志信息
				if (sub[2].endsWith(" HTTP/1.1")) {
					sub[2] = sub[2].substring(0, sub[2].length() - " HTTP/1.1".length());
				}
				if (sub[2].endsWith(" HTTP/1.0")) {
					sub[2] = sub[2].substring(0, sub[2].length() - " HTTP/1.0".length());
				}
				
				Tuple3 resTuple = new Tuple3(sub[0], sub[1],sub[2]);
				System.out.println("解析后:==> "+resTuple);
				return resTuple;
			}
		});

		result.foreach(new VoidFunction>() {
			
			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			@Override
			public void call(Tuple3 t) throws Exception {
				if(t != null){
					System.out.println("result: "+t._1()+"=="+t._2()+"=="+t._3());
				}
			}
		});
		
		result.saveAsTextFile("hdfs://localhost:9000/log");
		
		sc.close();
	}

}

 

 

你可能感兴趣的:(Spark,大数据,spark)