import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import scala.Tuple2;
public class SparkDemo {
static final String USER = "huanglong";
public static void main(String[] args) throws Exception {
SparkConf conf =new SparkConf().setAppName("Spark Log").setMaster("local[4]");
JavaSparkContext sc = new JavaSparkContext(conf); //其底层就是scala的sparkcontext
//JavaSparkContext sc = new JavaSparkContext("local[4]", "Spark Log", "/user/huanglong/spark", new String[0], envs);
String file="file:///Users/huanglong/access_05_30.log";
JavaRDD data = sc.textFile(file, 4).cache();
//Log line format:
//27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127;
JavaRDD filter = data.filter(new org.apache.spark.api.java.function.Function() {
@Override
public Boolean call(String s) throws Exception {
//Keep valid request lines (GET/POST) and drop static-resource entries
return (s.contains("POST")||s.contains("GET"))&& !s.contains("/static/");
}
});
final SimpleDateFormat tf = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z",
Locale.ENGLISH);
//Records cover a single day, so the date part is unnecessary; keep minute-level precision only
final SimpleDateFormat tf1 = new SimpleDateFormat("HH:mm");
JavaRDD