Implementing Spark with Java (1)

Simple Spark examples implemented in Java.

The data is simple: a pre-processed log file containing a timestamp, a phone number, and the upstream and downstream traffic. Each line is one record, and the fields are separated by tab characters.
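For illustration only, here is a hypothetical record in the column order assumed by the parsing code below (timestamp, phone number, downstream traffic, upstream traffic); the real log data is not reproduced here:

1480686565264	13760778710	2481	24681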

Sample class

The sample class wraps a single record of the log file. It implements Serializable so that Spark can serialize instances when they are shuffled or sent between nodes.

package com.icesun.java.accessLog;

import java.io.Serializable;

public class LogInfo implements Serializable {
    private static final long serialVersionUID = 5749943279909593929L;
    private long timeStamp;
    private String phoneNo;
    private long down;
    private long up;


    LogInfo(){}
    LogInfo(long timeStamp, String phoneNo, long down, long up){
        this.timeStamp = timeStamp;
        this.phoneNo = phoneNo;
        this.down = down;
        this.up = up;
    }

    public long getTimeStamp() {
        return timeStamp;
    }

    public String getPhoneNo() {
        return phoneNo;
    }

    public long getDown() {
        return down;
    }

    public long getUp() {
        return up;
    }
}

Spark Core API

package com.icesun.java.accessLog;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

public class LogSpark {
    public static void main(String [] args){
        SparkConf conf = new SparkConf().setMaster("local").setAppName("AccessLog");
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.setLogLevel("WARN");

        String path = "files/access.log";
        JavaRDD<String> lines = sc.textFile(path);

        JavaPairRDD<String,LogInfo> logPairRDD = RDD2RDDPair(lines);
        JavaPairRDD<String,LogInfo>  reduceByKey = aggregateByDeviceID(logPairRDD);

        reduceByKey.foreach(x -> System.out.println(x._2.getDown()));
        System.out.println(reduceByKey.count());

        sc.close();
    }

    // Convert the String RDD into a pair RDD with the phone number as key and LogInfo as value
    private static JavaPairRDD<String, LogInfo> RDD2RDDPair(JavaRDD<String> accessLogRDD){
        return accessLogRDD.mapToPair((PairFunction<String, String, LogInfo>) line -> {
            String[] logInfo = line.split("\t");
            long timeStamp = Long.valueOf(logInfo[0]);
            String phone = logInfo[1];
            long down = Long.valueOf(logInfo[2]);
            long up = Long.valueOf(logInfo[3]);

            LogInfo log = new LogInfo(timeStamp, phone, down, up);
            return new Tuple2<String, LogInfo>(phone, log);
        });
    }

    // reduceByKey with the phone number as key, summing the upstream and downstream traffic
    private static JavaPairRDD<String, LogInfo> aggregateByDeviceID(JavaPairRDD<String, LogInfo> pairRDD){
        return pairRDD.reduceByKey((Function2<LogInfo, LogInfo, LogInfo>)(v1, v2) -> {

                // Keep the earlier of the two timestamps
                long timeStamp = v1.getTimeStamp() < v2.getTimeStamp() ? v1.getTimeStamp(): v2.getTimeStamp();
                // Add the upstream and downstream traffic
                long up = v1.getUp() + v2.getUp();
                long down = v1.getDown() + v2.getDown();
                String phone = v1.getPhoneNo();
                return new LogInfo(timeStamp, phone, down, up);
            }
        );
    }
}
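As a small follow-up (not part of the original program), the aggregated pairs returned by aggregateByDeviceID could be mapped once more to get the combined up + down traffic per phone. The sketch below assumes it is placed right after the reduceByKey line in main and reuses the imports already shown:

        // Hypothetical addition: map each aggregated LogInfo to its combined up + down traffic
        JavaPairRDD<String, Long> totalByPhone = reduceByKey.mapToPair(
                (PairFunction<Tuple2<String, LogInfo>, String, Long>) t ->
                        new Tuple2<>(t._1, t._2.getUp() + t._2.getDown()));
        totalByPhone.foreach(t -> System.out.println(t._1 + "\t" + t._2));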

SparkSQL

package com.icesun.java.accessLog;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class LogSparkSQL {
    public static void main(String[] args){
        SparkConf conf = new SparkConf().setAppName("SparkSQL").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

//        HiveConf hiveConf = new HiveConf(sc);
        SQLContext sqlContext = new SQLContext(sc);

        JavaRDD<String> lines = sc.textFile("files/access.log");

        // Convert each line of text into a LogInfo object
        JavaRDD<LogInfo> logInfo = lines.map( line ->{
            String[] str = line.split("\t");
            long timeStamp = Long.valueOf(str[0]);
            String phone = str[1];
            long down = Long.valueOf(str[2]);
            long up = Long.valueOf(str[3]);
            LogInfo log = new LogInfo(timeStamp, phone, down, up);
            return log;
        });

        // Convert the RDD into a Dataset<Row> (DataFrame)
        Dataset<Row>  df = sqlContext.createDataFrame(logInfo, LogInfo.class);
        // Query the Dataset through the DataFrame API: filter first, then select the columns,
        // so the filter can still reference the "up" column
        df.where("up > 50000").select("phoneNo", "down").show();
        
        // Register df as a temporary view so it can be queried with SQL expressions
        df.createOrReplaceTempView("log");
        Dataset<Row> seleRs = sqlContext.sql("select * from log where up > 50000 and down < 10000");
        seleRs.toJavaRDD().foreach(row -> System.out.println(row.get(1)));
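        // A follow-up sketch (not in the original): the same temp view could also be
        // used for a per-phone aggregation, e.g. summing both traffic columns.
        Dataset<Row> totals = sqlContext.sql(
                "select phoneNo, sum(up) as totalUp, sum(down) as totalDown from log group by phoneNo");
        totals.show();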
        


        sc.close();
    }
}
