大数据IMF传奇行动绝密课程第104-114课:Spark Streaming电商广告点击综合案例

Spark Streaming电商广告点击综合案例

需求分析和技术架构

广告点击系统实时分析
广告来自于网站或者移动App等,广告需要设定在具体的广告位,当用户点击广告的时候,一般都会通过ajax或Socket往后台发送日志数据,在这里我们是要基于Spark Streaming做实时在线统计。那么数据就需要放进消息系统(Kafka)中,我们的Spark Streaming应用程序就会去Kafka中Pull数据过来进行计算和消费,并把计算后的数据放入到持久化系统中(MySQL)。
广告点击系统实时分析的意义:因为可以在线实时的看见广告的投放效果,就为广告的更大规模的投入和调整打下了坚实的基础,从而为公司带来最大化的经济回报。
核心需求:
1、实时黑名单动态过滤出有效的用户广告点击行为:因为黑名单用户可能随时出现,所以需要动态更新;
2、在线计算广告点击流量;
3、Top3热门广告;
4、每个广告流量趋势;
5、广告点击用户的区域分布分析
6、最近一分钟的广告点击量;
7、整个广告点击Spark Streaming处理程序7*24小时运行;

数据格式:
时间、用户、广告、城市等

技术细节:
在线计算用户点击的次数分析,屏蔽IP等;
使用updateStateByKey或者mapWithState进行不同地区广告点击排名的计算;
Spark Streaming+Spark SQL+Spark Core等综合分析数据;
使用Window类型的操作;
高可用和性能调优等等;
流量趋势,一般会结合DB等;
Spark Core

大数据IMF传奇行动绝密课程第104-114课:Spark Streaming电商广告点击综合案例_第1张图片

/**
 * 
 */
package com.tom.spark.SparkApps.sparkstreaming;

import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Random;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

/**
 * Mock data generator: a Kafka producer that publishes simulated ad-click events.
 *
 * <p>Each event is a single tab-separated line with the fields
 * {@code timestamp, ip, userID, adID, province, city}.
 */
public class MockAdClickedStat {

    /**
     * Kafka topic the simulated click events are published to.
     * NOTE(review): the consumer in AdClickedStreamingStats subscribes to
     * "SparkStreamingDirected" - confirm which topic name is intended.
     */
    private static final String TOPIC = "AdClicked";

    /** Pause between two consecutive events, in milliseconds. */
    private static final long SEND_INTERVAL_MS = 50L;

    /**
     * Starts a background thread that publishes one random click event every
     * {@code SEND_INTERVAL_MS} milliseconds until that thread is interrupted.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final Random random = new Random();
        final String[] provinces = new String[]{"Guangdong", "Zhejiang", "Jiangsu", "Fujian"};
        final Map<String, String[]> cities = new HashMap<String, String[]>();
        cities.put("Guangdong", new String[]{"Guangzhou", "Shenzhen", "Dongguan"});
        cities.put("Zhejiang", new String[]{"Hangzhou", "Wenzhou", "Ningbo"});
        cities.put("Jiangsu", new String[]{"Nanjing", "Suzhou", "Wuxi"});
        cities.put("Fujian", new String[]{"Fuzhou", "Xiamen", "Sanming"});

        final String[] ips = new String[] {
                "192.168.112.240",
                "192.168.112.239",
                "192.168.112.245",
                "192.168.112.246",
                "192.168.112.247",
                "192.168.112.248",
                "192.168.112.249",
                "192.168.112.250",
                "192.168.112.251",
                "192.168.112.252",
                "192.168.112.253",
                "192.168.112.254",
        };

        // Basic Kafka producer configuration.
        Properties kafkaConf = new Properties();
        kafkaConf.put("serializer.class", "kafka.serializer.StringEncoder");
        // The key must be spelled "metadata.broker.list"; a misspelled key leaves
        // the producer without any broker to bootstrap from.
        kafkaConf.put("metadata.broker.list", "Master:9092,Worker1:9092,Worker2:9092");
        ProducerConfig producerConfig = new ProducerConfig(kafkaConf);
        final Producer<String, String> producer = new Producer<String, String>(producerConfig);

        new Thread(new Runnable() {

            public void run() {
                try {
                    while (!Thread.currentThread().isInterrupted()) {
                        // Record layout: timestamp, ip, userID, adID, province, city
                        long timestamp = System.currentTimeMillis();
                        // A public IP database could be used instead of this fixed list.
                        String ip = ips[random.nextInt(ips.length)];
                        int userID = random.nextInt(10000);
                        int adID = random.nextInt(100);
                        String province = provinces[random.nextInt(provinces.length)];
                        String[] provinceCities = cities.get(province);
                        String city = provinceCities[random.nextInt(provinceCities.length)];
                        String clickedAd = timestamp + "\t" + ip + "\t" + userID + "\t" + adID
                                + "\t" + province + "\t" + city;
                        producer.send(new KeyedMessage<String, String>(TOPIC, clickedAd));

                        Thread.sleep(SEND_INTERVAL_MS);
                    }
                } catch (InterruptedException e) {
                    // Restore the interrupt flag and stop producing instead of
                    // swallowing the interruption and looping forever.
                    Thread.currentThread().interrupt();
                } finally {
                    producer.close();
                }
            }
        }).start();
    }

}
package com.tom.spark.SparkApps.sparkstreaming;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;

import kafka.serializer.StringDecoder;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaStreamingContextFactory;
import org.apache.spark.streaming.kafka.KafkaUtils;

import com.google.common.base.Optional;

import scala.Tuple2;

/**
  * 数据处理,Kafka消费者
  */
public class AdClickedStreamingStats {

    /**
     * @param args
     */
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        //好处:1、checkpoint 2、工厂
        final SparkConf conf = new SparkConf().setAppName("SparkStreamingOnKafkaDirect").setMaster("hdfs://Master:7077/");
        final String checkpointDirectory = "hdfs://Master:9000/library/SparkStreaming/CheckPoint_Data";

        JavaStreamingContextFactory factory = new JavaStreamingContextFactory() {

            public JavaStreamingContext create() {
                // TODO Auto-generated method stub
                return createContext(checkpointDirectory, conf);
            }

        };

        /**
         * 可以从失败中恢复Driver,不过还需要指定Driver这个进程运行在Cluster,并且在提交应用程序的时候指定--supervise;
         */
        JavaStreamingContext javassc = JavaStreamingContext.getOrCreate(checkpointDirectory, factory);
        /**
         * 第三步:创建Spark Streaming输入数据来源input Stream:
         * 1、数据输入来源可以基于File、HDFS、Flume、Kafka、Socket等
         * 2、在这里我们指定数据来源于Kafka,Spark Streaming通过Direct方式直接从Kafka的Broker中读取指定Topic的数据
         *      (当然Kafka集群和该Topic首先必须存在),并且在后续会根据业务需要不断有数据产生(当然对于Spark Streaming
         *      应用程序的运行而言,有无数据其处理流程都是一样的)
         * 3、如果经常在每间隔5秒钟没有数据的话不断启动空的Job其实会造成调度资源的浪费,因为并没有数据需要发生计算;所以
         *      实际的企业级生产环境的代码在具体提交Job前会判断是否有数据,如果没有的话就不再提交Job;
         */

        //创建Kafka元数据来让Spark Streaming这个Kafka Consumer利用

        Map kafkaParameters = new HashMap();
        kafkaParameters.put("metadata.broker.list", "Master:9092,Worker1:9092,Worker2:9092");
        Set topics = new HashSet();
        topics.add("SparkStreamingDirected");

        JavaPairInputDStream adClickedStreaming = KafkaUtils.createDirectStream(javassc, 
                String.class, String.class, 
                StringDecoder.class, StringDecoder.class,
                kafkaParameters, 
                topics);
        /**因为要对黑名单进行过滤,而数据是在RDD中的,所以必然使用transform这个函数;
         * 但是在这里我们必须使用transformToPair,原因是读取进来的Kafka的数据是Pair类型,
         * 另一个原因是过滤后的数据要进行进一步处理,所以必须是读进的Kafka数据的原始类型
         * 
         * 在此再次说明,每个Batch Duration中实际上讲输入的数据就是被一个且仅被一个RDD封装的,你可以有多个
         * InputDStream,但其实在产生job的时候,这些不同的InputDStream在Batch Duration中就相当于Spark基于HDFS
         * 数据操作的不同文件来源而已罢了。
         */

        JavaPairDStream filteredadClickedStreaming = adClickedStreaming.transformToPair(new Function, JavaPairRDD>() {

            public JavaPairRDD call(
                    JavaPairRDD rdd) throws Exception {
                /**
                 * 在线黑名单过滤思路步骤:
                 * 1、从数据库中获取黑名单转换成RDD,即新的RDD实例封装黑名单数据;
                 * 2、然后把代表黑名单的RDD的实例和Batch Duration产生的RDD进行Join操作,
                 * 准确的说是进行leftOuterJoin操作,也就是说使用Batch Duration产生的RDD和代表黑名单的RDD实例进行
                 * leftOuterJoin操作,如果两者都有内容的话,就会是true,否则的话就是false
                 * 
                 * 我们要留下的是leftOuterJoin结果为false;
                 * 
                 */
                final List blackListNames = new ArrayList();
                JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();
                jdbcWrapper.doQuery("SELECT * FROM blacklisttable", null, new ExecuteCallBack() {

                    public void resultCallBack(ResultSet result) throws Exception {

                        while(result.next()){
                            blackListNames.add(result.getString(1));
                        }
                    }
                });
                List> blackListTuple = new ArrayList>();
                for(String name : blackListNames) {
                    blackListTuple.add(new Tuple2(name, true));
                }
                List> blacklistFromListDB = blackListTuple; //数据来自于查询的黑名单表并且映射成为

                JavaSparkContext jsc = new JavaSparkContext(rdd.context());

                /**
                 * 黑名单的表中只有userID,但是如果要进行join操作的话就必须是Key-Value,所以在这里我们需要
                 * 基于数据表中的数据产生Key-Value类型的数据集合
                 */

                JavaPairRDD blackListRDD = jsc.parallelizePairs(blacklistFromListDB);

                /**
                 * 进行操作的时候肯定是基于userID进行join,所以必须把传入的rdd进行mapToPair操作转化成为符合格式的RDD
                 *
                 */
                JavaPairRDD> rdd2Pair = rdd.mapToPair(new PairFunction, String, Tuple2>() {

                    public Tuple2> call(
                            Tuple2 t) throws Exception {
                        // TODO Auto-generated method stub
                        String userID = t._2.split("\t")[2];

                        return new Tuple2>(userID, t);
                    }
                });
                JavaPairRDD, Optional>> joined = rdd2Pair.leftOuterJoin(blackListRDD);

                JavaPairRDD result = joined.filter(new Function,Optional>>, Boolean>() {

                    public Boolean call(
                            Tuple2, Optional>> tuple)
                            throws Exception {
                        // TODO Auto-generated method stub
                        Optional optional = tuple._2._2;
                        if(optional.isPresent() && optional.get()){
                            return false;
                        } else {
                            return true;
                        }
                    }
                }).mapToPair(new PairFunction,Optional>>, String, String>() {

                    public Tuple2 call(
                            Tuple2, Optional>> t)
                            throws Exception {
                        // TODO Auto-generated method stub
                        return t._2._1;
                    }
                });
                return result;
            }
        });
        //广告点击的基本数据格式:timestamp、ip、userID、adID、province、city
        JavaPairDStream pairs = filteredadClickedStreaming.mapToPair(new PairFunction, String, Long>() {

            public Tuple2 call(Tuple2 t) throws Exception {
                 String[] splited=t._2.split("\t");
                 String timestamp = splited[0]; //YYYY-MM-DD
                 String ip = splited[1];
                 String userID = splited[2];
                 String adID = splited[3];
                 String province = splited[4];
                 String city = splited[5];      

                 String clickedRecord = timestamp + "_" +ip + "_"+userID+"_"+adID+"_"
                             +province +"_"+city;
                 return new Tuple2(clickedRecord, 1L);
            }
        });
     /**
     * 第4.3步:在单词实例计数为1基础上,统计每个单词在文件中出现的总次数
     */
        JavaPairDStream adClickedUsers= pairs.reduceByKey(new Function2() {
              public Long call(Long i1, Long i2) throws Exception{
                return i1 + i2;
              }
            });
        /*判断有效的点击,复杂化的采用机器学习训练模型进行在线过滤    简单的根据ip判断1天不超过100次;也可以通过一个batch duration的点击次数
            判断是否非法广告点击,通过一个batch来判断是不完整的,还需要一天的数据也可以每一个小时来判断。*/
        JavaPairDStream filterClickedBatch = adClickedUsers.filter(new Function, Boolean>() {

            public Boolean call(Tuple2 v1) throws Exception {
            if (1 < v1._2){
                //更新一些黑名单的数据库表
                return false;
            } else { 
                return true;
                    }

            }
        });

        //filterClickedBatch.print();
        //写入数据库
        filterClickedBatch.foreachRDD(new Function, Void>() {

            public Void call(JavaPairRDD rdd) throws Exception {
                 rdd.foreachPartition(new VoidFunction>>() {

                    public void call(Iterator> partition) throws Exception {
                        //使用数据库连接池的高效读写数据库的方式将数据写入数据库mysql
                        //例如一次插入 1000条 records,使用insertBatch 或 updateBatch
                        //插入的用户数据信息:userID,adID,clickedCount,time
                        //这里面有一个问题,可能出现两条记录的key是一样的,此时需要更新累加操作
                        List userAdClickedList = new ArrayList();
                        while(partition.hasNext()) {
                            Tuple2 record = partition.next();
                            String[] splited = record._1.split("\t");

                            UserAdClicked userClicked = new UserAdClicked();
                            userClicked.setTimestamp(splited[0]);
                            userClicked.setIp(splited[1]);
                            userClicked.setUserID(splited[2]);
                            userClicked.setAdID(splited[3]);
                            userClicked.setProvince(splited[4]);
                            userClicked.setCity(splited[5]);

                            userAdClickedList.add(userClicked);
                        }

                        final List inserting = new ArrayList();
                        final List updating = new ArrayList();

                        JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();

                        //表的字段timestamp、ip、userID、adID、province、city、clickedCount
                        for(final UserAdClicked clicked : userAdClickedList) {
                            jdbcWrapper.doQuery("SELECT clickedCount FROM adclicked WHERE"
                                    + " timestamp =? AND userID = ? AND adID = ?",
                                    new Object[]{clicked.getTimestamp(), clicked.getUserID(),
                                            clicked.getAdID()}, new ExecuteCallBack() {

                                                public void resultCallBack(ResultSet result) throws Exception {
                                                    // TODO Auto-generated method stub
                                                    if(result.next()) {
                                                        long count = result.getLong(1);
                                                        clicked.setClickedCount(count);
                                                        updating.add(clicked);
                                                    } else {
                                                        inserting.add(clicked);
                                                        clicked.setClickedCount(1L);
                                                    }
                                                }
                                            });
                        }
                        //表的字段timestamp、ip、userID、adID、province、city、clickedCount
                        List insertParametersList = new ArrayList();
                        for(UserAdClicked insertRecord : inserting) {
                            insertParametersList.add(new Object[] {
                                insertRecord.getTimestamp(),
                                insertRecord.getIp(),
                                insertRecord.getUserID(),
                                insertRecord.getAdID(),
                                insertRecord.getProvince(),
                                insertRecord.getCity(),
                                insertRecord.getClickedCount()
                            });
                        }
                        jdbcWrapper.doBatch("INSERT INTO adclicked VALUES(?, ?, ?, ?, ?, ?, ?)", insertParametersList);

                        //表的字段timestamp、ip、userID、adID、province、city、clickedCount
                        List updateParametersList = new ArrayList();
                        for(UserAdClicked updateRecord : updating) {
                            updateParametersList.add(new Object[] {
                                updateRecord.getTimestamp(),
                                updateRecord.getIp(),
                                updateRecord.getUserID(),
                                updateRecord.getAdID(),
                                updateRecord.getProvince(),
                                updateRecord.getCity(),
                                updateRecord.getClickedCount() + 1
                            });
                        }
                        jdbcWrapper.doBatch("UPDATE adclicked SET clickedCount =  ? WHERE"
                                    + " timestamp =? AND ip = ? AND userID = ? AND adID = ? "
                                    + "AND province = ? AND city = ?", updateParametersList);

                    }
                });
                return null;
            }
        });
     //再次过滤,从数据库中读取数据过滤黑名单
    JavaPairDStream  blackListBasedOnHistory = filterClickedBatch.filter(new Function, Boolean>() {

        /**
         * Keeps a click record only when its user has clicked the same ad more than
         * 50 times according to the {@code adclicked} table, i.e. when the user should
         * be blacklisted.
         *
         * @param v1 (key, count) pair whose key is the upstream
         *        {@code timestamp_ip_userID_adID_province_city} string
         * @return {@code true} when the stored click total exceeds 50
         * @throws Exception propagated from the JDBCWrapper query
         */
        public Boolean call(Tuple2<String, Long> v1) throws Exception {

            // The key is joined with "_" upstream, so split on "_" (not on a tab).
            String[] splited = v1._1.split("_");
            String date = splited[0];
            String userID = splited[2];
            String adID = splited[3];

            // Sum the stored counts for (date, userID, adID) instead of using a
            // hard-coded total, which would blacklist every user.
            // NOTE(review): one query per record is expensive - consider batching.
            final long[] clickedCountTotalToday = new long[]{0L};
            JDBCWrapper.getJDBCInstance().doQuery("SELECT clickedCount FROM adclicked WHERE"
                    + " timestamp =? AND userID = ? AND adID = ?",
                    new Object[]{date, userID, adID}, new ExecuteCallBack() {

                        public void resultCallBack(ResultSet result) throws Exception {
                            while (result.next()) {
                                clickedCountTotalToday[0] += result.getLong(1);
                            }
                        }
                    });

            return clickedCountTotalToday[0] > 50;

        }


     });
    //map操作,找出用户的id
    JavaDStream   blackListuserIDBasedInBatchOnhistroy =blackListBasedOnHistory.map(new Function, String>() {

        /**
         * Extracts the userID from a click record key.
         *
         * @param v1 (key, count) pair whose key is the upstream
         *        {@code timestamp_ip_userID_adID_province_city} string
         * @return the third key field, the userID
         */
        public String call(Tuple2<String, Long> v1) throws Exception {
            // The key is joined with "_" upstream; splitting on a tab would return
            // one element and make index 2 fail.
            return v1._1.split("_")[2];
        }
    });

    //有一个问题,数据可能重复,在一个partition里面重复,这个好办;
    //但多个partition不能保证一个用户重复,需要对黑名单的整个rdd进行去重操作。
    //rdd去重了,partition也就去重了,一石二鸟,一箭双雕
    // 找出了黑名单,下一步就写入黑名单数据库表中
    JavaDStream  blackListUniqueuserBasedInBatchOnhistroy = blackListuserIDBasedInBatchOnhistroy.transform(new Function, JavaRDD>() {

        public JavaRDD call(JavaRDD rdd) throws Exception {
            // TODO Auto-generated method stub
            return rdd.distinct();
        }


    });

    // 下一步写入到数据表中

    blackListUniqueuserBasedInBatchOnhistroy.foreachRDD(new Function, Void>() {

        public Void call(JavaRDD rdd) throws Exception {
             rdd.foreachPartition(new VoidFunction>() {

                public void call(Iterator t) throws Exception {
                    // TODO Auto-generated method stub
                    //插入的用户信息可以只包含:useID
                    //此时直接插入黑名单数据表即可。
                    //写入数据库
                    List blackList = new ArrayList();
                    while(t.hasNext()) {
                        blackList.add(new Object[]{t.next()});
                    }
                    JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();
                    jdbcWrapper.doBatch("INSERT INTO blacklisttable values (?)", blackList);


                }

            });
            return null;
        }
    });

    /**广告点击累计动态更新,每个updateStateByKey都会在Batch Duration的时间间隔的基础上进行广告点击次数的更新,
     * 更新之后我们一般都会持久化到外部存储设备上,在这里我们存储到MySQL数据库中
     */
    JavaPairDStream updateStateByKeyDSteam = filteredadClickedStreaming.mapToPair(new PairFunction, String, Long>() {

        public Tuple2 call(Tuple2 t)
                throws Exception {
             String[] splited=t._2.split("\t");
             String timestamp = splited[0]; //YYYY-MM-DD
             String ip = splited[1];
             String userID = splited[2];
             String adID = splited[3];
             String province = splited[4];
             String city = splited[5];      

             String clickedRecord = timestamp + "_" +ip + "_"+userID+"_"+adID+"_"
                         +province +"_"+city;
             return new Tuple2(clickedRecord, 1L);
        }
    }).updateStateByKey(new Function2, Optional, Optional>() {

        public Optional call(List v1, Optional v2)
                throws Exception {
            // v1:当前的Key在当前的Batch Duration中出现的次数的集合,例如{1,1,1,。。。,1}
            // v2:当前的Key在以前的Batch Duration中积累下来的结果;
            Long clickedTotalHistory = 0L; 
            if(v2.isPresent()){
                clickedTotalHistory = v2.get();
            }
            for(Long one : v1) {
                clickedTotalHistory += one;
            }
            return Optional.of(clickedTotalHistory);
        }
    });
    updateStateByKeyDSteam.foreachRDD(new Function, Void>() {

        public Void call(JavaPairRDD rdd) throws Exception {
             rdd.foreachPartition(new VoidFunction>>() {

                public void call(Iterator> partition) throws Exception {
                    //使用数据库连接池的高效读写数据库的方式将数据写入数据库mysql
                    //例如一次插入 1000条 records,使用insertBatch 或 updateBatch
                    //插入的用户数据信息:timestamp、adID、province、city
                    //这里面有一个问题,可能出现两条记录的key是一样的,此时需要更新累加操作
                    List AdClickedList = new ArrayList();
                    while(partition.hasNext()) {
                        Tuple2 record = partition.next();
                        String[] splited = record._1.split("\t");

                        AdClicked adClicked = new AdClicked();
                        adClicked.setTimestamp(splited[0]);
                        adClicked.setAdID(splited[1]);
                        adClicked.setProvince(splited[2]);
                        adClicked.setCity(splited[3]);
                        adClicked.setClickedCount(record._2);
                        AdClickedList.add(adClicked);
                    }

                    final List inserting = new ArrayList();
                    final List updating = new ArrayList();

                    JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();

                    //表的字段timestamp、ip、userID、adID、province、city、clickedCount
                    for(final AdClicked clicked : AdClickedList) {
                        jdbcWrapper.doQuery("SELECT clickedCount FROM adclickedcount WHERE"
                                + " timestamp = ? AND adID = ? AND province = ? AND city = ?",
                                new Object[]{clicked.getTimestamp(), clicked.getAdID(),
                                        clicked.getProvince(), clicked.getCity()}, new ExecuteCallBack() {

                                            public void resultCallBack(ResultSet result) throws Exception {
                                                // TODO Auto-generated method stub
                                                if(result.next()) {
                                                    long count = result.getLong(1);
                                                    clicked.setClickedCount(count);
                                                    updating.add(clicked);
                                                } else {
                                                    inserting.add(clicked);
                                                    clicked.setClickedCount(1L);
                                                }
                                            }
                                        });
                    }
                    //表的字段timestamp、ip、userID、adID、province、city、clickedCount
                    List insertParametersList = new ArrayList();
                    for(AdClicked insertRecord : inserting) {
                        insertParametersList.add(new Object[] {
                            insertRecord.getTimestamp(),
                            insertRecord.getAdID(),
                            insertRecord.getProvince(),
                            insertRecord.getCity(),
                            insertRecord.getClickedCount()
                        });
                    }
                    jdbcWrapper.doBatch("INSERT INTO adclickedcount VALUES(?, ?, ?, ?, ?)", insertParametersList);

                    //表的字段timestamp、ip、userID、adID、province、city、clickedCount
                    List updateParametersList = new ArrayList();
                    for(AdClicked updateRecord : updating) {
                        updateParametersList.add(new Object[] {
                            updateRecord.getClickedCount(),
                            updateRecord.getTimestamp(),
                            updateRecord.getAdID(),
                            updateRecord.getProvince(),
                            updateRecord.getCity()
                        });
                    }
                    jdbcWrapper.doBatch("UPDATE adclickedcount SET clickedCount =  ? WHERE"
                                + " timestamp =? AND adID = ? AND province = ? AND city = ?", updateParametersList);

                }
            });
            return null;
        }
    });
    /**
     * 对广告点击进行TopN计算,计算出每天每个省份Top5排名的广告
     * 因为我们直接对RDD进行操作,所以使用了transform算子;
     */
    updateStateByKeyDSteam.transform(new Function, JavaRDD>() {

        public JavaRDD call(JavaPairRDD rdd) throws Exception {

            JavaRDD rowRDD = rdd.mapToPair(new PairFunction, String, Long>() {

                public Tuple2 call(Tuple2 t)
                        throws Exception {
                    // TODO Auto-generated method stub
                     String[] splited=t._1.split("_");
                     String timestamp = splited[0]; //YYYY-MM-DD
                     String adID = splited[3];
                     String province = splited[4];

                     String clickedRecord = timestamp + "_" + adID + "_" + province;

                     return new Tuple2(clickedRecord, t._2);
                }

            }).reduceByKey(new Function2() {

                public Long call(Long v1, Long v2) throws Exception {
                    // TODO Auto-generated method stub
                    return v1 + v2;
                }
            }).map(new Function, Row>() {

                public Row call(Tuple2 v1) throws Exception {
                    // TODO Auto-generated method stub

                     String[] splited=v1._1.split("_");
                     String timestamp = splited[0]; //YYYY-MM-DD
                     String adID = splited[3];
                     String province = splited[4];


                    return RowFactory.create(timestamp, adID, province, v1._2);
                }
            });

            StructType structType = DataTypes.createStructType(Arrays.asList(
                    DataTypes.createStructField("timestamp", DataTypes.StringType, true),
                    DataTypes.createStructField("adID", DataTypes.StringType, true),
                    DataTypes.createStructField("province", DataTypes.StringType, true),
                    DataTypes.createStructField("clickedCount", DataTypes.LongType, true)
                    ));
            HiveContext hiveContext = new HiveContext(rdd.context());

            DataFrame df = hiveContext.createDataFrame(rowRDD, structType);
            df.registerTempTable("topNTableSource");
            DataFrame result = hiveContext.sql("SELECT timestamp, adID, province, clickedCount, FROM"
                    + " (SELECT timestamp, adID, province,clickedCount, "
                    + "ROW_NUMBER() OVER(PARTITION BY province ORDER BY clickeCount DESC) rank "
                    + "FROM topNTableSource) subquery "
                    + "WHERE rank <= 5");

            return result.toJavaRDD();
        }
    }).foreachRDD(new Function, Void>() {

        public Void call(JavaRDD rdd) throws Exception {
            // TODO Auto-generated method stub
            rdd.foreachPartition(new VoidFunction>() {

                /**
                 * Replaces the stored top-N rows for every (timestamp, province) pair
                 * present in this partition: the existing rows in {@code adprovincetopn}
                 * are deleted, then the partition's rows are inserted.
                 *
                 * <p>NOTE(review): rows of one province may be spread over several
                 * partitions, in which case one partition's DELETE can remove rows another
                 * partition has just inserted - confirm the partitioning.
                 *
                 * @param t rows of (timestamp, adID, province, clickedCount)
                 * @throws Exception propagated from the JDBCWrapper calls
                 */
                public void call(Iterator<Row> t) throws Exception {
                    List<AdProvinceTopN> adProvinceTopN = new ArrayList<AdProvinceTopN>();
                    while (t.hasNext()) {
                        Row row = t.next();
                        AdProvinceTopN item = new AdProvinceTopN();
                        item.setTimestamp(row.getString(0));
                        item.setAdID(row.getString(1));
                        item.setProvince(row.getString(2));
                        item.setClickedCount(row.getLong(3));
                        adProvinceTopN.add(item);
                    }

                    JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();

                    // Distinct (timestamp, province) pairs whose old rows must go.
                    Set<String> set = new HashSet<String>();
                    for (AdProvinceTopN item : adProvinceTopN) {
                        set.add(item.getTimestamp() + "_" + item.getProvince());
                    }

                    // Table columns: timestamp, adID, province, clickedCount
                    List<Object[]> deleteParametersList = new ArrayList<Object[]>();
                    for (String deleteRecord : set) {
                        // Limit 2 keeps a province containing "_" in one piece.
                        String[] splited = deleteRecord.split("_", 2);
                        deleteParametersList.add(new Object[]{
                                splited[0],
                                splited[1]
                        });
                    }
                    if (!deleteParametersList.isEmpty()) {
                        jdbcWrapper.doBatch("DELETE FROM adprovincetopn WHERE timestamp = ? AND province = ?", deleteParametersList);
                    }

                    // The positional INSERT needs its values in table column order:
                    // timestamp, adID, province, clickedCount.
                    List<Object[]> insertParametersList = new ArrayList<Object[]>();
                    for (AdProvinceTopN insertRecord : adProvinceTopN) {
                        insertParametersList.add(new Object[] {
                                insertRecord.getTimestamp(),
                                insertRecord.getAdID(),
                                insertRecord.getProvince(),
                                insertRecord.getClickedCount()
                        });
                    }
                    if (!insertParametersList.isEmpty()) {
                        jdbcWrapper.doBatch("INSERT INTO adprovincetopn VALUES (?, ?, ?, ?)", insertParametersList);
                    }

                }
            });
            return null;
        }
    });

    /**
     * 计算过去半个小时内广告点击的趋势
     * 广告点击的基本数据格式:timestamp、ip、userID、adID、province、city
     */
    filteredadClickedStreaming.mapToPair(new PairFunction, String, Long>() {

        public Tuple2 call(Tuple2 t)
                throws Exception {

            String splited[] = t._2.split("\t");
            String adID = splited[3];

            String time = splited[0]; //Todo:后续需要重构代码实现时间戳和分钟的转换提取。此处需要提取出该广告的点击分钟单位

            return new Tuple2(time + "_" + adID, 1L);
        }
    }).reduceByKeyAndWindow(new Function2() {

        public Long call(Long v1, Long v2) throws Exception {
            // TODO Auto-generated method stub
            return v1 + v2;
        }
    }, new Function2() {

        public Long call(Long v1, Long v2) throws Exception {
            // TODO Auto-generated method stub
            return v1 - v2;
        }
    }, Durations.minutes(30), Durations.milliseconds(5)).foreachRDD(new Function, Void>() {

        public Void call(JavaPairRDD rdd) throws Exception {
            // TODO Auto-generated method stub
            rdd.foreachPartition(new VoidFunction>>() {

                public void call(Iterator> partition)
                        throws Exception {
                    List adTrend = new ArrayList();
                    // TODO Auto-generated method stub
                    while(partition.hasNext()) {
                        Tuple2 record = partition.next();
                        String[] splited = record._1.split("_");
                        String time = splited[0];
                        String adID = splited[1];
                        Long clickedCount = record._2;
                        /**
                         * 在插入数据到数据库的时候具体需要哪些字段?time、adID、clickedCount;
                         * 而我们通过J2EE技术进行趋势绘图的时候肯定是需要年、月、日、时、分这个维度的,所以我们在这里需要
                         * 年月日、小时、分钟这些时间维度;
                         */
                        AdTrendStat adTrendStat = new AdTrendStat();
                        adTrendStat.setAdID(adID);
                        adTrendStat.setClickedCount(clickedCount);
                        adTrendStat.set_date(time); //Todo:获取年月日
                        adTrendStat.set_hour(time); //Todo:获取小时
                        adTrendStat.set_minute(time);//Todo:获取分钟

                        adTrend.add(adTrendStat);
                    }

                    final List inserting = new ArrayList();
                    final List updating = new ArrayList();

                    JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();

                    //表的字段timestamp、ip、userID、adID、province、city、clickedCount
                    for(final AdTrendStat trend : adTrend) {
                        final AdTrendCountHistory adTrendhistory = new AdTrendCountHistory();

                        jdbcWrapper.doQuery("SELECT clickedCount FROM adclickedtrend WHERE"
                                + " date =? AND hour = ? AND minute = ? AND AdID = ?",
                                new Object[]{trend.get_date(), trend.get_hour(),  trend.get_minute(),
                                        trend.getAdID()}, new ExecuteCallBack() {

                                            public void resultCallBack(ResultSet result) throws Exception {
                                                // TODO Auto-generated method stub
                                                if(result.next()) {
                                                    long count = result.getLong(1);
                                                    adTrendhistory.setClickedCountHistoryLong(count);
                                                    updating.add(trend);
                                                } else {                                                    
                                                    inserting.add(trend);
                                                }
                                            }
                                        });
                    }
                    //表的字段date、hour、minute、adID、clickedCount
                    List insertParametersList = new ArrayList();
                    for(AdTrendStat insertRecord : inserting) {
                        insertParametersList.add(new Object[] {
                            insertRecord.get_date(),
                            insertRecord.get_hour(),
                            insertRecord.get_minute(),
                            insertRecord.getAdID(),
                            insertRecord.getClickedCount()
                        });
                    }
                    jdbcWrapper.doBatch("INSERT INTO adclickedtrend VALUES(?, ?, ?, ?, ?)", insertParametersList);

                    //表的字段date、hour、minute、adID、clickedCount
                    List updateParametersList = new ArrayList();
                    for(AdTrendStat updateRecord : updating) {
                        updateParametersList.add(new Object[] {
                            updateRecord.getClickedCount(),
                            updateRecord.get_date(),
                            updateRecord.get_hour(),
                            updateRecord.get_minute(),
                            updateRecord.getAdID()

                        });
                    }
                    jdbcWrapper.doBatch("UPDATE adclickedtrend SET clickedCount =  ? WHERE"
                            + " date =? AND hour = ? AND minute = ? AND AdID = ?"
                            , updateParametersList);

                }
            });
            return null;
        }
    });;
    /**
     * Spark Streaming 执行引擎也就是Driver开始运行,Driver启动的时候是位于一条新的线程中的,当然其内部有消息循环体,用于
     * 接收应用程序本身或者Executor中的消息,
     */
    javassc.start();
    javassc.awaitTermination();
    javassc.close();


    }

    /**
     * Builds a fresh streaming context and turns on checkpointing for it.
     *
     * <p>The method announces itself on standard output; per the original author's
     * note, not seeing that message means the context was restored from a
     * checkpoint instead of being created here (the caller is outside this block —
     * confirm against it).
     *
     * @param checkpointDirectory directory handed to {@code checkpoint(...)}
     * @param conf                Spark configuration used to construct the context
     * @return the newly created context, configured with a 10-second batch duration
     */
    private static JavaStreamingContext createContext(String checkpointDirectory, SparkConf conf) {
        System.out.println("Creating new context");

        // Batch duration is 10 seconds.
        final JavaStreamingContext freshContext =
                new JavaStreamingContext(conf, Durations.seconds(10));
        freshContext.checkpoint(checkpointDirectory);
        return freshContext;
    }
}

class JDBCWrapper {

    private static JDBCWrapper jdbcInstance = null;
    private static LinkedBlockingQueue dbConnectionPool = new LinkedBlockingQueue();
    static {
        try {
            Class.forName("com.mysql.jdbc.Driver");

        } catch (ClassNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    public static JDBCWrapper getJDBCInstance() {
        if(jdbcInstance == null) {
            synchronized (JDBCWrapper.class) {
                if(jdbcInstance == null) {
                    jdbcInstance = new JDBCWrapper();

                }
            }
        }
        return jdbcInstance;        
    }
    private JDBCWrapper() {
        for(int i = 0; i < 10; i++){
            try {
                Connection conn = DriverManager.getConnection("jdbc:mysql://Master:3306/sparkstreaming","root", "root");
                dbConnectionPool.put(conn);
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }       
    }
    public synchronized Connection getConnection() {
        while(0 == dbConnectionPool.size()){
            try {
                Thread.sleep(20);
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
        return dbConnectionPool.poll();
    }
    public int[] doBatch(String sqlText, List paramsList){

        Connection conn = getConnection();
        PreparedStatement preparedStatement = null;
        int[] result = null;
        try {
            conn.setAutoCommit(false);
            preparedStatement = conn.prepareStatement(sqlText);
            for(Object[] parameters: paramsList) {
                for(int i = 0; i < parameters.length; i++){
                    preparedStatement.setObject(i + 1, parameters[i]);
                }               
                preparedStatement.addBatch();
            }
            result = preparedStatement.executeBatch();

            conn.commit();
        } catch (SQLException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } finally {
            if(preparedStatement != null) {
                try {
                    preparedStatement.close();
                } catch (SQLException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }

            if(conn != null) {
                try {
                    dbConnectionPool.put(conn);
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }

        return result;  
    }
public void doQuery(String sqlText, Object[] paramsList, ExecuteCallBack callback){

        Connection conn = getConnection();
        PreparedStatement preparedStatement = null;
        ResultSet result = null;
        try {
            preparedStatement = conn.prepareStatement(sqlText);

                for(int i = 0; i < paramsList.length; i++){
                    preparedStatement.setObject(i + 1, paramsList[i]);
                }               

            result = preparedStatement.executeQuery();

            try {
                callback.resultCallBack(result);
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        } catch (SQLException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } finally {
            if(preparedStatement != null) {
                try {
                    preparedStatement.close();
                } catch (SQLException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }

            if(conn != null) {
                try {
                    dbConnectionPool.put(conn);
                } catch (InterruptedException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }
    }
}
/**
 * Callback through which {@code JDBCWrapper.doQuery} hands the result of a
 * query back to its caller.
 */
interface ExecuteCallBack {
    /**
     * Receives the {@code ResultSet} produced by the executed query.
     *
     * <p>{@code doQuery} closes the statement once this method returns, so the
     * result set should be fully consumed here. An exception thrown from this
     * method is caught and printed by {@code doQuery}; it does not propagate.
     *
     * @param result the result set of the query
     * @throws Exception if processing the result set fails
     */
    void resultCallBack(ResultSet result) throws Exception;
}

/**
 * Plain mutable bean carrying a click record keyed by user: timestamp, ip,
 * userID, adID, province, city and the associated click count.
 *
 * <p>Every property starts out {@code null} and is stored and returned as-is;
 * no validation or conversion is performed.
 */
class UserAdClicked {

    private String timestamp;
    private String ip;
    private String userID;
    private String adID;
    private String province;
    private String city;
    private Long clickedCount;

    /** @return the timestamp last set, or {@code null} */
    public String getTimestamp() {
        return this.timestamp;
    }

    /** @param timestamp value to store */
    public void setTimestamp(String timestamp) {
        this.timestamp = timestamp;
    }

    /** @return the ip last set, or {@code null} */
    public String getIp() {
        return this.ip;
    }

    /** @param ip value to store */
    public void setIp(String ip) {
        this.ip = ip;
    }

    /** @return the user id last set, or {@code null} */
    public String getUserID() {
        return this.userID;
    }

    /** @param userID value to store */
    public void setUserID(String userID) {
        this.userID = userID;
    }

    /** @return the ad id last set, or {@code null} */
    public String getAdID() {
        return this.adID;
    }

    /** @param adID value to store */
    public void setAdID(String adID) {
        this.adID = adID;
    }

    /** @return the province last set, or {@code null} */
    public String getProvince() {
        return this.province;
    }

    /** @param province value to store */
    public void setProvince(String province) {
        this.province = province;
    }

    /** @return the city last set, or {@code null} */
    public String getCity() {
        return this.city;
    }

    /** @param city value to store */
    public void setCity(String city) {
        this.city = city;
    }

    /** @return the click count last set, or {@code null} */
    public Long getClickedCount() {
        return this.clickedCount;
    }

    /** @param clickedCount value to store */
    public void setClickedCount(Long clickedCount) {
        this.clickedCount = clickedCount;
    }
}

class AdClicked {
    private String timestamp;
    private String adID;
    private String province;
    private String city;
    private Long clickedCount;
    public String getTimestamp() {
        return timestamp;
    }
    public void setTimestamp(String timestamp) {
        this.timestamp = timestamp;
    }
    public String getAdID() {
        return adID;
    }
    public void setAdID(String adID) {
        this.adID = adID;
    }
    public String getProvince() {
        return province;
    }
    public void setProvince(String province) {
        this.province = province;
    }
    public String getCity() {
        return city;
    }
    public void setCity(String city) {
        this.city = city;
    }
    public Long getClickedCount() {
        return clickedCount;
    }
    public void setClickedCount(Long clickedCount) {
        this.clickedCount = clickedCount;
    }
}

/**
 * Plain mutable bean for one row of the per-province ad ranking: timestamp,
 * adID, province and click count.
 *
 * <p>Every property starts out {@code null} and is stored and returned as-is;
 * no validation or conversion is performed.
 */
class AdProvinceTopN {

    private String timestamp;
    private String adID;
    private String province;
    private Long clickedCount;

    /** @return the timestamp last set, or {@code null} */
    public String getTimestamp() {
        return this.timestamp;
    }

    /** @param timestamp value to store */
    public void setTimestamp(String timestamp) {
        this.timestamp = timestamp;
    }

    /** @return the ad id last set, or {@code null} */
    public String getAdID() {
        return this.adID;
    }

    /** @param adID value to store */
    public void setAdID(String adID) {
        this.adID = adID;
    }

    /** @return the province last set, or {@code null} */
    public String getProvince() {
        return this.province;
    }

    /** @param province value to store */
    public void setProvince(String province) {
        this.province = province;
    }

    /** @return the click count last set, or {@code null} */
    public Long getClickedCount() {
        return this.clickedCount;
    }

    /** @param clickedCount value to store */
    public void setClickedCount(Long clickedCount) {
        this.clickedCount = clickedCount;
    }
}

/**
 * Plain mutable bean for one data point of an ad's click trend: date, hour,
 * minute, adID and click count.
 *
 * <p>The three time components are kept as strings exactly as supplied. Every
 * property starts out {@code null}; no validation or conversion is performed.
 */
class AdTrendStat {

    private String date;
    private String hour;
    private String minute;
    private String adID;
    private Long clickedCount;

    /** @return the date component last set, or {@code null} */
    public String get_date() {
        return this.date;
    }

    /** @param _date value to store as the date component */
    public void set_date(String _date) {
        this.date = _date;
    }

    /** @return the hour component last set, or {@code null} */
    public String get_hour() {
        return this.hour;
    }

    /** @param _hour value to store as the hour component */
    public void set_hour(String _hour) {
        this.hour = _hour;
    }

    /** @return the minute component last set, or {@code null} */
    public String get_minute() {
        return this.minute;
    }

    /** @param _minute value to store as the minute component */
    public void set_minute(String _minute) {
        this.minute = _minute;
    }

    /** @return the ad id last set, or {@code null} */
    public String getAdID() {
        return this.adID;
    }

    /** @param adID value to store */
    public void setAdID(String adID) {
        this.adID = adID;
    }

    /** @return the click count last set, or {@code null} */
    public Long getClickedCount() {
        return this.clickedCount;
    }

    /** @param clickedCount value to store */
    public void setClickedCount(Long clickedCount) {
        this.clickedCount = clickedCount;
    }
}

/**
 * Mutable holder for a single previously recorded click count.
 *
 * <p>The value starts out {@code null} and is stored and returned as-is.
 */
class AdTrendCountHistory {

    private Long clickedCountHistoryLong;

    /** @return the historical click count last set, or {@code null} */
    public Long getClickedCountHistoryLong() {
        return this.clickedCountHistoryLong;
    }

    /** @param clickedCountHistoryLong historical click count to store */
    public void setClickedCountHistoryLong(Long clickedCountHistoryLong) {
        this.clickedCountHistoryLong = clickedCountHistoryLong;
    }
}

你可能感兴趣的:(Spark,Streaming)