spark系列二:sparkcore和sparksql综合案例

数据格式:
日期 用户 搜索词 城市 平台 版本
需求:
1、筛选出符合查询条件(城市、平台、版本)的数据
2、统计出每天搜索uv排名前3的搜索词
3、按照每天的top3搜索词的uv搜索总次数,倒序排序
4、将数据保存到hive表中

1、针对原始数据(HDFS文件),获取输入的RDD
2、使用filter算子,去针对输入RDD中的数据,进行数据过滤,过滤出符合查询条件的数据。
  2.1 普通的做法:直接在filter算子函数中使用外部的查询条件(Map)。但这样做的话,查询条件Map会被发送到每一个task上一份副本,性能并不好。
  2.2 优化后的做法:将查询条件,封装为Broadcast广播变量,在filter算子中使用Broadcast广播变量进行数据筛选。
3、将数据转换为“(日期_搜索词, 用户)”格式,然后对它进行分组,再次进行映射,对每天每个搜索词的搜索用户进行去重操作,并统计去重后的数量,即为每天每个搜索词的uv。最后,获得“(日期_搜索词, uv)”。
4、将得到的每天每个搜索词的uv,RDD,映射为元素类型为Row的RDD,将该RDD转换为DataFrame
5、将DataFrame注册为临时表,使用Spark SQL的开窗函数,来统计每天的uv数量排名前3的搜索词,以及它的搜索uv,最后获取,是一个DataFrame
6、将DataFrame转换为RDD,继续操作,按照每天日期来进行分组,并进行映射,计算出每天的top3搜索词的搜索uv的总数,然后将uv总数作为key,将每天的top3搜索词以及搜索次数,拼接为一个字符串
7、按照每天的top3搜索总uv,进行排序,倒序排序
8、将排好序的数据,再次映射回来,变成“日期_搜索词_uv”的格式
9、再次映射为DataFrame,并将数据保存到Hive中即可

java版本:
package cn.spark.study.core;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
public class everydaytop {
 public static void main(String[] args) {
  SparkConf conf = new SparkConf()
    .setAppName("hivesource");
    
  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlcontext = new SQLContext(sc);
  HiveContext hc = new HiveContext(sc.sc());
  JavaRDD words = sc.textFile("hdfs://master:9000/dailydataset.txt");
  
  Map> queryset = new HashMap>();
  queryset.put("city", Arrays.asList("xian"));
  queryset.put("platform", Arrays.asList("android"));
  queryset.put("version", Arrays.asList("1.0","1.2","1.5","2.0"));
  
  final Broadcast>> querysetbroadcast = sc.broadcast(queryset);
  JavaRDD filterrdd = words.filter(new Function(){
  
   private static final long serialVersionUID = 1L;
   @Override
   public Boolean call(String log) throws Exception {
    String[] logsplited = log.split(" ");
    String city = logsplited[3];
    String platform = logsplited[4];
    String version = logsplited[5];
    
    Map> queryset = querysetbroadcast.value();
    
    
    List cities = queryset.get("city");
    if(cities.size() > 0 && !cities.contains(city)){
     return false;
    }
    List platforms = queryset.get("platform");
    if(platforms.size() > 0 && !platforms.contains(platform)){
     return false;
    }
    List versions = queryset.get("version");
    if(versions.size() > 0 && !versions.contains(version)){
     return false;
    }
    return true;
   }
   
  });
  
  JavaPairRDD pairrdd = filterrdd.mapToPair(new PairFunction(){
   
   private static final long serialVersionUID = 1L;
   @Override
   public Tuple2 call(String log) throws Exception {
    String[] logsplited = log.split(" ");
    String date = logsplited[0];
    String keyword = logsplited[2];
    String user = logsplited[1];
    
    return new Tuple2(date + "_" + keyword,user);
   }
   
  });
  JavaPairRDD> groupedrdd = pairrdd.groupByKey();
  
  JavaPairRDD uvrdd = groupedrdd.mapToPair(new PairFunction>,String,Integer>(){
   private static final long serialVersionUID = 1L;
   private String uv;
   @Override
   public Tuple2 call(Tuple2> tuple) throws Exception {
    String datekeyword = tuple._1;
    Iterator it = tuple._2.iterator();
    List users = new ArrayList();
    while(it.hasNext()){
    String user = it.next();
    if(!users.contains(user)){
     users.add(user);
     
    }
    
    }
    int uv = users.size();
    return new Tuple2(datekeyword,Integer.valueOf(uv));
   }
   
  });
  JavaRDD rowrdd = uvrdd.map(new Function,Row>(){
  
   private static final long serialVersionUID = 1L;
   @Override
   public Row call(Tuple2 tuple) throws Exception {
    String date = tuple._1.split("_")[0];
    String keyword = tuple._1.split("_")[1];
    Integer uv = tuple._2;
    return RowFactory.create(date,keyword,uv);
   }
   
  });
  List structFields = Arrays.asList(
    DataTypes.createStructField("date", DataTypes.StringType, true),
    DataTypes.createStructField("keyword", DataTypes.StringType, true),
    DataTypes.createStructField("uv", DataTypes.LongType, true));
  StructType structType = DataTypes.createStructType(structFields);
  
  DataFrame dateKeywordUvDF = sqlcontext.createDataFrame(rowrdd, structType);
  dateKeywordUvDF.registerTempTable("daily_keyword_uv");
  String sql = "select date,keyword,uv from (select date,keyword,uv,row_number() over(partition by date order by uv DESC) rank from daily_keyword_uv) t1 where rank <= 3";
  DataFrame dailyTop3KeywordDF = sqlcontext.sql(sql);
  JavaRDD dailyTop3KeywordRDD = dailyTop3KeywordDF.javaRDD();
  JavaPairRDD top3DateKeywordUvRDD = dailyTop3KeywordRDD.mapToPair(new PairFunction(){
   private static final long serialVersionUID = 1L;
   @Override
   public Tuple2 call(Row row) throws Exception {
    String date = String.valueOf(row.get(0));
    String keyword = String.valueOf(row.get(1));
    Integer uv = Integer.valueOf(String.valueOf(row.get(2)));
    return new Tuple2(date,keyword + "_" + uv);
   }
   
  });
  JavaPairRDD> top3DateKeywordsRDD = top3DateKeywordUvRDD.groupByKey();
  JavaPairRDD uvDateKeywordsRDD = top3DateKeywordsRDD.mapToPair(new PairFunction>,Integer,String>(){
   
   private static final long serialVersionUID = 1L;
   @Override
   public Tuple2 call(Tuple2> tuple) throws Exception {
    String date = tuple._1;
    String datekeyword = date;
    Iterator it = tuple._2.iterator();
    int total_uv = 0;
    while(it.hasNext()){
     String keyworduv = it.next();
     int uv = Integer.valueOf(keyworduv.split("_")[1]);
     total_uv +=uv;
     datekeyword += "," + keyworduv;
    }
    
    return new Tuple2(total_uv,datekeyword);
   }
   
  });
  
  JavaPairRDD sortedUvDateKeywordsRDD = uvDateKeywordsRDD.sortByKey(false);
  JavaRDD sortedRowRDD = uvDateKeywordsRDD.flatMap(new FlatMapFunction,Row>(){
   
   private static final long serialVersionUID = 1L;
   @Override
   public Iterable call(Tuple2 tuple) throws Exception {
      int total_uv = tuple._1;
      String date = tuple._2.split(",")[0];
      String keyword = tuple._2.split(",")[1].split("_")[0];
      Integer uv = Integer.valueOf(tuple._2.split(",")[1].split("_")[1]);
     
      String keyword1 = tuple._2.split(",")[2].split("_")[0];
      Integer uv1 = Integer.valueOf(tuple._2.split(",")[1].split("_")[1]);
     
      String keyword2 = tuple._2.split(",")[3].split("_")[0];
      Integer uv2 = Integer.valueOf(tuple._2.split(",")[1].split("_")[1]);
      List rows = new ArrayList();
      rows.add(RowFactory.create(date,keyword,uv));
      rows.add(RowFactory.create(date,keyword1,uv1));
      rows.add(RowFactory.create(date,keyword2,uv2));
     
     
    return rows;
   }
   
  });
  
        DataFrame finalDF = sqlcontext.createDataFrame(sortedRowRDD, structType);
  
  finalDF.saveAsTable("daily_top3_keyword_uv");
  
  sc.close();
  
  
  
 }
}

来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/30541278/viewspace-2155018/,如需转载,请注明出处,否则将追究法律责任。

转载于:http://blog.itpub.net/30541278/viewspace-2155018/

你可能感兴趣的:(spark系列二:sparkcore和sparksql综合案例)