Data format:
date user keyword city platform version

Requirements:
1. Filter the data down to the records that match the query conditions (city, platform, version).
2. For each day, find the top 3 keywords by search UV (the number of distinct users who searched the keyword).
3. Sort the days in descending order by the total UV of that day's top-3 keywords.
4. Save the result to a Hive table.
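For concreteness, an input line might look like this (the values are made up for illustration):

2015-10-01 user1 hadoop xian android 1.0

The final Hive table then holds one (date, keyword, uv) row per top-3 keyword per day, with the days ordered by the total UV of their top-3 keywords.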
Approach:
1. Read the raw data (an HDFS file) into an input RDD.
2. Use the filter operator to keep only the records that match the query conditions.
   2.1 Naive approach: reference the external query-condition Map directly inside the filter function. The Map is then serialized and shipped once per task, which performs poorly.
   2.2 Optimized approach: wrap the query conditions in a Broadcast variable and read it inside the filter function, so each executor holds only a single copy.
3. Map the data to "(date_keyword, user)" pairs, group by key, then map each group to deduplicate its users and count them; the count is that keyword's UV for that day, giving "(date_keyword, uv)" pairs. (An alternative that avoids groupByKey is sketched after this list.)
4. Map the per-day per-keyword UV RDD to an RDD of Row elements and convert it to a DataFrame.
5. Register the DataFrame as a temporary table and use Spark SQL's row_number() window function to pick, for each day, the 3 keywords with the highest UV; the result is again a DataFrame.
6. Convert that DataFrame back to an RDD, group it by date, and map each group to the total UV of the day's top-3 keywords; use the total as the key, and a string concatenating the date with the top-3 keywords and their counts as the value.
7. Sort by each day's top-3 total UV, in descending order.
8. Map the sorted data back into "(date, keyword, uv)" form.
9. Convert to a DataFrame once more and save it to Hive.
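As noted in step 3, groupByKey collects every (date_keyword, user) record for a key into one task's memory before deduplicating. A minimal alternative sketch, assuming the same (date_keyword, user) pair RDD built in the code below (called pairrdd there); the variable names here are illustrative, and it additionally needs org.apache.spark.api.java.function.Function2:

// Alternative to step 3: deduplicate (date_keyword, user) pairs with
// distinct(), then count the surviving pairs per key with reduceByKey.
// needs: import org.apache.spark.api.java.function.Function2;
JavaPairRDD<String, Integer> uvrdd = pairrdd
    .distinct()   // keep one record per (date_keyword, user) combination
    .mapToPair(new PairFunction<Tuple2<String, String>, String, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Tuple2<String, Integer> call(Tuple2<String, String> t) throws Exception {
            return new Tuple2<String, Integer>(t._1, 1);  // each distinct user counts once
        }
    })
    .reduceByKey(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = 1L;
        @Override
        public Integer call(Integer a, Integer b) throws Exception {
            return a + b;  // the per-key sum of 1s is the UV
        }
    });

distinct() still needs a shuffle, but afterwards each key carries only integer counts instead of a full user list, which is gentler on memory for hot keywords.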
Java version:
package cn.spark.study.core;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
public class everydaytop {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
            .setAppName("hivesource");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // A HiveContext (rather than a plain SQLContext) is used throughout:
        // in Spark 1.x it provides the row_number() window function used
        // below, as well as saveAsTable() into the Hive metastore.
        HiveContext hc = new HiveContext(sc.sc());

        // Step 1: read the raw log file from HDFS.
        JavaRDD<String> words = sc.textFile("hdfs://master:9000/dailydataset.txt");

        // Step 2: build the query conditions and wrap them in a broadcast
        // variable, so each executor receives a single copy instead of
        // shipping one copy with every task.
        Map<String, List<String>> queryset = new HashMap<String, List<String>>();
        queryset.put("city", Arrays.asList("xian"));
        queryset.put("platform", Arrays.asList("android"));
        queryset.put("version", Arrays.asList("1.0", "1.2", "1.5", "2.0"));
        final Broadcast<Map<String, List<String>>> querysetbroadcast = sc.broadcast(queryset);

        // Keep only the records that match the query conditions; an empty
        // condition list means "no restriction" on that field.
        JavaRDD<String> filterrdd = words.filter(new Function<String, Boolean>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Boolean call(String log) throws Exception {
                // Log format: date user keyword city platform version
                String[] logsplited = log.split(" ");
                String city = logsplited[3];
                String platform = logsplited[4];
                String version = logsplited[5];
                Map<String, List<String>> queryset = querysetbroadcast.value();
                List<String> cities = queryset.get("city");
                if (cities.size() > 0 && !cities.contains(city)) {
                    return false;
                }
                List<String> platforms = queryset.get("platform");
                if (platforms.size() > 0 && !platforms.contains(platform)) {
                    return false;
                }
                List<String> versions = queryset.get("version");
                if (versions.size() > 0 && !versions.contains(version)) {
                    return false;
                }
                return true;
            }
        });
        // Step 3: map each record to a (date_keyword, user) pair.
        JavaPairRDD<String, String> pairrdd = filterrdd.mapToPair(
                new PairFunction<String, String, String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, String> call(String log) throws Exception {
                String[] logsplited = log.split(" ");
                String date = logsplited[0];
                String user = logsplited[1];
                String keyword = logsplited[2];
                return new Tuple2<String, String>(date + "_" + keyword, user);
            }
        });
        JavaPairRDD<String, Iterable<String>> groupedrdd = pairrdd.groupByKey();
        // Deduplicate the users of each (date, keyword) group; the number
        // of distinct users is that keyword's daily UV.
        JavaPairRDD<String, Integer> uvrdd = groupedrdd.mapToPair(
                new PairFunction<Tuple2<String, Iterable<String>>, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(Tuple2<String, Iterable<String>> tuple) throws Exception {
                String datekeyword = tuple._1;
                Iterator<String> it = tuple._2.iterator();
                List<String> users = new ArrayList<String>();
                while (it.hasNext()) {
                    String user = it.next();
                    if (!users.contains(user)) {
                        users.add(user);
                    }
                }
                int uv = users.size();
                return new Tuple2<String, Integer>(datekeyword, uv);
            }
        });
        // Step 4: map the (date_keyword, uv) pairs to Rows and convert
        // the result to a DataFrame.
        JavaRDD<Row> rowrdd = uvrdd.map(new Function<Tuple2<String, Integer>, Row>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Row call(Tuple2<String, Integer> tuple) throws Exception {
                String date = tuple._1.split("_")[0];
                String keyword = tuple._1.split("_")[1];
                Integer uv = tuple._2;
                return RowFactory.create(date, keyword, uv);
            }
        });
        List<StructField> structFields = Arrays.asList(
            DataTypes.createStructField("date", DataTypes.StringType, true),
            DataTypes.createStructField("keyword", DataTypes.StringType, true),
            // IntegerType, not LongType: the Rows carry java.lang.Integer values.
            DataTypes.createStructField("uv", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);
        DataFrame dateKeywordUvDF = hc.createDataFrame(rowrdd, structType);

        // Step 5: register a temp table and use the row_number() window
        // function to pick each day's top 3 keywords by UV.
        dateKeywordUvDF.registerTempTable("daily_keyword_uv");
        String sql = "select date,keyword,uv from ("
            + "select date,keyword,uv,"
            + "row_number() over(partition by date order by uv DESC) rank "
            + "from daily_keyword_uv) t1 "
            + "where rank <= 3";
        DataFrame dailyTop3KeywordDF = hc.sql(sql);
        // Step 6: back to an RDD; map each row to (date, "keyword_uv").
        JavaRDD<Row> dailyTop3KeywordRDD = dailyTop3KeywordDF.javaRDD();
        JavaPairRDD<String, String> top3DateKeywordUvRDD = dailyTop3KeywordRDD.mapToPair(
                new PairFunction<Row, String, String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, String> call(Row row) throws Exception {
                String date = String.valueOf(row.get(0));
                String keyword = String.valueOf(row.get(1));
                Integer uv = Integer.valueOf(String.valueOf(row.get(2)));
                return new Tuple2<String, String>(date, keyword + "_" + uv);
            }
        });
        // Group by date and compute the total UV of the day's top-3
        // keywords; the total becomes the key so sortByKey can order the
        // days, and the value is the string "date,kw_uv,kw_uv,kw_uv".
        JavaPairRDD<String, Iterable<String>> top3DateKeywordsRDD = top3DateKeywordUvRDD.groupByKey();
        JavaPairRDD<Integer, String> uvDateKeywordsRDD = top3DateKeywordsRDD.mapToPair(
                new PairFunction<Tuple2<String, Iterable<String>>, Integer, String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Iterable<String>> tuple) throws Exception {
                String date = tuple._1;
                String datekeyword = date;
                Iterator<String> it = tuple._2.iterator();
                int total_uv = 0;
                while (it.hasNext()) {
                    String keyworduv = it.next();
                    int uv = Integer.valueOf(keyworduv.split("_")[1]);
                    total_uv += uv;
                    datekeyword += "," + keyworduv;
                }
                return new Tuple2<Integer, String>(total_uv, datekeyword);
            }
        });
        // Step 7: sort the days by their top-3 total UV, descending.
        JavaPairRDD<Integer, String> sortedUvDateKeywordsRDD = uvDateKeywordsRDD.sortByKey(false);

        // Step 8: flatMap over the sorted RDD so the output keeps the
        // descending order; each "keyword_uv" segment of the value string
        // becomes one (date, keyword, uv) Row.
        JavaRDD<Row> sortedRowRDD = sortedUvDateKeywordsRDD.flatMap(
                new FlatMapFunction<Tuple2<Integer, String>, Row>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Iterable<Row> call(Tuple2<Integer, String> tuple) throws Exception {
                String[] parts = tuple._2.split(",");
                String date = parts[0];
                List<Row> rows = new ArrayList<Row>();
                for (int i = 1; i < parts.length; i++) {
                    String keyword = parts[i].split("_")[0];
                    Integer uv = Integer.valueOf(parts[i].split("_")[1]);
                    rows.add(RowFactory.create(date, keyword, uv));
                }
                return rows;
            }
        });
        // Step 9: convert back to a DataFrame (reusing the same schema)
        // and save it as a Hive table.
        DataFrame finalDF = hc.createDataFrame(sortedRowRDD, structType);
        finalDF.saveAsTable("daily_top3_keyword_uv");
        sc.close();
    }
}
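To run the job, a spark-submit invocation along these lines should work (the jar path, master URL, and file names are placeholders, not from the original post), after which the result can be inspected from the Hive CLI:

bin/spark-submit \
  --class cn.spark.study.core.everydaytop \
  --master spark://master:7077 \
  /path/to/everydaytop.jar

hive> select * from daily_top3_keyword_uv;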