The flow for counting newly registered users in real time with Spark Streaming is as follows:
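The pipeline, as implemented in the code below, is roughly:
1. Consume the raw log records from the Kafka topic raw_log_handleTopic with a Spark Streaming direct stream.
2. Parse each '#'-delimited record into a Row and register the batch as a temporary table.
3. Use the Hive function forkstartuplogs to extract the startup-log fields, then aggregate the earliest and latest startup timestamps per device key with Spark SQL.
4. Merge each key's min/max timestamps into Redis, where the earliest timestamp per device key can then be used to identify newly registered users.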
The code is as follows. First, the Maven dependencies in pom.xml:
<dependencies>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.17</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>2.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>2.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>2.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>2.1.0</version>
    </dependency>
    <dependency>
        <groupId>redis.clients</groupId>
        <artifactId>jedis</artifactId>
        <version>2.9.0</version>
    </dependency>
</dependencies>
Then the Spark Streaming driver code:

import java.util.*;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.TopicPartition;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.*;
import org.apache.spark.sql.*;
import org.apache.spark.sql.types.*;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.kafka010.*;
import redis.clients.jedis.Jedis;

SparkConf conf = new SparkConf();
conf.setAppName("kafka");
conf.setMaster("local[3]");
// Create the SparkSession first, with Hive support so the SQL and UDF below resolve
final SparkSession spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate();
// Create the Java streaming context with a 2-second batch interval
JavaStreamingContext ssc = new JavaStreamingContext(
new JavaSparkContext(spark.sparkContext()), Durations.seconds(2));
// Kafka consumer parameters
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "localhost:9092");
kafkaParams.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
kafkaParams.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("group.id", "raw_logs");
kafkaParams.put("enable.auto.commit", "true");
// Location strategy: controls on which executors the Kafka consumers are scheduled
// Consumer strategy: controls which topic, partition, and offset are consumed
LocationStrategy ls = LocationStrategies.PreferConsistent();
List<TopicPartition> tps = new ArrayList<>();
tps.add(new TopicPartition("raw_log_handleTopic", 0));
ConsumerStrategy<String, String> cs = ConsumerStrategies.<String, String>Assign(tps, kafkaParams);
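// Note: as an alternative (an assumption, not taken from the original code), the consumer could
// subscribe to the whole topic and let Kafka assign partitions, instead of pinning partition 0:
// ConsumerStrategy<String, String> cs = ConsumerStrategies.<String, String>Subscribe(
//         Collections.singletonList("raw_log_handleTopic"), kafkaParams);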
// Kafka message stream
JavaDStream<ConsumerRecord<String, String>> ds1 = KafkaUtils.createDirectStream(ssc, ls, cs);
// 3. Filter the raw logs and extract the startup-table data (code shown below)
// 4. Query the table and store the results in Redis (code shown below)
ssc.start();
ssc.awaitTermination();

The code for steps 3 and 4 follows; these transformations must be defined on the Kafka stream before ssc.start() is called.
// Each record value is a '#'-delimited log string
JavaDStream<Row> ds2 = ds1.map(new Function<ConsumerRecord<String, String>, Row>() {
public Row call(ConsumerRecord<String, String> v1) throws Exception {
String topic = v1.topic() ;
int par = v1.partition() ;
long offset = v1.offset() ;
String value = v1.value();
String mesg="topic= "+topic + ", partition= "+par + ", offset= "+offset + ", value= "+value;
System.out.println("mesg===> " +mesg);
String[] arr = value.split("#");
return RowFactory.create(
Float.parseFloat(arr[0]),
arr[1],
arr[2],
Long.parseLong(arr[3]),
Integer.parseInt(arr[4]),
arr[5]) ;
}
}) ;
ds2.print();
ds2.foreachRDD(new VoidFunction<JavaRDD<Row>>() {
public void call(JavaRDD<Row> rdd) throws Exception {
SparkSession spark = SparkSession.builder()
.config(rdd.context().getConf())
.enableHiveSupport()
.getOrCreate();
StructField[] fields = new StructField[6];
fields[0] = new StructField("servertimems", DataTypes.FloatType, false, Metadata.empty());
fields[1] = new StructField("servertimestr", DataTypes.StringType, false, Metadata.empty());
fields[2] = new StructField("clientip", DataTypes.StringType, false, Metadata.empty());
fields[3] = new StructField("clienttimems", DataTypes.LongType, false, Metadata.empty());
fields[4] = new StructField("status", DataTypes.IntegerType, false, Metadata.empty());
fields[5] = new StructField("log", DataTypes.StringType, false, Metadata.empty());
StructType type = new StructType(fields);
// Filter out the invalid data: build a DataFrame over the parsed rows and register a temp view
Dataset<Row> df1 = spark.createDataFrame(rdd, type);
df1.createOrReplaceTempView("_temp");
// forkstartuplogs is a Hive function (registered separately) that extracts the startup-log fields
Dataset<Row> df2 = spark.sql("select forkstartuplogs(servertimestr , clienttimems , clientip , log) from _temp");
df2.createOrReplaceTempView("_temp2");
String aggSql = "select concat(appid,'#',appversion,'#',brand,'#',appplatform,'#',devicestyle,'#',ostype,'#',deviceid) key," +
"min(createdatms) mn," +
"max(createdatms) mx from _temp2 group by " +
"concat(appid,'#',appversion,'#',brand,'#',appplatform,'#',devicestyle,'#',ostype,'#',deviceid)" ;
// Aggregate the min/max values within the RDD using SQL, then merge the results into Redis
spark.sql(aggSql).foreachPartition(new ForeachPartitionFunction<Row>() {
public void call(Iterator<Row> t) throws Exception {
// Create a Redis client instance
Jedis redis = new Jedis("s101", 6379);
redis.select(1);
while(t.hasNext()){
Row row = t.next() ;
String key = row.getAs("key") ;
long mn = row.getAs("mn") ;
long mx = row.getAs("mx") ;
String oldvalue = redis.get(key);
if (oldvalue == null) {
redis.set(key, mn + "," + mx);
} else {
String[] arr = oldvalue.split(",");
long oldMin = Long.parseLong(arr[0]);
long oldMax = Long.parseLong(arr[1]);
redis.set(key, Math.min(mn, oldMin) + "," + Math.max(mx, oldMax));
}
}
redis.close();
}
});
}
});
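The query above depends on a Hive function named forkstartuplogs that splits each raw log record into the startup fields used later (appid, appversion, brand, appplatform, devicestyle, ostype, deviceid, createdatms). Its implementation is not part of this listing; as a minimal sketch, assuming it is packaged in a jar under a hypothetical class name, it could be registered once before the streaming job starts:

// Hypothetical registration of the forkstartuplogs Hive function; the class name and jar path
// are assumptions for illustration, not taken from the original article.
spark.sql("CREATE TEMPORARY FUNCTION forkstartuplogs AS 'com.example.udtf.ForkStartupLogs' " +
        "USING JAR 'hdfs:///libs/app-logs-udtf.jar'");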