Spark Streaming + Redis: real-time statistics of the daily registration rate

The flow for counting newly registered users in real time with Spark Streaming is as follows:
(Figure 1: Spark Streaming + Redis real-time daily registration rate pipeline)
The code is as follows:

1. Add the Maven dependencies



<dependencies>
	<dependency>
		<groupId>mysql</groupId>
		<artifactId>mysql-connector-java</artifactId>
		<version>5.1.17</version>
	</dependency>
	<dependency>
		<groupId>org.apache.hive</groupId>
		<artifactId>hive-exec</artifactId>
		<version>2.1.0</version>
	</dependency>

	<dependency>
		<groupId>org.apache.spark</groupId>
		<artifactId>spark-hive_2.11</artifactId>
		<version>2.1.0</version>
	</dependency>
	<dependency>
		<groupId>org.apache.spark</groupId>
		<artifactId>spark-sql_2.11</artifactId>
		<version>2.1.0</version>
	</dependency>

	<dependency>
		<groupId>org.apache.spark</groupId>
		<artifactId>spark-streaming_2.11</artifactId>
		<version>2.1.0</version>
	</dependency>
	<dependency>
		<groupId>org.apache.spark</groupId>
		<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
		<version>2.1.0</version>
	</dependency>

	<dependency>
		<groupId>redis.clients</groupId>
		<artifactId>jedis</artifactId>
		<version>2.9.0</version>
	</dependency>
</dependencies>

2. Start the Spark Streaming job

SparkConf conf = new SparkConf() ;
conf.setAppName("kafka") ;
conf.setMaster("local[3]") ;

// Create the SparkSession first
final SparkSession spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate() ;

// Create the Java streaming context
JavaStreamingContext ssc = new JavaStreamingContext(
new JavaSparkContext(spark.sparkContext()) , Durations.seconds(2)) ;

// Kafka consumer parameters
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers" , "localhost:9092") ;
kafkaParams.put("key.deserializer" , "org.apache.kafka.common.serialization.StringDeserializer") ;
kafkaParams.put("value.deserializer" , "org.apache.kafka.common.serialization.StringDeserializer") ;
kafkaParams.put("auto.offset.reset" , "latest") ;
kafkaParams.put("group.id" , "raw_logs") ;
kafkaParams.put("enable.auto.commit" ,"true") ;


// Location strategy: controls on which hosts the consumers are started
// Consumer strategy: controls which topic, partition and offset are consumed
LocationStrategy ls = LocationStrategies.PreferConsistent() ;
List<TopicPartition> tps = new ArrayList<>() ;
tps.add(new TopicPartition("raw_log_handleTopic" , 0)) ;
ConsumerStrategy<String, String> cs = ConsumerStrategies.<String, String>Assign(tps , kafkaParams) ;

// Kafka message stream
JavaDStream<ConsumerRecord<String, String>> ds1 = KafkaUtils.createDirectStream(ssc , ls , cs) ;

// Step 3: filter the raw logs and extract the startUp table data
// Step 4: query the table and store the results in Redis

ssc.start();
ssc.awaitTermination();
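Before wiring up the parsing logic it helps to push a few test records into Kafka. The sketch below is not part of the original pipeline; it assumes the '#'-delimited record layout that step 3 parses (server time in ms, server time string, client IP, client time in ms, status code, log payload), and the sample values and JSON payload are made up.

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class SampleLogProducer {
	public static void main(String[] args) {
		Properties props = new Properties();
		props.put("bootstrap.servers", "localhost:9092");
		props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
		props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

		KafkaProducer<String, String> producer = new KafkaProducer<>(props);
		// Field order matches the split("#") in step 3:
		// serverTimeMs # serverTimeStr # clientIp # clientTimeMs # status # log
		String sample = "1528000000000#2018-06-03 10:00:00#192.168.1.101#1528000000000#200#{\"appId\":\"app1\"}";
		producer.send(new ProducerRecord<>("raw_log_handleTopic", sample));
		producer.close();
	}
}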

3. Filter the raw logs and extract the startUp table data

// Extract the fields from the '#'-delimited log string
JavaDStream<Row> ds2 = ds1.map(new Function<ConsumerRecord<String, String>, Row>() {
	public Row call(ConsumerRecord<String, String> v1) throws Exception {
		String topic = v1.topic() ;
		int par = v1.partition() ;
		long offset = v1.offset() ;
		String value = v1.value();

              String mesg="topic= "+topic + ", partition= "+par + ", offset= "+offset + ", value= "+value;
              System.out.println("mesg===> " +mesg);

              String[] arr = value.split("#");
              return RowFactory.create(
				Float.parseFloat(arr[0]),
				arr[1],
				arr[2],
				Long.parseLong(arr[3]),
				Integer.parseInt(arr[4]),
				arr[5]) ;
	}
}) ;
ds2.print();

ds2.foreachRDD(new VoidFunction<JavaRDD<Row>>() {
public void call(JavaRDD<Row> rdd) throws Exception {
SparkSession spark = SparkSession.builder()
						 .config(rdd.context().getConf())
						 .enableHiveSupport()
						 .getOrCreate();

StructField[] fields = new StructField[6];
fields[0] = new StructField("servertimems", DataTypes.FloatType, false, Metadata.empty());
fields[1] = new StructField("servertimestr", DataTypes.StringType, false, Metadata.empty());
fields[2] = new StructField("clientip", DataTypes.StringType, false, Metadata.empty());
fields[3] = new StructField("clienttimems", DataTypes.LongType, false, Metadata.empty());
fields[4] = new StructField("status", DataTypes.IntegerType, false, Metadata.empty());
fields[5] = new StructField("log", DataTypes.StringType, false, Metadata.empty());
StructType type = new StructType(fields);

// Filter out invalid records: register the raw rows as a temp view and explode them with the
// forkstartuplogs UDTF (a custom Hive function, assumed to be registered beforehand;
// see the registration sketch after step 4)
Dataset<Row> df1 = spark.createDataFrame(rdd, type);
df1.createOrReplaceTempView("_temp");
Dataset<Row> df2 = spark.sql("select forkstartuplogs(servertimestr , clienttimems , clientip , log) from _temp");
df2.createOrReplaceTempView("_temp2");

4. Query the table and store the results in Redis (this code continues inside the foreachRDD block opened in step 3)

String aggSql = "select concat(appid,'#',appversion,'#',brand,'#',appplatform,'#',devicestyle,'#',ostype,'#',deviceid) key," +
					"min(createdatms) mn," +
					"max(createdatms) mx  from _temp2 group by " +
					"concat(appid,'#',appversion,'#',brand,'#',appplatform,'#',devicestyle,'#',ostype,'#',deviceid)" ;
// Aggregate the min/max registration timestamps per device key in SQL,
// then merge them into Redis partition by partition
spark.sql(aggSql).foreachPartition(new ForeachPartitionFunction<Row>() {
public void call(Iterator<Row> t) throws Exception {
	// Create a Redis client
	Jedis redis = new Jedis("s101", 6379);
	redis.select(1);

	while(t.hasNext()){
		Row row = t.next() ;
		String key = row.getAs("key") ;
		long mn = row.getAs("mn") ;
		long mx = row.getAs("mx") ;

		String oldvalue = redis.get(key);
		if (oldvalue == null) {
			redis.set(key, mn + "," + mx);
		} else {
			String[] arr = oldvalue.split(",");
			long oldMin = Long.parseLong(arr[0]);
			long oldMax = Long.parseLong(arr[1]);
			redis.set(key, Math.min(mn, oldMin) + "," + Math.max(mx, oldMax));
		}
	}
	redis.close();
}
});
}
}); // closes ds2.foreachRDD(...) opened in step 3
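Note that forkstartuplogs used in step 3 is a custom Hive UDTF that is not shown in this post; judging from the aggregation SQL in step 4 it is expected to emit at least the columns appid, appversion, brand, appplatform, devicestyle, ostype, deviceid and createdatms. It has to be registered on the Hive-enabled SparkSession before the queries run. A minimal registration sketch, assuming a hypothetical implementation class:

// The class name below is hypothetical; point it at your own UDTF implementation.
spark.sql("CREATE TEMPORARY FUNCTION forkstartuplogs AS 'com.example.udtf.ForkStartupLogsUDTF'");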

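With the first-seen/last-seen timestamps stored in Redis, the number of users newly registered on a given day can be read back by counting the keys whose earliest timestamp falls inside that day. The sketch below is only an illustration and not code from the original post; turning the count into a rate (dividing by a total-user figure) is outside the scope of what is stored here.

import java.time.LocalDate;
import java.time.ZoneId;
import java.util.Set;
import redis.clients.jedis.Jedis;

public class DailyRegistrationCount {
	public static void main(String[] args) {
		Jedis redis = new Jedis("s101", 6379);
		redis.select(1);

		// Day boundaries for "today" in epoch milliseconds
		long dayStart = LocalDate.now().atStartOfDay(ZoneId.systemDefault()).toInstant().toEpochMilli();
		long dayEnd = dayStart + 24L * 3600 * 1000;

		long newUsers = 0;
		Set<String> keys = redis.keys("*");   // fine for a demo; avoid KEYS * on a large production DB
		for (String key : keys) {
			String[] minMax = redis.get(key).split(",");
			long firstSeen = Long.parseLong(minMax[0]);
			if (firstSeen >= dayStart && firstSeen < dayEnd) {
				newUsers++;                   // device first seen today counts as a new registration
			}
		}
		System.out.println("New registrations today: " + newUsers);
		redis.close();
	}
}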