package com.xxx.busi;
import com.xxx.common.DBUtils;
import com.xxx.common.JavaSparkSessionSingleton;
import com.xxx.common.OffsetUtil;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.function.ToIntFunction;
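/**
* Spark Streaming job: reads cart-update log lines from Kafka, counts updates per user per day
* with updateStateByKey, and writes the current top 5 (uid, day, count) rows to MySQL
* (table bigdata_update_cart_cnt) on every batch.
*/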
public class DayUpdateCartCntTopN {
// Markers used to pick cart-update lines out of the raw logs (kept in Chinese so they match the log text)
static String flag1="更新购物车接口,uid:";
static String flag2="返回给前端的购物车信息cartInfoMap:";
static String addSql="insert into bigdata_update_cart_cnt(uid, up_date, cnt) values(?,?,?)";
static SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");
// JDBC batch size used when inserting result rows
static int step=300;
public static void main(String[] args) {
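// Kafka connection settings (BROKERS_LIST, TOPICS, GROUP_ID) come from the META-INF/kafka resource bundle (kafka.properties on the classpath)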
ResourceBundle kafkaData = ResourceBundle.getBundle("META-INF/kafka");
SparkConf sc=new SparkConf().setMaster("local[2]").setAppName("DayUpdateCartCntTopN");
//Startup-backlog and backpressure settings: let the per-batch record volume adapt to the current load so work does not pile up
//(1) On shutdown, finish processing the in-flight batch before exiting, so a kill does not cut processing short and lose half-processed data
sc.set("spark.streaming.stopGracefullyOnShutdown", "true");
//(2) When enabled, Spark picks the consumption rate automatically based on the current load
sc.set("spark.streaming.backpressure.enabled", "true");
//(3) Caps what the very first batch may consume; on a cold start the topic can hold a large backlog, and reading it all at once would block the job
sc.set("spark.streaming.backpressure.initialRate", "1000");
//(4) Caps the number of records read per second from each Kafka partition
sc.set("spark.streaming.kafka.maxRatePerPartition", "1000");
/**
* Note:
* With only (4) enabled, each batch reads at most the configured cap; if fewer records are available it reads whatever is there.
* With (2)+(4) enabled, the per-batch volume floats between the rate Spark infers from the current load and the cap set by (4);
* the very first batch, however, reads everything available, after which the (2)+(4) rules apply.
* With (2)+(3)+(4) enabled, behaviour is the same except that the first batch is also limited, because (3) sets an explicit initial rate.
*/
JavaStreamingContext jssc=new JavaStreamingContext(sc, Durations.seconds(180));//one batch every 180 seconds
jssc.checkpoint(".");//directory where state from previous batches is kept; in production use an HDFS path, which is safer
// Build the Kafka consumer parameter map first
Map<String, Object> kafkaParams = new HashMap<>();
// The direct stream talks to the brokers directly (no ZooKeeper lookup), so only the broker list is needed here
String brokerslist=kafkaData.getString("BROKERS_LIST");
String topics = kafkaData.getString("TOPICS");
String groupId=kafkaData.getString("GROUP_ID");
//Kafka bootstrap servers (host:port list)
kafkaParams.put("bootstrap.servers",brokerslist);
//Deserializer for record keys (strings, UTF-8 encoded)
kafkaParams.put("key.deserializer", StringDeserializer.class);
//Deserializer for record values (strings, UTF-8 encoded)
kafkaParams.put("value.deserializer", StringDeserializer.class);
//Consumer group id
kafkaParams.put("group.id", groupId);
//earliest:
// if a partition has a committed offset, resume from it; otherwise start from the beginning of the partition
//latest:
// if a partition has a committed offset, resume from it; otherwise consume only newly produced records
kafkaParams.put("auto.offset.reset", "earliest");
//If true, the consumer periodically commits each partition's offset back to Kafka
kafkaParams.put("enable.auto.commit", true);
//No need to worry about the topic growing without bound: Kafka cleans up old data automatically,
// keeping messages for 168 hours by default (log.retention.hours in server.properties)
//Subscribe by topic name; the direct stream tracks the offsets of every partition of each topic
Collection<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
//Create the Kafka input stream via KafkaUtils.createDirectStream(...); the Kafka-related settings come from kafkaParams
try {
JavaInputDStream<ConsumerRecord<String, String>> kafkaStream = KafkaUtils.createDirectStream(
jssc,
LocationStrategies.PreferConsistent(),
ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams)
);
//Use map to pull the value out of each record in the InputDStream
//First, filter out the lines we do not need
JavaDStream<String> lines=kafkaStream.filter(new Function<ConsumerRecord<String, String>, Boolean>() {
@Override
public Boolean call(ConsumerRecord<String, String> orgRecord) throws Exception {
String today=sdf.format(new Date());
String orgVal=orgRecord.value();
return orgVal.contains(flag1)&&orgVal.contains(flag2)&&orgVal.startsWith(today);//keep only today's cart-update lines; everything else is filtered out
}
}).map(new Function<ConsumerRecord<String, String>, String>() {
@Override
public String call(ConsumerRecord<String, String> consumerRecord) throws Exception {
String line=consumerRecord.value();
// the uid sits between flag1 and the next comma; the date is the first 10 characters of the line
int f1=line.indexOf(flag1)+flag1.length();
String temp=line.substring(f1);
int f2=temp.indexOf(",",0);
String uid=temp.substring(0,f2);
String day=line.substring(0,10);
return uid+"_"+day;
}
});
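// `lines` now holds one "uid_yyyy-MM-dd" string per matching log line, e.g. "11111111_2019-09-08".
// The exact raw log layout is an assumption here; the parsing above only relies on the line starting
// with the date and containing flag1 followed by the uid and a comma.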
lines.print();
JavaPairDStream<String, Integer> pairs = lines.mapToPair(s -> new Tuple2<>(s, 1));
//Keep a running count per key across batches
JavaPairDStream<String, Integer> runningCounts = pairs.updateStateByKey(
new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
@Override
public Optional<Integer> call(List<Integer> values, Optional<Integer> state) throws Exception {
Integer updateValue = 0;
if (state.isPresent()) {//does previous state exist for this key?
// start from the previously accumulated count
updateValue = state.get();
}
// add this batch's counts
for (Integer value : values) {
updateValue += value;
}
return Optional.of(updateValue);
}
}
);
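// runningCounts now holds the cumulative number of cart updates per "uid_day" key since the job started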
runningCounts.print();
runningCounts.foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {
@Override
public void call(JavaPairRDD<String, Integer> javaPairRDD) throws Exception {
// swap key and value into a "count_uid_day" string so the counts can be sorted below
JavaRDD<String> newRdd = javaPairRDD.map(new Function<Tuple2<String, Integer>, String>() {
@Override
public String call(Tuple2<String, Integer> record) throws Exception {
return record._2+"_"+record._1;
}
});
// System.out.println("newRdd with the count moved to the front:"+newRdd.collect());
JavaPairRDD<Integer, String> fRdd=newRdd.mapToPair(new PairFunction<String, Integer, String>() {
@Override
public Tuple2<Integer, String> call(String s) throws Exception {
String[] vals=s.split("_");
return new Tuple2<>(Integer.valueOf(vals[0]),vals[1]+"_"+vals[2]);
}
}).sortByKey(false);//sort by count, descending
processJavaRDDData(fRdd);
}
});
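// Start the streaming job and block until it terminates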
jssc.start();
jssc.awaitTermination();
jssc.stop();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
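/**
* Persists the current top 5 (count, "uid_day") pairs to MySQL: the rows already stored for that day
* are deleted first, then the top 5 are inserted in a JDBC batch and committed manually.
*/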
private static void processJavaRDDData(JavaPairRDD<Integer, String> fRdd) throws Exception {
List<Tuple2<Integer, String>> orderList=fRdd.collect();
List<Tuple2<Integer, String>> top5;
if (null!=orderList&&orderList.size()>=5){
top5=orderList.subList(0,5);
}else{
top5=orderList;
}
System.out.println("top5结果集:"+top5);
//[330,11111111_2019-09-08]
Connection connection = DBUtils.getConnection();
connection.setAutoCommit(false);
String time = top5.get(0)._2.split("_")[1];
String delSql="DELETE FROM bigdata_update_cart_cnt where up_date='"+time+"'";//清空表数据,保留表结构
PreparedStatement dst = connection.prepareStatement(delSql);
dst.executeUpdate();
connection.commit();
int recordsize = 0;
dst = connection.prepareStatement(addSql);
for (Tuple2<Integer, String> row:top5) {
String[] vals=row._2.split("_");
String uid = vals[0];
String day = vals[1];
int cnt =row._1();
dst.setString(1, uid);
dst.setString(2, day);
dst.setInt(3, cnt);
dst.addBatch();
if ((recordsize + 1) % step == 0) {
dst.executeBatch();//flush a full batch
}
recordsize++;
}
if (recordsize % step != 0) {//flush the remaining rows that did not fill a complete batch
dst.executeBatch();
}
connection.commit();//commit manually
DBUtils.DBclose(connection, dst, null);
}
//Alternative sink: persist a Dataset<Row> of (uid, up_date, cnt) rows the same way
private static void processData(Dataset<Row> result) throws Exception {
if (null!=result) {
List<Row> rows = result.toJavaRDD().collect();//collect once and reuse below
if (rows.isEmpty()) {
return;
}
Connection connection = DBUtils.getConnection();
connection.setAutoCommit(false);
String time = rows.get(0).get(1)+"";
//delete the rows already stored for this day before re-inserting (parameterized instead of string concatenation)
String delSql="DELETE FROM bigdata_update_cart_cnt where up_date=?";
PreparedStatement dst = connection.prepareStatement(delSql);
dst.setString(1, time);
dst.executeUpdate();
connection.commit();
int recordsize = 0;
dst = connection.prepareStatement(addSql);
for (Row row : rows) {
String uid = row.get(0) + "";
String day = row.get(1) + "";
int cnt = Integer.valueOf(row.get(2) + "");
dst.setString(1, uid);
dst.setString(2, day);
dst.setInt(3, cnt);
dst.addBatch();
if ((recordsize + 1) % step == 0) {
dst.executeBatch();//flush a full batch
}
recordsize++;
}
if (recordsize % step != 0) {//flush the remaining rows that did not fill a complete batch
dst.executeBatch();
}
connection.commit();//commit manually
DBUtils.DBclose(connection, dst, null);
}
}
}