Development environment:
- Spark 2.3
- Kafka 1.1.1
The blacklist data is loaded from MySQL. The source data comes from Kafka and is just plain names; to join it against the blacklist, both the source data and the blacklist data have to be converted into key-value form first.
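The core of the filtering is an ordinary leftOuterJoin between two pair RDDs, so it can be tried out in isolation before reading the full streaming program. Below is a minimal local-mode sketch of just that step, on hard-coded sample data (the names "tom" and "jack" and the log payloads are made-up stand-ins for the Kafka and MySQL data):

package cn.spark.streaming;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
public class JoinFilterSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("JoinFilterSketch").setMaster("local[2]"));
        // source data: (name, access log), standing in for the Kafka stream
        JavaPairRDD<String, String> userRDD = sc.parallelizePairs(Arrays.asList(
                new scala.Tuple2<String, String>("tom", "log-1"),
                new scala.Tuple2<String, String>("jack", "log-2")));
        // blacklist data: (name, true), standing in for the MySQL table
        JavaPairRDD<String, Boolean> blackListRDD = sc.parallelizePairs(Arrays.asList(
                new scala.Tuple2<String, Boolean>("tom", true)));
        // keep only the records whose name did NOT join to a true blacklist flag
        userRDD.leftOuterJoin(blackListRDD)
                .filter(t -> !(t._2._2.isPresent() && t._2._2.get()))
                .foreach(t -> System.out.println(t._1 + "--->" + t._2._1)); // prints: jack--->log-2
        sc.close();
    }
}

In local mode the foreach output goes to the driver console; on a cluster it would land in the executor logs, which is why the full program below uses print() on the DStream instead.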
Java code:
package cn.spark.streaming;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
/**
 * Use transform() to filter out blacklisted users,
 * based on data from a Kafka message queue.
 */
public class BlackListFilter {
    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setAppName("BlackListFilter");
        // create the streaming context with a 5-second batch interval
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
        // enable the checkpoint mechanism; args[0] is the checkpoint directory
        jssc.checkpoint(args[0]);
        // Kafka consumer properties
        Map<String, String> kafkaParams = new HashMap<String, String>();
        kafkaParams.put("bootstrap.servers", "hserver-1:9092,hserver-2:9092,hserver-3:9092");
        kafkaParams.put("group.id", "BlackListFilter");
        kafkaParams.put("auto.offset.reset", "smallest");
        // topic set; args[1] is the topic to subscribe to
        Set<String> topics = new HashSet<String>();
        topics.add(args[1]);
        // create the direct DStream from Kafka
        JavaPairInputDStream<String, String> inputPairDStream =
                KafkaUtils.createDirectStream(
                        jssc,
                        String.class,
                        String.class,
                        StringDecoder.class,
                        StringDecoder.class,
                        kafkaParams,
                        topics
                );
        // load the blacklist from MySQL
        SparkSession spark = SparkSession
                .builder()
                .getOrCreate();
        // read the blacklist table --> returns a Dataset<Row>
        Dataset<Row> blackList = spark
                .read()
                .format("jdbc")
                .option("url", "jdbc:mysql://hserver-1:3306/retail_db")
                .option("driver", "com.mysql.jdbc.Driver")
                .option("dbtable", "blacklist")
                .option("user", "root")
                .option("password", "root")
                .load();
        // turn the Dataset<Row> into a JavaRDD<Row>
        JavaRDD<Row> blackListRDD = blackList.toJavaRDD();
        // turn the JavaRDD<Row> into a JavaPairRDD<String, Boolean>,
        // keyed by name so it can be joined against the source data
        final JavaPairRDD<String, Boolean> blackListPairRDD =
                blackListRDD.mapToPair(
                        new PairFunction<Row, String, Boolean>() {
                            private static final long serialVersionUID = -6634120981007776151L;
                            @Override
                            public Tuple2<String, Boolean> call(Row row) throws Exception {
                                return new Tuple2<String, Boolean>(row.getString(0), true);
                            }
                        });
        // transform each batch of the Kafka stream into a filtered RDD
        JavaDStream<String> validListDStream = inputPairDStream.transform(
                new Function<JavaPairRDD<String, String>, JavaRDD<String>>() {
                    private static final long serialVersionUID = -7488950207291980402L;
                    @Override
                    public JavaRDD<String> call(JavaPairRDD<String, String> kafkaDataRDD) throws Exception {
                        // build the source RDD --> userRDD: (name, access log)
                        JavaPairRDD<String, String> userRDD =
                                kafkaDataRDD.mapToPair(
                                        new PairFunction<Tuple2<String, String>, String, String>() {
                                            private static final long serialVersionUID = 1L;
                                            @Override
                                            public Tuple2<String, String> call(Tuple2<String, String> tuple) throws Exception {
                                                // the Kafka message value is the name; the log payload is elided here
                                                return new Tuple2<String, String>(tuple._2, "........");
                                            }
                                        });
                        // leftOuterJoin: names that are not blacklisted join to an empty Optional
                        JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> joinRDD =
                                userRDD.leftOuterJoin(blackListPairRDD);
                        // blacklist filtering: drop records whose name joined to a true flag
                        JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> filterRDD =
                                joinRDD.filter(
                                        new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, Boolean>() {
                                            private static final long serialVersionUID = 791090533213057710L;
                                            @Override
                                            public Boolean call(Tuple2<String, Tuple2<String, Optional<Boolean>>> tuple) throws Exception {
                                                if (tuple._2._2.isPresent() && tuple._2._2.get()) {
                                                    return false;
                                                } else {
                                                    return true;
                                                }
                                            }
                                        });
                        // map the surviving records back to plain strings
                        JavaRDD<String> resultRDD =
                                filterRDD.map(
                                        new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, String>() {
                                            private static final long serialVersionUID = -54290472445703194L;
                                            @Override
                                            public String call(Tuple2<String, Tuple2<String, Optional<Boolean>>> tuple)
                                                    throws Exception {
                                                return tuple._1 + "--->" + tuple._2._1;
                                            }
                                        });
                        return resultRDD;
                    }
                });
        // print the first records of each filtered batch
        validListDStream.print();
        jssc.start();
        jssc.awaitTermination();
        jssc.close();
        spark.close();
    }
}
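Two things are worth noting about this design. The blacklist is read from MySQL exactly once, when the driver starts, so rows added to the table later are not seen unless the blacklist RDD is rebuilt (for example by reloading it inside transform(), at the cost of one JDBC read per batch). And because awaitTermination() blocks until the streaming job is stopped, the close() calls at the end only run on shutdown. To run the job, pass the checkpoint directory and the topic name as the two program arguments, with the spark-streaming-kafka-0-8 integration jar and a MySQL JDBC driver on the classpath (the exact artifact versions depend on your build).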