-------------------------------------------------------AdClickedStreamingStatus -------------------------------------------------------------------------------------------------
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import com.google.common.base.Optional;
import kafka.serializer.StringDecoder;
import scala.Tuple2;
/**
 * Basic format of an ad-click record: timestamp, ip, userID, adID, province, city
 *
 */
public class AdClickedStreamingStatus {
public static void main(String[] args) {
SparkConf conf = new SparkConf()
.setMaster("local[5]")
//.setMaster("spark://master:7077")
.setAppName("AdClickedStreamingStats");
/*SparkConf conf = new SparkConf().setMaster("spark://Master:7077").
setAppName("SparkStreamingOnKafkaReceiver");*/
JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(10));
jsc.checkpoint("d:/checkpoint"); //checkpoint directory, required by the updateStateByKey call used below
/**
 * Create the Kafka metadata so that Spark Streaming can act as a Kafka consumer
 */
Map<String, String> kafkaParameters = new HashMap<String, String>();
kafkaParameters.put("metadata.broker.list",
"master:9092,slave1:9092,slave2:9092");
Set<String> topics = new HashSet<String>();
topics.add("AdClicked");
JavaPairInputDStream<String, String> adClickedStreaming = KafkaUtils.createDirectStream(jsc,
String.class, String.class,
StringDecoder.class, StringDecoder.class,
kafkaParameters,
topics);
/**
 * Because the blacklist has to be filtered online and the data lives in RDDs, we have to use the transform family
 * of functions; here it must be transformToPair, because the data read from Kafka is of Pair type, and because the
 * filtered data will be processed further, it must stay in the same Pair DStream type as the original Kafka input.
 *
 * Note again: within each Batch Duration the input data is encapsulated by one and only one RDD. You can have several
 * InputDStreams, but when the Job is generated these different InputDStreams are, within a Batch Duration, essentially
 * just different data sources, the same way Spark operates on different files on HDFS.
 */
JavaPairDStream<String, String> filteredadClickedStreaming = adClickedStreaming.transformToPair(new Function<JavaPairRDD<String, String>, JavaPairRDD<String, String>>() {
@Override
public JavaPairRDD<String, String> call(JavaPairRDD<String, String> rdd) throws Exception {
/**
 * Online blacklist filtering, step by step:
 * 1. Fetch the blacklist from the database and turn it into an RDD, i.e. a new RDD instance encapsulating the blacklist data;
 * 2. Join the blacklist RDD with the rdd produced in this Batch Duration; more precisely, do a leftOuterJoin,
 *    i.e. leftOuterJoin the Batch Duration rdd with the blacklist RDD. If both sides contain the key the flag is true,
 *    otherwise it is false;
 *
 * We keep only the records whose leftOuterJoin result is false.
 *
 */
final List<String> blackListNames = new ArrayList<String>();
JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();
/* jdbcWrapper.doQuery("SELECT * FROM blacklisttable", new Object[]{}, new ExecuteCallBack(){
@Override
public void resultCallBack(ResultSet result) throws Exception {
while(result.next()){
blackListNames.add(result.getString(1));
}
}
});*/
List<Tuple2<String, Boolean>> blackListTuple = new ArrayList<Tuple2<String, Boolean>>();
for (String name : blackListNames){
blackListTuple.add(new Tuple2<String, Boolean>(name, true));
}
List<Tuple2<String, Boolean>> blackListFromDB = blackListTuple; //data from querying the blacklist table, mapped into (userID, true) pairs
JavaSparkContext jsc = new JavaSparkContext(rdd.context());
/**
 * The blacklist table only contains userID, but a join requires Key-Value pairs, so here
 * we build a Key-Value data set from the rows of the table;
 */
JavaPairRDD<String, Boolean> blackListRDD = jsc.parallelizePairs(blackListFromDB);
/**
 * The join must be performed on userID, so the incoming rdd has to be converted with mapToPair into an rdd of
 * the matching format.
 *
 * Basic format of an ad-click record: timestamp, ip, userID, adID, province, city
 */
JavaPairRDD<String, Tuple2<String, String>> rdd2Pair = rdd.mapToPair(new PairFunction<Tuple2<String, String>, String, Tuple2<String, String>>() {
@Override
public Tuple2<String, Tuple2<String, String>> call(Tuple2<String, String> t) throws Exception {
System.out.println(t._2.split("\t")[0]);
String userID = t._2.split("\t")[2]; //the records are tab-delimited
System.out.println("userID=" + userID);
return new Tuple2<String, Tuple2<String, String>>(userID, t);
}
});
JavaPairRDD<String, Tuple2<Tuple2<String, String>, Optional<Boolean>>> joined = rdd2Pair.leftOuterJoin(blackListRDD);
JavaPairRDD<String, String> result = joined.filter(new Function<Tuple2<String, Tuple2<Tuple2<String, String>, Optional<Boolean>>>, Boolean>() {
@Override
public Boolean call(Tuple2<String, Tuple2<Tuple2<String, String>, Optional<Boolean>>> v1)
throws Exception {
Optional<Boolean> optional = v1._2._2;
if (optional.isPresent() && optional.get()){
return false;
} else {
return true;
}
}
}).mapToPair(new PairFunction<Tuple2<String, Tuple2<Tuple2<String, String>, Optional<Boolean>>>, String, String>() {
@Override
public Tuple2<String, String> call(
Tuple2<String, Tuple2<Tuple2<String, String>, Optional<Boolean>>> t) throws Exception {
return t._2._1;
}
});
return result;
}
});
/*
 * Step 4: from here on we program against the DStream just as we would against RDDs!!! The reason is that a DStream is the
 * template (or class) from which RDDs are produced; before Spark Streaming actually performs a computation, it translates
 * the operations on each Batch's DStream into operations on RDDs!!!
 * Apply Transformation-level processing (map, filter and other higher-order functions) to the initial DStream to do the actual computation.
 * Basic format of an ad-click record: timestamp, ip, userID, adID, province, city
 */
JavaPairDStream<String, Long> pairs = filteredadClickedStreaming.mapToPair(new PairFunction<Tuple2<String, String>, String, Long>() {
@Override
public Tuple2<String, Long> call(Tuple2<String, String> t) throws Exception {
String[] splited = t._2.split("\t"); //the Kafka records are tab-delimited (see MockAdClickedStats)
String timestamp = splited[0]; //yyyy-MM-dd
String ip = splited[1];
String userID = splited[2];
String adID = splited[3];
String province = splited[4];
String city = splited[5];
String clickedRecord = timestamp + "_" + ip + "_" + userID + "_" + adID + "_"
+ province + "_" + city;
return new Tuple2<String, Long>(clickedRecord, 1L);
}
});
/*
 * Step 4 (continued): apply Transformation-level processing such as map, filter and other higher-order functions to the
 * initial DStream to do the actual computation: count each user's ad clicks within each Batch Duration.
 */
JavaPairDStream<String, Long> adClickedUsers = pairs.reduceByKey(new Function2<Long, Long, Long>(){
@Override
public Long call(Long v1, Long v2) throws Exception {
return v1 + v2;
}
});
/**
 *
 * What counts as a valid click?
 * 1. The sophisticated approach is usually to train a machine-learning model and filter online with it;
 * 2. A simple approach? We could judge illegal ad clicks by the number of clicks within a single Batch Duration, but in
 *    practice click-fraud programs try to imitate real click behaviour as closely as possible, so judging from one Batch
 *    is incomplete; we need to judge over, say, one day (or one hour) of data!
 * 3. A fallback that is simpler than online machine learning works like this:
 *    for example: within a period of time the same IP (MAC address) is used by several user accounts;
 *    for example: count how often a user clicks an ad within one day; if the same ad is clicked 50 times in a day,
 *    put the user on the blacklist.
 *
 * A key property of the blacklist: it is generated dynamically!!! So every Batch Duration has to consider whether new
 * entries should be added, and the blacklist has to be persisted somewhere; a DB or Redis is sufficient.
 *
 * For example, the "blacklist" of a mail system: Spark Streaming can continuously monitor every user's actions, and if
 * a user sends mail more frequently than a configured threshold, the user can temporarily be blacklisted to stop
 * excessively frequent sending.
 */
JavaPairDStream<String, Long> filteredClickInBatch = adClickedUsers.filter(new Function<Tuple2<String, Long>, Boolean>() {
@Override
public Boolean call(Tuple2<String, Long> v1) throws Exception {
if (1 < v1._2){
//here one could also update the blacklist data table; the history-based write happens further below
return false;
} else {
return true;
}
}
});
/*
 * The print here does not directly trigger the execution of a Job, because everything is still under the control of the
 * Spark Streaming framework; for Spark Streaming, whether a real Job is triggered depends on the configured Duration interval.
 *
 * Please note: for a Spark Streaming application to execute an actual Job, the DStream must have an output operation.
 * Output operations are triggered by many kinds of functions, such as print, saveAsTextFile, saveAsHadoopFiles and so on;
 * the most important one is foreachRDD, because the results of Spark Streaming processing are usually written to Redis,
 * a DB, a dashboard and the like, and foreachRDD is mainly used to accomplish exactly that; you are free to decide where
 * the data ends up!!!
 *
 */
filteredClickInBatch.print();
filteredClickInBatch.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {
@Override
public Void call(JavaPairRDD<String, Long> rdd) throws Exception {
rdd.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Long>>>() {
@Override
public void call(Iterator<Tuple2<String, Long>> partition) throws Exception {
/**
 * Here we write the data into MySQL using a database connection pool for efficient reads and writes;
 * because the parameter passed in is an Iterator, we should process the records in batches for efficiency,
 * e.g. insert 1000 records at a time using insertBatch- or updateBatch-style operations.
 * The inserted user record only needs to contain: timestamp, ip, userID, adID, province, city.
 * One issue: two records may share the same key, in which case the counts must be accumulated via an update.
 */
List<UserAdClicked> userAdClickedList = new ArrayList<UserAdClicked>();
while (partition.hasNext()){
Tuple2<String, Long> record = partition.next();
String[] splited = record._1.split("_");
UserAdClicked userClicked = new UserAdClicked();
userClicked.setTimestamp(splited[0]);
userClicked.setIp(splited[1]);
userClicked.setUserID(splited[2]);
userClicked.setAdID(splited[3]);
userClicked.setProvince(splited[4]);
userClicked.setCity(splited[5]);
userAdClickedList.add(userClicked);
}
final List<UserAdClicked> inserting = new ArrayList<UserAdClicked>();
final List<UserAdClicked> updating = new ArrayList<UserAdClicked>();
JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();
//columns of the adclicked table: timestamp, ip, userID, adID, province, city, clickedCount
for (final UserAdClicked clicked : userAdClickedList){
jdbcWrapper.doQuery("SELECT count(1) FROM adclicked WHERE "
+ " timestamp = ? AND userID = ? AND adID = ?",
new Object[]{clicked.getTimestamp(), clicked.getUserID(), clicked.getAdID()},
new ExecuteCallBack() {
@Override
public void resultCallBack(ResultSet result) throws Exception {
if(result.next()){
long count = result.getLong(1);
clicked.setClickedCount(count);
updating.add(clicked);
} else {
inserting.add(clicked);
}
}
});
}
//columns of the adclicked table: timestamp, ip, userID, adID, province, city, clickedCount
ArrayList<Object[]> insertParametersList = new ArrayList<Object[]>();
for(UserAdClicked insertRecord : inserting){
insertParametersList.add(new Object[]{
insertRecord.getTimestamp(),
insertRecord.getIp(),
insertRecord.getUserID(),
insertRecord.getAdID(),
insertRecord.getProvince(),
insertRecord.getCity(),
insertRecord.getClickedCount()
});
}
jdbcWrapper.doBatch("INSERT INTO adclicked VALUES(?,?,?,?,?,?,?)", insertParametersList);
//columns of the adclicked table: timestamp, ip, userID, adID, province, city, clickedCount
ArrayList<Object[]> updateParametersList = new ArrayList<Object[]>();
for(UserAdClicked updateRecord : updating){
updateParametersList.add(new Object[]{
updateRecord.getTimestamp(),
updateRecord.getIp(),
updateRecord.getUserID(),
updateRecord.getAdID(),
updateRecord.getProvince(),
updateRecord.getCity(),
updateRecord.getClickedCount()
});
}
jdbcWrapper.doBatch("UPDATE adclicked set clickedCount = ? WHERE "
+ " timestamp = ? AND ip = ? AND userID = ? AND adID = ? AND province = ? "
+ "AND city = ? ", updateParametersList);
}
});
return null;
}
});
JavaPairDStream<String, Long> blackListBasedOnHistory = filteredClickInBatch.filter(new Function<Tuple2<String, Long>, Boolean>() {
@Override
public Boolean call(Tuple2<String, Long> v1) throws Exception {
//basic format of an ad-click record: timestamp, ip, userID, adID, province, city
String[] splited = v1._1.split("_");
String date = splited[0];
String userID = splited[2];
String adID = splited[3];
/**
 * Next, query the table of user ad clicks by date, userID, adID and so on to obtain the total number of clicks,
 * and based on that click count decide whether this is blacklist-worthy clicking.
 */
int clickedCountTotalToday = 81; //hard-coded stand-in for the result of the query described above
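/*
 * A minimal sketch (kept commented out) of the lookup described in the comment above, assuming the per-user
 * counts written to the adclicked table earlier in this job are the source of truth; the SUM aggregation and
 * the one-element array used as a result holder are assumptions, not taken from the original text. If enabled,
 * its result would replace the hard-coded 81 above.
 *
 * final long[] totalHolder = new long[1];
 * JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();
 * jdbcWrapper.doQuery("SELECT SUM(clickedCount) FROM adclicked WHERE "
 *     + " timestamp = ? AND userID = ? AND adID = ?",
 *     new Object[]{date, userID, adID},
 *     new ExecuteCallBack() {
 *         @Override
 *         public void resultCallBack(ResultSet result) throws Exception {
 *             if (result.next()) {
 *                 totalHolder[0] = result.getLong(1);
 *             }
 *         }
 *     });
 */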
if (clickedCountTotalToday > 50)
{
return true;
} else {
return false;
}
}
});
/**
 * The blacklist RDD as a whole must be de-duplicated!!!
 */
JavaDStream<String> blackListuserIDtBasedOnHistory = blackListBasedOnHistory.map(new Function<Tuple2<String, Long>, String>() {
@Override
public String call(Tuple2<String, Long> v1) throws Exception {
return v1._1.split("_")[2];
}
});
JavaDStream<String> blackListUniqueuserIDtBasedOnHistory = blackListuserIDtBasedOnHistory.transform(new Function<JavaRDD<String>, JavaRDD<String>>() {
@Override
public JavaRDD<String> call(JavaRDD<String> rdd) throws Exception {
return rdd.distinct();
}
});
//next, write the blacklist into the blacklist data table
blackListUniqueuserIDtBasedOnHistory.foreachRDD(new Function<JavaRDD<String>, Void>() {
@Override
public Void call(JavaRDD<String> rdd) throws Exception {
rdd.foreachPartition(new VoidFunction<Iterator<String>>() {
@Override
public void call(Iterator<String> t) throws Exception {
/**
 * Here we write the data into MySQL using a database connection pool for efficient reads and writes;
 * because the parameter passed in is an Iterator, we should process the records in batches for efficiency,
 * e.g. insert 1000 records at a time using insertBatch- or updateBatch-style operations.
 * The inserted record only needs to contain the userID,
 * so we can insert straight into the blacklist data table.
 */
List<Object[]> blackList = new ArrayList<Object[]>();
while(t.hasNext()){
blackList.add(new Object[]{(Object)t.next()});
}
JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();
jdbcWrapper.doBatch("INSERT INTO blacklisttable VALUES (?) ", blackList);
}
});
return null;
}
});
/**
 * Continuously update the cumulative ad-click counts: on every Batch Duration interval, updateStateByKey brings the
 * running click counts up to date; after updating we usually persist them to external storage, here to MySQL.
 */
filteredadClickedStreaming.mapToPair(new PairFunction<Tuple2<String, String>, String, Long>() {
@Override
public Tuple2<String, Long> call(Tuple2<String, String> t) throws Exception {
String[] splited = t._2.split("\t");
String timestamp = splited[0]; //yyyy-MM-dd
String ip = splited[1];
String userID = splited[2];
String adID = splited[3];
String province = splited[4];
String city = splited[5];
String clickedRecord = timestamp + "_" + adID + "_"
+ province + "_" + city;
return new Tuple2<String, Long>(clickedRecord, 1L);
}
}).updateStateByKey(new Function2<List<Long>, Optional<Long>, Optional<Long>>() {
@Override
public Optional<Long> call(List<Long> v1, Optional<Long> v2) throws Exception {
/** Update on top of the historical data.
 * v1: the collection of occurrence counts of the current key within the current Batch Duration, e.g. {1,1,1,1,1,1}
 * v2: the result accumulated for the current key over previous Batch Durations; we keep adding the v1 values onto v2.
 */
Long clickedTotalHistory = 0L;
if(v2.isPresent()) {//if v2 exists
clickedTotalHistory = v2.get();//take its value
}
//We do not use reduceByKey here because it would produce many shuffles; updateStateByKey can aggregate over the past day or even a year.
for(Long one : v1){//loop over v1
clickedTotalHistory += one;//keep accumulating on top of the history
}
return Optional.of(clickedTotalHistory);
}
}).foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {
@Override
public Void call(JavaPairRDD<String, Long> rdd) throws Exception {
rdd.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Long>>>() {
@Override
public void call(Iterator<Tuple2<String, Long>> partition) throws Exception {
/**
 * Here we write the data into MySQL using a database connection pool for efficient reads and writes;
 * because the parameter passed in is an Iterator, we should process the records in batches for efficiency,
 * e.g. insert 1000 records at a time using insertBatch- or updateBatch-style operations.
 * The inserted record only needs to contain: timestamp, adID, province, city.
 * One issue: two records may share the same key, in which case the counts must be accumulated via an update.
 */
List<AdClicked> adClickedList = new ArrayList<AdClicked>();
while (partition.hasNext()){
Tuple2<String, Long> record = partition.next();
String[] splited = record._1.split("_");
AdClicked adClicked = new AdClicked();
adClicked.setTimestamp(splited[0]);
adClicked.setAdID(splited[1]);
adClicked.setProvince(splited[2]);
adClicked.setCity(splited[3]);
adClicked.setClickedCount(record._2);
adClickedList.add(adClicked);
}
JDBCWrapper jdbcWrapper = JDBCWrapper.getJDBCInstance();
final List<AdClicked> inserting = new ArrayList<AdClicked>();
final List<AdClicked> updating = new ArrayList<AdClicked>();
//columns of the adclickedcount table: timestamp, adID, province, city, clickedCount
for (final AdClicked clicked : adClickedList){
jdbcWrapper.doQuery("SELECT count(1) FROM adclickedcount WHERE "
+ " timestamp = ? AND adID = ? AND province = ? AND city = ? ",
new Object[]{clicked.getTimestamp(), clicked.getAdID(), clicked.getProvince(),clicked.getCity()},
new ExecuteCallBack() {
@Override
public void resultCallBack(ResultSet result) throws Exception {
if(result.next()){
long count = result.getLong(1);
clicked.setClickedCount(count);
updating.add(clicked);
} else {
inserting.add(clicked);
}
}
});
}
//columns of the adclickedcount table: timestamp, adID, province, city, clickedCount
ArrayList<Object[]> insertParametersList = new ArrayList<Object[]>();
for(AdClicked insertRecord : inserting){
insertParametersList.add(new Object[]{
insertRecord.getTimestamp(),
insertRecord.getAdID(),
insertRecord.getProvince(),
insertRecord.getCity(),
insertRecord.getClickedCount()
});
}
jdbcWrapper.doBatch("INSERT INTO adclickedcount VALUES(?,?,?,?,?)", insertParametersList);
//columns of the adclickedcount table: timestamp, adID, province, city, clickedCount
/* ArrayList<Object[]> updateParametersList = new ArrayList<Object[]>();
for(AdClicked updateRecord : updating){
updateParametersList.add(new Object[]{
updateRecord.getTimestamp(),
updateRecord.getAdID(),
updateRecord.getProvince(),
updateRecord.getCity(),
updateRecord.getClickedCount()
});
}
jdbcWrapper.doBatch("UPDATE adclickedcount set clickedCount = ? WHERE "
+ " timestamp = ? AND adID = ? AND province = ? AND city = ? ", updateParametersList);*/
}
});
return null;
}
});
/*
 * The Spark Streaming execution engine, i.e. the Driver, starts running; the Driver runs on a new thread and of course
 * contains a message loop inside, used to receive messages from the application itself or from the Executors.
 */
jsc.start();
jsc.awaitTermination();
jsc.close();
}
}
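/**
 * A simple JDBC helper: maintains a small pool of MySQL connections in a LinkedBlockingQueue and exposes
 * doBatch for batched writes and doQuery for parameterized reads handled through a callback.
 */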
class JDBCWrapper {
private static JDBCWrapper jdbcInstance = null;
private static LinkedBlockingQueue<Connection> dbConnectionPool = new LinkedBlockingQueue<Connection>();
static {
try {
Class.forName("com.mysql.jdbc.Driver");
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
}
public static JDBCWrapper getJDBCInstance(){
if (jdbcInstance == null){
synchronized(JDBCWrapper.class){
if (jdbcInstance == null){
jdbcInstance = new JDBCWrapper();
}
}
}
return jdbcInstance;
}
private JDBCWrapper(){
for (int i = 0; i < 10; i++){
try {
Connection conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/sparkstreaming","root","root");
dbConnectionPool.put(conn);
} catch (Exception e) {
e.printStackTrace();
}
}
}
public synchronized Connection getConnection(){
while (0 == dbConnectionPool.size()){
try {
Thread.sleep(20);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
return dbConnectionPool.poll();
}
public int[] doBatch(String sqlText, List<Object[]> paramsList) {
Connection conn = getConnection();
PreparedStatement preparedStatement = null;
int[] result = null;
try {
conn.setAutoCommit(false);
preparedStatement = conn.prepareStatement(sqlText);
for (Object[] parameters : paramsList){
for(int i = 0; i < parameters.length; i++){
preparedStatement.setObject(i+1, parameters[i]);
}
preparedStatement.addBatch();
}
result = preparedStatement.executeBatch();
conn.commit();
} catch (Exception e) {
e.printStackTrace();
} finally {
if (preparedStatement != null){
try {
preparedStatement.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (conn != null){
try {
dbConnectionPool.put(conn);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
return result;
}
public void doQuery(String sqlText, Object[] paramsList, ExecuteCallBack callBack) {
System.out.println("sqlText=" + sqlText);
Connection conn = getConnection();
PreparedStatement preparedStatement = null;
ResultSet result = null;
try {
preparedStatement = conn.prepareStatement(sqlText);
for(int i = 0; i < paramsList.length; i++){
preparedStatement.setObject(i+1, paramsList[i]);
}
result = preparedStatement.executeQuery();
callBack.resultCallBack(result);
} catch (Exception e) {
e.printStackTrace();
} finally {
if (preparedStatement != null){
try {
preparedStatement.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
if (conn != null){
try {
dbConnectionPool.put(conn);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
}
interface ExecuteCallBack {
void resultCallBack(ResultSet result) throws Exception;
}
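/**
 * POJO describing one user's ad-click record: timestamp, ip, userID, adID, province, city, clickedCount.
 */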
class UserAdClicked {
private String timestamp;
private String ip;
private String userID;
private String adID;
private String province;
private String city;
private Long clickedCount;
public Long getClickedCount() {
return clickedCount;
}
public void setClickedCount(Long clickedCount) {
this.clickedCount = clickedCount;
}
public String getTimestamp() {
return timestamp;
}
public void setTimestamp(String timestamp) {
this.timestamp = timestamp;
}
public String getIp() {
return ip;
}
public void setIp(String ip) {
this.ip = ip;
}
public String getUserID() {
return userID;
}
public void setUserID(String userID) {
this.userID = userID;
}
public String getAdID() {
return adID;
}
public void setAdID(String adID) {
this.adID = adID;
}
public String getProvince() {
return province;
}
public void setProvince(String province) {
this.province = province;
}
public String getCity() {
return city;
}
public void setCity(String city) {
this.city = city;
}
}
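/**
 * POJO describing an aggregated ad-click count keyed by timestamp, adID, province and city.
 */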
class AdClicked{
private String timestamp;
private String adID;
private String province;
private String city;
private Long clickedCount;
public String getTimestamp() {
return timestamp;
}
public void setTimestamp(String timestamp) {
this.timestamp = timestamp;
}
public String getAdID() {
return adID;
}
public void setAdID(String adID) {
this.adID = adID;
}
public String getProvince() {
return province;
}
public void setProvince(String province) {
this.province = province;
}
public String getCity() {
return city;
}
public void setCity(String city) {
this.city = city;
}
public Long getClickedCount() {
return clickedCount;
}
public void setClickedCount(Long clickedCount) {
this.clickedCount = clickedCount;
}
}
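/*
 * For reference, a hedged sketch of the MySQL tables this program assumes (database "sparkstreaming", see the
 * connection URL in JDBCWrapper). Column names follow the field comments in the code above; the column types and
 * the single-column layout of blacklisttable are assumptions, not taken from the original text.
 *
 * CREATE TABLE blacklisttable (name VARCHAR(64));
 * CREATE TABLE adclicked (timestamp VARCHAR(32), ip VARCHAR(64), userID VARCHAR(64), adID VARCHAR(64),
 *     province VARCHAR(64), city VARCHAR(64), clickedCount BIGINT);
 * CREATE TABLE adclickedcount (timestamp VARCHAR(32), adID VARCHAR(64), province VARCHAR(64),
 *     city VARCHAR(64), clickedCount BIGINT);
 */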
-------------------------------------------------------------MockAdClickedStats ----------------------------------------------------------------------------------------------------------------------------------
import java.util.Date;
import java.util.HashMap;
import java.util.Properties;
import java.util.Random;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;
/**
 * Mock data generator: continuously produces tab-delimited ad-click records
 * (timestamp, ip, userID, adID, province, city) and sends them to the Kafka topic "AdClicked".
 */
public class MockAdClickedStats {
public static void main(String[] args){
final Random random = new Random();
final String[] provinces = new String[]{"Guangdong","Zhejiang","Jiangsu","Fujian"};
final HashMap<String, String[]> cities = new HashMap<String, String[]>();
cities.put("Guangdong", new String[]{"Guangzhou","Shenzhen","DongGuan"});
cities.put("Zhejiang", new String[]{"Hangzhou","Wenzhou","Ninbo"});
cities.put("Jiangsu", new String[]{"Nanjing","Suzhou","wuxi"});
cities.put("Fujian", new String[]{"Fuzhou","Ximen","DongGuan"});
final String[] ips = new String[]{
"192.168.112.240", "192.168.112.241", "192.168.112.242", "192.168.112.243",
"192.168.112.244", "192.168.112.245", "192.168.112.246", "192.168.112.247",
"192.168.112.248", "192.168.112.249", "192.168.112.250", "192.168.112.251",
"192.168.112.252", "192.168.112.253"
};
Properties kafkaConf = new Properties();
kafkaConf.put("serializer.class", "kafka.serializer.StringEncoder");
kafkaConf.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092");
ProducerConfig producerConfig = new ProducerConfig(kafkaConf);
final Producer<String, String> producer = new Producer<String, String>(producerConfig);
new Thread(new Runnable(){
@Override
public void run(){
while(true){
Long timestamp = new Date().getTime();
String ip = ips[random.nextInt(14)];
int userID = random.nextInt(10000);
int adID = random.nextInt(100);
String province = provinces[random.nextInt(4)];
String city = cities.get(province)[random.nextInt(3)];
//tab-delimited record, matching the split("\t") parsing in AdClickedStreamingStatus
String clickedAd = timestamp + "\t" + ip + "\t" + userID + "\t" + adID + "\t" + province + "\t" + city;
producer.send(new KeyedMessage<String, String>("AdClicked", clickedAd));
System.out.println(clickedAd);
try{
Thread.sleep(50);
}catch(InterruptedException e){
e.printStackTrace();
}
}
}
}).start();
}
}