Quick reference:
JavaReceiverInputDStream<String> lines = jssc.socketTextStream("master", 9999);
JavaDStream<String> lines = jssc.textFileStream("hdfs://Master:9000/sparkStreaming/data"); // monitors the directory for newly added files
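These one-liners assume a JavaStreamingContext named jssc already exists. A minimal sketch of that surrounding setup (the local[2] master, app name, 5-second batch interval, and the print() output operation are illustrative choices, not taken from the original):
SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("QuickRef");
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
JavaReceiverInputDStream<String> lines = jssc.socketTextStream("master", 9999);
lines.print();            // an output operation is required for the job to actually run
jssc.start();             // start receiving and processing data
jssc.awaitTermination();  // block until the streaming application is stopped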
# agent1 is the name of the agent
agent1.sources=source1
agent1.sinks=sink1
agent1.channels=channel1
# The spooling directory source watches the specified directory for new files; when a new file appears, its contents are parsed and written to the channel. Once the write completes, the file is marked as completed or deleted.
# Configure source1
agent1.sources.source1.type=spooldir
agent1.sources.source1.spoolDir=/flume_data
agent1.sources.source1.channels=channel1
agent1.sources.source1.fileHeader = false
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = timestamp
# Configure sink1
agent1.sinks.sink1.type=hdfs
agent1.sinks.sink1.hdfs.path=hdfs://hadoop0:9000/flume
agent1.sinks.sink1.hdfs.fileType=DataStream
agent1.sinks.sink1.hdfs.writeFormat=TEXT
agent1.sinks.sink1.hdfs.rollInterval=1
agent1.sinks.sink1.channel=channel1
agent1.sinks.sink1.hdfs.filePrefix=%Y-%m-%d
# Configure channel1
agent1.channels.channel1.type=file
agent1.channels.channel1.checkpointDir=/root/hmbbs_tmp/123
agent1.channels.channel1.dataDirs=/root/hmbbs_tmp/
Run the agent: bin/flume-ng agent -n agent1 -c conf -f conf/example -Dflume.root.logger=DEBUG,console
# agent1 is the name of the agent
agent1.sources=source1
agent1.sinks=sink1
agent1.channels=channel1
# The spooling directory source watches the specified directory for new files; when a new file appears, its contents are parsed and written to the channel. Once the write completes, the file is marked as completed or deleted.
# Configure source1
agent1.sources.source1.type=spooldir
agent1.sources.source1.spoolDir=/flume_data
agent1.sources.source1.channels=channel1
agent1.sources.source1.fileHeader = false
agent1.sources.source1.interceptors = i1
agent1.sources.source1.interceptors.i1.type = timestamp
# Configure sink1
#agent1.sinks.sink1.type=hdfs
#agent1.sinks.sink1.hdfs.path=hdfs://hadoop0:9000/flume
#agent1.sinks.sink1.hdfs.fileType=DataStream
#agent1.sinks.sink1.hdfs.writeFormat=TEXT
#agent1.sinks.sink1.hdfs.rollInterval=1
#agent1.sinks.sink1.channel=channel1
#agent1.sinks.sink1.hdfs.filePrefix=%Y-%m-%d
agent1.sinks.sink1.type=avro
agent1.sinks.sink1.channel = channel1
agent1.sinks.sink1.hostname=master
agent1.sinks.sink1.port=9999
# Configure channel1
agent1.channels.channel1.type=file
agent1.channels.channel1.checkpointDir=/root/hmbbs_tmp/123
agent1.channels.channel1.dataDirs=/root/hmbbs_tmp/
Add the dependency:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-flume_2.10</artifactId>
    <version>1.6.0</version>
</dependency>
JavaReceiverInputDStream<SparkFlumeEvent> lines = FlumeUtils.createStream(jssc, "master", 9999);
JavaDStream<String> words = lines.flatMap(new FlatMapFunction<SparkFlumeEvent, String>() {
    @Override
    public Iterable<String> call(SparkFlumeEvent event) throws Exception {
        // each file corresponds to one event
        String line = new String(event.event().getBody().array());
        return Arrays.asList(line.split(" "));
    }
});
agent1.sinks.sink1.type=org.apache.spark.streaming.flume.sink.SparkSink
agent1.sinks.sink1.channel = channel1
agent1.sinks.sink1.hostname=master
agent1.sinks.sink1.port=9999
Download the required jars into Flume's lib directory: http://spark.apache.org/docs/latest/streaming-flume-integration.html
JavaReceiverInputDStream<SparkFlumeEvent> lines = FlumeUtils.createPollingStream(jssc, "master", 9999);
Parameter 1 is the StreamingContext instance;
Parameter 2 is the ZooKeeper quorum (when receiving data from Kafka, metadata such as offsets is read from ZooKeeper);
Parameter 3 is the consumer group;
Parameter 4 maps each topic to consume to the number of threads used to read that topic's partitions concurrently.
Map<String,Integer> topicConsumerConcurrency = new HashMap<String,Integer>();
topicConsumerConcurrency.put("helloKafkaFromSparkStreaming", 2);
JavaPairReceiverInputDStream<String, String> lines = KafkaUtils.createStream(jssc, "master:2181,slave1:2181,slave2:2181", "MyFirstConsumerGroup", topicConsumerConcurrency);
JavaDStream<String> words = lines.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {
    public Iterable<String> call(Tuple2<String, String> tuple) throws Exception {
        return Arrays.asList(tuple._2.split(" "));
    }
});
Start Kafka (in the background with nohup):
nohup ./kafka-server-start.sh ../config/server.properties &
Create a topic:
./kafka-topics.sh --create --zookeeper master:2181,slave1:2181,slave2:2181 --replication-factor 3 --partitions 1 --topic helloKafkaFromSparkStreaming
Create a producer:
./kafka-console-producer.sh --broker-list master:9092,slave1:9092,slave2:9092 --topic helloKafkaFromSparkStreaming
Create a consumer:
bin/kafka-console-consumer.sh --zookeeper master:2181,slave1:2181,slave2:2181 --topic helloKafkaFromSparkStreaming --from-beginning
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092");
HashSet<String> topics = new HashSet<String>();
topics.add("topics");
JavaPairInputDStream<String, String> messages = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
Scala version: KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
jssc.checkpoint(" "); // must be specified (updateStateByKey requires a checkpoint directory)
JavaPairDStream<String, Integer> wordsCount = pairs.updateStateByKey(
    new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
        public Optional<Integer> call(List<Integer> values, Optional<Integer> state) throws Exception {
            Integer updatedValue = 0;
            if (state.isPresent()) {
                updatedValue = state.get();
            }
            for (Integer value : values) {
                updatedValue += value;
            }
            return Optional.of(updatedValue);
        }
    });
val blackList = Array(("hadoop", true), ("mahout", true))
val blackListRDD = ssc.sparkContext.parallelize(blackList, 8)
val adsClickStream = ssc.socketTextStream("master", 9999)
// ad format: time name
// after the map: (name, "time name")
val adsClickStreamFormatted = adsClickStream.map { ads => (ads.split(" ")(1), ads) }
adsClickStreamFormatted.transform(userClickRDD => {
  val joinedBlackListRDD = userClickRDD.leftOuterJoin(blackListRDD)
  val validClicked = joinedBlackListRDD.filter(joinedItem => {
    if (joinedItem._2._2.getOrElse(false)) {
      false
    } else {
      true
    }
  })
  validClicked.map(validClick => validClick._2._1)
})
Note: the window length and the slide interval must both be integer multiples of the batch interval.
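A minimal Java sketch of that constraint (host, port, and interval values are illustrative, and conf is assumed to be a SparkConf as in the other snippets): with a 20-second batch interval, a 60-second window sliding every 20 seconds is legal because both are integer multiples of the batch interval.
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(20)); // batch interval = 20s
JavaPairDStream<String, Integer> pairs = jssc.socketTextStream("master", 9999)
        .mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });
JavaPairDStream<String, Integer> windowedCounts = pairs.reduceByKeyAndWindow(
        new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        },
        Durations.seconds(60),   // window length  = 3 x batch interval
        Durations.seconds(20));  // slide interval = 1 x batch interval
windowedCounts.print();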
// data format: user item
val hottestStream = ssc.socketTextStream("master", 9999)
val searchPair = hottestStream.map(_.split(" ")(1)).map(item => (item, 1))
//val hottestDStream = searchPair.reduceByKeyAndWindow((v1: Int, v2: Int) => v1 + v2, Seconds(60), Seconds(20))
val hottestDStream = searchPair.reduceByKeyAndWindow((v1: Int, v2: Int) => v1 + v2, (v1: Int, v2: Int) => v1 - v2, Seconds(60), Seconds(20))
hottestDStream.transform(hottestItemRDD => {
val top3 = hottestItemRDD.map(pair => (pair._2,pair._1)).sortByKey(false).map(pair => (pair._2,pair._1)).take(3)
for(item <- top3) {
println(item)
}
hottestItemRDD // the return value itself does not matter here; transform just needs to return an RDD
}).print()
val wordCounts = words.map(x=>(x,1)).reduceByKey(_+_)
wordCounts.foreachRDD { rdd =>
rdd.foreachPartition { partitionOfRecords =>
val connection = ConnectionPool.getConnection();
partitionOfRecords.foreach(record => {
val sql = "insert into streaming_itemcount(item,count) values(' "+ record._1+" ' ,"+record._2 +")"
val stmt = connection.createStatement();
stmt.executeUpdate(sql);
})
ConnectionPool.returnConnection(connection)
}
}
A simple connection pool implementation:
public class ConnectionPool {
private static LinkedList<Connection> connectionQueue;
static {
try {
Class.forName("com.mysql.jdbc.Driver");
} catch (ClassNotFoundException e){
e.printStackTrace();
}
}
public synchronized static Connection getConnection() {
try{
if(connectionQueue == null) {
connectionQueue = new LinkedList<Connection>();
for(int i = 0; i < 5; i++) {
Connection conn = DriverManager.getConnection(
"jdbc:mysql://master:3306/sparkstreaming",
"root",
"root");
connectionQueue.push(conn);
}
}
}catch(Exception e) {
e.printStackTrace();
}
return connectionQueue.poll();
}
public static void returnConnection(Connection conn) { connectionQueue.push(conn); }
}
Use Spark Streaming together with Spark SQL to compute, continuously and online, the ranking of the hottest items within each category for an e-commerce site.
// data format: user item category
val userClickLogsDStream = ssc.socketTextStream("master",9999);
val formattedUserClickLogsDStream = userClickLogsDStream.map(clickLog =>
(clickLog.split(" ")(2)+"_"+clickLog.split(" ")(1),1))
val categoryUserClickLogsDStream = formattedUserClickLogsDStream.reduceByKeyAndWindow(_ + _, _ - _, Seconds(60), Seconds(20))
categoryUserClickLogsDStream.foreachRDD{ rdd =>
val categoryItemRow = rdd.map(reducedItem => {
val category = reducedItem._1.split("_")(0)
val item= reducedItem._1.split("_")(1)
val click_count = reducedItem._2
Row(category,item,click_count )
})
val structType = StructType(Array(
  StructField("category", StringType, true),
  StructField("item", StringType, true),
  StructField("click_count", IntegerType, true)
))
val hiveContext = new HiveContext(rdd.sparkContext)
val categoryItemDF = hiveContext.createDataFrame(categoryItemRow, structType)
categoryItemDF.registerTempTable("categoryItemTable")
val resultDataFrame = hiveContext.sql("select category,item,click_count from (select category,item,click_count,row_number()" +
  " over (partition by category order by click_count desc) rank from categoryItemTable) subquery" +
  " where rank <= 3")
val resultRowRDD = resultDataFrame.rdd
resultRowRDD.foreachPartition { partitionOfRecords =>
if(partitionOfRecords.isEmpty) {
} else {
val connection = ConnectionPool.getConnection();
partitionOfRecords.foreach(record => {
val sql = "insert into categorytop3(category,item,click_count) values(' "+ record.getAs("category")+" ' ,' "+record.getAs("item")+" '," + record.getAs("click_count") + ")"
val stmt = connection.createStatement();
stmt.executeUpdate(sql);
})
ConnectionPool.returnConnection(connection)
}
}
}
/**
 * Automatically generates forum data in the following format:
 *   date:      the date, formatted as yyyy-MM-dd
 *   timestamp: the timestamp
 *   userID:    the user ID
 *   pageID:    the page ID
 *   channel:   the forum section ID
 *   action:    View or Register
 */
public class SparkStreamingDataManuallyProducerforKafka extends Thread {
    // forum sections
    static String[] channelNames = new String[]{
        "spark", "scala", "kafka", "Flink", "hadoop", "Storm",
        "Hive", "Impala", "Hbase", "ML"
    };
    static String[] actionNames = new String[]{"View", "Register"};
    private String topic; // the Kafka topic the data is sent to
    private Producer<Integer, String> producerForKaka;
    private static String dataToday;
    private static Random random;

    public SparkStreamingDataManuallyProducerforKafka(String topic) {
        dataToday = new SimpleDateFormat("yyyy-MM-dd").format(new Date());
        this.topic = topic;
        random = new Random();
        Properties conf = new Properties();
        conf.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092");
        conf.put("serializer.class", "kafka.serializer.StringEncoder");
        producerForKaka = new Producer<Integer, String>(new ProducerConfig(conf));
    }

    @Override
    public void run() {
        int counter = 0;
        while (true) {
            counter++;
            String userLog = userLogs();
            System.out.println("product:" + userLog);
            producerForKaka.send(new KeyedMessage<Integer, String>(topic, userLog));
            if (500 == counter) {
                counter = 0;
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public static void main(String[] args) {
        new SparkStreamingDataManuallyProducerforKafka("UserLogs").start();
    }

    private static String userLogs() {
        StringBuffer userLogBuffer = new StringBuffer("");
        long timestamp = new Date().getTime();
        long userID = 0L;
        long pageID = 0L;
        // randomly generated user ID
        userID = random.nextInt((int) 2000);
        // randomly generated page ID
        pageID = random.nextInt((int) 2000);
        // randomly chosen channel
        String channel = channelNames[random.nextInt(10)];
        // randomly chosen action
        String action = actionNames[random.nextInt(2)];
        userLogBuffer.append(dataToday)
            .append("\t")
            .append(timestamp)
            .append("\t")
            .append(userID)
            .append("\t")
            .append(pageID)
            .append("\t")
            .append(channel)
            .append("\t")
            .append(action);
        return userLogBuffer.toString();
    }
}
HashMap<String, String> kafkaParams = new HashMap<String, String>();
kafkaParams.put("metadata.broker.list", "master:9092,slave1:9092,slave2:9092");
HashSet<String> topics = new HashSet<String>();
topics.add("topics");
JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(jssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, topics);
JavaPairDStream<String, String> logDStream = lines.filter(new Function<Tuple2<String, String>, Boolean>() {
public Boolean call(Tuple2<String,String> v1) throws Exception {
String[] logs = v1._2.split("\t");
String action = logs[5];
if("View".equals(action)){
return true;
} else {
return false;
}
}
});
JavaPairDStream<Long, Long> pairs = logDStream.mapToPair(new PairFunction<Tuple2<String, String>, Long, Long>() {
public Tuple2<Long, Long> call(Tuple2<String, String> t) throws Exception {
String[] logs = t._2.split("\t");
Long pageId = Long.valueOf(logs[3]);
return new Tuple2<Long,Long>(pageId,1L);
}
});
JavaPairDStream<Long, Long> wordsCount = pairs.reduceByKey(new Function2<Long, Long, Long>() {
public Long call(Long v1, Long v2) throws Exception {
return v1 + v2;
}
});
wordsCount.print();
private static void onlineJumped(JavaPairInputDStream<String, String> lines) {
    lines.mapToPair(new PairFunction<Tuple2<String, String>, Long, Long>() {
        @Override
        public Tuple2<Long, Long> call(Tuple2<String, String> t) throws Exception {
            String[] logs = t._2().split("\t");
            Long usrId = Long.valueOf(logs[2] != null ? logs[2] : "-1");
            return new Tuple2<Long, Long>(usrId, 1L);
        }
    }).filter(new Function<Tuple2<Long, Long>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Long, Long> v1) throws Exception {
            if (1 == v1._2()) {
                return true;
            } else {
                return false;
            }
        }
    }).count().print();
}
public class Test {
    private static volatile Broadcast<List<String>> broadcastList = null;
    private static volatile Accumulator<Integer> accumulator = null;

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("wordCountOnline");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
        broadcastList = jssc.sparkContext().broadcast(java.util.Arrays.asList("Hadoop", "Mahout", "Hive"));
        accumulator = jssc.sparkContext().accumulator(0, "OnlineBlackCounter");
        JavaReceiverInputDStream lines = jssc.socketTextStream("master", 9999);
        JavaPairDStream<String, Integer> pairs = lines.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });
        JavaPairDStream<String, Integer> wordCount = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        wordCount.foreachRDD(new Function2<JavaPairRDD<String, Integer>, Time, Void>() {
            @Override
            public Void call(JavaPairRDD<String, Integer> rdd, Time time) throws Exception {
                rdd.filter(new Function<Tuple2<String, Integer>, Boolean>() {
                    @Override
                    public Boolean call(Tuple2<String, Integer> wordPair) throws Exception {
                        if (broadcastList.value().contains(wordPair._1())) {
                            accumulator.add(wordPair._2());
                            return false;
                        } else {
                            return true;
                        }
                    }
                }).collect();
                System.out.println("BlackList appeared:" + accumulator.value() + "times");
                return null;
            }
        });
        jssc.start();
        jssc.awaitTermination();
        jssc.close();
    }
}