Requirements
Kafka keeps producing user operation logs of the form (userid, operation, time), while HBase stores (userid, cityid) pairs.
We need to count, for every 5-minute window, how many operations each city had.
Approach:
1. First process the Kafka user logs and bucket each (userid) record into its 5-minute window.
2. Then look the userid up in HBase to map it to its cityid.
3. At this point each record has the shape (time, cityid, 1); one more reduceByKey gives the per-city counts (see the sketch after this list).
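To make the shape of step 3 concrete, here is a tiny sketch using plain Scala collections and made-up sample records; the streaming job further down expresses the same aggregation with reduceByKey on a DStream:

object city_count_sketch {
  def main(args: Array[String]): Unit = {
    // (time bucket, cityid) pairs after the userid -> cityid lookup (sample data)
    val mapped = Seq(
      ("2018-05-17 9:05", "shanghai"),
      ("2018-05-17 9:05", "shanghai"),
      ("2018-05-17 9:05", "beijing")
    ).map(tc => (tc, 1))
    // The reduceByKey of step 3: sum the 1s per (time, cityid) key
    val counts = mapped.groupBy(_._1).mapValues(_.map(_._2).sum)
    counts.foreach { case ((time, city), n) => println(s"$time $city -> $n") }
  }
}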
How the operation logs are simulated into Kafka is not covered again here;
see the earlier Kafka hands-on post, where it is described.
In HBase, userid is used as the rowkey and cityid is stored as a column.
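As a minimal sketch of that layout (table and column names assumed to match the code below: table user_city, column family info, qualifier city), the lookup table can be created and populated through the HBase client API like this; the output table time_city with the same info family can be created the same way:

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object load_user_city {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.zookeeper.quorum", "localhost")
    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin

    // Create the lookup table if it does not exist yet: rowkey = userid, info:city = cityid
    val tableName = TableName.valueOf("user_city")
    if (!admin.tableExists(tableName)) {
      val desc = new HTableDescriptor(tableName)
      desc.addFamily(new HColumnDescriptor("info"))
      admin.createTable(desc)
    }

    // Insert a sample row: user 000001 lives in shanghai
    val table = connection.getTable(tableName)
    val p = new Put(Bytes.toBytes("000001"))
    p.addColumn(Bytes.toBytes("info"), Bytes.toBytes("city"), Bytes.toBytes("shanghai"))
    table.put(p)

    table.close()
    connection.close()
  }
}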
The code
package main.scala

// Requirement:
// 1. HBase holds (userid, cityid) pairs, e.g. 000001 shanghai
// 2. Kafka keeps producing user operation logs (userid, operation, time), e.g. 000001 w 2018-5-17 09:00
// 3. Count the logged operations every 5 minutes, grouped by city
// Kafka topic: city_count
import java.sql.Date
import java.text.SimpleDateFormat

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Duration, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import com.alibaba.fastjson.JSON
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get, Put}
import org.apache.spark.SparkConf
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.util.Bytes
object kafka_spark_hbase {

  // Set up the HBase connection (shared by the lookup and the write-back below)
  val configuration = HBaseConfiguration.create()
  configuration.set("hbase.zookeeper.property.clientPort", "2181")
  configuration.set("hbase.zookeeper.quorum", "localhost")
  val connection = ConnectionFactory.createConnection(configuration)
  val admin = connection.getAdmin()                                  // admin handle
  val table = connection.getTable(TableName.valueOf("user_city"))    // lookup table: userid -> cityid
  val table2 = connection.getTable(TableName.valueOf("time_city"))   // output table for the aggregated counts
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("kafka-spark-demo")
    val scc = new StreamingContext(sparkConf, Duration(10000))   // Spark Streaming context, 10-second batches
    // scc.checkpoint(".")                                       // not needed for now
    val topics = Set("city_count")                               // the Kafka topic(s) we need to consume
    val kafkaParam = Map(
      "metadata.broker.list" -> "localhost:9092",                // Kafka broker list
      "auto.offset.reset" -> "smallest"                          // makes the stream consume the topic from the beginning
    )
    // Create the stream: pass in the context, the Kafka config and the topic names
    val stream: InputDStream[(String, String)] = createStream(scc, kafkaParam, topics)
    // Extract (time, userid) from each message, map the userid to its city via HBase,
    // then count the occurrences of each "time cityid" key
    val wordCount = stream.map(l => json_an(l._2)).map(l => (deal_user_city(l.toString), 1)).reduceByKey(_ + _)
    // Write the aggregated results back to HBase
    wordCount.map { line =>
      val key = line._1.toString                                 // rowkey: "time cityid"
      val count = line._2.toString                               // number of operations in that bucket
      val p = new Put(key.getBytes)
      // Specify column and value for the Put (the old put.add method is deprecated)
      p.addColumn("info".getBytes, "city".getBytes, count.getBytes)
      // Commit
      table2.put(p)
      "store success!"
    }.print()
    // wordCount.print()   // print to the console to inspect the result
    scc.start()            // actually start the job
    scc.awaitTermination() // block and wait
  }
  /**
    * Create a stream that reads data from Kafka.
    *
    * @param scc        Spark Streaming context
    * @param kafkaParam Kafka-related configuration
    * @param topics     set of topics to consume
    * @return
    */
  def createStream(scc: StreamingContext, kafkaParam: Map[String, String], topics: Set[String]) = {
    KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](scc, kafkaParam, topics)
  }
  // Round a timestamp string down to the start of its 5-minute bucket
  def formatData(line: String) = {
    val date = new SimpleDateFormat("yyyy-MM-dd H:mm")
    val dateFormated = date.parse(line)                          // the full timestamp
    val dateFormated3 = date.parse(line.split(" ")(0) + " 0:0")  // midnight of the same day
    val dateFormated2 = date.format(dateFormated)
    val dateFormated4 = date.format(dateFormated3)
    val dateFf = date.parse(dateFormated2).getTime               // timestamp in milliseconds
    val dateFf2 = date.parse(dateFormated4).getTime              // midnight in milliseconds
    val r = dateFf - dateFf2                                     // milliseconds elapsed since midnight
    val hash = r / 300000                                        // index of the 5-minute bucket (300000 ms = 5 min)
    val final_date = new Date(hash.toInt * 300000 + dateFf2)     // start of the bucket
    date.format(final_date)
  }
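  // A quick sanity check (sketch) of the bucketing above, assuming inputs in "yyyy-MM-dd H:mm" format:
  //   formatData("2018-05-17 09:07")  // expected: "2018-05-17 9:05"
  //   formatData("2018-05-17 09:00")  // expected: "2018-05-17 9:00"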
  // Map a "(time,userid)" string to "time cityid" by looking the userid up in HBase
  def deal_user_city(str: String) = {
    // str looks like "(time,id)"
    val str_1 = str.split(',')
    val id = str_1(1).split(')')(0)
    // Query HBase with a Get on the userid rowkey
    val g = new Get(id.getBytes)
    val result = table.get(g)
    // Read the value of the info:city column
    val value = Bytes.toString(result.getValue("info".getBytes, "city".getBytes))
    str_1(0).split('(')(1) + ' ' + value
  }
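  // Example (sketch), assuming the user_city table contains the row 000001 -> info:city = shanghai:
  //   deal_user_city("(2018-05-17 9:05,000001)")  // expected: "2018-05-17 9:05 shanghai"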
  // String processing: extract the Connect-style "payload" field from one Kafka message
  // and return (bucketed time, userid); too-short or malformed messages fall through as 1 or "NAN"
  def json_an(str: String) = {
    if (str.length < 10) {
      1
    }
    else {
      val json = JSON.parseObject(str)
      val main_v = json.get("payload")               // payload is "userid,operation,time"
      val v = main_v.toString.split(",")
      if (v.length == 3) {
        (formatData(v(2)), v(0))
      }
      else {
        "NAN"
      }
    }
  }
}
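After the job has run for a while, the aggregated rows can be read back from the time_city table to check the result. The snippet below is a minimal sketch, assuming the same local HBase setup and the info:city column used above; it is not part of the streaming job itself:

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object check_time_city {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.zookeeper.quorum", "localhost")
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf("time_city"))
    // Scan the whole table and print rowkey ("time cityid") and the stored count
    val scanner = table.getScanner(new Scan())
    for (result <- scanner.asScala) {
      val key = Bytes.toString(result.getRow)
      val count = Bytes.toString(result.getValue("info".getBytes, "city".getBytes))
      println(s"$key -> $count")
    }
    scanner.close()
    connection.close()
  }
}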
A note on why the Kafka stream is parsed as JSON: the data is produced through Kafka Connect,
and in that mode the messages written to the topic are JSON documents, with the actual record sitting in the "payload" field.
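For illustration, a message on city_count might look roughly like the string below (a sketch of the Connect envelope; the exact schema block depends on the connector configuration). json_an only cares about the payload field:

import com.alibaba.fastjson.JSON

object json_an_demo {
  def main(args: Array[String]): Unit = {
    // A Connect-style envelope (sketch); the real schema block depends on the connector config
    val msg = """{"schema":{"type":"string","optional":false},"payload":"000001,w,2018-5-17 09:00"}"""
    val payload = JSON.parseObject(msg).get("payload").toString
    println(payload)                    // 000001,w,2018-5-17 09:00
    println(payload.split(",").toList)  // List(000001, w, 2018-5-17 09:00)
  }
}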
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.huangxiao</groupId>
    <artifactId>streaming</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.11</artifactId>
            <version>1.6.3</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.4.4</version>
            <exclusions>
                <exclusion>
                    <groupId>io.netty</groupId>
                    <artifactId>netty-all</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>
</project>