import java.io.PrintWriter
import java.text.SimpleDateFormat
import java.util.{Date, Properties}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import scala.util.Random
object MockData {

  /**
   * Build a string of random digits.
   * @param index  number of digits to generate
   * @param random shared random generator
   * @return a string of `index` random digits, e.g. "40273" for index = 5
   */
  def randomNum(index: Int, random: Random): String =
    (0 until index).map(_ => random.nextInt(10)).mkString
  /**
   * Draw a random number in [0, num) and left-pad it with zeros to `index` digits,
   * e.g. fillZero(2, 20, random) may return "07", fillZero(4, 20, random) may return "0019".
   * @param index  total number of digits after padding
   * @param num    exclusive upper bound of the random value
   * @param random shared random generator
   */
  def fillZero(index: Int, num: Int, random: Random): String =
    ("%0" + index + "d").format(random.nextInt(num))
  // A single shared Kafka producer; creating and closing a producer for every message would be far too expensive.
  private lazy val kafkaProducer: KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", "star.com:9092")
    new KafkaProducer[String, String](props, new StringSerializer(), new StringSerializer())
  }

  /**
   * Send one record to the Kafka topic "topic_car".
   * @param content a tab-separated record
   */
  def sendKafkaData(content: String): Unit = {
    kafkaProducer.send(new ProducerRecord[String, String]("topic_car", content))
  }
  /**
   * Open the local output file.
   */
  def initFileData(path: String): PrintWriter = {
    new PrintWriter(path)
  }

  /**
   * Append one record to the local file.
   * @param content record content
   */
  def sendFileData(pw: PrintWriter, content: String): Unit = {
    pw.write(content + "\n")
  }

  def fileDataClose(pw: PrintWriter): Unit = {
    pw.close()
  }
  def mock(): Unit = {
    val pw = initFileData("G:/data/MockData.txt")
    val random = new Random()
    for (_ <- 1 to 200) {
      // today's date
      val day = new SimpleDateFormat("yyyy-MM-dd").format(new Date())
      val locations = Array("鲁", "豫", "湘", "广", "深", "沪", "晋", "粤", "赣", "京")
      // random uppercase letter (ASCII 65-90)
      val letter = (65 + random.nextInt(26)).toChar
      // five random digits
      val digits = randomNum(5, random)
      // license plate: province prefix + letter + five digits
      val car = locations(random.nextInt(locations.length)) + letter + digits
      for (_ <- 1 until random.nextInt(100)) {
        // capture time: today's date plus a random HH:mm:ss
        val actionTime = day + " " + fillZero(2, 24, random) + ":" + fillZero(2, 60, random) + ":" + fillZero(2, 60, random)
        // speed: 1-200
        val speed = random.nextInt(200) + 1
        // road id, zero-padded to two digits (00-19)
        val roadId = fillZero(2, 20, random)
        // area id, zero-padded to two digits (00-19)
        val areaId = fillZero(2, 20, random)
        // monitor (checkpoint) id: four digits, the leading two always zero (0000-0019)
        val monitorId = fillZero(4, 20, random)
        // camera id: five digits, a leading "0" plus four random digits
        val cameraId = "0" + randomNum(4, random)
        val content = day + "\t" + monitorId + "\t" + cameraId + "\t" + car + "\t" + actionTime + "\t" + speed + "\t" + roadId + "\t" + areaId
        sendKafkaData(content)
        sendFileData(pw, content)
      }
    }
    fileDataClose(pw)
    kafkaProducer.close() // flush any buffered records and release the producer
    Thread.sleep(50)
  }

  def main(args: Array[String]): Unit = {
    mock()
  }
}
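Each line that mock() emits (to Kafka and to MockData.txt) is a tab-separated record with eight fields in the order day, monitor_id, camera_id, car, action_time, speed, road_id, area_id, which is the same schema Test1 declares when it reads the file. The snippet below is a minimal sketch of how such a line could be parsed back into a typed value; RawRecord and parseRecord are hypothetical helpers for illustration, not part of the original code.

import scala.util.Try

// Hypothetical record type mirroring the tab-separated layout produced by MockData.
case class RawRecord(day: String, monitorId: String, cameraId: String, car: String,
                     actionTime: String, speed: Int, roadId: String, areaId: String)

object RecordParser {
  // Returns None when the line does not have exactly eight fields or the speed is not an integer.
  def parseRecord(line: String): Option[RawRecord] = line.split("\t", -1) match {
    case Array(day, monitorId, cameraId, car, actionTime, speed, roadId, areaId) =>
      Try(speed.toInt).toOption.map(s =>
        RawRecord(day, monitorId, cameraId, car, actionTime, s, roadId, areaId))
    case _ => None
  }
}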
import org.apache.spark.sql.SparkSession
object Test1 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("test1").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    val file = args(0)
    val df = spark.read
      .option("sep", "\t")
      .schema("day string,monitor_id string,camera_id string,car string,action_time string,speed int,road_id string,area_id string")
      .csv(file)
    df.createTempView("mockdata")
    // 9. Read data from data.txt and use SQL to compute, per area and road, the highest captured speed,
    //    ordered by area, road, and highest speed (descending).
    val df1 = spark.sql("select area_id,road_id,max(speed) max_speed from mockdata group by area_id,road_id order by area_id,road_id,max_speed desc")
    df1.show()
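    // For comparison, the same aggregation can be expressed with the DataFrame API instead of SQL.
    // This is only an illustrative sketch; df1Api is not part of the original assignment.
    import org.apache.spark.sql.functions.{col, max}
    val df1Api = df.groupBy("area_id", "road_id")
      .agg(max("speed").as("max_speed"))
      .orderBy(col("area_id"), col("road_id"), col("max_speed").desc)
    df1Api.show()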
    // 10. Read data from data.txt and use SQL to compute each camera's average speed for the day, sorted descending.
    val df2 = spark.sql("select day,camera_id,round(avg(speed),2) avgspeed from mockdata group by day,camera_id order by avgspeed desc")
    df2.show()
    // 11. Read data from data.txt and use SQL to count the distinct vehicles passing each area and road per day, sorted by the count ascending.
    val df3 = spark.sql("select day,road_id,area_id,count(distinct car) countcar from mockdata group by day,road_id,area_id order by countcar")
    df3.show()
    // 12. Read data from data.txt and use SQL to list, for each area and road, the five records with the highest speed, ordered by speed descending.
    val df4 = spark.sql("select t.* from (select *,row_number() over(partition by area_id,road_id order by speed desc) rank from mockdata) t where t.rank<=5")
    df4.show()
    // 13. Read data from data.txt and register a UDF that maps the last field (area_id):
    //     01-04 -> Haidian (海淀区), 05-07 -> Heping (和平区), 08-09 -> Mentougou (门头沟区),
    //     10-11 -> Xicheng (西城区), anything else -> Fangshan (房山区).
    spark.udf.register("get_area", (area_id: Int) => {
      if (area_id >= 1 && area_id <= 4) "海淀区"
      else if (area_id >= 5 && area_id <= 7) "和平区"
      else if (area_id >= 8 && area_id <= 9) "门头沟区"
      else if (area_id >= 10 && area_id <= 11) "西城区"
      else "房山区"
    })
    // 14. Read data from data.txt and use the UDF defined above to compute the average speed per area.
    val df5 = spark.sql("select get_area(area_id) area_name,round(avg(speed),2) avg_speed from mockdata group by area_name")
    df5.show()
    spark.close()
  }
}
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
object Test2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("test2")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(5))
    // 15. Create a Spark Streaming consumer that reads the monitoring records from Kafka.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "star.com:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "use_a_separate_group_id_for_each_stream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean) // offsets are committed manually below
    )
    // subscribe to the topic
    val topics = Array("topic_car")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    // 16. Count the distinct license plates captured within each 5-second batch and print the total.
    val cars = stream.map(line => line.value().split("\t")(3))
    val result1 = cars.transform(rdd => rdd.distinct()).count()
    result1.print()
    // 17. Print the license plates that appear two or more times within a 5-second batch
    //     (show the 2+ occurrence case during the demo).
    val result2 = cars.map((_, 1)).reduceByKey(_ + _).filter(_._2 >= 2)
    result2.print()
    // commit the offsets manually once each batch has been processed
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
    ssc.start()            // start the streaming computation
    ssc.awaitTermination() // block until the context is stopped
  }
}
Test1
bin/spark-submit --class <fully qualified class name> xxxxx.jar file:/opt/spark-2.4.5/xxx.txt (the data file)
Test2
bin/spark-submit --class <fully qualified class name> xxxxx.jar
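For example, assuming a hypothetical assembly jar named traffic.jar with the classes in the default package (adjust the names and paths to your own build):

bin/spark-submit --class Test1 traffic.jar file:/opt/spark-2.4.5/data/MockData.txt   # hypothetical jar/class/path names
bin/spark-submit --class Test2 traffic.jar                                           # hypothetical jar/class names

Note that Test2 needs the spark-streaming-kafka-0-10 dependency, which is not bundled with Spark; either build it into the jar or pass it with --packages (e.g. org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.5 for a Scala 2.11 build of Spark 2.4.5).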