从网上找到SparkSql分析慕课网日志数据的视频,用其中的数据巩固复习上个星期学的Spark知识;
需求很简单,就是根据日志数据从地市、流量、点击数三个方面求一系列topN。
原始数据:
提取我们需要的字段:1.ip ; 2.时间+市区 ;3.流量(状态码后面的);4.页面发送的URL;
183.162.52.7 - - [10/Nov/2016:00:01:02 +0800] "POST /api3/getadv HTTP/1.1" 200 813 "www.imooc.com" "-" cid=0&timestamp=1478707261865&uid=2871142&marking=androidbanner&secrect=a6e8e14701ffe9f6063934780d9e2e6d&token=f51e97d1cb1a9caac669ea8acc162b96 "mukewang/5.0.0 (Android 5.1.1; Xiaomi Redmi 3 Build/LMY47V),Network 2G/3G" "-" 10.100.134.244:80 200 0.027 0.027
10.100.0.1 - - [10/Nov/2016:00:01:02 +0800] "HEAD / HTTP/1.1" 301 0 "117.121.101.40" "-" - "curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.2.3 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2" "-" - - - 0.000
117.35.88.11 - - [10/Nov/2016:00:01:02 +0800] "GET /article/ajaxcourserecommends?id=124 HTTP/1.1" 200 2345 "www.imooc.com" "http://www.imooc.com/code/1852" - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36" "-" 10.100.136.65:80 200 0.616 0.616
182.106.215.93 - - [10/Nov/2016:00:01:02 +0800] "POST /socket.io/1/ HTTP/1.1" 200 94 "chat.mukewang.com" "-" - "android-websockets-2.0" "-" 10.100.15.239:80 200 0.004 0.004
用SparkSql对数据进行处理需要下面几个步骤:
第一次清洗:
切出字段,并且转换时间格式
// Data cleaning, pass 1.
// Input : access time, requested url, traffic, ip (taken from the raw log line).
// Output of the whole cleaning pipeline: url, cmsType (video/article), cmsId (course id),
//         traffic, ip, city info, access time, day.
// This first pass only yields (access time, url, traffic, ip); the remaining columns,
// including the city lookup, are derived in the second cleaning pass.
val domain = "http://www.imooc.com/"
val cleardata = data.map(line => {
  val splits = line.split(" ")
  val ip = splits(0)
  // Fields 3 and 4 together hold the bracketed timestamp, e.g. "[10/Nov/2016:00:01:02 +0800]".
  val time = splits(3) + " " + splits(4)
  val url = splits(11).replaceAll("\"", "")
  val traffic = splits(9).toLong
  // The IP -> city lookup is intentionally NOT done here: its result was never used in this
  // pass, and the second pass performs the lookup when it builds the final Row.
  (DateUtils.parse(time), url, traffic, ip)
}).filter(_._2.length > 2) // keep only records whose url has more than 2 characters (drops "-")
第二次清洗(先给出解析IP所属地区用到的IpUtils工具方法,再进行二次清洗):
/**
 * Converts a dotted IP string into a single number.
 *
 * The string is split on '.', and each fragment is parsed as a Long and OR-ed into the
 * accumulated value after that value has been shifted left by 8 bits.
 *
 * @param ip dotted IP address, e.g. "1.2.3.4"
 * @return the numeric form of the address
 */
def ip2Long(ip: String): Long = {
  val fragments = ip.split("[.]")
  fragments.foldLeft(0L) { (acc, part) => part.toLong | (acc << 8L) }
}
/**
 * Loads the IP rules file into memory.
 *
 * Every line is split on '|'; field 2 is read as the numeric range start, field 3 as the
 * numeric range end and field 6 as the province name.
 *
 * @param path location of the rules file
 * @return one (start, end, province) tuple per line, in file order
 */
def readRules(path: String): Array[(Long, Long, String)] = {
  val bf: BufferedSource = Source.fromFile(path)
  try {
    // toArray forces the lazy line iterator while the file is still open.
    bf.getLines().map { line =>
      val fields = line.split("[|]")
      (fields(2).toLong, fields(3).toLong, fields(6))
    }.toArray
  } finally {
    // Always release the file handle, even when a line fails to parse.
    bf.close()
  }
}
/**
 * Binary-searches the rule ranges for the one that contains `ip`.
 *
 * The search assumes `lines` is sorted by range start (and that ranges do not overlap),
 * as any binary search does.
 *
 * @param lines (start, end, province) ranges
 * @param ip    numeric IP to look up
 * @return index of the range with start <= ip <= end, or -1 when no range contains `ip`
 */
def binarySearch(lines: Array[(Long, Long, String)], ip: Long) : Int = {
  var low = 0
  var high = lines.length - 1
  var found = -1
  while (found < 0 && low <= high) {
    // Unsigned shift keeps the midpoint correct even if low + high exceeds Int.MaxValue.
    val middle = (low + high) >>> 1
    val range = lines(middle)
    if (ip < range._1) {
      high = middle - 1
    } else if (ip > range._2) {
      low = middle + 1
    } else {
      found = middle
    }
  }
  found
}
// IP rules, parsed once on first use and then kept in memory, so that looking up one IP
// does not re-read and re-parse the whole rules file.
private lazy val ipRules: Array[(Long, Long, String)] = readRules("/Users/zx/Desktop/ip/ip.txt")

/**
 * Resolves an IP address to the province recorded in the IP rules.
 *
 * @param ip dotted IP address
 * @return the province of the matching rule, or "unknown" when no rule range contains the
 *         address (previously this case failed with an ArrayIndexOutOfBoundsException)
 */
def convertIP(ip: String): String = {
  // Convert the address to its numeric form so it can be compared with the rule ranges.
  val ipNum = ip2Long(ip)
  val index = binarySearch(ipRules, ipNum)
  // binarySearch returns -1 for addresses outside every range.
  if (index >= 0) ipRules(index)._3 else "unknown"
}
// Data cleaning, pass 2: turns every (time, url, traffic, ip) tuple into a Row of
// (time, cmsType, cmsId, ip, traffic, url, city, day).
val clearedData: RDD[Row] = cleardata.map(line => {
  val ip = line._4
  val city = IpUtils.convertIP(ip)
  // cmsType / cmsId are the first two path segments after `domain`. When the url does not
  // contain `domain`, or has no second segment, they stay at "" / 0 and the url is kept.
  // Any failure while parsing (e.g. a non-numeric id) blanks all three values.
  val (url, cmsType, cmsId) =
    try {
      val rawUrl = line._2.toString
      val domainAt = rawUrl.indexOf(domain)
      val cmsTypeId =
        if (domainAt >= 0) rawUrl.substring(domainAt + domain.length).split("/")
        else Array.empty[String] // domain not present: nothing to extract
      if (cmsTypeId.length > 1) (rawUrl, cmsTypeId(0), cmsTypeId(1).toLong)
      else (rawUrl, "", 0L)
    } catch {
      case _: Exception => ("", "", 0L)
    }
  val traffic = line._3
  val time = line._1
  // The first 10 characters of the parsed time are used as the day column.
  val day = time.substring(0, 10)
  Row(time, cmsType, cmsId, ip, traffic, url, city, day)
})
整体还是挺简单的,和视频里面有点不一样:
/**
 * Computes, for one day, the most visited courses per course type and writes them to MySQL.
 *
 * @param spark    active Spark session
 * @param accessDF cleaned access data; must provide the day, cmsType and cmsId columns
 * @param day      day to report on, compared against the `day` column
 * @param topN     number of top courses kept per cmsType
 */
def ClassAccessTopNStat(spark:SparkSession,accessDF:DataFrame,day:String,topN:Int):Unit={
  import spark.implicits._
  // createOrReplaceTempView keeps the function re-runnable in one session (e.g. once per day);
  // createTempView throws when the view name is already registered.
  accessDF.createOrReplaceTempView("ClassCount")
  // Visit count of every course, per course type, on the given day.
  val ClassAccessDF = spark.sql(s"select day,cmsType,cmsId,count(*) as counts from ClassCount where day='$day' GROUP BY cmsType,cmsId,day")
  // Top-N most popular courses inside each course type (sub_rk), plus a rank across the
  // retained rows (cms_rk).
  ClassAccessDF.createOrReplaceTempView("ClassTopN")
  val ClassAccessTopDF=spark.sql(s"select * ,dense_rank() over(order by counts desc) cms_rk from (select day,cmsType,cmsId,counts,row_number() over(partition by cmsType order by counts desc) sub_rk from ClassTopN) temp where sub_rk<=$topN")
  ClassAccessTopDF.foreachPartition(it => {
    val list = new ListBuffer[TopClassBean]
    it.foreach(tp => {
      val rowDay = tp.getAs[String]("day")
      val cmsType = tp.getAs[String]("cmsType")
      val cmsId = tp.getAs[Long]("cmsId")
      val counts = tp.getAs[Long]("counts")
      val sub_rk = tp.getAs[Int]("sub_rk")
      val cms_rk = tp.getAs[Int]("cms_rk")
      list.append(new TopClassBean(rowDay, cmsType, cmsId, counts, sub_rk, cms_rk))
    })
    // One batch write per partition; skip partitions that produced no rows.
    if (list.nonEmpty) MysqlUtils.TopNClassdataToMysql(list)
  })
}
// Window per city, ordered by descending counts, whose frame runs from the first row of the
// partition up to and including the current row.
val windowSpec2 = Window
  .partitionBy("city")
  .orderBy(col("counts").desc)
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)
/**
 * Ranks cities by visit count for one course type on one day and writes the top ones to MySQL.
 *
 * @param spark    active Spark session
 * @param accessDF cleaned access data; must provide the day, cmsType and city columns
 * @param day      day to report on, compared against the `day` column
 * @param topN     number of top-ranked cities kept (dense rank, so ties share a rank)
 * @param cmsType  course type whose city ranking is written out
 */
def CityAccessTopNStat(spark: SparkSession, accessDF: DataFrame, day: String, topN: Int,cmsType:String):Unit = {
  import spark.implicits._
  // createOrReplaceTempView keeps the function re-runnable in one session;
  // createTempView throws when the view name is already registered.
  accessDF.createOrReplaceTempView("cityCount")
  val temp1=spark.sql(s"select day,cmsType,city,count(*) as counts from cityCount where day='$day' group by day,cmsType,city")
  // Rank cities by visit count inside every course type. No pre-sort is needed: the window
  // specification does its own partitioning and ordering.
  val windowSpec = Window.partitionBy("cmsType").orderBy(col("counts").desc)
  val cms_temp3 = temp1.select(col("day"),
    col("cmsType"),
    col("city"),
    col("counts"),
    dense_rank().over(windowSpec).as("city_rk")
  ).where($"cmsType"===cmsType)
    .where($"city_rk" <= topN) // apply the requested top-N cut-off
  cms_temp3.foreachPartition(tp=>{
    val list=new ListBuffer[CityClassBean]
    tp.foreach(it=>{
      val rowDay=it.getAs[String]("day")
      val rowCmsType=it.getAs[String]("cmsType")
      val city=it.getAs[String]("city")
      val counts=it.getAs[Long]("counts")
      val city_rk=it.getAs[Int]("city_rk")
      list.append(new CityClassBean(rowDay,rowCmsType,city,counts,city_rk))
    })
    // One batch write per partition; skip partitions that produced no rows.
    if (list.nonEmpty) MysqlUtils.TopNCitydataToMysql(list)
  })
  // Running total of counts per city, ordered by descending counts.
  // NOTE(review): citytemp2 is only defined here; no action is ever run on it, so nothing is
  // computed or stored. The column is named "movingAvg" although it holds a cumulative sum.
  val windowSpec2 = Window.partitionBy("city").orderBy(col("counts").desc).rowsBetween(Long.MinValue,0)
  val citytemp2=temp1.select(
    col("day"),
    col("cmsType"),
    col("city"),
    col("counts"),
    sum(col("counts")).over(windowSpec2).as("movingAvg")
  )
}
数据通过Bean元素传输;
/**
 * Writes one partition's worth of top-N course records into the class_topn table in a single
 * manually committed batch.
 *
 * Failures are reported with printStackTrace and the open transaction is rolled back; the
 * exception is not rethrown.
 *
 * @param list records to insert
 */
def TopNClassdataToMysql(list: ListBuffer[TopClassBean]): Unit = {
  // JDBC handles start as null so the finally block can release whatever was actually opened.
  var conn: Connection = null
  var pstm: PreparedStatement = null
  try {
    conn = getConnection()
    conn.setAutoCommit(false) // commit manually, once per batch
    // Assign to the outer variable (not a new local) so that release() receives the real statement.
    pstm = conn.prepareStatement("INSERT INTO class_topn(day,cmsType,cmsId,counts,sub_rk,cms_rk) VALUES (?,?,?,?,?,?)")
    for (tp <- list) {
      pstm.setString(1, tp.day)
      pstm.setString(2, tp.cmsType)
      pstm.setLong(3, tp.cmsId)
      pstm.setLong(4, tp.counts)
      pstm.setInt(5, tp.sub_rk)
      pstm.setInt(6, tp.cms_rk)
      pstm.addBatch()
    }
    pstm.executeBatch()
    conn.commit()
  } catch {
    case exception: Exception =>
      exception.printStackTrace()
      // Discard the partially executed batch instead of leaving the transaction open.
      if (conn != null) {
        try conn.rollback()
        catch { case rollbackFailure: Exception => rollbackFailure.printStackTrace() }
      }
  } finally {
    release(conn, pstm)
  }
}