package hadoop
import java.security.MessageDigest
import java.text.SimpleDateFormat
import IPInfo.IP
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}
import java.util.{Locale, Properties}
/**
* Created by sicong on 2017/1/13.
*/
object YiTianYiKeAnalyser {
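// Shared JDBC credentials for the local MySQL sink.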
val prop = new Properties()
prop.put("user", "root")
prop.put("password", "")
case class Record(user: String, ip: String, country: String, province: String, city: String,
restime: Int, time: Long, code: Int, size: Long,
firm: String, device: String, rom: String, ke: String)
case class Devices(city: String, num: Int, device: String)
case class Ipmap(ip: String, provinceCode: Int, cityCode: Int, province: String, city: String)
case class CityFlow(city: String, flow: Long)
case class StatusCode(code: Int, num: Int)
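// SimpleDateFormat and MessageDigest are not thread-safe, so each thread caches
// its own instances through the ThreadLocal below.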
case class ThreadCache(dateParser: SimpleDateFormat, sha1Digester: MessageDigest)
val threadSafeCache = new ThreadLocal[ThreadCache]()
val Iphelpk = new IP()
def getIpInfohga(Str: String): String = {
Iphelpk.mains(Str)
}
def logbegin(): Unit = {
val spark = SparkSession
.builder()
.appName("YiTianYiKeAnalyser")
.master("local[4]")
.getOrCreate()
// readsp(spark)
parseLog("/Users/sicong/Downloads/yitianyike.txt", spark)
}
def getNeedParseLog(): Array[String] = {
// TODO
// 1. Take the current time minus 7 hours; 2. take the current time minus 12 hours.
// Using 1 and 2 as the time window, list the log files that fall inside it.
// Compare the list against recent processing records; parse every log that has not
// been processed yet and mark it as processed on success.
Array("/Users/Simon/Downloads/7xna64.com2.z0.glb.qiniucdn.com_2017-01-03-23_0602")
}
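// A minimal sketch of the TODO above, assuming hourly log files named after the
// sample path pattern "<domain>_yyyy-MM-dd-HH_NNNN". candidateLogPrefixes is a
// hypothetical helper: the "_NNNN" suffix and the comparison against already
// processed records are left out.
def candidateLogPrefixes(now: Long = System.currentTimeMillis()): Seq[String] = {
val hourFormat = new SimpleDateFormat("yyyy-MM-dd-HH", Locale.ENGLISH)
val hourMillis = 3600 * 1000L
// One prefix per hour in the window [now - 12h, now - 7h].
(7 to 12).map(h =>
s"7xna64.com2.z0.glb.qiniucdn.com_${hourFormat.format(new java.util.Date(now - h * hourMillis))}")
}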
def logdevicesMysql(kk: Dataset[Devices], s: String): Unit = {
// Reuse the object-level JDBC credentials instead of rebuilding them here.
kk.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", s, prop)
}
def CityFlowStatus(logrdd: RDD[Record], sparkSession: SparkSession) = {
import sparkSession.implicits._
// Sum the response sizes per city and convert bytes to KB; reduceByKey avoids
// shuffling every record the way groupByKey would.
val dataOfFlow = logrdd.map(x => (x.city, x.size)).reduceByKey(_ + _)
.map(x => (x._1, (x._2.toDouble / 1024).round))
val logMysqldata = dataOfFlow.map(x => CityFlow(x._1, x._2)).toDS()
logMysqldata.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.CityFlow", prop)
}
def HttpStatusCode(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
import sparkSession.implicits._
// Count the requests per HTTP status code.
val logMysqldata = logrdd.map(x => (x.code, 1)).reduceByKey(_ + _)
.map(x => StatusCode(x._1, x._2)).toDS()
logMysqldata.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.StatusCode", prop)
}
def cityTopURL(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
// Count the requests per (URL key + city) pair and print them in descending order.
// Note: foreach runs on the executors, so this only prints locally in local mode.
logrdd.map(x => (x.ke + x.city, 1)).reduceByKey(_ + _)
.map(x => (x._2, x._1)).sortBy(x => x, ascending = false, numPartitions = 1)
.foreach(x => println(x))
}
def parseLog(url: String, spark: SparkSession): Unit = {
import spark.implicits._
val lines = spark.sparkContext.textFile(url)
// parses returns null for malformed lines; Option(...) drops those records.
val logrdd = lines.flatMap(line => Option(parses(line)))
// Cache logrdd in memory: every action below would otherwise recompute the whole
// lineage from the source file.
logrdd.cache()
// Aggregate the traffic per city:
// CityFlowStatus(logrdd, spark)
// Compute the share of each HTTP status code:
// HttpStatusCode(logrdd, spark)
// cityTopURL(logrdd, spark)
// val logrdds=logrdd.map(x=>(x.ip+x.device,x))
// .groupByKey().map(x=>(x._2.head))
// val datacount=logrdds.count()
// val deviceOfNum=logrdds.map(x=>(x.device,x)).groupByKey().map(x=>(x._2.head.device,x._2.head.city,x._2.size))
// val kk= deviceOfNum.flatMap(x=>{
// Seq(Devices(x._2,x._3,x._1))
// }).toDS()
// logdevicesMysql(kk,"test1.Devices")
// logrdd.foreach(x=>println(x))
// spark.sql("set names utf8")
// val jdbcDF = spark.read
// .format("jdbc")
// .option("url", "jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8")
// .option("dbtable", "test1.Devices")
// .option("user", "root")
// .option("password", "")
// .load()
//
// How to write results into MySQL:
// val prop = new Properties()
// prop.put("user", "root")
// prop.put("password", "")
// kk.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.Devices",prop)
// .map(x => (x.ip+x.device, x))
//.groupByKey().filter(x=>x._2.size > 1).map(x=>x._2.head).take(1000).foreach(x=>{println(x.device)})
// .foreach {
// x =>
// println(s"ip:${x._1}")
// x._2.foreach(y => println(y.device))
// println()
// }
// .map(x => (x.ip, x)).groupByKey().map(_._2.head).toDS()
// logrdd.createOrReplaceTempView("fusion")
// spark.sql("select * from fusion").foreach(x => println(x))
// spark.sql("select count(*) from fusion").foreach(x => print(x))
}
def logprovincecity(str: String): Array[String] = {
// Reuse the shared IP helper; the result looks like "[province,city,country,...]",
// so strip the brackets and split on commas.
val data = Iphelp.mains(str)
data.substring(1, data.length - 1).split(",")
}
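// Field layout after split(" "), per the sample log lines below:
// as(0) = client IP, as(2) = response time, as(6) = request URL,
// as(8) = HTTP status code, as(9) = response size in bytes.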
def parses(line: String): Record = {
try {
setThreadCache()
val as = line.split(" ")
val ip = as(0)
val restime = as(2).toInt
val time = parseToDate(line) // epoch seconds, parsed from the bracketed timestamp
val code = as(8).toInt
val size = as(9).toLong
val ua = line.substring(line.lastIndexOf(" \"") + 2, line.lastIndexOf("\""))
val region = logprovincecity(ip)
val province = changeEncoding(region(0))
val city = changeEncoding(region(1))
val country = changeEncoding(region(2))
val driver = parseUa(ua)
val firm = driver._1
val device = driver._2
val rom = driver._3
val user = mixtureUser(ip, ua)
val ke = parseToKey(as(6))
Record(user, ip, country, province, city,
restime, time, code, size,
firm, device, rom, ke)
} catch {
case e: Exception =>
println(s"wrong line: ${line}")
null
}
}
def changeEncoding(string: String): String = {
// Placeholder for re-encoding the region strings; currently a no-op.
string
}
// 106.18.21.156 - 282 [03/Jan/2017:23:30:14 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/Fjm_mLtcPN3DbTtLpywOmX5gq9cl.jpg?imageView2/2/w/1080/h/1920&e=1483545599&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:ffDUURujc65VJLj1mKdGDMOrhIg= HTTP/1.1" 200 478114 "-" "AndroidDownloadManager/5.1+(Linux;+U;+Android+5.1;+OPPO+R9m+Build/LMY47I)"
// 139.148.121.96 - 248 [03/Jan/2017:23:30:11 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/FiU3bxGjI6PutwVphDQQihBgP0uw.jpg?imageView2/2/w/1080/h/1920&e=1483545599&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:1wKdyBO_iYMQh7_MBqGcifYQX50= HTTP/1.1" 200 552867 "-" "AndroidDownloadManager/5.1.1+(Linux;+U;+Android+5.1.1;+OPPO+R9+Plusm+A+Build/LMY47V)"
// 220.178.4.219 - 1 [03/Jan/2017:23:30:35 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/FiwmSuSIuu981zLWENSCOJvIoj2P.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:vsvEgQcb8-cU3BDLNp6sLCG72DI= HTTP/1.1" 200 456693 "-" "Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX529J+Build/LMY47V)"
def setThreadCache(): Unit = {
val cache = threadSafeCache.get()
if (cache == null) {
// Locale.ENGLISH so the English month names ("Jan", "Feb", ...) always parse.
val dateParser = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss ZZZZ", Locale.ENGLISH)
val sha1 = MessageDigest.getInstance("SHA1")
threadSafeCache.set(ThreadCache(dateParser, sha1))
}
}
def parseToDate(line: String): Long = {
threadSafeCache.get().dateParser.parse(line.substring(line.indexOf("[") + 1, line.indexOf("]"))).getTime() / 1000
}
private val Iphelp = new IP()
def getIpInfo(Str: String): String = {
Iphelp.mains(Str)
}
def parseRegion(ip: String) = {
// A minimal sketch: delegate to logprovincecity, which returns the region as
// (province, city, country, ...) per the usage in parses above.
val region = logprovincecity(ip)
(region(0), region(1))
}
// AndroidDownloadManager/5.1.1+(Linux;+U;+Android+5.1.1;+OPPO+R9+Plusm+A+Build/LMY47V)
// Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX529J+Build/LMY47V)
// Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX523J_V1+Build/LMY47V)
// Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)
// AndroidDownloadManager/5.1+(Linux;+U;+Android+5.1;+OPPO+R9m+Build/LMY47I)
// The ua field can also contain other values:
// -
// Java/1.7.0_09
// Go-http-client/1.1
// VAYXXLWZIKRFDGFHPOXDNHJTDLTNBTV
// ("Android 6.0.1", "vivo Y55A", "Build/MMB29M")
def parseUa(ua: String): (String, String, String) = {
// Split the UA on ";" and read from the end: the last segment holds
// "<device>+Build/<rom>)", the one before it the Android version. The trailing
// ")" is stripped so rom matches the expected output documented above.
try {
val t1 = ua.split(";").reverse
val t2 = t1(0).split("\\+")
(t1(1).replaceAll("\\+", " ").trim,
t2.slice(0, t2.length - 1).mkString(" ").trim,
t2(t2.length - 1).stripSuffix(")"))
} catch {
case e: Exception =>
("Error", "Error", "Error")
}
}
def mixtureUser(ip: String, ua: String) = {
hash(ip + ":" + ua)
}
def hash(s: String): String = {
threadSafeCache.get().sha1Digester.digest(s.getBytes).map("%02x".format(_)).mkString
}
def parseToKey(url: String) = {
// Skip the scheme: "https://a" is at least 9 characters, so search from index 9.
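// Example from the sample log above:
// parseToKey("http://7xna64.com2.z0.glb.qiniucdn.com/Fjm_mLtcPN3DbTtLpywOmX5gq9cl.jpg?imageView2/...")
// == "Fjm_mLtcPN3DbTtLpywOmX5gq9cl.jpg"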
val l = url.indexOf("?", 9)
val end = if (l > 0) l else url.length()
url.substring(url.indexOf("/", 9) + 1, end)
}
def getIpInfos(Str: String): Array[String] = {
// Stub: the earlier implementation delegated to a "hello" helper class.
// val hell = new hello();
// hell.getipdata(Str).split(";")
Array()
}
def readsp(spark: SparkSession): Unit = {
// Reuse the object-level JDBC credentials defined above.
val jdbcDF2 = spark.read
.jdbc("jdbc:mysql://localhost:3306", "test1.tutorials_tbl", prop)
jdbcDF2.foreach(x => println(x))
}
def main(args: Array[String]): Unit = {
IP.load("/Users/sicong/Downloads/17monipdb/17monipdb.dat")
logbegin()
}
}