IP 库数据文件(即代码中 IP.load 加载的 17monipdb.dat)可在此下载:http://www.ipip.net/download.html
182.146.100.97 - 3 [03/Jan/2017:23:30:01 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:HJPKZifauy-LOmjJgA5F1uG9ibs= HTTP/1.1" 200 219736 "-" "Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)"
代码案例
hadoop
import java.security.MessageDigest
import java.text.SimpleDateFormat
import java.util.{Locale, Properties}
import IPInfo.IP
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}
/**
* Created by sicong on 2017/4/19.
* 182.146.100.97 - 3 [03/Jan/2017:23:30:01 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:HJPKZifauy-LOmjJgA5F1uG9ibs= HTTP/1.1" 200 219736 "-" "Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)"
*
*/
object paseLogData {
/** JDBC connection properties (user `root`, empty password) used by the MySQL writers in this object. */
val prop: Properties = {
  val jdbcProps = new Properties()
  jdbcProps.setProperty("user", "root")
  jdbcProps.setProperty("password", "")
  jdbcProps
}
/**
 * One parsed access-log line, as built by `parses`.
 *
 * @param user     SHA-1 hex digest of "ip:ua" (see `mixtureUser`)
 * @param ip       first space-separated token of the log line
 * @param country  element 2 of the IP lookup result
 * @param province element 0 of the IP lookup result
 * @param city     element 1 of the IP lookup result
 * @param restime  third token of the log line, parsed as Int
 * @param time     visit time in epoch seconds (see `parseVisitTime`)
 * @param code     HTTP status code
 * @param size     response size taken from the log line
 * @param firm     first element returned by `parseUa`
 * @param device   second element returned by `parseUa`
 * @param rom      third element returned by `parseUa`
 * @param ke       object key extracted from the request URL (see `parseToKey`)
 */
case class Record(
  user: String,
  ip: String,
  country: String,
  province: String,
  city: String,
  restime: Int,
  time: Long,
  code: Int,
  size: Long,
  firm: String,
  device: String,
  rom: String,
  ke: String
)
/** Row shape of the Dataset accepted by `logdevicesMysql`. */
case class Devices(
  city: String,
  num: Int,
  device: String
)

/** IP-to-region mapping row; not referenced by the other members visible in this object. */
case class Ipmap(
  ip: String,
  provinceCode: Int,
  cityCode: Int,
  province: String,
  city: String
)

/** Per-city traffic row written by `CizeFlowStatus`; `flow` is the summed `size` divided by 1024 and rounded. */
case class CityFlow(
  city: String,
  flow: Long
)

/** Per-status-code hit count row written by `HttpStatusCode`. */
case class StatusCode(
  code: Int,
  num: Int
)
/** Bundle of the two non-thread-safe helpers that `setThreadCache` creates once per thread. */
case class ThreadCache(
  dateParser: SimpleDateFormat,
  sha1Digester: MessageDigest
)

/** Thread-local slot for the [[ThreadCache]]; holds null until `setThreadCache` fills it on that thread. */
val threadSafeCache: ThreadLocal[ThreadCache] = new ThreadLocal[ThreadCache]()
/** IP helper instance backing `getIpInfohga`. */
val Iphelpk: IP = new IP()

/**
 * Looks `Str` up through the shared `Iphelpk` instance.
 *
 * @param Str the IP address text handed to `IP.mains`
 * @return whatever `IP.mains` returns for that input, unmodified
 */
def getIpInfohga(Str: String): String =
  Iphelpk.mains(Str)
/**
 * Main entry point of the job: builds a local 4-thread SparkSession, runs `parseLog`
 * on the hard-coded access-log file, and always stops the session afterwards.
 */
def logbegin(): Unit = {
  val spark = SparkSession
    .builder()
    .appName("Spark SQL Example")
    .master("local[4]")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
  try {
    // readsp(spark)
    parseLog("/Users/sicong/Downloads/yitianyike.txt", spark)
  } finally {
    // Release the SparkContext and its threads even when the analysis fails.
    spark.stop()
  }
}
/**
 * Returns the list of log files that still need parsing.
 * Currently a stub that yields a single hard-coded path.
 */
def getNeedParseLog(): Array[String] = {
  // TODO
  // 1. Take the current time minus 7 hours; 2. take the current time minus 12 hours.
  // Use 1 and 2 as the time range to query the list of logs.
  // Compare that list with the recent processing records; if a log has not been
  // processed yet, parse it and mark it as processed once parsing succeeds.
  val pendingLogs = Array[String](
    "/Users/Simon/Downloads/7xna64.com2.z0.glb.qiniucdn.com_2017-01-03-23_0602"
  )
  pendingLogs
}
/**
 * Appends a Dataset of [[Devices]] rows to a table of the local MySQL `test1` database.
 *
 * @param kk the rows to write
 * @param s  the target table name passed to the JDBC writer
 */
def logdevicesMysql(kk: Dataset[Devices], s: String): Unit = {
  val credentials = new Properties()
  credentials.put("user", "root")
  credentials.put("password", "")
  val jdbcUrl = "jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8"
  kk.write
    .mode("append")
    .jdbc(jdbcUrl, s, credentials)
}
/**
 * Sums the response `size` per city, converts each total to units of 1024 (rounded),
 * and appends the result to the MySQL table `test1.CityFlow`.
 *
 * @param logrdd       parsed access-log records
 * @param sparkSession session whose implicits supply the Dataset encoder
 */
def CizeFlowStatus(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
  import sparkSession.implicits._
  val flowPerCity = logrdd
    .map(record => (record.city, record.size))
    // reduceByKey combines on the map side, so only one partial sum per city is shuffled
    // instead of every individual size value (which is what groupByKey would move).
    .reduceByKey(_ + _)
    .map { case (city, totalSize) => CityFlow(city, (totalSize.toDouble / 1024).round) }
  flowPerCity
    .toDS()
    .write
    .mode("append")
    .jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.CityFlow", prop)
}
/**
 * Counts the records per HTTP status code and appends the counts to the MySQL table
 * `test1.StatusCode`.
 *
 * @param logrdd       parsed access-log records
 * @param sparkSession session whose implicits supply the Dataset encoder
 */
def HttpStatusCode(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
  import sparkSession.implicits._
  val hitsPerCode = logrdd
    // Count with (code, 1) pairs rather than grouping whole Records: grouping would
    // shuffle and hold every record of a status code in memory just to take its size.
    .map(record => (record.code, 1))
    .reduceByKey(_ + _)
    .map { case (code, hits) => StatusCode(code, hits) }
  hitsPerCode
    .toDS()
    .write
    .mode("append")
    .jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.StatusCode", prop)
}
/**
 * Prints `(hits, key)` pairs in descending order, where the key is the object key
 * concatenated directly with the city (`ke + city`, no separator).
 *
 * @param logrdd       parsed access-log records
 * @param sparkSession unused here; kept so the signature matches the other reports
 */
def cityTopURL(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
  logrdd
    // Count with (key, 1) pairs instead of grouping full Records only to take their size.
    .map(record => (record.ke + record.city, 1))
    .reduceByKey(_ + _)
    .map { case (key, hits) => (hits, key) }
    // A single partition keeps the printed output in one globally sorted sequence.
    .sortBy(pair => pair, ascending = false, numPartitions = 1)
    .foreach(entry => println(entry))
}
/**
 * Spark parsing entry point: reads the access log at `url`, parses every line into a
 * [[Record]] and runs the three reports on the result.
 *
 * @param url   path of the access-log text file
 * @param spark active session used to read the file and write the reports
 */
def parseLog(url: String, spark: SparkSession): Unit = {
  val rawLines = spark.sparkContext.textFile(url)
  // Lines for which `parses` yields null are dropped.
  // The parsed RDD is cached because each report below triggers its own action; without
  // caching, every action would re-read and re-parse the whole file from scratch.
  val logrdd = rawLines
    .flatMap(line => Option(parses(line)).toList)
    .cache()
  try {
    // Traffic per city.
    CizeFlowStatus(logrdd, spark)
    // Hit count per HTTP status code.
    HttpStatusCode(logrdd, spark)
    // Most requested keys per city.
    cityTopURL(logrdd, spark)
  } finally {
    logrdd.unpersist()
  }
}
/**
 * Looks up the region information of an IP address.
 *
 * Creates a fresh `IP` helper, calls `mains`, drops the first and last character of the
 * returned text and splits the remainder on commas.
 * NOTE(review): presumably the dropped characters are enclosing brackets — confirm against `IP.mains`.
 *
 * @param str the IP address text
 * @return the comma-separated parts of the lookup result
 */
def logprovincecity(str: String): Array[String] = {
  val lookup = new IP()
  val raw = lookup.mains(str)
  val inner = raw.substring(1, raw.length - 1)
  inner.split(",")
}
/**
 * Parses one access-log line into a [[Record]].
 *
 * The line is split on single spaces; token 0 is the client IP, token 2 the response time,
 * token 3 the visit time, token 6 the request URL, token 8 the status code and token 9 the
 * response size. The user agent is the last double-quoted section of the line.
 *
 * @param line one raw access-log line
 * @return the parsed record, or null when the line cannot be parsed (the caller in
 *         `parseLog` filters nulls out), so one malformed line no longer aborts the job
 */
def parses(line: String): Record = {
  import scala.util.control.NonFatal
  try {
    setThreadCache()
    val fields = line.split(" ")
    val ip = fields(0)
    val restime = fields(2).toInt
    val time = parseVisitTime(fields(3))
    val code = fields(8).toInt
    val size = fields(9).toLong
    // Cut out the ua, e.g. Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)
    val ua = line.substring(line.lastIndexOf(" \"") + 2, line.lastIndexOf("\""))
    val region = logprovincecity(ip)
    // NOTE(review): the element order (0 = province, 1 = city, 2 = country) is taken as-is
    // from the original code — confirm it against what `IP.mains` actually returns.
    val province = changeEncodeing(region(0))
    val city = changeEncodeing(region(1))
    val country = changeEncodeing(region(2))
    val (firm, device, rom) = parseUa(ua)
    val user = mixtureUser(ip, ua)
    val ke = parseToKey(fields(6))
    Record(user, ip, country, province, city,
      restime, time, code, size,
      firm, device, rom, ke)
  } catch {
    case NonFatal(e) =>
      // Report and skip instead of failing the whole Spark task on a single bad line.
      System.err.println(s"Skipping unparseable log line (${e.getClass.getSimpleName}): $line")
      null
  }
}
/** Encoding hook for region strings; currently a pass-through that returns its argument unchanged. */
def changeEncodeing(string: String): String = string
/**
 * Converts the visit-time token of a log line (e.g. `[03/Jan/2017:23:30:01`) to epoch seconds.
 *
 * The pattern carries no zone, so the timestamp is interpreted in the JVM default time zone;
 * the `+0800]` part of the log line is a separate token that is not passed in here.
 *
 * @param string the timestamp token, with or without its leading `[`
 * @return seconds since the epoch
 * @throws java.text.ParseException if the text does not match `dd/MMM/yyyy:HH:mm:ss`
 */
def parseVisitTime(string: String): Long = {
  val timestamp = string.stripPrefix("[")
  // English month abbreviations ("Jan") must parse regardless of the machine's default locale.
  val format = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH)
  // Integer division replaces the old "drop the last three digits of the string" trick,
  // which threw for instants less than one second away from the epoch.
  format.parse(timestamp).getTime / 1000
}
/**
 * Ensures the current thread has a [[ThreadCache]] in `threadSafeCache`, creating the date
 * parser and the SHA-1 digest on first use. Does nothing when one is already present.
 */
def setThreadCache(): Unit = {
  if (threadSafeCache.get() == null) {
    // The pattern contains an English month abbreviation (MMM), so pin the locale; this
    // matches parseVisitTime, which also forces English.
    val dateParser = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss ZZZZ", Locale.ENGLISH)
    val sha1 = MessageDigest.getInstance("SHA1")
    threadSafeCache.set(ThreadCache(dateParser, sha1))
  }
}
/** Private IP helper instance backing `getIpInfo`. */
private val Iphelp: IP = new IP()

/**
 * Looks `Str` up through the private `Iphelp` instance.
 *
 * @param Str the IP address text handed to `IP.mains`
 * @return whatever `IP.mains` returns for that input, unmodified
 */
def getIpInfo(Str: String): String =
  Iphelp.mains(Str)
/**
 * Splits an Android-style user agent into three parts.
 *
 * Typical inputs:
 *   AndroidDownloadManager/5.1.1+(Linux;+U;+Android+5.1.1;+OPPO+R9+Plusm+A+Build/LMY47V)
 *   Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX529J+Build/LMY47V)
 *   Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX523J_V1+Build/LMY47V)
 *   Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)
 *   AndroidDownloadManager/5.1+(Linux;+U;+Android+5.1;+OPPO+R9m+Build/LMY47I)
 * The ua field can also hold other text, such as:
 *   -
 *   Java/1.7.0_09
 *   Go-http-client/1.1
 *   VAYXXLWZIKRFDGFHPOXDNHJTDLTNBTV
 *
 * Example result: ("Android 6.0.1", "vivo Y55A", "Build/MMB29M")
 *
 * @param ua the raw user-agent text, with `+` standing in for spaces
 * @return (second-to-last `;` segment, last segment without its final token, final token with
 *         the closing `)` removed); ("Error", "Error", "Error") when `ua` is null or does not
 *         contain at least two `;`-separated segments
 */
def parseUa(ua: String): (String, String, String) = {
  val parsed = for {
    raw <- Option(ua)
    segments = raw.split(";")
    if segments.length >= 2
    tokens = segments.last.split("\\+")
    build <- tokens.lastOption
  } yield (
    segments(segments.length - 2).replaceAll("\\+", " ").trim,
    tokens.init.mkString(" ").trim,
    // The last token still carries the parenthesis that closes the "(Linux;..." group.
    build.stripSuffix(")")
  )
  parsed.getOrElse(("Error", "Error", "Error"))
}
/**
 * Derives a pseudo user id from the client IP and user agent.
 *
 * @param ip the client IP text
 * @param ua the raw user-agent text
 * @return the `hash` of "ip:ua"
 */
def mixtureUser(ip: String, ua: String): String = {
  val identity = s"$ip:$ua"
  hash(identity)
}
/**
 * Returns the lower-case hexadecimal SHA-1 digest of `s`.
 *
 * @param s the text to hash; encoded as UTF-8 so the result does not depend on the
 *          platform default charset of the machine running the task
 * @return 40 hexadecimal characters
 */
def hash(s: String): String = {
  // Make sure this thread has its own digest: without this, calling hash on a thread
  // that never ran setThreadCache would fail with a NullPointerException.
  setThreadCache()
  val digest = threadSafeCache.get().sha1Digester
    .digest(s.getBytes(java.nio.charset.StandardCharsets.UTF_8))
  digest.map("%02x".format(_)).mkString
}
/**
 * Extracts the object key from a request URL: the text between the first `/` after the
 * host and the start of the query string.
 *
 * Example: `http://host/Fq9M.jpg?imageView2/2/w/1080` yields `Fq9M.jpg`.
 *
 * @param url the request URL from the log line
 * @return the key, or an empty string when the URL has no path before the query string
 */
def parseToKey(url: String): String = {
  // Locate the host from the scheme separator instead of assuming a fixed prefix length,
  // which missed the path slash for short "http://" hosts.
  val schemeEnd = url.indexOf("://")
  val hostStart = if (schemeEnd >= 0) schemeEnd + 3 else 0
  val queryStart = url.indexOf('?', hostStart)
  val end = if (queryStart >= 0) queryStart else url.length
  val pathSlash = url.indexOf('/', hostStart)
  // A slash that only occurs inside the query string does not start a path.
  if (pathSlash < 0 || pathSlash >= end) "" else url.substring(pathSlash + 1, end)
}
/**
 * Placeholder IP lookup: the real implementation is commented out, so this always
 * returns an empty array regardless of `Str`.
 */
def getIpInfos(Str: String): Array[String] = {
  // val hell = new hello();
  // hell.getipdata(Str).split(";")
  Array.empty[String]
}
/**
 * Reads the table `test1.tutorials_tbl` from the local MySQL server over JDBC and prints
 * every row.
 *
 * @param spark active session used for the JDBC read
 */
def readsp(spark: SparkSession): Unit = {
  import spark.implicits._
  val credentials = new Properties()
  credentials.put("user", "root")
  credentials.put("password", "")
  val tutorials = spark.read.jdbc("jdbc:mysql://localhost:3306", "test1.tutorials_tbl", credentials)
  tutorials.foreach(row => println(row))
}
/**
 * Program entry point: loads the IP database, then runs the log analysis.
 *
 * @param args optional; `args(0)` overrides the path of the IP database file. Without
 *             arguments the original hard-coded path is used.
 */
def main(args: Array[String]): Unit = {
  // Load the IP database before any lookup is attempted.
  val ipDatabasePath = args.headOption.getOrElse("/Users/sicong/scalas/17monipdb.dat")
  IP.load(ipDatabasePath)
  logbegin()
}
}