import java.sql.ResultSet
import java.sql.Statement
import java.sql.DriverManager
object OperateHive {
  // Helpers for creating a Hive JDBC connection and executing SQL
  val driverName = "org.apache.hive.jdbc.HiveDriver"

  // Load the Hive JDBC driver class; exit if it is not on the classpath
  try {
    Class.forName(driverName)
  } catch {
    case e: ClassNotFoundException =>
      e.printStackTrace()
      System.exit(1)
  }

  // Open a connection and return a Statement for it
  def buildcon(): Statement = {
    val con = DriverManager.getConnection("jdbc:hive2://address:10000/database", "user", "password")
    con.createStatement()
  }

  // Execute a SQL query and return its ResultSet
  def ExcuteSql(stmt: Statement, sql: String): ResultSet = {
    stmt.executeQuery(sql)
  }

  // Close the Statement when done
  def conClose(stmt: Statement): Unit = {
    stmt.close()
  }
}
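A minimal sketch of how the helper above might be used (the query, table name, and column index are placeholders, not from the original post):

object OperateHiveDemo {
  def main(args: Array[String]): Unit = {
    val stmt = OperateHive.buildcon()
    // placeholder query; replace the table and column names with real ones
    val rs = OperateHive.ExcuteSql(stmt, "select a1 from mytable limit 10")
    while (rs.next()) {
      println(rs.getString(1))
    }
    OperateHive.conClose(stmt)
  }
}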
import org.apache.spark.sql.SparkSession
object ShowLogResult {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[4]")
      .appName("LogAnalyzeshow")
      .getOrCreate()
    // Read the Hive table through the Hive JDBC driver
    val jdbcDF = spark.read.format("jdbc")
      .option("driver", "org.apache.hive.jdbc.HiveDriver")
      .option("url", "jdbc:hive2://address:10000/database")
      .option("dbtable", "tablename")
      .option("user", "youruser")
      .option("password", "yourpassword")
      .load()
    jdbcDF.show()
    // Register a global temp view and query it with Spark SQL
    jdbcDF.createGlobalTempView("log")
    val df = spark.sql("""SELECT b FROM global_temp.log""")
    println(df.collect()(0)(0).toString)
    spark.stop()
  }
}
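As an aside, not part of the original setup: if the Spark job can reach the Hive metastore directly, the JDBC detour can be skipped with enableHiveSupport(). A rough sketch, assuming hive-site.xml is on the classpath and database.tablename is a placeholder:

import org.apache.spark.sql.SparkSession
object ShowLogViaMetastore {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[4]")
      .appName("LogAnalyzeshow")
      .enableHiveSupport() // requires a reachable Hive metastore configuration
      .getOrCreate()
    // placeholder table name; same column as queried above
    spark.sql("SELECT b FROM database.tablename").show()
    spark.stop()
  }
}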
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.ListBuffer
object HiveLogAnalyze {
  // Schema of the rows pulled from the table
  case class logschema(
    date: String,
    time: String,
    c_ip: String,
    cs_referee: String
  )
  def main(args: Array[String]): Unit = {
    // val sparksql = SparkSession
    //   .builder
    //   .master("local[4]")
    //   .appName("LogAnalyze")
    //   .getOrCreate()
    val conf = new SparkConf().setAppName("loganalyze")
      .setMaster("local[4]")
      .set("spark.driver.memory", "2G")
      .set("spark.executor.memory", "2G")
    val sc = new SparkContext(conf)
    // val sparksql = new SQLContext(sc)

    // The table holds 103,510,803 rows, so the query is split into batches with LIMIT offset,rows;
    // note that LIMIT with a large offset gets slower as the offset grows.
    var startline = 0
    val step = 500000
    val sqllist = new ListBuffer[String]
    while (startline < 103510803) {
      val sql = "select a1,a2,a10,a13 from mytable limit " + startline.toString + "," + step.toString
      println(sql)
      startline += step
      sqllist.append(sql)
    }
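    // For reference, the statements generated above look like:
    //   select a1,a2,a10,a13 from mytable limit 0,500000
    //   select a1,a2,a10,a13 from mytable limit 500000,500000
    //   ... and so on, until the offset passes the total row count.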
    for (sql <- sqllist) {
      val stmt = OperateHive.buildcon()
      val LogAll = new ListBuffer[logschema]
      val resultset = OperateHive.ExcuteSql(stmt, sql)
      while (resultset.next()) {
        val data = logschema(
          resultset.getString(1),
          resultset.getString(2),
          resultset.getString(3),
          resultset.getString(4)
        )
        LogAll.append(data)
      }
      OperateHive.conClose(stmt)
      val logrdd = sc.makeRDD(LogAll)
      logrdd.cache()
      // Sort by ip, date and time, group by client IP, and write each group out
      logrdd.sortBy(row => row.c_ip).sortBy(row => row.date).sortBy(row => row.time)
        .groupBy(line => line.c_ip).foreach(row =>
          IoOperation.WriteRddOneTxt(row._1, row._2)
        )
    }
  }
}
// Helper object with the suffix-extraction and file-output functions used above
import java.io.{File, FileWriter, PrintWriter}
import scala.collection.mutable.ListBuffer
import scala.util.matching.Regex

object IoOperation {
  // Extract requests ending in a fixed suffix (.aspx/.html/.shtml); anything with a query string is dropped
  def GetIISHtml(data: String): Option[String] = {
    val partfilter: Regex = """.*?\?.*?""".r
    val opt = partfilter.findFirstMatchIn(data)
    if (opt.isEmpty) {
      val part: Regex = """.*?\.aspx|.*?\.html|.*?\.shtml""".r
      val option = part.findFirstMatchIn(data)
      if (option.isEmpty) {
        None
      } else {
        Some(option.get.group(0))
      }
    } else {
      None
    }
  }
  // Deduplicate adjacent URLs within one IP's records and append them to a txt file
  def WriteRddOneTxt(ipadress: String, data: Iterable[HiveLogAnalyze.logschema]): Unit = {
    val filename = "H:\\IISWebData\\LogResult.txt"
    val file = new File(filename)
    if (!file.exists()) {
      val writer = new PrintWriter(file)
      writer.close()
    }
    val out = new FileWriter(filename, true)
    var DataCheck = "@"
    val Data = new ListBuffer[Array[String]]
    // Filter the records: keep only fixed-suffix URLs and skip consecutive duplicates
    data.foreach { line =>
      val r = IoOperation.GetIISHtml(line.cs_referee)
      if (r.isDefined && DataCheck != r.get) {
        DataCheck = r.get
        Data.append(Array(line.date, line.time, line.c_ip, r.get))
      }
    }
    // Write each record as date$time$ip$url
    for (fields <- Data) {
      out.write(fields.mkString("$"))
      out.write("\r\n") // the trailing newline makes the file easier to import into HDFS later
    }
    out.close()
  }
}
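Each call to WriteRddOneTxt appends lines of the form date$time$c_ip$url to LogResult.txt. A quick sanity check of GetIISHtml on a few made-up request paths (the inputs below are illustrative, not taken from the real log):

object IoOperationDemo {
  def main(args: Array[String]): Unit = {
    println(IoOperation.GetIISHtml("/news/page.shtml"))   // Some(/news/page.shtml)
    println(IoOperation.GetIISHtml("/home/default.aspx")) // Some(/home/default.aspx)
    println(IoOperation.GetIISHtml("/page.html?id=3"))    // None: query string present
    println(IoOperation.GetIISHtml("/img/logo.png"))      // None: no matching suffix
  }
}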