Log-based user behavior analysis (Flume, Kafka, Spark Streaming, HBase)

  • Simulating user behavior log records
  • Writing and testing the Flume configuration
  • Writing and testing the Spark Streaming code
  • Storing the data in HBase

1. Generating the log records. Since real data is hard to obtain, the log data is simulated here; the main code is as follows:

#coding:utf-8
__author__ = 'venus'

import random
import time
# course page paths
course_lists=[
    "class/309.html",
    "class/213.html",
    "class/134.html",
    "class/112.html",
    "class/156.html",
    "class/189.html",
    "class/123.html",
    "learn/785",
    "learn/786",
    "course/list"
]

# address fragments used to assemble random client IPs
ip_splice=["123","34","67","345","178","98","190","147"]

http_referers=[
    "https://www.baidu.com/s?wd={query}",
    "https://www.sogou.com/web?query={query}",
    "https://cn.bing.com/search?q={query}",
    "https://search.yahoo.com/search?p={query}"
]

search_keywords=[
    "spark实战课程",
    "hadoop实战课程",
    "python的入门课程",
    "spring入门到精通",
    "hibernate简单课程"
]

status_code=["200","404","500"]

def sample_referers():
    if random.uniform(0,1)>0.4:
        return '-'
    refers=random.sample(http_referers,1)[0]
    words=random.sample(search_keywords,1)[0]
    return str(refers).format(query=words)

def sample_statusCode():
    return random.sample(status_code,1)[0]
def sample_url():
    return random.sample(course_lists,1)[0]

def sample_ip():
    return ".".join(random.sample(ip_splice, 4))

def genrate_journel(count=10):
    # one timestamp per batch; note that "w+" truncates access.log on every run
    time_str=time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
    with open("/software/project/logs/access.log","w+") as fs:
        while count>=1:
            journel="{ip}\t{time_str}\t\"GET /{url} HTTP/1.1\"\t{statusCode}\t{referers}".format(url=sample_url(),ip=sample_ip(),referers=sample_referers(),statusCode=sample_statusCode(),time_str=time_str)
            count=count-1
            fs.write(journel+"\n")

if __name__ =="__main__":
    genrate_journel(100)

On Linux, use crontab to schedule the script so that log data keeps being generated:

 crontab -e

*/1 * * * * python /software/project/generate_journel.py

The cron job above runs once every minute.

You can watch the output with tail -200f ./logs/access.log.
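Each generated record is tab separated: client IP, timestamp, request line, status code and referer. A sample line (the same one shown as a comment in the Spark code later) looks like this:

178.98.34.190  2019-03-07 15:02:01  "GET /class/189.html HTTP/1.1" 500  https://www.baidu.com/s?wd=python的入门课程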

2. Writing the Flume configuration

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f /software/project/logs/access.log
a1.sources.r1.shell = /bin/sh -c

# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = 172.17.78.220:9092
a1.sinks.k1.kafka.topic = kafka_streaming
a1.sinks.k1.flumeBatchSize = 20


# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

Start Flume (note: Kafka must be started before Flume):

bin/flume-ng agent --conf conf --conf-file ../project/exec_memory_kafka.conf --name a1 -Dflume.root.logger=INFO,console

You can verify the pipeline with a Kafka console consumer (note: start the consumer first if you want to see the messages):

bin/kafka-console-consumer.sh --bootstrap-server 172.17.78.220:9092 --topic kafka_streaming

3. Writing the Spark Streaming code

   1) Data cleansing: each raw line is parsed into a Journel record, e.g. Journel(178.123.34.345,20190307155001,123,200,-)

def main(args: Array[String]): Unit = {

  if(args.length!=3){
    System.err.println("Usage: Kafka_streaming <servers> <groupId> <topic>")
    System.exit(1)
  }
  val sparkConf=new SparkConf()
  val ssc=new StreamingContext(sparkConf,Seconds(5))
  val Array(servers,groupId,topic)=args
  val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> servers,
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> groupId,
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )

  val topics = topic.split(",")
  val ds = KafkaUtils.createDirectStream[String, String](
    ssc,
    PreferConsistent,
    Subscribe[String, String](topics, kafkaParams)
  )
  ds.map(_.value()).map(line=>{
    val datas=line.split("\t")
    val url=datas(0)   // datas(0) is actually the client IP
    val time=datas(1)
    //178.98.34.190  2019-03-07 15:02:01  "GET /class/189.html HTTP/1.1" 500  https://www.baidu.com/s?wd=python的入门课程

    val dd=datas(2).split(" ")(1)   // request path, e.g. /class/189.html
    var courseId=0
    if(dd.startsWith("/class")){
      val tmp=dd.split("/")(2)
      courseId=tmp.substring(0,tmp.lastIndexOf(".")).toInt
    }
    Journel(url,DateUtil.parseToMintute(time),courseId,datas(3).toInt,datas(4))
  }).filter(i=> i.courseId!=0).print()
  ssc.start()
  ssc.awaitTermination()
}
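The snippet above relies on a Journel case class and a DateUtil helper that are not shown in the post, plus the usual Spark/Kafka imports. A minimal sketch of what they might look like (the field names and the exact date format are assumptions inferred from how they are used above):

// Imports assumed by the streaming snippet:
// import org.apache.spark.SparkConf
// import org.apache.spark.streaming.{Seconds, StreamingContext}
// import org.apache.spark.streaming.kafka010.KafkaUtils
// import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
// import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
// import org.apache.kafka.common.serialization.StringDeserializer

import org.apache.commons.lang3.time.FastDateFormat

// Cleansed record: client IP, reformatted timestamp, course id, HTTP status, referer
case class Journel(ip: String, time: String, courseId: Int, statusCode: Int, referer: String)

object DateUtil {
  // FastDateFormat is thread-safe, unlike SimpleDateFormat
  private val inputFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
  private val outputFormat = FastDateFormat.getInstance("yyyyMMddHHmmss")

  // "2019-03-07 15:02:01" -> "20190307150201"
  def parseToMintute(time: String): String = outputFormat.format(inputFormat.parse(time))
}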

The stream processing pipeline:

val filterData=ds.map(_.value()).map(line=>{
  val datas=line.split("\t")
  val url=datas(0)
  val time=datas(1)
  //178.98.34.190  2019-03-07 15:02:01  "GET /class/189.html HTTP/1.1" 500  https://www.baidu.com/s?wd=python的入门课程

  val dd=datas(2).split(" ")(1)
  var courseId=0
  if(dd.startsWith("/class")){
    val tmp=dd.split("/")(2)
    courseId=tmp.substring(0,tmp.lastIndexOf(".")).toInt
  }
  Journel(url,DateUtil.parseToMintute(time),courseId,datas(3).toInt,datas(4))
}).filter(i=> i.courseId!=0)
//goal: combine the records into keys of the form 20190308_86 (yyyyMMdd_courseId) and count clicks per key

filterData.map(j=>{
  (j.time.substring(0,8)+"_"+j.courseId,1)
}).reduceByKey(_+_).foreachRDD(rdd=>{
  rdd.foreachPartition(partitonsRecoder=>{
    val data=new ListBuffer[CourseClickCount]
    partitonsRecoder.foreach(r=>{
      data.append(CourseClickCount(r._1,r._2))
    })
    CourseClickDAO.save(data)
  })
})


//clicks on a given course, on a given day, referred from a given site
filterData.map(j=>{
  var host=""
  val referer=j.referer.replaceAll("//","/").split("/")
  if(referer.length>2){
    host=referer(1)
  }
  (host,j.time.substring(0,8),j.courseId)
}).filter(r=>r._1!="").map(r=>(r._2+"_"+r._1+"_"+r._3,1)).reduceByKey(_+_).foreachRDD(rdd=>{
  rdd.foreachPartition(partitonsRecoder=>{
    val data=new ListBuffer[CourseReferClick]
    partitonsRecoder.foreach(r=>{
      data.append(CourseReferClick(r._1,r._2))
    })
    CourseReferClickDao.save(data)
  })
})

Storing the data in HBase

/**
 * The singleton pattern is used here
 */
public class HBaseUtil {
    private static  HBaseAdmin hbaseAdmin=null;
    private static  Configuration conf=null;
    private HBaseUtil() throws Exception {
        conf=new Configuration();
        conf.set("hbase.zookeeper.quorum","172.17.78.220:2181");
        conf.set("hbaserootdir","hdfs://172.17.78.220:9000/hbase");
        hbaseAdmin=new HBaseAdmin(conf);
    }
    private static  HBaseUtil hbaseUtil=null;
    public static synchronized HBaseUtil getInstance() throws Exception {
        if (hbaseUtil==null){
            hbaseUtil=new HBaseUtil();
        }
        return hbaseUtil;
    }

    public HTable getTable(String tableName) throws Exception {
        HTable hTable=new HTable(conf,tableName);
        return hTable;
    }

    public void put(String tableName,String rowKey,String cf,String qulifier,String value) throws Exception {
        HTable hTable=getTable(tableName);
        Put put=new Put(Bytes.toBytes(rowKey));
        put.addColumn(Bytes.toBytes(cf),Bytes.toBytes(qulifier),Bytes.toBytes(value));
        hTable.put(put);
        hTable.close();
    }

    public static void main(String[] args) throws Exception {
        HBaseUtil hBaseUtil=HBaseUtil.getInstance();
        hBaseUtil.put("immoc_course_clickcount","20190307_127","infor","clickcount","70");
    }
}
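The two tables written to below (immoc_course_clickcount and immoc_referer_course_clickcount, both with column family infor) must exist before the job runs. A rough one-off sketch (not part of the original post) for creating them with the same HBase 1.2 client API and quorum address used in HBaseUtil:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.HBaseAdmin

object CreateHBaseTables {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    conf.set("hbase.zookeeper.quorum", "172.17.78.220:2181")   // same quorum as HBaseUtil
    val admin = new HBaseAdmin(conf)
    for (name <- Seq("immoc_course_clickcount", "immoc_referer_course_clickcount")) {
      if (!admin.tableExists(name)) {
        val desc = new HTableDescriptor(TableName.valueOf(name))
        desc.addFamily(new HColumnDescriptor("infor"))   // column family used by the DAOs
        admin.createTable(desc)
      }
    }
    admin.close()
  }
}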

case class CourseClickCount(date_courseId: String, clickCount: Long)

object CourseClickDAO {

  val tableName = "immoc_course_clickcount"
  val cf = "infor"
  val qualifier = "clickcount"

  def save(list: ListBuffer[CourseClickCount]) = {
    val hTable = HBaseUtil.getInstance().getTable(tableName)
    for (c <- list) {
      hTable.incrementColumnValue(Bytes.toBytes(c.date_courseId), Bytes.toBytes(cf),
        Bytes.toBytes(qualifier),
        c.clickCount)
    }
  }

  def count(rowKey:String)={
    val hTable = HBaseUtil.getInstance().getTable(tableName)
    val get=new Get(Bytes.toBytes(rowKey))
    val result=hTable.get(get)
    val value=result.getValue(cf.getBytes(),qualifier.getBytes())
    if(value==null){
      0L
    }else{
      Bytes.toLong(value)
    }
  }
}
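Because save uses incrementColumnValue, the per-batch counts accumulate in HBase across micro-batches, and count returns the running total for a rowkey of the form date_courseId. A small usage example (rowkey taken from the comment above):

// total clicks for course 86 on 2019-03-08 (rowkey format: yyyyMMdd_courseId)
println(CourseClickDAO.count("20190308_86"))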

/**
  * Requirement: the number of clicks on a given course, on a given day, coming from a given referer.
  * @param date_referer_courseId rowKey design: date (the day) _ referer (e.g. baidu) _ courseId (the course)
  * @param clickCount the click count
  */
case class CourseReferClick(date_referer_courseId:String,clickCount:Int)

object CourseReferClickDao {
  val tableName = "immoc_referer_course_clickcount"
  val cf = "infor"
  val qualifier = "clickcount"

  def save(list: ListBuffer[CourseReferClick]) = {
    val hTable = HBaseUtil.getInstance().getTable(tableName)
    for (c <- list) {
      hTable.incrementColumnValue(Bytes.toBytes(c.date_referer_courseId), Bytes.toBytes(cf),
        Bytes.toBytes(qualifier),
        c.clickCount)
    }
  }

  def count(rowKey:String)={
    val hTable = HBaseUtil.getInstance().getTable(tableName)
    val get=new Get(Bytes.toBytes(rowKey))
    val result=hTable.get(get)
    val value=result.getValue(cf.getBytes(),qualifier.getBytes())
    if(value==null){
      0L
    }else{
      Bytes.toLong(value)
    }
  }
}

Submitting the job:

/software/spark-2.4.0/bin/spark-submit  --class com.cn.spark04.kafka.SparkJournel --jars $(echo /software/hbase-1.2.0/lib/*.jar | tr ' ' ',') --name SparkJournel --packages org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.0 --master local[2] /software/project/sparkstreaming-1.0-SNAPSHOT.jar 172.17.78.220:9092 group01 kafka_streaming

 

Note the use of --jars: the job uses the HBase client classes, so the corresponding HBase jars must be included.

Note: adapted from an imooc.com (慕课网) course.
