1. Generating the log records. Since real data is hard to obtain, we simulate the log data here. The main code is as follows:
#coding:utf-8
__author__ = 'venus'
import random
import time

# course list: the URL paths that the simulated requests will hit
course_lists = [
    "class/309.html",
    "class/213.html",
    "class/134.html",
    "class/112.html",
    "class/156.html",
    "class/189.html",
    "class/123.html",
    "learn/785",
    "learn/786",
    "course/list"
]
# fragments joined into random client IPs
ip_splice = ["123", "34", "67", "345", "178", "98", "190", "147"]
# referer templates for the major search engines
http_referers = [
    "https://www.baidu.com/s?wd={query}",
    "https://www.sogou.com/web?query={query}",
    "https://cn.bing.com/search?q={query}",
    "https://search.yahoo.com/search?p={query}"
]
# search keywords substituted into the referer templates
search_keywords = [
    "spark实战课程",
    "hadoop实战课程",
    "python的入门课程",
    "spring入门到精通",
    "hibernate简单课程"
]
# HTTP status codes to sample from
status_code = ["200", "404", "500"]

def sample_referers():
    # roughly 60% of the simulated requests carry no referer
    if random.uniform(0, 1) > 0.4:
        return '-'
    refers = random.choice(http_referers)
    words = random.choice(search_keywords)
    return refers.format(query=words)

def sample_statusCode():
    return random.choice(status_code)

def sample_url():
    return random.choice(course_lists)

def sample_ip():
    # pick four distinct fragments and join them into a dotted "IP"
    return ".".join(random.sample(ip_splice, 4))

def generate_journel(count=10):
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # append mode, so the tail -f that feeds Flume keeps following the file
    with open("/software/project/logs/access.log", "a") as fs:
        while count >= 1:
            journel = "{ip}\t{time_str}\t\"GET /{url} HTTP/1.1\"\t{statusCode}\t{referers}".format(
                url=sample_url(), ip=sample_ip(), referers=sample_referers(),
                statusCode=sample_statusCode(), time_str=time_str)
            count = count - 1
            fs.write(journel + "\n")

if __name__ == "__main__":
    generate_journel(100)
On Linux, use crontab to run the script as a scheduled job, so that log data keeps being produced:
crontab -e
*/1 * * * * python /software/project/generate_journel.py
The entry above runs the generator once every minute.
You can watch the log grow with tail -200f ./logs/access.log
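Each generated line contains five tab-separated fields: client IP, timestamp, request line, status code, and referer. For example:
178.98.34.190	2019-03-07 15:02:01	"GET /class/189.html HTTP/1.1"	500	https://www.baidu.com/s?wd=python的入门课程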
2. Writing the Flume configuration
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -f /software/project/logs/access.log
a1.sources.r1.shell = /bin/sh -c
# Describe the sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = 172.17.78.220:9092
a1.sinks.k1.kafka.topic = kafka_streaming
a1.sinks.k1.flumeBatchSize = 20
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Start Flume (note: Kafka has to be running before the agent starts):
bin/flume-ng agent --conf conf --conf-file ../project/exec_memory_kafka.conf --name a1 -Dflume.root.logger=INFO,console
To verify that events reach Kafka, attach a console consumer (start the consumer first if you want to see the incoming records):
bin/kafka-console-consumer.sh --bootstrap-server 172.17.78.220:9092 --topic kafka_streaming
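If topic auto-creation is disabled on the broker, create the topic before starting the agent. The command below uses the pre-2.2 --zookeeper form; on Kafka 2.2+ replace it with --bootstrap-server 172.17.78.220:9092:
bin/kafka-topics.sh --create --zookeeper 172.17.78.220:2181 --replication-factor 1 --partitions 1 --topic kafka_streaming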
3. Writing the Spark Streaming code
1). Cleaning the data into records such as Journel(178.123.34.345,20190307155001,123,200,-)
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

object Kafka_streaming {
  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      System.err.println("Usage: Kafka_streaming <bootstrap-servers> <groupId> <topics>")
      System.exit(1)
    }
    val sparkConf = new SparkConf()
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val Array(servers, groupId, topic) = args
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> servers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = topic.split(",")
    val ds = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    ds.map(_.value()).map(line => {
      val datas = line.split("\t")
      val ip = datas(0)
      val time = datas(1)
      // e.g. 178.98.34.190  2019-03-07 15:02:01  "GET /class/189.html HTTP/1.1"  500  https://www.baidu.com/s?wd=python的入门课程
      val dd = datas(2).split(" ")(1)   // the request path, e.g. /class/189.html
      var courseId = 0
      if (dd.startsWith("/class")) {
        val tmp = dd.split("/")(2)
        courseId = tmp.substring(0, tmp.lastIndexOf(".")).toInt
      }
      Journel(ip, DateUtil.parseToMintute(time), courseId, datas(3).toInt, datas(4))
    }).filter(i => i.courseId != 0).print()
    ssc.start()
    ssc.awaitTermination()
  }
}
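The Journel case class and the DateUtil helper are referenced above but never shown. Below is a minimal sketch of what they could look like, inferred from how they are used; the field names and the choice of FastDateFormat are assumptions:
import org.apache.commons.lang3.time.FastDateFormat

// one cleaned record: client IP, compact timestamp, course id, HTTP status, raw referer
case class Journel(ip: String, time: String, courseId: Int, statusCode: Int, referer: String)

object DateUtil {
  // FastDateFormat is thread-safe, unlike java.text.SimpleDateFormat
  private val inputFormat = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss")
  private val outputFormat = FastDateFormat.getInstance("yyyyMMddHHmmss")

  // "2019-03-07 15:50:01" => "20190307155001"
  def parseToMintute(time: String): String =
    outputFormat.format(inputFormat.parse(time))
}
When submitting the job, the three arguments would be, for example: 172.17.78.220:9092 test-group kafka_streaming.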
The data stream processing steps:
val filterData = ds.map(_.value()).map(line => {
  val datas = line.split("\t")
  val ip = datas(0)
  val time = datas(1)
  // e.g. 178.98.34.190  2019-03-07 15:02:01  "GET /class/189.html HTTP/1.1"  500  https://www.baidu.com/s?wd=python的入门课程
  val dd = datas(2).split(" ")(1)
  var courseId = 0
  if (dd.startsWith("/class")) {
    val tmp = dd.split("/")(2)
    courseId = tmp.substring(0, tmp.lastIndexOf(".")).toInt
  }
  Journel(ip, DateUtil.parseToMintute(time), courseId, datas(3).toInt, datas(4))
}).filter(i => i.courseId != 0)
// goal: combine the data into keys like 20190308_86 (day_courseId) and count clicks per key
filterData.map(j => {
  (j.time.substring(0, 8) + "_" + j.courseId, 1)
}).reduceByKey(_ + _).foreachRDD(rdd => {
  rdd.foreachPartition(partitionRecords => {
    val data = new ListBuffer[CourseClickCount]
    partitionRecords.foreach(r => {
      data.append(CourseClickCount(r._1, r._2))
    })
    CourseClickDAO.save(data)
  })
})
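Note the write pattern here: the counts are buffered into a ListBuffer per partition and handed to the DAO in a single save call, so each partition talks to HBase once per micro-batch rather than once per record. CourseClickDAO itself is defined in the HBase section below.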
// i.e. the number of course clicks on a given day referred from a given site
filterData.map(j => {
  var host = ""
  val referer = j.referer.replaceAll("//", "/").split("/")
  if (referer.length > 2) {
    host = referer(1)
  }
  (host, j.time.substring(0, 8), j.courseId)
}).filter(r => r._1 != "").map(r => (r._2 + "_" + r._1 + "_" + r._3, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
  rdd.foreachPartition(partitionRecords => {
    val data = new ListBuffer[CourseReferClick]
    partitionRecords.foreach(r => {
      data.append(CourseReferClick(r._1, r._2))
    })
    CourseReferClickDao.save(data)
  })
})
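The rowkey assembled here has the layout day_host_courseId; for instance, 20190307_www.baidu.com_189 stands for clicks on course 189 on 2019-03-07 that came in via Baidu.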
Storing the data in HBase:
/**
 * The singleton pattern is used here. (The classic HBaseAdmin/HTable
 * client API used below is deprecated in newer HBase releases.)
 */
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseUtil {
    private static HBaseAdmin hbaseAdmin = null;
    private static Configuration conf = null;

    private HBaseUtil() throws Exception {
        conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "172.17.78.220:2181");
        conf.set("hbase.rootdir", "hdfs://172.17.78.220:9000/hbase");
        hbaseAdmin = new HBaseAdmin(conf);
    }

    private static HBaseUtil hbaseUtil = null;

    public static synchronized HBaseUtil getInstance() throws Exception {
        if (hbaseUtil == null) {
            hbaseUtil = new HBaseUtil();
        }
        return hbaseUtil;
    }

    public HTable getTable(String tableName) throws Exception {
        return new HTable(conf, tableName);
    }

    public void put(String tableName, String rowKey, String cf, String qualifier, String value) throws Exception {
        HTable hTable = getTable(tableName);
        Put put = new Put(Bytes.toBytes(rowKey));
        put.addColumn(Bytes.toBytes(cf), Bytes.toBytes(qualifier), Bytes.toBytes(value));
        hTable.put(put);
        hTable.close();
    }

    public static void main(String[] args) throws Exception {
        HBaseUtil hBaseUtil = HBaseUtil.getInstance();
        hBaseUtil.put("immoc_course_clickcount", "20190307_127", "infor", "clickcount", "70");
    }
}
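Both tables must exist before anything is written to them; they can be created in the hbase shell (the column family infor matches what the DAOs below expect):
create 'immoc_course_clickcount', 'infor'
create 'immoc_referer_course_clickcount', 'infor'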
import scala.collection.mutable.ListBuffer
import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.util.Bytes

case class CourseClickCount(date_courseId: String, clickCount: Long)

object CourseClickDAO {
  val tableName = "immoc_course_clickcount"
  val cf = "infor"
  val qualifier = "clickcount"

  def save(list: ListBuffer[CourseClickCount]): Unit = {
    val hTable = HBaseUtil.getInstance().getTable(tableName)
    for (c <- list) {
      hTable.incrementColumnValue(Bytes.toBytes(c.date_courseId), Bytes.toBytes(cf),
        Bytes.toBytes(qualifier), c.clickCount)
    }
  }

  def count(rowKey: String): Long = {
    val hTable = HBaseUtil.getInstance().getTable(tableName)
    val get = new Get(Bytes.toBytes(rowKey))
    val result = hTable.get(get)
    val value = result.getValue(cf.getBytes(), qualifier.getBytes())
    if (value == null) {
      0L
    } else {
      Bytes.toLong(value)
    }
  }
}
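Because incrementColumnValue is an atomic server-side increment, every micro-batch adds its partial count to whatever is already stored under the rowkey, so the totals accumulate correctly across batches; count reads the 8-byte counter back with Bytes.toLong.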
/**
 * Requirement: look up how often a course was clicked on a given day via a given referer.
 * @param date_referer_courseId rowkey layout: date (the day) _ referer (the referring site, e.g. Baidu) _ courseId (the course)
 * @param clickCount the click count
 */
case class CourseReferClick(date_referer_courseId: String, clickCount: Int)

object CourseReferClickDao {
  val tableName = "immoc_referer_course_clickcount"
  val cf = "infor"
  val qualifier = "clickcount"

  def save(list: ListBuffer[CourseReferClick]): Unit = {
    val hTable = HBaseUtil.getInstance().getTable(tableName)
    for (c <- list) {
      hTable.incrementColumnValue(Bytes.toBytes(c.date_referer_courseId), Bytes.toBytes(cf),
        Bytes.toBytes(qualifier), c.clickCount)
    }
  }

  def count(rowKey: String): Long = {
    val hTable = HBaseUtil.getInstance().getTable(tableName)
    val get = new Get(Bytes.toBytes(rowKey))
    val result = hTable.get(get)
    val value = result.getValue(cf.getBytes(), qualifier.getBytes())
    if (value == null) {
      0L
    } else {
      Bytes.toLong(value)
    }
  }
}
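To spot-check what has been stored, the count helpers can be called directly; the rowkeys below are hypothetical examples of the two key layouts:
println(CourseClickDAO.count("20190307_189"))
println(CourseReferClickDao.count("20190307_www.baidu.com_189"))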