1. Feature development:
Feature 1: the number of visits per category so far today
Feature 2: the number of visits per category referred from search engines
#coding=UTF-8
import random
import time

url_paths = [
    "www/2",
    "www/1",
    "www/6",
    "www/4",
    "www/3",
    "pianhua/130",
    "toukouxu/821"
]
status_code = [404, 302, 200]
ip_slices = [132, 156, 124, 10, 29, 167, 143, 187, 30, 100]
http_referers = [
    "https://www.baidu.com/s?wd={query}",
    "https://www.sogou.com/web?qu={query}",
    "http://cn.bing.com/search?q={query}",
    "https://search.yahoo.com/search?p={query}"
]
search_keyword = [
    "猎场",
    "快乐人生",
    "极限挑战",
    "我的体育老师",
    "幸福满院"
]

# random IP address: pick four slices and join them with dots
def sample_ip():
    slice = random.sample(ip_slices, 4)
    return ".".join([str(item) for item in slice])

def sample_url():
    return random.sample(url_paths, 1)[0]

def sample_status():
    return random.sample(status_code, 1)[0]

# referer: about 80% of records have none ("-"), the rest come from a search engine
def sample_referer():
    if random.uniform(0, 1) > 0.2:
        return "-"
    refer_str = random.sample(http_referers, 1)
    query_str = random.sample(search_keyword, 1)
    return refer_str[0].format(query=query_str[0])

# generate `count` log lines and append them to the log file
def generate_log(count=10):
    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    #f = open("D://test//in.log","w+")
    f = open("/root/data/shishi/data20200401.log", "a+")
    while count >= 1:
        query_log = "{ip}\t{localtime}\t\"GET {url} HTTP/1.0\"\t{referece}\t{status1}".format(
            ip=sample_ip(), url=sample_url(), status1=sample_status(),
            referece=sample_referer(), localtime=time_str)
        f.write(query_log + "\n")
        count = count - 1
    f.close()

if __name__ == '__main__':
    generate_log(100)
1. mkLog.sh script
python /root/data/shishi/generate.py
2. Run the log-generation script on a schedule (once every minute)
crontab -e
*/1 * * * * /root/data/shishi/mkLog.sh
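cron runs the script directly, so it must be executable; a quick check (paths taken from the steps above):
chmod +x /root/data/shishi/mkLog.sh
crontab -l    # confirm the */1 entry is present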
1. Start ZooKeeper and Kafka
bin/kafka-server-start.sh config/server.properties &
Create a topic
bin/kafka-topics.sh --create --zookeeper hdp-1:2181 --replication-factor 1 --partitions 1 --topic flumeTopic
bin/kafka-topics.sh --list --zookeeper hdp-1:2181
Start a Kafka console consumer (to verify that messages are received):
bin/kafka-console-consumer.sh --topic flumeTopic --bootstrap-server hdp-1:9092 --from-beginning
Start the Flume agent
./flume-ng agent -C ../conf/ -f ../conf2/flume_kafka.conf -n ag1 -Dflume.root.logger=INFO,console
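The flume_kafka.conf used above is not shown in these notes. A minimal sketch of what such a config might look like, assuming an exec source that tails the generated log file and Flume's Kafka sink (Flume 1.7+ property names); the agent name ag1, broker hdp-1:9092 and topic flumeTopic come from the commands above, and the log path from the generator script:
ag1.sources = r1
ag1.channels = c1
ag1.sinks = k1
ag1.sources.r1.type = exec
ag1.sources.r1.command = tail -F /root/data/shishi/data20200401.log
ag1.sources.r1.channels = c1
ag1.channels.c1.type = memory
ag1.channels.c1.capacity = 1000
ag1.channels.c1.transactionCapacity = 100
ag1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
ag1.sinks.k1.kafka.bootstrap.servers = hdp-1:9092
ag1.sinks.k1.kafka.topic = flumeTopic
ag1.sinks.k1.channel = c1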
1. Which database should store the aggregated results?
A relational database (RDBMS) such as MySQL or Oracle; the table would look like:
day categoryId click_count
20171117 1 10
20171117 2 19
When the next batch arrives we would have to read the current value for (20171117, 1), add the new count to the stored 10, and write it back, which is cumbersome. With HBase a single API call (an atomic counter increment) handles this, which is very convenient.
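As an illustration, the whole read-add-write cycle collapses into one atomic call in the HBase shell; a sketch using the table and column family created in the next step:
incr 'category_clickcount', '20171117_1', 'info:click_count', 10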
2. HBase table design
(1) Start HDFS and ZooKeeper
(2) Create the storage tables: one for per-category visits, and one for per-category visits coming from search engines
create 'category_clickcount','info'
create 'category_search_clickcount','info'
# list tables: list
# describe a table: desc 'category_clickcount'
# scan data: scan 'category_clickcount'
3. HbaseUtil utility class: connects to HBase and inserts data (singleton)
package com.xin.hbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/4/2
*/
public class HbaseUtil {
HBaseAdmin admin = null;
Configuration configration = null;
/**
* Private constructor: set up the Configuration and obtain the HBaseAdmin instance
*/
private HbaseUtil(){
configration = new Configuration();
configration.set("hbase.zookeeper.quorum","hdp-1:2181");
configration.set("hbase.rootdir","hdfs://hdp-1/hbase");
try {
admin = new HBaseAdmin(configration);
} catch (IOException e) {
e.printStackTrace();
}
}
// the singleton instance
private static HbaseUtil instance = null;
// synchronized access so that exactly one non-null instance is created --> singleton
public static synchronized HbaseUtil getInstance(){
if(null == instance){
instance = new HbaseUtil();
}
return instance;
}
/**
* Get an HTable instance for the given table name
*/
public HTable getTable(String tableName){
HTable table = null;
try {
table = new HTable(configration,tableName);
} catch (IOException e) {
e.printStackTrace();
}
return table;
}
/**
* Add one record to an HBase table
* @param tableName HBase table name
* @param rowkey    rowkey of the row to write
* @param cf        column family
* @param column    column qualifier
* @param value     value to write
*/
public void put(String tableName,String rowkey,String cf,String column,String value){
HTable table = getTable(tableName);
Put put = new Put(Bytes.toBytes(rowkey));
put.add(Bytes.toBytes(cf),Bytes.toBytes(column),Bytes.toBytes(value));
try {
table.put(put);
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) {
//HTable table = HBaseUtil.getInstance().getTable("category_clickcount");
//System.out.println(table.getName().getNameAsString());
String tableName = "category_clickcount";
String rowkey = "20271111_88";
String cf="info";
String column ="click_count";
String value = "2";
HbaseUtil.getInstance().put(tableName,rowkey,cf,column,value);
}
}
1. Spark Streaming consumes the Kafka data in real time, processes it, and saves the results to HBase
pom.xml (key dependencies and build plugins, as Maven coordinates groupId:artifactId:version)
Dependencies:
org.apache.spark:spark-core_2.11:2.1.0
mysql:mysql-connector-java:5.1.33
org.apache.hadoop:hadoop-client:2.7.1
org.apache.hbase:hbase-client:1.2.0
org.apache.hbase:hbase-server:1.2.0
org.apache.spark:spark-streaming_2.11:2.1.0
org.apache.spark:spark-streaming-kafka_2.11:1.5.2
org.apache.kafka:kafka-clients:2.1.1
Build plugins:
net.alchim31.maven:scala-maven-plugin:3.2.2
  execution scala-compile-first: phase process-resources, goals add-source, compile
  execution scala-test-compile: phase process-test-resources, goal testCompile
org.apache.maven.plugins:maven-compiler-plugin:3.5.1
  execution: phase compile, goal compile
org.apache.maven.plugins:maven-shade-plugin:2.4.2
  execution: phase package, goal shade; artifact filter *:* excluding META-INF/*.SF, META-INF/*.DSA, META-INF/*.RSA
Consummer.scala: consumes the data
package com.xin.kafka
import com.xin.dao.{CategaryClickCountDAO, CategorySearchClickCountDao}
import com.xin.dao.CategaryClickCountDAO.CategaryClickCount
import com.xin.dao.CategorySearchClickCountDao.CategarSearchClickCount
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable.ListBuffer
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/4/2
*/
object Consummer {
def main(args: Array[String]): Unit = {
val sparkConf: SparkConf = new SparkConf().setAppName("shishi")
// .setMaster("local[*]")
val ssc = new StreamingContext(sparkConf,Seconds(5))
val kafkaDStream: ReceiverInputDStream[(String, String)] = KafkaUtils.createStream(
ssc,
"hdp-1:2181", //zookeeper
"flumeTopic", //消费者组groupid
Map("flumeTopic" -> 3) //map中存放多个topic主题,格式为:
)
//将消费的数据转成DStream kafka传过来数据是 k默认null,v是我们输入的值
val logs: DStream[String] = kafkaDStream.flatMap((tuple: (String, String)) =>tuple._2.split(","))
/**
* Step 1: clean the data, drop useless records, and wrap each record in a DStream[clickLog]
* Log format: 143.29.187.156 2020-04-01 17:42:41 "GET www/4 HTTP/1.0" https://www.sogou.com/web?qu=猎场 404
*/
val cleanData: DStream[clickLog] = logs.map(line =>{
val infos = line.split("\t")
val url = infos(2).split(" ")(1)
var categaryId = 0
// extract the category id (the number after "www/")
if(url.startsWith("www")){
categaryId = url.split("/")(1).toInt
}
// infos(0) --> ip, infos(1) --> date, categaryId --> category id
// infos(4) --> status code, infos(3) --> referer (search source)
clickLog(infos(0),DataUtils.parseToMinute(infos(1)),categaryId,infos(4).toInt,infos(3))
}).filter((clickLog: clickLog) =>clickLog.categaryId != 0)
/**
* Step 2: save the aggregated data to HBase
* Requirement: daily click count per category
*/
cleanData.map(log=>{
//date:yyyyMMdd
(log.date.substring(0,8)+"_"+log.categaryId,1)
}).reduceByKey(_+_).foreachRDD(rdd=>{
rdd.foreachPartition((partriosRdds: Iterator[(String, Int)]) =>{
val list = new ListBuffer[CategaryClickCount]
partriosRdds.foreach(pair=>{
list.append(CategaryClickCount(pair._1,pair._2))
})
//save the results to HBase; the counter accumulates counts for the same rowkey
CategaryClickCountDAO.save(list)
})
})
/**
* Step 3: save the aggregated data to HBase
* Requirement: daily click count per category, restricted to traffic referred from search engines
*/
cleanData.map(log=>{
// https://www.sogou.com/web?qu=猎场
val refren: String = log.types
val strings: Array[String] = refren.replaceAll("//","/").split("/")
var host = ""
if(strings.length>2){
host = strings(1)
}
(host,log.categaryId,log.date)
}).filter(_._1 != "").map(x=>{
(x._3.substring(0,8)+"_"+x._1+"_"+x._2,1)
}).reduceByKey(_+_).foreachRDD(rdd=>{
rdd.foreachPartition((partriosRdds: Iterator[(String, Int)]) =>{
val list = new ListBuffer[CategarSearchClickCount]
partriosRdds.foreach(pair=>{
list.append(CategarSearchClickCount(pair._1,pair._2))
})
//save the results to HBase; the counter accumulates counts for the same rowkey
CategorySearchClickCountDao.save(list)
})
})
ssc.start()
ssc.awaitTermination()
}
case class clickLog(ip:String,date : String,categaryId:Int,statusid:Int,types:String)
}
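As a quick, Spark-free sanity check of the cleaning logic above, the following standalone sketch (the object name ParseCheck is hypothetical) parses one sample log line and prints the two rowkey formats that the jobs write to HBase:
import com.xin.kafka.DataUtils
object ParseCheck {
  def main(args: Array[String]): Unit = {
    val line = "143.29.187.156\t2020-04-01 17:42:41\t\"GET www/4 HTTP/1.0\"\thttps://www.sogou.com/web?qu=猎场\t404"
    val infos = line.split("\t")
    val url = infos(2).split(" ")(1)                                   // www/4
    val categaryId = if (url.startsWith("www")) url.split("/")(1).toInt else 0
    val day = DataUtils.parseToMinute(infos(1))                        // 20200401
    // rowkey used by CategaryClickCountDAO: day_categoryId
    println(day + "_" + categaryId)                                    // 20200401_4
    // rowkey used by CategorySearchClickCountDao: day_host_categoryId
    val host = infos(3).replaceAll("//", "/").split("/")(1)            // www.sogou.com
    println(day + "_" + host + "_" + categaryId)                       // 20200401_www.sogou.com_4
  }
}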
CategaryClickCountDAO
package com.xin.dao
import com.xin.hbase.HbaseUtil
import org.apache.hadoop.hbase.client.{Get, HTable}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.mutable.ListBuffer
/**
* Feature 1: visits per category so far today
*/
object CategaryClickCountDAO {
val tableName = "category_clickcount"
val cf = "info"
val qualifer = "click_count" // column qualifier
/**
* Save the counts
* @param list
*/
def save(list:ListBuffer[CategaryClickCount]): Unit ={
val table: HTable = HbaseUtil.getInstance().getTable(tableName)
for(els <- list){
//counter: for the same rowkey (categaryID), column family and qualifier, the increments (clickCout) are added up
table.incrementColumnValue(Bytes.toBytes(els.categaryID),Bytes.toBytes(cf),Bytes.toBytes(qualifer),els.clickCout);
}
}
/**
* Read the value of the given column family/qualifier for the given rowkey: null --> 0L, otherwise convert to Long
* Main purpose: fetch the click count
* @param day_categary
*/
def count(day_categary:String) : Long={
val table: HTable = HbaseUtil.getInstance().getTable(tableName) // get the table
val get = new Get(Bytes.toBytes(day_categary)) // read the row for the given rowkey
val value: Array[Byte] = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifer))
if(value == null){
0L
}else{
Bytes.toLong(value)
}
}
def main(args: Array[String]): Unit = {
// val list = new ListBuffer[CategaryClickCount]
// list.append(CategaryClickCount("20171122_1",1))
// list.append(CategaryClickCount("20171122_9", 2))
// list.append(CategaryClickCount("20171122_10", 3))
// save(list)
print(count("20200404_4"))
}
case class CategaryClickCount(categaryID:String,clickCout:Int)
}
CategorySearchClickCountDao
package com.xin.dao
import com.xin.hbase.HbaseUtil
import org.apache.hadoop.hbase.client.{Get, HTable}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.mutable.ListBuffer
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/4/4
* Feature 2: same as feature 1, but only for traffic referred from search engines
* (Apart from the table name and qualifier, this class is essentially identical to CategaryClickCountDAO.)
*/
object CategorySearchClickCountDao {
val tableName = "category_search_clickcount"
val cf = "info"
val qualifer = "search_click_count"
def save(list: ListBuffer[CategarSearchClickCount]): Unit ={
val table: HTable = HbaseUtil.getInstance().getTable(tableName)
list.foreach(child=>{
table.incrementColumnValue(Bytes.toBytes(child.day_search_categary),Bytes.toBytes(cf),
Bytes.toBytes(qualifer),child.clickCount)
})
}
def count(day_categary:String) : Long={
val table =HbaseUtil.getInstance().getTable(tableName)
val get = new Get(Bytes.toBytes(day_categary))
val value = table.get(get).getValue(Bytes.toBytes(cf), Bytes.toBytes(qualifer))
if(value == null){
0L
}else{
Bytes.toLong(value)
}
}
def main(args: Array[String]): Unit = {
val list = new ListBuffer[CategarSearchClickCount]
list.append(CategarSearchClickCount("20171122_1_8",300))
list.append(CategarSearchClickCount("20171122_2_9", 600))
list.append(CategarSearchClickCount("20171122_2_10", 1600))
save(list)
print(count("20171122_2_2")+"---")
}
case class CategarSearchClickCount(day_search_categary:String,clickCount:Int)
}
DataUtils
package com.xin.kafka
import java.util.Date
import org.apache.commons.lang3.time.FastDateFormat
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/4/2
* Note: this utility converts timestamps from yyyy-MM-dd HH:mm:ss to yyyyMMdd
*/
object DataUtils {
val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss");
val TARGE_FORMAT = FastDateFormat.getInstance ("yyyyMMdd");
def getTime (time: String) = {
YYYYMMDDHHMMSS_FORMAT.parse (time).getTime
}
def parseToMinute (time: String) = {
TARGE_FORMAT.format (new Date (getTime (time) ) )
}
def main (args: Array[String] ): Unit = {
println (parseToMinute ("2017-11-22 01:20:20") )
}
}
1. Environment preparation
Start HDFS, ZooKeeper, Kafka, Flume, HBase and Spark, and make sure the crontab job is generating logs
2. Run the jar
spark-submit \
--master spark://hdp-1:7077 \
--class com.xin.kafka.Consummer \
/root/data/shishi/sparkdemo.jar
3. Watch the data change in the two HBase tables. Values written through the counter API are stored as 8-byte binary longs, so the HBase shell displays them as hexadecimal byte sequences (digits 0-9 and letters A-F, where A-F stand for the values 10-15).
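Counters can also be read back as plain decimal numbers in the HBase shell; a small check, assuming a rowkey such as 20200404_4 exists (the day_categoryId format the streaming job writes):
get_counter 'category_clickcount', '20200404_4', 'info:click_count'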
1. Create a Spring Boot project and add the dependencies
org.springframework.boot:spring-boot-starter-thymeleaf
org.apache.hbase:hbase-client:1.2.0
2. Utility class: fetch the per-category visit counts from the HBase table for a given day
package com.xin.spark.utils;
import com.xin.spark.domain.CategoryClickCount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.PrefixFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.junit.Test;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
* HBase utility class
*/
public class HBaseUtils {
private HBaseAdmin admin = null;
private Configuration configration = null;
/**
* Private constructor
*/
private HBaseUtils(){
configration = new Configuration();
configration.set("hbase.zookeeper.quorum", "hdp-1:2181");
configration.set("hbase.rootdir", "hdfs://hdp-1/hbase");
try {
admin = new HBaseAdmin(configration);
} catch (IOException e) {
e.printStackTrace();
}
}
private static HBaseUtils instance = null;
/**
* Get the singleton instance
* @return
*/
public static synchronized HBaseUtils getInstance(){
if(null == instance){
instance = new HBaseUtils();
}
return instance;
}
/**
* Get an HTable instance for the given table name
* @param tableName
* @return
*/
public HTable getTable(String tableName){
HTable table = null;
try {
table = new HTable(configration,tableName);
} catch (Exception e) {
e.printStackTrace();
}
return table;
}
/**
* Add one record to an HBase table
*
* @param tableName HBase table name
* @param rowkey    rowkey of the row to write
* @param cf        column family
* @param column    column qualifier
* @param value     value to write
*/
public void put(String tableName,String rowkey,String cf,String column,String value){
HTable table = getTable(tableName);
Put put = new Put(Bytes.toBytes(rowkey));
put.add(Bytes.toBytes(cf),Bytes.toBytes(column),Bytes.toBytes(value));
try {
table.put(put);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Query click counts from HBase by table name and rowkey-prefix condition
*/
public Map query(String tableName, String condition) throws IOException {
Map map = new HashMap<>();
HTable table = getTable(tableName);
String cf = "info";
String qualifier = "click_count";
// create a scanner
Scan scan = new Scan();
// create a rowkey prefix filter
Filter filter = new PrefixFilter(Bytes.toBytes(condition));
scan.setFilter(filter);
ResultScanner rs = table.getScanner(scan);
for (Result result : rs) {
// get the rowkey
String row = Bytes.toString(result.getRow());
//convert the bytes directly to long; converting to String first and then parsing would fail
long clickCount = Bytes.toLong(result.getValue(cf.getBytes(),
qualifier.getBytes()));
map.put(row, clickCount);
}
return map;
}
public void getOneDataByRowKey(String tableName,String rowkey)throws Exception{
HTable table = getTable(tableName);
Get g=new Get(Bytes.toBytes(rowkey));
Result r=table.get(g);
for(KeyValue k:r.raw()){
System.out.println("行号: "+Bytes.toStringBinary(k.getRow()));
System.out.println("时间戳: "+k.getTimestamp());
System.out.println("列簇: "+Bytes.toStringBinary(k.getFamily()));
System.out.println("列: "+Bytes.toStringBinary(k.getQualifier()));
//if(Bytes.toStringBinary(k.getQualifier()).equals("myage")){
// System.out.println("值: "+Bytes.toInt(k.getValue()));
//}else{
long ss= Bytes.toLong(k.getValue());
System.out.println("值: "+ss);
//}
}
table.close();
}
public static void main(String[] args) throws Exception {
Map map = HBaseUtils.getInstance().query("category_clickcount",
"20200404");
for (Map.Entry entry : map.entrySet()) {
System.out.println(entry.getKey() + " : " + entry.getValue());
}
// HBaseUtils.getInstance().getOneDataByRowKey("category_clickcount","20200404_1");
}
}
Entity class for category visit counts: CategoryClickCount
package com.xin.spark.domain;
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/4/5
* Entity class for category visit counts
*/
public class CategoryClickCount {
private String categoryName;
private long value;
public String getCategoryName() {
return categoryName;
}
public void setCategoryName(String categoryName) {
this.categoryName = categoryName;
}
public long getValue() {
return value;
}
public void setValue(long value) {
this.value = value;
}
@Override
public String toString() {
return "CategoryClickCount{" +
"categoryName='" + categoryName + '\'' +
", value=" + value +
'}';
}
}
DAO class that wraps the HBase query
package com.xin.spark.dao;
import com.xin.spark.domain.CategoryClickCount;
import com.xin.spark.utils.HBaseUtils;
import org.springframework.stereotype.Repository;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/4/5
*/
@Repository
public class CategoryClickCountDAO {
// query by delegating to HBaseUtils
public List query(String day) throws IOException {
List list = new ArrayList<>();
Map map = HBaseUtils.getInstance().query("category_clickcount",day);
for (Map.Entry entry : map.entrySet()) {
CategoryClickCount categoryClickCount = new CategoryClickCount();
categoryClickCount.setCategoryName(entry.getKey());;
categoryClickCount.setValue(entry.getValue());
list.add(categoryClickCount);
}
return list;
}
public static void main(String[] args) throws IOException {
CategoryClickCountDAO dao = new CategoryClickCountDAO();
List list = dao.query("20200404");
for (CategoryClickCount c : list) {
System.out.println(c.getValue());
}
}
}
Controller class
package com.xin.spark.controller;
import com.xin.spark.dao.CategoryClickCountDAO;
import com.xin.spark.domain.CategoryClickCount;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.ResponseBody;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.servlet.ModelAndView;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Created by xinBa.
* User: 辛聪明
* Date: 2020/4/5
* Real-time query and display of video category visit counts (and extensions)
*/
@RestController
public class SparkStatAPP {
private static Map courses = new HashMap<>();
static {
courses.put("1","偶像爱情");
courses.put("2","宫斗谋权");
courses.put("3","玄幻史诗");
courses.put("4", "都市生活");
courses.put("5", "罪案谍战");
courses.put("6", "历险科幻");
}
@Autowired
CategoryClickCountDAO courseClickCountDAO;
@RequestMapping(value = "/CategoryClickCount", method = RequestMethod.POST)
@ResponseBody
public List courseClickCount() throws Exception {
List list = courseClickCountDAO.query("20200404");
for(CategoryClickCount model:list){
String s = courses.get(model.getCategoryName().substring(9));
if (s!=null){
model.setCategoryName(s);
}else {
model.setCategoryName("其他");
}
}
System.out.println("list:"+list);
return list;
}
@RequestMapping(value = "/echarts", method = RequestMethod.GET)
public ModelAndView echarts(){
return new ModelAndView("echarts");
}
}
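Once the Spring Boot application is running, the endpoint can be checked without the page; a quick test, assuming the default port 8080:
curl -X POST http://localhost:8080/CategoryClickCount
The response is a JSON array of {categoryName, value} objects, which the echarts page can consume.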
Front-end display page
Real-time per-category click statistics