一、Project Overview
----------------------------------------------
1. Use hadoop + hbase + flume + zookeeper to store carrier-grade volumes of telecom
call-log data with random access and real-time reads and writes. Hash the rowkey to
eliminate HBase hotspots, and use coprocessors to sustain high write throughput and
query load while avoiding the notification storms and infinite recursion that
intermediate results can trigger, giving students hands-on experience with big-data
technology as it is applied in industry.
2. Overall architecture
hadoop architecture and HA configuration.
hbase architecture and HA configuration.
flume real-time collection pipeline.
SSM front-end web layer and its interaction with the HBase back end.
hive + oozie periodic job scheduling.
Spark Streaming windowed real-time monitoring of sensitive keywords.
3. Design and implementation of the callLogs table in hbase.
Analysis of call-record content and common query scenarios, and the design and
implementation of the rowkey. Focuses on how salting works and how it solves the
hotspot problem, plus rowkey design principles and practical tips.
4. Coprocessor fundamentals and hands-on practice.
Design of callee-side call records: a coprocessor swaps the caller and callee of each
calllog record and writes the mirrored row synchronously, so the storage order in the
calllogs table yields consistent, transparent results for queries in either direction.
5. HA cluster configuration for Hadoop and HBase.
Hadoop HA with QJM, ResourceManager HA, how zookeeper works plus its configuration
and hands-on drills, caveats when integrating hbase with Hadoop HA, and client API
details.
二、Create the Project
------------------------------------------------
1. Create a new project -- CallLogSystem
三、Create the Mock Log Generator Module CallLogGenModel
-------------------------------------------------
1. Create the module CallLogGenModel and add maven support
2. Create the class calllog.gen.main.App
3. Write the App class
---------------------------------------
package calllog.gen.main;

import java.io.FileWriter;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.*;

public class App {
    // phone book: number -> subscriber name
    public static Map<String, String> callers = new HashMap<String, String>();
    // phone numbers
    public static List<String> phoneNumbers = new ArrayList<String>();

    static {
        callers.put("15811111111", "史让");
        callers.put("18022222222", "赵嗄");
        callers.put("15133333333", "张锕");
        callers.put("13269364444", "王以");
        callers.put("15032295555", "张噢");
        callers.put("17731086666", "张类");
        callers.put("15338597777", "李平");
        callers.put("15733218888", "杜跑");
        callers.put("15614209999", "任阳");
        callers.put("15778421111", "梁鹏");
        callers.put("18641241111", "郭彤");
        callers.put("15732641111", "刘飞");
        callers.put("13341101111", "段星");
        callers.put("13560191111", "唐华");
        callers.put("18301581111", "杨谋");
        callers.put("13520401111", "温英");
        callers.put("18332561111", "朱宽");
        callers.put("18620191111", "刘宗");
        phoneNumbers.addAll(callers.keySet());
    }

    public static void main(String[] args) {
        if (args == null || args.length == 0) {
            System.out.println("no args");
            System.exit(-1);
        }
        genCallLog(args[0]);
    }

    /**
     * Generate call-log lines forever, one every 200 ms.
     */
    private static void genCallLog(String logFilePath) {
        try {
            // open the log file in append mode
            FileWriter fw = new FileWriter(logFilePath, true);
            Random random = new Random();
            while (true) {
                // caller
                String caller = phoneNumbers.get(random.nextInt(callers.size()));
                String callerName = callers.get(caller);
                // callee (must differ from the caller)
                String callee = phoneNumbers.get(random.nextInt(callers.size()));
                while (callee.equals(caller)) {
                    callee = phoneNumbers.get(random.nextInt(callers.size()));
                }
                String calleeName = callers.get(callee);
                // call duration in seconds (< 10 min), zero-padded to three digits
                int duration = random.nextInt(60 * 10) + 1;
                DecimalFormat df = new DecimalFormat();
                df.applyPattern("000");
                String dur = df.format(duration);
                // call time; Calendar months are 0-based, so nextInt(12) is correct here
                int year = 2018;
                int month = random.nextInt(12);
                int day = random.nextInt(29) + 1;
                int hour = random.nextInt(24);
                int min = random.nextInt(60);
                int sec = random.nextInt(60);
                Calendar calendar = Calendar.getInstance();
                calendar.set(year, month, day, hour, min, sec);
                Date date = calendar.getTime();
                // if the generated time lies in the future, draw a new one
                Date now = new Date();
                if (date.compareTo(now) > 0) {
                    continue;
                }
                SimpleDateFormat dfs = new SimpleDateFormat();
                dfs.applyPattern("yyyy/MM/dd HH:mm:ss");
                String timeStr = dfs.format(date);
                // log line: caller,callee,time,duration
                //String callLog = caller + "," + callerName + "," + callee + "," + calleeName + "," + timeStr + "," + dur;
                String callLog = caller + "," + callee + "," + timeStr + "," + dur;
                fw.write(callLog + "\r\n");
                fw.flush();
                Thread.sleep(200);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
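With the format above, each generated line is caller,callee,time,duration; for
example (illustrative values):
15811111111,13269364444,2018/03/05 10:23:45,068
18022222222,15733218888,2018/07/21 08:02:11,540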
4. Build the jar and run it on Linux (s100/s200)
a. Test locally on Windows first
cmd> java -cp CallLogGenModel-1.0-SNAPSHOT.jar calllog.gen.main.App d:\\calllog\\calllog.log
b. Create the output directory on ubuntu
$> mkdir /home/ubuntu/calllog
c. Run the generator
$> java -cp /share/calllog/CallLogGenModel-1.0-SNAPSHOT.jar calllog.gen.main.App /home/ubuntu/calllog/calllog.log
d. Write a convenience script ~/calllog/calllog.sh
#!/bin/bash
java -cp /share/calllog/CallLogGenModel-1.0-SNAPSHOT.jar calllog.gen.main.App /home/ubuntu/calllog/calllog.log
e. Make calllog.sh executable
$calllog> chmod 777 calllog.sh
f. Run the script
$calllog> ./calllog.sh
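(Optional) To keep the generator running after the terminal closes, it can also be
launched in the background with nohup:
$calllog> nohup ./calllog.sh > /dev/null 2>&1 &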
四、Start flume on s100 and s200 to collect calllog.log in real time [s100 s200]
---------------------------------------------------------
1. Write the flume config file [flume/conf/calllog.conf]
a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = exec
# -c +0: read the file from the beginning; -F: keep following the file as it grows (without it the process stops at EOF)
a1.sources.r1.command = tail -F -c +0 /home/ubuntu/calllog/calllog.log
a1.channels.c1.type = memory
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = calllog
# broker list must be comma-separated
a1.sinks.k1.kafka.bootstrap.servers = s200:9092,s300:9092,s400:9092
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.sinks.k1.kafka.producer.acks = 1
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
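(Optional, not in the original config) If the generator rate is increased, the memory
channel's small defaults may drop events; these standard memory-channel settings raise
them:
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 1000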
2. Start flume on s100 and s200 and begin collecting
$s100> flume-ng agent -f /soft/flume/conf/calllog.conf -n a1 &
$s200> flume-ng agent -f /soft/flume/conf/calllog.conf -n a1 &
五、Start the kafka Cluster
--------------------------------------------------
1. Start the zk cluster [s100 s200 s300]
$> zkServer.sh start
$> xcall.sh jps
2. Start the kafka cluster [s200 s300 s400]
$> /soft/kafka/bin/kafka-server-start.sh -daemon /soft/kafka/config/server.properties
$> netstat -ano | grep 9092
3. Create the kafka topic
$> kafka-topics.sh --create --zookeeper s100:2181 --replication-factor 3 --partitions 4 --topic calllog
$> kafka-topics.sh --list --zookeeper s100:2181
4. Start a kafka console consumer on s300 to consume the calllog topic collected by flume, for testing.
$s300> kafka-console-consumer.sh --zookeeper s100:2181 --topic calllog
5. Start the log generator app on s100 and s200 and watch the console output on s300
$s100> ~/calllog/calllog.sh
$s200> ~/calllog/calllog.sh
六、Write the Real kafka Consumer for HBase -- Pull Messages from kafka and Store Them in hbase
---------------------------------------------------------------------
1. Start the hadoop cluster [s100 s500 / s200 s300 s400], fully distributed + HA
a. $s100> start-all.sh
b. $s100> xcall.sh jps
6656 Jps
6353 ResourceManager
6261 DFSZKFailoverController
3317 QuorumPeerMain
5818 NameNode
----xcall : jps from s200 ----
6224 DataNode
6721 NodeManager
7025 Jps
6465 JournalNode
3847 QuorumPeerMain
4335 Kafka
----xcall : jps from s300 ----
6088 NodeManager
6409 Jps
4330 Kafka
5595 DataNode
5836 JournalNode
3612 QuorumPeerMain
----xcall : jps from s400 ----
4242 Kafka
5241 DataNode
5738 NodeManager
5482 JournalNode
6059 Jps
----xcall : jps from s500 ----
5317 Jps
5064 DFSZKFailoverController
4826 NameNode
c. Check the web ui
http://s100:50070
2. Start the hbase cluster [s100 s500 / s200 s300 s400]
a. Start the cluster from s100
$s100> start-hbase.sh
b. Start the backup master on s500
$s500> hbase-daemon.sh start master
c. Check the web ui
http://s100:16010
3. Create the hbase namespace and table
a. Enter the hbase shell
$s100> hbase shell
b. Create the namespace and the table
$s100> create_namespace 'call'
$s100> create 'call:calllogs','f1'
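(Optional, not in the original steps) Since every rowkey will carry a two-digit salt
prefix in [00-99] (see the rowkey design in step h below), the table can instead be
created pre-split on those prefixes, so the initial load spreads across regions
rather than filling one region first:
$s100> create 'call:calllogs', 'f1', SPLITS => ['10','20','30','40','50','60','70','80','90']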
4. Implementation -- create a kafka consumer that subscribes to the calllog topic
a. Create the module CalllogCustomerModel and add maven support
b. Add the maven dependencies
<modelVersion>4.0.0</modelVersion>
<groupId>groupId</groupId>
<artifactId>CalllogCustomerModel</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.11</artifactId>
        <version>0.10.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>1.2.4</version>
    </dependency>
</dependencies>
c. Create the package
calllog.kafka.hbase.customer
d. Write the properties file [resources/kafka.properties]
zookeeper.connect=s100:2181,s200:2181,s300:2181
group.id=calllog
zookeeper.session.timeout.ms=500
zookeeper.sync.time.ms=250
auto.commit.interval.ms=1000
# consume from the beginning
auto.offset.reset=smallest
# topic
topic=calllog
# table name
table.name=call:calllogs
# number of salt buckets (rowkey prefixes 00-99)
partition.number=100
# caller flag
caller.flag=0
# format pattern for the hash (salt) prefix
hashcode.pattern=00
e. Copy hbase-site.xml into the resources directory
<configuration>
    <property>
        <name>hbase.cluster.distributed</name>
        <value>true</value>
    </property>
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://mycluster/hbase</value>
    </property>
    <property>
        <name>hbase.zookeeper.quorum</name>
        <value>192.168.43.131:2181,192.168.43.132:2181,192.168.43.133:2181</value>
    </property>
    <property>
        <name>hbase.zookeeper.property.dataDir</name>
        <value>/home/ubuntu/zookeeper</value>
    </property>
</configuration>
f. Copy hdfs-site.xml into the resources directory
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.hosts</name>
        <value>/soft/hadoop/etc/dfs-hosts-include.conf</value>
    </property>
    <property>
        <name>dfs.hosts.exclude</name>
        <value>/soft/hadoop/etc/dfs-hosts-exclude.conf</value>
    </property>
    <property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
    </property>
    <property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn1</name>
        <value>s100:8020</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn2</name>
        <value>s500:8020</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.mycluster.nn1</name>
        <value>s100:50070</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.mycluster.nn2</name>
        <value>s500:50070</value>
    </property>
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://s200:8485;s300:8485;s400:8485/mycluster</value>
    </property>
    <property>
        <name>dfs.client.failover.proxy.provider.mycluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>
            sshfence
            shell(/bin/true)
        </value>
    </property>
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/ubuntu/.ssh/id_rsa</value>
    </property>
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/home/ubuntu/hadoop/journal</value>
    </property>
</configuration>
g. Write the utility class PropertiesUtil -- loads kafka.properties from the classpath
---------------------------------------------------
package calllog.kafka.hbase.customer;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class PropertiesUtil {
    public static Properties props;

    static {
        try {
            // load the properties file from the classpath
            InputStream is = ClassLoader.getSystemResourceAsStream("kafka.properties");
            props = new Properties();
            props.load(is);
            is.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Get a property value by key.
     */
    public static String getProp(String key) {
        return props.getProperty(key);
    }
}
h. Write the HbaseDao class -- the hbase data access object; all puts go through the dao
1) Rowkey design: pack in the fields most queries need, and keep every field fixed-length:
salt[00-99] , 1_id[primary number] , time , flag[0/1 caller/callee] , 2_id[other number] , duration
salt[00-99] = hash(1_id[last four digits] , time[yyyyMM]) % 100[bucket count] -- implemented below as XOR of the two parsed integers
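For example, hand-checking the formula as implemented below: caller 15811111111
calling in March 2018 gives last4 = 1111 and yyyyMM = 201803; 1111 ^ 201803 = 200732,
and 200732 % 100 = 32, so the record lands in salt bucket 32 and its rowkey begins
with "32,15811111111,2018/03/...".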
2) Code
--------------------------------------
package calllog.kafka.hbase.customer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;
import java.text.DecimalFormat;

/**
 * hbase data access object.
 */
public class HbaseDao {
    private Table table = null;
    private DecimalFormat df = new DecimalFormat();
    // number of salt buckets used in the rowkey
    private int partitions;
    // 0 -- caller, 1 -- callee
    private String flag;

    public HbaseDao() {
        try {
            // load hbase-site.xml / hdfs-site.xml from the classpath
            Configuration conf = HBaseConfiguration.create();
            // create the connection through the factory
            Connection conn = ConnectionFactory.createConnection(conf);
            // get the table
            TableName tbName = TableName.valueOf(PropertiesUtil.getProp("table.name"));
            table = conn.getTable(tbName);
            df.applyPattern(PropertiesUtil.getProp("hashcode.pattern"));
            partitions = Integer.parseInt(PropertiesUtil.getProp("partition.number"));
            flag = PropertiesUtil.getProp("caller.flag");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Put one log line into hbase.
     */
    public void put(String log) {
        if (log == null || log.equals("")) {
            return;
        }
        try {
            // rowkey to build
            String rowKey = "";
            // parse the log line: caller,callee,time,duration
            String[] strs = log.split(",");
            if (strs != null && strs.length == 4) {
                String caller = strs[0];
                String callee = strs[1];
                String time = strs[2];
                String duration = strs[3];
                // compute the salt prefix
                String hash = getRegionNumber(caller, time);
                // argument order matches the signature: hash, caller, time, flag, callee, duration
                rowKey = getRowkey(hash, caller, time, flag, callee, duration);
                // put the row
                Put p = new Put(Bytes.toBytes(rowKey));
                p.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("caller"), Bytes.toBytes(caller));
                p.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("callee"), Bytes.toBytes(callee));
                p.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("callTime"), Bytes.toBytes(time));
                p.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("callDuration"), Bytes.toBytes(duration));
                table.put(p);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Compute the salt prefix for the rowkey.
     */
    public String getRegionNumber(String caller, String calltime) {
        // last four digits of the phone number
        String last4Code = caller.substring(caller.length() - 4);
        // yyyyMM of the call time; the format is yyyy/MM/dd HH:mm:ss, so skip the slash
        String yearMonth = calltime.substring(0, 4) + calltime.substring(5, 7);
        int hash = (Integer.parseInt(last4Code) ^ Integer.parseInt(yearMonth)) % partitions;
        return df.format(hash);
    }

    /**
     * Assemble the rowkey.
     */
    public String getRowkey(String hash, String caller, String time, String flag, String callee, String dur) {
        return hash + "," + caller + "," + time + "," + flag + "," + callee + "," + dur;
    }
}
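A quick smoke test of the dao (a hypothetical snippet, assuming hbase-site.xml and
hdfs-site.xml are on the classpath): one well-formed line should yield one row whose
key starts with the computed salt.
public static void main(String[] args) {
    HbaseDao dao = new HbaseDao();
    // one line in the generator's caller,callee,time,duration format
    dao.put("15811111111,13269364444,2018/03/05 10:23:45,068");
}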
i. Write the main class -- HbaseCustomer
---------------------------------------------
package calllog.kafka.hbase.customer;

import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import kafka.message.MessageAndMetadata;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * hbase consumer: pulls log messages from kafka and stores them in hbase.
 */
public class HbaseCustomer {
    public static void main(String[] args) {
        // hbase dao
        HbaseDao dao = new HbaseDao();
        // consumer config built from kafka.properties
        ConsumerConfig config = new ConsumerConfig(PropertiesUtil.props);
        // create the consumer
        ConsumerConnector consumer = Consumer.createJavaConsumerConnector(config);
        // subscribe to the topic with one stream
        String topic = PropertiesUtil.getProp("topic");
        Map<String, Integer> map = new HashMap<String, Integer>();
        map.put(topic, new Integer(1));
        // start consuming
        Map<String, List<KafkaStream<byte[], byte[]>>> kafkaMsg = consumer.createMessageStreams(map);
        List<KafkaStream<byte[], byte[]>> msgList = kafkaMsg.get(topic);
        for (KafkaStream<byte[], byte[]> msg : msgList) {
            ConsumerIterator<byte[], byte[]> mm = msg.iterator();
            while (mm.hasNext()) {
                MessageAndMetadata<byte[], byte[]> next = mm.next();
                // extract the message body
                String kafka_hbaseMsg = new String(next.message());
                // write it to hbase
                dao.put(kafka_hbaseMsg);
            }
        }
    }
}
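Note: the module pins kafka 0.10.0.1, whose old high-level consumer (kafka.javaapi,
used above) was deprecated in later kafka releases. As a sketch only, assuming the
same brokers and topic, the equivalent loop with the new
org.apache.kafka.clients.consumer API would look like this:
package calllog.kafka.hbase.customer;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.util.Collections;
import java.util.Properties;

public class HbaseCustomerNewApi {
    public static void main(String[] args) {
        Properties props = new Properties();
        // the new consumer talks to the brokers directly, not to zookeeper
        props.put("bootstrap.servers", "s200:9092,s300:9092,s400:9092");
        props.put("group.id", "calllog");
        // "earliest" is the new-consumer spelling of the old "smallest"
        props.put("auto.offset.reset", "earliest");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
        HbaseDao dao = new HbaseDao();
        KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<byte[], byte[]>(props);
        consumer.subscribe(Collections.singletonList("calllog"));
        while (true) {
            ConsumerRecords<byte[], byte[]> records = consumer.poll(1000);
            for (ConsumerRecord<byte[], byte[]> record : records) {
                // same sink as before: write each message to hbase
                dao.put(new String(record.value()));
            }
        }
    }
}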
j. Build a jar with its dependencies in idea
File --> Project Structure --> Artifacts --> ...
k. Test on Windows
cmd> java -cp CalllogCustomerModel.jar calllog.kafka.hbase.customer.HbaseCustomer
l. Put the jar in the shared folder, run it on ubuntu, and confirm the program works
$> java -cp CalllogCustomerModel.jar calllog.kafka.hbase.customer.HbaseCustomer
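m. (Suggested check) Confirm that rows are arriving with a limited scan in the hbase shell:
$s100> hbase shell
hbase> scan 'call:calllogs', {LIMIT => 10}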
七、Write the Web App: Pull Data from hbase and Visualize It
--------------------------------------------------------
1. Import the SSM project from the previous lesson
File --> Project Structure --> Modules --> + --> ssm.iml --> maven '+' add pom.xml
2. Web setup:
a. Settings --> Application Servers --> add the tomcat server
b. File --> Project Structure --> Artifacts --> + add the ssmweb module --> + add the dependent jars and external config files from the right-hand pane
c. Run --> Edit Configurations --> add a local tomcat app --> Deployment + --> add your own web module (ssm) --> enable hot deployment
3. Run it and open http://localhost:8080/user/findall?pn=1 to test
4. Add the Calllog class to the domain package
----------------------------------------------
package com.it18zhang.ssm.domain;
/**
* Domain class for calllog -- a standard javabean
*/
public class Calllog {
private String caller;
private String callee;
private String callTime;
private String callDuration;
public String getCaller() {
return caller;
}
public void setCaller(String caller) {
this.caller = caller;
}
public String getCallee() {
return callee;
}
public void setCallee(String callee) {
this.callee = callee;
}
public String getCallTime() {
return callTime;
}
public void setCallTime(String callTime) {
this.callTime = callTime;
}
public String getCallDuration() {
return callDuration;
}
public void setCallDuration(String callDuration) {
this.callDuration = callDuration;
}
}
5. Add the calllog service interface CalllogService.java
------------------------------------------------------------
package com.it18zhang.ssm.service;

import com.it18zhang.ssm.domain.Calllog;

import java.util.List;

/**
 * Calllog service interface -- defines the data-access contract for the web layer
 */
public interface CalllogService {
    // query all calllogs
    public List<Calllog> findAll();
}
6. Add CalllogServiceImpl, the CalllogService implementation that talks to hbase
-------------------------------------------------------------------------
a. Copy the required config files [hbase-site.xml / hdfs-site.xml] into the resources directory
b. Add the maven dependencies
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.11</artifactId>
    <version>0.10.0.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hbase</groupId>
    <artifactId>hbase-client</artifactId>
    <version>1.2.4</version>
</dependency>
c. Write the class CalllogServiceImpl
------------------------------------------
package com.it18zhang.ssm.service.impl;

import com.it18zhang.ssm.domain.Calllog;
import com.it18zhang.ssm.service.CalllogService;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.*;

/**
 * CalllogService implementation backed by hbase.
 */
@Service("calllogService")
public class CalllogServiceImpl implements CalllogService {
    private Table table;

    public CalllogServiceImpl() {
        try {
            // load hbase-site.xml / hdfs-site.xml from the classpath
            Configuration conf = HBaseConfiguration.create();
            // create the connection through the factory
            Connection conn = ConnectionFactory.createConnection(conf);
            // get the table
            TableName tbName = TableName.valueOf("call:calllogs");
            table = conn.getTable(tbName);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Query all calllogs with a full-table scan.
     */
    public List<Calllog> findAll() {
        List<Calllog> list = new ArrayList<Calllog>();
        try {
            // scan the whole table
            Scan scan = new Scan();
            ResultScanner rs = table.getScanner(scan);
            Iterator<Result> it = rs.iterator();
            byte[] family = Bytes.toBytes("f1");
            byte[] callerf = Bytes.toBytes("caller");
            byte[] calleef = Bytes.toBytes("callee");
            byte[] callTimef = Bytes.toBytes("callTime");
            byte[] callDurationf = Bytes.toBytes("callDuration");
            Calllog calllog = null;
            while (it.hasNext()) {
                Result next = it.next();
                String caller = Bytes.toString(next.getValue(family, callerf));
                String callee = Bytes.toString(next.getValue(family, calleef));
                String callTime = Bytes.toString(next.getValue(family, callTimef));
                String callDuration = Bytes.toString(next.getValue(family, callDurationf));
                calllog = new Calllog();
                calllog.setCaller(caller);
                calllog.setCallee(callee);
                calllog.setCallTime(callTime);
                calllog.setCallDuration(callDuration);
                list.add(calllog);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }
}
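The full-table scan above is fine for a demo but expensive at real volumes. Because
the salt is derived from the caller's last four digits plus yyyyMM, all of one
caller's records for a given month share a single salt bucket, so a targeted query is
one short prefix scan. A hypothetical helper (my sketch, not part of the lesson; it
assumes the rowkey layout salt,caller,time,... from section six and needs
java.text.DecimalFormat imported):
/**
 * Fetch one caller's records for one month (yearMonth like "201803").
 */
public List<Calllog> findByCallerMonth(String caller, String yearMonth) throws IOException {
    // recompute the salt exactly as HbaseDao.getRegionNumber does
    String last4 = caller.substring(caller.length() - 4);
    DecimalFormat df = new DecimalFormat("00");
    String salt = df.format((Integer.parseInt(last4) ^ Integer.parseInt(yearMonth)) % 100);
    // rowkey prefix: salt,caller,yyyy/MM
    String prefix = salt + "," + caller + "," + yearMonth.substring(0, 4) + "/" + yearMonth.substring(4);
    // '~' sorts after any digit or '/', closing the prefix range
    Scan scan = new Scan(Bytes.toBytes(prefix), Bytes.toBytes(prefix + "~"));
    List<Calllog> list = new ArrayList<Calllog>();
    for (Result r : table.getScanner(scan)) {
        Calllog log = new Calllog();
        log.setCaller(Bytes.toString(r.getValue(Bytes.toBytes("f1"), Bytes.toBytes("caller"))));
        log.setCallee(Bytes.toString(r.getValue(Bytes.toBytes("f1"), Bytes.toBytes("callee"))));
        log.setCallTime(Bytes.toString(r.getValue(Bytes.toBytes("f1"), Bytes.toBytes("callTime"))));
        log.setCallDuration(Bytes.toString(r.getValue(Bytes.toBytes("f1"), Bytes.toBytes("callDuration"))));
        list.add(log);
    }
    return list;
}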
7. Add CalllogController -- renders the web view
-------------------------------------------------------
package com.it18zhang.ssm.web.controller;

import com.it18zhang.ssm.domain.Calllog;
import com.it18zhang.ssm.service.CalllogService;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.RequestMapping;

import javax.annotation.Resource;
import java.util.List;

@Controller
public class CalllogController {
    @Resource(name = "calllogService")
    private CalllogService cs;

    @RequestMapping("calllog/findAll")
    public String findAll(Model model) {
        List<Calllog> list = cs.findAll();
        model.addAttribute("calllogs", list);
        return "calllog/calllogList";
    }
}
8. Add the jsp page calllog/calllogList.jsp
<%@ page contentType="text/html;charset=UTF-8" language="java" %>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c" %>
<html>
<head>
    <title>通话记录</title>
</head>
<body>
<table border="1">
    <tr>
        <th>主叫</th>
        <th>被叫</th>
        <th>通话时间</th>
        <th>通话时长</th>
    </tr>
    <c:forEach items="${calllogs}" var="log">
        <tr>
            <td>${log.caller}</td>
            <td>${log.callee}</td>
            <td>${log.callTime}</td>
            <td>${log.callDuration}</td>
        </tr>
    </c:forEach>
</table>
</body>
</html>
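Open http://localhost:8080/calllog/findAll (the path set by the controller's
@RequestMapping) to view the collected call records.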