Simulated business scenario:
A large e-commerce site needs to analyze each day's transaction volume. Running that analysis directly against MySQL is very slow and can even bring MySQL down. Analyzing data at this scale requires synchronizing the MySQL data into a storage system built for massive datasets (HDFS, HBase). So how should the data be exported?
Solution 1: Sqoop
Problem
MySQL is already under heavy load. When Sqoop runs an export it issues additional SQL queries against MySQL, increasing the load and slowing MySQL down even further.
Solution 2: Logstash
Problem
Logstash also has to run SQL statements inside MySQL, which likewise adds load and slows MySQL down.
Solution 3: Canal
Canal takes a different approach: instead of querying MySQL, it reads row changes from the binlog over MySQL's replication protocol, so the export adds almost no query load to the database.
Hands-on steps:
SQLyog is a convenient tool for creating the database and its tables; a sketch of the DDL is given below.
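The tutorial itself does not list the DDL. The following is a minimal sketch inferred from the INSERT statements and the sample output later in this section; the column types and lengths are assumptions.

CREATE DATABASE IF NOT EXISTS pyg;  -- database name taken from the sample output below
USE pyg;

CREATE TABLE commodity (
    commodityId     INT PRIMARY KEY,   -- assumed type
    commodityName   VARCHAR(100),      -- assumed length
    commodityTypeId INT,
    originalPrice   DOUBLE,
    activityPrice   DOUBLE
);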
Introduction to the binlog
The binlog (binary log) records every change made to MySQL data (DDL and DML). It is the basis of MySQL replication, and it is the log that Canal reads. It supports three formats, STATEMENT, ROW, and MIXED; Canal needs ROW format, which records the changed values of every affected row.
Enabling the binlog
Steps
1. Edit MySQL's configuration file (typically /etc/my.cnf; the exact path varies by installation) and add the following entries under the [mysqld] section.
Configuration notes
# Path where binlog files are stored; file names use the prefix mysql-bin
log-bin=/var/lib/mysql/mysql-bin
# Record the full detail of every changed row
binlog-format=ROW
# Server ID of this machine (must not repeat within a MySQL cluster)
server_id=1
2. Restart MySQL
service mysql restart
3. Log in with mysql -u root -p and run the following command
show variables like '%log_bin%';
4. MySQL printing output like the following means the binlog has been enabled successfully
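The exact variable list differs between MySQL versions; the row that matters is log_bin = ON. A typical result looks roughly like this:

+---------------+-------+
| Variable_name | Value |
+---------------+-------+
| log_bin       | ON    |
| sql_log_bin   | ON    |
+---------------+-------+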
5. Go into /var/lib/mysql and you can see that the mysql-bin.000001 file has been generated
How Canal works
1. Canal implements the MySQL slave interaction protocol and disguises itself as a MySQL slave.
2. It sends the dump command to the MySQL master.
3. The master receives the dump request and starts pushing the binary log to the slave (Canal).
4. Canal parses the binary-log byte stream.
Building the Canal collector
Use Java to parse the binlog entries delivered by Canal and write them to Kafka.
Write the configuration-loading code
Implementation
Create a GlobalConfigUtil class that reads the settings in application.properties, using the following pattern:
ResourceBundle bundle = ResourceBundle.getBundle("configFileBaseName", Locale.ENGLISH);
String host = bundle.getString("propertyKey");
Read the canal and kafka settings out of application.properties.
Write a main method to verify that the configuration is read correctly.
Reference code
import java.util.Locale;
import java.util.ResourceBundle;

/**
 * Reads the application settings from application.properties on the classpath.
 */
public class GlobalConfigUtil {
    public static ResourceBundle bundle = ResourceBundle.getBundle("application", Locale.ENGLISH);

    public static String canalHost = bundle.getString("canal.host");
    public static String canalPort = bundle.getString("canal.port");
    public static String canalInstance = bundle.getString("canal.instance");
    public static String mysqlUsername = bundle.getString("mysql.username");
    public static String mysqlPassword = bundle.getString("mysql.password");
    public static String kafkaBootstrapServers = bundle.getString("kafka.bootstrap.servers");
    public static String kafkaZookeeperConnect = bundle.getString("kafka.zookeeper.connect");
    public static String kafkaInputTopic = bundle.getString("kafka.input.topic");

    public static void main(String[] args) {
        // Print every setting to verify the file is found and parsed correctly
        System.out.println(canalHost);
        System.out.println(canalPort);
        System.out.println(canalInstance);
        System.out.println(mysqlUsername);
        System.out.println(mysqlPassword);
        System.out.println(kafkaBootstrapServers);
        System.out.println(kafkaZookeeperConnect);
        System.out.println(kafkaInputTopic);
    }
}
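With the application.properties shown at the end of this section, running GlobalConfigUtil.main should print one value per line:

cdh1
11111
example
root
123456
cdh1:9092,cdh2:9092,cdh3:9092
cdh1:2181,cdh2:2181,cdh3:2181
canal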
Note:
ResourceBundle.getBundle("application", Locale.ENGLISH) loads application.properties from the classpath; the base name is given without the .properties suffix. In a Maven project, place the file under src/main/resources so it ends up on the classpath.
Write the Kafka utility class KafkaSender.java
import java.util.Properties;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;
import kafka.serializer.StringEncoder;

/**
 * Utility class for producing messages to Kafka (legacy 0.8-style producer API).
 */
public class KafkaSender {
    private String topic;

    public KafkaSender(String topic) {
        super();
        this.topic = topic;
    }

    /**
     * Send a message to the given Kafka topic.
     *
     * @param topic topic name
     * @param key   message key
     * @param data  message payload
     */
    public static void sendMessage(String topic, String key, String data) {
        Producer<String, String> producer = createProducer();
        producer.send(new KeyedMessage<String, String>(topic, key, data));
    }

    /**
     * Create a producer instance.
     */
    private static Producer<String, String> createProducer() {
        Properties properties = new Properties();
        properties.put("metadata.broker.list", GlobalConfigUtil.kafkaBootstrapServers);
        properties.put("zookeeper.connect", GlobalConfigUtil.kafkaZookeeperConnect);
        properties.put("serializer.class", StringEncoder.class.getName());
        return new Producer<String, String>(new ProducerConfig(properties));
    }
}
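A quick smoke test for the sender. This is a sketch: KafkaSenderTest is not part of the original project, and it assumes the brokers in application.properties are reachable and that the topic exists or topic auto-creation is enabled.

public class KafkaSenderTest {
    public static void main(String[] args) {
        // Send one throwaway message to the configured input topic
        KafkaSender.sendMessage(GlobalConfigUtil.kafkaInputTopic, "test-key", "{\"test\":1}");
    }
}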
Canal binlog-parsing utility class
This class does two things:
1. Parse the binlog entries fetched from Canal.
2. Write the parsed data to Kafka.
Test the utility class with the code below.
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.otter.canal.client.CanalConnector;
import com.alibaba.otter.canal.client.CanalConnectors;
import com.alibaba.otter.canal.protocol.CanalEntry;
import com.alibaba.otter.canal.protocol.Message;

/**
 * Utility class that parses binlog entries fetched from Canal.
 */
public class CanalClient {

    /** One column of a changed row: its name, its value, and whether it was updated. */
    static class ColumnValuePair {
        private String columnName;
        private String columnValue;
        private Boolean isValid;

        public ColumnValuePair(String columnName, String columnValue, Boolean isValid) {
            this.columnName = columnName;
            this.columnValue = columnValue;
            this.isValid = isValid;
        }

        public String getColumnName() { return columnName; }
        public void setColumnName(String columnName) { this.columnName = columnName; }
        public String getColumnValue() { return columnValue; }
        public void setColumnValue(String columnValue) { this.columnValue = columnValue; }
        public Boolean getValid() { return isValid; }
        public void setValid(Boolean valid) { isValid = valid; }
    }
    /**
     * Open a Canal connection.
     *
     * @param host     canal server host
     * @param port     canal server port
     * @param instance canal instance name
     * @param username user name
     * @param password password
     * @return the canal connector
     */
    public static CanalConnector getConn(String host, int port, String instance, String username, String password) {
        return CanalConnectors.newSingleConnector(new InetSocketAddress(host, port), instance, username, password);
    }
    /**
     * Parse a batch of binlog entries.
     *
     * @param entries    binlog entries fetched from Canal
     * @param emptyCount sequence number of this batch
     */
    public static void analysis(List<CanalEntry.Entry> entries, int emptyCount) {
        for (CanalEntry.Entry entry : entries) {
            // Skip transaction begin/end markers; only row data is parsed
            if (entry.getEntryType() == CanalEntry.EntryType.TRANSACTIONBEGIN ||
                entry.getEntryType() == CanalEntry.EntryType.TRANSACTIONEND) {
                continue;
            }
            // Parse the binlog row change
            CanalEntry.RowChange rowChange;
            try {
                rowChange = CanalEntry.RowChange.parseFrom(entry.getStoreValue());
            } catch (Exception e) {
                e.printStackTrace();
                continue; // skip entries that fail to parse instead of dereferencing null
            }
            // Operation type (INSERT / DELETE / UPDATE)
            CanalEntry.EventType eventType = rowChange.getEventType();
            // Name of the binlog file
            String logfileName = entry.getHeader().getLogfileName();
            // Offset of this operation within the binlog file
            long logfileOffset = entry.getHeader().getLogfileOffset();
            // Database the operation belongs to
            String dbName = entry.getHeader().getSchemaName();
            // Table the operation belongs to
            String tableName = entry.getHeader().getTableName();
            // Execution time of the operation
            long timestamp = entry.getHeader().getExecuteTime();
            // Walk the changed rows
            for (CanalEntry.RowData rowData : rowChange.getRowDatasList()) {
                if (eventType == CanalEntry.EventType.DELETE) {
                    // For deletes, take the column values before the change
                    dataDetails(rowData.getBeforeColumnsList(), logfileName, logfileOffset, dbName, tableName, eventType, emptyCount, timestamp);
                } else if (eventType == CanalEntry.EventType.INSERT) {
                    // For inserts, take the column values after the change
                    dataDetails(rowData.getAfterColumnsList(), logfileName, logfileOffset, dbName, tableName, eventType, emptyCount, timestamp);
                } else {
                    // For updates, take the column values after the change
                    dataDetails(rowData.getAfterColumnsList(), logfileName, logfileOffset, dbName, tableName, eventType, emptyCount, timestamp);
                }
            }
        }
    }
    /**
     * Serialize one parsed binlog row and send it to Kafka.
     *
     * @param columns       all columns of the row
     * @param logFileName   binlog file name
     * @param logFileOffset offset of the operation within the binlog
     * @param dbName        database the operation belongs to
     * @param tableName     table the operation belongs to
     * @param eventType     operation type (INSERT / UPDATE / DELETE)
     * @param emptyCount    sequence number of the batch
     * @param timestamp     execution time of the operation
     */
    private static void dataDetails(List<CanalEntry.Column> columns,
                                    String logFileName,
                                    Long logFileOffset,
                                    String dbName,
                                    String tableName,
                                    CanalEntry.EventType eventType,
                                    int emptyCount,
                                    long timestamp) {
        // Collect the columns of the row, together with their values and update flags
        List<ColumnValuePair> columnValueList = new ArrayList<ColumnValuePair>();
        for (CanalEntry.Column column : columns) {
            ColumnValuePair columnValuePair = new ColumnValuePair(column.getName(), column.getValue(), column.getUpdated());
            columnValueList.add(columnValuePair);
        }
        String key = UUID.randomUUID().toString();
        JSONObject jsonObject = new JSONObject();
        jsonObject.put("logFileName", logFileName);
        jsonObject.put("logFileOffset", logFileOffset);
        jsonObject.put("dbName", dbName);
        jsonObject.put("tableName", tableName);
        jsonObject.put("eventType", eventType);
        jsonObject.put("columnValueList", columnValueList);
        jsonObject.put("emptyCount", emptyCount);
        jsonObject.put("timestamp", timestamp);
        // Serialize all parsed binlog fields as one JSON string
        String data = JSON.toJSONString(jsonObject);
        System.out.println(data);
        // Send the parsed record to Kafka
        KafkaSender.sendMessage(GlobalConfigUtil.kafkaInputTopic, key, data);
    }
    /**
     * Client entry point.
     */
    public static void main(String[] args) {
        // Load the configuration
        String host = GlobalConfigUtil.canalHost;
        int port = Integer.parseInt(GlobalConfigUtil.canalPort);
        String instance = GlobalConfigUtil.canalInstance;
        String username = GlobalConfigUtil.mysqlUsername;
        String password = GlobalConfigUtil.mysqlPassword;
        // Open the Canal connection
        CanalConnector conn = getConn(host, port, instance, username, password);
        // Read entries from the binlog
        int batchSize = 100;
        int emptyCount = 1;
        try {
            conn.connect();
            // Subscribe to all tables in all databases
            conn.subscribe(".*\\..*");
            conn.rollback();
            int totalCount = 120; // maximum number of batches to process
            while (totalCount > emptyCount) {
                // Fetch a batch without auto-acknowledging it
                Message message = conn.getWithoutAck(batchSize);
                long id = message.getId();
                int size = message.getEntries().size();
                if (id == -1 || size == 0) {
                    // Nothing was fetched this round
                } else {
                    // Data arrived: parse the binlog entries
                    analysis(message.getEntries(), emptyCount);
                    emptyCount++;
                }
                // Acknowledge the batch
                conn.ack(message.getId());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            conn.disconnect();
        }
    }
}
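The client code above assumes the Canal client library, Fastjson, and the legacy Kafka producer are on the classpath. A sketch of the Maven dependencies follows; the versions are assumptions and should be matched to your cluster:

<dependency>
    <groupId>com.alibaba.otter</groupId>
    <artifactId>canal.client</artifactId>
    <version>1.0.24</version>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.44</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.11</artifactId>
    <version>0.10.1.0</version>
</dependency>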
Insert some test data into MySQL to generate binlog events:
INSERT INTO commodity(commodityId, commodityName, commodityTypeId, originalPrice, activityPrice) VALUES (1, '耐克', 1, 888.00, 820.00);
INSERT INTO commodity(commodityId, commodityName, commodityTypeId, originalPrice, activityPrice) VALUES (2, '阿迪达斯', 1, 900.00, 870.00);
INSERT INTO commodity(commodityId, commodityName, commodityTypeId, originalPrice, activityPrice) VALUES (3, 'MacBook Pro', 2, 18000.00, 17500.00);
INSERT INTO commodity(commodityId, commodityName, commodityTypeId, originalPrice, activityPrice) VALUES (4, '联想', 2, 5500.00, 5320.00);
INSERT INTO commodity(commodityId, commodityName, commodityTypeId, originalPrice, activityPrice) VALUES (5, '索菲亚', 3, 35000.00, 30100.00);
INSERT INTO commodity(commodityId, commodityName, commodityTypeId, originalPrice, activityPrice) VALUES (6, '欧派', 3, 43000.00, 40000.00);
{"emptyCount":2,"logFileName":"mysql-
bin.000002","dbName":"pyg","logFileOffset":250,"eventType":"INSERT","columnValueList":
[{"columnName":"commodityId","columnValue":"1","isValid":"true"},
{"columnName":"commodityName","columnValue":"耐克","isValid":"true"},
{"columnName":"commodityTypeId","columnValue":"1","isValid":"true"},
{"columnName":"originalPrice","columnValue":"888.0","isValid":"true"},
{"columnName":"activityPrice","columnValue":"820.0","isValid":"true"}],"tableName":"commodity","timestamp":1553741346000}
The application.properties configuration file:
# Canal settings
canal.host=cdh1
canal.port=11111
canal.instance=example
mysql.username=root
mysql.password=123456
# Kafka settings
kafka.bootstrap.servers=cdh1:9092,cdh2:9092,cdh3:9092
kafka.zookeeper.connect=cdh1:2181,cdh2:2181,cdh3:2181
kafka.input.topic=canal
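To confirm end to end that the parsed binlog records reach Kafka, attach a console consumer to the canal topic. This assumes Kafka's CLI scripts are available on the PATH and the cdh1 host above resolves; the legacy consumer is addressed through ZooKeeper, matching the legacy producer used in KafkaSender:

kafka-console-consumer.sh --zookeeper cdh1:2181 --topic canal --from-beginning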