Integration approach: use Debezium to capture the MySQL binlog and publish the change events to Kafka, then use the kafka-connect-hdfs connector to consume those events from Kafka into Hive.
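The end-to-end flow, using the topic and table names that appear throughout this post:

MySQL binlog
  -> Debezium MySQL source connector (Kafka topic json.inventory.customers)
  -> kafka-connect-hdfs sink (Parquet/Avro files under /user/dts/topics)
  -> Hive external table dts.json_inventory_customers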
Debezium binlog connector (source) configuration
{
"name": "json-inventory-customers",
"config": {
"connector.class": "io.debezium.connector.mysql.MySqlConnector",
"tasks.max": "1",
"database.hostname": "psd-hadoop039",
"database.port": "3306",
"database.user": "debezium",
"database.password": "dbz",
"database.serverTimezone": "UTC",
"database.server.name": "json",
"database.whitelist": "inventory",
"database.history.kafka.bootstrap.servers": "psd-hadoop039:9092",
"database.history.kafka.topic": "dbhistory.json.inventory",
"table.whitelist": "inventory.customers",
"key.converter": "org.apache.kafka.connect.json.JsonConverter",
"value.converter": "org.apache.kafka.connect.json.JsonConverter",
"key.converter.schemas.enable":"true",
"value.converter.schemas.enable":"true",
"include.schema.changes": "true",
"transforms": "unwrap",
"binary.handling.mode": "hex",
"time.precision.mode": "connect",
"transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
"transforms.unwrap.drop.tombstones": "true",
"transforms.unwrap.delete.handling.mode": "rewrite",
"transforms.unwrap.add.headers": "name,db,table,op,db",
"transforms.unwrap.add.fields": "name,db,table,op,file,pos,row,ts_ms,source.ts_ms"
}
}
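For reference, a connector config like the one above is registered by POSTing the whole JSON document to the Kafka Connect REST API. The worker port 8083 below is only the Connect default, and the file name is an assumption:

curl -s -X POST -H "Content-Type: application/json" \
  --data @json-inventory-customers.json \
  http://psd-hadoop039:8083/connectors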
kafka-connect-hdfs sink configuration
{
"name": "json.inventory.customers.sink",
"config": {
"connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
"format.class": "io.confluent.connect.hdfs.parquet.ParquetFormat",
"tasks.max": "1",
"topics": "json.inventory.customers",
"hadoop.conf.dir":"/etc/hadoop/conf",
"store.url": "hdfs://cdhtest",
"logs.dir": "/user/dts/logs",
"topics.dir":"/user/dts/topics",
"flush.size": "1",
"rotate.interval.ms":"10000",
"hive.integration":true,
"hive.database":"dts",
"hive.metastore.uris":"thrift://cdh-10-21-17-95:9083",
"partitioner.class":"io.confluent.connect.hdfs.partitioner.HourlyPartitioner",
"locale":"zh",
"timezone":"Asia/Shanghai",
"path.format":"YYYYMMddHH/",
"schema.compatibility":"BACKWARD"
}
}
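With HourlyPartitioner the sink writes into year/month/day/hour directories under topics.dir, which is where the partition columns of the Hive table further below come from. A sketch of the resulting layout (the file name is only illustrative of the sink's topic+partition+offset naming scheme):

/user/dts/topics/json.inventory.customers/year=2020/month=08/day=04/hour=15/
  json.inventory.customers+0+0000000000+0000000000.parquet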
A record produced by the initial snapshot (flattened by the configured transform (SMT), so not Debezium's native envelope format):
{
"schema": {
"type": "struct",
"fields": [{
"type": "int32",
"optional": false,
"field": "id"
}, {
"type": "string",
"optional": false,
"field": "first_name"
}, {
"type": "string",
"optional": false,
"field": "last_name"
}, {
"type": "string",
"optional": false,
"field": "email"
}, {
"type": "string",
"optional": true,
"field": "__name"
}, {
"type": "string",
"optional": true,
"field": "__db"
}, {
"type": "string",
"optional": true,
"field": "__table"
}, {
"type": "string",
"optional": true,
"field": "__op"
}, {
"type": "string",
"optional": true,
"field": "__file"
}, {
"type": "int64",
"optional": true,
"field": "__pos"
}, {
"type": "int32",
"optional": true,
"field": "__row"
}, {
"type": "int64",
"optional": true,
"field": "__ts_ms"
}, {
"type": "int64",
"optional": true,
"field": "__source_ts_ms"
}, {
"type": "string",
"optional": true,
"field": "__deleted"
}],
"optional": false,
"name": "json.inventory.customers.Value"
},
"payload": {
"id": 1001,
"first_name": "Sally",
"last_name": "Thomas",
"email": "[email protected]",
"__name": "json",
"__db": "inventory",
"__table": "customers",
"__op": "c",
"__file": "mysql-bin.000003",
"__pos": 10524,
"__row": 0,
"__ts_ms": 1596448180633,
"__source_ts_ms": 0,
"__deleted": "false"
}
}
Data as it lands in Hive
Table DDL
CREATE EXTERNAL TABLE `json_inventory_customers`(
`id` int,
`first_name` string,
`last_name` string,
`email` string,
`__name` string,
`__db` string,
`__table` string,
`__op` string,
`__file` string,
`__pos` bigint,
`__row` int,
`__ts_ms` bigint,
`__source_ts_ms` bigint,
`__deleted` string)
PARTITIONED BY (
`year` string COMMENT '',
`month` string COMMENT '',
`day` string COMMENT '',
`hour` string COMMENT '')
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
'hdfs://cdhtest/user/dts/topics/json.inventory.customers'
TBLPROPERTIES (
'transient_lastDdlTime'='1596527679')
Data (a sample row)
json_inventory_customers.id 1001
json_inventory_customers.first_name Sally
json_inventory_customers.last_name Thomas
json_inventory_customers.email sally.thomas@acme.com
json_inventory_customers.__name json
json_inventory_customers.__db inventory
json_inventory_customers.__table customers
json_inventory_customers.__op c
json_inventory_customers.__file mysql-bin.000003
json_inventory_customers.__pos 10524
json_inventory_customers.__row 0
json_inventory_customers.__ts_ms 1596527669214
json_inventory_customers.__source_ts_ms 0
json_inventory_customers.__deleted false
json_inventory_customers.year 2020
json_inventory_customers.month 08
json_inventory_customers.day 04
json_inventory_customers.hour 15
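The listing above is a single row shown column by column; an explicit, partition-pruned query against the same table looks like this:

SELECT id, first_name, last_name, email, __op, __deleted
FROM dts.json_inventory_customers
WHERE `year` = '2020' AND `month` = '08' AND `day` = '04' AND `hour` = '15';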
Debezium snapshot, insert, update, and delete events are all consumed by kafka-connect-hdfs without problems. However, after adding a column to the source table, the next insert makes the sink fail as follows:
[2020-08-04 16:08:29,274] DEBUG Closing WAL (io.confluent.connect.hdfs.wal.FSWAL:154)
[2020-08-04 16:08:29,282] DEBUG WorkerSinkTask{id=json.customers.sink-0} Skipping offset commit, no change since last commit (org.apache.kafka.connect.runtime.WorkerSinkTask:431)
[2020-08-04 16:08:29,283] DEBUG WorkerSinkTask{id=json.customers.sink-0} Finished offset commit successfully in 9 ms for sequence number 84: null (org.apache.kafka.connect.runtime.WorkerSinkTask:268)
[2020-08-04 16:08:29,283] ERROR WorkerSinkTask{id=json.customers.sink-0} Task threw an uncaught and unrecoverable exception (org.apache.kafka.connect.runtime.WorkerTask:186)
org.apache.kafka.connect.errors.ConnectException: Exiting WorkerSinkTask due to unrecoverable exception.
at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:568)
at org.apache.kafka.connect.runtime.WorkerSinkTask.poll(WorkerSinkTask.java:326)
at org.apache.kafka.connect.runtime.WorkerSinkTask.iteration(WorkerSinkTask.java:228)
at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:196)
at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:184)
at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:234)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:407)
at io.confluent.connect.hdfs.DataWriter.write(DataWriter.java:386)
at io.confluent.connect.hdfs.HdfsSinkTask.put(HdfsSinkTask.java:127)
at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:546)
... 10 more
Caused by: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.validateAndCheck(StorageSchemaCompatibility.java:157)
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.shouldChangeSchema(StorageSchemaCompatibility.java:320)
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:361)
... 13 more
[2020-08-04 16:08:29,283] ERROR WorkerSinkTask{id=json.customers.sink-0} Task is being killed and will not recover until manually restarted (org.apache.kafka.connect.runtime.WorkerTask:187)
[2020-08-04 16:08:29,283] INFO Shutting down Hive executor service. (io.confluent.connect.hdfs.DataWriter:484)
[2020-08-04 16:08:29,283] INFO Awaiting termination. (io.confluent.connect.hdfs.DataWriter:489)
"Schema version required for BACKWARD compatibility" is clear enough: BACKWARD mode requires a schema version. So surely just switching the compatibility mode will fix it. (schema.compatibility accepts NONE, BACKWARD, FORWARD, and FULL.)
Delete the sink connector, reset the Kafka state, change schema.compatibility to NONE, and register the connector again.
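The delete/re-register part boils down to two calls against the Connect REST API (host and default port 8083 are assumptions, as is the config file name):

# drop the existing sink connector
curl -s -X DELETE http://psd-hadoop039:8083/connectors/json.inventory.customers.sink
# register it again with schema.compatibility changed to NONE
curl -s -X POST -H "Content-Type: application/json" \
  --data @json-inventory-customers-sink.json \
  http://psd-hadoop039:8083/connectors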
Unfortunately, Hive integration does not work with schema.compatibility=NONE; the connector fails to start with:
[2020-08-04 16:26:15,340] ERROR WorkerSinkTask{id=json.customers.sink-0} Task threw an uncaught and unrecoverable exception (org.apache.kafka.connect.runtime.WorkerTask:186)
org.apache.kafka.connect.errors.ConnectException: Couldn't start HdfsSinkConnector due to configuration error.
at io.confluent.connect.hdfs.HdfsSinkTask.start(HdfsSinkTask.java:90)
at org.apache.kafka.connect.runtime.WorkerSinkTask.initializeAndStart(WorkerSinkTask.java:305)
at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:193)
at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:184)
at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:234)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.kafka.common.config.ConfigException: Hive Integration requires schema compatibility to be BACKWARD, FORWARD or FULL
at io.confluent.connect.hdfs.HdfsSinkTask.start(HdfsSinkTask.java:63)
... 9 more
FORWARD and FULL fail the same way, and Google turns up nothing. So the real question is: where is this schema version supposed to come from? Time to walk through the source.
The exception is thrown from io.confluent.connect.storage.schema.StorageSchemaCompatibility#validateAndCheck:
protected boolean validateAndCheck(Schema valueSchema, Schema currentSchema) {
    if (currentSchema == null && valueSchema == null) {
        return false;
    } else if (currentSchema == valueSchema) {
        return false;
    } else if (currentSchema != null && valueSchema != null) {
        // For any compatibility mode other than NONE, both schemas must carry a version.
        if ((valueSchema.version() == null || currentSchema.version() == null) && this != NONE) {
            throw new SchemaProjectorException("Schema version required for " + this.toString() + " compatibility");
        } else {
            return this.check(valueSchema, currentSchema);
        }
    } else {
        throw new SchemaProjectorException("Switch between schema-based and schema-less data is not supported");
    }
}
Who calls this method? io.confluent.connect.hdfs.TopicPartitionWriter#write calls compatibility.shouldChangeSchema(record, null, currentSchema), which goes through io.confluent.connect.storage.schema.StorageSchemaCompatibility#shouldChangeSchema and ends up in StorageSchemaCompatibility#validateAndCheck.
In validateAndCheck, either valueSchema.version() or currentSchema.version() must be null; concretely, the valueSchema obtained from Schema valueSchema = record.valueSchema() in TopicPartitionWriter#write carries no version. And indeed, the JSON records captured above have no version field in their schema.
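A quick way to see the difference: a Connect schema only carries a version if something explicitly sets one. A minimal illustration (not connector source code), using the stock Kafka Connect data API:

import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;

public class SchemaVersionDemo {
    public static void main(String[] args) {
        // Built the way a JSON-converted record ends up: no version attached.
        Schema noVersion = SchemaBuilder.struct()
                .name("json.inventory.customers.Value")
                .field("id", Schema.INT32_SCHEMA)
                .build();
        System.out.println(noVersion.version());   // null -> trips the BACKWARD check

        // The same schema with a version stamped on, which is effectively what
        // the AvroConverter provides via the Schema Registry version.
        Schema withVersion = SchemaBuilder.struct()
                .name("avro.inventory.customers.Value")
                .field("id", Schema.INT32_SCHEMA)
                .version(1)
                .build();
        System.out.println(withVersion.version()); // 1
    }
}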
With the cause identified, the fix should be simple: how do we get a version onto the schema?
A look at the Debezium source shows that it builds these schemas itself, and patching the connector is not realistic. In fairness, an open-source stack should not be this limited, so there had to be another way; one thought was to add the version through an SMT, but I found nothing suitable in Debezium (only later did I realize Kafka Connect itself ships an SMT for this; see the sketch after the Avro config below). While debugging I noticed that Avro-serialized records do carry version information, so I switched Debezium's serialization format to Avro with the configuration below. The HDFS sink configuration was left unchanged at this point.
{
"name": "avro-inventory-customers",
"config": {
"connector.class": "io.debezium.connector.mysql.MySqlConnector",
"tasks.max": "1",
"database.hostname": "psd-hadoop039",
"database.port": "3306",
"database.user": "debezium",
"database.password": "dbz",
"database.server.name": "avro",
"database.whitelist": "inventory",
"database.history.kafka.bootstrap.servers": "psd-hadoop039:9092",
"database.history.kafka.topic": "dbhistory.avro.inventory",
"table.whitelist": "inventory.customers",
"key.converter": "io.confluent.connect.avro.AvroConverter",
"key.converter.schema.registry.url": "http://psd-hadoop039:8081",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter.schema.registry.url": "http://psd-hadoop039:8081",
"include.schema.changes": "true",
"transforms": "unwrap",
"binary.handling.mode":"hex",
"time.precision.mode":"connect",
"transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
"transforms.unwrap.drop.tombstones": "true",
"transforms.unwrap.delete.handling.mode": "rewrite",
"transforms.unwrap.add.headers": "name,db,table,op,db",
"transforms.unwrap.add.fields": "name,db,table,op,file,pos,row,ts_ms,source.ts_ms"
}
}
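As an aside, the Kafka Connect SMT alluded to above is presumably SetSchemaMetadata, which can stamp a name and version onto the record's value schema. A hypothetical, untested addition to the earlier JSON-converter config would look like the lines below; note that a hard-coded version cannot follow schema evolution, which is one more reason to prefer the Avro route:

"transforms": "unwrap,version",
"transforms.version.type": "org.apache.kafka.connect.transforms.SetSchemaMetadata$Value",
"transforms.version.schema.version": "1"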
Note: for ease of deployment I used the Confluent Community edition, which is very simple to set up (a consolidated shell sketch follows these steps).
Download confluent-5.5.1-2.12.tar.gz and extract it.
Extract the connectors used here, debezium-debezium-connector-mysql-1.2.0 and confluentinc-kafka-connect-hdfs-5.5.1, into $confluentinc_home/share/java.
Start: bin/confluent local start
Stop: bin/confluent local stop
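Roughly, that amounts to (paths and archive layout are assumptions; adjust to however the connector plugins were downloaded):

tar -xzf confluent-5.5.1-2.12.tar.gz
cd confluent-5.5.1
# copy the connector plugins into share/java so the Connect worker can load them
cp -r /path/to/debezium-debezium-connector-mysql-1.2.0 share/java/
cp -r /path/to/confluentinc-kafka-connect-hdfs-5.5.1 share/java/
bin/confluent local start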
With this setup, adding, modifying, and dropping columns no longer break the HDFS sink. But querying the data through Hive now fails with:
Bad status for request TFetchResultsReq(fetchType=0, operationHandle=TOperationHandle(hasResultSet=True, modifiedRowCount=None, operationType=0, operationId=THandleIdentifier(secret='\xea\x99B\x16)\x94I\xdb\xa4\xd1\x8d\xb1_\xf8\\\xff', guid='\x11z\xff\x04\xb1\xdbM\x15\x8f;G d\xe2G}')), orientation=4, maxRows=100): TFetchResultsResp(status=TStatus(errorCode=0, errorMessage='java.io.IOException: java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable', sqlState=None, infoMessages=['*org.apache.hive.service.cli.HiveSQLException:java.io.IOException: java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable:14:13', 'org.apache.hive.service.cli.operation.SQLOperation:getNextRowSet:SQLOperation.java:375', 'org.apache.hive.service.cli.operation.OperationManager:getOperationNextRowSet:OperationManager.java:287', 'org.apache.hive.service.cli.session.HiveSessionImpl:fetchResults:HiveSessionImpl.java:752', 'org.apache.hive.service.cli.CLIService:fetchResults:CLIService.java:438', 'org.apache.hive.service.cli.thrift.ThriftCLIService:FetchResults:ThriftCLIService.java:689', 'org.apache.hive.service.cli.thrift.TCLIService$Processor$FetchResults:getResult:TCLIService.java:1553', 'org.apache.hive.service.cli.thrift.TCLIService$Processor$FetchResults:getResult:TCLIService.java:1538', 'org.apache.thrift.ProcessFunction:process:ProcessFunction.java:39', 'org.apache.thrift.TBaseProcessor:process:TBaseProcessor.java:39', 'org.apache.hive.service.auth.TSetIpAddressProcessor:process:TSetIpAddressProcessor.java:56', 'org.apache.thrift.server.TThreadPoolServer$WorkerProcess:run:TThreadPoolServer.java:286', 'java.util.concurrent.ThreadPoolExecutor:runWorker:ThreadPoolExecutor.java:1149', 'java.util.concurrent.ThreadPoolExecutor$Worker:run:ThreadPoolExecutor.java:624', 'java.lang.Thread:run:Thread.java:748', '*java.io.IOException:java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable:18:4', 'org.apache.hadoop.hive.ql.exec.FetchOperator:getNextRow:FetchOperator.java:508', 'org.apache.hadoop.hive.ql.exec.FetchOperator:pushRow:FetchOperator.java:415', 'org.apache.hadoop.hive.ql.exec.FetchTask:fetch:FetchTask.java:138', 'org.apache.hadoop.hive.ql.Driver:getResults:Driver.java:1797', 'org.apache.hive.service.cli.operation.SQLOperation:getNextRowSet:SQLOperation.java:370', '*java.lang.ClassCastException:org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable:22:4', 'org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector:get:WritableIntObjectInspector.java:36', 'org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils:getString:PrimitiveObjectInspectorUtils.java:810', 'org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter$StringConverter:convert:PrimitiveObjectInspectorConverter.java:453', 'org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters$StructConverter:convert:ObjectInspectorConverters.java:396', 'org.apache.hadoop.hive.ql.exec.FetchOperator:getNextRow:FetchOperator.java:491'], statusCode=3), results=None, hasMoreRows=None)
Some searching suggests the cause lies in how ParquetFormat records the file's schema metadata. One hacky workaround is to run set PARQUET_FALLBACK_SCHEMA_RESOLUTION=name; in Impala so that columns are resolved by name; it works, but it is not suitable for production. The other solution is to switch the sink to "format.class": "io.confluent.connect.hdfs.avro.AvroFormat". The final HDFS sink configuration is shown below; with it, column changes are handled cleanly.
{
"name": "avro.inventory.customers.sink",
"config": {
"connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
"format.class": "io.confluent.connect.hdfs.avro.AvroFormat",
"key.converter": "io.confluent.connect.avro.AvroConverter",
"key.converter.schema.registry.url": "http://psd-hadoop039:8081",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter.schema.registry.url": "http://psd-hadoop039:8081",
"key.converter.schemas.enable": "true",
"value.converter.schemas.enable": "true",
"tasks.max": "1",
"topics": "avro.inventory.customers",
"hadoop.conf.dir":"/etc/hadoop/conf",
"store.url": "hdfs://cdhtest",
"logs.dir": "/user/dts/logs",
"topics.dir":"/user/dts/topics",
"flush.size": "2",
"rotate.interval.ms":"10000",
"hive.integration":true,
"hive.database":"dts",
"hive.metastore.uris":"thrift://cdh-10-21-17-95:9083",
"partitioner.class":"io.confluent.connect.hdfs.partitioner.HourlyPartitioner",
"locale":"zh",
"timezone":"Asia/Shanghai",
"path.format":"YYYYMMddHH/",
"schema.compatibility":"BACKWARD"
}
}
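A quick sanity check after a column change, assuming the Hive table created from the avro.inventory.customers topic follows the same naming pattern as before (dts.avro_inventory_customers):

SELECT id, first_name, __op, __ts_ms, __deleted
FROM dts.avro_inventory_customers
ORDER BY __ts_ms DESC
LIMIT 10;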