Debezium Series: Pitfalls of Getting Debezium MySQL Binlog into Hive (Part 1)

Environment

  • kafka 2.5
  • debezium-debezium-connector-mysql-1.2.0
  • confluentinc-kafka-connect-hdfs-5.5.1 (the Hadoop 2.x build)

Integration approach: use Debezium to capture the MySQL binlog and publish it to Kafka, then use the kafka-connect-hdfs connector to consume those Kafka topics into Hive.

Debezium binlog source connector configuration

{
    "name": "json-inventory-customers",
    "config": {
        "connector.class": "io.debezium.connector.mysql.MySqlConnector",
        "tasks.max": "1",
        "database.hostname": "psd-hadoop039",
        "database.port": "3306",
        "database.user": "debezium",
        "database.password": "dbz",
        "database.serverTimezone": "UTC",
        "database.server.name": "json",
        "database.whitelist": "inventory",
        "database.history.kafka.bootstrap.servers": "psd-hadoop039:9092",
        "database.history.kafka.topic": "dbhistory.json.inventory",
        "table.whitelist": "inventory.customers",
        "key.converter": "org.apache.kafka.connect.json.JsonConverter",
        "value.converter": "org.apache.kafka.connect.json.JsonConverter",
        "key.converter.schemas.enable":"true",
        "value.converter.schemas.enable":"true",
        "include.schema.changes": "true",
        "transforms": "unwrap",
        "binary.handling.mode": "hex",
        "time.precision.mode": "connect",
        "transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
        "transforms.unwrap.drop.tombstones": "true",
        "transforms.unwrap.delete.handling.mode": "rewrite",
        "transforms.unwrap.add.headers": "name,db,table,op,db",
        "transforms.unwrap.add.fields": "name,db,table,op,file,pos,row,ts_ms,source.ts_ms"
    }
}
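
For reference, registering such a connector is just a POST of that JSON to the Kafka Connect REST API. A minimal sketch, assuming the Connect worker's REST endpoint is http://localhost:8083 and the JSON above is saved as json-inventory-customers.json (both are assumptions, adjust to your environment):

# Sketch: register the Debezium source connector with the Connect REST API.
curl -s -X POST -H "Content-Type: application/json" \
     --data @json-inventory-customers.json \
     http://localhost:8083/connectors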

kafka-connect-hdfs sink configuration

{
    "name": "json.inventory.customers.sink",
    "config": {
        "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
        "format.class": "io.confluent.connect.hdfs.parquet.ParquetFormat",
        "tasks.max": "1",
        "topics": "json.inventory.customers",
        "hadoop.conf.dir": "/etc/hadoop/conf",
        "store.url": "hdfs://cdhtest",
        "logs.dir": "/user/dts/logs",
        "topics.dir": "/user/dts/topics",
        "flush.size": "1",
        "rotate.interval.ms": "10000",
        "hive.integration": true,
        "hive.database": "dts",
        "hive.metastore.uris": "thrift://cdh-10-21-17-95:9083",
        "partitioner.class": "io.confluent.connect.hdfs.partitioner.HourlyPartitioner",
        "locale": "zh",
        "timezone": "Asia/Shanghai",
        "path.format": "YYYYMMddHH/",
        "schema.compatibility": "BACKWARD"
    }
}
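
The sink is registered the same way, and its health can then be checked through the REST API. A sketch under the same assumptions (worker at http://localhost:8083, config saved as json.inventory.customers.sink.json):

# Sketch: register the HDFS sink and check its status.
curl -s -X POST -H "Content-Type: application/json" \
     --data @json.inventory.customers.sink.json \
     http://localhost:8083/connectors
curl -s http://localhost:8083/connectors/json.inventory.customers.sink/status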

A record produced by the initial snapshot (with the configured SMT applied, i.e. not Debezium's native change-event envelope)

{
	"schema": {
		"type": "struct",
		"fields": [{
			"type": "int32",
			"optional": false,
			"field": "id"
		}, {
			"type": "string",
			"optional": false,
			"field": "first_name"
		}, {
			"type": "string",
			"optional": false,
			"field": "last_name"
		}, {
			"type": "string",
			"optional": false,
			"field": "email"
		}, {
			"type": "string",
			"optional": true,
			"field": "__name"
		}, {
			"type": "string",
			"optional": true,
			"field": "__db"
		}, {
			"type": "string",
			"optional": true,
			"field": "__table"
		}, {
			"type": "string",
			"optional": true,
			"field": "__op"
		}, {
			"type": "string",
			"optional": true,
			"field": "__file"
		}, {
			"type": "int64",
			"optional": true,
			"field": "__pos"
		}, {
			"type": "int32",
			"optional": true,
			"field": "__row"
		}, {
			"type": "int64",
			"optional": true,
			"field": "__ts_ms"
		}, {
			"type": "int64",
			"optional": true,
			"field": "__source_ts_ms"
		}, {
			"type": "string",
			"optional": true,
			"field": "__deleted"
		}],
		"optional": false,
		"name": "json.inventory.customers.Value"
	},
	"payload": {
		"id": 1001,
		"first_name": "Sally",
		"last_name": "Thomas",
		"email": "[email protected]",
		"__name": "json",
		"__db": "inventory",
		"__table": "customers",
		"__op": "c",
		"__file": "mysql-bin.000003",
		"__pos": 10524,
		"__row": 0,
		"__ts_ms": 1596448180633,
		"__source_ts_ms": 0,
		"__deleted": "false"
	}
}
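
To see what Debezium actually writes to Kafka (the record above was captured this way), the stock console consumer is enough. A sketch, assuming the standard Apache Kafka CLI scripts and the broker address from the configs above:

# Sketch: dump the raw JSON records from the topic.
bin/kafka-console-consumer.sh \
  --bootstrap-server psd-hadoop039:9092 \
  --topic json.inventory.customers \
  --from-beginning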

Data as it lands in Hive

Table DDL
CREATE EXTERNAL TABLE `json_inventory_customers`(
  `id` int, 
  `first_name` string, 
  `last_name` string, 
  `email` string, 
  `__name` string, 
  `__db` string, 
  `__table` string, 
  `__op` string, 
  `__file` string, 
  `__pos` bigint, 
  `__row` int, 
  `__ts_ms` bigint, 
  `__source_ts_ms` bigint, 
  `__deleted` string)
PARTITIONED BY ( 
  `year` string COMMENT '', 
  `month` string COMMENT '', 
  `day` string COMMENT '', 
  `hour` string COMMENT '')
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  'hdfs://cdhtest/user/dts/topics/json.inventory.customers'
TBLPROPERTIES (
  'transient_lastDdlTime'='1596527679')

Sample row

json_inventory_customers.id	1001
json_inventory_customers.first_name	Sally
json_inventory_customers.last_name	Thomas
json_inventory_customers.email	sally.thomas@acme.com
json_inventory_customers.__name	json
json_inventory_customers.__db	inventory
json_inventory_customers.__table	customers
json_inventory_customers.__op	c
json_inventory_customers.__file	mysql-bin.000003
json_inventory_customers.__pos	10524
json_inventory_customers.__row	0
json_inventory_customers.__ts_ms	1596527669214
json_inventory_customers.__source_ts_ms	0
json_inventory_customers.__deleted	false
json_inventory_customers.year	2020
json_inventory_customers.month	08
json_inventory_customers.day	04
json_inventory_customers.hour	15
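
The row above can be read back with a query along these lines. A sketch only: the HiveServer2 address is an assumption, substitute your own.

# Sketch: query one partition hour of the sink table via beeline.
beeline -u jdbc:hive2://psd-hadoop039:10000/dts -e "
  SELECT id, first_name, email, __op, __deleted
  FROM json_inventory_customers
  WHERE year='2020' AND month='08' AND day='04' AND hour='15';"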

Problem description

The kafka-connect-hdfs sink consumes Debezium snapshot, insert, update and delete events without any problem. However, after adding a new column to the source table, the next insert makes the sink task fail as follows:

[2020-08-04 16:08:29,274] DEBUG Closing WAL (io.confluent.connect.hdfs.wal.FSWAL:154)
[2020-08-04 16:08:29,282] DEBUG WorkerSinkTask{id=json.customers.sink-0} Skipping offset commit, no change since last commit (org.apache.kafka.connect.runtime.WorkerSinkTask:431)
[2020-08-04 16:08:29,283] DEBUG WorkerSinkTask{id=json.customers.sink-0} Finished offset commit successfully in 9 ms for sequence number 84: null (org.apache.kafka.connect.runtime.WorkerSinkTask:268)
[2020-08-04 16:08:29,283] ERROR WorkerSinkTask{id=json.customers.sink-0} Task threw an uncaught and unrecoverable exception (org.apache.kafka.connect.runtime.WorkerTask:186)
org.apache.kafka.connect.errors.ConnectException: Exiting WorkerSinkTask due to unrecoverable exception.
	at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:568)
	at org.apache.kafka.connect.runtime.WorkerSinkTask.poll(WorkerSinkTask.java:326)
	at org.apache.kafka.connect.runtime.WorkerSinkTask.iteration(WorkerSinkTask.java:228)
	at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:196)
	at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:184)
	at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:234)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
	at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:407)
	at io.confluent.connect.hdfs.DataWriter.write(DataWriter.java:386)
	at io.confluent.connect.hdfs.HdfsSinkTask.put(HdfsSinkTask.java:127)
	at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:546)
	... 10 more
Caused by: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
	at io.confluent.connect.storage.schema.StorageSchemaCompatibility.validateAndCheck(StorageSchemaCompatibility.java:157)
	at io.confluent.connect.storage.schema.StorageSchemaCompatibility.shouldChangeSchema(StorageSchemaCompatibility.java:320)
	at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:361)
	... 13 more
[2020-08-04 16:08:29,283] ERROR WorkerSinkTask{id=json.customers.sink-0} Task is being killed and will not recover until manually restarted (org.apache.kafka.connect.runtime.WorkerTask:187)
[2020-08-04 16:08:29,283] INFO Shutting down Hive executor service. (io.confluent.connect.hdfs.DataWriter:484)
[2020-08-04 16:08:29,283] INFO Awaiting termination. (io.confluent.connect.hdfs.DataWriter:489) 

Root cause analysis & troubleshooting

"Schema version required for BACKWARD compatibility" seems obvious enough: BACKWARD mode requires a schema version, so just switch the compatibility mode and be done with it, right? (See the schema.compatibility configuration parameter documentation.)
So I deleted the sink connector, reset Kafka, set schema.compatibility=NONE, and registered a new connector.
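
A rough sketch of that attempt (the REST endpoint and the name of the modified config file are assumptions):

# Sketch: drop the sink, then re-register it with schema.compatibility=NONE
# changed in the JSON body.
curl -s -X DELETE http://localhost:8083/connectors/json.inventory.customers.sink
curl -s -X POST -H "Content-Type: application/json" \
     --data @json.inventory.customers.sink.none.json \
     http://localhost:8083/connectors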
Unfortunately, Hive integration does not allow schema.compatibility to be NONE. It fails with:

[2020-08-04 16:26:15,340] ERROR WorkerSinkTask{id=json.customers.sink-0} Task threw an uncaught and unrecoverable exception (org.apache.kafka.connect.runtime.WorkerTask:186)
org.apache.kafka.connect.errors.ConnectException: Couldn't start HdfsSinkConnector due to configuration error.
	at io.confluent.connect.hdfs.HdfsSinkTask.start(HdfsSinkTask.java:90)
	at org.apache.kafka.connect.runtime.WorkerSinkTask.initializeAndStart(WorkerSinkTask.java:305)
	at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:193)
	at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:184)
	at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:234)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.kafka.common.config.ConfigException: Hive Integration requires schema compatibility to be BACKWARD, FORWARD or FULL
	at io.confluent.connect.hdfs.HdfsSinkTask.start(HdfsSinkTask.java:63)
	... 9 more

FORWARD and FULL fail the same way, and Google turned up nothing. Which raises the real question: where is this schema version supposed to come from? Time to walk through the source code.

The class that throws the exception: io.confluent.connect.storage.schema.StorageSchemaCompatibility#validateAndCheck

protected boolean validateAndCheck(Schema valueSchema, Schema currentSchema) {
    if (currentSchema == null && valueSchema == null) {
        return false;
    } else if (currentSchema == valueSchema) {
        return false;
    } else if (currentSchema != null && valueSchema != null) {
        if ((valueSchema.version() == null || currentSchema.version() == null) && this != NONE) {
            throw new SchemaProjectorException("Schema version required for " + this.toString() + " compatibility");
        } else {
            return this.check(valueSchema, currentSchema);
        }
    } else {
        throw new SchemaProjectorException("Switch between schema-based and schema-less data is not supported");
    }
}

Who calls this method? io.confluent.connect.hdfs.TopicPartitionWriter#write calls compatibility.shouldChangeSchema(record, null, currentSchema), which goes through io.confluent.connect.storage.schema.StorageSchemaCompatibility#shouldChangeSchema into io.confluent.connect.storage.schema.StorageSchemaCompatibility#validateAndCheck.
From the validateAndCheck logic, either valueSchema.version() or currentSchema.version() must be null. In practice it is the valueSchema obtained in io.confluent.connect.hdfs.TopicPartitionWriter#write via Schema valueSchema = record.valueSchema() whose version is null; and indeed, the JSON records shown above carry no version field in their schema.
With the cause identified, the fix is conceptually simple: how do we get a version onto the schema?
A look at the Debezium source shows the schema is built by the connector itself, so patching the connector is not realistic, and a project this mature surely has a better answer. I considered adding the version through an SMT, but found nothing suitable inside Debezium (only later did I realize that Kafka Connect itself ships an SMT that can stamp schema metadata; an untested sketch follows). While digging I also noticed that Avro-serialized records do carry a schema version.
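
For the record, the SMT route would presumably use org.apache.kafka.connect.transforms.SetSchemaMetadata, which can stamp a fixed schema.version onto the value schema. Because that version is a static constant, it is unclear whether it would behave sensibly once the schema actually evolves, and I did not pursue it. The sketch below is untested; the jq merge and the REST endpoint are assumptions.

# Untested sketch: append a SetSchemaMetadata transform to the existing
# source connector so every value schema carries a (static) version.
curl -s http://localhost:8083/connectors/json-inventory-customers/config |
  jq '. + {
        "transforms": "unwrap,setVersion",
        "transforms.setVersion.type": "org.apache.kafka.connect.transforms.SetSchemaMetadata$Value",
        "transforms.setVersion.schema.version": "1"
      }' |
  curl -s -X PUT -H "Content-Type: application/json" --data @- \
       http://localhost:8083/connectors/json-inventory-customers/config

What I actually did was switch Debezium's serialization format to Avro, since records deserialized by the AvroConverter carry the Schema Registry version on their Connect schema. The source connector config follows; the HDFS sink configuration stays unchanged for now.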

{
    "name": "avro-inventory-customers",
    "config": {
        "connector.class": "io.debezium.connector.mysql.MySqlConnector",
        "tasks.max": "1",
        "database.hostname": "psd-hadoop039",
        "database.port": "3306",
        "database.user": "debezium",
        "database.password": "dbz",
        "database.server.name": "avro",
        "database.whitelist": "inventory",
        "database.history.kafka.bootstrap.servers": "psd-hadoop039:9092",
        "database.history.kafka.topic": "dbhistory.avro.inventory",
        "table.whitelist": "inventory.customers",
        "key.converter": "io.confluent.connect.avro.AvroConverter",
        "key.converter.schema.registry.url": "http://psd-hadoop039:8081",
        "value.converter": "io.confluent.connect.avro.AvroConverter",
        "value.converter.schema.registry.url": "http://psd-hadoop039:8081",
        "include.schema.changes": "true",
        "transforms": "unwrap",
        "binary.handling.mode": "hex",
        "time.precision.mode": "connect",
        "transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
        "transforms.unwrap.drop.tombstones": "true",
        "transforms.unwrap.delete.handling.mode": "rewrite",
        "transforms.unwrap.add.headers": "name,db,table,op,db",
        "transforms.unwrap.add.fields": "name,db,table,op,file,pos,row,ts_ms,source.ts_ms"
    }
}

Note: to keep deployment simple I used the Confluent community edition, which is very easy to set up:

Download confluent-5.5.1-2.12.tar.gz and extract it.
Unpack the connectors used here, debezium-debezium-connector-mysql-1.2.0 and confluentinc-kafka-connect-hdfs-5.5.1,
into $confluentinc_home/share/java.
Start: bin/confluent local start
Stop: bin/confluent local stop
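
Concretely, the deployment boils down to something like this (archive names as above; /path/to is a placeholder for wherever the connector archives were unpacked):

# Sketch of the Confluent community deployment.
tar -xzf confluent-5.5.1-2.12.tar.gz
cd confluent-5.5.1
# Put the two connector plugins under share/java.
cp -r /path/to/debezium-debezium-connector-mysql-1.2.0 share/java/
cp -r /path/to/confluentinc-kafka-connect-hdfs-5.5.1 share/java/
# Start / stop the whole local stack (ZooKeeper, Kafka, Schema Registry, Connect, ...).
bin/confluent local start
bin/confluent local stop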

With this setup, the HDFS sink no longer errors out when columns are added, modified or dropped. But querying the data through Hive now fails with:

Bad status for request TFetchResultsReq(fetchType=0, operationHandle=TOperationHandle(hasResultSet=True, modifiedRowCount=None, operationType=0, operationId=THandleIdentifier(secret='\xea\x99B\x16)\x94I\xdb\xa4\xd1\x8d\xb1_\xf8\\\xff', guid='\x11z\xff\x04\xb1\xdbM\x15\x8f;G d\xe2G}')), orientation=4, maxRows=100): TFetchResultsResp(status=TStatus(errorCode=0, errorMessage='java.io.IOException: java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable', sqlState=None, infoMessages=['*org.apache.hive.service.cli.HiveSQLException:java.io.IOException: java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable:14:13', 'org.apache.hive.service.cli.operation.SQLOperation:getNextRowSet:SQLOperation.java:375', 'org.apache.hive.service.cli.operation.OperationManager:getOperationNextRowSet:OperationManager.java:287', 'org.apache.hive.service.cli.session.HiveSessionImpl:fetchResults:HiveSessionImpl.java:752', 'org.apache.hive.service.cli.CLIService:fetchResults:CLIService.java:438', 'org.apache.hive.service.cli.thrift.ThriftCLIService:FetchResults:ThriftCLIService.java:689', 'org.apache.hive.service.cli.thrift.TCLIService$Processor$FetchResults:getResult:TCLIService.java:1553', 'org.apache.hive.service.cli.thrift.TCLIService$Processor$FetchResults:getResult:TCLIService.java:1538', 'org.apache.thrift.ProcessFunction:process:ProcessFunction.java:39', 'org.apache.thrift.TBaseProcessor:process:TBaseProcessor.java:39', 'org.apache.hive.service.auth.TSetIpAddressProcessor:process:TSetIpAddressProcessor.java:56', 'org.apache.thrift.server.TThreadPoolServer$WorkerProcess:run:TThreadPoolServer.java:286', 'java.util.concurrent.ThreadPoolExecutor:runWorker:ThreadPoolExecutor.java:1149', 'java.util.concurrent.ThreadPoolExecutor$Worker:run:ThreadPoolExecutor.java:624', 'java.lang.Thread:run:Thread.java:748', '*java.io.IOException:java.lang.ClassCastException: org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable:18:4', 'org.apache.hadoop.hive.ql.exec.FetchOperator:getNextRow:FetchOperator.java:508', 'org.apache.hadoop.hive.ql.exec.FetchOperator:pushRow:FetchOperator.java:415', 'org.apache.hadoop.hive.ql.exec.FetchTask:fetch:FetchTask.java:138', 'org.apache.hadoop.hive.ql.Driver:getResults:Driver.java:1797', 'org.apache.hive.service.cli.operation.SQLOperation:getNextRowSet:SQLOperation.java:370', '*java.lang.ClassCastException:org.apache.hadoop.io.Text cannot be cast to org.apache.hadoop.io.IntWritable:22:4', 'org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector:get:WritableIntObjectInspector.java:36', 'org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils:getString:PrimitiveObjectInspectorUtils.java:810', 'org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorConverter$StringConverter:convert:PrimitiveObjectInspectorConverter.java:453', 'org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters$StructConverter:convert:ObjectInspectorConverters.java:396', 'org.apache.hadoop.hive.ql.exec.FetchOperator:getNextRow:FetchOperator.java:491'], statusCode=3), results=None, hasMoreRows=None)

Some Googling suggests the cause is how ParquetFormat records schema information in the file header. One hacky workaround is to have Impala resolve columns by name with SET PARQUET_FALLBACK_SCHEMA_RESOLUTION=name; that makes the query succeed, but it is not acceptable for production. The other solution is to switch the sink to "format.class": "io.confluent.connect.hdfs.avro.AvroFormat".
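
The Impala workaround, for reference, looks roughly like this (the impalad host is a placeholder; PARQUET_FALLBACK_SCHEMA_RESOLUTION=name makes Impala resolve Parquet columns by name instead of by position, and only for the session):

# Sketch: session-level Impala workaround, not a production fix.
impala-shell -i <impalad-host> -q "
  SET PARQUET_FALLBACK_SCHEMA_RESOLUTION=name;
  SELECT id, first_name, email FROM dts.avro_inventory_customers LIMIT 10;"

With AvroFormat the problem goes away for good. The final HDFS sink configuration is below; with it, adding, modifying and dropping columns all flow through without errors: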

{
    "name": "avro.inventory.customers.sink",
    "config": {
        "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
        "format.class": "io.confluent.connect.hdfs.avro.AvroFormat",
        "key.converter": "io.confluent.connect.avro.AvroConverter",
        "key.converter.schema.registry.url": "http://psd-hadoop039:8081",
        "value.converter": "io.confluent.connect.avro.AvroConverter",
        "value.converter.schema.registry.url": "http://psd-hadoop039:8081",
        "key.converter.schemas.enable": "true",
        "value.converter.schemas.enable": "true",
        "tasks.max": "1",
        "topics": "avro.inventory.customers",
        "hadoop.conf.dir": "/etc/hadoop/conf",
        "store.url": "hdfs://cdhtest",
        "logs.dir": "/user/dts/logs",
        "topics.dir": "/user/dts/topics",
        "flush.size": "2",
        "rotate.interval.ms": "10000",
        "hive.integration": true,
        "hive.database": "dts",
        "hive.metastore.uris": "thrift://cdh-10-21-17-95:9083",
        "partitioner.class": "io.confluent.connect.hdfs.partitioner.HourlyPartitioner",
        "locale": "zh",
        "timezone": "Asia/Shanghai",
        "path.format": "YYYYMMddHH/",
        "schema.compatibility": "BACKWARD"
    }
}
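
To reproduce the column-change test end to end, something along these lines against the source database should do. A sketch only: the new column name, the inserted values and the MySQL credentials are placeholders, and id is assumed to be auto-increment as in the Debezium example database.

# Sketch: add a column and insert a row, then check Kafka and the Hive table.
mysql -h psd-hadoop039 -u debezium -p inventory -e "
  ALTER TABLE customers ADD COLUMN phone VARCHAR(32);
  INSERT INTO customers (first_name, last_name, email, phone)
  VALUES ('New', 'User', 'new.user@example.com', '555-0100');"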

Summary

  • I was not familiar enough with the whole pipeline while troubleshooting, and there is very little material about this combination online, so parts of the analysis above are not fully rigorous.

Follow-up

  • Look into a proper way to debug Kafka Connect plugins.
