十五(1)、streamsets

1、从mysql提取数据到hbase,报错:

com.streamsets.pipeline.api.base.OnRecordErrorException: HBASE_27 - Missing row key field '/rk0001' in record
	at com.streamsets.pipeline.hbase.api.impl.AbstractHBaseProducer.getBytesForRowKey(AbstractHBaseProducer.java:121)
	at com.streamsets.pipeline.hbase.impl.HBaseProducer0_98.doPut(HBaseProducer0_98.java:158)
	at com.streamsets.pipeline.hbase.impl.HBaseProducer0_98.writeRecordsInHBase(HBaseProducer0_98.java:77)
	at com.streamsets.pipeline.stage.destination.hbase.HBaseTarget.writeBatch(HBaseTarget.java:284)
	at com.streamsets.pipeline.stage.destination.hbase.HBaseTarget.lambda$write$1(HBaseTarget.java:265)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:422)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1866)
	at com.streamsets.pipeline.stage.destination.hbase.HBaseTarget.write(HBaseTarget.java:264)
	at com.streamsets.pipeline.api.base.configurablestage.DTarget.write(DTarget.java:34)
	at com.streamsets.datacollector.runner.StageRuntime.lambda$execute$2(StageRuntime.java:290)
	at com.streamsets.datacollector.runner.StageRuntime.execute(StageRuntime.java:235)
	at com.streamsets.datacollector.runner.StageRuntime.execute(StageRuntime.java:298)
	at com.streamsets.datacollector.runner.StagePipe.process(StagePipe.java:219)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.lambda$runSourceLessBatch$0(PreviewPipelineRunner.java:348)
	at com.streamsets.datacollector.runner.PipeRunner.acceptConsumer(PipeRunner.java:221)
	at com.streamsets.datacollector.runner.PipeRunner.executeBatch(PipeRunner.java:142)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.runSourceLessBatch(PreviewPipelineRunner.java:344)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.runPollSource(PreviewPipelineRunner.java:326)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.run(PreviewPipelineRunner.java:218)
	at com.streamsets.datacollector.runner.Pipeline.run(Pipeline.java:537)
	at com.streamsets.datacollector.runner.preview.PreviewPipeline.run(PreviewPipeline.java:51)
	at com.streamsets.datacollector.execution.preview.sync.SyncPreviewer.start(SyncPreviewer.java:233)
	at com.streamsets.datacollector.execution.preview.async.AsyncPreviewer.lambda$start$0(AsyncPreviewer.java:101)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.lambda$call$0(SafeScheduledExecutorService.java:226)
	at com.streamsets.datacollector.security.GroupsInScope.execute(GroupsInScope.java:33)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.call(SafeScheduledExecutorService.java:222)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.lambda$call$0(SafeScheduledExecutorService.java:226)
	at com.streamsets.datacollector.security.GroupsInScope.execute(GroupsInScope.java:33)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.call(SafeScheduledExecutorService.java:222)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
	at com.streamsets.datacollector.metrics.MetricSafeScheduledExecutorService$MetricsTask.run(MetricSafeScheduledExecutorService.java:100)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

原因:JDBC 起点(origin)读出的记录是 list-map 结构,字段路径需要用位置下标而不是列名路径。将 row key 的字段路径写成 [0](即记录的第一个字段),就正常了。

2、streamsets里面用json parser的时候报错:

com.streamsets.pipeline.api.base.OnRecordErrorException: JSONP_00 - JSON field 'streamsetsDruid::0::0::0' does not exist in record '/ddd/two'. Cannot parse the field.
	at com.streamsets.pipeline.stage.processor.jsonparser.JsonParserProcessor.process(JsonParserProcessor.java:46)
	at com.streamsets.pipeline.api.base.SingleLaneRecordProcessor.process(SingleLaneRecordProcessor.java:53)
	at com.streamsets.pipeline.api.base.SingleLaneProcessor.process(SingleLaneProcessor.java:95)
	at com.streamsets.pipeline.api.base.configurablestage.DProcessor.process(DProcessor.java:35)
	at com.streamsets.datacollector.runner.StageRuntime.lambda$execute$2(StageRuntime.java:286)
	at com.streamsets.datacollector.runner.StageRuntime.execute(StageRuntime.java:235)
	at com.streamsets.datacollector.runner.StageRuntime.execute(StageRuntime.java:298)
	at com.streamsets.datacollector.runner.StagePipe.process(StagePipe.java:219)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.lambda$runSourceLessBatch$0(PreviewPipelineRunner.java:348)
	at com.streamsets.datacollector.runner.PipeRunner.acceptConsumer(PipeRunner.java:221)
	at com.streamsets.datacollector.runner.PipeRunner.executeBatch(PipeRunner.java:142)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.runSourceLessBatch(PreviewPipelineRunner.java:344)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.runPollSource(PreviewPipelineRunner.java:326)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.run(PreviewPipelineRunner.java:218)
	at com.streamsets.datacollector.runner.Pipeline.run(Pipeline.java:537)
	at com.streamsets.datacollector.runner.preview.PreviewPipeline.run(PreviewPipeline.java:51)
	at com.streamsets.datacollector.execution.preview.sync.SyncPreviewer.start(SyncPreviewer.java:233)
	at com.streamsets.datacollector.execution.preview.async.AsyncPreviewer.lambda$start$0(AsyncPreviewer.java:101)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.lambda$call$0(SafeScheduledExecutorService.java:226)
	at com.streamsets.datacollector.security.GroupsInScope.execute(GroupsInScope.java:33)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.call(SafeScheduledExecutorService.java:222)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.lambda$call$0(SafeScheduledExecutorService.java:226)
	at com.streamsets.datacollector.security.GroupsInScope.execute(GroupsInScope.java:33)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.call(SafeScheduledExecutorService.java:222)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
	at com.streamsets.datacollector.metrics.MetricSafeScheduledExecutorService$MetricsTask.run(MetricSafeScheduledExecutorService.java:100)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

其中 streamsetsDruid 是前面 origin(Kafka Consumer)所消费的 Kafka topic 名称。

最后找到原因:json parser 配置的 field path 不对——前面使用 log parser 时,新字段的路径被自动加上了斜杠,导致 json parser 在这里按原路径找不到字段而识别错误。由于报错信息没有体现出真实的路径差异,所以很难发现。

3、streamsets里面origin:kafka consumer,data format里面有一个设置参数Max Record Length (chars),默认是1024,设置小了的话,会报下面的错误:

com.streamsets.pipeline.api.base.OnRecordErrorException: KAFKA_37 - Cannot parse record from message 'streamsetsDruid::0::107': java.io.IOException: Mark invalid
	at com.streamsets.pipeline.stage.origin.kafka.BaseKafkaSource.processKafkaMessageDefault(BaseKafkaSource.java:265)
	at com.streamsets.pipeline.stage.origin.kafka.BaseKafkaSource.processKafkaMessageDefault(BaseKafkaSource.java:224)
	at com.streamsets.pipeline.stage.origin.kafka.StandaloneKafkaSource.produce(StandaloneKafkaSource.java:86)
	at com.streamsets.pipeline.api.base.configurablestage.DSource.produce(DSource.java:38)
	at com.streamsets.datacollector.runner.StageRuntime.lambda$execute$2(StageRuntime.java:283)
	at com.streamsets.datacollector.runner.StageRuntime.execute(StageRuntime.java:235)
	at com.streamsets.datacollector.runner.StageRuntime.execute(StageRuntime.java:298)
	at com.streamsets.datacollector.runner.StagePipe.process(StagePipe.java:219)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.runPollSource(PreviewPipelineRunner.java:321)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.run(PreviewPipelineRunner.java:218)
	at com.streamsets.datacollector.runner.Pipeline.run(Pipeline.java:537)
	at com.streamsets.datacollector.runner.preview.PreviewPipeline.run(PreviewPipeline.java:51)
	at com.streamsets.datacollector.execution.preview.sync.SyncPreviewer.start(SyncPreviewer.java:233)
	at com.streamsets.datacollector.execution.preview.async.AsyncPreviewer.lambda$start$0(AsyncPreviewer.java:101)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.lambda$call$0(SafeScheduledExecutorService.java:226)
	at com.streamsets.datacollector.security.GroupsInScope.execute(GroupsInScope.java:33)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.call(SafeScheduledExecutorService.java:222)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.lambda$call$0(SafeScheduledExecutorService.java:226)
	at com.streamsets.datacollector.security.GroupsInScope.execute(GroupsInScope.java:33)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.call(SafeScheduledExecutorService.java:222)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
	at com.streamsets.datacollector.metrics.MetricSafeScheduledExecutorService$MetricsTask.run(MetricSafeScheduledExecutorService.java:100)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: Mark invalid
	at com.streamsets.pipeline.lib.io.AbstractOverrunDelimitedReader.reset(AbstractOverrunDelimitedReader.java:80)
	at com.streamsets.pipeline.lib.io.OverrunLineReader.reset(OverrunLineReader.java:53)
	at com.streamsets.pipeline.lib.csv.CsvMultiCharDelimitedParser.read(CsvMultiCharDelimitedParser.java:212)
	at com.streamsets.pipeline.lib.parser.delimited.DelimitedCharDataParser.parse(DelimitedCharDataParser.java:118)
	at com.streamsets.pipeline.lib.parser.WrapperDataParserFactory$WrapperDataParser.lambda$parse$0(WrapperDataParserFactory.java:105)
	at com.streamsets.pipeline.api.impl.CreateByRef.call(CreateByRef.java:40)
	at com.streamsets.pipeline.lib.parser.WrapperDataParserFactory$WrapperDataParser.parse(WrapperDataParserFactory.java:105)
	at com.streamsets.pipeline.stage.origin.kafka.BaseKafkaSource.processKafkaMessageDefault(BaseKafkaSource.java:244)
	... 26 more

4、streamsets里面origin:kafka consumer,传入kafka的数据,里面有双引号:"
但是配置data format里面的Quote Character就是双引号",所以包含双引号的数据被认为是错误,报错:

com.streamsets.pipeline.api.base.OnRecordErrorException: KAFKA_37 - Cannot parse record from message 'streamsetsDruid::0::109': com.streamsets.pipeline.lib.parser.DataParserException: DATA_PARSER_02 - Parser error: 'org.jparsec.error.ParserException: line 1, column 166:
EOF expected, " encountered.'
	at com.streamsets.pipeline.stage.origin.kafka.BaseKafkaSource.processKafkaMessageDefault(BaseKafkaSource.java:265)
	at com.streamsets.pipeline.stage.origin.kafka.BaseKafkaSource.processKafkaMessageDefault(BaseKafkaSource.java:224)
	at com.streamsets.pipeline.stage.origin.kafka.StandaloneKafkaSource.produce(StandaloneKafkaSource.java:86)
	at com.streamsets.pipeline.api.base.configurablestage.DSource.produce(DSource.java:38)
	at com.streamsets.datacollector.runner.StageRuntime.lambda$execute$2(StageRuntime.java:283)
	at com.streamsets.datacollector.runner.StageRuntime.execute(StageRuntime.java:235)
	at com.streamsets.datacollector.runner.StageRuntime.execute(StageRuntime.java:298)
	at com.streamsets.datacollector.runner.StagePipe.process(StagePipe.java:219)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.runPollSource(PreviewPipelineRunner.java:321)
	at com.streamsets.datacollector.runner.preview.PreviewPipelineRunner.run(PreviewPipelineRunner.java:218)
	at com.streamsets.datacollector.runner.Pipeline.run(Pipeline.java:537)
	at com.streamsets.datacollector.runner.preview.PreviewPipeline.run(PreviewPipeline.java:51)
	at com.streamsets.datacollector.execution.preview.sync.SyncPreviewer.start(SyncPreviewer.java:233)
	at com.streamsets.datacollector.execution.preview.async.AsyncPreviewer.lambda$start$0(AsyncPreviewer.java:101)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.lambda$call$0(SafeScheduledExecutorService.java:226)
	at com.streamsets.datacollector.security.GroupsInScope.execute(GroupsInScope.java:33)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.call(SafeScheduledExecutorService.java:222)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.lambda$call$0(SafeScheduledExecutorService.java:226)
	at com.streamsets.datacollector.security.GroupsInScope.execute(GroupsInScope.java:33)
	at com.streamsets.pipeline.lib.executor.SafeScheduledExecutorService$SafeCallable.call(SafeScheduledExecutorService.java:222)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
	at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
	at com.streamsets.datacollector.metrics.MetricSafeScheduledExecutorService$MetricsTask.run(MetricSafeScheduledExecutorService.java:100)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: com.streamsets.pipeline.lib.parser.DataParserException: DATA_PARSER_02 - Parser error: 'org.jparsec.error.ParserException: line 1, column 166:
EOF expected, " encountered.'
	at com.streamsets.pipeline.lib.parser.WrapperDataParserFactory$WrapperDataParser.normalizeException(WrapperDataParserFactory.java:147)
	at com.streamsets.pipeline.lib.parser.WrapperDataParserFactory$WrapperDataParser.parse(WrapperDataParserFactory.java:107)
	at com.streamsets.pipeline.stage.origin.kafka.BaseKafkaSource.processKafkaMessageDefault(BaseKafkaSource.java:244)
	... 26 more
Caused by: org.jparsec.error.ParserException: line 1, column 166:
EOF expected, " encountered.
	at org.jparsec.ScannerState.run(ScannerState.java:80)
	at org.jparsec.Parser$Mode$1.run(Parser.java:859)
	at org.jparsec.Parser.parse(Parser.java:830)
	at org.jparsec.Parser.parse(Parser.java:807)
	at com.streamsets.pipeline.lib.csv.CsvMultiCharDelimitedParser.read(CsvMultiCharDelimitedParser.java:203)
	at com.streamsets.pipeline.lib.parser.delimited.DelimitedCharDataParser.parse(DelimitedCharDataParser.java:118)
	at com.streamsets.pipeline.lib.parser.WrapperDataParserFactory$WrapperDataParser.lambda$parse$0(WrapperDataParserFactory.java:105)
	at com.streamsets.pipeline.api.impl.CreateByRef.call(CreateByRef.java:40)
	at com.streamsets.pipeline.lib.parser.WrapperDataParserFactory$WrapperDataParser.parse(WrapperDataParserFactory.java:105)
	... 27 more

解决方法:当传入 Kafka 的数据本身包含双引号时,在 streamsets 的 origin(kafka consumer)的 data format 里,把 Quote Character 设置成一个不会出现在数据中的其他符号,之后数据即可正常解析。

你可能感兴趣的:(CDH)