datax的mysql 到hdfs文件系统 --- 支持高可用

使用DataX将MySQL中的数据同步到HDFS文件系统上, 并支持HDFS高可用(HA)集群。

  • mysqlreaderTohdfswriter.json
[root@slave1 mytemplate]# python /opt/datax/bin/datax.py -r mysqlreader -w hdfswriter >> mysqlreaderTohdfswriter.json
[root@slave1 mytemplate]# cat mysqlreaderTohdfswriter.json
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader", # plugin/reader下需要存在模板!
                    "parameter": {
                        "column": [],
                        "connection": [
                            {
                                "jdbcUrl": [],
                                "table": []
                            }
                        ],
                        "password": "",
                        "username": "",
                        "where": ""
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "column": [],
                        "compress": "",
                        "defaultFS": "",
                        "fieldDelimiter": "",
                        "fileName": "",
                        "fileType": "",
                        "path": "",
                        "writeMode": ""
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": ""
            }
        }
    }
}
---
# ha 
[root@slave1 mytemplate]# cat mysqlreaderTohdfswriter.json
{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "column": ["stu_id","stu_age","stu_name"],
                        "connection": [
                            {
                                "jdbcUrl": ["jdbc:mysql://slave1:3306/javaAndBigdata"],
                                "table": ["student"]
                            }
                        ],
                        "password": "javaAndBigdata",
                        "username": "root",
                        "where": "stu_id != 3"
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "column": [{"name":"stu_id","type":"int"},{"name":"stu_age","type":"int"},{"name":"stu_name","type":"string"}], # hdfswriter无法得知mysql列的数据类型, 必须在这里显式指定每一列的name和type!!
                        "hadoopConfig":{ # 需与core-site.xml、hdfs-site.xml文件中的配置保持一致! 用于支持高可用!
              "dfs.nameservices": "mycluster",
              "dfs.ha.namenodes.mycluster": "nn1,nn2",
              "dfs.namenode.rpc-address.mycluster.nn1": "leader:8020", # 与hdfs-site.xml文件一致;我的不是9000!
              "dfs.namenode.rpc-address.mycluster.nn2": "slave1:8020",
              "dfs.client.failover.proxy.provider.mycluster": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"
            },
                        "defaultFS": "hdfs://mycluster",
                        "fieldDelimiter": "\t",
                        "fileName": "firstStudent.csv",
                        "fileType": "text", #只支持两个格式:ORC和TEXT
                        "path": "/datax/datas/", # 需要先创建好对应的目录!
                        "writeMode": "append" 
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": "2"
            }
        }
    }
}
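# 疑问: 为什么channel没有发生作用呢?
# 原因: 从下面的日志可以看到, mysqlreader只切分出了1个task("splits to [1] tasks"), 因此实际只启动了1个channel("start [1] channels for [1] tasks"); 单表且未配置splitPk时不会进行数据分片, channel设置为2也不会并发。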
  • 执行脚本:mysqlreaderTohdfswriter.json
[root@slave1 mytemplate]# python /opt/datax/bin/datax.py /opt/datax/mytemplate/mysqlreaderTohdfswriter.json

DataX (DATAX-OPENSOURCE-3.0), From Alibaba !
Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved.


2022-03-24 22:06:51.724 [main] INFO  VMInfo - VMInfo# operatingSystem class => sun.management.OperatingSystemImpl
2022-03-24 22:06:51.732 [main] INFO  Engine - the machine info  =>

        osInfo: Oracle Corporation 1.8 25.171-b11
        jvmInfo:        Linux amd64 3.10.0-1127.el7.x86_64
        cpu num:        2

        totalPhysicalMemory:    -0.00G
        freePhysicalMemory:     -0.00G
        maxFileDescriptorCount: -1
        currentOpenFileDescriptorCount: -1

        GC Names        [PS MarkSweep, PS Scavenge]

        MEMORY_NAME                    | allocation_size                | init_size
        PS Eden Space                  | 256.00MB                       | 256.00MB
        Code Cache                     | 240.00MB                       | 2.44MB
        Compressed Class Space         | 1,024.00MB                     | 0.00MB
        PS Survivor Space              | 42.50MB                        | 42.50MB
        PS Old Gen                     | 683.00MB                       | 683.00MB
        Metaspace                      | -0.00MB                        | 0.00MB


2022-03-24 22:06:51.753 [main] INFO  Engine -
{
        "content":[
                {
                        "reader":{
                                "name":"mysqlreader",
                                "parameter":{
                                        "column":[
                                                "stu_id",
                                                "stu_age",
                                                "stu_name"
                                        ],
                                        "connection":[
                                                {
                                                        "jdbcUrl":[
                                                                "jdbc:mysql://slave1:3306/javaAndBigdata"
                                                        ],
                                                        "table":[
                                                                "student"
                                                        ]
                                                }
                                        ],
                                        "password":"********",
                                        "username":"root",
                                        "where":"stu_id != 3"
                                }
                        },
                        "writer":{
                                "name":"hdfswriter",
                                "parameter":{
                                        "column":[
                                                {
                                                        "name":"stu_id",
                                                        "type":"int"
                                                },
                                                {
                                                        "name":"stu_age",
                                                        "type":"int"
                                                },
                                                {
                                                        "name":"stu_name",
                                                        "type":"string"
                                                }
                                        ],
                                        "defaultFS":"hdfs://mycluster",
                                        "fieldDelimiter":"\t",
                                        "fileName":"firstStudent.csv",
                                        "fileType":"text",
                                        "hadoopConfig":{
                                                "dfs.client.failover.proxy.provider.mycluster":"org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
                                                "dfs.ha.namenodes.mycluster":"nn1,nn2",
                                                "dfs.namenode.rpc-address.mycluster.nn1":"leader:8020",
                                                "dfs.namenode.rpc-address.mycluster.nn2":"slave1:8020",
                                                "dfs.nameservices":"mycluster"
                                        },
                                        "path":"/datax/datas/",
                                        "writeMode":"append"
                                }
                        }
                }
        ],
        "setting":{
                "speed":{
                        "channel":"2"
                }
        }
}

2022-03-24 22:06:51.770 [main] WARN  Engine - prioriy set to 0, because NumberFormatException, the value is: null
2022-03-24 22:06:51.772 [main] INFO  PerfTrace - PerfTrace traceId=job_-1, isEnable=false, priority=0
2022-03-24 22:06:51.772 [main] INFO  JobContainer - DataX jobContainer starts job.
2022-03-24 22:06:51.774 [main] INFO  JobContainer - Set jobId = 0
2022-03-24 22:06:52.096 [job-0] INFO  OriginalConfPretreatmentUtil - Available jdbcUrl:jdbc:mysql://slave1:3306/javaAndBigdata?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true.
2022-03-24 22:06:52.107 [job-0] INFO  OriginalConfPretreatmentUtil - table:[student] has columns:[stu_id,stu_age,stu_name].
三月 24, 2022 10:06:52 下午 org.apache.hadoop.util.NativeCodeLoader <clinit>
警告: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-03-24 22:06:53.058 [job-0] INFO  JobContainer - jobContainer starts to do prepare ...
2022-03-24 22:06:53.059 [job-0] INFO  JobContainer - DataX Reader.Job [mysqlreader] do prepare work .
2022-03-24 22:06:53.060 [job-0] INFO  JobContainer - DataX Writer.Job [hdfswriter] do prepare work .
2022-03-24 22:06:53.144 [job-0] INFO  HdfsWriter$Job - 由于您配置了writeMode append, 写入前不做清理工作, [/datax/datas/] 目录下写入相应文件名前缀  [firstStudent.csv] 的文件
2022-03-24 22:06:53.144 [job-0] INFO  JobContainer - jobContainer starts to do split ...
2022-03-24 22:06:53.144 [job-0] INFO  JobContainer - Job set Channel-Number to 2 channels.
2022-03-24 22:06:53.149 [job-0] INFO  JobContainer - DataX Reader.Job [mysqlreader] splits to [1] tasks.
2022-03-24 22:06:53.150 [job-0] INFO  HdfsWriter$Job - begin do split...
2022-03-24 22:06:53.153 [job-0] INFO  HdfsWriter$Job - splited write file name:[hdfs://mycluster/datax/datas__ba33f0a6_0193_4a0f_bfb3_6c604eac5944/firstStudent.csv__f902188b_ebd3_40da_8f53_e7a425d71dc5]
2022-03-24 22:06:53.154 [job-0] INFO  HdfsWriter$Job - end do split.
2022-03-24 22:06:53.154 [job-0] INFO  JobContainer - DataX Writer.Job [hdfswriter] splits to [1] tasks.
2022-03-24 22:06:53.168 [job-0] INFO  JobContainer - jobContainer starts to do schedule ...
2022-03-24 22:06:53.171 [job-0] INFO  JobContainer - Scheduler starts [1] taskGroups.
2022-03-24 22:06:53.173 [job-0] INFO  JobContainer - Running by standalone Mode.
2022-03-24 22:06:53.183 [taskGroup-0] INFO  TaskGroupContainer - taskGroupId=[0] start [1] channels for [1] tasks.
2022-03-24 22:06:53.187 [taskGroup-0] INFO  Channel - Channel set byte_speed_limit to -1, No bps activated.
2022-03-24 22:06:53.187 [taskGroup-0] INFO  Channel - Channel set record_speed_limit to -1, No tps activated.
2022-03-24 22:06:53.231 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] taskId[0] attemptCount[1] is started
2022-03-24 22:06:53.234 [0-0-0-reader] INFO  CommonRdbmsReader$Task - Begin to read record by Sql: [select stu_id,stu_age,stu_name from student where (stu_id != 3)
] jdbcUrl:[jdbc:mysql://slave1:3306/javaAndBigdata?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true].
2022-03-24 22:06:53.249 [0-0-0-writer] INFO  HdfsWriter$Task - begin do write...
2022-03-24 22:06:53.249 [0-0-0-writer] INFO  HdfsWriter$Task - write to file : [hdfs://mycluster/datax/datas__ba33f0a6_0193_4a0f_bfb3_6c604eac5944/firstStudent.csv__f902188b_ebd3_40da_8f53_e7a425d71dc5]
2022-03-24 22:06:53.272 [0-0-0-reader] INFO  CommonRdbmsReader$Task - Finished read record by Sql: [select stu_id,stu_age,stu_name from student where (stu_id != 3)
] jdbcUrl:[jdbc:mysql://slave1:3306/javaAndBigdata?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true].
2022-03-24 22:06:53.437 [0-0-0-writer] INFO  HdfsWriter$Task - end do write
2022-03-24 22:06:53.534 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] taskId[0] is successed, used[342]ms
2022-03-24 22:06:53.534 [taskGroup-0] INFO  TaskGroupContainer - taskGroup[0] completed it's tasks.
2022-03-24 22:07:03.199 [job-0] INFO  StandAloneJobContainerCommunicator - Total 9 records, 38 bytes | Speed 3B/s, 0 records/s | Error 0 records, 0 bytes |  All Task WaitWriterTime 0.000s |  All Task WaitReaderTime 0.000s | Percentage 100.00%
2022-03-24 22:07:03.199 [job-0] INFO  AbstractScheduler - Scheduler accomplished all tasks.
2022-03-24 22:07:03.200 [job-0] INFO  JobContainer - DataX Writer.Job [hdfswriter] do post work.
2022-03-24 22:07:03.200 [job-0] INFO  HdfsWriter$Job - start rename file [hdfs://mycluster/datax/datas__ba33f0a6_0193_4a0f_bfb3_6c604eac5944/firstStudent.csv__f902188b_ebd3_40da_8f53_e7a425d71dc5] to file [hdfs://mycluster/datax/datas/firstStudent.csv__f902188b_ebd3_40da_8f53_e7a425d71dc5].
2022-03-24 22:07:03.219 [job-0] INFO  HdfsWriter$Job - finish rename file [hdfs://mycluster/datax/datas__ba33f0a6_0193_4a0f_bfb3_6c604eac5944/firstStudent.csv__f902188b_ebd3_40da_8f53_e7a425d71dc5] to file [hdfs://mycluster/datax/datas/firstStudent.csv__f902188b_ebd3_40da_8f53_e7a425d71dc5].
2022-03-24 22:07:03.220 [job-0] INFO  HdfsWriter$Job - start delete tmp dir [hdfs://mycluster/datax/datas__ba33f0a6_0193_4a0f_bfb3_6c604eac5944] .
2022-03-24 22:07:03.229 [job-0] INFO  HdfsWriter$Job - finish delete tmp dir [hdfs://mycluster/datax/datas__ba33f0a6_0193_4a0f_bfb3_6c604eac5944] .
2022-03-24 22:07:03.229 [job-0] INFO  JobContainer - DataX Reader.Job [mysqlreader] do post work.
2022-03-24 22:07:03.229 [job-0] INFO  JobContainer - DataX jobId [0] completed successfully.
2022-03-24 22:07:03.230 [job-0] INFO  HookInvoker - No hook invoked, because base dir not exists or is a file: /opt/datax/hook
2022-03-24 22:07:03.334 [job-0] INFO  JobContainer -
         [total cpu info] =>
                averageCpu                     | maxDeltaCpu                    | minDeltaCpu
                -1.00%                         | -1.00%                         | -1.00%


         [total gc info] =>
                 NAME                 | totalGCCount       | maxDeltaGCCount    | minDeltaGCCount    | totalGCTime        | maxDeltaGCTime     | minDeltaGCTime
                 PS MarkSweep         | 1                  | 1                  | 1                  | 0.032s             | 0.032s             | 0.032s
                 PS Scavenge          | 1                  | 1                  | 1                  | 0.016s             | 0.016s             | 0.016s

2022-03-24 22:07:03.335 [job-0] INFO  JobContainer - PerfTrace not enable!
2022-03-24 22:07:03.335 [job-0] INFO  StandAloneJobContainerCommunicator - Total 9 records, 38 bytes | Speed 3B/s, 0 records/s | Error 0 records, 0 bytes |  All Task WaitWriterTime 0.000s |  All Task WaitReaderTime 0.000s | Percentage 100.00%
2022-03-24 22:07:03.336 [job-0] INFO  JobContainer -
任务启动时刻                    : 2022-03-24 22:06:51
任务结束时刻                    : 2022-03-24 22:07:03
任务总计耗时                    :                 11s
任务平均流量                    :                3B/s
记录写入速度                    :              0rec/s
读出记录总数                    :                   9
读写失败总数                    :                   0
  • mysql到hdfs的错误(坑):
# error01:
经DataX智能分析,该任务最可能的错误原因是:
# fileType只支持两种格式: ORC和TEXT!!!
com.alibaba.datax.common.exception.DataXException: Code:[HdfsWriter-02], Description:[您填写的参数值不合法.]. - HdfsWriter插件目前只支持ORC和TEXT两种格式的文件,请将filetype选项的值配置为ORC或者TEXT
        at com.alibaba.datax.common.exception.DataXException.asDataXException(DataXException.java:26)
        at com.alibaba.datax.plugin.writer.hdfswriter.HdfsWriter$Job.validateParameter(HdfsWriter.java:56)
        at com.alibaba.datax.plugin.writer.hdfswriter.HdfsWriter$Job.init(HdfsWriter.java:42)
        at com.alibaba.datax.core.job.JobContainer.initJobWriter(JobContainer.java:704)
        at com.alibaba.datax.core.job.JobContainer.init(JobContainer.java:304)
        at com.alibaba.datax.core.job.JobContainer.start(JobContainer.java:113)
        at com.alibaba.datax.core.Engine.start(Engine.java:92)
        at com.alibaba.datax.core.Engine.entry(Engine.java:171)
        at com.alibaba.datax.core.Engine.main(Engine.java:204)

# error02:
# hdfs文件系统上没有对应的路径! 需要先创建好目录, 例如: hdfs dfs -mkdir -p /datax/datas
经DataX智能分析,该任务最可能的错误原因是:
com.alibaba.datax.common.exception.DataXException: Code:[HdfsWriter-02], Description:[您填写的参数值不合法.]. - 您配置的path: [/datax/datas/] 不存在, 请先在hive端创建对应的数据库和表.
        at com.alibaba.datax.common.exception.DataXException.asDataXException(DataXException.java:26)
        at com.alibaba.datax.plugin.writer.hdfswriter.HdfsWriter$Job.prepare(HdfsWriter.java:184)
        at com.alibaba.datax.core.job.JobContainer.prepareJobWriter(JobContainer.java:724)
        at com.alibaba.datax.core.job.JobContainer.prepare(JobContainer.java:309)
        at com.alibaba.datax.core.job.JobContainer.start(JobContainer.java:115)
        at com.alibaba.datax.core.Engine.start(Engine.java:92)
        at com.alibaba.datax.core.Engine.entry(Engine.java:171)
        at com.alibaba.datax.core.Engine.main(Engine.java:204)

# error03:
# where条件中的字段名写错: student表中不存在id列, 应该写成stu_id
Exception in thread "taskGroup-0" com.alibaba.datax.common.exception.DataXException: Code:[MYSQLErrCode-05], Description:[SQL语句执行出错,请检查Where条件是否存在拼写或语法错误].  - 执行的SQL为: select stu_id,stu_age,stu_name from student where (id != 3) 具体错误信息为:com.mysql.jdbc.exceptions.jdbc4.MySQLSyntaxErrorException: Unknown column 'id' in 'where clause'
        at com.alibaba.datax.common.exception.DataXException.asDataXException(DataXException.java:26)
        at com.alibaba.datax.plugin.rdbms.util.RdbmsException.asQueryException(RdbmsException.java:81)
        at com.alibaba.datax.plugin.rdbms.reader.CommonRdbmsReader$Task.startRead(CommonRdbmsReader.java:220)
        at com.alibaba.datax.plugin.reader.mysqlreader.MysqlReader$Task.startRead(MysqlReader.java:81)
        at com.alibaba.datax.core.taskgroup.runner.ReaderRunner.run(ReaderRunner.java:57)
        at java.lang.Thread.run(Thread.java:748)

  • hdfs webui效果图
    datax的mysql 到hdfs文件系统 --- 支持高可用_第1张图片

你可能感兴趣的:(datax,hadoop,hive,spring,大数据)