DataX是阿里开源的一个异构数据源离线同步工具,底层源码使用java开发,编译完成后用python执行,首先我们先来分析datax.py的执行文件
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import sys
import os
import signal
import subprocess
import time
import re
import socket
import json
from optparse import OptionParser
from optparse import OptionGroup
from string import Template
import codecs
import platform
def isWindows():
return platform.system() == 'Windows'
DATAX_HOME = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
DATAX_VERSION = 'DATAX-OPENSOURCE-3.0'
if isWindows():
codecs.register(lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None)
CLASS_PATH = ("%s/lib/*") % (DATAX_HOME)
else:
CLASS_PATH = ("%s/lib/*:.") % (DATAX_HOME)
LOGBACK_FILE = ("%s/conf/logback.xml") % (DATAX_HOME)
DEFAULT_JVM = "-Xms1g -Xmx1g -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=%s/log" % (DATAX_HOME)
DEFAULT_PROPERTY_CONF = "-Dfile.encoding=UTF-8 -Dlogback.statusListenerClass=ch.qos.logback.core.status.NopStatusListener -Djava.security.egd=file:///dev/urandom -Ddatax.home=%s -Dlogback.configurationFile=%s" % (
DATAX_HOME, LOGBACK_FILE)
ENGINE_COMMAND = "java -server ${jvm} %s -classpath %s ${params} com.alibaba.datax.core.Engine -mode ${mode} -jobid ${jobid} -job ${job}" % (
DEFAULT_PROPERTY_CONF, CLASS_PATH)
REMOTE_DEBUG_CONFIG = "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,address=9999"
RET_STATE = {
"KILL": 143,
"FAIL": -1,
"OK": 0,
"RUN": 1,
"RETRY": 2
}
def getLocalIp():
try:
return socket.gethostbyname(socket.getfqdn(socket.gethostname()))
except:
return "Unknown"
def suicide(signum, e):
global child_process
print >> sys.stderr, "[Error] DataX receive unexpected signal %d, starts to suicide." % (signum)
if child_process:
child_process.send_signal(signal.SIGQUIT)
time.sleep(1)
child_process.kill()
print >> sys.stderr, "DataX Process was killed ! you did ?"
sys.exit(RET_STATE["KILL"])
def register_signal():
if not isWindows():
global child_process
signal.signal(2, suicide)
signal.signal(3, suicide)
signal.signal(15, suicide)
def getOptionParser():
usage = "usage: %prog [options] job-url-or-path"
parser = OptionParser(usage=usage)
prodEnvOptionGroup = OptionGroup(parser, "Product Env Options",
"Normal user use these options to set jvm parameters, job runtime mode etc. "
"Make sure these options can be used in Product Env.")
prodEnvOptionGroup.add_option("-j", "--jvm", metavar="", dest="jvmParameters", action="store",
default=DEFAULT_JVM, help="Set jvm parameters if necessary.")
prodEnvOptionGroup.add_option("--jobid", metavar="", dest="jobid", action="store", default="-1",
help="Set job unique id when running by Distribute/Local Mode.")
prodEnvOptionGroup.add_option("-m", "--mode", metavar="",
action="store", default="standalone",
help="Set job runtime mode such as: standalone, local, distribute. "
"Default mode is standalone.")
prodEnvOptionGroup.add_option("-p", "--params", metavar="",
action="store", dest="params",
help='Set job parameter, eg: the source tableName you want to set it by command, '
'then you can use like this: -p"-DtableName=your-table-name", '
'if you have mutiple parameters: -p"-DtableName=your-table-name -DcolumnName=your-column-name".'
'Note: you should config in you job tableName with ${tableName}.')
prodEnvOptionGroup.add_option("-r", "--reader", metavar="",
action="store", dest="reader",type="string",
help='View job config[reader] template, eg: mysqlreader,streamreader')
prodEnvOptionGroup.add_option("-w", "--writer", metavar="",
action="store", dest="writer",type="string",
help='View job config[writer] template, eg: mysqlwriter,streamwriter')
parser.add_option_group(prodEnvOptionGroup)
devEnvOptionGroup = OptionGroup(parser, "Develop/Debug Options",
"Developer use these options to trace more details of DataX.")
devEnvOptionGroup.add_option("-d", "--debug", dest="remoteDebug", action="store_true",
help="Set to remote debug mode.")
devEnvOptionGroup.add_option("--loglevel", metavar="", dest="loglevel", action="store",
default="info", help="Set log level such as: debug, info, all etc.")
parser.add_option_group(devEnvOptionGroup)
return parser
def generateJobConfigTemplate(reader, writer):
readerRef = "Please refer to the %s document:\n https://github.com/alibaba/DataX/blob/master/%s/doc/%s.md \n" % (reader,reader,reader)
writerRef = "Please refer to the %s document:\n https://github.com/alibaba/DataX/blob/master/%s/doc/%s.md \n " % (writer,writer,writer)
print readerRef
print writerRef
jobGuid = 'Please save the following configuration as a json file and use\n python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json \nto run the job.\n'
print jobGuid
jobTemplate={
"job": {
"setting": {
"speed": {
"channel": ""
}
},
"content": [
{
"reader": {},
"writer": {}
}
]
}
}
readerTemplatePath = "%s/plugin/reader/%s/plugin_job_template.json" % (DATAX_HOME,reader)
writerTemplatePath = "%s/plugin/writer/%s/plugin_job_template.json" % (DATAX_HOME,writer)
try:
readerPar = readPluginTemplate(readerTemplatePath);
except Exception, e:
print "Read reader[%s] template error: can\'t find file %s" % (reader,readerTemplatePath)
try:
writerPar = readPluginTemplate(writerTemplatePath);
except Exception, e:
print "Read writer[%s] template error: : can\'t find file %s" % (writer,writerTemplatePath)
jobTemplate['job']['content'][0]['reader'] = readerPar;
jobTemplate['job']['content'][0]['writer'] = writerPar;
print json.dumps(jobTemplate, indent=4, sort_keys=True)
def readPluginTemplate(plugin):
with open(plugin, 'r') as f:
return json.load(f)
def isUrl(path):
if not path:
return False
assert (isinstance(path, str))
m = re.match(r"^http[s]?://\S+\w*", path.lower())
if m:
return True
else:
return False
def buildStartCommand(options, args):
commandMap = {}
tempJVMCommand = DEFAULT_JVM
if options.jvmParameters:
tempJVMCommand = tempJVMCommand + " " + options.jvmParameters
if options.remoteDebug:
tempJVMCommand = tempJVMCommand + " " + REMOTE_DEBUG_CONFIG
print 'local ip: ', getLocalIp()
if options.loglevel:
tempJVMCommand = tempJVMCommand + " " + ("-Dloglevel=%s" % (options.loglevel))
if options.mode:
commandMap["mode"] = options.mode
# jobResource 可能是 URL,也可能是本地文件路径(相对,绝对)
jobResource = args[0]
if not isUrl(jobResource):
jobResource = os.path.abspath(jobResource)
if jobResource.lower().startswith("file://"):
jobResource = jobResource[len("file://"):]
jobParams = ("-Dlog.file.name=%s") % (jobResource[-20:].replace('/', '_').replace('.', '_'))
if options.params:
jobParams = jobParams + " " + options.params
if options.jobid:
commandMap["jobid"] = options.jobid
commandMap["jvm"] = tempJVMCommand
commandMap["params"] = jobParams
commandMap["job"] = jobResource
return Template(ENGINE_COMMAND).substitute(**commandMap)
def printCopyright():
print '''
DataX (%s), From Alibaba !
Copyright (C) 2010-2016, Alibaba Group. All Rights Reserved.
''' % DATAX_VERSION
sys.stdout.flush()
# 程序入口
if __name__ == "__main__":
# 打印版本号
printCopyright()
# 获取执行程序参数选项
parser = getOptionParser()
# 获取用户执行程序传入的参数
options, args = parser.parse_args(sys.argv[1:])
# 判断reader和writer参数是否为空,如果不为空导入官方模板并退出
if options.reader is not None and options.writer is not None:
generateJobConfigTemplate(options.reader,options.writer)
sys.exit(RET_STATE['OK'])
# 如果传入文件路径不是1个,打印帮助选项,并退出
if len(args) != 1:
parser.print_help()
sys.exit(RET_STATE['FAIL'])
# 生成启动指令
startCommand = buildStartCommand(options, args)
# print startCommand
# 执行指令
child_process = subprocess.Popen(startCommand, shell=True)
# 监听除windows系统外的信号
register_signal()
(stdout, stderr) = child_process.communicate()
# 系统退出返回码
sys.exit(child_process.returncode)
我在主程序的代码上分别做了注释,下面我将逐一详细分析
首先程序会打印datax的版本号
随后利用optparse 模块配置传入参数的格式,我们可以看到参数分为两组一组为运行参数,我们常用的就是-p,另一组用来选择debug模式和日志等级
值得一提的是,在后面有两个对参数的判断,如果-r和-w都不为空,则调用datax自带的模板
其次如果传入的路径不是唯一则会打印help的提示,并退出程序, 运行效果分别如下
在确认传入参数后,将这些参数合并成为java指令
我们尝试将指令打印出来发现就是一条java指令,只是python利用Popen来执行了这条java指令
生成指令后就是执行指令,随后则是监听信号,将监听signum数字为2,3,15的信号,分别为SIGINT,SIGQUIT,SIGTERM都是为退出的指令,在接受信号后程序将会停止,并打印停止信息
我们用ctrl+z打断程序 会打印如下信息,信号为2
最后程序会根据退出码退出