# -*- coding=utf-8 -*-
import warnings
warnings.filterwarnings("ignore")
# public Database config
# --- Source MySQL connection settings ---------------------------------------
# Each srcMysqlConfig_* dict is consumed positionally by getSrcMysqlConfig(),
# which expects exactly the keys: host, port, user, passwd, db.
# NOTE(review): 'MysqlHostInnerIp' / 'MysqlUser' / 'MysqlPass' look like
# sanitized placeholders -- substitute real values before running.
srcMysqlConfig_Tv_server = {
'host': 'MysqlHostInnerIp',
# 'host': 'MysqlHostOuterIp',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 50506,
'db': 'Tv_server'
}
srcMysqlConfig_Tv_user = {
'host': 'MysqlHostInnerIp',
# 'host': 'MysqlHostOuterIp',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 50514,
'db': 'Tv_user'
}
srcMysqlConfig_Tv_seed = {
'host': 'MysqlHostInnerIp',
# 'host': 'MysqlHostOuterIp',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 50029,
'db': 'Tv_seed'
}
srcMysqlConfig_Tv_event = {
'host': 'MysqlHostInnerIp',
# 'host': 'MysqlHostOuterIp',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 50512,
'db': 'Tv_event'
}
srcMysqlConfig_Tv_hadoop_stat = {
'host': 'MysqlHostInnerIp',
# 'host': 'MysqlHostOuterIp',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 6605,
'db': 'Tv_hadoop_stat'
}
# Local staging directory for MySQL dumps before the Hive load.
tmp_data_dir = '/home/hadoop/nisj/mysql2Hive/dataTmp'
# Directory for the daily dataEtl_<date>.log files (see logRecord).
log_dir = '/home/hadoop/nisj/mysql2Hive/logs'
# -*- coding=utf-8 -*-
from conf.systemParConf import *
warnings.filterwarnings("ignore")
# Load config for game_zone: single table, daily increment on created_time date.
# FIX: original referenced srcMysqlConfig_jellyfish_server, which is not
# defined in systemParConf (it defines srcMysqlConfig_Tv_*) and raised
# NameError at import time; point at the config that actually exists.
srcTabConfig_game_zone = {
    'srcMysql_config': srcMysqlConfig_Tv_server,
    'src_tabName': 'game_zone',
    'tabType': 'single',
    'loadType': 'increment[substr(created_time,1,10)]'
}
# Load config for game: single table, full reload every run.
# FIX: srcMysqlConfig_jellyfish_seed is undefined in systemParConf (it
# defines srcMysqlConfig_Tv_seed) -- the import raised NameError.
srcTabConfig_game = {
    'srcMysql_config': srcMysqlConfig_Tv_seed,
    'src_tabName': 'game',
    'tabType': 'single',
    'loadType': 'whole'
}
# Load config for bless_wall_201612: single table, full reload every run.
# FIX: srcMysqlConfig_jellyfish_event is undefined in systemParConf (it
# defines srcMysqlConfig_Tv_event) -- the import raised NameError.
# NOTE(review): the variable name says match_apply but src_tabName is
# 'bless_wall_201612' -- confirm which table is intended.
srcTabConfig_match_apply = {
    'srcMysql_config': srcMysqlConfig_Tv_event,
    'src_tabName': 'bless_wall_201612',
    'tabType': 'single',
    'loadType': 'whole'
}
# Load config for user_profile: 256 shards (user_profile_0..user_profile_255),
# full reload every run.
# FIX: srcMysqlConfig_jellyfish_user is undefined in systemParConf (it
# defines srcMysqlConfig_Tv_user) -- the import raised NameError.
srcTabConfig_user_profile = {
    'srcMysql_config': srcMysqlConfig_Tv_user,
    'src_tabName': 'user_profile',
    'tabType': 'submeter-256',
    'loadType': 'whole'
}
# Load config for live_history_status: 28 shards, daily increment on the
# switch_time date.
# FIX: srcMysqlConfig_jellyfish_server is undefined in systemParConf (it
# defines srcMysqlConfig_Tv_server) -- the import raised NameError.
srcTabConfig_live_history_status = {
    'srcMysql_config': srcMysqlConfig_Tv_server,
    'src_tabName': 'live_history_status',
    'tabType': 'submeter-28',
    'loadType': 'increment[substr(switch_time,1,10)]'
}
# -*- coding=utf-8 -*-
import warnings
import datetime
from conf.tableParConf import *
warnings.filterwarnings("ignore")
def getNowDay():
    """Return today's date as a 'YYYY-MM-DD' string."""
    return datetime.datetime.today().strftime('%Y-%m-%d')
def getYesterDay():
    """Return yesterday's date as a 'YYYY-MM-DD' string."""
    one_day = datetime.timedelta(days=1)
    return (datetime.datetime.today() - one_day).strftime('%Y-%m-%d')
def getRunDay(runDay):
    """Return runDay unchanged, or yesterday's date when runDay is ''.

    '' is this project's "use the default batch day" marker (see the
    runDayList handling in mysql2HiveSerialCtl).
    Cleanup: the original's no-op `runDay = runDay` else-branch removed.
    """
    if runDay == '':
        return getYesterDay()
    return runDay
def dateRange(beginDate, endDate):
    """Return every 'YYYY-MM-DD' date from beginDate through endDate inclusive.

    Returns [] when beginDate > endDate. Relies on ISO date strings
    comparing in chronological order.
    """
    one_day = datetime.timedelta(days=1)
    collected = []
    cursor_dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    current = beginDate[:]
    while current <= endDate:
        collected.append(current)
        cursor_dt += one_day
        current = cursor_dt.strftime("%Y-%m-%d")
    return collected
def getSrcMysqlConfig(srcMysql_config):
    """Unpack a srcMysqlConfig_* dict into (host, port, user, passwd, db).

    Callers index the result positionally, so this order is part of the
    contract. Cleanup: removed the original's no-op self-assignment.
    """
    return (srcMysql_config['host'],
            srcMysql_config['port'],
            srcMysql_config['user'],
            srcMysql_config['passwd'],
            srcMysql_config['db'])
def getTabParName(tabParListFilter):
    """Return the list of srcTabConfig_* variable names to process.

    When tabParListFilter is [''] (the "no filter" marker), parse
    conf/tableParConf.py and collect every uncommented assignment of the
    form `name = {`. Otherwise the filter list itself is the answer.

    Improvement: an explicit filter is returned up front, so the config
    file is no longer opened and parsed only to throw the result away.
    NOTE(review): the '#' test skips any line containing '#' anywhere,
    including config lines with trailing inline comments -- confirm that
    is intended.
    """
    if tabParListFilter != ['']:
        return tabParListFilter
    tabParList = []
    with open("conf/tableParConf.py") as tabPar:
        for ConfigLine in tabPar:
            if '#' not in ConfigLine and ConfigLine.replace('\n', '') != '' and ' = {' in ConfigLine:
                tabParList.append(ConfigLine.split(' = ')[0])
    return tabParList
# batch_test
# print getRunDay(runDay='')
# -*- coding=utf-8 -*-
import os
import re
from public.getSystemPar import *
warnings.filterwarnings("ignore")
def mysqlTabCreateScript(srcMysql_config, src_tabName, tabType, loadType):
    """Build the Hive DDL for one source MySQL table.

    Queries information_schema through the mysql CLI for the column list,
    maps *int column types to bigint and everything else to string, and
    returns (create_script, column_name_list). Incremental tables get a
    data_day partition column.

    Cleanups: connection settings unpacked once (was five separate
    getSrcMysqlConfig calls); explicit error for an unknown tabType (was
    an UnboundLocalError); stray trailing semicolon removed.
    """
    # Initialize connection parameters from the config dict.
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)
    if tabType == 'single':
        srcTabName = src_tabName
    elif 'submeter' in tabType:
        # Shards share one schema; probe the first shard for its columns.
        srcTabName = src_tabName + "_0"
    else:
        raise ValueError('unknown tabType: {tabType}'.format(tabType=tabType))
    srcTabStructure = os.popen("""source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
-N -e"set names utf8; \
select a2.column_name,case when a2.data_type like '%int' then 'bigint' else 'string' end data_type
from information_schema.TABLES a1
left join information_schema.columns a2 on a1.TABLE_SCHEMA=a2.TABLE_SCHEMA and a1.TABLE_NAME=a2.TABLE_NAME
where a1.TABLE_SCHEMA='{db}' and a1.table_name ='{srcTabName}'
order by a2.ORDINAL_POSITION;" \
""" .format(host=host, port=port, user=user, passwd=passwd, db=db, srcTabName=srcTabName)).readlines()
    # mysql -N emits headerless "column\ttype" rows; split them apart.
    srcTabCol_list = []
    for stcList in srcTabStructure:
        stc = re.split('\t', stcList.replace('\n', ''))
        srcTabCol_list.append(stc)
    TabCreateScript = 'drop table if exists xxx_{src_tabName};\ncreate table xxx_{src_tabName}(\n'.format(src_tabName=src_tabName)
    colList = []
    for srcColType in srcTabCol_list:
        # \` keeps the backtick literal when this DDL is later interpolated
        # into a double-quoted shell string (hive -e "...").
        TabCreateScript = TabCreateScript + '\`' + srcColType[0] + '\`' + ' ' + srcColType[1] + ',\n'
        colList.append(srcColType[0])
    # [:-2] trims the trailing ",\n" after the last column.
    if 'whole' in loadType:
        TabCreateScript = TabCreateScript[:-2] + ")row format delimited fields terminated by '\t' lines terminated by '\n';"
    elif 'increment' in loadType:
        TabCreateScript = TabCreateScript[:-2] + ") partitioned by(data_day varchar(10)) row format delimited fields terminated by '\t' lines terminated by '\n';"
    return TabCreateScript, colList
# Batch Test
import logging
import datetime
from conf.systemParConf import log_dir
def logRecord(message):
    """Append an INFO line to {log_dir}/dataEtl_<today>.log.

    NOTE(review): logging.basicConfig is a no-op once the root logger has
    handlers, so the dated filename is frozen at the first call -- in a
    long-running process, messages logged after midnight keep going to the
    previous day's file. Confirm whether the cron-per-run lifecycle makes
    that acceptable.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(funcName)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %a %H:%M:%S',
                        filename='{log_dir}/dataEtl_{currDay}.log'.format(log_dir=log_dir, currDay=datetime.datetime.today().strftime('%Y-%m-%d')),
                        filemode='a')
    logging.info('{message}'.format(message=message))
# -*- coding=utf-8 -*-
from public.getSrcMetadata import *
warnings.filterwarnings("ignore")
def HiveCreateTab(srcMysql_config, src_tabName, tabType, logType):
    """Create (drop-and-recreate) the Hive landing table for one source table.

    logType is the load-type string ('whole' / 'increment[...]'); it is
    forwarded to mysqlTabCreateScript as its loadType argument.
    """
    create_sql = mysqlTabCreateScript(srcMysql_config, src_tabName, tabType, logType)[0]
    hive_cmd = """/usr/lib/hive-current/bin/hive -e "{TabCreateScript}" """.format(TabCreateScript=create_sql)
    os.system(hive_cmd)
# Batch Test
# -*- coding=utf-8 -*-
from public.getSrcMetadata import *
from public.getSystemPar import *
warnings.filterwarnings("ignore")
def mysqlDataDownload(srcMysql_config, src_tabName, tabType, loadType, runDay):
    """Dump one source table (all shards) into {tmp_data_dir}/xxx_{tab}.txt.

    Every column is wrapped in nested REPLACEs that substitute comma,
    newline and tab with placeholder tokens so the tab-delimited Hive load
    cannot be broken by embedded delimiters. Shards append to one file.

    Cleanups: connection settings unpacked once (was five separate
    getSrcMysqlConfig calls); `== False` replaced with `not`; the loop
    variable is no longer reassigned to build the shard suffix.
    """
    # Initialize connection parameters from the config dict.
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)
    if not os.path.exists('{tmp_data_dir}/'.format(tmp_data_dir=tmp_data_dir)):
        os.system('mkdir -p {tmp_data_dir}/'.format(tmp_data_dir=tmp_data_dir))
    colList = mysqlTabCreateScript(srcMysql_config, src_tabName, tabType, loadType)[1]
    allColChars = ''
    for colName in colList:
        allColChars = allColChars + 'replace(replace(replace(replace(\`' + colName + '\`,\',\',\'[comma]\'),\'\\n\',\'[newline-n]\'),\'\\r\',\'[newline-r]\'),\'\\t\',\'[tab]\')' + ','
    if tabType == 'single':
        submeter_cnt = 1
    elif 'submeter' in tabType:
        # 'submeter-N' means N shards named {tab}_0 .. {tab}_{N-1}.
        submeter_cnt = int(str(tabType).replace('submeter-', ''))
    else:
        raise ValueError('unknown tabType: {tabType}'.format(tabType=tabType))
    if 'whole' in loadType:
        sql_where = ''
    elif 'increment' in loadType:
        # loadType is 'increment[<day expression>]'; filter rows to runDay.
        sql_where = 'where ' + str(loadType.replace('increment[', '')).replace(']', '') + ' = \'{runDay}\''.format(runDay=runDay)
    else:
        raise ValueError('unknown loadType: {loadType}'.format(loadType=loadType))
    # Start from a clean dump file; shard queries append (>>) below.
    os.system("rm -rf {tmp_data_dir}/xxx_{src_tabName}.txt".format(src_tabName=src_tabName, tmp_data_dir=tmp_data_dir))
    for shard_idx in range(0, submeter_cnt, 1):
        suffix = '' if submeter_cnt == 1 else "_" + str(shard_idx)
        os.system("""source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
-N -e"set names utf8; \
select {allColChars} from {db}.{src_tabName}{submeterPlus} {sql_where};" \
>>{tmp_data_dir}/xxx_{src_tabName}.txt \
""".format(host=host, port=port, user=user, passwd=passwd, db=db, src_tabName=src_tabName, tmp_data_dir=tmp_data_dir, allColChars=allColChars[:-1], submeterPlus=suffix, sql_where=sql_where))
def DataUploadHive(src_tabName, loadType, runDay):
    """Load the dumped text file into Hive, then delete the tmp file.

    whole: plain overwrite load of the entire table.
    increment: drop-and-re-add the runDay partition first, so re-running
    the same day is idempotent, then load into that partition.
    """
    partitionProc_sql = """alter table xxx_{src_tabName} drop if exists partition (data_day = '{runDay}');alter table xxx_{src_tabName} add partition (data_day = '{runDay}');""".format(src_tabName=src_tabName, runDay=runDay)
    if 'whole' in loadType:
        os.system("""/usr/lib/hive-current/bin/hive -e "load data local inpath '{tmp_data_dir}/xxx_{src_tabName}.txt' overwrite into table xxx_{src_tabName};" """.format(src_tabName=src_tabName, tmp_data_dir=tmp_data_dir))
    elif 'increment' in loadType:
        os.system("""/usr/lib/hive-current/bin/hive -e "{partitionProc_sql}load data local inpath '{tmp_data_dir}/xxx_{src_tabName}.txt' overwrite into table xxx_{src_tabName} partition (data_day = '{runDay}');" """.format(src_tabName=src_tabName, tmp_data_dir=tmp_data_dir, runDay=runDay, partitionProc_sql=partitionProc_sql))
    # Remove the staging dump regardless of load type.
    os.system("rm -rf {tmp_data_dir}/xxx_{src_tabName}.txt ".format(src_tabName=src_tabName, tmp_data_dir=tmp_data_dir))
def MysqlData2hive(srcMysql_config, src_tabName, tabType, loadType, runDay):
    """Run the full pipeline for one table: dump from MySQL, load into Hive."""
    # Step 1: dump the source rows to the local staging file.
    mysqlDataDownload(srcMysql_config, src_tabName, tabType, loadType, runDay)
    # Step 2: ship the staging file into the Hive table and clean up.
    DataUploadHive(src_tabName, loadType, runDay)
# Batch Test
# -*- coding=utf-8 -*-
from mysql2hive.mysqlData2Hive import *
from mysql2hive.hiveTabCreate import *
from public.logRecord import logRecord
warnings.filterwarnings("ignore")
def Mysql2Hive(srcMysql_config, src_tabName, tabType, loadType, isIncrementInt, runDay):
    """(Re)create the Hive table when required, then transfer the data.

    Whole-load tables are recreated on every run. Incremental tables are
    only recreated when isIncrementInt == 'True' (the initialisation run),
    since recreating them would drop the existing partitions.
    """
    needs_create = ('whole' in loadType) or \
                   ('increment' in loadType and isIncrementInt == 'True')
    if needs_create:
        HiveCreateTab(srcMysql_config, src_tabName, tabType, loadType)
    MysqlData2hive(srcMysql_config, src_tabName, tabType, loadType, runDay)
def mysql2HiveSerialBatch(isIncrementInt, runDay, tabParListFilter):
    """Serially load every configured table for one runDay, with logging.

    Security fix: config names returned by getTabParName are plain
    identifiers, so they are resolved with a globals() lookup instead of
    eval() -- eval would execute arbitrary text if conf/tableParConf.py
    were ever tampered with. Behaviour is identical for valid names
    (both resolve in this module's global namespace).
    """
    for tabParConfigName in getTabParName(tabParListFilter):
        tabParConfig = globals()[tabParConfigName]
        logRecord(message='Table {src_tabName}({tabType}) data {loadType} load from mysql to hive start...'.format(src_tabName=tabParConfig['src_tabName'], tabType=tabParConfig['tabType'], loadType=tabParConfig['loadType']))
        Mysql2Hive(tabParConfig['srcMysql_config'], tabParConfig['src_tabName'], tabParConfig['tabType'], tabParConfig['loadType'], isIncrementInt, runDay)
        logRecord(message='Table {src_tabName}({tabType}) data {loadType} load from mysql to hive finished!'.format(src_tabName=tabParConfig['src_tabName'], tabType=tabParConfig['tabType'], loadType=tabParConfig['loadType']))
def mysql2HiveSerialCtl(isIncrementInt, runDayList, tabParListFilter):
    """Run the serial batch once per entry of runDayList.

    An entry of '' means "default batch day" (yesterday), resolved by
    getRunDay before the batch starts.
    """
    for day in runDayList:
        resolved_day = getRunDay(day)
        mysql2HiveSerialBatch(isIncrementInt=isIncrementInt, runDay=resolved_day, tabParListFilter=tabParListFilter)
# Incremental-table initialisation happens if and only if isIncrementInt='True';
# in every other case incremental tables are never (re)initialised.
# When runDayList = [''], the normal daily batch runs; otherwise the batch runs
# for each date in the list, e.g. runDayList = ['2017-06-18','2017-06-19','2017-06-20'].
# When tabParListFilter = [''], no table filtering is applied and every
# uncommented table config in tableParConf is processed; to filter, pass e.g.
# tabParListFilter = ['srcTabConfig_game_zone','srcTabConfig_game'].
# Example: normal daily batch run:
# --- Daily batch invocation (executes at import/run time) -------------------
tabParListFilter = ['']   # [''] = no filter: process every configured table
isIncrementInt = 'F'      # anything other than 'True' skips incremental-table (re)creation
runDayList = ['']         # [''] = default batch day (yesterday)
mysql2HiveSerialCtl(isIncrementInt, runDayList, tabParListFilter)
# Backfill example for selected tables:
# tabParListFilter = ['srcTabConfig_live_history_status']
# isIncrementInt = 'F'
# runDayList = ['2017-06-21','2017-06-22','2017-06-23']
# mysql2HiveSerialCtl(isIncrementInt, runDayList, tabParListFilter)
[hadoop@emr-worker-9 mysql2Hive]$ crontab -l
48 10 * * * python /home/hadoop/nisj/mysql2Hive/SerialBatch.py >> /home/hadoop/nisj/mysql2Hive/logs/SerialBatch.log 2>&1
Appendix: program paths