Mysql的数据表分为两种形式:普通表,以及按键值对256取模进行分表的表;处理流程是先根据Mysql表结构在Hive库上建表,然后将数据从Mysql下载到操作系统文件,最后将数据装载到Hive库中。
目前,暂不考虑数据的增量更新;主要考虑目前的数据环境较乱,可以较方便的将数据拉取到大数据平台,然后进行相关的计算。
程序调用有并行和串行两种方式;是否使用并行调度脚本,取决于服务器的磁盘空间大小及源Mysql库的性能。
代码调用的时候(串并行),主要修改两方面:Mysql数据源配置信息及传表名、表类型参数给调度程序。
1、参数处理代码
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/ParProc.py
# -*- coding=utf-8 -*-
import warnings
import datetime
warnings.filterwarnings("ignore")
def getNowDay():
    """Return the current local date formatted as 'YYYY-MM-DD'."""
    return datetime.datetime.today().strftime('%Y-%m-%d')
def dateRange(beginDate, endDate):
    """Return every date from beginDate to endDate inclusive, as 'YYYY-MM-DD' strings.

    Comparison is lexicographic, which is correct for zero-padded ISO dates;
    an empty list is returned when beginDate > endDate.
    """
    day_list = []
    cursor = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    day = beginDate
    while day <= endDate:
        day_list.append(day)
        cursor += datetime.timedelta(days=1)
        day = cursor.strftime("%Y-%m-%d")
    return day_list
def getSrcMysqlConfig():
    """Return the source-MySQL connection settings as (host, port, user, passwd, db).

    Edit the dictionary below when pointing the transfer at a different
    MySQL environment; every script in this tool chain reads it from here.
    """
    cfg = {
        'host': 'host-string',
        'user': 'user-string',
        'passwd': 'passwd-string',
        'port': 50506,
        'db': 'db-string',
    }
    return cfg['host'], cfg['port'], cfg['user'], cfg['passwd'], cfg['db']
2、Hive表结构生成代码
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/HiveCreateTab.py
# -*- coding=utf-8 -*-
import os
import re
from ParProc import *
warnings.filterwarnings("ignore")
def mysqlTabCreateScript(src_tabName, tabType):
    """Build the Hive DDL (drop + create) for one MySQL source table.

    src_tabName: source table name (for sharded tables, the base name
                 without the numeric suffix).
    tabType: 'single' for a plain table; 'submeter' for a table sharded
             into <name>_0 .. <name>_255, where shard _0 is used as the
             structure template.
    Returns the DDL string for HiveCreateTab to pass to `hive -e`.
    Raises ValueError on an unknown tabType (previously this fell through
    to a NameError on srcTabStructure).
    """
    # Fetch the connection settings once instead of five separate calls.
    host, port, user, passwd, db = getSrcMysqlConfig()
    # The two branches only differ in the table whose structure is read.
    if tabType == 'single':
        structure_tabName = src_tabName
    elif tabType == 'submeter':
        structure_tabName = '{src_tabName}_0'.format(src_tabName=src_tabName)
    else:
        raise ValueError("unknown tabType: %r (expected 'single' or 'submeter')" % (tabType,))
    # Column list from information_schema; every MySQL *int type maps to
    # Hive bigint, anything else to string.
    srcTabStructure = os.popen("""source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
-N -e"set names utf8; \
select a2.column_name,case when a2.data_type like '%int' then 'bigint' else 'string' end data_type
from information_schema.TABLES a1
left join information_schema.columns a2 on a1.TABLE_SCHEMA=a2.TABLE_SCHEMA and a1.TABLE_NAME=a2.TABLE_NAME
where a1.TABLE_SCHEMA='{db}' and a1.table_name ='{structure_tabName}'
order by a2.ORDINAL_POSITION;" \
""".format(host=host, port=port, user=user, passwd=passwd, db=db, structure_tabName=structure_tabName)).readlines()
    # Each output row is "<column>\t<hive_type>".
    srcTabCol_list = []
    for row in srcTabStructure:
        srcTabCol_list.append(re.split('\t', row.replace('\n', '')))
    TabCreateScript = 'drop table if exists xxx_{src_tabName};\ncreate table xxx_{src_tabName}(\n'.format(src_tabName=src_tabName)
    for srcColType in srcTabCol_list:
        # '\\`' emits a literal backslash-backtick: the backslash keeps the
        # backtick from being treated as command substitution when the DDL
        # is later wrapped in shell double quotes by `hive -e "..."`.
        TabCreateScript = TabCreateScript + '\\`' + srcColType[0] + '\\`' + ' ' + srcColType[1] + ',\n'
    # [:-2] drops the trailing ',\n' after the last column.
    # NOTE(review): the trailing ';;' reproduces the original output — verify
    # that Hive tolerates the resulting empty statement.
    TabCreateScript = TabCreateScript[:-2] + ")row format delimited fields terminated by '\t' lines terminated by '\n';;"
    return TabCreateScript
def HiveCreateTab(src_tabName, tabType):
    """Generate the DDL for the source table and execute it via `hive -e`,
    creating Hive table xxx_<src_tabName>."""
    ddl = mysqlTabCreateScript(src_tabName, tabType)
    hive_cmd = """/usr/lib/hive-current/bin/hive -e "{TabCreateScript}" """.format(TabCreateScript=ddl)
    os.system(hive_cmd)
# Batch Test
# HiveCreateTab(src_tabName='game_zone', tabType='single')
# HiveCreateTab(src_tabName='gift_record', tabType='submeter')
3、数据download&upload的代码
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/MysqlData2Hive.py
# -*- coding=utf-8 -*-
import os
from ParProc import *
warnings.filterwarnings("ignore")
def mysqlDataDownload(src_tabName, tabType):
    """Dump a MySQL table to /home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_<name>.txt.

    src_tabName: source table name (base name for sharded tables).
    tabType: 'single' dumps the table in one mysql call; 'submeter' appends
             all 256 shards (<name>_0 .. <name>_255) into the same file.
    Raises ValueError on an unknown tabType (previously a silent no-op).
    """
    # Fetch the connection settings once instead of five separate calls.
    host, port, user, passwd, db = getSrcMysqlConfig()
    data_dir = '/home/hadoop/nisj/Mysql2Hive/tmp_data/'
    # os.makedirs replaces the shelled-out `mkdir -p`.
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    if tabType == 'single':
        os.system("""source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
-N -e"set names utf8; \
select * from {src_tabName};" \
>/home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_{src_tabName}.txt \
""".format(host=host, port=port, user=user, passwd=passwd, db=db, src_tabName=src_tabName))
    elif tabType == 'submeter':
        # Remove any stale file first, because every shard is appended with '>>'.
        os.system("rm -rf /home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_{src_tabName}.txt".format(src_tabName=src_tabName))
        for submeterPlus in range(256):
            os.system("""source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
-N -e"set names utf8; \
select * from {db}.{src_tabName}_{submeterPlus} ;" \
>>/home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_{src_tabName}.txt \
""".format(host=host, port=port, user=user, passwd=passwd, db=db, src_tabName=src_tabName, submeterPlus=submeterPlus))
    else:
        raise ValueError("unknown tabType: %r (expected 'single' or 'submeter')" % (tabType,))
def DataUploadHive(src_tabName):
    """Load the downloaded dump file into Hive table xxx_<src_tabName>
    (overwriting existing data), then delete the local file."""
    load_cmd = """/usr/lib/hive-current/bin/hive -e "load data local inpath '/home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_{src_tabName}.txt' overwrite into table xxx_{src_tabName};" """.format(src_tabName=src_tabName)
    cleanup_cmd = "rm -rf /home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_{src_tabName}.txt ".format(src_tabName=src_tabName)
    os.system(load_cmd)
    os.system(cleanup_cmd)
def MysqlData2hive(src_tabName, tabType):
    # End-to-end transfer for one table: dump from MySQL to a local file,
    # then load that file into the Hive table xxx_<src_tabName>.
    mysqlDataDownload(src_tabName, tabType)
    DataUploadHive(src_tabName)
# Batch Test
# MysqlData2hive(src_tabName='game_zone', tabType='single')
# MysqlData2hive(src_tabName='gift_record', tabType='submeter')
4、串行调度的代码
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/Mysql2HiveCtl.py
# -*- coding=utf-8 -*-
from HiveCreateTab import *
from MysqlData2Hive import *
warnings.filterwarnings("ignore")
def Mysql2Hive(src_tabName, tabType):
    # Full pipeline for one table: create the Hive table from the MySQL
    # structure, dump the MySQL data locally, then load it into Hive.
    HiveCreateTab(src_tabName, tabType)
    mysqlDataDownload(src_tabName, tabType)
    DataUploadHive(src_tabName)
# Batch Test — serial schedule: tables are created and transferred one
# after another; edit this list to change which tables are synced.
Mysql2Hive(src_tabName='game_zone', tabType='single')
Mysql2Hive(src_tabName='live_history_status', tabType='submeter')
Mysql2Hive(src_tabName='user_profile', tabType='submeter')
Mysql2Hive(src_tabName='dafa_timeline_game_video', tabType='single')
5、并行调度的代码
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/BatchThread.py
# -*- coding=utf-8 -*-
import threadpool
import time
from Mysql2HiveCtl import *
warnings.filterwarnings("ignore")
# Parallel schedule driven by the third-party `threadpool` package.
# Dates kept for potential batch-window logging; only now_time is printed.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
# (positional-args list, kwargs) tuples, the shape threadpool.makeRequests
# expects: each entry runs Mysql2Hive(table_name, table_type).
batch_SrcTab_list = [(['game_zone', 'single'], None), (['live_history_status', 'submeter'], None), (['user_profile', 'submeter'], None), (['dafa_timeline_game_video', 'single'], None)]
requests = []
request_Mysql2Hive_batchCtl = threadpool.makeRequests(Mysql2Hive, batch_SrcTab_list)
requests.extend(request_Mysql2Hive_batchCtl)
# 8 worker threads; suitable only if disk space and the source MySQL can
# sustain 8 concurrent dumps (see the notes at the top of this document).
main_pool = threadpool.ThreadPool(8)
[main_pool.putRequest(req) for req in requests]
# NOTE(review): indentation was lost in this paste; the nesting below is a
# reconstruction — confirm the tail (dismissedWorkers check and final print)
# belongs inside the __main__ guard.
if __name__ == '__main__':
    # Poll every 30s until all requests finish or the user interrupts.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            # All queued requests have completed.
            break
    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()
    now_time = time.strftime('%Y-%m-%d %X', time.localtime())
    print "当前时间是:",now_time