其他类似相关参考:
Python自动化拉取Mysql数据并装载到Hive(V3.0)-->
https://blog.csdn.net/BabyFish13/article/details/73885033
Python自动化拉取Mysql数据并装载到Hive(V2.0)-->
http://blog.csdn.net/babyfish13/article/details/70792158
Python自动化拉取Mysql数据并装载到Hive-->
http://blog.csdn.net/babyfish13/article/details/73618331
本文有两个版本:第一个版本通过sqoop在Mysql与Hive之间表对表传输,其间可以指定过滤条件,也支持多分表传输,但每张分表都要单独调用一次sqoop,速度较慢,效率会受影响;第二个版本通过sqoop的query参数进行传输,分表及过滤条件都在sql语句中控制(各分表用union all合并后一次导入),所以效率会好一些。
其实,无论是通过sqoop、还是先把数据select出来再load,或者通过datax传输,都只是手段;要形成完整方案,还需要用python或shell把各个步骤串联起来,才能供我们随心所欲地使用。
1、sqoop表对表传输
/Users/nisj/PycharmProjects/BiDataProc/love/mysqlData2HiveBySqoop0329.py
# -*- coding=utf-8 -*-
import os
import re
import warnings
import datetime
warnings.filterwarnings("ignore")
# Connection settings for the source MySQL database (schema jellyfish_server).
# NOTE(review): host/user/passwd look like placeholders -- replace before running.
srcMysqlConfig_jellyfish_server = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=50506,
    db='jellyfish_server'
)
def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate to endDate, inclusive.

    Both arguments are date strings in '%Y-%m-%d' form. The result is a list
    of zero-padded 'YYYY-MM-DD' strings in ascending order; it is empty when
    beginDate falls after endDate.

    Raises ValueError (from strptime) if either argument cannot be parsed.
    """
    day_format = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, day_format)
    last = datetime.datetime.strptime(endDate, day_format)
    one_day = datetime.timedelta(days=1)

    dates = []
    # Compare parsed dates rather than the raw strings: string ordering is
    # only correct for zero-padded input ('2018-3-30' sorts after '2018-04-01').
    while current <= last:
        dates.append(current.strftime(day_format))
        current += one_day
    return dates
def getSrcMysqlConfig(srcMysql_config):
    """Unpack a MySQL connection dict into a positional tuple.

    Returns (host, port, user, passwd, db), read from the dict keys of the
    same names. Raises KeyError if any of those keys is missing.
    """
    wanted_keys = ('host', 'port', 'user', 'passwd', 'db')
    return tuple(srcMysql_config[key] for key in wanted_keys)
def getMysqlTabCreateScript(srcMysql_config, src_tabName, tabType):
    """Build the Hive DDL script that mirrors a MySQL table into the ods database.

    The column list is read from MySQL's information_schema through the mysql
    command-line client. Columns whose data_type ends in 'int' become bigint;
    every other column becomes string.

    Parameters:
        srcMysql_config: connection dict accepted by getSrcMysqlConfig.
        src_tabName: source table name (without any shard postfix).
        tabType: 'single', or a string containing 'submeter' for a sharded
            table; for sharded tables the structure is taken from shard '_0'.

    Returns:
        A string holding 'use ods; drop table if exists ...; create table ...'
        for the table ods.{db}_{src_tabName}, partitioned by pt_day.

    Raises:
        ValueError: tabType is neither 'single' nor a 'submeter' type.
        RuntimeError: no column metadata came back (query failed or table is
            missing). Raising here keeps callers from executing a script that
            would drop the Hive table and then fail to recreate it.
    """
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)

    if tabType == 'single':
        src_postfix = ''
    elif 'submeter' in tabType:
        src_postfix = '_0'
    else:
        raise ValueError("unsupported tabType: {0!r}".format(tabType))

    # NOTE: every value is interpolated unquoted into a shell command line,
    # so only trusted configuration and table names may be passed in.
    metadata_cmd = """source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
-N -e"set names utf8; \
select a2.column_name,case when a2.data_type like '%int' then 'bigint' else 'string' end data_type
from information_schema.TABLES a1
left join information_schema.columns a2 on a1.TABLE_SCHEMA=a2.TABLE_SCHEMA and a1.TABLE_NAME=a2.TABLE_NAME
where a1.TABLE_SCHEMA='{db}' and a1.table_name ='{src_tabName}{src_postfix}'
order by a2.ORDINAL_POSITION;" \
""".format(host=host, port=port, user=user, passwd=passwd, db=db,
           src_tabName=src_tabName, src_postfix=src_postfix)

    pipe = os.popen(metadata_cmd)
    try:
        raw_rows = pipe.readlines()
    finally:
        pipe.close()

    # Each output row is "<column_name>\t<hive_type>"; skip blank lines.
    columns = []
    for raw_row in raw_rows:
        fields = raw_row.replace('\n', '').split('\t')
        if len(fields) >= 2:
            columns.append((fields[0], fields[1]))
    if not columns:
        raise RuntimeError(
            "no column metadata returned for {db}.{tab}; refusing to build a drop/create script".format(
                db=db, tab=src_tabName + src_postfix))

    header = 'use ods;\ndrop table if exists {db}_{src_tabName};\ncreate table {db}_{src_tabName}(\n'.format(
        src_tabName=src_tabName, db=db)
    # The backticks carry a literal backslash so they survive the
    # double-quoted `hive -e "..."` shell argument the script is run through.
    column_defs = ',\n'.join('\\`' + name + '\\` ' + hive_type for name, hive_type in columns)
    # '\t' and '\n' below are real tab/newline characters in the emitted script.
    footer = ") partitioned by (\\`pt_day\\` string) row format delimited fields terminated by '\t' lines terminated by '\n' location 'hdfs://emr-cluster/user/hive/warehouse/ods.db/{db}_{src_tabName}';".format(
        src_tabName=src_tabName, db=db)
    return header + column_defs + footer
def HiveCreateTab(srcMysql_config, src_tabName, tabType):
    """Generate the DDL script for src_tabName and run it through the Hive CLI.

    The script comes from getMysqlTabCreateScript; the exit status of the
    hive command is not inspected.
    """
    ddl_script = getMysqlTabCreateScript(srcMysql_config, src_tabName, tabType)
    hive_cmd = """/usr/lib/hive-current/bin/hive -e "{TabCreateScript}" """.format(TabCreateScript=ddl_script)
    os.system(hive_cmd)
def mysqlData2Hive(srcMysql_config, src_tabName, tabType, runDay, whereCondition):
    """Load one day of a MySQL table into its Hive partition with sqoop.

    Steps:
      1. Drop and re-add partition pt_day=runDay on ods.{db}_{src_tabName}.
      2. Run one `sqoop import --table ... --append` per source table into the
         partition directory, filtered by whereCondition.

    Parameters:
        srcMysql_config: connection dict accepted by getSrcMysqlConfig.
        src_tabName: source table name (without any shard postfix).
        tabType: 'single', or 'submeter-N' for a table split into shards
            {src_tabName}_0 .. {src_tabName}_{N-1}.
        runDay: partition value, also used in the target directory name.
        whereCondition: SQL predicate handed to sqoop's --where option.

    Raises:
        ValueError: tabType is not 'single'/'submeter-N', or N is not an integer.

    Exit statuses of the hive and sqoop commands are not inspected.
    """
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)

    # Resolve the shard list first so an invalid tabType fails before the
    # existing partition has been dropped.
    if tabType == 'single':
        submeter_postfixes = ['']
    elif 'submeter' in tabType:
        submeter_cnt = int(str(tabType).replace('submeter-', ''))
        submeter_postfixes = ['_' + str(idx) for idx in range(submeter_cnt)]
    else:
        raise ValueError("unsupported tabType: {0!r}".format(tabType))

    # Recreate the partition so a rerun for the same day starts from scratch.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e "
use ods;
alter table {db}_{src_tabName} drop if exists partition (pt_day='{runDay}'); \
alter table {db}_{src_tabName} add partition (pt_day='{runDay}');" \
""".format(db=db, src_tabName=src_tabName, runDay=runDay))

    # NOTE: values are interpolated unquoted into a shell command line, so
    # only trusted configuration and conditions may be passed in.
    # '\t' and '\n' are real tab/newline characters inside the single quotes.
    sqoop_template = """source /etc/profile; \
sqoop import \
--connect jdbc:mysql://{host}:{port}/{db}?zeroDateTimeBehavior=convertToNull \
--username {user} \
--password {passwd} \
--table {src_tabName}{submeterPostfix} \
--target-dir hdfs://emr-cluster/user/hive/warehouse/ods.db/{db}_{src_tabName}/pt_day={runDay} \
--append \
--fields-terminated-by '\t' \
--lines-terminated-by '\n' \
--driver com.mysql.jdbc.Driver -m 1 \
--where "{whereCondition}" \
"""

    # One import per shard; --append lets every shard land in the same directory.
    for submeterPostfix in submeter_postfixes:
        os.system(sqoop_template.format(
            host=host, port=port, user=user, passwd=passwd, db=db,
            src_tabName=src_tabName, submeterPostfix=submeterPostfix,
            runDay=runDay, whereCondition=whereCondition))
# Batch Test
# Recreate the Hive table for the sharded source, then load one pt_day
# partition per day in the range, filtering rows by their updated_time date.
HiveCreateTab(
    srcMysql_config=srcMysqlConfig_jellyfish_server,
    src_tabName='live_history_status',
    tabType='submeter-3'
)
for runDay in dateRange(beginDate='2018-03-25', endDate='2018-03-26'):
    mysqlData2Hive(
        srcMysql_config=srcMysqlConfig_jellyfish_server,
        src_tabName='live_history_status',
        tabType='submeter-3',
        runDay=runDay,
        whereCondition="substr(updated_time, 1, 10) = '{runDay}'".format(runDay=runDay)
    )

# Disabled example for a non-sharded table:
# HiveCreateTab(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='big_fans_detail', tabType='single')
# for runDay in dateRange(beginDate='2018-03-25', endDate='2018-03-26'):
#     mysqlData2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='big_fans_detail', tabType='single', runDay=runDay, whereCondition="")
2、通过query进行数据传输
/Users/nisj/PycharmProjects/BiDataProc/love/mysqlData2HiveBySqoop.py
# -*- coding=utf-8 -*-
import os
import re
import warnings
import datetime
warnings.filterwarnings("ignore")
# Connection settings for the source MySQL database (schema jellyfish_server).
# NOTE(review): host/user/passwd look like placeholders -- replace before running.
srcMysqlConfig_jellyfish_server = dict(
    host='MysqlHost',
    user='MysqlUser',
    passwd='MysqlPass',
    port=50506,
    db='jellyfish_server'
)
def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate to endDate, inclusive.

    Both arguments are date strings in '%Y-%m-%d' form. The result is a list
    of zero-padded 'YYYY-MM-DD' strings in ascending order; it is empty when
    beginDate falls after endDate.

    Raises ValueError (from strptime) if either argument cannot be parsed.
    """
    day_format = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, day_format)
    last = datetime.datetime.strptime(endDate, day_format)
    one_day = datetime.timedelta(days=1)

    dates = []
    # Compare parsed dates rather than the raw strings: string ordering is
    # only correct for zero-padded input ('2018-3-30' sorts after '2018-04-01').
    while current <= last:
        dates.append(current.strftime(day_format))
        current += one_day
    return dates
def getSrcMysqlConfig(srcMysql_config):
    """Unpack a MySQL connection dict into a positional tuple.

    Returns (host, port, user, passwd, db), read from the dict keys of the
    same names. Raises KeyError if any of those keys is missing.
    """
    wanted_keys = ('host', 'port', 'user', 'passwd', 'db')
    return tuple(srcMysql_config[key] for key in wanted_keys)
def getMysqlTabScript(srcMysql_config, src_tabName, tabType, whereCondition):
    """Build the Hive DDL script and the sqoop free-form query for a MySQL table.

    The column list is read from MySQL's information_schema through the mysql
    command-line client. Columns whose data_type ends in 'int' become bigint;
    every other column becomes string.

    Parameters:
        srcMysql_config: connection dict accepted by getSrcMysqlConfig.
        src_tabName: source table name (without any shard postfix).
        tabType: 'single', or 'submeter-N' for a table split into shards
            {src_tabName}_0 .. {src_tabName}_{N-1}; the structure is taken
            from shard '_0'.
        whereCondition: SQL predicate applied to every shard; may be empty.

    Returns:
        (TabCreateScript, TabSelectScript):
        - TabCreateScript: 'use ods; drop table if exists ...; create table ...'
          for ods.{db}_{src_tabName}, partitioned by pt_day.
        - TabSelectScript: one select per shard joined with 'union all',
          ending in the shell-escaped \\$CONDITIONS token on the last select.

    Raises:
        ValueError: tabType is not 'single'/'submeter-N', N is not an
            integer, or N is smaller than 1.
        RuntimeError: no column metadata came back (query failed or table is
            missing). Raising here keeps callers from executing a script that
            would drop the Hive table and then fail to recreate it.
    """
    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)

    if tabType == 'single':
        src_postfix = ''
        submeter_postfixes = ['']
    elif 'submeter' in tabType:
        src_postfix = '_0'
        submeter_cnt = int(str(tabType).replace('submeter-', ''))
        if submeter_cnt < 1:
            raise ValueError("submeter count must be at least 1: {0!r}".format(tabType))
        submeter_postfixes = ['_' + str(idx) for idx in range(submeter_cnt)]
    else:
        raise ValueError("unsupported tabType: {0!r}".format(tabType))

    # NOTE: every value is interpolated unquoted into a shell command line,
    # so only trusted configuration and table names may be passed in.
    metadata_cmd = """source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
-N -e"set names utf8; \
select a2.column_name,case when a2.data_type like '%int' then 'bigint' else 'string' end data_type
from information_schema.TABLES a1
left join information_schema.columns a2 on a1.TABLE_SCHEMA=a2.TABLE_SCHEMA and a1.TABLE_NAME=a2.TABLE_NAME
where a1.TABLE_SCHEMA='{db}' and a1.table_name ='{src_tabName}{src_postfix}'
order by a2.ORDINAL_POSITION;" \
""".format(host=host, port=port, user=user, passwd=passwd, db=db,
           src_tabName=src_tabName, src_postfix=src_postfix)

    pipe = os.popen(metadata_cmd)
    try:
        raw_rows = pipe.readlines()
    finally:
        pipe.close()

    # Each output row is "<column_name>\t<hive_type>"; skip blank lines.
    columns = []
    for raw_row in raw_rows:
        fields = raw_row.replace('\n', '').split('\t')
        if len(fields) >= 2:
            columns.append((fields[0], fields[1]))
    if not columns:
        raise RuntimeError(
            "no column metadata returned for {db}.{tab}; refusing to build a drop/create script".format(
                db=db, tab=src_tabName + src_postfix))

    # --- Hive DDL ---
    header = 'use ods;\ndrop table if exists {db}_{src_tabName};\ncreate table {db}_{src_tabName}(\n'.format(
        src_tabName=src_tabName, db=db)
    # The backticks carry a literal backslash so they survive the
    # double-quoted `hive -e "..."` shell argument the script is run through.
    column_defs = ',\n'.join('\\`' + name + '\\` ' + hive_type for name, hive_type in columns)
    # '\t' and '\n' below are real tab/newline characters in the emitted script.
    footer = ") partitioned by (\\`pt_day\\` string) row format delimited fields terminated by '\t' lines terminated by '\n' location 'hdfs://emr-cluster/user/hive/warehouse/ods.db/{db}_{src_tabName}';".format(
        src_tabName=src_tabName, db=db)
    TabCreateScript = header + column_defs + footer

    # --- sqoop free-form query ---
    select_head = 'select ' + ','.join(name for name, _ in columns)
    if whereCondition:
        shard_filter = ' where ' + whereCondition
        conditions_tail = " and \\$CONDITIONS;"
    else:
        # Without a predicate, emitting "where  and $CONDITIONS" would be
        # invalid SQL; let $CONDITIONS be the only condition instead.
        shard_filter = ''
        conditions_tail = " where \\$CONDITIONS;"
    shard_selects = [
        select_head + ' from ' + src_tabName + postfix + shard_filter
        for postfix in submeter_postfixes
    ]
    # $CONDITIONS is backslash-escaped so the shell does not expand it inside
    # the double-quoted --query argument.
    TabSelectScript = '\nunion all\n'.join(shard_selects) + conditions_tail

    return TabCreateScript, TabSelectScript
def HiveCreateTab(srcMysql_config, src_tabName, tabType):
    """Generate the DDL script for src_tabName and run it through the Hive CLI.

    Only the first element (the create script) of getMysqlTabScript's result
    is used, so an empty where condition is passed. The exit status of the
    hive command is not inspected.
    """
    scripts = getMysqlTabScript(srcMysql_config, src_tabName, tabType, whereCondition="")
    ddl_script = scripts[0]
    hive_cmd = """/usr/lib/hive-current/bin/hive -e "{TabCreateScript}" """.format(TabCreateScript=ddl_script)
    os.system(hive_cmd)
def mysqlData2Hive(srcMysql_config, src_tabName, tabType, runDay, whereCondition):
    """Load one day of a MySQL table into its Hive partition via `sqoop --query`.

    Steps:
      1. Build the select script with getMysqlTabScript (second element).
      2. Drop and re-add partition pt_day=runDay on ods.{db}_{src_tabName}.
      3. Run a single sqoop import of that query into the partition
         directory, deleting the directory first (--delete-target-dir).

    Exit statuses of the hive and sqoop commands are not inspected.
    """
    # Parameter initialization
    host, port, user, passwd, db = (getSrcMysqlConfig(srcMysql_config)[pos] for pos in range(5))
    select_script = getMysqlTabScript(srcMysql_config, src_tabName, tabType, whereCondition)[1]

    # add partitions
    partition_cmd = """source /etc/profile; \
/usr/lib/hive-current/bin/hive -e "
use ods;
alter table {db}_{src_tabName} drop if exists partition (pt_day='{runDay}'); \
alter table {db}_{src_tabName} add partition (pt_day='{runDay}');" \
""".format(host=host, port=port, user=user, passwd=passwd, db=db, src_tabName=src_tabName, runDay=runDay)
    os.system(partition_cmd)

    # partition table data load
    # NOTE(review): the command sets the mapper count twice (--num-mappers 1
    # and -m 8); confirm which one sqoop honours and drop the other.
    # '\t' and '\n' are real tab/newline characters inside the single quotes.
    import_cmd = """source /etc/profile; \
sqoop import \
--connect jdbc:mysql://{host}:{port}/{db}?zeroDateTimeBehavior=convertToNull \
--username {user} \
--password {passwd} \
--query "{TabSelectScript}" \
--target-dir hdfs://emr-cluster/user/hive/warehouse/ods.db/{db}_{src_tabName}/pt_day={runDay} \
--delete-target-dir \
--fields-terminated-by '\t' \
--lines-terminated-by '\n' \
--num-mappers 1 \
--compress \
--compression-codec org.apache.hadoop.io.compress.SnappyCodec \
--direct \
--driver com.mysql.jdbc.Driver -m 8 \
""".format(host=host, port=port, user=user, passwd=passwd, db=db, src_tabName=src_tabName, runDay=runDay, TabSelectScript=select_script)
    os.system(import_cmd)
# Batch Test
# Recreate both Hive tables, then load one pt_day partition per table for
# every day in the range.
HiveCreateTab(
    srcMysql_config=srcMysqlConfig_jellyfish_server,
    src_tabName='live_history_status',
    tabType='submeter-256'
)
HiveCreateTab(
    srcMysql_config=srcMysqlConfig_jellyfish_server,
    src_tabName='big_fans_detail',
    tabType='single'
)
for runDay in dateRange(beginDate='2018-03-26', endDate='2018-03-28'):
    mysqlData2Hive(
        srcMysql_config=srcMysqlConfig_jellyfish_server,
        src_tabName='live_history_status',
        tabType='submeter-256',
        runDay=runDay,
        whereCondition="substr(updated_time, 1, 10) = '{runDay}'".format(runDay=runDay)
    )
    mysqlData2Hive(
        srcMysql_config=srcMysqlConfig_jellyfish_server,
        src_tabName='big_fans_detail',
        tabType='single',
        runDay=runDay,
        whereCondition="point=20000"
    )
说明:
1、与建表建分区结合,暂没有考虑传输特定几个字段的情况,可以在以后的版本中优化。
2、总体目前在同一个脚本中,以后可以考虑根据功能分开。
3、总体包含元数据脚本获取模块、hive建表模块、数据传输模块。