Python自动化拉取Mysql数据并装载到Hive

程序调用有并行和串行两种方式;是否使用并行调度脚本,取决于服务器的磁盘空间大小及源Mysql库的性能。
代码调用的时候(串并行),主要修改两方面: 1、如果不存在数据源则在【ParProc.py】重新配置一个;2、在串并行调度的时候,传数据源配置名、表名、表类型参数给调度程序。
本次更新主要是让数据的传参等更为合理一些。实际上,数据传输还存在一个Bug:如果文本字段中含有回车换行,数据装载时就会出问题。解决方案可以参考【Python自动化拉取Mysql数据并装载到Oracle--> http://blog.csdn.net/babyfish13/article/details/69568515】一文中Mysql向Oracle传输数据的方式;这里暂不修改。
1、数据源配置及参数的获取
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/ParProc.py
# -*- coding=utf-8 -*-
import warnings
import datetime

warnings.filterwarnings("ignore")

# src Database config
def _buildSrcMysqlConfig(dbName):
    """Return the connection settings dict for one source MySQL schema.

    Every source schema configured here uses the same host, port and
    credentials; only the schema name stored under ``'db'`` differs.
    """
    return {
        'host': 'interNetIp',
        # 'host': 'outNetIp',
        'user': 'MysqlUser',
        'passwd': 'MysqlPass',
        'port': MysqlPort,
        'db': dbName,
    }


# One config dict per source schema; pass any of them as `srcMysql_config`.
srcMysqlConfig_jellyfish_server = _buildSrcMysqlConfig('jellyfish_server')

srcMysqlConfig_jellyfish_user = _buildSrcMysqlConfig('jellyfish_user')

srcMysqlConfig_jellyfish_seed = _buildSrcMysqlConfig('jellyfish_seed')

srcMysqlConfig_jellyfish_event = _buildSrcMysqlConfig('jellyfish_event')

srcMysqlConfig_jellyfish_hadoop_stat = _buildSrcMysqlConfig('jellyfish_hadoop_stat')

def getNowDay():
    """Return today's local date as a ``YYYY-MM-DD`` string."""
    return datetime.date.today().isoformat()

def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate to endDate, inclusive.

    Args:
        beginDate: first day of the range, a ``%Y-%m-%d`` string.
        endDate: last day of the range, a ``%Y-%m-%d`` string.

    Returns:
        A list of zero-padded ``YYYY-MM-DD`` strings in ascending order.
        The list is empty when beginDate is later than endDate.

    Raises:
        ValueError: if either argument cannot be parsed as ``%Y-%m-%d``.
    """
    dayFormat = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, dayFormat)
    last = datetime.datetime.strptime(endDate, dayFormat)
    oneDay = datetime.timedelta(days=1)

    dates = []
    # Compare parsed dates rather than the raw strings: a lexicographic
    # comparison only works for zero-padded input, and an end date such as
    # '2017-1-7' would otherwise let the loop run far past the intended day.
    while current <= last:
        dates.append(current.strftime(dayFormat))
        current += oneDay
    return dates

def getSrcMysqlConfig(srcMysql_config):
    """Unpack a source config dict into a ``(host, port, user, passwd, db)`` tuple.

    Raises KeyError if any of the five keys is missing from the dict.
    """
    fieldOrder = ('host', 'port', 'user', 'passwd', 'db')
    return tuple(srcMysql_config[key] for key in fieldOrder)

# print getSrcMysqlConfig(srcMysql_config=srcMysqlConfig_jellyfish_server)

2、获取源表结构及Hive建表
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/HiveCreateTab.py
# -*- coding=utf-8 -*-
import os
import re
from ParProc import *

warnings.filterwarnings("ignore")


def mysqlTabCreateScript(srcMysql_config, src_tabName, tabType):
    """Build the Hive DDL for the table that mirrors a MySQL source table.

    The column list is read from ``information_schema`` through the ``mysql``
    command-line client. A column whose MySQL data type ends in ``int`` is
    mapped to ``bigint``; every other column is mapped to ``string``.

    Args:
        srcMysql_config: source config dict, unpacked by getSrcMysqlConfig.
        src_tabName: MySQL table name (for a sharded table, the name without
            the ``_<n>`` shard suffix).
        tabType: ``'single'`` for a plain table, or ``'submeter'`` for a
            sharded table whose structure is read from ``<src_tabName>_0``.

    Returns:
        A string containing a ``drop table if exists`` statement followed by
        a ``create table`` statement for ``xxx_<src_tabName>``, declared as
        tab-delimited text. Backticks around column names are
        backslash-escaped, ready to be embedded in a double-quoted shell
        argument.

    Raises:
        ValueError: if tabType is neither ``'single'`` nor ``'submeter'``.
        RuntimeError: if no column metadata was returned for the table.
    """
    # Decide which physical table supplies the column metadata.
    if tabType == 'single':
        structTab = src_tabName
    elif tabType == 'submeter':
        structTab = src_tabName + '_0'
    else:
        raise ValueError(
            "unsupported tabType {0!r}: expected 'single' or 'submeter'".format(tabType))

    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)

    # NOTE(review): connection settings and table names are interpolated into
    # a shell command line unescaped; this assumes trusted configuration.
    structureCmd = """source /etc/profile; \
                /usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
                -N -e"set names utf8; \
                select a2.column_name,case when a2.data_type like '%int' then 'bigint' else 'string' end data_type
                from information_schema.TABLES a1
                left join information_schema.columns a2 on a1.TABLE_SCHEMA=a2.TABLE_SCHEMA and a1.TABLE_NAME=a2.TABLE_NAME
                where a1.TABLE_SCHEMA='{db}' and a1.table_name ='{structTab}'
                order by a2.ORDINAL_POSITION;" \
                """.format(host=host, port=port, user=user, passwd=passwd, db=db, structTab=structTab)

    pipe = os.popen(structureCmd)
    try:
        srcTabStructure = pipe.readlines()
    finally:
        pipe.close()

    # Each output line is "<column_name>\t<hive_type>".
    columnDefs = []
    for line in srcTabStructure:
        if not line.strip():
            continue
        fields = line.rstrip('\n').split('\t')
        columnDefs.append('\\`' + fields[0] + '\\`' + ' ' + fields[1])

    # Without columns the script would still carry a valid "drop table"
    # followed by a malformed "create table"; fail before anything is dropped.
    if not columnDefs:
        raise RuntimeError(
            "no column metadata returned for {0}.{1}".format(db, structTab))

    TabCreateScript = (
        'drop table if exists xxx_{src_tabName};\ncreate table xxx_{src_tabName}(\n'.format(src_tabName=src_tabName)
        + ',\n'.join(columnDefs)
        + ")row format delimited fields terminated by '\t' lines terminated by '\n';;"
    )
    return TabCreateScript

def HiveCreateTab(srcMysql_config, src_tabName, tabType):
    """Drop and recreate the Hive table that mirrors a MySQL source table.

    The DDL comes from mysqlTabCreateScript and is executed with the Hive
    command-line client (``hive -e``). The exit status of the Hive command
    is not inspected.
    """
    ddl = mysqlTabCreateScript(srcMysql_config, src_tabName, tabType)
    hiveCmd = """/usr/lib/hive-current/bin/hive -e "{TabCreateScript}" """.format(TabCreateScript=ddl)
    os.system(hiveCmd)

# Batch Test

3、Mysql数据download及Hive数据装载
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/MysqlData2Hive.py
# -*- coding=utf-8 -*-
import os
from ParProc import *

warnings.filterwarnings("ignore")


def mysqlDataDownload(srcMysql_config, src_tabName, tabType, submeterCount=256):
    """Dump a MySQL table into a local text file using the ``mysql`` client.

    The output file is
    ``/home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_<src_tabName>.txt``; the
    directory is created when missing.

    Args:
        srcMysql_config: source config dict, unpacked by getSrcMysqlConfig.
        src_tabName: MySQL table name (for a sharded table, the name without
            the ``_<n>`` shard suffix).
        tabType: ``'single'`` dumps ``<src_tabName>`` and overwrites the file;
            ``'submeter'`` removes any previous file and appends the shards
            ``<src_tabName>_0`` .. ``<src_tabName>_<submeterCount - 1>``.
        submeterCount: number of shards to dump for a sharded table
            (defaults to 256).

    Raises:
        ValueError: if tabType is neither ``'single'`` nor ``'submeter'``.
    """
    if tabType not in ('single', 'submeter'):
        raise ValueError(
            "unsupported tabType {0!r}: expected 'single' or 'submeter'".format(tabType))

    host, port, user, passwd, db = getSrcMysqlConfig(srcMysql_config)

    dataDir = '/home/hadoop/nisj/Mysql2Hive/tmp_data/'
    dataFile = dataDir + 'xxx_{src_tabName}.txt'.format(src_tabName=src_tabName)

    if not os.path.exists(dataDir):
        try:
            os.makedirs(dataDir)
        except OSError:
            # Another thread may have created the directory in the meantime;
            # only re-raise when it is still missing.
            if not os.path.isdir(dataDir):
                raise

    # NOTE(review): connection settings and table names are interpolated into
    # a shell command line unescaped; this assumes trusted configuration.
    dumpTemplate = """source /etc/profile; \
            /usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -D{db} \
            -N -e"set names utf8; \
            select * from {table};" \
            {redirect}{dataFile} \
            """

    if tabType == 'single':
        # A single table overwrites the output file in one go.
        dumps = [(src_tabName, '>')]
    else:
        # Shards are appended one after another, so start from a clean file.
        if os.path.exists(dataFile):
            os.remove(dataFile)
        dumps = [
            ('{db}.{src_tabName}_{submeterPlus}'.format(
                db=db, src_tabName=src_tabName, submeterPlus=submeterPlus), '>>')
            for submeterPlus in range(submeterCount)
        ]

    for table, redirect in dumps:
        os.system(dumpTemplate.format(
            host=host, port=port, user=user, passwd=passwd, db=db,
            table=table, redirect=redirect, dataFile=dataFile))

def DataUploadHive(src_tabName):
    """Load the dumped text file into Hive table ``xxx_<src_tabName>``.

    Runs ``load data local inpath ... overwrite into table`` through the Hive
    command-line client, then deletes the local file. The file is deleted
    regardless of the load command's exit status, which is not inspected.
    """
    loadCmd = """/usr/lib/hive-current/bin/hive -e "load data local inpath '/home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_{src_tabName}.txt' overwrite into table xxx_{src_tabName};" """
    cleanupCmd = "rm -rf /home/hadoop/nisj/Mysql2Hive/tmp_data/xxx_{src_tabName}.txt "
    for shellCmd in (loadCmd, cleanupCmd):
        os.system(shellCmd.format(src_tabName=src_tabName))

def MysqlData2hive(srcMysql_config, src_tabName, tabType):
    """Dump one MySQL table to a local file, then load that file into Hive."""
    mysqlDataDownload(srcMysql_config=srcMysql_config,
                      src_tabName=src_tabName,
                      tabType=tabType)
    DataUploadHive(src_tabName=src_tabName)

# Batch Test

4、Mysql2Hive的串行调度
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/Mysql2HiveCtl.py
# -*- coding=utf-8 -*-
from HiveCreateTab import *
from MysqlData2Hive import *

warnings.filterwarnings("ignore")


def Mysql2Hive(srcMysql_config, src_tabName, tabType):
    """Copy one MySQL table into Hive: recreate the Hive table, then load the data.

    Args:
        srcMysql_config: source config dict (one of the ``srcMysqlConfig_*``).
        src_tabName: MySQL table name.
        tabType: ``'single'`` or ``'submeter'``.
    """
    tableSpec = (srcMysql_config, src_tabName, tabType)
    HiveCreateTab(*tableSpec)
    MysqlData2hive(*tableSpec)

# Batch Test
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='game_zone', tabType='single')
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='live_history_status', tabType='submeter')
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_user, src_tabName='user_profile', tabType='submeter')
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_seed, src_tabName='room', tabType='single')
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_user, src_tabName='user_id_card', tabType='single')
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_seed, src_tabName='game', tabType='single')
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='match_apply', tabType='single')
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_server, src_tabName='user_daily_sign_record', tabType='single')
# Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_event, src_tabName='event_online_count', tabType='single')
# Run the serial loads only when this file is executed as a script. The
# parallel driver does `from Mysql2HiveCtl import *` to reuse Mysql2Hive, and
# a plain import must not start a serial load as a side effect.
if __name__ == '__main__':
    Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_event, src_tabName='event_award_201611', tabType='single')
    Mysql2Hive(srcMysql_config=srcMysqlConfig_jellyfish_hadoop_stat, src_tabName='room_group', tabType='single')

5、Mysql2Hive的并行调度
/Users/nisj/PycharmProjects/BiDataProc/Mysql2Hive/BatchThread.py
# -*- coding=utf-8 -*-
import threadpool
import time
from Mysql2HiveCtl import *

warnings.filterwarnings("ignore")

# Date helpers; not referenced by the scheduling code below.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)

# Log the start time. A single-argument print(...) produces the same output
# under the Python 2 print statement and the Python 3 print function.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("%s %s" % ("当前时间是:", now_time))


# One entry per table to load: (positional args for Mysql2Hive, keyword args).
batch_SrcTab_list = [
    ([srcMysqlConfig_jellyfish_server, 'game_zone', 'single'], None),
    ([srcMysqlConfig_jellyfish_server, 'live_history_status', 'submeter'], None),
    ([srcMysqlConfig_jellyfish_user, 'user_profile', 'submeter'], None),
]
requests = []
request_Mysql2Hive_batchCtl = threadpool.makeRequests(Mysql2Hive, batch_SrcTab_list)
requests.extend(request_Mysql2Hive_batchCtl)
main_pool = threadpool.ThreadPool(8)
# Queue every request on the pool (plain loop: only the side effect matters).
for req in requests:
    main_pool.putRequest(req)

if __name__ == '__main__':
    # Poll every 30 seconds until the pool reports that no results are
    # pending, or the user interrupts.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

# Log the finish time.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("%s %s" % ("当前时间是:", now_time))

你可能感兴趣的:(BigData,Mysql,#,Hive,Solution,projectCase)