Python多线程跑Hive批一例

本示例的功能主要是按天计算新设备在接下来60天内(代码中 getDayRunList 的回溯窗口为60天)每天的留存、注册转化、转化用户充值及注册用户留存等指标。程序分为三大部分:数据日期的处理、留存等指标的计算,以及批量任务的多线程调度。
1、数据日期处理的代码
/Users/nisj/PycharmProjects/EsDataProc/RemainProcByDay/DayProc.py
# -*- coding=utf-8 -*-
import warnings
import datetime

# Install a catch-all "ignore" filter so that no warnings are printed
# by this module or by anything that imports it.
warnings.filterwarnings("ignore")

def getNowDay():
    """Return today's local date as a 'YYYY-MM-DD' string."""
    return format(datetime.datetime.today(), '%Y-%m-%d')

def getFristDay():
    """Return the fixed first day of the data set as a 'YYYY-MM-DD' string.

    The literal is parsed and re-formatted, so the result is always
    '2015-06-29'.
    """
    first_day = datetime.datetime.strptime('2015-06-29', '%Y-%m-%d')
    return first_day.strftime('%Y-%m-%d')

def getDayAllList(run_day):
    """List every day from the first data day through ``run_day``, inclusive.

    Args:
        run_day: End day as 'YYYY-MM-DD'; the empty string means today.

    Returns:
        A list of 'YYYY-MM-DD' strings in ascending order. The list is empty
        when ``run_day`` is earlier than the day returned by getFristDay().
    """
    first = datetime.datetime.strptime(getFristDay(), "%Y-%m-%d")

    if run_day != '':
        last = datetime.datetime.strptime(run_day, '%Y-%m-%d')
    else:
        last = datetime.datetime.strptime(getNowDay(), "%Y-%m-%d")

    # Both endpoints are midnight timestamps, so the difference is a whole
    # number of days; a negative span yields an empty range.
    span_days = (last - first).days
    return [(first + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(span_days + 1)]

def getDayRunList(run_day):
    """List the days of the 60-day window that ends on ``run_day``.

    The window starts 60 days before ``run_day`` but never earlier than the
    day returned by getFristDay().

    Args:
        run_day: End day as 'YYYY-MM-DD'; the empty string means today.

    Returns:
        A list of 'YYYY-MM-DD' strings in ascending order, ending with
        ``run_day``. The list is empty when ``run_day`` precedes the first day.
    """
    if run_day != '':
        last = datetime.datetime.strptime(run_day, '%Y-%m-%d')
    else:
        last = datetime.datetime.strptime(getNowDay(), "%Y-%m-%d")

    earliest = datetime.datetime.strptime(getFristDay(), "%Y-%m-%d")
    # Clamp the start of the window to the first day of available data.
    first = max(last - datetime.timedelta(days=60), earliest)

    span_days = (last - first).days
    return [(first + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(span_days + 1)]

def getDayRemainRunIntervalList(str_begin_date, str_end_date):
    """List every day between two dates, both ends included.

    Args:
        str_begin_date: First day as 'YYYY-MM-DD'.
        str_end_date: Last day as 'YYYY-MM-DD'.

    Returns:
        A list of 'YYYY-MM-DD' strings in ascending order; empty when the
        begin date is later than the end date.
    """
    first = datetime.datetime.strptime(str_begin_date, "%Y-%m-%d")
    last = datetime.datetime.strptime(str_end_date, "%Y-%m-%d")

    one_day = datetime.timedelta(days=1)
    span_days = (last - first).days
    return [(first + one_day * offset).strftime("%Y-%m-%d")
            for offset in range(span_days + 1)]

# Batch Test
# run_day = '2016-07-10'
# for remain_day in getDayAllList(run_day):
#     for his_day in getDayRunList(run_day=remain_day):
#         print remain_day, his_day

# one day
# for his_day in getDayRunList(run_day=run_day):
#     print run_day, his_day

# run_day = '2017-01-01'
# for remain_day in getDayRunList(run_day):
#     print remain_day,run_day
    # for his_day in getDayRunList(run_day=remain_day):
    #     print remain_day, his_day

# 计算2017年的数据信息
# str_begin_date = '2017-01-01'
# str_end_date = '2017-02-05'
# for remain_day in getDayRemainRunIntervalList(str_begin_date, str_end_date):
#     for his_day in getDayRunList(run_day=remain_day):
#         print remain_day,his_day

2、留存等指标的计算
/Users/nisj/PycharmProjects/EsDataProc/RemainProcByDay/Hive_remain_byDay_proc.py
# -*- coding=utf-8 -*-
from DayProc import *
import os,re,time

# Install a catch-all "ignore" filter so that no warnings are printed.
# `warnings` is not imported directly here; it arrives through the wildcard
# import from DayProc.
warnings.filterwarnings("ignore")


def identifier_remain_byday(remain_day, his_day):
    """Recompute new-identifier retention for one (his_day, remain_day) pair.

    Steps:
      1. Delete the existing rows for the pair from the MySQL table
         ``bi_identifier_remain_byday``.
      2. Run a Hive query that joins the identifiers of
         ``bi_all_new_identifier_log`` (partition ``pt_day = his_day``) with
         the distinct identifiers of ``bi_all_access_log`` (partition
         ``pt_day = remain_day``) and counts them per (appsource, appkey).
      3. Insert one MySQL row per result line, stamped with the current time.

    Args:
        remain_day: Day string used as the access-log partition.
        his_day: Day string used as the new-identifier partition.
    """
    import subprocess  # local import keeps this block self-contained

    mysql_argv = ['/usr/bin/mysql', '-hMysqlHost', '-PMysqlPort',
                  '-uMysqlUser', '-pMysqlPass', '-e']

    def sql_quote(value):
        # Escape backslashes and single quotes so data cannot terminate the
        # SQL string literal.
        # NOTE(review): assumes MySQL's default backslash-escape mode.
        text = '%s' % (value,)
        return "'%s'" % text.replace('\\', '\\\\').replace("'", "\\'")

    def run_mysql(sql):
        # Pass an argument list (no shell) so quotes, `$` or backticks that
        # come back from Hive are not interpreted by /bin/sh.
        return subprocess.call(mysql_argv + [sql])

    run_mysql("use funnyai_data; "
              "delete from bi_identifier_remain_byday "
              "where his_day=%s and remain_day=%s;"
              % (sql_quote(his_day), sql_quote(remain_day)))

    hive_cmd = """source /etc/profile; \
            /usr/lib/hive-current/bin/hive -e " \
            with tab_new_identifier_byDay as ( \
            select appsource,appkey,identifier from bi_all_new_identifier_log \
            where pt_day = '%s' \
            ), \
            tab_access_log_byDay as ( \
            select identifier from bi_all_access_log \
            where pt_day = '%s' \
            group by identifier) \
            select '%s' his_day,'%s' remain_day, \
            a1.appsource,a1.appkey,count(a1.identifier) remain_identifier_cnt \
            from tab_new_identifier_byDay a1 \
            inner join tab_access_log_byDay a2 on a1.identifier=a2.identifier \
            group by a1.appsource,a1.appkey;" \
            """ % (his_day, remain_day, his_day, remain_day)

    pipe = os.popen(hive_cmd)
    try:
        result_lines = pipe.readlines()
    finally:
        pipe.close()

    for line in result_lines:
        fields = line.replace('\n', '').split('\t')
        if len(fields) < 5:
            # Blank or non-result line: skip it instead of raising IndexError.
            continue
        # Columns: his_day, remain_day, appsource, appkey, remain_identifier_cnt
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
        row_values = fields[:5] + [etl_time]

        run_mysql("use funnyai_data; "
                  "insert into bi_identifier_remain_byday(his_day, remain_day, appsource, appkey, remain_identifier_cnt, etl_time) "
                  "select %s;" % ','.join(sql_quote(v) for v in row_values))


def user_registert_transf_byday(remain_day, his_day):
    """Recompute registration conversion for one (his_day, remain_day) pair.

    Steps:
      1. Delete the existing rows for the pair from the MySQL table
         ``bi_user_registert_transf_byday``.
      2. Run a Hive query that counts the rows of ``bi_all_register_info``
         with ``iden_day = his_day`` and ``pt_day = remain_day`` per
         (appsource, appkey).
      3. Insert one MySQL row per result line, stamped with the current time.

    Args:
        remain_day: Day string matched against ``pt_day``.
        his_day: Day string matched against ``iden_day``.
    """
    import subprocess  # local import keeps this block self-contained

    mysql_argv = ['/usr/bin/mysql', '-hMysqlHost', '-PMysqlPort',
                  '-uMysqlUser', '-pMysqlPass', '-e']

    def sql_quote(value):
        # Escape backslashes and single quotes so data cannot terminate the
        # SQL string literal.
        # NOTE(review): assumes MySQL's default backslash-escape mode.
        text = '%s' % (value,)
        return "'%s'" % text.replace('\\', '\\\\').replace("'", "\\'")

    def run_mysql(sql):
        # Pass an argument list (no shell) so quotes, `$` or backticks that
        # come back from Hive are not interpreted by /bin/sh.
        return subprocess.call(mysql_argv + [sql])

    run_mysql("use funnyai_data; "
              "delete from bi_user_registert_transf_byday "
              "where his_day=%s and remain_day=%s;"
              % (sql_quote(his_day), sql_quote(remain_day)))

    hive_cmd = """source /etc/profile; \
            /usr/lib/hive-current/bin/hive -e " \
            select '%s' his_day,'%s' remain_day, \
            appsource,appkey,count(*) new_ide_reg_cnt \
            from bi_all_register_info \
            where (iden_day = '%s') and (pt_day = '%s') \
            group by appsource,appkey;" \
            """ % (his_day, remain_day, his_day, remain_day)

    pipe = os.popen(hive_cmd)
    try:
        result_lines = pipe.readlines()
    finally:
        pipe.close()

    for line in result_lines:
        fields = line.replace('\n', '').split('\t')
        if len(fields) < 5:
            # Blank or non-result line: skip it instead of raising IndexError.
            continue
        # Columns: his_day, remain_day, appsource, appkey, new_ide_reg_cnt
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
        row_values = fields[:5] + [etl_time]

        run_mysql("use funnyai_data; "
                  "insert into bi_user_registert_transf_byday(his_day, remain_day, appsource, appkey, new_ide_reg_cnt, etl_time) "
                  "select %s;" % ','.join(sql_quote(v) for v in row_values))


def user_remain_pay_byday(remain_day, his_day):
    """Recompute payments of registered users for one (his_day, remain_day) pair.

    Steps:
      1. Delete the existing rows for the pair from the MySQL table
         ``bi_user_remain_pay_byday``.
      2. Run a Hive query that joins the users of ``bi_all_register_info``
         (``iden_day = his_day``) with per-uid payment totals from
         ``data_chushou_pay_info`` (``state=0`` and ``pt_day = remain_day``),
         and aggregates amount, payment count and uid count per
         (appsource, appkey).
      3. Insert one MySQL row per result line, stamped with the current time.

    Args:
        remain_day: Day string used as the payment partition.
        his_day: Day string matched against ``iden_day``.
    """
    import subprocess  # local import keeps this block self-contained

    mysql_argv = ['/usr/bin/mysql', '-hMysqlHost', '-PMysqlPort',
                  '-uMysqlUser', '-pMysqlPass', '-e']

    def sql_quote(value):
        # Escape backslashes and single quotes so data cannot terminate the
        # SQL string literal.
        # NOTE(review): assumes MySQL's default backslash-escape mode.
        text = '%s' % (value,)
        return "'%s'" % text.replace('\\', '\\\\').replace("'", "\\'")

    def run_mysql(sql):
        # Pass an argument list (no shell) so quotes, `$` or backticks that
        # come back from Hive are not interpreted by /bin/sh.
        return subprocess.call(mysql_argv + [sql])

    run_mysql("use funnyai_data; "
              "delete from bi_user_remain_pay_byday "
              "where his_day=%s and remain_day=%s;"
              % (sql_quote(his_day), sql_quote(remain_day)))

    hive_cmd = """source /etc/profile; \
            /usr/lib/hive-current/bin/hive -e " \
            with tab_user_register_info as( \
            select uid,appsource,appkey from bi_all_register_info \
            where (iden_day = '%s') \
            ), \
            tab_user_pay_info as ( \
            select uid,sum(amount) pay_amount,count(*) pay_cnt from data_chushou_pay_info \
            where state=0 \
            and (pt_day = '%s') \
            group by uid \
            ) \
            select '%s' his_day,'%s' remain_day, \
            a1.appsource,a1.appkey,sum(pay_amount) pay_amount,sum(pay_cnt) pay_cnt,count(a1.uid) pay_uid_cnt \
            from tab_user_register_info a1 \
            inner join tab_user_pay_info a2 on a1.uid=a2.uid \
            group by a1.appsource,a1.appkey;" \
            """ % (his_day, remain_day, his_day, remain_day)

    pipe = os.popen(hive_cmd)
    try:
        result_lines = pipe.readlines()
    finally:
        pipe.close()

    for line in result_lines:
        fields = line.replace('\n', '').split('\t')
        if len(fields) < 7:
            # Blank or non-result line: skip it instead of raising IndexError.
            continue
        # Columns: his_day, remain_day, appsource, appkey,
        #          pay_amount, pay_cnt, pay_uid_cnt
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
        row_values = fields[:7] + [etl_time]

        run_mysql("use funnyai_data; "
                  "insert into bi_user_remain_pay_byday(his_day, remain_day, appsource, appkey, pay_amount, pay_cnt, pay_uid_cnt, etl_time) "
                  "select %s;" % ','.join(sql_quote(v) for v in row_values))


def registert_user_remain_byday(remain_day, his_day):
    """Recompute registered-user retention for one (his_day, remain_day) pair.

    Steps:
      1. Delete the existing rows for the pair from the MySQL table
         ``bi_registert_user_remain_byday``.
      2. Run a Hive query that registers the ``RadixChange`` UDF, joins the
         users of ``bi_all_register_info`` (``iden_day = his_day``) with the
         distinct ``RadixChange(lower(uid),16,10)`` values of
         ``bi_all_access_log`` (``pt_day = remain_day``), and counts matches
         per (appsource, appkey).
         NOTE(review): RadixChange presumably converts a base-16 uid to
         base 10 -- confirm against the UDF jar.
      3. Insert one MySQL row per result line, stamped with the current time.

    Args:
        remain_day: Day string used as the access-log partition.
        his_day: Day string matched against ``iden_day``.
    """
    import subprocess  # local import keeps this block self-contained

    mysql_argv = ['/usr/bin/mysql', '-hMysqlHost', '-PMysqlPort',
                  '-uMysqlUser', '-pMysqlPass', '-e']

    def sql_quote(value):
        # Escape backslashes and single quotes so data cannot terminate the
        # SQL string literal.
        # NOTE(review): assumes MySQL's default backslash-escape mode.
        text = '%s' % (value,)
        return "'%s'" % text.replace('\\', '\\\\').replace("'", "\\'")

    def run_mysql(sql):
        # Pass an argument list (no shell) so quotes, `$` or backticks that
        # come back from Hive are not interpreted by /bin/sh.
        return subprocess.call(mysql_argv + [sql])

    run_mysql("use funnyai_data; "
              "delete from bi_registert_user_remain_byday "
              "where his_day=%s and remain_day=%s;"
              % (sql_quote(his_day), sql_quote(remain_day)))

    hive_cmd = """source /etc/profile; \
            /usr/lib/hive-current/bin/hive -e " \
            add jar /home/hadoop/nisj/udf-jar/hadoop_udf_radixChange.jar; \
            create temporary function RadixChange as 'com.kascend.hadoop.RadixChange'; \
            with tab_user_register_info as( \
            select uid,appsource,appkey from bi_all_register_info \
            where (iden_day = '%s') \
            ), \
            tab_access_log_byDay as ( \
            select RadixChange(lower(uid),16,10) uid from bi_all_access_log \
            where pt_day = '%s' \
            group by RadixChange(lower(uid),16,10)) \
            select '%s' his_day,'%s' remain_day, \
            a1.appsource,a1.appkey,count(a2.uid) uid_remain_cnt \
            from tab_user_register_info a1 \
            inner join tab_access_log_byDay a2 on a1.uid=a2.uid \
            group by a1.appsource,a1.appkey;" \
            """ % (his_day, remain_day, his_day, remain_day)

    pipe = os.popen(hive_cmd)
    try:
        result_lines = pipe.readlines()
    finally:
        pipe.close()

    for line in result_lines:
        fields = line.replace('\n', '').split('\t')
        if len(fields) < 5:
            # Blank or non-result line: skip it instead of raising IndexError.
            continue
        # Columns: his_day, remain_day, appsource, appkey, uid_remain_cnt
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
        row_values = fields[:5] + [etl_time]

        run_mysql("use funnyai_data; "
                  "insert into bi_registert_user_remain_byday(his_day, remain_day, appsource, appkey, uid_remain_cnt, etl_time) "
                  "select %s;" % ','.join(sql_quote(v) for v in row_values))


# Batch Test
# run_day = '2016-07-10'
# for remain_day in getDayAllList(run_day):
#     for his_day in getDayRunList(run_day=remain_day):
#         print remain_day, his_day

# for his_day in getDayRunList(run_day=run_day):
#     # print run_day, his_day
#     identifier_remain_byday(remain_day=run_day, his_day=his_day)

附:指标Sql打印测试
/Users/nisj/PycharmProjects/EsDataProc/RemainProcByDay/Hive_remain_byDay_proc_printSql.py
# -*- coding=utf-8 -*-
from DayProc import *

# Install a catch-all "ignore" filter so that no warnings are printed.
# `warnings` is not imported directly here; it arrives through the wildcard
# import from DayProc.
warnings.filterwarnings("ignore")


def identifier_remain_byday(remain_day, his_day):
    """Print the Hive shell command for the new-identifier retention query.

    Nothing is executed; the command is only formatted and written to
    stdout so the generated SQL can be inspected.

    Args:
        remain_day: Day string substituted as the access-log ``pt_day``.
        his_day: Day string substituted as the new-identifier ``pt_day``.
    """
    sql_text = """source /etc/profile; \
            /usr/lib/hive-current/bin/hive -e " \
            with tab_new_identifier_byDay as ( \
            select appsource,appkey,identifier from bi_all_new_identifier_log \
            where pt_day = '%s' \
            ), \
            tab_access_log_byDay as ( \
            select identifier from bi_all_access_log \
            where pt_day = '%s' \
            group by identifier) \
            select '%s' his_day,'%s' remain_day, \
            a1.appsource,a1.appkey,count(a1.identifier) remain_identifier_cnt \
            from tab_new_identifier_byDay a1 \
            inner join tab_access_log_byDay a2 on a1.identifier=a2.identifier \
            group by a1.appsource,a1.appkey;" \
            """ % (his_day, remain_day, his_day, remain_day)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(sql_text)


def user_registert_transf_byday(remain_day, his_day):
    """Print the Hive shell command for the registration-conversion query.

    Nothing is executed; the command is only formatted and written to
    stdout so the generated SQL can be inspected.

    Args:
        remain_day: Day string substituted for ``pt_day``.
        his_day: Day string substituted for ``iden_day``.
    """
    sql_text = """source /etc/profile; \
            /usr/lib/hive-current/bin/hive -e " \
            select '%s' his_day,'%s' remain_day, \
            appsource,appkey,count(*) new_ide_reg_cnt \
            from bi_all_register_info \
            where (iden_day = '%s') and (pt_day = '%s') \
            group by appsource,appkey;" \
            """ % (his_day, remain_day, his_day, remain_day)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(sql_text)

def user_remain_pay_byday(remain_day, his_day):
    """Print the Hive shell command for the registered-user payment query.

    Nothing is executed; the command is only formatted and written to
    stdout so the generated SQL can be inspected.

    Args:
        remain_day: Day string substituted as the payment ``pt_day``.
        his_day: Day string substituted for ``iden_day``.
    """
    sql_text = """source /etc/profile; \
            /usr/lib/hive-current/bin/hive -e " \
            with tab_user_register_info as( \
            select uid,appsource,appkey from bi_all_register_info \
            where (iden_day = '%s') \
            ), \
            tab_user_pay_info as ( \
            select uid,sum(amount) pay_amount,count(*) pay_cnt from data_chushou_pay_info \
            where state=0 \
            and (pt_day = '%s') \
            group by uid \
            ) \
            select '%s' his_day,'%s' remain_day, \
            a1.appsource,a1.appkey,sum(pay_amount) pay_amount,sum(pay_cnt) pay_cnt,count(a1.uid) pay_uid_cnt \
            from tab_user_register_info a1 \
            inner join tab_user_pay_info a2 on a1.uid=a2.uid \
            group by a1.appsource,a1.appkey;" \
            """ % (his_day, remain_day, his_day, remain_day)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(sql_text)


def registert_user_remain_byday(remain_day, his_day):
    """Print the Hive shell command for the registered-user retention query.

    Nothing is executed; the command is only formatted and written to
    stdout so the generated SQL can be inspected.

    Args:
        remain_day: Day string substituted as the access-log ``pt_day``.
        his_day: Day string substituted for ``iden_day``.
    """
    sql_text = """source /etc/profile; \
            /usr/lib/hive-current/bin/hive -e " \
            add jar /home/hadoop/nisj/udf-jar/hadoop_udf_radixChange.jar; \
            create temporary function RadixChange as 'com.kascend.hadoop.RadixChange'; \
            with tab_user_register_info as( \
            select uid,appsource,appkey from bi_all_register_info \
            where (iden_day = '%s') \
            ), \
            tab_access_log_byDay as ( \
            select RadixChange(lower(uid),16,10) uid from bi_all_access_log \
            where pt_day = '%s' \
            group by RadixChange(lower(uid),16,10)) \
            select '%s' his_day,'%s' remain_day, \
            a1.appsource,a1.appkey,count(a2.uid) uid_remain_cnt \
            from tab_user_register_info a1 \
            inner join tab_access_log_byDay a2 on a1.uid=a2.uid \
            group by a1.appsource,a1.appkey;" \
            """ % (his_day, remain_day, his_day, remain_day)
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print(sql_text)




# Batch Test
# run_day = '2016-07-10'
# for remain_day in getDayAllList(run_day):
#     for his_day in getDayRunList(run_day=remain_day):
#         print remain_day, his_day

# for his_day in getDayRunList(run_day=run_day):
#     # print run_day, his_day
#     identifier_remain_byday(remain_day=run_day, his_day=his_day)

3、多线程调度
/Users/nisj/PycharmProjects/EsDataProc/RemainProcByDay/BatchThread.py
# -*- coding=utf-8 -*-
import threadpool
from Hive_remain_byDay_proc import *

# Install a catch-all "ignore" filter so that no warnings are printed.
warnings.filterwarnings("ignore")

# Reference dates; they are computed here but not used below.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)

# Print the wall-clock start time. Concatenating into one string keeps the
# output identical under Python 2 and Python 3.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是: " + now_time)


# Alternative drivers, kept for reference:
#
# Full history up to run_day:
# run_day = '2017-02-03'
# for remain_day in getDayAllList(run_day):
#     for his_day in getDayRunList(run_day=remain_day):
#         print remain_day, his_day

# One day only:
# for his_day in getDayRunList(run_day=run_day):
#     print run_day, his_day

# batch_day_list = []
# for his_day in getDayRunList(run_day=run_day):
#     batch_day_list.append(([run_day, his_day], None))

# Compute the data for an explicit date interval in 2017.
str_begin_date = '2017-01-01'
str_end_date = '2017-02-05'
# Each entry is (positional_args, keyword_args) in the shape that
# threadpool.makeRequests is given below; the positional args are
# [remain_day, his_day].
batch_day_list = []
for remain_day in getDayRemainRunIntervalList(str_begin_date, str_end_date):
    for his_day in getDayRunList(run_day=remain_day):
        batch_day_list.append(([remain_day, his_day], None))

# Build one work request per (metric function, day pair).
requests = []
request_identifier_remain_byday = threadpool.makeRequests(identifier_remain_byday, batch_day_list)
request_user_registert_transf_byday = threadpool.makeRequests(user_registert_transf_byday, batch_day_list)
request_user_remain_pay_byday = threadpool.makeRequests(user_remain_pay_byday, batch_day_list)
request_registert_user_remain_byday = threadpool.makeRequests(registert_user_remain_byday, batch_day_list)

requests.extend(request_identifier_remain_byday)
requests.extend(request_user_registert_transf_byday)
requests.extend(request_user_remain_pay_byday)
requests.extend(request_registert_user_remain_byday)

# Queue all requests on a pool of 16 worker threads. A plain loop replaces
# the list comprehension that was used only for its side effect.
main_pool = threadpool.ThreadPool(16)
for req in requests:
    main_pool.putRequest(req)

if __name__ == '__main__':
    # Poll every 100 seconds until the pool reports that no results are
    # pending, or until the user interrupts.
    while True:
        try:
            time.sleep(100)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

# Print the wall-clock end time.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是: " + now_time)

运行时需要用到 threadpool 模块;如果该模块没有安装,需要将 threadpool.py 代码文件拷贝到【/Users/nisj/PycharmProjects/EsDataProc/RemainProcByDay/threadpool.py】。



你可能感兴趣的:(Python,Solution)