用Python进行主播次日留存、留存粉丝周及月充值的计算示例

关注点:1、Hive临时表的使用
2、Hive执行任务因为自动MapJoin而产生的内存不足情况的调优
3、次日留存及周充值的装载和月充值的更新
4、传统数据库ifnull功能Hive中coalesce函数的使用
5、Mysql小表数据在hive上的装载

python代码实现脚本:
/Users/nisj/PycharmProjects/BiDataProc/Demand/hadoopStat/anchorPullnewRemainAndPay.py

# -*- coding=utf-8 -*-
import datetime
import time
import os
import warnings
import sys
import re
# Python 2 only: reload(sys) restores setdefaultencoding, removed by site.py.
reload(sys)
# Force utf-8 as the default codec so Chinese nicknames survive str() calls.
sys.setdefaultencoding('utf8')

warnings.filterwarnings("ignore")

# Module-level default: yesterday's date as 'YYYY-MM-DD', computed at import time.
yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')

def getDayForThis(runDay):
    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    day1Before = (datetime.datetime.strptime(runDay, '%Y-%m-%d') - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    day1After = (datetime.datetime.strptime(runDay, '%Y-%m-%d') + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    day7After = (datetime.datetime.strptime(runDay, '%Y-%m-%d') + datetime.timedelta(days=6)).strftime('%Y-%m-%d')
    dayRun30start = (datetime.datetime.strptime(runDay, '%Y-%m-%d') + datetime.timedelta(days=(6-29))).strftime('%Y-%m-%d')
    dayRun30end = day7After
    return yesterday, day1Before, day1After, day7After, dayRun30start, dayRun30end

def dateRange(beginDate, endDate):
    """List every day from beginDate through endDate (inclusive) as 'YYYY-MM-DD' strings.

    Returns an empty list when endDate is earlier than beginDate.
    """
    fmt = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, fmt)
    days = []
    # Lexical comparison of 'YYYY-MM-DD' strings matches chronological order.
    while current.strftime(fmt) <= endDate:
        days.append(current.strftime(fmt))
        current = current + datetime.timedelta(days=1)
    return days

def mysqlMiniData2hive():
    """Mirror the small MySQL table invite_anchor(uid, room_id) into Hive.

    Truncate-and-reload: dump the rows through the mysql CLI, scrub quoting
    characters that would corrupt generated SQL, truncate the Hive mirror
    table xxx_invite_anchor_min, then load it back via batched multi-row
    INSERT statements through the hive CLI. No return value; all effects are
    external (shell commands).
    """
    miniData = os.popen("""source /etc/profile; \
                /usr/bin/mysql  -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass --default-character-set=utf8 -N -e "use jellyfish_hadoop_stat; \
                select uid,room_id from invite_anchor; \
                " """).readlines()

    miniDataList = []
    for miniDataRow in miniData:
        # Drop the newline and any backtick/quote characters that could break
        # the INSERT statement assembled below.
        miniD = re.split('\t', miniDataRow.replace('\n', '').replace('`', '').replace('\'', '').replace('"', ''))
        miniDataList.append(miniD)

    os.system("""source /etc/profile; \
               /usr/lib/hive-current/bin/hive -e " \
                truncate table xxx_invite_anchor_min; \
                " """)

    insertHeader = "insert into xxx_invite_anchor_min(uid,room_id) values "
    i = 0
    insert2HiveSqlText = insertHeader
    for miniDataVal in miniDataList:
        uid = miniDataVal[0]
        room_id = miniDataVal[1]

        i += 1

        insert2HiveSqlText = insert2HiveSqlText + "({uid},{room_id}),".format(uid=uid, room_id=room_id)

        # Flush a full batch so a single hive -e invocation stays bounded.
        if (i % 8888888 == 0):
            insert2HiveSqlText = insert2HiveSqlText[0:-1] + ";"
            os.system("""source /etc/profile; \
                       /usr/lib/hive-current/bin/hive -e " \
                        {insert2HiveSqlText} \
                        " """.format(insert2HiveSqlText=insert2HiveSqlText))

            insert2HiveSqlText = insertHeader

    # Flush the remainder only when the current batch actually holds rows.
    # The previous version emitted an invalid "... values;" statement when the
    # source table was empty or the row count was an exact batch multiple.
    if insert2HiveSqlText != insertHeader:
        insert2HiveSqlText = insert2HiveSqlText[0:-1] + ";"
        os.system("""source /etc/profile; \
                   /usr/lib/hive-current/bin/hive -e " \
                    {insert2HiveSqlText} \
                    " """.format(insert2HiveSqlText=insert2HiveSqlText))

def anchorPullnewRemaiAndPay(runDay):
    """Rebuild the Hive aggregation tables for anchors' new-fan pull,
    next-day retention and fan pay amounts around runDay.

    (The function name keeps the original "Remai" spelling — presumably a
    typo for "Remain" — so existing callers are unaffected.)

    One hive -e session does everything:
      * temporary tables for next-day access logs, first-time subscribers
        (day and 30-day-window variants), anchor/user info, and 7-day /
        30-day pay sums per uid;
      * hive.auto.convert.join is switched off before the large joins to
        avoid MapJoin out-of-memory failures;
      * the result tables xxxxx_tab_anchor_pullnew_remain_pay1week and
        xxxxx_tab_anchor_pullnew_remain_pay1month are dropped and recreated.

    :param runDay: day being processed, 'YYYY-MM-DD' string.
    """
    # One call instead of six: unpack every derived date at once.
    # day1Before is part of the returned tuple but unused in the SQL below.
    yesterday, day1Before, day1After, day7After, dayRun30start, dayRun30end = getDayForThis(runDay)
    os.system("""source /etc/profile; \
                /usr/lib/hive-current/bin/hive -e " \
                add jar /home/hadoop/nisj/udf-jar/hadoop_udf_radixChange.jar; \
                create temporary function RadixChange as 'com.kascend.hadoop.RadixChange'; \
                create temporary table xxxxx_tab_access_morrow as \
                select distinct RadixChange(lower(uid),16,10) uid \
                from bi_all_access_log \
                where pt_day='{day1After}'; \
                create temporary table xxxxx_tab_user_frist_subscriber as \
                select room_id,fans_uid,state,first_subscriber_date \
                from (select room_id,uid fans_uid,state,substr(created_time,1,10) first_subscriber_date,row_number()over(partition by uid order by created_time asc) rk from oss_room_subscriber_roomid where pt_day='{yesterday}') x \
                where rk=1 and first_subscriber_date='{runDay}'; \
                create temporary table xxxxx_tab_user_frist_subscriber_formonth as \
                select room_id,fans_uid,state,first_subscriber_date \
                from (select room_id,uid fans_uid,state,substr(created_time,1,10) first_subscriber_date,row_number()over(partition by uid order by created_time asc) rk from oss_room_subscriber_roomid where pt_day='{yesterday}') x \
                where rk=1 and first_subscriber_date='{dayRun30start}'; \
                create temporary table xxxxx_tab_user_infor as \
                select a2.nickname,a1.id room_id,a2.uid anchor_uid \
                from oss_room_v2 a1 \
                left join oss_bi_all_user_profile a2 on a1.creator_uid=a2.uid \
                where a1.pt_day='{yesterday}' and a2.pt_day='{yesterday}'; \
                create temporary table xxxxx_tab_fans_pay_1week as \
                select uid,sum(amount) pay_amount \
                from data_chushou_pay_info \
                where pt_day between '{runDay}' and '{day7After}' and state=0 \
                group by uid; \
                create temporary table xxxxx_tab_fans_pay_1month as \
                select uid,sum(amount) pay_amount \
                from data_chushou_pay_info  \
                where pt_day between '{dayRun30start}' and '{dayRun30end}' and state=0 \
                group by uid; \
                set hive.auto.convert.join=false; \
                drop table if exists xxxxx_tab_anchor_pullnew_remain_pay1week; \
                create table xxxxx_tab_anchor_pullnew_remain_pay1week as \
                select a1.first_subscriber_date calc_date,a1.room_id,a2.anchor_uid,a2.nickname,count(distinct a1.fans_uid) fans_add_cnt, \
                count(a5.uid) fans_morrow_remain_cnt,sum(a61.pay_amount) week_payamount \
                from xxxxx_tab_user_frist_subscriber a1 \
                inner join xxx_invite_anchor_min a4 on a1.room_id=a4.room_id \
                left join xxxxx_tab_user_infor a2 on a1.room_id=a2.room_id \
                left join xxxxx_tab_access_morrow a5 on a1.fans_uid=a5.uid \
                left join xxxxx_tab_fans_pay_1week a61 on a1.fans_uid=a61.uid \
                group by a1.first_subscriber_date,a1.room_id,a2.anchor_uid,a2.nickname; \
                drop table if exists xxxxx_tab_anchor_pullnew_remain_pay1month; \
                create table xxxxx_tab_anchor_pullnew_remain_pay1month as \
                select a1.first_subscriber_date calc_date,a1.room_id,a2.anchor_uid,a2.nickname,count(distinct a1.fans_uid) fans_add_cnt, \
                sum(a62.pay_amount) month_payamount \
                from xxxxx_tab_user_frist_subscriber_formonth a1 \
                inner join xxx_invite_anchor_min a4 on a1.room_id=a4.room_id \
                left join xxxxx_tab_user_infor a2 on a1.room_id=a2.room_id \
                left join xxxxx_tab_fans_pay_1month a62 on a1.fans_uid=a62.uid \
                group by a1.first_subscriber_date,a1.room_id,a2.anchor_uid,a2.nickname \
                ; \
                " """.format(runDay=runDay, yesterday=yesterday, day1After=day1After, day7After=day7After, dayRun30start=dayRun30start, dayRun30end=dayRun30end))

def anchorPullnewRemaiAndPay2Mysql(runDay):
    """Export the 1-week pull-new/retention/pay result for runDay from Hive
    into MySQL table anchor_pullnew_morrowremain_daypay.

    Existing rows for runDay are deleted first so re-runs are idempotent,
    then the Hive rows are inserted in batches of 1000. coalesce() in the
    Hive query mimics traditional ifnull() defaults for NULL columns.

    :param runDay: day being exported, 'YYYY-MM-DD' string.
    """
    anchorPullnews = os.popen("""source /etc/profile; \
                /usr/lib/hive-current/bin/hive -e " \
                select calc_date,room_id,coalesce(anchor_uid,0) anchor_uid,coalesce(nickname,'无昵称') nickname,coalesce(fans_add_cnt,0) fans_add_cnt,coalesce(fans_morrow_remain_cnt,0) fans_morrow_remain_cnt,coalesce(week_payamount,0) week_payamount \
                from xxxxx_tab_anchor_pullnew_remain_pay1week \
                where calc_date='{runDay}'; \
                " """.format(runDay=runDay)).readlines()

    anchorPullnew_list = []
    for anchorPullnewList in anchorPullnews:
        anchorPullnew = re.split('\t', anchorPullnewList.replace('\n', ''))
        anchorPullnew_list.append(anchorPullnew)

    # Data rollback: wipe runDay's rows so this export is idempotent.
    os.system("""source /etc/profile; \
                /usr/bin/mysql  -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
                delete from jellyfish_hadoop_stat.anchor_pullnew_morrowremain_daypay where calc_date='{runDay}' \
                " """.format(runDay=runDay))

    insertHeader = "insert into jellyfish_hadoop_stat.anchor_pullnew_morrowremain_daypay(calc_date,room_id,anchor_uid,nickname,fans_add_cnt,fans_morrow_remain_cnt,week_payamount,etl_time,update_time) values "
    i = 0
    insert_sql_text = insertHeader
    for anchorPullnew in anchorPullnew_list:
        calc_date = anchorPullnew[0]
        room_id = anchorPullnew[1]
        anchor_uid = anchorPullnew[2]
        # NOTE(review): values are interpolated into SQL text; stripping
        # quote/backslash characters from nickname is not a real escape.
        # Consider LOAD DATA or a parameterized client if input is untrusted.
        nickname = str(anchorPullnew[3]).replace('\n', '').replace('`', '').replace('\'', '').replace('"', '').replace('\\', '')
        fans_add_cnt = anchorPullnew[4]
        fans_morrow_remain_cnt = anchorPullnew[5]
        week_payamount = anchorPullnew[6]
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())

        i += 1

        insert_sql_text = insert_sql_text + "('{calc_date}',{room_id},{anchor_uid},'{nickname}',{fans_add_cnt},{fans_morrow_remain_cnt},{week_payamount},'{etl_time}','{update_time}'),".format(calc_date=calc_date, room_id=room_id, anchor_uid=anchor_uid, nickname=nickname, fans_add_cnt=fans_add_cnt, fans_morrow_remain_cnt=fans_morrow_remain_cnt, week_payamount=week_payamount, etl_time=etl_time, update_time=etl_time)

        # Flush a full batch of 1000 rows per mysql invocation.
        if (i % 1000 == 0):
            insert_sql_text = insert_sql_text[0:-1] + ";"
            os.system("""source /etc/profile; \
                    /usr/bin/mysql  -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
                    {insert_sql_text} \
                    " """.format(insert_sql_text=insert_sql_text))

            insert_sql_text = insertHeader

    # Flush the remainder only when the batch is non-empty; the previous
    # version produced an invalid "... values;" statement when there were no
    # rows or the row count was an exact multiple of 1000.
    if insert_sql_text != insertHeader:
        insert_sql_text = insert_sql_text[0:-1] + ";"
        os.system("""source /etc/profile; \
                    /usr/bin/mysql  -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
                    {insert_sql_text} \
                    " """.format(insert_sql_text=insert_sql_text))

def anchorPullnewMonthPayUpdate2Mysql(runDay):
    """Backfill month_payamount in MySQL for the day whose 30-day pay window
    closes at runDay (i.e. calc_date = runDay - 23 days, see getDayForThis).

    Reads the 1-month aggregation from Hive and issues one UPDATE per row,
    keyed by (calc_date, room_id, anchor_uid).

    :param runDay: day whose window is closing, 'YYYY-MM-DD' string.
    """
    dayRun30start = getDayForThis(runDay)[4]
    anchorPullnews = os.popen("""source /etc/profile; \
                /usr/lib/hive-current/bin/hive -e " \
                select calc_date,room_id,coalesce(anchor_uid,0) anchor_uid,coalesce(nickname,'无昵称') nickname,coalesce(fans_add_cnt,0) fans_add_cnt,coalesce(month_payamount,0) month_payamount \
                from xxxxx_tab_anchor_pullnew_remain_pay1month \
                where calc_date='{dayRun30start}'; \
                " """.format(dayRun30start=dayRun30start)).readlines()

    anchorPullnew_list = []
    for anchorPullnewList in anchorPullnews:
        anchorPullnew = re.split('\t', anchorPullnewList.replace('\n', ''))
        anchorPullnew_list.append(anchorPullnew)

    for anchorPullnew in anchorPullnew_list:
        # Row layout: calc_date, room_id, anchor_uid, nickname, fans_add_cnt,
        # month_payamount -- only the join keys and the pay amount are used
        # here (the unused columns were previously assigned to dead locals).
        room_id = anchorPullnew[1]
        anchor_uid = anchorPullnew[2]
        month_payamount = anchorPullnew[5]
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())

        # Data update: one UPDATE statement per Hive row.
        os.system("""source /etc/profile; \
                /usr/bin/mysql  -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
                update jellyfish_hadoop_stat.anchor_pullnew_morrowremain_daypay \
                   set month_payamount={month_payamount},update_time='{update_time}' \
                where calc_date='{dayRun30start}' \
                  and room_id={room_id} \
                  and anchor_uid={anchor_uid}; \
                " """.format(dayRun30start=dayRun30start, room_id=room_id, anchor_uid=anchor_uid, month_payamount=month_payamount, update_time=etl_time))

# Batch Test: single-day smoke-test harness (uncomment to run one day only).
batchDay = (datetime.date.today() - datetime.timedelta(days=8)).strftime('%Y-%m-%d')
# mysqlMiniData2hive()
# anchorPullnewRemaiAndPay(runDay=batchDay)
# anchorPullnewRemaiAndPay2Mysql(runDay=batchDay)
# anchorPullnewMonthPayUpdate2Mysql(runDay=batchDay)

# Backfill driver. The invite_anchor mirror is a truncate-and-reload of the
# same MySQL snapshot, so it is loop-invariant: load it once up front instead
# of re-loading it for every day of the backfill range.
mysqlMiniData2hive()
for batchDay in dateRange(beginDate='2017-06-01', endDate='2017-12-12'):
    anchorPullnewRemaiAndPay(runDay=batchDay)
    anchorPullnewRemaiAndPay2Mysql(runDay=batchDay)
    anchorPullnewMonthPayUpdate2Mysql(runDay=batchDay)

 

你可能感兴趣的:(Python,projectCase)