Python中将MySQL小量数据拉入Hive计算后,再将结果导入MySQL的模板示例

本文给出一个模板示例:先将MySQL里的小量表拉入Hive中;然后结合Hive里原已有的巨量表进行统计分析计算;最后将统计结果再导入MySQL中,供查询及报表展示。本文仅是一个模板示例,供以后参考使用。
/Users/nisj/PycharmProjects/BiDataProc/Demand/hadoopStat/anchorOperateGroup.py
# -*- coding=utf-8 -*-
import datetime
import time
import os
import warnings
import sys
import re
# Python 2 idiom: reloading sys makes sys.setdefaultencoding callable again so
# the process-wide default str/unicode codec can be switched to UTF-8.
# NOTE(review): reload() as a builtin and sys.setdefaultencoding do not exist
# on Python 3 -- this script targets Python 2 only.
reload(sys)
sys.setdefaultencoding('utf8')

# Suppress every warning category for the whole process.
warnings.filterwarnings("ignore")

# Calendar day before the moment the script starts, formatted 'YYYY-MM-DD'.
# Read as a module-level global by anchorOperateGroup2Mysql and by the driver
# calls at the bottom of the file.
yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')

def getDay7BeforeAndYesterday(runDay):
    """Return (yesterday, day7Before) as 'YYYY-MM-DD' strings.

    runDay     -- reference day as a 'YYYY-MM-DD' string.

    The first element is the calendar day before *today* (it does not depend
    on runDay); the second element is the day seven days before runDay.
    Raises ValueError when runDay does not match '%Y-%m-%d'.
    """
    dayFormat = '%Y-%m-%d'
    previousDay = datetime.date.today() - datetime.timedelta(days=1)
    runDate = datetime.datetime.strptime(runDay, dayFormat)
    weekEarlier = runDate - datetime.timedelta(days=7)
    return previousDay.strftime(dayFormat), weekEarlier.strftime(dayFormat)

def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate to endDate, both inclusive.

    beginDate  -- first day, a string parseable with '%Y-%m-%d'.
    endDate    -- last day, a string parseable with '%Y-%m-%d'.

    Returns a list of zero-padded 'YYYY-MM-DD' strings in ascending order; the
    list is empty when beginDate is later than endDate.
    Raises ValueError when either argument does not match '%Y-%m-%d'.
    """
    dayFormat = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, dayFormat)
    lastDay = datetime.datetime.strptime(endDate, dayFormat)
    oneDay = datetime.timedelta(days=1)

    dates = []
    # Compare parsed dates, not the raw strings: a lexicographic comparison
    # goes wrong as soon as an argument is not zero-padded (e.g. '2017-11-5'
    # sorts after '2017-11-30'), which made the loop overshoot the end date.
    while current <= lastDay:
        dates.append(current.strftime(dayFormat))
        current += oneDay
    return dates

def _runHiveSql(hiveSqlText):
    """Run one HiveQL script through the hive CLI; return os.system's status."""
    return os.system("""source /etc/profile; \
               /usr/lib/hive-current/bin/hive -e " \
                {hiveSqlText} \
                " """.format(hiveSqlText=hiveSqlText))


def mysqlMiniData2hive(runDay):
    """Copy the small MySQL table room_group into Hive table xxx_room_group_min.

    runDay     -- accepted for symmetry with the other steps; the extraction
                  query has no date filter, so the value is not used.

    Steps: dump (`group`, room_id) from jellyfish_hadoop_stat.room_group with
    the mysql CLI, truncate xxx_room_group_min, then re-insert the rows with
    multi-row "insert ... values" statements. Returns None.
    """
    # The backticks around `group` are backslash-escaped for the shell:
    # inside double quotes a bare backtick would start command substitution.
    mysqlOutput = os.popen("""source /etc/profile; \
                /usr/bin/mysql  -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -N -e "use jellyfish_hadoop_stat; \
                select \\`group\\`,room_id from room_group; \
                " """)
    try:
        miniData = mysqlOutput.readlines()
    finally:
        # Close the pipe explicitly so the child process is reaped.
        mysqlOutput.close()

    valueTuples = []
    for miniDataRow in miniData:
        # Strip the newline plus any quoting characters that would break the
        # shell-quoted HiveQL statement built below.
        cleanedRow = miniDataRow.replace('\n', '').replace('`', '').replace('\'', '').replace('"', '')
        fields = cleanedRow.split('\t')
        if len(fields) < 2:
            # Blank or malformed line: not a (group, room_id) record.
            continue
        valueTuples.append("({group_id},{room_id})".format(group_id=fields[0], room_id=fields[1]))

    _runHiveSql("truncate table xxx_room_group_min;")

    # Rows per insert statement; large enough that a small table goes in one go.
    batchSize = 8888888
    insertPrefix = "insert into xxx_room_group_min(group_id,room_id) values "
    # Iterating over slices means no statement is issued when there is nothing
    # to insert, instead of sending a malformed "insert ... values;".
    for batchStart in range(0, len(valueTuples), batchSize):
        batch = valueTuples[batchStart:batchStart + batchSize]
        _runHiveSql(insertPrefix + ",".join(batch) + ";")

def _runMysqlSql(mysqlSqlText):
    """Run SQL in jellyfish_hadoop_stat via the mysql CLI; return os.system's status."""
    return os.system("""source /etc/profile; \
                /usr/bin/mysql  -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
                {mysqlSqlText} \
                " """.format(mysqlSqlText=mysqlSqlText))


def anchorOperateGroup2Mysql(runDay):
    """Compute per-room new-subscriber counts in Hive and load them into MySQL.

    runDay     -- statistics day as a 'YYYY-MM-DD' string.

    The Hive query keeps, for each uid, its earliest subscription row
    (row_number() = 1 ordered by created_time), restricted to users whose
    profile created_time falls between runDay-7 and runDay, to rooms present
    in xxx_room_group_min, and to first subscriptions dated runDay. It then
    counts distinct users per (subscriber_date, room_id).

    Existing rows for runDay are deleted from
    jellyfish_hadoop_stat.group_anchor_fans_stat before the fresh rows are
    inserted, so the step can be re-run for the same day. Returns None.

    NOTE(review): oss_room_subscriber_roomid is read at pt_day = the
    module-level `yesterday`, not runDay -- confirm this is intended when
    backfilling older days.
    """
    day7Before = getDay7BeforeAndYesterday(runDay)[1]
    hiveOutput = os.popen("""source /etc/profile; \
                /usr/lib/hive-current/bin/hive -e " \
                with tab_user_frist_subscriber as (select x.room_id,x.uid view_uid,x.state,substr(x.created_time,1,10) subscriber_date \
                from (select room_id,uid,state,created_time,row_number()over(partition by uid order by created_time asc) rk \
                from oss_room_subscriber_roomid \
                where pt_day ='{yesterday}') x \
                inner join (select uid \
                from oss_bi_all_user_profile \
                where pt_day='{runDay}' and substr(created_time,1,10) between '{day7Before}' and '{runDay}' \
                group by uid) x1 on x.uid=x1.uid \
                inner join xxx_room_group_min x2 on x.room_id=x2.room_id \
                where x.rk=1 and substr(x.created_time,1,10)='{runDay}') \
                select subscriber_date,room_id,count(distinct view_uid) newadd_subscriber_cnt \
                from tab_user_frist_subscriber \
                group by subscriber_date,room_id; \
                " """.format(runDay=runDay, yesterday=yesterday, day7Before=day7Before))
    try:
        anchorOperateGroups = hiveOutput.readlines()
    finally:
        # Close the pipe explicitly so the child process is reaped.
        hiveOutput.close()

    resultRows = []
    for resultLine in anchorOperateGroups:
        fields = resultLine.replace('\n', '').split('\t')
        if len(fields) < 3:
            # Blank or malformed line: not a (date, room_id, count) record.
            continue
        resultRows.append(fields)

    # data rollback
    _runMysqlSql("delete from jellyfish_hadoop_stat.group_anchor_fans_stat where statistics_date='{runDay}'".format(runDay=runDay))

    batchSize = 1000
    insertPrefix = "insert into jellyfish_hadoop_stat.group_anchor_fans_stat(room_id,statistics_date,fans_count,created_time) values "
    # Iterating over slices means no statement is issued when there is nothing
    # to insert, instead of sending a malformed "insert ... values;".
    for batchStart in range(0, len(resultRows), batchSize):
        valueTuples = []
        for fields in resultRows[batchStart:batchStart + batchSize]:
            etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
            valueTuples.append(
                "({room_id},'{statistics_date}',{fans_count},'{etl_time}')".format(
                    statistics_date=fields[0], room_id=fields[1], fans_count=fields[2], etl_time=etl_time))
        _runMysqlSql(insertPrefix + ",".join(valueTuples) + ";")


# Batch Test
# Executed whenever this file is run or imported: first refresh the Hive copy
# of the MySQL room_group table, then recompute yesterday's statistics and
# load them into MySQL. The order matters because the statistics query joins
# against the refreshed Hive table.
mysqlMiniData2hive(runDay=yesterday)
anchorOperateGroup2Mysql(runDay=yesterday)

# Backfill variant (disabled): replay both steps for each day of a date range.
# for runDay in dateRange(beginDate='2017-11-01', endDate='2017-11-13'):
#     mysqlMiniData2hive(runDay=runDay)
#     anchorOperateGroup2Mysql(runDay=runDay)

你可能感兴趣的:(Python,Solution)