# Template example: pull a small MySQL table into Hive, join it against the
# existing large Hive tables for statistical analysis, then load the results
# back into MySQL for querying and report display. Kept for future reference.
# (原文: 本文只是一个将Mysql里的小量表先拉入Hive中, 结合Hive里原已有的巨量表进行统计分析计算, 最后将统计结果再导入Mysql中供查询及报表展示, 仅是一个模板示例供以后参考使用。)
# /Users/nisj/PycharmProjects/BiDataProc/Demand/hadoopStat/anchorOperateGroup.py
# -*- coding=utf-8 -*-
import datetime
import time
import os
import warnings
import sys
import re
reload(sys)
sys.setdefaultencoding('utf8')
warnings.filterwarnings("ignore")
# Default run date for the daily job: yesterday, formatted 'YYYY-MM-DD'.
yesterday = (datetime.date.today() - datetime.timedelta(1)).strftime('%Y-%m-%d')
def getDay7BeforeAndYesterday(runDay):
    """Return a (yesterday, day7Before) pair of 'YYYY-MM-DD' strings.

    yesterday  -- the calendar day before *today* (wall clock), not runDay.
    day7Before -- seven days before runDay.

    NOTE(review): 'yesterday' ignores runDay entirely; when backfilling
    historical dates this may not be what the Hive query expects — confirm.
    """
    wall_yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    run_dt = datetime.datetime.strptime(runDay, '%Y-%m-%d')
    week_before = (run_dt - datetime.timedelta(days=7)).strftime('%Y-%m-%d')
    return wall_yesterday, week_before
def dateRange(beginDate, endDate):
    """Return the list of 'YYYY-MM-DD' date strings from beginDate through
    endDate, inclusive. Empty when beginDate sorts after endDate."""
    current = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    days = []
    day_str = current.strftime("%Y-%m-%d")
    while day_str <= endDate:
        days.append(day_str)
        current += datetime.timedelta(days=1)
        day_str = current.strftime("%Y-%m-%d")
    return days
def mysqlMiniData2hive(runDay):
    """Pull the small MySQL table room_group into the Hive staging table
    xxx_room_group_min (truncate, then batched INSERT statements).

    runDay -- 'YYYY-MM-DD' run date; currently unused by the extraction SQL
              but kept for interface consistency with the other job steps.
    """
    # Extract (group, room_id) as tab-separated lines.
    # BUGFIX: a space was missing between '-pMysqlPass' and
    # '--default-character-set=utf8', which corrupted the password argument.
    miniData = os.popen("""source /etc/profile; \
/usr/bin/mysql -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -N -e "use jellyfish_hadoop_stat; \
select \`group\`,room_id from room_group; \
" """.format(runDay=runDay)).readlines()

    miniDataList = []
    for miniDataRow in miniData:
        # Strip newline and quoting characters that would break the
        # generated INSERT statement below.
        miniD = re.split('\t', miniDataRow.replace('\n', '').replace('`', '').replace('\'', '').replace('"', ''))
        miniDataList.append(miniD)

    # Clear the staging table before reloading.
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
truncate table xxx_room_group_min; \
" """)

    batchSize = 8888888  # rows per generated INSERT statement
    insertPrefix = "insert into xxx_room_group_min(group_id,room_id) values "
    insert2HiveSqlText = insertPrefix
    i = 0
    for miniDataVal in miniDataList:
        group_id = miniDataVal[0]
        room_id = miniDataVal[1]
        i += 1
        insert2HiveSqlText = insert2HiveSqlText + "({group_id},{room_id}),".format(group_id=group_id, room_id=room_id)
        if i % batchSize == 0:
            # Flush a full batch: swap the trailing comma for a semicolon.
            insert2HiveSqlText = insert2HiveSqlText[0:-1] + ";"
            os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
{insert2HiveSqlText} \
" """.format(insert2HiveSqlText=insert2HiveSqlText))
            insert2HiveSqlText = insertPrefix

    # Flush the tail batch only if it holds rows. BUGFIX: the original
    # unconditionally emitted '... values;' (invalid SQL) when the table was
    # empty or the row count was an exact multiple of the batch size.
    if insert2HiveSqlText != insertPrefix:
        insert2HiveSqlText = insert2HiveSqlText[0:-1] + ";"
        os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
{insert2HiveSqlText} \
" """.format(insert2HiveSqlText=insert2HiveSqlText))
def anchorOperateGroup2Mysql(runDay):
    """Compute per-room new-subscriber counts in Hive and load them into
    MySQL table jellyfish_hadoop_stat.group_anchor_fans_stat.

    runDay -- 'YYYY-MM-DD' statistics date. Existing rows for that date are
              deleted first, so re-running the job for a day is idempotent.
    """
    # Take both dates from the helper. The original read the module-level
    # 'yesterday' global (identical today-relative value) while discarding
    # the helper's first return value.
    yesterday, day7Before = getDay7BeforeAndYesterday(runDay)

    anchorOperateGroups = os.popen("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
with tab_user_frist_subscriber as (select x.room_id,x.uid view_uid,x.state,substr(x.created_time,1,10) subscriber_date \
from (select room_id,uid,state,created_time,row_number()over(partition by uid order by created_time asc) rk \
from oss_room_subscriber_roomid \
where pt_day ='{yesterday}') x \
inner join (select uid \
from oss_bi_all_user_profile \
where pt_day='{runDay}' and substr(created_time,1,10) between '{day7Before}' and '{runDay}' \
group by uid) x1 on x.uid=x1.uid \
inner join xxx_room_group_min x2 on x.room_id=x2.room_id \
where x.rk=1 and substr(x.created_time,1,10)='{runDay}') \
select subscriber_date,room_id,count(distinct view_uid) newadd_subscriber_cnt \
from tab_user_frist_subscriber \
group by subscriber_date,room_id; \
" """.format(runDay=runDay, yesterday=yesterday, day7Before=day7Before)).readlines()

    anchorOperateGroup_list = []
    for anchorOperateGroupList in anchorOperateGroups:
        anchorOperateGroup = re.split('\t', anchorOperateGroupList.replace('\n', ''))
        anchorOperateGroup_list.append(anchorOperateGroup)

    # Data rollback: delete any rows already loaded for runDay.
    # BUGFIX (here and below): a space was missing between '-pMysqlPass' and
    # '--default-character-set=utf8', corrupting the password argument.
    os.system("""source /etc/profile; \
/usr/bin/mysql -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
delete from jellyfish_hadoop_stat.group_anchor_fans_stat where statistics_date='{runDay}' \
" """.format(runDay=runDay))

    batchSize = 1000  # rows per generated INSERT statement
    insertPrefix = "insert into jellyfish_hadoop_stat.group_anchor_fans_stat(room_id,statistics_date,fans_count,created_time) values "
    insert_sql_text = insertPrefix
    i = 0
    for anchorOperateGroup in anchorOperateGroup_list:
        statistics_date = anchorOperateGroup[0]
        room_id = anchorOperateGroup[1]
        fans_count = anchorOperateGroup[2]
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
        i += 1
        insert_sql_text = insert_sql_text + "({room_id},'{statistics_date}',{fans_count},'{etl_time}'),".format(statistics_date=statistics_date, room_id=room_id, fans_count=fans_count, etl_time=etl_time)
        if i % batchSize == 0:
            # Flush a full batch: swap the trailing comma for a semicolon.
            insert_sql_text = insert_sql_text[0:-1] + ";"
            os.system("""source /etc/profile; \
/usr/bin/mysql -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
{insert_sql_text} \
" """.format(insert_sql_text=insert_sql_text))
            insert_sql_text = insertPrefix

    # Flush the tail batch only if it holds rows. BUGFIX: the original
    # unconditionally emitted '... values;' (invalid SQL) when the result set
    # was empty or an exact multiple of the batch size.
    if insert_sql_text != insertPrefix:
        insert_sql_text = insert_sql_text[0:-1] + ";"
        os.system("""source /etc/profile; \
/usr/bin/mysql -hMysqlHost -P6605 -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
{insert_sql_text} \
" """.format(insert_sql_text=insert_sql_text))
# --- Daily run: process yesterday's data ---
run_day = yesterday
mysqlMiniData2hive(runDay=run_day)
anchorOperateGroup2Mysql(runDay=run_day)

# --- Historical backfill (uncomment to rerun a date range) ---
# for run_day in dateRange(beginDate='2017-11-01', endDate='2017-11-13'):
#     mysqlMiniData2hive(runDay=run_day)
#     anchorOperateGroup2Mysql(runDay=run_day)