关注点:1、Hive临时表的使用
2、Hive执行任务因为自动MapJoin而产生的内存不足情况的调优
3、次日留存及周充值的装载和月充值的更新
4、传统数据库ifnull功能Hive中coalesce函数的使用
5、Mysql小表数据在hive上的装载
python代码实现脚本:
/Users/nisj/PycharmProjects/BiDataProc/Demand/hadoopStat/anchorPullnewRemainAndPay.py
# -*- coding=utf-8 -*-
import datetime
import time
import os
import warnings
import sys
import re
# Python 2 setup: reload re-exposes sys.setdefaultencoding (hidden after the
# initial import of sys) so UTF-8 can be forced as the default encoding; the
# Hive output processed below contains Chinese nicknames.
reload(sys)
sys.setdefaultencoding('utf8')
warnings.filterwarnings("ignore")
# NOTE(review): this module-level `yesterday` appears unused — every consumer
# gets its own `yesterday` from getDayForThis(); confirm before removing.
yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
def getDayForThis(runDay):
    """Derive the reference dates used by the pull-new statistics.

    Args:
        runDay: statistic day as a 'YYYY-MM-DD' string.

    Returns:
        Tuple of 'YYYY-MM-DD' strings:
        (yesterday, day1Before, day1After, day7After, dayRun30start, dayRun30end)
        where `yesterday` is relative to the wall clock (today - 1), the rest
        are relative to runDay.  dayRun30start is runDay - 23 days, so the
        [dayRun30start, dayRun30end] window spans exactly 30 days ending at
        runDay + 6 (the same day as day7After).
    """
    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    # Parse runDay once; the original re-parsed it for every derived date.
    base = datetime.datetime.strptime(runDay, '%Y-%m-%d')
    day1Before = (base - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    day1After = (base + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    day7After = (base + datetime.timedelta(days=6)).strftime('%Y-%m-%d')
    dayRun30start = (base + datetime.timedelta(days=6 - 29)).strftime('%Y-%m-%d')
    dayRun30end = day7After
    return yesterday, day1Before, day1After, day7After, dayRun30start, dayRun30end
def dateRange(beginDate, endDate):
    """Return every date from beginDate to endDate (inclusive).

    Args:
        beginDate: start day, 'YYYY-MM-DD' string (appended verbatim as the
            first element).
        endDate: last day, 'YYYY-MM-DD' string (compared as a string).

    Returns:
        List of 'YYYY-MM-DD' strings; empty when beginDate > endDate.
    """
    days = []
    cursor = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    current = beginDate[:]
    while current <= endDate:
        days.append(current)
        cursor = cursor + datetime.timedelta(1)
        current = cursor.strftime("%Y-%m-%d")
    return days
def _runHiveSql(sqlText):
    """Execute one SQL statement through the Hive CLI."""
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
{sqlText} \
" """.format(sqlText=sqlText))


def mysqlMiniData2hive():
    """Truncate-and-reload the small MySQL table `invite_anchor` into the
    Hive table xxx_invite_anchor_min via batched INSERT ... VALUES.

    This is the "MySQL small table loaded into Hive" step from the notes at
    the top of the file.
    """
    # Pull (uid, room_id) from MySQL; -N suppresses the column-header line.
    miniData = os.popen("""source /etc/profile; \
/usr/bin/mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass --default-character-set=utf8 -N -e "use jellyfish_hadoop_stat; \
select uid,room_id from invite_anchor; \
" """).readlines()

    miniDataList = []
    for miniDataRow in miniData:
        # Strip characters that would break the generated SQL text.
        cleaned = miniDataRow.replace('\n', '').replace('`', '').replace('\'', '').replace('"', '')
        miniDataList.append(re.split('\t', cleaned))

    # Empty the Hive target table before reloading.
    _runHiveSql("truncate table xxx_invite_anchor_min;")

    insertPrefix = "insert into xxx_invite_anchor_min(uid,room_id) values "
    batchSize = 8888888  # rows per INSERT statement
    # Collect VALUES tuples in a list and join once per flush; the original
    # grew one giant string with `+=`, which is quadratic in the row count.
    values = []
    for miniDataVal in miniDataList:
        values.append("({uid},{room_id})".format(uid=miniDataVal[0], room_id=miniDataVal[1]))
        if len(values) == batchSize:
            _runHiveSql(insertPrefix + ",".join(values) + ";")
            values = []
    # The original always issued a trailing INSERT, which produced invalid
    # SQL ("... values;") when no rows were pending (empty source table, or
    # a row count that is an exact multiple of the batch size).  Only flush
    # when something is actually left.
    if values:
        _runHiveSql(insertPrefix + ",".join(values) + ";")
def anchorPullnewRemaiAndPay(runDay):
    """Build the Hive result tables for anchor pull-new / morrow-remain / pay.

    Creates Hive temporary tables for morrow access, first-subscription (for
    runDay and for the 30-day window start), anchor info, and 1-week / 1-month
    pay sums, then materializes xxxxx_tab_anchor_pullnew_remain_pay1week and
    xxxxx_tab_anchor_pullnew_remain_pay1month.  hive.auto.convert.join is
    disabled before the big joins to avoid the local-task OOM that auto
    MapJoin caused (see the notes at the top of the file).

    Args:
        runDay: statistic day, 'YYYY-MM-DD' string.
    """
    # One call instead of six: the original invoked getDayForThis(runDay)
    # once per tuple element, re-parsing runDay each time.
    # day1Before is unused here but part of the returned tuple.
    yesterday, day1Before, day1After, day7After, dayRun30start, dayRun30end = getDayForThis(runDay)
    os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
add jar /home/hadoop/nisj/udf-jar/hadoop_udf_radixChange.jar; \
create temporary function RadixChange as 'com.kascend.hadoop.RadixChange'; \
create temporary table xxxxx_tab_access_morrow as \
select distinct RadixChange(lower(uid),16,10) uid \
from bi_all_access_log \
where pt_day='{day1After}'; \
create temporary table xxxxx_tab_user_frist_subscriber as \
select room_id,fans_uid,state,first_subscriber_date \
from (select room_id,uid fans_uid,state,substr(created_time,1,10) first_subscriber_date,row_number()over(partition by uid order by created_time asc) rk from oss_room_subscriber_roomid where pt_day='{yesterday}') x \
where rk=1 and first_subscriber_date='{runDay}'; \
create temporary table xxxxx_tab_user_frist_subscriber_formonth as \
select room_id,fans_uid,state,first_subscriber_date \
from (select room_id,uid fans_uid,state,substr(created_time,1,10) first_subscriber_date,row_number()over(partition by uid order by created_time asc) rk from oss_room_subscriber_roomid where pt_day='{yesterday}') x \
where rk=1 and first_subscriber_date='{dayRun30start}'; \
create temporary table xxxxx_tab_user_infor as \
select a2.nickname,a1.id room_id,a2.uid anchor_uid \
from oss_room_v2 a1 \
left join oss_bi_all_user_profile a2 on a1.creator_uid=a2.uid \
where a1.pt_day='{yesterday}' and a2.pt_day='{yesterday}'; \
create temporary table xxxxx_tab_fans_pay_1week as \
select uid,sum(amount) pay_amount \
from data_chushou_pay_info \
where pt_day between '{runDay}' and '{day7After}' and state=0 \
group by uid; \
create temporary table xxxxx_tab_fans_pay_1month as \
select uid,sum(amount) pay_amount \
from data_chushou_pay_info \
where pt_day between '{dayRun30start}' and '{dayRun30end}' and state=0 \
group by uid; \
set hive.auto.convert.join=false; \
drop table if exists xxxxx_tab_anchor_pullnew_remain_pay1week; \
create table xxxxx_tab_anchor_pullnew_remain_pay1week as \
select a1.first_subscriber_date calc_date,a1.room_id,a2.anchor_uid,a2.nickname,count(distinct a1.fans_uid) fans_add_cnt, \
count(a5.uid) fans_morrow_remain_cnt,sum(a61.pay_amount) week_payamount \
from xxxxx_tab_user_frist_subscriber a1 \
inner join xxx_invite_anchor_min a4 on a1.room_id=a4.room_id \
left join xxxxx_tab_user_infor a2 on a1.room_id=a2.room_id \
left join xxxxx_tab_access_morrow a5 on a1.fans_uid=a5.uid \
left join xxxxx_tab_fans_pay_1week a61 on a1.fans_uid=a61.uid \
group by a1.first_subscriber_date,a1.room_id,a2.anchor_uid,a2.nickname; \
drop table if exists xxxxx_tab_anchor_pullnew_remain_pay1month; \
create table xxxxx_tab_anchor_pullnew_remain_pay1month as \
select a1.first_subscriber_date calc_date,a1.room_id,a2.anchor_uid,a2.nickname,count(distinct a1.fans_uid) fans_add_cnt, \
sum(a62.pay_amount) month_payamount \
from xxxxx_tab_user_frist_subscriber_formonth a1 \
inner join xxx_invite_anchor_min a4 on a1.room_id=a4.room_id \
left join xxxxx_tab_user_infor a2 on a1.room_id=a2.room_id \
left join xxxxx_tab_fans_pay_1month a62 on a1.fans_uid=a62.uid \
group by a1.first_subscriber_date,a1.room_id,a2.anchor_uid,a2.nickname \
; \
" """.format(runDay=runDay, yesterday=yesterday, day1After=day1After, day7After=day7After, dayRun30start=dayRun30start, dayRun30end=dayRun30end))
def _runStatMysql(sqlText):
    """Execute one SQL statement against the jellyfish_hadoop_stat MySQL DB."""
    os.system("""source /etc/profile; \
/usr/bin/mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
{sqlText} \
" """.format(sqlText=sqlText))


def anchorPullnewRemaiAndPay2Mysql(runDay):
    """Export runDay's 1-week pull-new/remain/pay stats from Hive to MySQL.

    Reads xxxxx_tab_anchor_pullnew_remain_pay1week, deletes any previously
    exported rows for runDay (rollback), then re-inserts in batches of 1000.
    coalesce() in the Hive query plays the role of MySQL's ifnull (see the
    notes at the top of the file).

    Args:
        runDay: statistic day, 'YYYY-MM-DD' string.
    """
    anchorPullnews = os.popen("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
select calc_date,room_id,coalesce(anchor_uid,0) anchor_uid,coalesce(nickname,'无昵称') nickname,coalesce(fans_add_cnt,0) fans_add_cnt,coalesce(fans_morrow_remain_cnt,0) fans_morrow_remain_cnt,coalesce(week_payamount,0) week_payamount \
from xxxxx_tab_anchor_pullnew_remain_pay1week \
where calc_date='{runDay}'; \
" """.format(runDay=runDay)).readlines()

    anchorPullnew_list = [re.split('\t', line.replace('\n', '')) for line in anchorPullnews]

    # data rollback: remove any rows already written for this day.
    _runStatMysql("delete from jellyfish_hadoop_stat.anchor_pullnew_morrowremain_daypay where calc_date='{runDay}'".format(runDay=runDay))

    insertPrefix = "insert into jellyfish_hadoop_stat.anchor_pullnew_morrowremain_daypay(calc_date,room_id,anchor_uid,nickname,fans_add_cnt,fans_morrow_remain_cnt,week_payamount,etl_time,update_time) values "
    # Collect VALUES tuples and join once per flush; the original grew one
    # string with `+=` (quadratic) and always issued a trailing INSERT that
    # was invalid SQL when no rows were pending.
    values = []
    for anchorPullnew in anchorPullnew_list:
        # Strip characters from the nickname that would break the SQL text.
        nickname = str(anchorPullnew[3]).replace('\n', '').replace('`', '').replace('\'', '').replace('"', '').replace('\\', '')
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
        values.append("('{calc_date}',{room_id},{anchor_uid},'{nickname}',{fans_add_cnt},{fans_morrow_remain_cnt},{week_payamount},'{etl_time}','{update_time}')".format(
            calc_date=anchorPullnew[0], room_id=anchorPullnew[1], anchor_uid=anchorPullnew[2],
            nickname=nickname, fans_add_cnt=anchorPullnew[4], fans_morrow_remain_cnt=anchorPullnew[5],
            week_payamount=anchorPullnew[6], etl_time=etl_time, update_time=etl_time))
        if len(values) == 1000:
            _runStatMysql(insertPrefix + ",".join(values) + ";")
            values = []
    if values:
        _runStatMysql(insertPrefix + ",".join(values) + ";")
def anchorPullnewMonthPayUpdate2Mysql(runDay):
    """Back-fill month_payamount in MySQL for the day whose 30-day pay window
    just closed (dayRun30start = runDay - 23).

    Reads xxxxx_tab_anchor_pullnew_remain_pay1month for dayRun30start and
    issues one UPDATE per (room_id, anchor_uid) row.

    Args:
        runDay: statistic day, 'YYYY-MM-DD' string.
    """
    dayRun30start = getDayForThis(runDay)[4]
    anchorPullnews = os.popen("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
select calc_date,room_id,coalesce(anchor_uid,0) anchor_uid,coalesce(nickname,'无昵称') nickname,coalesce(fans_add_cnt,0) fans_add_cnt,coalesce(month_payamount,0) month_payamount \
from xxxxx_tab_anchor_pullnew_remain_pay1month \
where calc_date='{dayRun30start}'; \
" """.format(dayRun30start=dayRun30start)).readlines()

    anchorPullnew_list = [re.split('\t', line.replace('\n', '')) for line in anchorPullnews]

    for anchorPullnew in anchorPullnew_list:
        # Only room_id, anchor_uid and month_payamount feed the UPDATE; the
        # original also unpacked calc_date / nickname / fans_add_cnt, which
        # were dead locals and have been dropped.
        room_id = anchorPullnew[1]
        anchor_uid = anchorPullnew[2]
        month_payamount = anchorPullnew[5]
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
        # Update the row that anchorPullnewRemaiAndPay2Mysql inserted.
        os.system("""source /etc/profile; \
/usr/bin/mysql -hMysqlHost -PMysqlPort -uMysqlUser -pMysqlPass --default-character-set=utf8 -e "use jellyfish_hadoop_stat; \
update jellyfish_hadoop_stat.anchor_pullnew_morrowremain_daypay \
set month_payamount={month_payamount},update_time='{update_time}' \
where calc_date='{dayRun30start}' \
and room_id={room_id} \
and anchor_uid={anchor_uid}; \
" """.format(dayRun30start=dayRun30start, room_id=room_id, anchor_uid=anchor_uid, month_payamount=month_payamount, update_time=etl_time))
# Batch back-fill over the historical date range.
# The MySQL source query in mysqlMiniData2hive has no date filter
# ("select uid,room_id from invite_anchor"), so a single load before the
# loop suffices; the original truncated and reloaded the identical table
# on every one of the ~195 iterations.  (If invite_anchor is expected to
# change while this back-fill runs, move the call back inside the loop.)
# For a single-day ad-hoc run, call the three daily functions with e.g.
# runDay = (today - 8 days).
mysqlMiniData2hive()
for batchDay in dateRange(beginDate='2017-06-01', endDate='2017-12-12'):
    anchorPullnewRemaiAndPay(runDay=batchDay)
    anchorPullnewRemaiAndPay2Mysql(runDay=batchDay)
    anchorPullnewMonthPayUpdate2Mysql(runDay=batchDay)