1、目标表建表脚本
-- Target table for the all-gifts monthly job: gift_point per room and room creator,
-- partitioned by pt_month. Any existing table of the same name is dropped first.
DROP TABLE IF EXISTS xxxxx_gift_record_byroom_bymonth;

CREATE TABLE `xxxxx_gift_record_byroom_bymonth` (
    `room_id`          BIGINT,
    `room_creator_uid` BIGINT,
    `gift_point`       BIGINT
)
PARTITIONED BY (
    pt_month STRING
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS
    INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
    'hdfs://emr-cluster/user/hive/warehouse/xxxxx_gift_record_byroom_bymonth';
-- Target table for the big-fans monthly job: gift_point per room and room creator,
-- partitioned by pt_month. Any existing table of the same name is dropped first.
DROP TABLE IF EXISTS xxxxx_gift_record_bigfans_byroom_bymonth;

CREATE TABLE `xxxxx_gift_record_bigfans_byroom_bymonth` (
    `room_id`          BIGINT,
    `room_creator_uid` BIGINT,
    `gift_point`       BIGINT
)
PARTITIONED BY (
    pt_month STRING
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS
    INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
    'hdfs://emr-cluster/user/hive/warehouse/xxxxx_gift_record_bigfans_byroom_bymonth';
2、简易并行跑批模板脚本
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData-wuyan-parallel.py
# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import threadpool
import time
warnings.filterwarnings("ignore")
def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate through endDate, inclusive.

    Args:
        beginDate: First day of the range, formatted as "%Y-%m-%d".
        endDate: Last day of the range, formatted as "%Y-%m-%d".

    Returns:
        A list of zero-padded "YYYY-MM-DD" strings in ascending order. The
        list is empty when beginDate is later than endDate.

    Raises:
        ValueError: If either argument cannot be parsed as "%Y-%m-%d".
    """
    day_format = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, day_format)
    last = datetime.datetime.strptime(endDate, day_format)
    one_day = datetime.timedelta(days=1)

    dates = []
    # Compare parsed dates, not raw strings: a lexicographic comparison
    # misorders inputs that are not zero-padded ("2018-1-9" > "2018-1-10").
    while current <= last:
        dates.append(current.strftime(day_format))
        current += one_day
    return dates
def monthRange(beginDate, endDate):
    """List the distinct months covered by a day range, in ascending order.

    Args:
        beginDate: First day of the range, passed through to dateRange.
        endDate: Last day of the range, passed through to dateRange.

    Returns:
        A sorted list of the unique 7-character prefixes ("YYYY-MM") of the
        day strings produced by dateRange(beginDate, endDate).
    """
    distinct_months = {day[0:7] for day in dateRange(beginDate, endDate)}
    return sorted(distinct_months)
def hiveRunData(pt_month):
    """Rebuild one pt_month partition of xxxxx_gift_record_bigfans_byroom_bymonth.

    Shells out to the Hive CLI to drop and re-add the partition, then fills it
    with gift_point summed per room_id / room_creator_uid from
    honeycomb_all_gift_record, inner-joined to fans_all_big_fans_relation_total
    (state=0) on room_id, uid and pt_day.

    Args:
        pt_month: Partition value substituted into the HiveQL text, e.g. one
            of the "YYYY-MM" strings returned by monthRange.

    Returns:
        None. A non-zero exit status from the Hive CLI is reported on stdout
        instead of being silently discarded; no exception is raised, so the
        remaining partitions still get processed.
    """
    # All-gifts-per-month variant (no big-fans join). Kept disabled; enable it
    # to populate xxxxx_gift_record_byroom_bymonth instead.
    # os.system("""/usr/lib/hive-current/bin/hive -e " \
    # alter table xxxxx_gift_record_byroom_bymonth drop if exists partition(pt_month='{pt_month}'); \
    # alter table xxxxx_gift_record_byroom_bymonth add partition(pt_month='{pt_month}') location '{pt_month}'; \
    # insert overwrite table xxxxx_gift_record_byroom_bymonth partition(pt_month='{pt_month}') \
    # select a1.room_id,a1.room_creator_uid,sum(a1.gift_point) gift_point \
    # from honeycomb_all_gift_record a1 \
    # where a1.pt_month='{pt_month}' \
    # group by a1.pt_month,a1.room_id,a1.room_creator_uid; \
    # " """.format(pt_month=pt_month));

    # Big-fans gifts per month. The continuation lines stay at column 0 so the
    # command text handed to the shell is unchanged.
    hive_command = """/usr/lib/hive-current/bin/hive -e " \
alter table xxxxx_gift_record_bigfans_byroom_bymonth drop if exists partition(pt_month='{pt_month}'); \
alter table xxxxx_gift_record_bigfans_byroom_bymonth add partition(pt_month='{pt_month}') location '{pt_month}'; \
insert overwrite table xxxxx_gift_record_bigfans_byroom_bymonth partition(pt_month='{pt_month}') \
select a1.room_id,a1.room_creator_uid,sum(a1.gift_point) gift_point \
from honeycomb_all_gift_record a1 \
inner join fans_all_big_fans_relation_total a2 on a1.room_id=a2.room_id and a1.uid=a2.uid and a1.pt_day=a2.pt_day \
where a1.pt_month='{pt_month}' and a2.state=0 \
group by a1.pt_month,a1.room_id,a1.room_creator_uid; \
" """.format(pt_month=pt_month)

    status = os.system(hive_command)
    if status != 0:
        # Surface the failure: otherwise a broken month goes unnoticed and the
        # downstream aggregation runs on a missing or empty partition.
        print("hive job failed for pt_month=%s (exit status %s)" % (pt_month, status))
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:", now_time
# parList = monthRange(beginDate='2015-07-01', endDate='2018-06-25')
parList = monthRange(beginDate='2017-12-01', endDate='2018-06-25')
requests = []
request_hiveRunData = threadpool.makeRequests(hiveRunData, parList)
requests.extend(request_hiveRunData)
main_pool = threadpool.ThreadPool(9)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(30)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:", now_time
3、并行批后的计算脚本
-- Post-batch step: rebuild xxwuy_history_empirical_value from the two monthly gift tables.
-- Score formula: his_empval = intimacy * 0.7 + coin * 4 + (gift_point_eff / 49) * 0.6, with NULL inputs counted as 0.
-- tab_intimacy_calc: intimacy summed per room_id and uid for rows created on or after 2017-12-01.
-- tab_gift_point_eff: per room and creator, all-gifts gift_point minus the big-fans gift_point of the same month.
-- tab_user_coin: non-negative coin summed per uid for rows created on or before 2018-05-31.
-- Users are taken from the 2018-05-31 snapshot of oss_bi_all_user_profile with state = 0 and last login on or after 2018-03-01.
-- NOTE(review): the room join matches tab_intimacy_calc.room_id against oss_bi_all_room.creator_uid - confirm this is the intended key.
-- NOTE(review): tab_gift_point_eff is grouped by room_id and room_creator_uid, so a creator with several rooms would produce several rows for one uid - confirm.
DROP TABLE IF EXISTS xxwuy_history_empirical_value;

CREATE TABLE xxwuy_history_empirical_value AS
WITH tab_intimacy_calc AS (
    SELECT
        room_id,
        uid,
        SUM(intimacy) AS intimacy
    FROM xxx_big_fans_intimacy_detail
    WHERE SUBSTR(created_time, 1, 10) >= '2017-12-01'
    GROUP BY
        room_id,
        uid
),
tab_gift_point_eff AS (
    SELECT
        g.room_id,
        g.room_creator_uid,
        SUM(g.gift_point - COALESCE(bf.gift_point, 0)) AS gift_point_eff
    FROM xxxxx_gift_record_byroom_bymonth AS g
    LEFT JOIN xxxxx_gift_record_bigfans_byroom_bymonth AS bf
        ON g.pt_month = bf.pt_month
        AND g.room_id = bf.room_id
        AND g.room_creator_uid = bf.room_creator_uid
    GROUP BY
        g.room_id,
        g.room_creator_uid
),
tab_user_coin AS (
    SELECT
        uid,
        SUM(coin) AS coin
    FROM xxx_user_coin_record
    WHERE coin >= 0
        AND SUBSTR(created_time, 1, 10) <= '2018-05-31'
    GROUP BY
        uid
)
SELECT
    u.uid,
    fans.intimacy,
    fans.big_fans_cnt,
    fans.intimacy_max,
    c.coin,
    ge.gift_point_eff,
    COALESCE(fans.intimacy, 0) * 0.7
        + COALESCE(c.coin, 0) * 4
        + (COALESCE(ge.gift_point_eff, 0) / 49) * 0.6 AS his_empval,
    gs.point_count
FROM oss_bi_all_user_profile AS u
LEFT JOIN (
    SELECT
        r.creator_uid,
        SUM(i.intimacy) AS intimacy,
        COUNT(i.uid) AS big_fans_cnt,
        MAX(i.intimacy) AS intimacy_max
    FROM tab_intimacy_calc AS i
    INNER JOIN oss_bi_all_room AS r
        ON i.room_id = r.creator_uid
    WHERE r.pt_day = '2018-05-31'
        AND r.state = 0
    GROUP BY
        r.creator_uid
) AS fans
    ON u.uid = fans.creator_uid
LEFT JOIN tab_user_coin AS c
    ON u.uid = c.uid
LEFT JOIN xxx_user_gift_stat AS gs
    ON u.uid = gs.uid
LEFT JOIN tab_gift_point_eff AS ge
    ON u.uid = ge.room_creator_uid
WHERE u.pt_day = '2018-05-31'
    AND u.state = 0
    AND u.last_login_time >= '2018-03-01 00:00:00';
附:另一个相似用法的更简洁脚本
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData-huangxt-parallel.py
# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import threadpool
import time
warnings.filterwarnings("ignore")
def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate through endDate, inclusive.

    Args:
        beginDate: First day of the range, formatted as "%Y-%m-%d".
        endDate: Last day of the range, formatted as "%Y-%m-%d".

    Returns:
        A list of zero-padded "YYYY-MM-DD" strings in ascending order. The
        list is empty when beginDate is later than endDate.

    Raises:
        ValueError: If either argument cannot be parsed as "%Y-%m-%d".
    """
    day_format = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, day_format)
    last = datetime.datetime.strptime(endDate, day_format)
    one_day = datetime.timedelta(days=1)

    dates = []
    # Compare parsed dates, not raw strings: a lexicographic comparison
    # misorders inputs that are not zero-padded ("2018-4-9" > "2018-4-10").
    while current <= last:
        dates.append(current.strftime(day_format))
        current += one_day
    return dates
def hiveRunData(pt_day):
    """Rebuild one pt_day partition of xxxxxx_accesslog_huangxt.

    Shells out to the Hive CLI to drop and re-add the partition, then fills it
    with pt_hour, requesturi, appkey, uid and identify from
    oss_bi_all_field_access_log for that day, keeping only rows whose
    requesturi matches '/information/%.htm'.

    Args:
        pt_day: Partition value substituted into the HiveQL text, e.g. one of
            the "YYYY-MM-DD" strings returned by dateRange.

    Returns:
        None. A non-zero exit status from the Hive CLI is reported on stdout
        instead of being silently discarded; no exception is raised, so the
        remaining partitions still get processed.
    """
    # The continuation lines stay at column 0 so the command text handed to
    # the shell is unchanged.
    hive_command = """/usr/lib/hive-current/bin/hive -e " \
alter table xxxxxx_accesslog_huangxt drop if exists partition(pt_day='{pt_day}'); \
alter table xxxxxx_accesslog_huangxt add partition(pt_day='{pt_day}') location '{pt_day}'; \
insert overwrite table xxxxxx_accesslog_huangxt partition(pt_day='{pt_day}') \
select pt_hour,requesturi,appkey,uid,identify \
from oss_bi_all_field_access_log \
where pt_day='{pt_day}' and requesturi like '/information/%.htm'; \
" """.format(pt_day=pt_day)

    status = os.system(hive_command)
    if status != 0:
        # Surface the failure so a day with a missing or empty partition does
        # not go unnoticed.
        print("hive job failed for pt_day=%s (exit status %s)" % (pt_day, status))
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:", now_time
parList = dateRange(beginDate='2018-04-25', endDate='2018-06-20')
requests = []
request_hiveRunData = threadpool.makeRequests(hiveRunData, parList)
requests.extend(request_hiveRunData)
main_pool = threadpool.ThreadPool(9)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(30)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:", now_time
注:并行脚本依赖第三方 threadpool 模块,运行前需先安装(例如 pip install threadpool),或者将 threadpool.py 拷贝到脚本所在的目录下。