1、单独查询示例
-- Per-day statistics for one room inside a fixed daily window (12:00-14:00):
-- minutes on air, new subscribers and gift points.
with tab_live as (
    -- Minutes on air inside the window.
    -- Every live session that touches the window is read ONCE and its start/end
    -- are clipped to the window edges. The previous four-branch UNION ALL used
    -- inclusive BETWEEN together with <= / >=, so a session starting exactly at
    -- 12:00:00 or ending exactly at 14:00:00 matched two branches and was
    -- counted twice.
    select
        room_id,
        pt_day,
        sum(unix_timestamp(updated_time) - unix_timestamp(switch_time)) / 60 as live_mins
    from (
        select
            room_id,
            pt_day,
            -- clip the session start to the window start
            case
                when switch_time <= concat(pt_day, ' 12:00:00') then concat(pt_day, ' 12:00:00')
                else string(switch_time)
            end as switch_time,
            -- clip the session end to the window end
            case
                when updated_time >= concat(pt_day, ' 14:00:00') then concat(pt_day, ' 14:00:00')
                else string(updated_time)
            end as updated_time
        from oss_bi_all_live_history_status
        where pt_day between '2019-04-01' and '2019-04-12'
          and room_id = 2246205
          -- the session overlaps (or touches) the window
          and switch_time <= concat(pt_day, ' 14:00:00')
          and updated_time >= concat(pt_day, ' 12:00:00')
    ) x
    group by room_id, pt_day
),
tab_subscr as (
    -- New subscribers whose created_time hour is 12 or 13, grouped by creation day.
    -- NOTE(review): only partition pt_day='2019-04-12' is read; presumably the
    -- table is a daily full snapshot -- confirm.
    select
        substr(created_time, 1, 10) as pt_day,
        room_id,
        count(uid) as new_fans_cnt
    from oss_bi_all_room_subscriber_roomid
    where pt_day = '2019-04-12'
      and (substr(created_time, 1, 10) between '2019-04-01' and '2019-04-12')
      and room_id = 2246205
      and (substr(created_time, 12, 2) between '12' and '13')
    group by substr(created_time, 1, 10), room_id
),
tab_gift as (
    -- Gift points received while the created_time hour is 12 or 13.
    select
        pt_day,
        room_id,
        sum(gift_point) as gift_point
    from honeycomb_all_gift_record
    where pt_day between '2019-04-01' and '2019-04-12'
      and room_id = 2246205
      and (substr(created_time, 12, 2) between '12' and '13')
    group by pt_day, room_id
)
-- Days without any live session in the window are not returned, because
-- tab_live drives the left joins.
select
    a1.pt_day,
    a1.room_id,
    a1.live_mins,
    a2.new_fans_cnt,
    a3.gift_point
from tab_live a1
left join tab_subscr a2
    on a1.room_id = a2.room_id and a1.pt_day = a2.pt_day
left join tab_gift a3
    on a1.room_id = a3.room_id and a1.pt_day = a3.pt_day;
2、批量查询建目标表
-- Rebuild the target table of the batch job from scratch.
-- One partition per room_id; each row holds the statistics of one pt_day.
DROP TABLE IF EXISTS xxxxx_wangss_bat;

CREATE TABLE `xxxxx_wangss_bat` (
    -- NOTE(review): the loader computes live_mins as seconds / 60; with a bigint
    -- column fractional minutes are presumably lost on insert -- confirm.
    `live_mins`    bigint,
    `new_fans_cnt` bigint,
    `gift_point`   bigint,
    `pt_day`       string,
    -- filled by the loader with '<start_hour>~<end_hour>'
    `timegange`    string
)
PARTITIONED BY (
    room_id string
)
ROW FORMAT SERDE
    'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS
    INPUTFORMAT  'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
    'hdfs://emr-cluster/user/hive/warehouse/xxxxx_wangss_bat';
3、数据批量跑批的脚本
# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import threadpool
import time
warnings.filterwarnings("ignore")
def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate to endDate, both inclusive.

    Args:
        beginDate: first day, formatted as "%Y-%m-%d".
        endDate: last day, formatted as "%Y-%m-%d".

    Returns:
        A list of zero-padded "YYYY-MM-DD" strings in ascending order; empty
        when beginDate is later than endDate.

    Raises:
        ValueError: if either argument does not match "%Y-%m-%d".
    """
    # Compare parsed dates, not strings: a string comparison only works for
    # zero-padded input and runs far past the end date for e.g. '2019-4-1'.
    day = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    last_day = datetime.datetime.strptime(endDate, "%Y-%m-%d")
    one_day = datetime.timedelta(days=1)

    dates = []
    while day <= last_day:
        dates.append(day.strftime("%Y-%m-%d"))
        day += one_day
    return dates
def monthRange(beginDate, endDate):
    """Return the distinct months touched by the inclusive day range.

    Args:
        beginDate: first day, formatted as "%Y-%m-%d".
        endDate: last day, formatted as "%Y-%m-%d".

    Returns:
        A sorted list of "YYYY-MM" prefixes taken from the days produced by
        dateRange(beginDate, endDate).
    """
    # The first seven characters of each day string are its year and month.
    months = {day[0:7] for day in dateRange(beginDate, endDate)}
    return sorted(months)
def hiveRunData(room_id,start_date,end_date,start_hour,end_hour,before_end_hour):
    """Rebuild one room's partition of xxxxx_wangss_bat through the Hive CLI.

    The generated script drops and re-adds the room's partition, then inserts
    one row per pt_day with the minutes on air inside the daily window
    [start_hour:00:00, end_hour:00:00], the new subscriber count and the gift
    points for hours start_hour..before_end_hour.

    Args:
        room_id: room identifier; used as the partition value and as a numeric
            literal in the room_id filters.
        start_date: first pt_day, 'YYYY-MM-DD'.
        end_date: last pt_day, 'YYYY-MM-DD'; also the single partition read
            from the subscriber table.
        start_hour: two-digit hour at which the daily window opens.
        end_hour: two-digit hour at which the daily window closes.
        before_end_hour: two-digit last hour included by the subscriber and
            gift filters (callers pass end_hour - 1).

    Returns:
        The exit status of the hive process, or None when the hive binary
        could not be started.
    """
    import subprocess

    # Each live session is read once and clipped to the window with CASE.
    # The earlier four-branch UNION ALL mixed inclusive BETWEEN with <= / >=,
    # so a session starting exactly on the window start (or ending exactly on
    # the window end) matched two branches and was counted twice.
    hive_sql = """
        alter table xxxxx_wangss_bat drop if exists partition(room_id='{room_id}');
        alter table xxxxx_wangss_bat add partition(room_id='{room_id}') location '{room_id}';
        with tab_live as(
        select room_id,pt_day,sum(unix_timestamp(updated_time)-unix_timestamp(switch_time))/60 live_mins
        from (
        select room_id,pt_day,
        case when switch_time <= concat(pt_day,' {start_hour}:00:00') then concat(pt_day,' {start_hour}:00:00') else string(switch_time) end switch_time,
        case when updated_time >= concat(pt_day,' {end_hour}:00:00') then concat(pt_day,' {end_hour}:00:00') else string(updated_time) end updated_time
        from oss_bi_all_live_history_status
        where pt_day between '{start_date}' and '{end_date}' and room_id={room_id}
        and switch_time <= concat(pt_day,' {end_hour}:00:00')
        and updated_time >= concat(pt_day,' {start_hour}:00:00'))x
        group by room_id,pt_day),
        tab_subscr as(
        select substr(created_time,1,10) pt_day,room_id,count(uid) new_fans_cnt
        from oss_bi_all_room_subscriber_roomid
        where pt_day='{end_date}'
        and (substr(created_time,1,10) between '{start_date}' and '{end_date}') and room_id={room_id}
        and (substr(created_time,12,2) between '{start_hour}' and '{before_end_hour}')
        group by substr(created_time,1,10),room_id),
        tab_gift as(
        select pt_day,room_id,sum(gift_point) gift_point
        from honeycomb_all_gift_record
        where pt_day between '{start_date}' and '{end_date}' and room_id={room_id}
        and (substr(created_time,12,2) between '{start_hour}' and '{before_end_hour}')
        group by pt_day,room_id)
        insert overwrite table xxxxx_wangss_bat partition(room_id='{room_id}')
        select a1.live_mins,a2.new_fans_cnt,a3.gift_point,a1.pt_day,concat('{start_hour}','~','{end_hour}')
        from tab_live a1
        left join tab_subscr a2 on a1.room_id=a2.room_id and a1.pt_day=a2.pt_day
        left join tab_gift a3 on a1.room_id=a3.room_id and a1.pt_day=a3.pt_day;
    """.format(room_id=room_id,start_date=start_date,end_date=end_date,
               start_hour=start_hour,end_hour=end_hour,before_end_hour=before_end_hour)

    # Send the script as a single line, as before. No string literal in the
    # template contains consecutive whitespace, so collapsing runs is safe.
    hive_sql = " ".join(hive_sql.split())

    # An argument list bypasses the shell, so nothing in the script is subject
    # to shell quoting or expansion.
    try:
        exit_code = subprocess.call(["/usr/lib/hive-current/bin/hive", "-e", hive_sql])
    except OSError as err:
        print("hive could not be started for room_id=%s: %s" % (room_id, err))
        return None

    # Report failures instead of silently discarding the exit status.
    if exit_code != 0:
        print("hive exited with status %s for room_id=%s" % (exit_code, room_id))
    return exit_code
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:", now_time
parList = [(['2246205', '2019-04-01', '2019-04-12','12','14','13'], None), (['68241549', '2019-04-01', '2019-04-12','20','22','21'], None),
(['22206458', '2019-04-01', '2019-04-13','16','18','17'], None), (['61470010', '2019-04-01', '2019-04-13','20','22','21'], None),
(['32015087', '2019-04-01', '2019-04-14','12','14','13'], None), (['31996', '2019-04-01', '2019-04-14','10','12','11'], None),
(['3761631', '2019-04-01', '2019-04-15','08','10','09'], None), (['46919224', '2019-04-01', '2019-04-15','14','16','17'], None)]
# parList = monthRange(beginDate='2017-{start_hour}-01', endDate='2018-06-25')
requests = []
request_hiveRunData = threadpool.makeRequests(hiveRunData, parList)
requests.extend(request_hiveRunData)
main_pool = threadpool.ThreadPool(9)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(30)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:", now_time
说明:
1-脚本中注意多个参数的传入方式;
2-注意直播历史表(oss_bi_all_live_history_status)分时段数据的取数方式:需先把每条开播记录的起止时间截断到所选时段的边界内,再累加时长。