Hive多参数跑批python一例

1、单独查询示例

-- Per-day stats for one room inside a fixed daily window (12:00 to 14:00):
-- minutes live, new subscribers and gift points.
-- tab_live clips every live session to the window and sums the overlap, so a
-- session that starts or ends exactly on a window boundary is counted once.
with tab_live as (
    select
        room_id,
        pt_day,
        sum(
            unix_timestamp(case when updated_time > win_end then win_end else updated_time end)
            - unix_timestamp(case when switch_time < win_start then win_start else switch_time end)
        ) / 60 as live_mins
    from (
        select
            room_id,
            pt_day,
            string(switch_time) as switch_time,
            string(updated_time) as updated_time,
            concat(pt_day, ' 12:00:00') as win_start,
            concat(pt_day, ' 14:00:00') as win_end
        from oss_bi_all_live_history_status
        where pt_day between '2019-04-01' and '2019-04-12'
          and room_id = 2246205
          and switch_time <= concat(pt_day, ' 14:00:00')
          and updated_time >= concat(pt_day, ' 12:00:00')
    ) x
    group by room_id, pt_day
),
-- New subscribers per day whose creation hour is 12 or 13, read from the
-- 2019-04-12 partition of the subscriber table.
tab_subscr as (
    select
        substr(created_time, 1, 10) as pt_day,
        room_id,
        count(uid) as new_fans_cnt
    from oss_bi_all_room_subscriber_roomid
    where pt_day = '2019-04-12'
      and substr(created_time, 1, 10) between '2019-04-01' and '2019-04-12'
      and room_id = 2246205
      and substr(created_time, 12, 2) between '12' and '13'
    group by substr(created_time, 1, 10), room_id
),
-- Gift points per day for gifts whose creation hour is 12 or 13.
tab_gift as (
    select
        pt_day,
        room_id,
        sum(gift_point) as gift_point
    from honeycomb_all_gift_record
    where pt_day between '2019-04-01' and '2019-04-12'
      and room_id = 2246205
      and substr(created_time, 12, 2) between '12' and '13'
    group by pt_day, room_id
)
select
    a1.pt_day,
    a1.room_id,
    a1.live_mins,
    a2.new_fans_cnt,
    a3.gift_point
from tab_live a1
left join tab_subscr a2
    on a1.room_id = a2.room_id and a1.pt_day = a2.pt_day
left join tab_gift a3
    on a1.room_id = a3.room_id and a1.pt_day = a3.pt_day;

2、批量查询建目标表

-- Result table of the batch job: one partition per room, one row per day.
-- live_mins is DOUBLE because the loading query computes sum(seconds)/60,
-- which is fractional and would be truncated by a BIGINT column.
drop table if exists xxxxx_wangss_bat;
CREATE TABLE `xxxxx_wangss_bat`(
  `live_mins` double COMMENT 'minutes live inside the hour window (fractional)', 
  `new_fans_cnt` bigint COMMENT 'new subscribers inside the hour window', 
  `gift_point` bigint COMMENT 'gift points received inside the hour window', 
  `pt_day` string COMMENT 'calendar day, yyyy-MM-dd', 
  `timegange` string COMMENT 'hour window formatted as start_hour~end_hour')
COMMENT 'per-room, per-day stats inside a daily hour window'
PARTITIONED BY ( 
  room_id string)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://emr-cluster/user/hive/warehouse/xxxxx_wangss_bat';

3、数据批量跑批的脚本

# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import threadpool
import time

warnings.filterwarnings("ignore")


def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate to endDate, both inclusive.

    Args:
        beginDate: First day, as a ``%Y-%m-%d`` string.
        endDate: Last day, as a ``%Y-%m-%d`` string.

    Returns:
        A list of zero-padded ``YYYY-MM-DD`` strings in ascending order; empty
        when beginDate is later than endDate.

    Raises:
        ValueError: If either argument does not parse as ``%Y-%m-%d``.
    """
    day_format = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, day_format)
    last = datetime.datetime.strptime(endDate, day_format)
    one_day = datetime.timedelta(days=1)

    dates = []
    # Compare parsed dates rather than strings: a string comparison only works
    # for zero-padded input and overruns badly for e.g. '2019-4-3'.
    while current <= last:
        dates.append(current.strftime(day_format))
        current += one_day
    return dates


def monthRange(beginDate, endDate):
    """Return the distinct months touched by the days beginDate..endDate.

    Each month is the first seven characters (``YYYY-MM``) of a day produced
    by ``dateRange``; the result is a list sorted in ascending order.
    """
    months = {day[:7] for day in dateRange(beginDate, endDate)}
    return sorted(months)


def hiveRunData(room_id,start_date,end_date,start_hour,end_hour,before_end_hour):
    """Compute per-day stats for one room and hour window and load them into Hive.

    Drops and re-adds the ``room_id`` partition of ``xxxxx_wangss_bat`` and then
    overwrites it with one row per ``pt_day``: minutes live inside the window,
    new subscribers and gift points.

    Args:
        room_id: Room id; used as the partition value and as the room filter.
        start_date: First ``pt_day`` (``YYYY-MM-DD``), inclusive.
        end_date: Last ``pt_day`` (``YYYY-MM-DD``), inclusive; also the
            subscriber-table partition that is read.
        start_hour: Two-digit hour at which the daily window opens.
        end_hour: Two-digit hour at which the daily window closes.
        before_end_hour: Two-digit upper bound of the inclusive hour filter on
            subscriptions and gifts; callers pass ``end_hour - 1``.

    Returns:
        The exit status of the ``hive`` process (0 on success).
    """
    # Imported locally so the block does not rely on the file-level imports.
    import subprocess
    import sys

    # NOTE: the arguments are substituted into the SQL text as-is, so they must
    # come from trusted configuration, never from user input.
    #
    # tab_live clips each live session to the daily window and sums the overlap
    # in one scan. The former four-branch "union all" used closed intervals on
    # both sides and counted sessions touching a window boundary twice.
    sql = """
alter table xxxxx_wangss_bat drop if exists partition(room_id='{room_id}');
alter table xxxxx_wangss_bat add partition(room_id='{room_id}') location '{room_id}';
with tab_live as(
select room_id,pt_day,
       sum(unix_timestamp(case when updated_time > win_end then win_end else updated_time end)
           - unix_timestamp(case when switch_time < win_start then win_start else switch_time end))/60 live_mins
from (
select room_id,pt_day,string(switch_time) switch_time,string(updated_time) updated_time,
       concat(pt_day,' {start_hour}:00:00') win_start,concat(pt_day,' {end_hour}:00:00') win_end
from oss_bi_all_live_history_status
where pt_day between '{start_date}' and '{end_date}' and room_id={room_id}
  and switch_time <= concat(pt_day,' {end_hour}:00:00')
  and updated_time >= concat(pt_day,' {start_hour}:00:00'))x
group by room_id,pt_day),
tab_subscr as(
select substr(created_time,1,10) pt_day,room_id,count(uid) new_fans_cnt
from oss_bi_all_room_subscriber_roomid
where pt_day='{end_date}'
  and (substr(created_time,1,10) between '{start_date}' and '{end_date}') and room_id={room_id}
  and (substr(created_time,12,2) between '{start_hour}' and '{before_end_hour}')
group by substr(created_time,1,10),room_id),
tab_gift as(
select pt_day,room_id,sum(gift_point) gift_point
from honeycomb_all_gift_record
where pt_day between '{start_date}' and '{end_date}' and room_id={room_id}
  and (substr(created_time,12,2) between '{start_hour}' and '{before_end_hour}')
group by pt_day,room_id)
insert overwrite table xxxxx_wangss_bat partition(room_id='{room_id}')
select a1.live_mins,a2.new_fans_cnt,a3.gift_point,a1.pt_day,concat('{start_hour}','~','{end_hour}')
from tab_live a1
left join tab_subscr a2 on a1.room_id=a2.room_id and a1.pt_day=a2.pt_day
left join tab_gift a3 on a1.room_id=a3.room_id and a1.pt_day=a3.pt_day;
""".format(room_id=room_id,start_date=start_date,end_date=end_date,
           start_hour=start_hour,end_hour=end_hour,before_end_hour=before_end_hour)

    # An argument list (no shell) avoids any shell quoting of the SQL text.
    status = subprocess.call(["/usr/lib/hive-current/bin/hive", "-e", sql])
    if status != 0:
        # Surface failures instead of silently dropping the exit status.
        sys.stderr.write("hive job failed for room_id=%s (exit status %s)\n" % (room_id, status))
    return status


# Log the wall-clock time at which the batch starts.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是: %s" % now_time)

# One entry per job, in the (positional_args, keyword_args) form that is handed
# to threadpool.makeRequests. The positional arguments follow hiveRunData:
# room_id, start_date, end_date, start_hour, end_hour, before_end_hour.
# before_end_hour is always end_hour - 1 (the last full hour of the window).
parList = [
    (['2246205', '2019-04-01', '2019-04-12', '12', '14', '13'], None),
    (['68241549', '2019-04-01', '2019-04-12', '20', '22', '21'], None),
    (['22206458', '2019-04-01', '2019-04-13', '16', '18', '17'], None),
    (['61470010', '2019-04-01', '2019-04-13', '20', '22', '21'], None),
    (['32015087', '2019-04-01', '2019-04-14', '12', '14', '13'], None),
    (['31996', '2019-04-01', '2019-04-14', '10', '12', '11'], None),
    (['3761631', '2019-04-01', '2019-04-15', '08', '10', '09'], None),
    # Was '17', which widened the fans/gift hour filter to 14..17 for a 14~16 window.
    (['46919224', '2019-04-01', '2019-04-15', '14', '16', '15'], None),
]

# Build one work request per parList entry and queue them all on a pool of
# nine worker threads.
requests = []
request_hiveRunData = threadpool.makeRequests(hiveRunData, parList)
requests.extend(request_hiveRunData)
main_pool = threadpool.ThreadPool(9)
for req in requests:
    main_pool.putRequest(req)

if __name__ == '__main__':
    # Poll the pool every 30 seconds; stop when it reports that no results are
    # pending any more, or when the user interrupts the script.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

# Log the wall-clock time at which the batch ends.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是: %s" % now_time)

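说明:
1-注意脚本中多个参数的传入方式:parList 的每个元素是一个 (位置参数列表, None) 元组,threadpool.makeRequests 会把列表中的各项依次作为位置参数传给 hiveRunData;
2-注意播放历史表(oss_bi_all_live_history_status)按时段取数的方式:根据每条记录的开始时间(switch_time)、结束时间(updated_time)与目标时段的相对位置,只统计落在该时段内的时长。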

你可能感兴趣的:(Python,Solution)