Hive并行跑批简易一例

1、目标表建表脚本
-- Per-room gift points aggregated by month, one partition per pt_month value.
-- The table is dropped and recreated so the script can be re-run from scratch.
DROP TABLE IF EXISTS `xxxxx_gift_record_byroom_bymonth`;
CREATE TABLE `xxxxx_gift_record_byroom_bymonth` (
    `room_id`          BIGINT,
    `room_creator_uid` BIGINT,
    `gift_point`       BIGINT
)
PARTITIONED BY (`pt_month` STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS
    INPUTFORMAT  'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://emr-cluster/user/hive/warehouse/xxxxx_gift_record_byroom_bymonth';

-- Per-room gift points coming from big fans only, aggregated by month.
-- Same layout as xxxxx_gift_record_byroom_bymonth, one partition per pt_month value.
-- The table is dropped and recreated so the script can be re-run from scratch.
DROP TABLE IF EXISTS `xxxxx_gift_record_bigfans_byroom_bymonth`;
CREATE TABLE `xxxxx_gift_record_bigfans_byroom_bymonth` (
    `room_id`          BIGINT,
    `room_creator_uid` BIGINT,
    `gift_point`       BIGINT
)
PARTITIONED BY (`pt_month` STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS
    INPUTFORMAT  'org.apache.hadoop.mapred.TextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 'hdfs://emr-cluster/user/hive/warehouse/xxxxx_gift_record_bigfans_byroom_bymonth';
2、简易并行跑批模板脚本
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData-wuyan-parallel.py
# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import threadpool
import time

warnings.filterwarnings("ignore")


def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate to endDate, both inclusive.

    Both bounds are parsed with the ``%Y-%m-%d`` format and compared as
    datetimes, so inputs that are not zero-padded (e.g. ``2018-4-29``) are
    handled correctly instead of being compared as raw strings.

    :param beginDate: first day, as a ``YYYY-MM-DD`` string.
    :param endDate: last day, as a ``YYYY-MM-DD`` string.
    :return: list of zero-padded ``YYYY-MM-DD`` strings in ascending order;
             empty when beginDate is later than endDate.
    :raises ValueError: if either bound does not match ``%Y-%m-%d``.
    """
    day_format = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, day_format)
    last = datetime.datetime.strptime(endDate, day_format)
    one_day = datetime.timedelta(days=1)

    dates = []
    while current <= last:
        # Always emit the normalised form, never the caller's raw string.
        dates.append(current.strftime(day_format))
        current += one_day
    return dates


def monthRange(beginDate, endDate):
    """Return the distinct months touched by the inclusive day range.

    :param beginDate: first day, as a ``YYYY-MM-DD`` string.
    :param endDate: last day, as a ``YYYY-MM-DD`` string.
    :return: ascending list of ``YYYY-MM`` strings, one per month that has
             at least one day inside the range.
    """
    # The first seven characters of a YYYY-MM-DD string are its YYYY-MM part.
    months = {day[:7] for day in dateRange(beginDate, endDate)}
    return sorted(months)


def hiveRunData(pt_month):
    """Rebuild one monthly partition of the big-fans gift table via the Hive CLI.

    The partition is dropped, re-added and then overwritten with gift points
    summed per (room_id, room_creator_uid) for the given month.

    :param pt_month: partition value substituted into the Hive statements;
                     the driver script passes ``YYYY-MM`` strings.
    :return: the status returned by ``os.system`` (0 when the command
             succeeded), so callers can detect failed months.
    """
    # Monthly totals over ALL received gifts (fills xxxxx_gift_record_byroom_bymonth).
    # Left disabled here; swap it with the active job below to populate that table.
    # os.system("""/usr/lib/hive-current/bin/hive -e " \
    #         alter table xxxxx_gift_record_byroom_bymonth drop if exists partition(pt_month='{pt_month}'); \
    #         alter table xxxxx_gift_record_byroom_bymonth add partition(pt_month='{pt_month}') location '{pt_month}'; \
    #         insert overwrite table xxxxx_gift_record_byroom_bymonth partition(pt_month='{pt_month}') \
    #         select a1.room_id,a1.room_creator_uid,sum(a1.gift_point) gift_point \
    #         from honeycomb_all_gift_record a1 \
    #         where a1.pt_month='{pt_month}' \
    #         group by a1.pt_month,a1.room_id,a1.room_creator_uid; \
    #         " """.format(pt_month=pt_month));

    # Monthly totals of gifts sent by big fans only.
    hive_command = """/usr/lib/hive-current/bin/hive -e " \
            alter table xxxxx_gift_record_bigfans_byroom_bymonth drop if exists partition(pt_month='{pt_month}'); \
            alter table xxxxx_gift_record_bigfans_byroom_bymonth add partition(pt_month='{pt_month}') location '{pt_month}'; \
            insert overwrite table xxxxx_gift_record_bigfans_byroom_bymonth partition(pt_month='{pt_month}') \
            select a1.room_id,a1.room_creator_uid,sum(a1.gift_point) gift_point \
            from honeycomb_all_gift_record a1 \
            inner join fans_all_big_fans_relation_total a2 on a1.room_id=a2.room_id and a1.uid=a2.uid and a1.pt_day=a2.pt_day \
            where a1.pt_month='{pt_month}' and a2.state=0 \
            group by a1.pt_month,a1.room_id,a1.room_creator_uid; \
            " """.format(pt_month=pt_month)

    status = os.system(hive_command)
    if status != 0:
        # Previously the status was discarded, so a failed month went unnoticed.
        print("hive job FAILED for pt_month=%s (os.system status %s)" % (pt_month, status))
    return status


# Print the wall-clock time before the batch starts.
# A single-argument print call gives the same output as the old Python 2
# print statement and also parses under Python 3.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是: " + now_time)

# One work item per month; the wider historical range is kept for reference.
# parList = monthRange(beginDate='2015-07-01', endDate='2018-06-25')
parList = monthRange(beginDate='2017-12-01', endDate='2018-06-25')

# Build one threadpool request per month and queue them on a 9-worker pool.
# NOTE(review): this runs at import time, outside the __main__ guard below.
requests = []
request_hiveRunData = threadpool.makeRequests(hiveRunData, parList)
requests.extend(request_hiveRunData)
main_pool = threadpool.ThreadPool(9)
for req in requests:
    main_pool.putRequest(req)

if __name__ == '__main__':
    # Poll every 30 seconds until the pool reports that nothing is pending;
    # the sleep keeps the main thread responsive to Ctrl-C.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    # Only relevant if workers were dismissed; nothing visible in this script does so.
    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

# Print the wall-clock time again once the batch has finished.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是: " + now_time)
3、并行跑批完成后的计算脚本
-- Rebuild the per-user historical empirical value table from scratch.
DROP TABLE IF EXISTS xxwuy_history_empirical_value;
CREATE TABLE xxwuy_history_empirical_value AS
WITH
-- Intimacy summed per (room_id, uid) over detail rows dated 2017-12-01 or later.
tab_intimacy_calc AS (
    SELECT
        room_id,
        uid,
        SUM(intimacy) AS intimacy
    FROM xxx_big_fans_intimacy_detail
    WHERE SUBSTR(created_time, 1, 10) >= '2017-12-01'
    GROUP BY room_id, uid
),
-- Per (room_id, room_creator_uid): all gift points minus the big-fans gift points
-- of the same month, where a month without a big-fans row counts as 0.
tab_gift_point_eff AS (
    SELECT
        g.room_id,
        g.room_creator_uid,
        SUM(g.gift_point - COALESCE(bf.gift_point, 0)) AS gift_point_eff
    FROM xxxxx_gift_record_byroom_bymonth AS g
    LEFT JOIN xxxxx_gift_record_bigfans_byroom_bymonth AS bf
        ON  g.pt_month = bf.pt_month
        AND g.room_id = bf.room_id
        AND g.room_creator_uid = bf.room_creator_uid
    GROUP BY g.room_id, g.room_creator_uid
),
-- Non-negative coin records summed per uid, up to and including 2018-05-31.
tab_user_coin AS (
    SELECT
        uid,
        SUM(coin) AS coin
    FROM xxx_user_coin_record
    WHERE coin >= 0
      AND SUBSTR(created_time, 1, 10) <= '2018-05-31'
    GROUP BY uid
),
-- Per room creator: total intimacy, number of fan rows and the largest single
-- intimacy, restricted to rooms in the 2018-05-31 snapshot with state = 0.
-- NOTE(review): room_id is joined to creator_uid here - confirm this is intended.
tab_creator_intimacy AS (
    SELECT
        r.creator_uid,
        SUM(i.intimacy) AS intimacy,
        COUNT(i.uid)    AS big_fans_cnt,
        MAX(i.intimacy) AS intimacy_max
    FROM tab_intimacy_calc AS i
    INNER JOIN oss_bi_all_room AS r
        ON i.room_id = r.creator_uid
    WHERE r.pt_day = '2018-05-31'
      AND r.state = 0
    GROUP BY r.creator_uid
)
-- One row per profile in the 2018-05-31 snapshot with state = 0 and a login on or
-- after 2018-03-01. The weights 0.7, 4 and 0.6 and the divisor 49 are taken as-is.
-- NOTE(review): tab_gift_point_eff is keyed by (room_id, room_creator_uid) but is
-- joined on room_creator_uid only - a creator with several rooms would yield
-- several rows. TODO confirm one room per creator.
SELECT
    u.uid,
    ci.intimacy,
    ci.big_fans_cnt,
    ci.intimacy_max,
    uc.coin,
    gp.gift_point_eff,
    COALESCE(ci.intimacy, 0) * 0.7
        + COALESCE(uc.coin, 0) * 4
        + (COALESCE(gp.gift_point_eff, 0) / 49) * 0.6 AS his_empval,
    gs.point_count
FROM oss_bi_all_user_profile AS u
LEFT JOIN tab_creator_intimacy AS ci
    ON u.uid = ci.creator_uid
LEFT JOIN tab_user_coin AS uc
    ON u.uid = uc.uid
LEFT JOIN xxx_user_gift_stat AS gs
    ON u.uid = gs.uid
LEFT JOIN tab_gift_point_eff AS gp
    ON u.uid = gp.room_creator_uid
WHERE u.pt_day = '2018-05-31'
  AND u.state = 0
  AND u.last_login_time >= '2018-03-01 00:00:00';
附:另一个相似用法的更简洁脚本
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData-huangxt-parallel.py
# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import threadpool
import time

warnings.filterwarnings("ignore")


def dateRange(beginDate, endDate):
    """List each day between beginDate and endDate, both ends included.

    The bounds are parsed with ``%Y-%m-%d`` and compared as datetimes rather
    than as raw strings, so a bound written without zero padding
    (e.g. ``2018-4-25``) still produces the right range.

    :param beginDate: first day, as a ``YYYY-MM-DD`` string.
    :param endDate: last day, as a ``YYYY-MM-DD`` string.
    :return: ascending list of zero-padded ``YYYY-MM-DD`` strings; empty
             when beginDate falls after endDate.
    :raises ValueError: if either bound does not match ``%Y-%m-%d``.
    """
    fmt = "%Y-%m-%d"
    first = datetime.datetime.strptime(beginDate, fmt)
    last = datetime.datetime.strptime(endDate, fmt)

    # Number of days to emit; zero or negative means an empty range.
    span = (last - first).days + 1
    return [
        (first + datetime.timedelta(days=offset)).strftime(fmt)
        for offset in range(span)
    ]


def hiveRunData(pt_day):
    """Rebuild one daily partition of xxxxxx_accesslog_huangxt via the Hive CLI.

    The partition is dropped, re-added and then overwritten with the access-log
    rows of that day whose requesturi matches ``/information/%.htm``.

    :param pt_day: partition value substituted into the Hive statements;
                   the driver script passes ``YYYY-MM-DD`` strings.
    :return: the status returned by ``os.system`` (0 when the command
             succeeded), so callers can detect failed days.
    """
    hive_command = """/usr/lib/hive-current/bin/hive -e " \
            alter table xxxxxx_accesslog_huangxt drop if exists partition(pt_day='{pt_day}'); \
            alter table xxxxxx_accesslog_huangxt add partition(pt_day='{pt_day}') location '{pt_day}'; \
            insert overwrite table xxxxxx_accesslog_huangxt partition(pt_day='{pt_day}') \
            select pt_hour,requesturi,appkey,uid,identify \
            from oss_bi_all_field_access_log \
            where pt_day='{pt_day}' and requesturi like '/information/%.htm'; \
            " """.format(pt_day=pt_day)

    status = os.system(hive_command)
    if status != 0:
        # Previously the status was discarded, so a failed day went unnoticed.
        print("hive job FAILED for pt_day=%s (os.system status %s)" % (pt_day, status))
    return status


# Print the wall-clock time before the batch starts.
# A single-argument print call gives the same output as the old Python 2
# print statement and also parses under Python 3.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是: " + now_time)

# One work item per day in the inclusive range.
parList = dateRange(beginDate='2018-04-25', endDate='2018-06-20')

# Build one threadpool request per day and queue them on a 9-worker pool.
# NOTE(review): this runs at import time, outside the __main__ guard below.
requests = []
request_hiveRunData = threadpool.makeRequests(hiveRunData, parList)
requests.extend(request_hiveRunData)
main_pool = threadpool.ThreadPool(9)
for req in requests:
    main_pool.putRequest(req)

if __name__ == '__main__':
    # Poll every 30 seconds until the pool reports that nothing is pending;
    # the sleep keeps the main thread responsive to Ctrl-C.
    while True:
        try:
            time.sleep(30)
            main_pool.poll()
        except KeyboardInterrupt:
            print("**** Interrupted!")
            break
        except threadpool.NoResultsPending:
            break

    # Only relevant if workers were dismissed; nothing visible in this script does so.
    if main_pool.dismissedWorkers:
        print("Joining all dismissed worker threads...")
        main_pool.joinAllDismissedWorkers()

# Print the wall-clock time again once the batch has finished.
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print("当前时间是: " + now_time)
注:脚本依赖第三方的 threadpool 模块,运行前需先安装(如 pip install threadpool),或者将 threadpool.py 拷贝到与脚本相同的目录下。

你可能感兴趣的:(Python,Solution)