用Python进行Hive里分区表数据跑批模板

1、目标分区表的创建
-- Target table for the room-heartbeat aggregation, partitioned by day (pt_day).
-- WARNING: the drop below discards any existing table of the same name.
-- hiveRunData (in the Python script further down) fills one pt_day partition
-- per run from oss_bi_all_room_heartbeat_log:
--   datehour       <- substr(date_time,1,13)
--   uid            <- parms['uid']
--   roomid         <- parms['roomId']
--   roomcreatoruid <- parms['roomCreatorUid']
--   staytime       <- count(1) of source rows in each group
-- Every column is declared string, so the staytime count is stored as text.
drop table if exists xxxxxx_fans_view;
create table xxxxxx_fans_view(
  datehour string, 
  uid string, 
  roomid string, 
  roomcreatoruid string,
  staytime string)
partitioned by ( 
  pt_day string);

-- Target table for the login-log aggregation, partitioned by day (pt_day).
-- WARNING: the drop below discards any existing table of the same name.
-- hiveRunData (in the Python script further down) fills one pt_day partition
-- per run from oss_bi_all_login_log:
--   datehour <- substr(date_time,1,13)
--   uid      <- parms['uid']
--   logincnt <- count(*) of source rows in each group
-- Every column is declared string, so the logincnt count is stored as text.
drop table if exists xxxxxx_fans_login;
create table xxxxxx_fans_login(
  datehour string, 
  uid string, 
  logincnt string)
partitioned by ( 
  pt_day string);
2、进行跑批的Python脚本
/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData.py
# -*- coding=utf-8 -*-
import os
import re
import datetime
import warnings

warnings.filterwarnings("ignore")

def dateRange(beginDate, endDate):
    """Return every calendar day from beginDate through endDate, inclusive.

    Args:
        beginDate: First day of the range, as a "%Y-%m-%d" string.
        endDate: Last day of the range, as a "%Y-%m-%d" string.

    Returns:
        A list of zero-padded "%Y-%m-%d" strings in ascending order. The list
        is empty when endDate is earlier than beginDate.

    Raises:
        ValueError: If either argument cannot be parsed as "%Y-%m-%d".
    """
    dateFormat = "%Y-%m-%d"
    current = datetime.datetime.strptime(beginDate, dateFormat)
    last = datetime.datetime.strptime(endDate, dateFormat)
    oneDay = datetime.timedelta(days=1)

    dates = []
    # Compare parsed dates rather than raw strings: a lexicographic comparison
    # only works for zero-padded input (e.g. an endDate of "2018-1-2" would
    # make the string-based loop overrun by months).
    while current <= last:
        # Always emit the normalised form so every element, including the
        # first one, is zero-padded.
        dates.append(current.strftime(dateFormat))
        current += oneDay
    return dates

def hiveRunData(pt_day):
    """Rebuild the pt_day partition of both fan tables via the hive CLI.

    Two `hive -e` invocations are run one after the other through the shell.
    Each one drops the pt_day partition of its target table, adds it back and
    then fills it with `insert overwrite`:

    * xxxxxx_fans_view  <- oss_bi_all_room_heartbeat_log, row count per
      (hour, uid, roomId, roomCreatorUid).
    * xxxxxx_fans_login <- oss_bi_all_login_log, row count per (hour, uid).

    A non-zero exit status from either invocation is reported on stderr; the
    function still proceeds to the next command and never raises because of
    it, so a driver loop over many days keeps running.

    NOTE(review): the partition is added with `location '{pt_day}'`, i.e. the
    bare date string is used as the location path -- confirm this is intended.

    Args:
        pt_day: Partition day string. Its first 7 characters are used as the
            pt_month filter, so a "YYYY-MM-DD" value is expected.

    Returns:
        None.
    """
    import sys  # local import: only needed for the failure warning below

    pt_month = pt_day[0:7]

    # The command text (including the backslash line continuations and the
    # 12-space indents, which are part of the string) must stay exactly as is.
    hiveCommands = (
        # Room heartbeat log -> xxxxxx_fans_view
        """/usr/lib/hive-current/bin/hive -e " \
            alter table xxxxxx_fans_view drop partition(pt_day='{pt_day}'); \
            alter table xxxxxx_fans_view add partition(pt_day='{pt_day}') location '{pt_day}'; \
            insert overwrite table xxxxxx_fans_view partition(pt_day='{pt_day}') \
            select substr(date_time,1,13) datehour,parms['uid'] uid,parms['roomId'] roomId,parms['roomCreatorUid'] roomCreatorUid,count(1) stayTime \
            from oss_bi_all_room_heartbeat_log a1 \
            where pt_month='{pt_month}' and pt_day='{pt_day}' \
            group by substr(date_time,1,13),parms['uid'],parms['roomId'],parms['roomCreatorUid']; \
            " """,
        # Login log -> xxxxxx_fans_login
        """/usr/lib/hive-current/bin/hive -e " \
            alter table xxxxxx_fans_login drop partition(pt_day='{pt_day}'); \
            alter table xxxxxx_fans_login add partition(pt_day='{pt_day}') location '{pt_day}'; \
            insert overwrite table xxxxxx_fans_login partition(pt_day='{pt_day}') \
            select substr(date_time,1,13) datehour,parms['uid'] uid,count(*) logincnt \
            from oss_bi_all_login_log a1 \
            where pt_month='{pt_month}' and pt_day='{pt_day}' \
            group by substr(date_time,1,13),parms['uid']; \
            " """,
    )

    for hiveCommand in hiveCommands:
        status = os.system(hiveCommand.format(pt_month=pt_month, pt_day=pt_day))
        if status != 0:
            # Surface the failure instead of silently discarding the exit
            # status; keep going so the remaining work is still attempted.
            sys.stderr.write(
                "WARNING: hive command for pt_day=%s exited with status %s\n"
                % (pt_day, status)
            )

# Backfill driver: rebuild the partitions for every day in the window below
# (dateRange includes both the begin and the end date).
for ptDay in dateRange(beginDate='2017-12-23', endDate='2018-01-22'):
    # Parenthesised print: same output on Python 2, and valid on Python 3.
    print(ptDay)
    hiveRunData(pt_day=ptDay)
事实上，并没有什么复杂的；就是记一下，备查。

你可能感兴趣的:(#,Hive,Python)