此文中,相当于是对【Hive数据汇总导入Mysql(Draw lessons from colleagues)】一文的整理。
1、在进行代码跑批之前,需要将mysql里的目标表,按要求先建好。
2、需要增减任务时,只需要在【configSql.py】增减相应的配置即可。
3、采用beeline客户端进行hive数据的汇总及查询;实际上查询sql可以任意,也可以多个,只要最终返回一个带结果的数据集就可以了;在其之前可以有数据删除及临时表创建等语句。
4、mysql数据插入sql分两块,数据回滚及hive汇总数据的插入;当然也可以根据需要进行mysql数据的更新。
5、设计的目的是将hive中的汇总数据插入到mysql目标表中,但其实非汇总的明细数据也一样;每次处理的结果数据集建议在万条记录以下,如果太多了,插入mysql处理起来比较费时费力。
6、针对数据量较多的情况,考虑了事务性处理以提高效率;但数据量还是要尽量控制在万条以内。
同时,可以参考如下相关文章:
Hive汇总统计数据自动化传输到Mysql数据库-跑批参数文本配置及提取使用-->
http://blog.csdn.net/babyfish13/article/details/73188712
Hive汇总统计数据自动化传输到Mysql数据库-->
http://blog.csdn.net/babyfish13/article/details/72701512
Hive数据汇总导入Mysql(Draw lessons from colleagues)-->
https://blog.csdn.net/babyfish13/article/details/78979161
1、连接配置
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/configConn.py
# -*- coding=utf-8 -*-
# Beeline JDBC endpoints, keyed by logical hive cluster name.
# runSqlOnHive looks clusters up here via taskConf['reader']['hiveName'].
hiveConn = {
    "hive_old": {"jdbcConn": "jdbc:hive2://HiveHost:10000"},
    "hive_new": {"jdbcConn": "jdbc:hive2://HivenewHost:10000"},
}
def _mysql_conf(ip, port, db):
    # All targets currently share one account; only host/port/db vary.
    return {
        "ip": ip,
        "port": port,
        "db": db,
        "username": "MysqlUser",
        "password": "MysqlPass",
    }

# Mysql connection settings, keyed by target database name.
# runSqlOnMysqlShell looks targets up here via taskConf['writer']['conn'].
mysqlConn = {
    "funnyai_data": _mysql_conf("MysqlHost", 6603, "funnyai_data"),
    "jellyfish_hadoop_stat": _mysql_conf("MysqlHost", 6605, "jellyfish_hadoop_stat"),
}
2、数据传输SQL配置
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/configSql.py
# -*- coding=utf-8 -*-
TaskHive2mysql={}
TaskHive2mysql["oss_bi_all_room"] = {
"enable": True,
"comment": "房间明细信息",
"reader": {
"type": "hive",
"hiveName": "hive_old",
"query_sql": """
select * from oss_bi_all_room where pt_day='2018-04-18' limit 888;
"""
},
"writer": {
"type": "mysql",
"conn": "jellyfish_hadoop_stat",
"clear_sql": """
delete from xxx_room_test
-- where created_time="{0}";
""",
"insert_sql": """
insert into xxx_room_test (id,live_id,is_profession,creator_uid,subcriber_count,last_live_time,state,created_time,updated_time)
values
("{1}", "{2}", "{3}", "{4}", "{5}", "{6}", "{7}", "{8}", "{9}")
;
"""
}
}
TaskHive2mysql["xxx_gamezone_bringnew_audience_test_static_daily"] = {
"enable": True,
"comment": "游戏专区拉新-新增观众数及其当日充值和弹幕发送及次日留存情况",
"reader": {
"type": "hive",
"hiveName": "hive_old",
"query_sql": """
with tab_view_game as(
select a1.uid,a1.gameid,a1.view_time
from (select uid,gameid,sum(view_time) view_time,row_number()over(partition by uid order by sum(view_time) desc) rk
from recommend_data_view
where pt_day=date_add('{0}',-1)
group by uid,gameid) a1
where a1.rk=1),
tab_newidentifier_newuser as(
select uid
from oss_bi_type_of_all_user
where pt_day=date_add('{0}',-1) and type=1
),
tab_pay_info as (
select uid,amount
from oss_bi_all_chushou_pay_info
where pt_day=date_add('{0}',-1) and state=0),
tab_message_info as (
select parms['uid'] uid,parms['liveGameId'] gameid,parms['liveGameName'] gamename,count(*) message_cnt
from oss_bi_all_message_send_log
where pt_day=date_add('{0}',-1)
group by parms['uid'],parms['liveGameId'],parms['liveGameName']),
tab_view_nextday as(select uid,gameid
from recommend_data_view
where pt_day=date_add('{0}',0)
group by uid,gameid)
select a2.gameid,a6.name gamename,count(distinct a1.uid) new_register_game_view_cnt,count(distinct a3.uid) pay_uid_cnt,sum(a3.amount) pay_amount,count(distinct a4.uid) message_send_uid_cnt,sum(a4.message_cnt) message_send_cnt,count(distinct a5.uid) audience_new_next_remain_cnt,count(distinct a5.uid)/count(distinct a1.uid) audience_new_next_remain_rate,from_unixtime(unix_timestamp()) created_time,from_unixtime(unix_timestamp()) updated_time
from tab_newidentifier_newuser a1
inner join tab_view_game a2 on a1.uid=a2.uid
left join tab_pay_info a3 on a1.uid=a3.uid
left join tab_message_info a4 on a1.uid=a4.uid
left join tab_view_nextday a5 on a1.uid=a5.uid
left join data_chushou_game a6 on a2.gameid=a6.id
group by a2.gameid,a6.name
;
"""
},
"writer": {
"type": "mysql",
"conn": "jellyfish_hadoop_stat",
"clear_sql": """
delete from xxx_gamezone_bringnew_audience_test
where calc_date="{0}";
""",
"insert_sql": """
insert into xxx_gamezone_bringnew_audience_test (calc_date,game_id,game_name,new_register_game_view_cnt,pay_uid_cnt,pay_amount,message_send_uid_cnt,message_send_cnt,audience_new_next_remain_cnt,audience_new_next_remain_rate,created_time,updated_time)
values
("{0}", "{1}", "{2}", "{3}" , "{4}", "{5}", "{6}", "{7}", "{8}", "{9}", "{10}", "{11}")
;
"""
}
}
3、数据传输及处理具体脚本
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/funReadWrite.py
# -*- coding=utf-8 -*-
import os
from configConn import hiveConn, mysqlConn
def runSqlOnMysqlShell(conn, sql):
    """Build the mysql command-line invocation that executes *sql*.

    conn -- key into mysqlConn naming the target database.
    sql  -- statement batch to run (already quote-sanitized by the caller).
    Returns the shell command string, or None when *conn* is not configured.
    """
    cfg = mysqlConn.get(conn)
    if cfg is None:
        return None
    # The sql is wrapped in shell triple double-quotes, so the caller must have
    # already converted embedded double quotes to single quotes.
    command = "mysql -h{0} -P{1} -u{2} -p{3} -e \"\"\"set names utf8; use {4}; {5}\"\"\" "
    return command.format(cfg['ip'], cfg['port'], cfg['username'],
                          cfg['password'], cfg['db'], sql)
def runSqlOnHive(taskConf, runDay):
    """Run the task's hive query through beeline and return the result rows.

    taskConf -- one entry of TaskHive2mysql; its reader.query_sql gets runDay
                substituted as placeholder {0}.
    runDay   -- batch day string, 'YYYY-MM-DD'.
    Returns the csv2-formatted result lines (header suppressed).
    Raises Exception when the query produced no output at all.
    """
    # Give the hadoop job a recognizable name for cluster monitoring.
    mpNameSql = """
    SET mapred.job.name=' hiveSum2Mysql-test ({0}) ';
    """.format(runDay)
    hiveSql = mpNameSql + taskConf['reader']['query_sql'].format(
        runDay).replace('"', "'").replace('`', '\`')  # replace every double quote with a single quote (and backslash-escape backticks) so the sql survives the shell triple double-quotes below
    jdbcConn = hiveConn[taskConf['reader']['hiveName']]['jdbcConn']
    querySql = " source ~/.bash_profile && beeline --outputformat=csv2 --showHeader=false -u '{0}' -n hadoop -p '' -e \"\"\"{1}\"\"\" ".format(
        jdbcConn, hiveSql)
    print querySql
    queryResultList = os.popen(querySql).read().split("\n")
    # split("\n") always leaves one trailing empty element, so more than one
    # element means at least one real data row came back; drop the empty tail.
    if len(queryResultList) > 1:
        return queryResultList[:-1]
    else:
        raise Exception("No query data is come out!")
# print runSqlOnHive(taskConf=TaskHive2mysql["oss_bi_all_room"], runDay='2018-04-18')
def runSqlOnMysql(taskConf, runDay, hiveDataResults):
    """Load hive result rows into mysql: run clear_sql first, then batched inserts.

    taskConf        -- one entry of TaskHive2mysql (its writer section is used).
    runDay          -- batch day string, substituted as placeholder {0}.
    hiveDataResults -- csv lines returned by runSqlOnHive.
    """
    writer = taskConf['writer']
    # How many insert statements go into a single mysql invocation.
    batch_size = writer.get('max_bulk_insert', 28)

    # Rollback/cleanup statement for the target day; double quotes become
    # single quotes so the sql survives the shell quoting in runSqlOnMysqlShell.
    clear_stmt = writer['clear_sql'].format(runDay).replace('"', "'").replace('`', '\`')
    if clear_stmt.strip()[-1:] != ';':
        clear_stmt += ';'

    # One insert statement per result row; {0} is runDay, {1}.. are the fields.
    statements = []
    for row in hiveDataResults:
        fields = row.strip().split(',')
        stmt = writer['insert_sql'].format(
            runDay, *fields).replace('"', "'").replace('`', '\`')
        if stmt.strip()[-1:] != ';':
            stmt += ';'
        statements.append(stmt)

    # Clear first, then the inserts grouped into batches of batch_size so each
    # mysql call stays small.
    batches = [clear_stmt]
    for start in range(0, len(statements), batch_size):
        batches.append(''.join(statements[start:start + batch_size]))

    for batch_sql in batches:
        shell_cmd = runSqlOnMysqlShell(writer['conn'], batch_sql) \
            + " && echo 'sql on mysql exec success!' "
        # Execute and let the command's own output report success.
        os.system(shell_cmd)
4、统筹调度脚本
/Users/nisj/PycharmProjects/BiDataProc/OpenETL-hiveSum2mysql/overallPlanning.py
# -*- coding=utf-8 -*-
import datetime
from configSql import *
from funReadWrite import runSqlOnHive, runSqlOnMysql
def dataDayRun(taskConf, runDay):
    """Run one configured transfer task for one day: read from hive, write to mysql.

    taskConf -- one entry of TaskHive2mysql.
    runDay   -- batch day string, 'YYYY-MM-DD'.

    Bug fix: the reader-type check previously inspected the hard-coded
    TaskHive2mysql["oss_bi_all_room"] entry instead of the taskConf argument,
    so a non-hive reader in any other task would still trigger a hive run.
    Also uses 'except ... as e' / print(...) so the code runs on Python 2 and 3.
    """
    hiveDataResults = []
    if taskConf['reader']['type'] == 'hive':
        try:
            hiveDataResults = runSqlOnHive(taskConf, runDay)
        except Exception as e:
            # Best-effort: log the failure and continue with an empty result set
            # (the writer then only executes its clear_sql).
            print(e)
    if taskConf['writer']['type'] == 'mysql':
        runSqlOnMysql(taskConf, runDay, hiveDataResults)
def dateRange(beginDate, endDate):
    """Return every day from beginDate to endDate (inclusive) as 'YYYY-MM-DD' strings.

    Returns an empty list when endDate precedes beginDate.
    """
    one_day = datetime.timedelta(days=1)
    cursor = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    day_str = cursor.strftime("%Y-%m-%d")
    days = []
    # ISO date strings compare lexicographically in chronological order.
    while day_str <= endDate:
        days.append(day_str)
        cursor += one_day
        day_str = cursor.strftime("%Y-%m-%d")
    return days
if __name__ == '__main__':
    # Backfill mode example: wrap the calls below in a dateRange loop to rerun
    # a whole period, e.g.:
    # for runDay in dateRange(beginDate='2018-03-01', endDate='2018-03-31'):
    #     print runDay
    # Single-day mode: run both configured tasks for one batch day.
    dataDayRun(taskConf=TaskHive2mysql["oss_bi_all_room"], runDay='2018-05-06')
    dataDayRun(taskConf=TaskHive2mysql["xxx_gamezone_bringnew_audience_test_static_daily"], runDay='2018-05-06')
5、说明
方案结合调度系统,可以进行报表数据的周期性传输。
配置的两个示例,一个是直播明细数据的拉取,一个是hive汇总数据的传输;各具有代表意义。
其目标mysql建表语句如下:
show create table xxx_room_test;
-- Target table for the "oss_bi_all_room" task: a plain detail copy of the
-- hive partition. All columns nullable; times kept as varchar because the
-- loader inserts raw csv fields.
CREATE TABLE `xxx_room_test` (
  `id` bigint(20) DEFAULT NULL,
  `live_id` varchar(100) DEFAULT NULL,
  `is_profession` int(11) DEFAULT NULL,
  `creator_uid` bigint(20) DEFAULT NULL,
  `subcriber_count` bigint(20) DEFAULT NULL,
  `last_live_time` varchar(100) DEFAULT NULL,
  `state` bigint(20) DEFAULT NULL,
  `created_time` varchar(100) DEFAULT NULL,
  `updated_time` varchar(100) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
show create table xxx_gamezone_bringnew_audience_test;
-- Target table for the "xxx_gamezone_bringnew_audience_test_static_daily"
-- task: one aggregated row per (calc_date, game).
CREATE TABLE `xxx_gamezone_bringnew_audience_test` (
  `id` int(11) NOT NULL DEFAULT '0' COMMENT '自增ID',
  `calc_date` date DEFAULT NULL COMMENT '统计日期',
  `game_id` bigint(20) DEFAULT NULL COMMENT '游戏id',
  `game_name` varchar(500) DEFAULT '' COMMENT '游戏名称',
  `new_register_game_view_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众数',
  `pay_uid_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众-充值人数',
  `pay_amount` decimal(22,3) DEFAULT '0.000' COMMENT '当日新增观众-充值金额',
  `message_send_uid_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众-弹幕发送人数',
  `message_send_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众-弹幕发送次数',
  `audience_new_next_remain_cnt` int(11) DEFAULT '0' COMMENT '当日新增观众-次日留存数',
  `audience_new_next_remain_rate` decimal(6,2) DEFAULT '0.00' COMMENT '当日新增观众-次日留存率',
  `created_time` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
  `updated_time` datetime DEFAULT '2999-12-31 23:59:59' COMMENT '统计时间'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;