--前提条件:
# yum -y install python-devel gcc-c++ python-pip python gcc
告警:
The required version of setuptools (>=3.4.4) is not available,
and can't be installed while this script is running. Please
install a more recent version first, using
'easy_install -U setuptools'.
(Currently using setuptools 0.9.8 (/usr/lib/python2.7/site-packages))
-- 报错信息:
bitarray/_bitarray.c:9:20: fatal error: Python.h: No such file or directory
#include "Python.h"
--解决办法:
#yum -y install python-devel python
--升级安装pip
pip install --upgrade pip
--升级安装setuptools
pip install --upgrade setuptools
-- 升级之后的版本:
# pip list | grep -i -E 'pip|setuptools'
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.
pip 19.0.3
setuptools 41.0.0
--安装impyla:
pip install impyla
Collecting impyla
Using cached https://files.pythonhosted.org/packages/2b/de/3cadc54ca0aac684e03e5f66ae027bd5bf455f9becf9cc81a435b3834889/impyla-0.14.2.2.tar.gz
Requirement already satisfied: six in /usr/lib/python2.7/site-packages (from impyla) (1.9.0)
Collecting bitarray (from impyla)
Using cached https://files.pythonhosted.org/packages/e2/1e/b93636ae36d08d0ee3aec40b08731cc97217c69db9422c0afef6ee32ebd2/bitarray-0.8.3.tar.gz
Requirement already satisfied: thrift<=0.9.3 in /usr/lib/python2.7/site-packages (from impyla) (0.9.3)
Installing collected packages: bitarray, impyla
Running setup.py install for bitarray ... done
Running setup.py install for impyla ... done
Successfully installed bitarray-0.8.3 impyla-0.14.2.2
--安装impala cli:
# pip install impalacli
Collecting impalacli
Using cached https://files.pythonhosted.org/packages/9e/0d/c1d0a57256bafc4609666a23437208f7ccba4e5f54a53c9b099b441e4da8/impalacli-0.1.5.tar.gz
Requirement already satisfied: click in /usr/lib64/python2.7/site-packages (from impalacli) (7.0)
Requirement already satisfied: cli-helpers==1.0.2 in /usr/lib/python2.7/site-packages (from impalacli) (1.0.2)
Requirement already satisfied: docopt==0.6.2 in /usr/lib/python2.7/site-packages (from impalacli) (0.6.2)
Collecting impyla==0.14.1 (from impalacli)
Using cached https://files.pythonhosted.org/packages/6f/96/92f933cd216f9ff5d7f4ba7e0615a51ad4e3beb31a7de60f7df365378bb9/impyla-0.14.1-py2-none-any.whl
Collecting prompt-toolkit==1.0.15 (from impalacli)
Using cached https://files.pythonhosted.org/packages/d1/b0/1a6c262da35c779dd79550137aa7c298a424987240a28792ec5ccf48f848/prompt_toolkit-1.0.15-py2-none-any.whl
Collecting Pygments==2.2.0 (from impalacli)
Using cached https://files.pythonhosted.org/packages/02/ee/b6e02dc6529e82b75bb06823ff7d005b141037cb1416b10c6f00fc419dca/Pygments-2.2.0-py2.py3-none-any.whl
Requirement already satisfied: terminaltables>=3.0.0 in /usr/lib/python2.7/site-packages (from cli-helpers==1.0.2->impalacli) (3.1.0)
Requirement already satisfied: tabulate[widechars]>=0.8.2 in /usr/lib/python2.7/site-packages (from cli-helpers==1.0.2->impalacli) (0.8.3)
Requirement already satisfied: backports.csv>=1.0.0 in /usr/lib/python2.7/site-packages (from cli-helpers==1.0.2->impalacli) (1.0.7)
Requirement already satisfied: six in /usr/lib/python2.7/site-packages (from impyla==0.14.1->impalacli) (1.9.0)
Requirement already satisfied: thrift<=0.9.3 in /usr/lib/python2.7/site-packages (from impyla==0.14.1->impalacli) (0.9.3)
Requirement already satisfied: bitarray in /usr/lib64/python2.7/site-packages (from impyla==0.14.1->impalacli) (0.8.3)
Requirement already satisfied: wcwidth in /usr/lib/python2.7/site-packages (from prompt-toolkit==1.0.15->impalacli) (0.1.7)
Installing collected packages: impyla, prompt-toolkit, Pygments, impalacli
Found existing installation: impyla 0.14.2.2
Uninstalling impyla-0.14.2.2:
Successfully uninstalled impyla-0.14.2.2
Running setup.py install for impalacli ... done
Successfully installed Pygments-2.2.0 impalacli-0.1.5 impyla-0.14.1 prompt-toolkit-1.0.15
--核对:
# pip list | grep -i im
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.
impalacli 0.1.5
impyla 0.14.1
--代码连接示例:
# vim task02.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# ------------------操作日志表定义------------------------------------
# create table if not exists yjp_common.etl_operation_log(
# id string,
# author string,
# etl_layer string,
# package_name string,
# target_table string,
# etl_date string,
# start_time string,
# end_time string,
# etl_start_rows bigint,
# etl_end_rows bigint,
# etl_status string,
# primary key(id)
# ) partition by hash partitions 3
# stored as kudu
# ---------------------------------------------------------------
from impala.dbapi import connect
import sys
import datetime
import time
import uuid
###############################定义函数#############################################
def getBatchTimeStamp(tableColumnValue, timedelay=10, format='%Y%m%d%H%M%S'):
    '''
    Look up the previous batch end time of a table and derive the time
    window for the next incremental load.

    Reads ``batchendtime`` from ``yjp_common.etl_increload_timestamp`` for the
    lower-cased table name through the module-level cursor ``cur``.

    :param tableColumnValue: value matched (in lower case) against the
        ``tablename`` column.
    :param timedelay: minutes subtracted from the previous batch end time to
        obtain the start of the new window.
    :param format: ``strptime``/``strftime`` pattern of the stored timestamp
        and of the returned time strings.
    :return: a 5-tuple, in this order:
        the previous batch end time as stored (string),
        epoch seconds of that time minus ``timedelay`` minutes,
        the current time formatted with ``format``,
        epoch seconds of the previous batch end time,
        epoch seconds of the current time.
    :raises ValueError: if no batch end time is stored for the table, or the
        stored value does not match ``format``.
    '''
    # Bind the table name as a parameter instead of formatting it into the
    # SQL text, so quotes inside the value cannot alter the statement.
    sql = '''
        select batchendtime
        from yjp_common.etl_increload_timestamp
        where tablename = %s limit 1
    '''
    cur.execute(sql, (tableColumnValue.lower(),))
    row = cur.fetchone()
    if row is None or row[0] is None:
        # Fail with a clear message rather than letting strptime choke on
        # a missing value.
        raise ValueError(
            "no batchendtime found in yjp_common.etl_increload_timestamp "
            "for table '{0}'".format(tableColumnValue)
        )

    previous_end = str(row[0])
    previous_end_dt = datetime.datetime.strptime(previous_end, format)
    window_start_dt = previous_end_dt - datetime.timedelta(minutes=timedelay)
    now = datetime.datetime.now()

    # time.mktime interprets these naive datetimes as local time.
    previous_end_ts = int(time.mktime(previous_end_dt.timetuple()))
    window_start_ts = int(time.mktime(window_start_dt.timetuple()))
    now_ts = int(time.mktime(now.timetuple()))

    return (previous_end, window_start_ts, now.strftime(format), previous_end_ts, now_ts)
def getDataRowsNumber(arg):
    '''
    Count the rows of a table.

    Runs ``select count(*)`` through the module-level cursor ``cur``.

    :param arg: table name placed verbatim into the query; ``None`` skips
        the query.
    :return: the first column of the last row the cursor yields (0 when it
        yields no rows), or ``None`` when ``arg`` is ``None``.
    '''
    if arg is None:
        return None
    cur.execute("select count(*) from {0}".format(arg))
    fetched = list(cur)
    return fetched[-1][0] if fetched else 0
# Insert one row into yjp_common.etl_operation_log.
def insertOperationLog(args):
    '''
    Record one ETL run in ``yjp_common.etl_operation_log``.

    The statement is executed through the module-level cursor ``cur``.

    :param args: sequence of 11 values bound, in order, to id, author,
        etl_layer, package_name, target_table, etl_date, start_time,
        end_time, etl_start_rows, etl_end_rows and etl_status. ``None``
        makes the call a no-op.
    :return: None
    '''
    if args is None:
        return
    statement = '''
insert into yjp_common.etl_operation_log(
id,author,etl_layer,package_name,target_table,
etl_date,start_time,end_time,etl_start_rows,etl_end_rows,etl_status
)
values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
'''
    cur.execute(statement, args)
# Update the batch window stored in yjp_common.etl_increload_timestamp.
def insertIncreloadTimeStamp(args):
    '''
    Update the stored batch times of one table in ``etl_increload_timestamp``.

    Both statements are executed through the module-level cursor ``cur``.

    :param args: sequence of 5 values bound, in order, to batchstarttime,
        batchendtime, batchstart, batchend and the ``tablename`` to match.
        ``None`` makes the call a no-op.
    :return: None
    '''
    if args is None:
        return
    statement = '''
update etl_increload_timestamp
set batchstarttime=%s, batchendtime=%s, batchstart=%s, batchend=%s
where tablename=%s
'''
    # The UPDATE names the table without a database prefix, so select
    # yjp_common on this cursor first.
    cur.execute("use yjp_common")
    cur.execute(statement, args)
def readSql(path):
    '''
    Read a SQL script from disk and return its text.

    :param path: path of the file to read.
    :return: the entire file content as a string.
    :raises IOError: if the file cannot be opened or read; a message naming
        the file is printed before the error is re-raised.
    '''
    try:
        with open(path, 'r') as f:
            return f.read()
    except IOError:
        # Name the file that actually failed and re-raise, so the caller
        # stops here instead of continuing without any SQL text.
        print("------------the file '{0}' could not be read!----------".format(path))
        raise
############################################################################
# 1. Define and initialise the variables; exit the script on bad arguments.
# sys.argv must hold the script name plus eight positional arguments.
if len(sys.argv) != 9:
    print("---------the arguments you have inputed was wrong, please checks the arguments' numbers!-----------")
    sys.exit(1)
try:
    # Only the two numeric arguments can fail to convert. Catching just
    # ValueError keeps the SystemExit raised by sys.exit() from being
    # intercepted, which a bare ``except`` would do.
    port = int(sys.argv[2])
    isInsert = int(sys.argv[6])
except ValueError:
    print("---------the arguments you have inputed was wrong, please checks the argument's type!-----------")
    sys.exit(1)
packageName = sys.argv[0]
hostname = sys.argv[1]
etlDate = sys.argv[3]
author = sys.argv[4]
etlLayer = sys.argv[5]
targetTable = sys.argv[7]
sqlScript = sys.argv[8]
# 2. Open the impyla connection and get a cursor.
conn = connect(hostname, port, auth_mechanism='NOSASL')
try:
    cur = conn.cursor()
    try:
        # Incremental-load time window returned by getBatchTimeStamp:
        #   times[0]: end time of the previous batch
        #   times[1]: timestamp of ten minutes before the previous batch end
        #   times[2]: time at which this script runs
        #   times[3]: timestamp of times[0]
        #   times[4]: timestamp of times[2]
        times = getBatchTimeStamp(targetTable)

        # Run the SQL script, recording the time and the target table's row
        # count before and after it.
        begin = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        begin_rows = getDataRowsNumber(targetTable)
        sql_01 = readSql(sqlScript)
        cur.execute(sql_01, (times[1], times[4]))
        end = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        end_rows = getDataRowsNumber(targetTable)

        # Update yjp_common.etl_increload_timestamp when requested.
        if isInsert == 1:
            args1 = (times[0], times[2], times[3], times[4], targetTable)
            insertIncreloadTimeStamp(args1)

        # Insert the run into yjp_common.etl_operation_log.
        args2 = (str(uuid.uuid1()), author, etlLayer, packageName, targetTable, etlDate, begin, end, begin_rows, end_rows, '1')
        insertOperationLog(args2)
        print('-----test completed!----')
    finally:
        # Release the cursor even when one of the steps above raises.
        cur.close()
finally:
    conn.close()
--调用示例:
#!/bin/sh
# export PYTHON_EGG_CACHE=/root/myeggs
# Arguments passed to task02.py, in order:
# 1: ip
# 2: port
# 3: time (passed in as this shell script's first argument, $1)
# 4: author (of this sql script)
# 5: layer (eg: lz, ods, dw, dwd...)
# 6: whether to update yjp_common.etl_increload_timestamp (eg: 0 (false), 1 (true))
# 7: target table (the table the data will be written to)
# 8: the sql script name (shell script and python script must be in the same path)
python task02.py 197.255.20.215 21050 $1 tianjun lz 0 yjp_dw.dim_trd_bizuser step00.sql
参考:
https://github.com/cloudera/impyla