impala 和python的交互



--前提条件:
# yum -y install python-devel gcc-c++ python-pip python gcc
告警:
The required version of setuptools (>=3.4.4) is not available,
    and can't be installed while this script is running. Please
    install a more recent version first, using
    'easy_install -U setuptools'.
    
    (Currently using setuptools 0.9.8 (/usr/lib/python2.7/site-packages))
-- 报错信息:
 bitarray/_bitarray.c:9:20: fatal error: Python.h: No such file or directory
     #include "Python.h"
--解决办法:
#yum -y install python-devel python
--升级安装pip
pip install --upgrade pip

--升级安装setuptools
pip install --upgrade setuptools

-- 升级之后的版本:
# pip list | grep -i -E 'pip|setuptools'
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.
pip                          19.0.3 
setuptools                   41.0.0 

--安装impyla(Impala 的 Python 客户端):
pip install impyla
 
Collecting impyla
  Using cached https://files.pythonhosted.org/packages/2b/de/3cadc54ca0aac684e03e5f66ae027bd5bf455f9becf9cc81a435b3834889/impyla-0.14.2.2.tar.gz
Requirement already satisfied: six in /usr/lib/python2.7/site-packages (from impyla) (1.9.0)
Collecting bitarray (from impyla)
  Using cached https://files.pythonhosted.org/packages/e2/1e/b93636ae36d08d0ee3aec40b08731cc97217c69db9422c0afef6ee32ebd2/bitarray-0.8.3.tar.gz
Requirement already satisfied: thrift<=0.9.3 in /usr/lib/python2.7/site-packages (from impyla) (0.9.3)
Installing collected packages: bitarray, impyla
  Running setup.py install for bitarray ... done
  Running setup.py install for impyla ... done
Successfully installed bitarray-0.8.3 impyla-0.14.2.2

--安装impala cli:

# pip install impalacli
Collecting impalacli
  Using cached https://files.pythonhosted.org/packages/9e/0d/c1d0a57256bafc4609666a23437208f7ccba4e5f54a53c9b099b441e4da8/impalacli-0.1.5.tar.gz
Requirement already satisfied: click in /usr/lib64/python2.7/site-packages (from impalacli) (7.0)
Requirement already satisfied: cli-helpers==1.0.2 in /usr/lib/python2.7/site-packages (from impalacli) (1.0.2)
Requirement already satisfied: docopt==0.6.2 in /usr/lib/python2.7/site-packages (from impalacli) (0.6.2)
Collecting impyla==0.14.1 (from impalacli)
  Using cached https://files.pythonhosted.org/packages/6f/96/92f933cd216f9ff5d7f4ba7e0615a51ad4e3beb31a7de60f7df365378bb9/impyla-0.14.1-py2-none-any.whl
Collecting prompt-toolkit==1.0.15 (from impalacli)
  Using cached https://files.pythonhosted.org/packages/d1/b0/1a6c262da35c779dd79550137aa7c298a424987240a28792ec5ccf48f848/prompt_toolkit-1.0.15-py2-none-any.whl
Collecting Pygments==2.2.0 (from impalacli)
  Using cached https://files.pythonhosted.org/packages/02/ee/b6e02dc6529e82b75bb06823ff7d005b141037cb1416b10c6f00fc419dca/Pygments-2.2.0-py2.py3-none-any.whl
Requirement already satisfied: terminaltables>=3.0.0 in /usr/lib/python2.7/site-packages (from cli-helpers==1.0.2->impalacli) (3.1.0)
Requirement already satisfied: tabulate[widechars]>=0.8.2 in /usr/lib/python2.7/site-packages (from cli-helpers==1.0.2->impalacli) (0.8.3)
Requirement already satisfied: backports.csv>=1.0.0 in /usr/lib/python2.7/site-packages (from cli-helpers==1.0.2->impalacli) (1.0.7)
Requirement already satisfied: six in /usr/lib/python2.7/site-packages (from impyla==0.14.1->impalacli) (1.9.0)
Requirement already satisfied: thrift<=0.9.3 in /usr/lib/python2.7/site-packages (from impyla==0.14.1->impalacli) (0.9.3)
Requirement already satisfied: bitarray in /usr/lib64/python2.7/site-packages (from impyla==0.14.1->impalacli) (0.8.3)
Requirement already satisfied: wcwidth in /usr/lib/python2.7/site-packages (from prompt-toolkit==1.0.15->impalacli) (0.1.7)
Installing collected packages: impyla, prompt-toolkit, Pygments, impalacli
  Found existing installation: impyla 0.14.2.2
    Uninstalling impyla-0.14.2.2:
      Successfully uninstalled impyla-0.14.2.2
  Running setup.py install for impalacli ... done
Successfully installed Pygments-2.2.0 impalacli-0.1.5 impyla-0.14.1 prompt-toolkit-1.0.15

--核对:
# pip list  | grep -i im
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.
impalacli                    0.1.5  
impyla                       0.14.1 


--代码连接示例:
# vim task02.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# ------------------操作日志表定义------------------------------------
# create table if not exists yjp_common.etl_operation_log(
#     id string,
#     author string,
#     etl_layer string,
#     package_name string,
#     target_table string,
#     etl_date string,
#     start_time string,
#     end_time string,
#     etl_start_rows bigint,
#     etl_end_rows bigint,
#     etl_status string,
#     primary key(id)
# ) partition by hash partitions 3
# stored as kudu
# ---------------------------------------------------------------

from impala.dbapi import connect
import sys
import datetime
import time
import uuid

###############################定义函数#############################################
def getBatchTimeStamp(tableColumnValue, timedelay=10, format='%Y%m%d%H%M%S'):
    '''
    Build the time window for one incremental-load batch.

    Looks up ``batchendtime`` for the given table in
    ``yjp_common.etl_increload_timestamp`` (using the module-level cursor
    ``cur``) and derives the window boundaries from it.

    :param tableColumnValue: table name; it is lower-cased before being
        matched against the ``tablename`` column.
    :param timedelay: number of minutes subtracted from the stored batch end
        time to obtain the start of the new window.
    :param format: ``strptime``/``strftime`` pattern of the stored time string.
    :return: a 5-tuple ``(previous batch end as stored string,
        epoch seconds of previous batch end minus ``timedelay`` minutes,
        current time formatted with ``format``,
        epoch seconds of previous batch end,
        epoch seconds of current time)``.
    :raises ValueError: when the table has no row in
        ``etl_increload_timestamp`` or the stored value does not match
        ``format``.
    '''
    # Bind the table name as a parameter (same ``%s`` style the insert/update
    # helpers in this file use) instead of formatting it into the SQL text.
    sql = '''
        select batchendtime
        from yjp_common.etl_increload_timestamp
        where tablename = %s limit 1
    '''
    cur.execute(sql, (tableColumnValue.lower(),))

    last_batch_end = None
    for row in cur:
        last_batch_end = str(row[0])

    # Fail with a clear message instead of an obscure TypeError inside
    # strptime when the table has never been registered.
    if last_batch_end is None:
        raise ValueError(
            "no batchendtime found in yjp_common.etl_increload_timestamp "
            "for table '{0}'".format(tableColumnValue)
        )

    previous_end = datetime.datetime.strptime(last_batch_end, format)
    window_start = previous_end - datetime.timedelta(minutes=timedelay)
    now = datetime.datetime.now()

    return (
        last_batch_end,
        int(time.mktime(window_start.timetuple())),
        now.strftime(format),
        int(time.mktime(previous_end.timetuple())),
        int(time.mktime(now.timetuple())),
    )


def getDataRowsNumber(arg):
    '''
    Count the rows currently stored in a table.

    :param arg: table name that is placed verbatim into
        ``select count(*) from ...`` (an identifier cannot be bound as a
        query parameter); ``None`` skips the query.
    :return: the count reported by the query, or 0 when ``arg`` is None or
        the query yields no row.
    '''
    # Initialise before the guard so there is always a value to return;
    # previously a None argument left ``num`` unbound and raised
    # UnboundLocalError.
    num = 0
    if arg is not None:
        sql = "select count(*) from {0}".format(arg)
        cur.execute(sql)
        for row in cur:
            num = row[0]
    return num

# Insert one record into yjp_common.etl_operation_log
def insertOperationLog(args):
    '''
    Write a single row to ``yjp_common.etl_operation_log`` through the
    module-level cursor ``cur``.

    :param args: sequence of the 11 column values, in the order
        id, author, etl_layer, package_name, target_table, etl_date,
        start_time, end_time, etl_start_rows, etl_end_rows, etl_status;
        ``None`` makes the call a no-op.
    :return: None
    '''
    # Nothing to log when no values were supplied.
    if args is None:
        return

    insert_stmt = '''
            insert into yjp_common.etl_operation_log(
             id,author,etl_layer,package_name,target_table,
             etl_date,start_time,end_time,etl_start_rows,etl_end_rows,etl_status
            ) 
            values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        '''
    cur.execute(insert_stmt, args)

# Update the batch window stored in yjp_common.etl_increload_timestamp
def insertIncreloadTimeStamp(args):
    '''
    Update the row of ``etl_increload_timestamp`` that belongs to one table,
    using the module-level cursor ``cur``. The statement runs after
    switching the session to the ``yjp_common`` database.

    :param args: sequence of five values in the order
        batchstarttime, batchendtime, batchstart, batchend, tablename
        (the table name comes last because it feeds the ``where`` clause);
        ``None`` makes the call a no-op.
    :return: None
    '''
    # Nothing to update when no values were supplied.
    if args is None:
        return

    update_stmt = '''
               update etl_increload_timestamp 
               set batchstarttime=%s, batchendtime=%s, batchstart=%s, batchend=%s
               where tablename=%s
        '''
    # The table name in the statement is unqualified, so select the database first.
    cur.execute("use yjp_common")
    cur.execute(update_stmt, args)


def readSql(path):
    '''
    Read a SQL script from disk and return its text.

    :param path: path of the SQL file to read.
    :return: the complete content of the file as a string.
    :raises IOError: (``OSError`` on Python 3) when the file cannot be
        opened or read. The error is reported on stdout and then re-raised,
        because the caller cannot continue without the statement.
    '''
    try:
        with open(path, 'r') as f:
            return f.read()
    except (IOError, OSError):
        # Name the file that was actually requested, then propagate the real
        # error instead of falling through to an unbound ``sql`` variable.
        print("------------the file '{0}' doesn't exist or can't be read!----------".format(path))
        raise

############################################################################

# 1. Read and validate the command-line arguments; exit with status 1 when
#    the argument count or a numeric field is wrong.
#    The count is checked outside any try block: sys.exit() raises SystemExit,
#    which a bare ``except:`` would catch and report a second, misleading
#    "type" error.
if len(sys.argv) != 9:
    print("---------the arguments you have inputed was wrong, please checks the arguments' numbers!-----------")
    sys.exit(1)

# Only the two integer conversions can fail, so only they are guarded.
try:
    port = int(sys.argv[2])
    isInsert = int(sys.argv[6])
except ValueError:
    print("---------the arguments you have inputed was wrong, please checks the argument's type!-----------")
    sys.exit(1)

packageName = sys.argv[0]   # the script name itself is recorded as the package name
hostname = sys.argv[1]
etlDate = sys.argv[3]
author = sys.argv[4]
etlLayer = sys.argv[5]
targetTable = sys.argv[7]
sqlScript = sys.argv[8]


# 2. Open the impyla connection and obtain a cursor. ``cur`` stays a
#    module-level name because the helper functions above use it directly.
#    try/finally guarantees that cursor and connection are closed even when
#    one of the statements below fails.
conn = connect(hostname, port, auth_mechanism='NOSASL')
try:
    cur = conn.cursor()
    try:
        # Incremental time window returned by getBatchTimeStamp:
        #   times[0]: end time of the previous batch (string)
        #   times[1]: epoch seconds of ten minutes before the previous batch end
        #   times[2]: current script time (string)
        #   times[3]: epoch seconds of times[0]
        #   times[4]: epoch seconds of times[2]
        times = getBatchTimeStamp(targetTable)

        # Record the start time and row count before running the load.
        begin = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        begin_rows = getDataRowsNumber(targetTable)

        # Run the SQL script, binding the window start and end timestamps.
        sql_01 = readSql(sqlScript)
        cur.execute(sql_01, (times[1], times[4]))

        # Record the end time and row count after the load.
        end = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        end_rows = getDataRowsNumber(targetTable)

        # Update yjp_common.etl_increload_timestamp only when requested.
        if isInsert == 1:
            args1 = (times[0], times[2], times[3], times[4], targetTable)
            insertIncreloadTimeStamp(args1)

        # Write the audit record to yjp_common.etl_operation_log.
        args2 = (str(uuid.uuid1()), author, etlLayer, packageName, targetTable,
                 etlDate, begin, end, begin_rows, end_rows, '1')
        insertOperationLog(args2)

        print('-----test completed!----')
    finally:
        cur.close()
finally:
    conn.close()

--调用示例:
#!/bin/sh

# export PYTHON_EGG_CACHE=/root/myeggs

# Arguments passed to task02.py, in order:
# 1: ip
# 2: port
# 3: time (taken from this shell script's first argument, $1)
# 4: author (of this sql script)
# 5: layer (eg: lz, ods, dw, dwd...)
# 6: whether to update yjp_common.etl_increload_timestamp (eg: 0(false), 1(true))
# 7: target table (the table the data will be written to)
# 8: the sql script name (shell-script and python-script must be in the same path)

python task02.py 197.255.20.215 21050 $1 tianjun lz 0 yjp_dw.dim_trd_bizuser step00.sql


参考:
https://github.com/cloudera/impyla

 

你可能感兴趣的:(Hadoop)