在工作中想要使用 Python 对 HBase 进行操作,主要用来获取数据进行分析。HBase 提供了 Thrift 接口,通过查看 API 进行了一些尝试,下面就是使用 Python 的相关代码。在使用之前需要启动 HBase 的 Thrift 服务并安装 Python 的几个模块,在代码一开始的部分已经注明。
使用的 Python 2.7
API:https://wiki.apache.org/hadoop/Hbase/ThriftApi
# encoding=utf-8
'''
该脚本用于尝试 使用 python 通过 Thrift 连接并操作 HBase 数据库
prepare:
1. 启动 ThriftServer 于 HBASE
> hbase-daemon.sh start thrift/thrift2
> 在此,HBASE提供两种 thrift/thrift2 由于种种原因,语法并不兼容,其中 2 的语法封装更优雅,但部分 DDL 操作
不完善,而且 thrift API 资料相对多一些,所以先使用thrift 尝试
2. jps 应该有 ThriftServer 进程
3.Python 需要安装 thrift 和 hbase 模块,有网络的直接 pip,没有网络的把相同版本的模块代码下载下来用 sys.path.append('PATH') 引用,安装后的代码一般在 $PYTHON_HOME/Lib/site-packages
> pip install thrift
pip install hbase-thrift
'''
from thrift import Thrift
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
from hbase import Hbase
# Connection setup: build a Thrift client for the HBase Thrift server.
import atexit

# Address and port of the Thrift server; 9090 is the Thrift server's default
# port.
transport = TSocket.TSocket('192.168.1.92', 9090)
# Optional socket timeout (the Thrift Python TSocket takes milliseconds).
transport.setTimeout(5000)
# Transport layer wrapped around the socket (TFramedTransport or
# TBufferedTransport).
trans = TTransport.TBufferedTransport(transport)
# Wire protocol used on top of the transport.
protocol = TBinaryProtocol.TBinaryProtocol(trans)
# Client object through which every HBase call below is made.
client = Hbase.Client(protocol)
# Open the connection through the same transport the protocol uses, rather
# than reaching around it to the raw socket.
trans.open()
# The script never closed the connection; make sure it is released when the
# interpreter exits, whichever statement the script ends on.
atexit.register(trans.close)
from hbase.ttypes import ColumnDescriptor, Mutation, BatchMutation, TRegionInfo
from hbase.ttypes import IOError, AlreadyExists
# Read examples: table metadata, a whole row, one column, several columns.
# Every print passes a single pre-formatted string so the output is the same
# text under Python 2 and Python 3 (under Python 2, print("label", value)
# prints the repr of a tuple instead).

# Table and row key used by the read calls below.
tableName = "profilesSys:user_behavior_detail_his"
rowkey = "04FLHWDSwxda9a5c51a81b783915160063052050"

# List all table names.
tableNames = client.getTableNames()
print("tableNames: %s" % (tableNames,))

# Column families of the table; returned as a map.
columnDescriptors = client.getColumnDescriptors(tableName)
print("columnName %s" % (columnDescriptors,))

# All regions of the table, including start/end keys; returned as a list.
# The result is kept for inspection but not printed.
tableRegions = client.getTableRegions(tableName)

# Fetch a whole row by (tableName, rowKey); returned as a list.
row = client.getRow(tableName, rowkey)
print("row: %s" % (row,))

# Fetch a single column of that row.
rowColumn = client.get(tableName, rowkey, "bbdi:openId")
print("rowColumn %s" % (rowColumn,))

# Fetch several columns of the row (latest timestamp); passing None as the
# column list selects all columns.
rowColumns = client.getRowWithColumns(
    tableName, rowkey, ["bbdi:openId", "bbdi:tempLogId"]
)
print("rowColumns %s" % (rowColumns,))
# client.mutateRow(tableName[1],"jason",)
# Create a table with a single column family.
#
# Column-family descriptor; only the first field, `name`, is supplied here.
# Thrift definition of the struct, for reference:
#
#   struct ColumnDescriptor {
#     1:Text name,
#     2:i32 maxVersions = 3,
#     3:string compression = "NONE",
#     4:bool inMemory = 0,
#     5:string bloomFilterType = "NONE",
#     6:i32 bloomFilterVectorSize = 0,
#     7:i32 bloomFilterNbHashes = 0,
#     8:bool blockCacheEnabled = 0,
#     9:i32 timeToLive = -1
#   }
desc = ColumnDescriptor(name="colNameTest1")
try:
    # createTable(tableName, [column families]); only this call is guarded.
    client.createTable('our_table1', [desc])
except AlreadyExists as tx:
    # The table is already there: report it and carry on with the script.
    print("Thrift exception")
    print('%s' % (tx.message))
else:
    # Creation succeeded: show the updated table list.
    print(client.getTableNames())
# Insert a single row: one Mutation per column being written.
mutations = [
    Mutation(column="colNameTest1:Name", value="Jason"),
    Mutation(column="colNameTest1:age", value="5"),
]
client.mutateRow("our_table1", "rowKey", mutations)

# Insert several rows in one call: one BatchMutation per row key, each
# reusing the same list of column mutations.
rowMutations = [
    BatchMutation(row="rowkey1", mutations=mutations),
    BatchMutation(row="rowkey2", mutations=mutations),
]
client.mutateRows("our_table1", rowMutations)

# Delete one entire row.
client.deleteAllRow("our_table1", "rowkey2")
# Scan example.
# Signature: ScannerID scannerOpen(Text tableName, Text startRow, list columns)
scanId = client.scannerOpen("our_table1", "", ["colNameTest1"])
try:
    # Take one result from the scanner.
    scanRescult = client.scannerGet(scanId)
    # Take up to 50 more from the same scanner id; the result already taken
    # above is not returned again.
    scanRescult1 = client.scannerGetList(scanId, 50)
    print(scanRescult)
    print(scanRescult1)
finally:
    # Always close the scanner, even if one of the fetches above fails.
    client.scannerClose(scanId)