下面用一个简单粗暴的方法,把 MySQL 数据库中的数据抽取到 HBase,实现过程如下:
直接使用 MySQL 表的主键 ID 作为 HBase 的 rowKey。
code:
####
#!/usr/bin/env python
#coding=utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import MySQLdb
import datetime,time
sys.path.append('/usr/lib/python2.6/site-packages/hbase')
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
from hbase import Hbase
from hbase.ttypes import *
import csv
from hbase.ttypes import ColumnDescriptor, Mutation, BatchMutation, TRegionInfo
from hbase.ttypes import IOError, AlreadyExists
def client_conn(host="172.16.10.87", port=9090):
    """Open a buffered Thrift connection to HBase and return the client.

    Args:
        host: Address of the HBase Thrift server. Defaults to the server
            this script was originally written against.
        port: TCP port of the HBase Thrift server.

    Returns:
        An ``Hbase.Client`` bound to a transport that is already open.

    Note:
        Only the client is returned, so the caller gets no direct handle
        on the transport to close it.
    """
    sock = TSocket.TSocket(host, port)
    # Wrap the raw socket in a buffered transport before building the protocol.
    transport = TTransport.TBufferedTransport(sock)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = Hbase.Client(protocol)
    transport.open()
    return client
# HBase table that receives the extracted rows.
HBASE_TABLE = 'ca_record'

# Names given to result columns 1..6 of each MySQL row (column 0 is the
# primary key, used as the HBase row key).
# NOTE(review): this relies on the column order returned by ``select *``
# -- confirm against the ca_record table definition.
FIELD_NAMES = ("user_id", "ca_result", "ca_time", "real_name", "id_card", "sex")


def row_to_cells(row):
    """Convert one MySQL result row into HBase ``(column, value)`` pairs.

    Args:
        row: A sequence whose element 0 is the primary key and whose
            elements 1..6 correspond to ``FIELD_NAMES``.

    Returns:
        A list of ``(column, value)`` string tuples. The column is
        ``"<field name>:<field index>"`` and the value is ``str()`` of the
        cell, so a ``None`` cell is stored as the text ``'None'``.

    Raises:
        IndexError: If ``row`` has fewer than seven elements.
    """
    return [
        (name + ':' + str(idx), str(row[idx + 1]))
        for idx, name in enumerate(FIELD_NAMES)
    ]


if __name__ == "__main__":
    client = client_conn()
    # NOTE(review): the database credentials are hard-coded here; consider
    # loading them from configuration or the environment instead.
    conn = MySQLdb.connect(host="172.161.110.10", user="dlan", passwd="root123", port=5029, db='coolqi', charset='utf8')
    try:
        cur = conn.cursor()
        try:
            sql = "select * from ca_record where ca_time>=STR_TO_DATE('20170720','%Y%m%d')"
            print(sql)
            cur.execute(sql)
            for row in cur.fetchall():
                row_key = row[0]
                print(row_key)
                # Send all cells of a row in a single mutateRow call instead
                # of issuing one Thrift round trip per cell.
                mutations = [
                    Mutation(column=column, value=value)
                    for column, value in row_to_cells(row)
                ]
                client.mutateRow(HBASE_TABLE, str(row_key), mutations)
        finally:
            # Release the cursor even if the query or an HBase write fails.
            cur.close()
    finally:
        conn.close()
### 提示:按下标遍历的写法(例如 for k in xrange(len(data)))可以改用 enumerate() 或直接遍历元素,从而省去 data[k] 这类下标访问,代码也更简洁。