happybase的使用

一、启动thrift

因为用到了 happybase,需要先在服务器上启动 Thrift 服务。为了在关闭终端后 Thrift 仍能继续运行,使用如下命令启动:

nohup hbase thrift -p 9090 start

二、读取hbase的代码

class GetHbase(object):
    """Read rows published on or after a start date from an HBase table.

    The table is reached through ``happybase`` (HBase Thrift gateway).

    Attributes:
        hostname: Host passed to ``happybase.Connection``.
        table_name: Name of the HBase table to scan.
        start_date: Inclusive lower bound compared against each row's
            ``cont:pubDate`` value with ``>=``.
    """

    def __init__(self, hostname, table_name, start_date):
        self.hostname = hostname      # Thrift server host
        self.table_name = table_name  # HBase table name
        self.start_date = start_date  # inclusive lower bound for cont:pubDate

    def getdata(self):
        """Scan the whole table and return the rows at or after ``start_date``.

        Returns:
            A list with one dict per selected row, holding the keys
            ``pubDate``, ``author``, ``content`` and ``Timedict`` (the result
            of ``TimeMatch`` applied to the publication date). Rows without a
            ``cont:content`` column are left out.

        Raises:
            KeyError: If a row lacks ``cont:pubDate``, or a selected row lacks
                ``cont:author``.
        """
        connection = happybase.Connection(self.hostname, autoconnect=False)
        connection.open()
        try:
            print("已成功连接到Hbase")
            print("准备连接到表weibo_content")
            table = connection.table(self.table_name)
            # scan() without arguments walks every row and every column.
            scanner = table.scan()
            print("已成功连接到Hbase中表weibo_content")
            print("开始读取%s之后的数据" % (self.start_date))
            # The scanner is consumed here, while the connection is still
            # open; the list is fully built before the connection is closed.
            return self._collect_rows(scanner, self.start_date, TimeMatch)
        finally:
            # Release the Thrift connection even if the scan fails midway.
            connection.close()

    @staticmethod
    def _collect_rows(rows, start_date, parse_time):
        """Build one record per row dated at or after ``start_date``.

        Args:
            rows: Iterable of ``(row_key, columns)`` pairs, where ``columns``
                maps column names to values.
            start_date: Inclusive lower bound for ``cont:pubDate``.
            parse_time: Callable applied to the publication date; its result
                is stored under ``Timedict``.

        Returns:
            A list of independent dicts, in scan order.
        """
        records = []
        for _row_key, data in rows:
            pub_date = data['cont:pubDate']
            if not pub_date >= start_date:
                continue
            # Skip rows that have no content column instead of reusing the
            # content of an earlier row.
            if 'cont:content' not in data:
                continue
            time_info = parse_time(pub_date)
            # A fresh dict per row: records already stored are never touched.
            records.append({
                'pubDate': pub_date,
                'author': data['cont:author'],
                'content': data['cont:content'],
                'Timedict': time_info,
            })
        return records

三、调用

mydata为读出的数据

mydata = GetHbase(host, table_name, start_date).getdata()

你可能感兴趣的:(spark,我的笔记)