When reading from HBase, the stored values come back as hex-escaped byte strings, and no matter what I tried I could not get them converted to Chinese when turning the result into a DataFrame. I went through many write-ups and every decode/encode combination I could think of without finding one that works. If you are reading this and have solved the problem, please let me know. Thanks.
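A hedged guess at the cause: the HBaseResultToStringConverter shipped in the spark-examples jar renders cell values with Bytes.toStringBinary, which escapes every non-printable byte as a \xNN sequence, so UTF-8 Chinese comes back as literal text like '\xE4\xB8\xAD'. If that matches what you are seeing, one direction to try (a sketch I have not verified against this exact converter output) is to undo the escaping on the Python side before building the DataFrame:

# Sketch (Python 2, like the script below): undo Bytes.toStringBinary's \xNN
# escaping, then decode the raw bytes as UTF-8. Untested against this exact
# converter output; adjust if your values look different.
def unescape_utf8(s):
    try:
        return s.decode('string_escape').decode('utf-8')  # '\\xE4\\xB8\\xAD' -> u'中'
    except (ValueError, UnicodeDecodeError):
        return s  # leave values that are not valid UTF-8 untouched

print(unescape_utf8('\\xE4\\xB8\\xAD\\xE6\\x96\\x87'))  # -> 中文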
To read HBase, symlink the common HBase client jars (typically hbase-client, hbase-common, hbase-server, hbase-protocol and their dependencies) from HBase's lib directory into Spark's jars directory; a sketch of that step follows below.
If spark.sql then fails to read Hive tables, also symlink hive-site.xml into Spark's conf directory.
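A minimal sketch of the jar-linking step in Python (the paths here are assumptions for a typical install; a plain ln -s in the shell does the same job):

# Hypothetical paths; adjust to your actual HBase and Spark install locations.
import glob, os
HBASE_LIB = "/opt/hbase/lib"
SPARK_JARS = "/opt/spark/jars"
for jar in glob.glob(os.path.join(HBASE_LIB, "hbase-*.jar")):
    dst = os.path.join(SPARK_JARS, os.path.basename(jar))
    if not os.path.exists(dst):
        os.symlink(jar, dst)  # symlink so HBase upgrades propagate automatically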
spark-examples_2.11-1.6.0-typesafe-001.jar download link: click here
# -*- coding: utf-8 -*-
# @Author   : Dengwenxing
# @Time     : 2019/12/30 15:09
# @File     : hbaseReader.py
import sys
import time
import json
import logging
from random import randint

from pyspark import SparkConf
from pyspark.sql import SparkSession

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# Python 2 only: force UTF-8 as the default string encoding
reload(sys)
sys.setdefaultencoding('utf-8')
warehouse_location = '/user/hive/warehouse/'

conf = SparkConf().set('spark.driver.maxResultSize', '10g')
conf.set('spark.yarn.executor.memoryOverhead', '30g')
conf.set('spark.yarn.am.cores', 5)
conf.set('spark.executor.memory', '40g')
conf.set('spark.executor.instances', 50)
conf.set('spark.executor.cores', 8)
conf.set('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC')
conf.set('spark.sql.warehouse.dir', warehouse_location)

spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .enableHiveSupport() \
    .getOrCreate()
def setHbaseConf(ips=None, tableName=None, Znode="/hbase-unsecure", using="input",
                 rowStart=None, rowEnd=None):
    '''
    :param ips: ZooKeeper quorum hosts, e.g. [ip1, ip2, ...]
    :param tableName: HBase table name
    :param Znode: HBase znode parent in ZooKeeper
    :param using: "input" builds a read conf, anything else a write conf
    :param rowStart: scan start rowkey (inclusive), read side only
    :param rowEnd: scan stop rowkey (exclusive), read side only
    :return: hbaseConf dict
    '''
    if not ips:
        logger.error("ips is null")
        sys.exit(1)
    if not tableName:
        logger.error("tablename is null")
        sys.exit(1)
    ips = ','.join(ips)
    if using == "input":
        hbaseConf = {
            "hbase.zookeeper.quorum": ips,
            "hbase.mapreduce.inputtable": tableName,
            "zookeeper.znode.parent": Znode
        }
        if rowStart is not None and rowEnd is not None:
            # TableInputFormat expects ...scan.row.start / ...scan.row.stop
            hbaseConf["hbase.mapreduce.scan.row.start"] = rowStart
            hbaseConf["hbase.mapreduce.scan.row.stop"] = rowEnd
    else:
        hbaseConf = {
            "hbase.zookeeper.quorum": ips,
            "hbase.mapred.outputtable": tableName,
            "zookeeper.znode.parent": Znode,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.hbase.io.Writable"
        }
    return hbaseConf
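# For reference, a hedged usage sketch: a read-side conf restricted to a
# rowkey range (the hosts and rowkeys here are placeholders, not real values).
demoConf = setHbaseConf(
    ips=["zk-host-1", "zk-host-2", "zk-host-3"],
    tableName="default:vertex_person",
    rowStart="10000",  # scan start, inclusive
    rowEnd="20000"     # scan stop, exclusive
)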
def hbaseRDD2DF():
    # placeholder left unimplemented; result2df below does the actual conversion
    pass
def result2df(rdd):
    ''' Turn the (rowkey, value-string) pairs from newAPIHadoopRDD into a DataFrame;
        each value string is a newline-joined list of JSON cells. '''
    def result2dict(cells):
        # one JSON object per cell -> {qualifier: value}
        result = {}
        for cell in [json.loads(i) for i in cells]:
            result[cell["qualifier"]] = cell["value"]
        return result

    def maketuple(record):
        rowkey = [record[0]]
        values = [record[1][key] for key in record[1]]
        return tuple(rowkey + values)

    res1 = rdd.map(lambda kv: (kv[0], kv[1].split('\n'))) \
              .map(lambda kv: (kv[0], result2dict(kv[1])))
    # take the column names from the first record; assumes every row has the same cells
    columns = ["rowkey"] + res1.map(lambda x: [key for key in x[1]]).take(1)[0]
    return res1.map(maketuple).toDF(columns)
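# For context, each value string the converter hands back is (going by the
# spark-examples HBaseResultToStringConverter source) a newline-joined list of
# JSON objects, one per cell. A made-up example of one pair result2df parses:
sample = (
    "10001",
    '{"row": "10001", "columnFamily": "info", "qualifier": "xm", '
    '"timestamp": "1577694549000", "type": "Put", "value": "zhangsan"}\n'
    '{"row": "10001", "columnFamily": "info", "qualifier": "age", '
    '"timestamp": "1577694549000", "type": "Put", "value": "30"}'
)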
def hbaseSimpleReader():
    ips = ["***", "***"]  # ZooKeeper quorum hosts (redacted)
    hbaseConf = setHbaseConf(ips=ips, tableName="default:vertex_person")
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
    hbase_rdd = spark.sparkContext.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        keyConverter=keyConv,
        valueConverter=valueConv,
        conf=hbaseConf
    )
    return result2df(hbase_rdd)
def hbaseSimpleWriter(rdd):
    ips = ["***", "***"]  # ZooKeeper quorum hosts (redacted)
    hbaseConf = setHbaseConf(ips=ips, tableName="vertex_person", using="output")
    keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
    rdd.saveAsNewAPIHadoopDataset(conf=hbaseConf, keyConverter=keyConv, valueConverter=valueConv)
def run():
    df = spark.read.orc("path")  # placeholder input path

    def formatHbaseOutput(row):
        ''' Emit records shaped (rowkey, [rowkey, column_family, qualifier, value]) '''
        cf = 'info'
        cols = ["zjhm", "xm", "age"]
        # salt the rowkey with a random digit to spread writes across regions
        rowkey = str(randint(1, 9)) + row.zjhm
        zjhm = (rowkey, [rowkey, cf, cols[0], row.zjhm])
        xm = (rowkey, [rowkey, cf, cols[1], row.xm])
        # StringListToPutConverter works on strings, so cast non-string values
        age = (rowkey, [rowkey, cf, cols[2], str(row.age)])
        return [zjhm, xm, age]

    rdd = df.rdd.flatMap(formatHbaseOutput)
    hbaseSimpleWriter(rdd)
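# Hypothetical illustration of formatHbaseOutput's fan-out (salt digit 7):
#   Row(zjhm='110101199001011234', xm=u'张三', age=30)
# becomes three Put-ready tuples sharing one salted rowkey:
#   ('7110...', ['7110...', 'info', 'zjhm', '110101199001011234'])
#   ('7110...', ['7110...', 'info', 'xm',   u'张三'])
#   ('7110...', ['7110...', 'info', 'age',  '30'])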
if __name__ == '__main__':
    logger.info('================ start time: %s' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    df = hbaseSimpleReader()
    df.show(10)  # sample a few rows to verify the read side
    run()
    logger.info('================ end time: %s' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
# The converter classes must be on the JVM classpath, so ship the examples jar
# with --jars rather than --py-files:
# spark-submit --jars spark-examples_2.11-1.6.0-typesafe-001.jar --master yarn --deploy-mode cluster --name hbaseReader --driver-memory 40G --queue bbd_01 hbaseReader.py