pyspark读取hbase,并将spark-rdd转化为dataframe

pyspark连接hbase,并将spark-rdd转化为dataframe

建立spark连接,获取rdd

#-*- coding:utf-8 -*-
import  json
from pyspark.sql import SparkSession

# ZooKeeper quorum host of the HBase cluster.
host = '192.168.11.xxx'
# Name of the HBase table to read.
table = 'I_OCS_COLLECT'

# Create the Spark session. The "yarn-client" master string is deprecated
# since Spark 2.0 and rejected by Spark 3.x; the supported spelling is
# master "yarn" combined with the client deploy mode.
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .appName("test")
    .getOrCreate()
)

# Configuration handed to TableInputFormat.
# To restrict the scan to a row-key range, additionally set
# "hbase.mapreduce.scan.row.start" and "hbase.mapreduce.scan.row.stop".
hbaseconf = {
    "hbase.zookeeper.quorum": host,
    "hbase.mapreduce.inputtable": table,
}

# JVM-side converters that turn the HBase key/value objects into strings
# before they are handed to Python.
keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"

# Read the table as an RDD of (key, value) string pairs.
hbase_rdd = spark.sparkContext.newAPIHadoopRDD(
    "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
    "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "org.apache.hadoop.hbase.client.Result",
    keyConverter=keyConv,
    valueConverter=valueConv,
    conf=hbaseconf,
)

数据处理

定义函数deal_missing_dec:传入除row key外需要保留的列名,返回一个处理函数;若某行缺少其中的列,则用'missing'填充

def deal_missing_dec(no_row_key_colnames):
    """Build a mapper that normalizes every row to a fixed set of columns.

    Args:
        no_row_key_colnames: Column names (row key excluded) that every
            output row must contain.

    Returns:
        A function taking ``(row_key, {column: value})`` and returning
        ``(row_key, columns)``. ``columns`` holds exactly the requested
        names, in the requested order; a name absent from the input row is
        filled with the string ``'missing'`` and extra input columns are
        dropped.
    """
    # Imported locally so the function does not depend on file-level imports.
    from collections import OrderedDict

    def deal_missing(x):
        cells = x[1]
        # An OrderedDict keeps the values in column-list order. A plain dict
        # only guarantees that from Python 3.7 on, and consumers that read
        # ``.values()`` positionally need the order to match the column names.
        result = OrderedDict(
            (name, cells.get(name, 'missing')) for name in no_row_key_colnames
        )
        return (x[0], result)

    return deal_missing

将rdd转化为dataframe

def deal_row(x):
    """Flatten a ``(row_key, {column: value})`` pair into one flat list.

    The row key comes first, followed by the mapping's values in the
    mapping's own iteration order.
    """
    row_key, cells = x[0], x[1]
    flat = [row_key]
    flat.extend(cells.values())
    return flat
    
def call_transfor(cells):
    """Collapse the JSON cell strings of one HBase row into a dict.

    Args:
        cells: Iterable of JSON strings, one per HBase cell.
            NOTE(review): assumes the format emitted by
            HBaseResultToStringConverter, i.e. each object carries a
            'qualifier' and a 'value' key - confirm against the converter
            actually deployed.

    Returns:
        ``{qualifier: value}`` for the row. Cells sharing a qualifier
        overwrite each other; the last one wins.
    """
    row = {}
    for cell in cells:
        if not cell:
            # Splitting an empty value on newlines yields [''], which is
            # not valid JSON; skip such fragments.
            continue
        parsed = json.loads(cell)
        row[parsed['qualifier']] = parsed['value']
    return row


def rdd_to_df(hbase_rdd):
    """Convert an RDD of HBase rows into a DataFrame.

    Args:
        hbase_rdd: RDD of ``(row_key, value)`` pairs where ``value`` is a
            newline-separated string holding one JSON object per cell.

    Returns:
        A DataFrame whose first column is ``row_key`` followed by one
        column per name found in the sampled row. Rows lacking one of
        those columns get the string ``'missing'`` in its place.

    Raises:
        ValueError: If ``hbase_rdd`` contains no rows, since no column
            names can be derived.
    """
    data_split = hbase_rdd.map(lambda x: (x[0], x[1].split('\n')))
    data_cols = data_split.map(lambda x: (x[0], call_transfor(x[1])))

    # The column names come from the second row when there is one (as
    # before); a single-row table falls back to its only row instead of
    # failing with an IndexError.
    sample = data_cols.map(lambda x: list(x[1])).take(2)
    if not sample:
        raise ValueError("cannot build a DataFrame from an empty HBase RDD")
    no_row_key_colnames = sample[-1]

    deal_missing = deal_missing_dec(no_row_key_colnames)
    no_missing = data_cols.map(deal_missing)
    data = no_missing.map(deal_row).toDF(['row_key'] + no_row_key_colnames)
    return data

调用
fdc_data = rdd_to_df(hbase_rdd)
# DataFrame.show() prints the rows itself and returns None, so it is called
# directly; wrapping it in print only adds a stray "None" line and, written
# as a print statement, is a syntax error on Python 3.
fdc_data.show()
结果:
pyspark读取hbase,并将spark-rdd转化为dataframe_第1张图片

参考文章:
https://blog.csdn.net/hchzhao_1985/article/details/82717949
http://dblab.xmu.edu.cn/blog/1715-2/

你可能感兴趣的:(pyspark)