# @Author : Dengwenxing
# -*- coding: utf-8 -*-
# @Time : 2019/12/30 15:09
# @Site :
# @File : esReader.py
# @Software :
import sys
import time
import logging
from collections import OrderedDict

from pyspark import SparkConf
from pyspark.sql import SparkSession
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)
# Python 2 only: force UTF-8 as the process-wide default encoding.
# reload()/setdefaultencoding do not exist on Python 3, so guard the call.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
conf = SparkConf().set('spark.driver.maxResultSize', '2g')
conf.set('spark.yarn.am.cores', 5)
conf.set('spark.executor.memory', '10g')
conf.set('spark.executor.instances', 50)
conf.set('spark.executor.cores', 4)
conf.set('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC')
spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .appName('test_data') \
    .enableHiveSupport() \
    .getOrCreate()
def write_es(df, doc_type, index='graphspace', map_id='id', mode='append'):
    '''
    Write a DataFrame to Elasticsearch.
    :param df: result DataFrame, [id, *columns]
    :param doc_type: document type
    :param map_id: field used as the document id, defaults to "id"
    :param mode: save mode, defaults to "append"
    :param index: target index, defaults to "graphspace"
    :return:
    '''
    options = OrderedDict()
    options["es.nodes"] = "0.0.0.0,1.1.1.1"  # replace with your ES node addresses
    options["es.resource"] = "%s/%s" % (index, doc_type)
    options["es.mapping.id"] = map_id
    df.write \
        .format("org.elasticsearch.spark.sql") \
        .options(**options) \
        .mode(mode) \
        .save()
def read_es(doc_type, query='', index='graphspace'):
    '''
    Read data from Elasticsearch.
    :param doc_type: document type
    :param query: optional Elasticsearch query DSL (JSON string)
    :param index: source index, defaults to "graphspace"
    :return: DataFrame
    '''
    options = OrderedDict()
    options["es.nodes"] = "0.0.0.0,1.1.1.1"  # replace with your ES node addresses
    options["es.resource"] = "%s/%s" % (index, doc_type)
    if query:
        options["es.query"] = query
    df = spark.read \
        .format("org.elasticsearch.spark.sql") \
        .options(**options) \
        .load()
    return df
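
# A hedged sketch of the `query` parameter: elasticsearch-hadoop accepts a full
# Elasticsearch query-DSL JSON string through "es.query". The field name "name"
# below is illustrative only, matching the demo data in __main__:
#
#   match_query = '{"query": {"match": {"name": "make"}}}'
#   df = read_es('zjhm', query=match_query, index='test')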
if __name__ == '__main__':
    logger.info('================start time:%s' % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    data = [('make', 25), ('tubu', 22)]
    save_df = spark.createDataFrame(data, ['name', 'age'])
    # use "name" as the document id: the demo frame has no "id" column,
    # so the default map_id='id' would fail
    write_es(save_df, 'zjhm', index='test', map_id='name')
    df = read_es('zjhm', index='test')
    df.show()
    logger.info('=================end time:%s' % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
The code above needs one extra jar, elasticsearch-hadoop-6.5.1.jar, because it relies on the serialization classes inside it. Download whichever version matches your Elasticsearch cluster, and pass the jar with --jars when submitting the job. This is just a small demo; it still needs tuning before going to production. That's all.
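
For example, a minimal submission sketch (the master, deploy mode, jar path, and jar version below are placeholders; adjust them to your environment):

spark-submit --master yarn --deploy-mode client --jars /path/to/elasticsearch-hadoop-6.5.1.jar esReader.py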