Spark: reading and writing Elasticsearch

This post walks through reading and writing Elasticsearch from Spark via the elasticsearch-hadoop connector (EsSpark for RDDs, EsSparkSQL for DataFrames).

Test code:

package make.zhangsheniMain

import make.bean.CaseClass.{people, people_id}
import make.service.EsService
import make.tools.{DateUtils, PropertiesTool}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object es_test {

  def properties_test(): Unit = {
    val username = PropertiesTool.getproperties("username", "jdbc.properties")
    val password = PropertiesTool.getproperties("password", "jdbc.properties")

    println(username, password)
  }

  def test1(spark: SparkSession): Unit = {
    // Read person/man as an RDD of (id, field map) and build people case classes.
    val resrdd: RDD[people] = EsService.read4es(spark, "person/man").map(x => {
      val persons = x._2.toMap

      val name = persons.getOrElse("name", "").toString
      val age = persons.getOrElse("age", 0).asInstanceOf[Int]
      val date = DateUtils.format_date(persons.getOrElse("date", "").toString)
      people(name, age, date)
    })
    //    resrdd.foreach(println)
    //    EsService.save2es(resrdd, "people/man")
    val df = EsService.read4es_df(spark, "person/man")
    df.show()
    println(df.schema)
    df.createOrReplaceTempView("tmp")
    // Dates come back as java.util.Date-style strings; parse them with
    // unix_timestamp and the matching pattern, then render a standard timestamp.
    val sql =
      """
        |select name, age, from_unixtime(unix_timestamp(date,'EEE MMM dd HH:mm:ss zzz yyyy')) as date from tmp
      """.stripMargin

    spark.sql(sql).show()
  }

  def test2(spark: SparkSession): Unit = {
    val query =
      """
        |{
        |  "query": {
        |    "match": {
        |      "name": "make"
        |    }
        |  }
        |}
      """.stripMargin
    val resrdd = EsService.query4es(spark, query, "people/man")
    resrdd.foreach(println)
  }

  def test3(spark: SparkSession): Unit = {
    val query =
      """
        |{
        |  "query": {
        |    "match": {
        |      "name": "瓦力"
        |    }
        |  }
        |}
      """.stripMargin
    // Query ES, then write the results into another index.
    val resrdd: RDD[people_id] = EsService.query4es(spark, query, "person/man").map(line => {
      val peopleid = line._1
      val persons = line._2.toMap

      val name = persons.getOrElse("name", "").toString
      val age = persons.getOrElse("age", 0).asInstanceOf[Int]
      val date = DateUtils.format_date(persons.getOrElse("date", "").toString)
      people_id(peopleid, name, age, date)
    })
    resrdd.foreach(println)
    EsService.save2es(resrdd, "people/man")
  }


  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("IndicServiceApp")
      .config("spark.network.timeout","1200s")
      .getOrCreate()

//    test1(spark)
//    test2(spark)
    test3(spark)
  }

  case class person(id: Int, name: String, age: Int, date: String)
}
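
The people / people_id case classes and the DateUtils helper are imported from make.bean.CaseClass and make.tools but not shown in the post. Below is a minimal sketch reconstructed from how they are used above; the field names and order are fixed by the calls in test1/test3, while the date pattern is an assumption based on the SQL in test1:

object CaseClass {
  // Shape inferred from test1/test3; must line up with the index mapping.
  case class people(name: String, age: Int, date: String)
  // peopleid feeds es.mapping.id in EsService, so it becomes the document _id.
  case class people_id(peopleid: String, name: String, age: Int, date: String)
}

object DateUtils {
  import java.text.SimpleDateFormat
  import java.util.Locale

  // Assumption: raw values look like java.util.Date#toString output
  // ("EEE MMM dd HH:mm:ss zzz yyyy", the same pattern the SQL in test1 parses);
  // normalize them to yyyy-MM-dd HH:mm:ss.
  def format_date(raw: String): String =
    if (raw == null || raw.isEmpty) ""
    else {
      val in  = new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy", Locale.ENGLISH)
      val out = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      out.format(in.parse(raw))
    }
}

Compiling any of this also requires the elasticsearch-hadoop Spark connector on the classpath (for Spark 2.x the org.elasticsearch:elasticsearch-spark-20 artifact; pick the build matching your Scala and ES versions).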

ES service wrapper code:

package make.service

/**
  * @Author: maketubu
  * @Date: 2019/10/31 14:54
  */
import make.tools.PropertiesTool
import org.elasticsearch.spark.rdd.EsSpark
import make.bean.CaseClass._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.elasticsearch.spark.sql.EsSparkSQL

object EsService {
  val nodes = PropertiesTool.getproperties("es.nodes", "es.properties")
  val port = PropertiesTool.getproperties("es.port", "es.properties")
  val cluster_name = PropertiesTool.getproperties("cluster.name", "es.properties")

  val essaveconfig = Map("es.nodes" -> nodes
    , "es.port" -> port
    , "es.write.operation" -> "upsert"   // upsert: documents with the same id get updated
    , "cluster.name" -> cluster_name
    , "es.mapping.id" -> "peopleid"      // case class field to use as the document _id
    , "spark.es.mapping.date.rich" -> "false")

  val esconfig = Map("es.nodes" -> nodes
    , "es.port" -> port
    , "es.write.operation" -> "upsert"
    , "cluster.name" -> cluster_name)

  // RDD[caseClass]: the case class structure must match the index structure.
  // If you specify an id column on insert, the auto-generated id column may end up empty.
  def save2es(rdd: RDD[people_id], path: String): Unit = {
//    import spark.implicits._
//    val df = rdd.toDF()
//    EsSparkSQL.saveToEs(df, path, essaveconfig)
    EsSpark.saveToEs(rdd, path, essaveconfig)
  }

  
  def read4es_df(spark: SparkSession, path: String): DataFrame = {
    val df: DataFrame = spark.sqlContext.read.format("org.elasticsearch.spark.sql").options(esconfig).load(path)
    df
  }

  // Returns (document id, field map) pairs.
  def read4es(spark: SparkSession, path: String): RDD[(String, collection.Map[String, AnyRef])] = {
    val sc = spark.sparkContext
    val resrdd = EsSpark.esRDD(sc, path, esconfig)
    resrdd
  }

  def query4es(spark: SparkSession, query: String, path: String): RDD[(String, collection.Map[String, AnyRef])] = {
    val sc = spark.sparkContext
    val resrdd = EsSpark.esRDD(sc, path, query, esconfig)
    resrdd
  }
}
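
PropertiesTool is not shown in the post either. A minimal sketch that matches the getproperties(key, file) calls above, assuming the .properties files sit on the classpath (the loading strategy is an assumption, not the author's original):

object PropertiesTool {
  import java.util.Properties

  // Look up key in a .properties file found on the classpath.
  def getproperties(key: String, file: String): String = {
    val props = new Properties()
    val in = getClass.getClassLoader.getResourceAsStream(file)
    require(in != null, s"$file not found on classpath")
    try props.load(in) finally in.close()
    props.getProperty(key)
  }
}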

The writes here all go through case classes; you can also write Maps or JSON strings (a sketch of both follows the link). For details, see this post:

https://www.iteblog.com/archives/1728.html#MapElasticSearch
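
A short sketch of both variants, using the same EsSpark API as above. The index name and the node/port values are placeholders: EsSpark.saveToEs accepts an RDD of Maps, and EsSpark.saveJsonToEs accepts an RDD of JSON strings.

import org.apache.spark.SparkContext
import org.elasticsearch.spark.rdd.EsSpark

object OtherWriteShapes {
  // Placeholder connection settings; point these at your cluster.
  val cfg = Map("es.nodes" -> "localhost", "es.port" -> "9200")

  def run(sc: SparkContext): Unit = {
    // Map form: each Map becomes one document, keys become field names.
    val mapRdd = sc.makeRDD(Seq(Map("name" -> "make", "age" -> 27)))
    EsSpark.saveToEs(mapRdd, "people/man", cfg)

    // JSON form: each string must already be one complete JSON document.
    val jsonRdd = sc.makeRDD(Seq("""{"name":"make","age":27}"""))
    EsSpark.saveJsonToEs(jsonRdd, "people/man", cfg)
  }
}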
