Spark 2.2.0: writing data into Elasticsearch 7.2.0 (IDEA sbt project)

Before we begin, the environment:

IDEA sbt project

Spark 2.2.0

CDH 6.0.1

Elasticsearch 7.2.0

Step 1. Required sbt dependencies

name := "biz_xy_diy"

version := "0.1"

scalaVersion := "2.11.8"


resolvers ++= Seq(
  "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cdh-releases-rcs/",
  "Elasticsearch Repository" at "https://s3.amazonaws.com/download.elasticsearch.org/lucenesnapshots/83f9835"
)

libraryDependencies += "org.apache.spark"%"spark-core_2.11"%"2.2.0-cdh6.0.1"

libraryDependencies += "org.apache.spark"%"spark-sql_2.11"%"2.2.0-cdh6.0.1"

libraryDependencies += "org.apache.spark" % "spark-streaming_2.11" % "2.2.0-cdh6.0.1"

libraryDependencies += "org.apache.spark" % "spark-streaming-kafka-0-10_2.11" % "2.2.0-cdh6.0.1"

libraryDependencies += "org.apache.spark" % "spark-mllib_2.11" % "2.2.0-cdh6.0.1"

libraryDependencies += "org.apache.spark" % "spark-hive_2.11" % "2.2.0-cdh6.0.1"

libraryDependencies += "org.apache.hbase" % "hbase-client" % "2.0.0-cdh6.0.1"

libraryDependencies += "org.apache.hbase" % "hbase-common" % "2.0.0-cdh6.0.1"

libraryDependencies += "org.apache.hbase" % "hbase-server" % "2.0.0-cdh6.0.1"

libraryDependencies += "org.apache.hbase" % "hbase-mapreduce" % "2.0.0-cdh6.0.1"

libraryDependencies += "org.apache.hbase" % "hbase-spark" % "2.0.0-cdh6.0.1"

libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.6"
// https://mvnrepository.com/artifact/org.apache.hbase/hbase
libraryDependencies += "org.apache.hbase" % "hbase" % "2.0.0-cdh6.0.1" pomOnly()
// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common
libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "3.0.0-cdh6.0.1"

libraryDependencies += "com.fasterxml.jackson.core" % "jackson-core" % "2.6.5"

dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.5"

dependencyOverrides += "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.6.5"

libraryDependencies += "net.sf.json-lib" % "json-lib" % "2.3" from "http://repo1.maven.org/maven2/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar"

libraryDependencies += "org.elasticsearch.client" % "elasticsearch-rest-high-level-client" % "7.2.0"

libraryDependencies += "org.elasticsearch" % "elasticsearch" % "7.2.0"

libraryDependencies += "junit" % "junit" % "4.12" % Test

libraryDependencies += "org.apache.logging.log4j" % "log4j-core" % "2.12.0"

// https://mvnrepository.com/artifact/log4j/log4j
libraryDependencies += "log4j" % "log4j" % "1.2.17"

Step 2. A utility wrapper class, ESUtils (Java)

package cms;
import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;

public class ESUtils {

    /**
     * Build a no-op bulk listener. The hooks are left empty here; in
     * production you would typically log failures in the afterBulk overloads.
     * @return a BulkProcessor.Listener
     */
    public static BulkProcessor.Listener getBulkListener() {
        return new BulkProcessor.Listener() {
            @Override
            public void beforeBulk(long executionId, BulkRequest request) {
                // called just before each bulk request is executed
            }

            @Override
            public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
                // called after a bulk completes; check response.hasFailures() here
            }

            @Override
            public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
                // called when a whole bulk request fails (e.g. connection error)
            }
        };
    }

    /**
     * Build a configured BulkProcessor.
     * @param client      the high-level REST client
     * @param listener    bulk lifecycle listener
     * @param bulkcnumber flush after this many actions
     * @return a configured BulkProcessor
     */
    public static BulkProcessor getBulkprocessor(RestHighLevelClient client,
                                                 BulkProcessor.Listener listener,
                                                 int bulkcnumber) {
        BulkProcessor.Builder builder = BulkProcessor.builder(
                (request, bulkListener) ->
                        client.bulkAsync(request, RequestOptions.DEFAULT, bulkListener),
                listener);
        builder.setBulkActions(bulkcnumber);                          // flush after N actions...
        builder.setBulkSize(new ByteSizeValue(1L, ByteSizeUnit.MB));  // ...or after 1 MB of data...
        builder.setFlushInterval(TimeValue.timeValueSeconds(10L));    // ...or every 10 seconds
        builder.setConcurrentRequests(0);                             // 0 = execute bulks synchronously
        builder.setBackoffPolicy(BackoffPolicy
                .constantBackoff(TimeValue.timeValueSeconds(1L), 3)); // retry rejected bulks up to 3 times
        return builder.build(); // build from the configured builder, not a fresh one
    }
}
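
The listener above is intentionally a no-op, so failed documents are dropped silently. For visibility into rejections, a logging variant can be passed to getBulkprocessor instead; a sketch in Scala (the logger name "esBulk" is illustrative):

import org.apache.log4j.Logger
import org.elasticsearch.action.bulk.{BulkProcessor, BulkRequest, BulkResponse}

val loggingListener = new BulkProcessor.Listener {
  private val log = Logger.getLogger("esBulk")
  override def beforeBulk(executionId: Long, request: BulkRequest): Unit =
    log.debug(s"bulk $executionId: sending ${request.numberOfActions()} actions")
  override def afterBulk(executionId: Long, request: BulkRequest, response: BulkResponse): Unit =
    if (response.hasFailures) log.warn(s"bulk $executionId: ${response.buildFailureMessage()}")
  override def afterBulk(executionId: Long, request: BulkRequest, failure: Throwable): Unit =
    log.error(s"bulk $executionId failed entirely", failure)
}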

Step 3. Writing from MySQL into ES (asynchronous bulk submission)

import java.util
import java.util.Properties
import java.util.concurrent.TimeUnit

import cms.ESUtils
import org.apache.http.HttpHost
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.elasticsearch.action.index.IndexRequest
import org.elasticsearch.client.{RestClient, RestHighLevelClient}

class ReadMsql2EsAsync extends Serializable {
  Logger.getLogger("org").setLevel(Level.ERROR) // quiet Spark's internal logging
  val HOST = "yourhostname"
  val PORT = 9200
  val HTTP = "http"
  val MODEL_LOCAL = "local[*]" // yarn | local[*]

  def readMysql: DataFrame = {
    val sparkSession = SparkSession
      .builder()
      .appName("testwES")
      .master(MODEL_LOCAL)
      .getOrCreate()
    val pro = new Properties()
    pro.setProperty("user", "username")
    pro.setProperty("password", "*****")
    sparkSession.read.jdbc("jdbc:mysql://yourhostname:3306/test", "vendor2", pro)
  }

  def w2es(): Unit = {
    val df = readMysql
    val indexName = "test_index"  // index name
    val indexType = "cms_su_dtl"  // mapping type (deprecated in ES 7, still accepted)
    val start = System.currentTimeMillis()

    df.foreachPartition(par => {
      // one client and one bulk processor per partition, created on the executor
      val listener = ESUtils.getBulkListener
      val client = new RestHighLevelClient(RestClient.builder(new HttpHost(HOST, PORT, HTTP)))
      val bulkprocessor = ESUtils.getBulkprocessor(client, listener, 400)
      // reuse one map across rows; IndexRequest.source serializes it immediately
      val mapf = new util.HashMap[String, Any]()
      par.foreach(x => {
        mapf.put("id", x.get(x.fieldIndex("id")))
        mapf.put("name", x.get(x.fieldIndex("name")))
        mapf.put("serialNumber", x.get(x.fieldIndex("serialNumber")))
        mapf.put("price", x.get(x.fieldIndex("price")))
        mapf.put("stock_number", x.get(x.fieldIndex("stock_number")))
        // mapf.put("create_time", x.get(x.fieldIndex("create_time")))
        mapf.put("venName", x.get(x.fieldIndex("venName")))
        val id = mapf.get("id").toString
        val request = new IndexRequest(indexName, indexType)
        request.id(id)
        request.timeout("3m")
        request.source(mapf)
        bulkprocessor.add(request)
      })
      // awaitClose flushes any buffered requests and closes the processor
      bulkprocessor.awaitClose(30L, TimeUnit.SECONDS)
      client.close()
    })
    val end = System.currentTimeMillis()
    val spend = (end - start) / 1000
    Logger.getLogger("spend").info(indexName + " :" + spend + " s")
  }
}
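
To run the job, a minimal entry point is enough (the object name here is hypothetical):

object ReadMsql2EsAsyncApp {
  def main(args: Array[String]): Unit = {
    new ReadMsql2EsAsync().w2es()
  }
}

Note that the client and BulkProcessor are created inside foreachPartition on purpose: they run on the executors, and RestHighLevelClient is not serializable, so they cannot be created once on the driver and shipped with the closure.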
