There are plenty of tutorials online for importing Hive data into Elasticsearch, but for the reverse direction, Elasticsearch into Hive, I only found two approaches:
1. Create a Hive table mapped onto Elasticsearch (backed by the elasticsearch-hadoop storage handler), then use an INSERT INTO or INSERT OVERWRITE statement to copy the data into a regular Hive table.
2. Implement the transfer in code.
Today I'll introduce the second approach, using Spark to move data from Elasticsearch to Hive. Without further ado, straight to the code.
The pom.xml is as follows:
<project xmlns="http://maven.apache.org/POM/4.0.0">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.taobao.ym_dmp</groupId>
    <artifactId>dmp_tags</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <scala.version>2.11.0</scala.version>
        <jdk.version>1.8</jdk.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.4</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.scala</groupId>
            <artifactId>mongo-scala-driver_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
        <dependency>
            <groupId>org.mongodb.spark</groupId>
            <artifactId>mongo-spark-connector_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>cz.mallat.uasparser</groupId>
            <artifactId>uasparser</artifactId>
            <version>0.6.2</version>
        </dependency>
        <dependency>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
            <version>2.10.1</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.58</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.13</version>
        </dependency>
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
            <version>1.11.588</version>
        </dependency>
        <dependency>
            <groupId>com.aerospike</groupId>
            <artifactId>aerospike-client</artifactId>
            <version>4.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch-spark-20_2.11</artifactId>
            <version>7.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>6.2.4</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>build-helper-maven-plugin</artifactId>
                <version>1.9.1</version>
                <executions>
                    <execution>
                        <id>add-source</id>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>add-source</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>${jdk.version}</source>
                    <target>${jdk.version}</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.5</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
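One note on the dependencies: the elasticsearch-spark connector should generally match the major version of the Elasticsearch cluster you read from, so adjust the 7.0.0 connector (and the 6.2.4 REST client, which this job does not actually use) to fit your own environment.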
The Spark code is as follows:
package com.taobao.dmp.impl

import java.net.URI
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark._

// One row per device: the IDFA plus the bundle and country arrays stored in ES
case class IfaClass(ifa: String, bundles: Array[String], countrys: Array[String])
object Es2Hive {
  def run(): Unit = {
    try {
      // Build the SparkSession, wiring in the ES connector settings
      val conf = new SparkConf()
      conf.set("es.nodes", "xxx.xxx.xxx.xxx") // Elasticsearch node IP goes here
      conf.set("es.port", "9200")
      conf.set("es.index.auto.create", "true")
      conf.set("spark.es.nodes.wan.only", "false")
      conf.set("spark.default.parallelism", "750")
      conf.set("es.batch.size.bytes", "50mb")
      conf.set("es.batch.size.entries", "10000")
      conf.set("es.scroll.size", "10000")
      val ss = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()

      generateUserTagBySpark(ss)

      // Shut down Spark
      ss.close()
    } catch {
      case e: Exception =>
        e.printStackTrace()
        throw new Exception("generate audience failed", e)
    }
  }
  def generateUserTagBySpark(ss: SparkSession): Unit = {
    import ss.implicits._

    // Pull every document from each index
    val queryDsl =
      """
        |{
        |  "query": {
        |    "match_all": {}
        |  }
        |}
      """.stripMargin

    // The source data is spread across eight indices, suffixed _0 through _7
    for (index <- 0 to 7) {
      val rdd = ss.sparkContext
        .esRDD(s"t_dmp_idfa_bundle_country_array_tbl_$index", queryDsl)
        .map(line => IfaClass(
          line._1,
          assemArr(line._2.get("bundles").toString),
          assemArr(line._2.get("countrys").toString)))
      val ifaBundleCountryResult = rdd.toDF()

      println(s"generate final t_dmp_idfa_bundle_country_array_tbl_$index start")
      // Delete the previous output before writing the new snapshot
      val MediaFilePath = s"s3://www.taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl_$index"
      FileSystem.get(new URI("s3://www.taobao.com"), ss.sparkContext.hadoopConfiguration)
        .delete(new Path(MediaFilePath), true)
      ifaBundleCountryResult.repartition(500).write.format("orc").save(MediaFilePath)
      println(s"write to t_dmp_idfa_bundle_country_array_tbl_$index success")
    }
  }
  // The connector returns multi-valued fields as Option[AnyRef], whose toString
  // looks like "Some(Buffer(a, b, c))"; strip the wrapper and split out the values
  def assemArr(assSr: String): Array[String] = {
    assSr.replace("Some(Buffer(", "").replace(")", "").split(",").map(_.trim)
  }
  def main(args: Array[String]): Unit = {
    run()
  }
}
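The job above only writes ORC files to S3; to actually query them from Hive, each output path still needs an external table over it. Below is a minimal sketch of that follow-up step, assuming the same Hive-enabled SparkSession; the dmp database and the column types are illustrative assumptions, not something the job above creates:

  // Hypothetical helper: register each ORC output path as a Hive external table.
  // The dmp database and the column types here are assumptions for illustration.
  def registerHiveTables(ss: SparkSession): Unit = {
    for (index <- 0 to 7) {
      val path = s"s3://www.taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl_$index"
      ss.sql(
        s"""
           |CREATE EXTERNAL TABLE IF NOT EXISTS dmp.t_dmp_idfa_bundle_country_array_tbl_$index (
           |  ifa STRING,
           |  bundles ARRAY<STRING>,
           |  countrys ARRAY<STRING>
           |)
           |STORED AS ORC
           |LOCATION '$path'
         """.stripMargin)
    }
  }

Alternatively, writing with ifaBundleCountryResult.write.format("orc").saveAsTable(...) would let Spark create and register the table in one step, at the cost of Spark managing the table location itself.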