A recent project of mine integrated Spark with ES, which involved a series of operations on DataFrame columns; most of the time went into adding, dropping, and modifying columns.
The job as a whole uses a Spark + ES + Redis architecture: ES stores the base data, Redis stores the filter conditions, and the records matching the Redis conditions are selected out of ES.
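The Redis side is not shown below (the full code hardcodes topicMap), but as a minimal sketch it could look like this, assuming a Jedis client and a hypothetical hash named topic_conditions that maps topic IDs to condition strings (neither name is from the original code):

import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

// hypothetical: load the topic -> condition map that the job below hardcodes as topicMap
val jedis = new Jedis("127.0.0.1", 6379)
val topicMap: Map[String, String] = jedis.hgetAll("topic_conditions").asScala.toMap
jedis.close()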
val sqlContex = SparkSession.builder().master("local[2]")
  .appName("spark2ES")
  // .config("hive.metastore.uris", "thrift://172.1.1.199:9083") // Hive integration
  .config("spark.testing.memory", "2147480000")
  .config("es.index.auto.create", "true")
  .config("es.nodes", "xxxx")
  // .enableHiveSupport() // enable Hive support
  .getOrCreate()
// Read data from ES
val optionMap = Map("path" -> "ods_wj_apk_index/docs", "pushdown" -> "true") val esDF = sqlContex.read.format("org.elasticsearch.spark.sql").options(optionMap).load()
// Adding a column
// Method 1: register a SQL UDF
val topic = "topic123"
tempDataFrame.createOrReplaceTempView("temp")
sqlContex.sqlContext.udf.register("replaceCol", (str: String) => topic)
val addDf = sqlContex.sqlContext.sql(s"select *, replaceCol(content) as topicName from temp")
addDf.show()

// Method 2: DataFrame withColumn with a UDF (requires import org.apache.spark.sql.functions.udf)
val topic = "topic123"
val replace = (x: String) => { topic }
val replaceCol = udf(replace)
val data = tempDataFrame.withColumn("topicName", replaceCol(tempDataFrame("content")))
data.show()
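Since the added column is a constant here, Spark's built-in lit function is a simpler alternative that avoids defining a UDF at all (a minimal sketch using the same names as above):

import org.apache.spark.sql.functions.lit
val withTopic = tempDataFrame.withColumn("topicName", lit(topic))
withTopic.show()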
// Modifying column values: rebuild the DataFrame by first selecting the columns that stay unchanged, then constructing the new columns with a UDF and adding them to the new DataFrame
esDF.createOrReplaceTempView("temp")
// register the UDF
sqlContex.udf.register("Time2Time", Time2Time _)
// get the names of the columns that stay unchanged
val linkedColNames = getLinkedColNames(esDF.schema.fieldNames)
val addDf = sqlContex.sqlContext.sql(
  s"select $linkedColNames, Time2Time(update_time) AS update_time, Time2Time(create_time) AS create_time from temp where $conditon")
// addDf.saveToEs("ods_wj_scenes_detail/docs")
addDf.select("update_time").show(50)
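Alternatively, withColumn overwrites an existing column when the name matches, which avoids rebuilding the select list entirely (a sketch reusing the Time2Time function from the full code below):

import org.apache.spark.sql.functions.udf
val time2TimeUdf = udf(Time2Time _)
val updated = esDF
  .withColumn("update_time", time2TimeUdf(esDF("update_time")))
  .withColumn("create_time", time2TimeUdf(esDF("create_time")))
updated.select("update_time").show(50)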
// Dropping a column
val dropped = tempDataFrame.drop("topicName") // drop returns a new DataFrame; tempDataFrame itself is unchanged
The complete code follows:
import java.text.SimpleDateFormat

import org.apache.spark.sql.SparkSession

object EsFilterforNetworkMonitoring {

  def main(args: Array[String]): Unit = {
    val sqlContex = SparkSession.builder().master("local[2]")
      .appName("spark2ES")
      // .config("hive.metastore.uris", "thrift://172.1.1.199:9083")
      .config("spark.testing.memory", "2147480000")
      .config("es.index.auto.create", "true")
      .config("es.nodes", "172.10.4.4:9200,172.10.4.5:9200,172.10.4.6:9200")
      // .enableHiveSupport()
      .getOrCreate()

    // section 1: read data from ES
    val optionMap = Map("path" -> "ods_wj_apk_index/docs", "pushdown" -> "true")
    val esDF = sqlContex.read.format("org.elasticsearch.spark.sql").options(optionMap).load()

    val topicMap = Map("topic123456789" -> "1==1&&2==2")

    // filter out the records that satisfy each condition
    for ((k, v) <- topicMap) {
      val topic = k
      val conditon = v.replaceAll("==", "=").replaceAll("&&", " and ").replace("||", " or ")
      println("============================= " + conditon)
      esDF.createOrReplaceTempView("temp")
      // register the UDFs
      sqlContex.udf.register("Time2Time", Time2Time _)
      sqlContex.sqlContext.udf.register("replaceCol", (str: String) => topic)
      // get the unchanged column names
      val linkedColNames = getLinkedColNames(esDF.schema.fieldNames)
      println("linkedNames : " + linkedColNames)
      val addDf = sqlContex.sqlContext.sql(
        s"select $linkedColNames, Time2Time(update_time) AS update_time, Time2Time(create_time) AS create_time, replaceCol(apk_name) as topicName from temp where $conditon")
      // addDf.saveToEs("ods_wj_scenes_detail/docs")
      addDf.select("update_time").show(50)
    }
  }

  def Time2Time(tp: java.sql.Timestamp): String = {
    if (tp == null) {
      null
    } else {
      try {
        val sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
        val datestr: String = sdf.format(tp.getTime)
        datestr
      } catch {
        case e: Exception =>
          e.printStackTrace()
          null
      }
    }
  }

  // concatenate the column names, skipping the two time columns
  def getLinkedColNames(arr: Array[String]): String = {
    var linkedColName = ""
    for (ele <- arr if !("update_time".equals(ele) || "create_time".equals(ele))) {
      if (linkedColName.length == 0) {
        linkedColName = ele
      } else {
        linkedColName = linkedColName + "," + ele
      }
    }
    linkedColName
  }
}
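One note on the commented-out saveToEs call: that method is added to DataFrame by the connector's implicit conversions, so writing the result back to ES needs the corresponding import (a sketch; the index name is the one from the comment in the code):

import org.elasticsearch.spark.sql._
addDf.saveToEs("ods_wj_scenes_detail/docs")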