Exception in thread "main" java.lang.UnsupportedOperationException: No Encoder found for org.apache.spark.sql.Row
- field (class: "org.apache.spark.sql.Row", name: "_2")
- root class: "scala.Tuple2"
at org.apache.spark.sql.catalyst.ScalaReflection$.org$apache$spark$sql$catalyst$ScalaReflection$$serializerFor(ScalaReflection.scala:625)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$10.apply(ScalaReflection.scala:619)
at org.apache.spark.sql.catalyst.ScalaReflection$$anonfun$10.apply(ScalaReflection.scala:607)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)
at scala.collection.immutable.List.foreach(List.scala:381)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)
at scala.collection.immutable.List.flatMap(List.scala:344)
at org.apache.spark.sql.catalyst.ScalaReflection$.org$apache$spark$sql$catalyst$ScalaReflection$$serializerFor(ScalaReflection.scala:607)
at org.apache.spark.sql.catalyst.ScalaReflection$.serializerFor(ScalaReflection.scala:438)
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$.apply(ExpressionEncoder.scala:71)
at org.apache.spark.sql.Encoders$.product(Encoders.scala:275)
at org.apache.spark.sql.LowPrioritySQLImplicits$class.newProductEncoder(SQLImplicits.scala:233)
at org.apache.spark.sql.SQLImplicits.newProductEncoder(SQLImplicits.scala:33)
at com.xmd.flow.test$.specifyingTheSchema(test.scala:57)
at com.xmd.flow.test$.main(test.scala:105)
at com.xmd.flow.test.main(test.scala)
The contents of people.txt:
Michael, 29, 男
Andy, 30, 女
Justin, 19, 男
Andy, 31, 女
Justin, 18, 男
The method containing the erroneous code:
/**
* Spark SQL supports two different ways of converting an existing RDD into a Dataset:
* 1. inferring the schema via reflection
* 2. specifying the schema programmatically
* This example uses the second approach and specifies the schema programmatically.
*
* @param spark the active SparkSession
*/
def specifyingTheSchema(spark: SparkSession): Unit = {
import org.apache.spark.sql.types._
import spark.implicits._
// Create an RDD
val peopleRDD = spark.sparkContext.textFile("spark-2.2.0/examples/src/main/resources/people.txt")
// The schema is encoded as a string
val schemaString = "name age gender"
// Build the schema from the string
val fields = schemaString.split(" ")
.map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
// Convert the RDD records to Rows
val rowRDD = peopleRDD
.map(_.split(","))
.map(attributes => Row(attributes(0), attributes(1).trim, attributes(2).trim))
// Apply the schema to the RDD to obtain a DataFrame
val peopleDF = spark.createDataFrame(rowRDD, schema)
peopleDF.createOrReplaceTempView("people")
// SQL can be run over the temporary view created from the DataFrame
val results = spark.sql("SELECT * FROM people")//.rdd
println(results.schema)
results.show()
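// BUG: this map needs an Encoder[(String, Row)]; spark.implicits._ cannot
// supply one because there is no implicit Encoder[Row] (this is test.scala:57
// in the stack trace above)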
results.map{row => (row.getAs[String]("name") + "#" + row.getAs[String]("age"), row)}
.rdd
.groupByKey()
.foreach(item => {
println(item._1)
for(row <- item._2) {
println("------" + row.getAs[String]("gender"))
}
})
}
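The map marked above is what triggers the exception: Dataset.map must serialize its result, and spark.implicits._ derives encoders for primitives, case classes, and tuples of those, but there is no implicit Encoder for org.apache.spark.sql.Row. One way around the problem, besides supplying an explicit encoder as in the fix below, is to convert to an RDD before the map, since RDD transformations do not require encoders. A minimal sketch of that variant:

// Sketch: drop to the RDD API first, so no Encoder[(String, Row)] is needed
results.rdd
  .map(row => (row.getAs[String]("name") + "#" + row.getAs[String]("age"), row))
  .groupByKey()
  .foreach { case (key, rows) =>
    println(key)
    rows.foreach(row => println("------" + row.getAs[String]("gender")))
  }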
3. The correct code:
def specifyingTheSchema(spark: SparkSession): Unit = {
import org.apache.spark.sql.types._
import spark.implicits._
// Create an RDD
val peopleRDD = spark.sparkContext.textFile("/spark-2.2.0/examples/src/main/resources/people.txt")
// The schema is encoded as a string
val schemaString = "name age gender"
// Build the schema from the string
val fields = schemaString.split(" ")
.map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
// Convert the RDD records to Rows
val rowRDD = peopleRDD
.map(_.split(","))
.map(attributes => Row(attributes(0), attributes(1).trim, attributes(2).trim))
// Apply the schema to the RDD to obtain a DataFrame
val peopleDF = spark.createDataFrame(rowRDD, schema)
peopleDF.createOrReplaceTempView("people")
// SQL can be run over the temporary view created from the DataFrame
val results = spark.sql("SELECT * FROM people")//.rdd
println(results.schema)
results.show()
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.Encoders
// implicit val encoder = RowEncoder(schema)
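// Build an explicit Encoder[(String, Row)]: pair the built-in String encoder
// with a RowEncoder derived from the schema of the query result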
val encoder = Encoders.tuple(
Encoders.STRING,
RowEncoder(
results.schema
/*StructType(Seq(
StructField("name", StringType),
StructField("age", StringType),
StructField("gender", StringType)))*/
))
// val conversionBaseKV: KeyValueGroupedDataset[String, (String, String)] =
results.map{row => (row.getAs[String]("name") + "#" + row.getAs[String]("age"), row)}(encoder)
.rdd
.groupByKey()
.foreach(item => {
println(item._1)
for(row <- item._2) {
println("------" + row.getAs[String]("gender"))
}
})
}
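For reference, the same grouping can also stay entirely in the Dataset API: groupByKey and mapGroups accept explicit encoders in the same way. A minimal sketch, assuming the same results Dataset as above:

// Sketch: group on the Dataset itself instead of dropping to an RDD.
// groupByKey needs an encoder for the key, mapGroups one for the output.
results
  .groupByKey(row => row.getAs[String]("name") + "#" + row.getAs[String]("age"))(Encoders.STRING)
  .mapGroups((key, rows) => key + " -> " + rows.map(_.getAs[String]("gender")).mkString(","))(Encoders.STRING)
  .show(false)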