Spark SQL 写入 Json 格式文件报错 org.apache.spark.sql.AnalysisException: Found duplicate column(s)

 

错误场景

如下两个 Json 文件

person.json

{"name":"路飞","age":17,"deptno":1,"money":15}
{"name":"索隆","age":18,"deptno":1,"money":9}
{"name":"乔巴","age":5,"deptno":1,"money":5}
{"name":"艾斯","age":18,"deptno":2,"money":18}
{"name":"萨博","age":18,"deptno":2,"money":16}
{"name":"香克斯","age":32,"deptno":3,"money":30}

dept.json

{"name":"草帽","deptno":1}
{"name":"白胡子","deptno":2}
{"name":"红发","deptno":3}

当通过如下语句 join 两张表,再把结果写入到新的 Json 文件时报错:

    // Build a local Spark context and the SQLContext used to read the inputs.
    val conf = new SparkConf()
      .setAppName("SQLContext")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // 1) Load both JSON files as DataFrames.
    val person = sqlContext.read
      .format("json")
      .load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    val dept = sqlContext.read
      .format("json")
      .load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")

    // 2) Outer-join on deptno and collapse the result into a single partition.
    val sameDept = person("deptno") === dept("deptno")
    val joined = person.join(dept, sameDept, "outer").repartition(1)

    // 3) Write the join result as JSON. Both inputs carry `deptno` and `name`,
    //    so the joined schema has duplicate column names and this write fails.
    joined.write
      .format("json")
      .save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")

错误如下:

org.apache.spark.sql.AnalysisException: Found duplicate column(s) when inserting into file:/C:/Users/LUFFY/Desktop/testData/joinJson: `deptno`, `name`;

	at org.apache.spark.sql.util.SchemaUtils$.checkColumnNameDuplication(SchemaUtils.scala:85)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:68)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:155)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:676)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:78)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:290)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
	at sparkSql.demo1.testError(demo1.scala:142)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
	at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
	at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
	at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
	at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
	at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
	at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
	at org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
	at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
	at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
	at org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
	at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
	at org.junit.runners.ParentRunner.run(ParentRunner.java:363)
	at org.junit.runner.JUnitCore.run(JUnitCore.java:137)
	at com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:68)
	at com.intellij.rt.execution.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:47)
	at com.intellij.rt.execution.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:242)
	at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:70)

 

问题原因

列名重复:person 和 dept 两张表都包含 name 和 deptno 列,join 之后的结果中这两个列名各出现两次,而写入 Json 文件时不允许存在重复的列名。

解决方案

修改列名:给两张表的所有列分别加上前缀(person_ / dept_),保证 join 之后不再有重名的列,如下:

/**
 * Joins the person and dept JSON files and saves the joined rows as JSON.
 *
 * Both inputs contain `name` and `deptno` columns, and writing a result with
 * duplicate column names fails with `AnalysisException: Found duplicate column(s)`.
 * To avoid that, every column is renamed with a table-specific prefix
 * (`person_` / `dept_`) before the join.
 */
@Test
  def testJoinSave(): Unit = {
    val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    try {
      val sqlContext = new SQLContext(sc)

      // 1) Load both JSON files as DataFrames.
      val person = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
      val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")

      // 2) Alias every column with its table prefix so that no name is shared
      //    between the two sides of the join.
      val person_tmp = person.select(person.columns.map(n => person(n).as(s"person_$n")): _*)
      val dept_tmp = dept.select(dept.columns.map(n => dept(n).as(s"dept_$n")): _*)

      // 3) Outer-join on the prefixed deptno columns and save as JSON from a single partition.
      person_tmp
        .join(dept_tmp, person_tmp("person_deptno") === dept_tmp("dept_deptno"), "outer")
        .repartition(1)
        .write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")
    } finally {
      // Stop the context even when the job fails, so it does not stay active
      // in this JVM after the test returns.
      sc.stop()
    }
  }

 

你可能感兴趣的:(Spark,SparkSQL,大数据,Hadoop,Spark,Spark,Spark,SQL,报错,存储,Json)