Processing XML and XLSX/XLS Files with Spark

There are many ways to do this; here is a relatively simple one.
// requires spark-xml_2.10-0.4.0.jar on the classpath
package com.beagledata.spark

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
  * XML data processing (SQLContext)
  *
  * Created by drguo on 2017/8/18.
  * blog.csdn.net/dr_guo
  */

object PCSDataSQLProcess {

  val conf = new SparkConf().setAppName("pcsdata-sql")
    .set("spark.jars.packages", "io.netty:netty-common:4.1.8.Final")
    .set("spark.jars.exclude", "io.netty:netty-common")
    //.setMaster("local")

  val sc = new SparkContext(conf)

  val sqlContext = new SQLContext(sc)

  def main(args: Array[String]): Unit = {

    val df = sqlContext.read
      .format("com.databricks.spark.xml")
      .option("rowTag", "ROW")
      .load("/data1/Data/sinopec/sourceFile/xx.xml")
      //.load("src/main/resources/pcsTestData.xml")
    val pcsdf = df.select("ZDJG_V", "BGT", "SXZDDL", "XXZDDL").na.drop() // drop a row if any of its fields is missing
    // on Spark 1.x, DataFrame.map returns an RDD[String]; for BGT keep only the part before the first "|" and "$"
    val pcsrdd = pcsdf //.rdd.filter(_.length == 4)
      .map(x => x(0).toString + ";" + x(1).toString.split("\\|")(0).split("\\$")(0) + ";" + x(2).toString + ";" + x(3).toString)
      //.map(_.split(";")).filter(_.length>1).map(x => x(0)+";"+x(1)+";"+x(2)+";"+x(3))
    //pcsrdd.foreach(println)
    pcsrdd.repartition(40).saveAsTextFile("/data1/Data/sinopec/xxData")

  }
}
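
For context, the rowTag option tells spark-xml which repeated element becomes one DataFrame row. The code above therefore assumes input shaped roughly like the following sketch (the field names are taken from the select above; the root tag and values are hypothetical):

<ROWS>
  <ROW>
    <ZDJG_V>...</ZDJG_V>
    <BGT>...|...</BGT>
    <SXZDDL>...</SXZDDL>
    <XXZDDL>...</XXZDDL>
  </ROW>
  <!-- more ROW elements -->
</ROWS>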

Reference: https://github.com/databricks/spark-xml
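
Instead of placing the jar on the classpath by hand, the dependency can also be pulled in at submit time. A minimal sketch, assuming the Maven coordinates from the comment above (the application jar name is a placeholder):

spark-submit \
  --packages com.databricks:spark-xml_2.10:0.4.0 \
  --class com.beagledata.spark.PCSDataSQLProcess \
  your-app.jar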

For XLSX/XLS files, Apache POI's WorkbookFactory reads both the 2003 (.xls) and 2007+ (.xlsx) formats. The method below needs the following imports in scope:

import java.io.FileInputStream
import java.text.SimpleDateFormat

import org.apache.poi.ss.usermodel.{Cell, WorkbookFactory}

import scala.collection.mutable.ListBuffer

  /**
    * Read an Excel file; compatible with Excel 2003/2007/2010
    * @param path path to the .xls/.xlsx file
    * @return ListBuffer of (wellname \t date, DNAME0) tuples
    */
  def readXlsx(path: String): ListBuffer[(String, String)] = {
    val fmt = new SimpleDateFormat("yy-M-d")
    val is = new FileInputStream(path)
    val workbook = WorkbookFactory.create(is)
    val sheet = workbook.getSheetAt(0) // get the first sheet
    val rowCount = sheet.getPhysicalNumberOfRows() // total number of rows

    val lb = new ListBuffer[(String, String)]()

    for (i <- 1 until rowCount) { // start at 1 to skip the header row
      val row = sheet.getRow(i)

      // cells used from the current row: well name (col 0), date (col 1), DNAME0 (col 3)
      val cellwellname: Cell = row.getCell(0)
      val cellDCDATE = row.getCell(1)
      val cellDNAME0 = row.getCell(3)
      //print("celltype----" + cellwellname.getCellType + " ")
      //print("cellstyle----" + cellwellname.getCellStyle + " ")
      // the same column may contain both numeric and string cells
      var wellname = ""
      if (cellwellname.getCellType == Cell.CELL_TYPE_NUMERIC) {
        wellname = cellwellname.getNumericCellValue.toString
      } else { // Cell.CELL_TYPE_STRING
        wellname = cellwellname.getRichStringCellValue.getString.trim
      }
      val DCDATE = fmt.format(cellDCDATE.getDateCellValue)
      val DNAME0 = cellDNAME0.getStringCellValue
      lb += ((wellname + "\t" + DCDATE, DNAME0))
    }
    is.close()
    lb
  }
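
To push the Excel rows through Spark like the XML data above, the returned ListBuffer can be parallelized into an RDD. A minimal sketch, assuming a SparkContext sc like the one defined earlier; the input and output paths are hypothetical:

    val rows = readXlsx("/data1/Data/sinopec/sourceFile/xx.xlsx") // hypothetical path
    sc.parallelize(rows)
      .map { case (key, dname0) => key + "\t" + dname0 }
      .saveAsTextFile("/data1/Data/sinopec/xlsxData") // hypothetical output dir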
