Reading CSV and JSON files with Spark

Reading a CSV file with Spark

package nj.zb.kb11

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

object ReadCsvDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkreadcsv")
    val sc = SparkContext.getOrCreate(conf)
    // Read the CSV file as plain text lines with the SparkContext API
    val lines = sc.textFile("E:\\ideaProjects\\sparkstu\\in\\users.csv")

    // Drop the header row (the line starting with "user_id") and split each line on commas
    val lines1: RDD[Array[String]] = lines.filter(x => !x.startsWith("user_id")).map(x => x.split(","))

    // Print the parsed rows
    // lines1.collect().foreach(x => println(x.toList))



    // Alternative way to drop the header: skip the first line of partition 0
//    val lines2 = lines.mapPartitionsWithIndex((index, value) => {
//      if (index == 0)
//        value.drop(1)
//      else
//        value
//    })
//    val lines3: RDD[Array[String]] = lines2.map(x => x.split(","))
//
//    for (x <- lines3) {
//      println(x.toList)
//    }


    // Everything above uses the SparkContext API; below is the SparkSession / DataFrame API

    val spark: SparkSession = SparkSession.builder().appName("ReadCsvSparkSession").master("local[*]").getOrCreate()
    // Read the CSV with the DataFrame reader, treating the first line as the header
    val df: DataFrame = spark.read.format("csv").option("header", true).load("in/users.csv")
    df.printSchema()
    df.show(10)
    val df2: DataFrame = df.select("user_id", "birthyear")
    // Cast the birthyear column from string to double (values that fail to parse become null)
    val df3: DataFrame = df2.withColumn("birthyear", df2("birthyear").cast(DoubleType))

    // Guard against nulls produced by the cast before calling getDouble
    df3.filter(x => !x.isNullAt(1) && x.getDouble(1) <= 1995).show(10)
  }
}
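
Instead of casting columns by hand, the CSV reader can also infer column types with the standard inferSchema option. The sketch below is a minimal variant under my own assumptions: the object name ReadCsvInferSchemaDemo is hypothetical, the same in/users.csv file is assumed, and the inferred type of birthyear depends on the actual data (it may stay a string if the column contains non-numeric values).

package nj.zb.kb11

import org.apache.spark.sql.{DataFrame, SparkSession}

object ReadCsvInferSchemaDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName("ReadCsvInferSchema").master("local[*]").getOrCreate()

    // inferSchema asks Spark to scan the data and guess each column's type,
    // so the manual cast to DoubleType shown above may not be needed
    val df: DataFrame = spark.read
      .option("header", true)
      .option("inferSchema", true)
      .csv("in/users.csv")

    df.printSchema()
    df.select("user_id", "birthyear").show(10)

    spark.stop()
  }
}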

Reading a JSON file with Spark

package nj.zb.kb11

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, DataFrameReader, SparkSession}

object ReadJsonDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkreadjson")
    val sc = SparkContext.getOrCreate(conf)
    val spark: SparkSession = SparkSession.builder().appName("ReadJsonSparkSession").master("local[*]").getOrCreate()


    // SparkContext approach: parse each line with scala.util.parsing.json
//    val lines: RDD[String] = sc.textFile("in/user.json")
//    import scala.util.parsing.json.JSON
//    val rdd: RDD[Option[Any]] = lines.map(x => JSON.parseFull(x))
//    rdd.collect().foreach(println)

    // SparkSession / DataFrame approach: JSON has no header row, so no header option is needed
    val frame: DataFrame = spark.read.format("json").load("in/user.json")
    frame.printSchema()

  }
}
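
The DataFrame reader above expects one JSON object per line. If in/user.json were instead a single pretty-printed object or a JSON array, the standard multiLine option (Spark 2.2+) handles it. The following is a minimal sketch under that assumption; the object name ReadJsonMultiLineDemo and the file layout are assumptions, not part of the original example.

package nj.zb.kb11

import org.apache.spark.sql.{DataFrame, SparkSession}

object ReadJsonMultiLineDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName("ReadJsonMultiLine").master("local[*]").getOrCreate()

    // multiLine lets the JSON reader parse a file containing one pretty-printed
    // object or a JSON array, instead of the default one-object-per-line format
    val frame: DataFrame = spark.read.option("multiLine", true).json("in/user.json")
    frame.printSchema()
    frame.show(10)

    spark.stop()
  }
}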
