Spark SQL Data Sources: Reading Parquet, JSON, and CSV Examples

1. Reading a Parquet data source

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by Administrator on 2017/2/3.
  */
object ParquetLoadData {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ParquetLoadData")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // load the Parquet file from HDFS into a DataFrame
    val usersDF = sqlContext.read.parquet("hdfs://master:9000/student/2016113012/spark/users.parquet")
    usersDF.registerTempTable("t_users")
    // query the name column
    val usersNameDF = sqlContext.sql("select name from t_users")
    // convert to an RDD and print each name
    usersNameDF.rdd.map(row => "Name:" + row(0)).collect().foreach(println)

  }

}
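
The DataFrameReader also has a generic form where the format is named explicitly (Parquet is its default), and a result DataFrame can be written back out as Parquet. A minimal sketch that would sit at the end of main above; the output path is an assumption:

    // equivalent generic read: format("parquet") + load
    val usersDF2 = sqlContext.read.format("parquet")
      .load("hdfs://master:9000/student/2016113012/spark/users.parquet")

    // write the selected names back to HDFS as Parquet (output path is an assumption)
    usersNameDF.write.parquet("hdfs://master:9000/student/2016113012/spark/users_name.parquet")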

2. Reading a JSON data source

import org.apache.spark.sql.{DataFrame, Dataset, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

// case classes describing the JSON records
case class Person(name: String, age: Long)
case class PersonScore(n: String, score: Long)

object DataFrame_Chapter_13 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("chapter_13").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import org.apache.spark.sql.functions._
    import sqlContext.implicits._

    val persons: DataFrame = sqlContext.read.json("src/people.json")
    val personsDS: Dataset[Person] = persons.as[Person]
    personsDS.show()
    // output:
    //+---+-------+
    //|age|   name|
    //+---+-------+
    //| 16|Michael|
    //| 16|Michael|
    //| 30|   Andy|
    //| 19| Justin|
    //| 29| Justin|
    //| 46|Michael|
    //+---+-------+
    }
}
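
Once the DataFrame is cast to a typed Dataset, records can be handled as Person objects directly instead of going through SQL. A minimal sketch of a few such operations (these lines would sit inside main above; the age threshold of 18 is arbitrary):

    // typed filter/map over Person records; needs sqlContext.implicits._ for the encoders
    personsDS.filter(person => person.age > 18)
      .map(person => person.name)
      .show()

    // untyped aggregation on the underlying DataFrame: count people per age
    persons.groupBy("age").count().show()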

3. Reading a CSV data source

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

// Goal: build a Dataset/DataFrame from an RDD + case class, then count how many male and
// female viewers of each age group a given movie has (see the sketch after this example)

case class Rating(userId: String, movieId: String, rating: Double, timestamp: String)
case class User(userId: String, age: String, gender: String, job: String)

object DataFrame_12_6 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("user_action_analysis_12_1_3").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // parse ratings.csv into an RDD of Rating objects
    val ratingRdd = sc.textFile("src/ratings.csv")
      .map(_.split(","))
      .map(line => Rating(line(0).trim, line(1), line(2).toDouble, line(3)))

    import sqlContext.implicits._
    val ratingDF = ratingRdd.toDF().cache()
    }
}
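
The stated goal (male/female counts per age group for one movie) also needs the users file, which this snippet does not load. A minimal sketch of the remaining steps, assuming a src/users.csv laid out like the User case class and a hypothetical movieId "242"; these lines would follow the ratingDF line inside main:

    // build a users DataFrame the same way (path is an assumption)
    val userDF = sc.textFile("src/users.csv")
      .map(_.split(","))
      .map(line => User(line(0).trim, line(1), line(2), line(3)))
      .toDF()

    // join ratings with users for one movie, then count viewers per gender and age
    ratingDF.filter($"movieId" === "242")
      .join(userDF, "userId")
      .groupBy("gender", "age")
      .count()
      .show()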

 
