The practice data comes from the MovieLens movie recommendation dataset, which was collected over different time periods; source files of various sizes can be downloaded according to the needs of the movie analysis task. Download address: https://grouplens.org/datasets/movielens/ .
This approach is simple, but since no field names are specified during schema inference the columns fall back to the default names _1 through _5, so it is mainly suitable for practice. The code snippet is as follows:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
object CreateDataframe {
  def main(args: Array[String]): Unit = {
    val dataPath = "D:\\testdData\\ml-1m\\"
    val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("movieTop10")
    // SparkSession wraps SparkContext and SQLContext; the builder's getOrCreate reuses a matching
    // existing SparkSession if one exists, otherwise it creates a new one
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    // Get the SparkContext from the SparkSession
    val sc: SparkContext = spark.sparkContext
    // Read the data file; each line has the form UserID::Gender::Age::Occupationid::Zip-code
    val usersRDD: RDD[(String, String, String, String, String)] =
      sc.textFile(dataPath + "users.dat").map(_.split("::")).map(x => (x(0), x(1), x(2), x(3), x(4)))
    // Method 1: infer the RDD's schema by reflection -- simple, but not recommended
    import spark.implicits._
    val inferDF: DataFrame = usersRDD
      .map(x => (x._1.trim, x._2.trim, x._3.trim.toDouble.toInt, x._4.trim, x._5.trim))
      .toDF()
    println("Schema inferred by reflection, result:")
    inferDF.show()
    inferDF.printSchema()
  }
}
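If readable column names are still wanted with the reflection approach, toDF also accepts explicit names. A minimal sketch, assuming the same usersRDD and spark.implicits._ import as above (the column names simply mirror the schema used in Method 2 below):

    // Illustrative variation: pass explicit column names to toDF so the columns
    // are not left with the default names _1 through _5
    val namedDF: DataFrame = usersRDD
      .map(x => (x._1.trim, x._2.trim, x._3.trim.toDouble.toInt, x._4.trim, x._5.trim))
      .toDF("userid", "Gender", "Age", "Occupationid", "zipCode")
    namedDF.printSchema()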
This method specifies the schema explicitly and converts the RDD into rows of the declared types. The code snippet is as follows:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
object CreateDataframe {
  def main(args: Array[String]): Unit = {
    val dataPath = "D:\\testdData\\ml-1m\\"
    val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("movieTop10")
    // SparkSession wraps SparkContext and SQLContext; the builder's getOrCreate reuses a matching
    // existing SparkSession if one exists, otherwise it creates a new one
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    // Get the SparkContext from the SparkSession
    val sc: SparkContext = spark.sparkContext
    // Read the data file; each line has the form UserID::Gender::Age::Occupationid::Zip-code
    val usersRDD: RDD[(String, String, String, String, String)] =
      sc.textFile(dataPath + "users.dat").map(_.split("::")).map(x => (x(0), x(1), x(2), x(3), x(4)))
    // Method 2: create the DataFrame from an explicit StructType
    // Define the schema
    val structSchema: StructType = StructType(
      List(
        StructField("userid", StringType, true),
        StructField("Gender", StringType, true),
        StructField("Age", IntegerType, true),
        StructField("Occupationid", StringType, true),
        StructField("zipCode", StringType, true)
      )
    )
    // Map the RDD to an RDD[Row], converting each value to the type declared in the schema
    val structRow: RDD[Row] = usersRDD.map(line =>
      Row(line._1.trim, line._2.trim, line._3.trim.toDouble.toInt, line._4.trim, line._5.trim))
    // Create the DataFrame
    val structDf: DataFrame = spark.createDataFrame(structRow, structSchema)
    println("DataFrame created with a StructType, result:")
    structDf.show()
    structDf.printSchema()
  }
}
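Because the StructType gives every column a usable name, the resulting DataFrame can be queried directly. A minimal sketch, assuming the structDf built above is in scope (the view name and the query are purely illustrative):

    // Illustrative only: register the DataFrame as a temporary view and query it by column name
    structDf.createOrReplaceTempView("users")
    val avgAgeByGender: DataFrame = spark.sql("SELECT Gender, AVG(Age) AS avgAge FROM users GROUP BY Gender")
    avgAgeByGender.show()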
This method defines a case class matching the table structure, formats the data into instances of that class, and builds the DataFrame from them; the result is the same as Method 2. The code snippet is as follows:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
object CreateDataframe {
  def main(args: Array[String]): Unit = {
    val dataPath = "D:\\testdData\\ml-1m\\"
    val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("movieTop10")
    // SparkSession wraps SparkContext and SQLContext; the builder's getOrCreate reuses a matching
    // existing SparkSession if one exists, otherwise it creates a new one
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    // Get the SparkContext from the SparkSession
    val sc: SparkContext = spark.sparkContext
    // Read the data file; each line has the form UserID::Gender::Age::Occupationid::Zip-code
    val usersRDD: RDD[(String, String, String, String, String)] =
      sc.textFile(dataPath + "users.dat").map(_.split("::")).map(x => (x(0), x(1), x(2), x(3), x(4)))
    // Method 3: format the data with a case class and build the DataFrame from it
    // Map each tuple to an instance of the users case class, converting Age to Int
    val usersRow: RDD[users] = usersRDD.map(x =>
      users(x._1.trim, x._2.trim, x._3.trim.toDouble.toInt, x._4.trim, x._5.trim))
    // Create the DataFrame
    val classDF: DataFrame = spark.createDataFrame(usersRow)
    println("DataFrame created from a case class, result:")
    classDF.show()
    classDF.printSchema()
  }
}

// The case class defines the DataFrame's schema
case class users(UserID: String, Gender: String, Age: Int, Occupationid: String, zipCode: String)
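Since the case class already describes the row type, the same RDD can also be turned into a strongly typed Dataset instead of a DataFrame. A minimal sketch, assuming import spark.implicits._ has been brought into scope inside main as in Method 1:

    // Typed alternative to createDataFrame (spark.implicits._ supplies the encoder for users);
    // classDF.as[users] would produce an equivalent Dataset[users]
    val usersDS = usersRow.toDS()
    usersDS.printSchema()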