import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("dfdemo")
  .master("local[*]")
  .getOrCreate()
val personList: List[Person] = List(
Person("1", "jack", 22),
Person("2", "radrek", 23),
Person("3", "memo", 26),
Person("3", "micorekeys", 31),
Person("4", "mike", 30),
Person("5", "miki", 21),
Person("6", "kitty", 22),
Person("7", "beam", 24),
Person("8", "jack", 22),
Person("9", "hancksom", 24),
Person("10", "herry", 32),
Person("11", "kily", 37),
Person("12", "tom", 34),
Person("13", "sam", 32),
Person("14", "lily", 27),
Person("15", "mary", 23),
Person("16", "john", 26),
Person("17", "jom", 24),
Person("18", "vico", 21)
)
val pRDD: RDD[Person] = spark.sparkContext.parallelize(personList)
import spark.implicits._
val pdf=pRDD.toDF()
pdf.show()
// pdf.show(numRows, truncate) // param 1: number of rows to print (default 20 when there are many rows); param 2: whether to truncate long column values (see the example after the output below)
Output:
+---+----------+---+
|id |name |age|
+---+----------+---+
|1 |jack |22 |
|2 |radrek |23 |
|3 |memo |26 |
|3 |micorekeys|31 |
|4 |mike |30 |
|5 |miki |21 |
|6 |kitty |22 |
|7 |beam |24 |
|8 |jack |22 |
|9 |hancksom |24 |
|10 |herry |32 |
|11 |kily |37 |
|12 |tom |34 |
|13 |sam |32 |
|14 |lily |27 |
|15 |mary |23 |
|16 |john |26 |
|17 |jom |24 |
|18 |vico |21 |
+---+----------+---+
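For example, to print only the first 5 rows without truncating long column values:
pdf.show(5, truncate = false)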
case class Person(id:String,name:String,age:Int)
toDF() // convert the RDD to a DataFrame
toDF(new column names)
// toDF("sid","username","age")
// renames the columns of the load above: pRDD.toDF("sid","username","age").show(false)
+---+----------+---+
|sid|username |age|
+---+----------+---+
|1 |jack |22 |
|2 |radrek |23 |
|3 |memo |26 |
|3 |micorekeys|31 |
|4 |mike |30 |
|5 |miki |21 |
|6 |kitty |22 |
|7 |beam |24 |
|8 |jack |22 |
|9 |hancksom |24 |
|10 |herry |32 |
|11 |kily |37 |
|12 |tom |34 |
|13 |sam |32 |
|14 |lily |27 |
|15 |mary |23 |
|16 |john |26 |
|17 |jom |24 |
|18 |vico |21 |
+---+----------+---+
pdf.createTempView("person")
pdf.createOrReplaceTempView("psn") // replaces the temp view if one with this name already exists, otherwise creates it; createTempView fails if the name is taken
A temp view like this is scoped to the SparkSession that created it, not the whole application.
To share a view across different SparkSessions in the same application, create a global temp view, as sketched below.
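A minimal sketch (the view name person_g is just an example): a global temp view lives in the reserved global_temp database and is visible to every SparkSession in the same application.
pdf.createGlobalTempView("person_g")
spark.sql("SELECT * FROM global_temp.person_g WHERE age > 30").show()
spark.newSession().sql("SELECT count(*) FROM global_temp.person_g").show() // visible even from a brand-new session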
pdf.select("id", "name").show() // choose the columns to return, like the SELECT list in SQL (pdf still has columns id/name/age; sid/username exist only on the renamed DataFrame)
pdf.where($"age" > 23).show(false) // restrict the rows, like a SQL WHERE clause
pdf.select("age").groupBy("age").count().sort($"count".desc).show()
Groups by age, counts the people in each group, and sorts the groups by count in descending order.
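Because the person temp view was registered above, the same aggregation can also be written in SQL (a sketch of the equivalent query):
spark.sql(
  """SELECT age, count(*) AS cnt
    |FROM person
    |GROUP BY age
    |ORDER BY cnt DESC""".stripMargin).show()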
Using Lombok. The @AllArgsConstructor annotation ran into problems generating the all-args constructor here, so that constructor is written by hand and the annotation is not used.
package com.mycat.spark;

import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

import java.io.Serializable;

@NoArgsConstructor
public class Person2 implements Serializable {
    @Getter
    @Setter
    private String id;

    @Getter
    @Setter
    private String name;

    @Getter
    @Setter
    private Integer age;

    public Person2(String id, String name, Integer age) {
        this.id = id;
        this.name = name;
        this.age = age;
    }
}
Using a utility class (JavaConversions)
import scala.collection.JavaConversions

val spark = SparkSession.builder().appName("dfdemo")
  .master("local[*]")
  .getOrCreate()
val listPerson = List(
  new Person2("12", "tom", 34),
  new Person2("13", "sam", 32),
  new Person2("14", "lily", 27),
  new Person2("15", "mary", 23),
  new Person2("16", "john", 26)
)
val jList = JavaConversions.seqAsJavaList(listPerson)
val df = spark.createDataFrame(jList, classOf[Person2])
df.show(false)
RDD approach
val spark = SparkSession.builder().appName("dfdemo")
  .master("local[*]")
  .getOrCreate()
val listPerson = List(
  new Person2("12", "tom", 34),
  new Person2("13", "sam", 32),
  new Person2("14", "lily", 27),
  new Person2("15", "mary", 23),
  new Person2("16", "john", 26)
)
val pRDD = spark.sparkContext.parallelize(listPerson)
val pDF = spark.createDataFrame(pRDD, classOf[Person2])
pDF.show(false)
case class Person(var id:String,var name:String,var age:Int)
Row
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import scala.collection.JavaConversions

val spark = SparkSession.builder().appName("dfdemo")
  .master("local[*]")
  .getOrCreate()
val personList: List[Person] = List(
Person("1", "jack", 22),
Person("2", "radrek", 23),
Person("3", "memo", 26),
Person("3", "micorekeys", 31),
Person("4", "mike", 30),
Person("5", "miki", 21),
Person("6", "kitty", 22),
Person("7", "beam", 24),
Person("8", "jack", 22),
Person("9", "hancksom", 24),
Person("10", "herry", 32),
Person("11", "kily", 37),
Person("12", "tom", 34),
Person("13", "sam", 32),
Person("14", "lily", 27),
Person("15", "mary", 23),
Person("16", "john", 26),
Person("17", "jom", 24),
Person("18", "vico", 21)
)
val rows=personList.map(person => {
Row(person.id,person.name,person.age)
})
val schema=StructType(List(
StructField("id",StringType,true),
StructField("name",StringType,true),
StructField("age",IntegerType,true)
))
val pDF=spark.createDataFrame(JavaConversions.seqAsJavaList(rows),schema)
pDF.show(false)
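Note: scala.collection.JavaConversions has been deprecated since Scala 2.12. A sketch of the same call with the explicit JavaConverters API instead (pDF2 is just an illustrative name):
import scala.collection.JavaConverters._
val pDF2 = spark.createDataFrame(rows.asJava, schema) // .asJava converts the Scala List[Row] to java.util.List[Row]
pDF2.show(false)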
RowRDD
val spark=SparkSession.builder().appName("dfdemo")
.master("local[*]")
.getOrCreate()
val personList: List[Person] = List(
Person("1", "jack", 22),
Person("2", "radrek", 23),
Person("3", "memo", 26),
Person("3", "micorekeys", 31),
Person("4", "mike", 30),
Person("5", "miki", 21),
Person("6", "kitty", 22),
Person("7", "beam", 24),
Person("8", "jack", 22),
Person("9", "hancksom", 24),
Person("10", "herry", 32),
Person("11", "kily", 37),
Person("12", "tom", 34),
Person("13", "sam", 32),
Person("14", "lily", 27),
Person("15", "mary", 23),
Person("16", "john", 26),
Person("17", "jom", 24),
Person("18", "vico", 21)
)
val rows=personList.map(person => {
Row(person.id,person.name,person.age)
})
val schema=StructType(List(
StructField("id",StringType,true),
StructField("name",StringType,true),
StructField("age",IntegerType,true)
))
val rowRDD=spark.sparkContext.parallelize(rows)
spark.createDataFrame(rowRDD,schema).show(false)
case class Person(var id:String,var name:String,var age:Int)
val spark=SparkSession.builder().appName("dfdemo")
.master("local[*]")
.getOrCreate()
val listPerson = List(
Person("12", "tom", 34),
Person("13", "sam", 32),
Person("14", "lily", 27),
Person("15", "mary", 23),
Person("16", "john", 26)
)
import spark.implicits._
val pdf = spark.createDataset(listPerson)
pdf.show(false)
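Unlike a DataFrame, this Dataset keeps the Person type, so transformations can take plain Scala functions; a small illustration (it relies on the spark.implicits._ import above for the String encoder):
pdf.filter(_.age > 25).map(_.name.toUpperCase).show(false) // _.age and _.name are checked at compile time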
val spark=SparkSession.builder().appName("dfdemo")
.master("local[*]")
.getOrCreate()
val listPerson = List(
Person("12", "tom", 34),
Person("13", "sam", 32),
Person("14", "lily", 27),
Person("15", "mary", 23),
Person("16", "john", 26)
)
import spark.implicits._
val frame: DataFrame = spark.createDataFrame(spark.sparkContext.parallelize(listPerson))
frame.show(false)
All of these conversions require the import:
import spark.implicits._
rdd.toDF()
rdd.toDS()
df.rdd
ds.rdd
ds.toDF()
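A minimal round-trip sketch tying these conversions together (reusing the personList defined earlier):
import spark.implicits._
val rdd = spark.sparkContext.parallelize(personList) // RDD[Person]
val df = rdd.toDF()                                  // RDD -> DataFrame
val ds = rdd.toDS()                                  // RDD -> Dataset[Person]
val rowRdd = df.rdd                                  // DataFrame -> RDD[Row]
val typedRdd = ds.rdd                                // Dataset -> RDD[Person]
val df2 = ds.toDF()                                  // Dataset -> DataFrame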