1. Why convert an RDD into a DataFrame?
So that Spark SQL queries can be run directly against any data that can be built into an RDD (data on HDFS, for example).
2. Spark SQL supports the following ways to convert an RDD into a DataFrame:
1> by reflection;
2> through the programmatic interface.
Method one: use the createDataFrame method, passing an RDD of Rows together with an explicitly built schema:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val schema = StructType(
  Seq(
    StructField("name", StringType, true),
    StructField("age", IntegerType, true)
  )
)
val rowRDD = sparkSession.sparkContext.textFile("/temp/person.txt", 2)
  .map(x => x.split(",")).map(x => Row(x(0), x(1).trim().toInt))
val personDF = sparkSession.createDataFrame(rowRDD, schema)
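For completeness, here is a minimal self-contained sketch of the same approach; the SparkSession setup, the app name, and a person.txt whose lines look like "Alice,29" are assumptions for illustration:

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object CreateDataFrameDemo {
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder()
      .master("local")
      .appName("CreateDataFrameDemo")
      .getOrCreate()

    // Schema matching lines such as "Alice,29"
    val schema = StructType(Seq(
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)
    ))

    val rowRDD = sparkSession.sparkContext.textFile("/temp/person.txt", 2)
      .map(_.split(","))
      .map(x => Row(x(0), x(1).trim.toInt))

    val personDF = sparkSession.createDataFrame(rowRDD, schema)
    personDF.printSchema() // verify the dynamically built schema
    sparkSession.stop()
  }
}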
Example (Java):
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

class RDDToDataFrame {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("RDDToDataFrame");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        // Build the source RDD, then map it to an RDD of Row
        JavaRDD<String> lines = sc.textFile("./person.txt");
        JavaRDD<Row> rowRDD = lines.map(new Function<String, Row>() {
            public Row call(String line) throws Exception {
                String[] lineSplited = line.split(",");
                // Pack each line's fields into a Row
                return RowFactory.create(Integer.valueOf(lineSplited[0]), lineSplited[1],
                        Integer.valueOf(lineSplited[2]));
            }
        });
        // Dynamically build the schema (metadata)
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("id", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);
        // Use the dynamically built schema to turn the RDD into a DataFrame
        Dataset<Row> personDF = sqlContext.createDataFrame(rowRDD, structType);
        personDF.registerTempTable("person");
        Dataset<Row> teenagerDF = sqlContext.sql("select * from person where age<=18");
        List<Row> teenagerRDD = teenagerDF.javaRDD().collect();
        for (Row row : teenagerRDD) {
            System.out.println(row);
        }
    }
}
Scala version:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object RDD2DataFrame {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrame")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // First: build an ordinary RDD whose elements are Rows
    val personRDD = sc.textFile("./person.txt")
      .map(_.split(","))
      .map(arr => Row(arr(0).toInt, arr(1), arr(2).toInt))
    // Second: build the schema (metadata) programmatically
    val structType = StructType(Array(
      StructField("id", IntegerType, true), StructField("name", StringType, true),
      StructField("age", IntegerType, true)))
    // Third: convert the RDD into a DataFrame
    val personDF = sqlContext.createDataFrame(personRDD, structType)
    personDF.registerTempTable("person")
    val teenagerDF = sqlContext.sql("select * from person where age<=18")
    teenagerDF.rdd.collect().foreach(row => println(row))
  }
}
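Note that SQLContext and registerTempTable belong to the Spark 1.x API; since Spark 2.0 they are deprecated in favor of SparkSession and createOrReplaceTempView. A minimal sketch of the same programmatic conversion against the Spark 2.x API (file path and contents assumed as above):

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object RDD2DataFrameSpark2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("RDD2DataFrameSpark2").getOrCreate()

    val personRDD = spark.sparkContext.textFile("./person.txt")
      .map(_.split(","))
      .map(arr => Row(arr(0).toInt, arr(1), arr(2).toInt))

    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))

    val personDF = spark.createDataFrame(personRDD, structType)
    // createOrReplaceTempView replaces the deprecated registerTempTable
    personDF.createOrReplaceTempView("person")
    spark.sql("select * from person where age<=18").collect().foreach(println)
    spark.stop()
  }
}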
Method two: use reflection to infer the schema of an RDD whose elements carry a specific data type.
Java version:
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

class RDD2DataFrame {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RDD2DataFrame").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);
        JavaRDD<String> lines = sc.textFile("./person.txt");
        JavaRDD<Person> persons = lines.map(new Function<String, Person>() {
            public Person call(String line) throws Exception {
                String[] lineSplited = line.split(",");
                Person per = new Person();
                per.setAge(Integer.valueOf(lineSplited[0]));
                per.setId(Integer.valueOf(lineSplited[1]));
                per.setName(lineSplited[2]);
                return per;
            }
        });
        // Convert the RDD into a DataFrame via reflection on the Person bean
        Dataset<Row> personDF = sqlContext.createDataFrame(persons, Person.class);
        // Register a temporary table
        personDF.registerTempTable("Person");
        // Run a SQL statement against the temporary table
        Dataset<Row> teenagerDF = sqlContext.sql("select * from Person where age<=18");
        // Convert the queried DataFrame back into an RDD
        JavaRDD<Row> teenagerRowRDD = teenagerDF.javaRDD();
        JavaRDD<Person> teenagerRDD = teenagerRowRDD.map(new Function<Row, Person>() {
            public Person call(Row row) throws Exception {
                // With JavaBean reflection the DataFrame columns are ordered
                // alphabetically (age, id, name), which is why these indexes line up
                Person per = new Person();
                per.setAge(row.getInt(0));
                per.setId(row.getInt(1));
                per.setName(row.getString(2));
                return per;
            }
        });
        // Collect the data back to the driver and print it
        List<Person> personList = teenagerRDD.collect();
        for (Person per : personList) {
            System.out.println(per);
        }
    }
}
The Person entity:
class Person implements Serializable {
    private int id;
    private int age;
    private String name;
    // ...... getters and setters omitted
}
Scala version:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// The case class must be defined at the top level (not inside main);
// otherwise Spark cannot infer its schema via reflection
case class Person(id: Int, name: String, age: Int)

object RDD2DataFrame {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrame")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // In Scala the implicit conversions must be imported by hand
    import sqlContext.implicits._
    // An RDD of case-class elements can be turned into a DataFrame directly with toDF
    val personDF = sc.textFile("./person.txt", 1)
      .map(line => line.split(","))
      .map(arr => Person(arr(0).trim().toInt, arr(1), arr(2).trim().toInt))
      .toDF()
    // Register a temporary table
    personDF.registerTempTable("person")
    val teenagerDF = sqlContext.sql("select * from person where age<=18")
    teenagerDF.rdd
      .map(row => Person(row(0).toString().toInt, row(1).toString, row(2).toString.toInt))
      .collect()
      .foreach(per => println(per.id + ":" + per.name + ":" + per.age))
  }
}
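For completeness, a minimal sketch of the reflection approach against the Spark 2.x API, where the implicits come from the SparkSession and as[Person] (an Encoder-based conversion) maps the query result back to typed objects; the id,name,age file layout is assumed as in the Scala example above:

import org.apache.spark.sql.SparkSession

// Top-level case class so Spark can derive its schema via reflection
case class Person(id: Int, name: String, age: Int)

object RDD2DataFrameReflectionSpark2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("RDD2DataFrameReflection").getOrCreate()
    import spark.implicits._

    val personDF = spark.sparkContext.textFile("./person.txt")
      .map(_.split(","))
      .map(arr => Person(arr(0).trim.toInt, arr(1), arr(2).trim.toInt))
      .toDF()

    personDF.createOrReplaceTempView("person")
    // as[Person] turns the untyped Dataset[Row] back into a typed Dataset[Person]
    spark.sql("select * from person where age<=18")
      .as[Person]
      .collect()
      .foreach(per => println(per.id + ":" + per.name + ":" + per.age))

    spark.stop()
  }
}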