Converting an RDD to a DataFrame in Spark SQL

1. Why convert an RDD to a DataFrame?

So that Spark SQL queries can be run directly against any data that can be built into an RDD, such as files on HDFS.

2. Spark SQL supports two ways to convert an RDD to a DataFrame:

1) reflection, which infers the schema from the RDD's element type;

2) a programmatic interface, which builds the schema explicitly;

Method 1: use the createDataFrame method (the programmatic interface):

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val sparkSession = SparkSession.builder()
  .master("local").appName("RDDToDataFrame").getOrCreate()

// Build the schema explicitly from StructFields
val schema = StructType(
  Seq(
    StructField("name", StringType, true),
    StructField("age", IntegerType, true)
  )
)

// Build an RDD[Row] from the raw text file (one name,age pair per line)
val rowRDD = sparkSession.sparkContext.textFile("/temp/person.txt", 2)
  .map(x => x.split(","))
  .map(x => Row(x(0), x(1).trim.toInt))

val personDF = sparkSession.createDataFrame(rowRDD, schema)
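To query the result, register it as a temporary view; a minimal sketch continuing from the snippet above:

personDF.createOrReplaceTempView("person")
sparkSession.sql("select name, age from person where age <= 18").show()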

Example (Java):

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

class RDDToDataFrame {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("RDDToDataFrame");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        // Build the raw RDD, then map it to an RDD of Row
        JavaRDD<String> lines = sc.textFile("./person.txt");
        JavaRDD<Row> rows = lines.map(new Function<String, Row>() {
            public Row call(String line) throws Exception {
                String[] lineSplited = line.split(",");
                // Wrap each record in a Row
                return RowFactory.create(Integer.valueOf(lineSplited[0]),
                        lineSplited[1],
                        Integer.valueOf(lineSplited[2]));
            }
        });

        // Build the metadata (schema) dynamically
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("id", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);

        // Convert the RDD to a DataFrame using the dynamically built schema
        Dataset<Row> personDF = sqlContext.createDataFrame(rows, structType);
        personDF.registerTempTable("person");

        Dataset<Row> teenagerDF = sqlContext.sql("select * from person where age<=18");
        List<Row> teenagerRows = teenagerDF.javaRDD().collect();
        for (Row row : teenagerRows) {
            System.out.println(row);
        }
    }
}
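The original does not show the contents of person.txt; for the (id,name,age) schema above, a hypothetical input file would look like:

1,Tom,15
2,Jack,28
3,Mary,18

With that input, the query keeps the rows with age <= 18 and prints something like [1,Tom,15] and [3,Mary,18].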

Scala version:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object RDD2DataFrame {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrame")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Step 1: build an ordinary RDD whose elements are Rows
    val personRDD = sc.textFile("./person.txt")
      .map(line => Row(line.split(",")(0).toInt, line.split(",")(1), line.split(",")(2).toInt))

    // Step 2: build the metadata (schema) programmatically
    val structType = StructType(Array(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))

    // Step 3: convert the RDD to a DataFrame
    val personDF = sqlContext.createDataFrame(personRDD, structType)
    personDF.registerTempTable("person")

    val teenagerDF = sqlContext.sql("select * from person where age<=18")
    teenagerDF.rdd.collect().foreach(row => println(row))
  }
}
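The programmatic route pays off when the columns are only known at run time; a minimal sketch, assuming (hypothetically) the column names arrive as a comma-separated string and every column is treated as a string:

val columns = "id,name,age" // hypothetical runtime input
val dynamicSchema = StructType(columns.split(",").map(name => StructField(name, StringType, true)))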

Method 2: use reflection to infer the metadata of an RDD containing elements of a specific type.

Java version:

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

class RDD2DataFrame {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RDD2DataFrame").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        JavaRDD<String> lines = sc.textFile("./person.txt");
        JavaRDD<Person> persons = lines.map(new Function<String, Person>() {
            public Person call(String line) throws Exception {
                String[] lineSplited = line.split(",");
                Person per = new Person();
                per.setAge(Integer.valueOf(lineSplited[0]));
                per.setId(Integer.valueOf(lineSplited[1]));
                per.setName(lineSplited[2]);
                return per;
            }
        });

        // Convert the RDD to a DataFrame by reflecting on the Person bean
        Dataset<Row> personDF = sqlContext.createDataFrame(persons, Person.class);
        // Register a temporary table
        personDF.registerTempTable("person");
        // Run SQL against the temporary table
        Dataset<Row> teenagerDF = sqlContext.sql("select * from person where age<=19");

        // Convert the resulting DataFrame back into an RDD
        JavaRDD<Row> teenagerRDD = teenagerDF.javaRDD();
        JavaRDD<Person> teenagerPersonRDD = teenagerRDD.map(new Function<Row, Person>() {
            public Person call(Row row) throws Exception {
                // Bean-derived columns come back in alphabetical order: age, id, name
                Person per = new Person();
                per.setAge(row.getInt(0));
                per.setId(row.getInt(1));
                per.setName(row.getString(2));
                return per;
            }
        });

        // Collect the data back to the driver and print it
        List<Person> personList = teenagerPersonRDD.collect();
        for (Person per : personList) {
            System.out.println(per);
        }
    }
}

The Person entity:

import java.io.Serializable;

public class Person implements Serializable {
    private int id;
    private int age;
    private String name;
    // ...... getters, setters, and a toString() override so println shows the fields
}

Scala version:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// The case class must live outside the method, or toDF cannot find a TypeTag for it
case class Person(id: Int, name: String, age: Int)

object RDD2DataFrame {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("RDD2DataFrame")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Scala needs this implicit conversion imported by hand
    import sqlContext.implicits._

    // An RDD of case-class elements converts to a DataFrame directly via toDF
    val personDF = sc.textFile("./person.txt", 1)
      .map(line => line.split(","))
      .map(arr => Person(arr(0).trim.toInt, arr(1), arr(2).trim.toInt))
      .toDF()

    // Register a temporary table
    personDF.registerTempTable("person")

    val teenagerDF = sqlContext.sql("select * from person where age<=18")

    // Turn the query result back into an RDD of Person and print it
    val teenagerRDD = teenagerDF.rdd
    teenagerRDD.map(row => Person(row(0).toString.toInt, row(1).toString, row(2).toString.toInt))
      .collect()
      .foreach(per => println(per.id + ":" + per.name + ":" + per.age))
  }
}
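Row fields can also be read by name instead of by position, which keeps the conversion working even if the column order changes; a small variation on the last step above:

teenagerDF.rdd
  .map(row => Person(row.getAs[Int]("id"), row.getAs[String]("name"), row.getAs[Int]("age")))
  .collect()
  .foreach(per => println(per.id + ":" + per.name + ":" + per.age))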
