Spark支持两种方式将RDDs转换为SchemaRDDs:第一种方法是使用反射来推断包含特定对象类型的RDD的模式(schema);第二种方法是通过编程接口来实现,这个接口允许你先构造一个模式,然后将它应用到已存在的RDD上。
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.SQLContext

/**
 * Demonstrates the reflection-based way of giving an RDD a schema.
 *
 * Reads comma-separated person records (firstName,lastName,age) from the
 * path passed as the single command-line argument, converts them to a
 * DataFrame through the Person case class (the schema is inferred from the
 * case-class fields), registers the DataFrame as a temporary table, and
 * prints every person younger than 30.
 */
object UseCaseClass {

  // Field names and types of this case class become the DataFrame schema.
  case class Person(firstName: String, lastName: String, age: Int)

  def main(args: Array[String]): Unit = {
    if (args.length != 1) {
      System.err.println("Usage: <data path>")
      System.exit(1)
    }

    val conf = new SparkConf()
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    // Brings toDF() into scope for RDDs of case-class instances.
    import sqlContext.implicits._

    val data = sc.textFile(args(0))

    // Parse each CSV line into a Person instance.
    val personRDD = data
      .map(_.split(","))
      .map(fields => Person(fields(0), fields(1), fields(2).toInt))

    // Reflection-based conversion: RDD[Person] -> DataFrame.
    val personDF = personRDD.toDF()

    // Expose the DataFrame to SQL under the table name "person".
    personDF.registerTempTable("person")

    val people = sqlContext.sql("SELECT * FROM person WHERE age < 30")
    people.collect().foreach(println)

    sc.stop()
  }
}
StructType(fields: Array[StructField])

StructField(name: String, dataType: DataType, nullable: Boolean = true, metadata: Metadata = Metadata.empty)
可用的基本数据类型包括:IntegerType、FloatType、BooleanType、ShortType、LongType、ByteType、DoubleType、StringType。
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructType, StructField}

/**
 * Demonstrates the programmatic way of giving an RDD a schema.
 *
 * Reads comma-separated person records (firstName,lastName,age) from the
 * path passed as the single command-line argument, converts each line into
 * a Row, builds a StructType schema by hand, applies it with
 * createDataFrame, registers the result as a temporary table, and prints
 * every person younger than 30.
 */
object SpecifySchema {

  def main(args: Array[String]): Unit = {
    if (args.length != 1) {
      System.err.println("Usage: <data path>")
      System.exit(1)
    }

    val conf = new SparkConf()
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    val data = sc.textFile(args(0))

    // RDD[Array[String]] -> RDD[Row]; a Row carries no schema by itself.
    val personRow = data
      .map(_.split(","))
      .map(cols => Row(cols(0), cols(1), cols(2).toInt))

    // Build the schema explicitly. Each StructField takes
    // (field name, data type, nullability).
    val schema = StructType(
      Array(
        StructField("firstName", StringType, true),
        StructField("lastName", StringType, true),
        StructField("age", IntegerType, true)
      )
    )

    // Pair the rows with the hand-built schema to obtain a DataFrame.
    val personDF = sqlContext.createDataFrame(personRow, schema)

    // Expose the DataFrame to SQL under the table name "person".
    personDF.registerTempTable("person")

    val people = sqlContext.sql("SELECT * FROM person WHERE age < 30")
    people.collect().foreach(println)

    sc.stop()
  }
}