Spark SQL Example

A first try at Spark SQL: load tab-separated transfer logs from HDFS, map them onto a case class, register the result as a table, and query it with SQL.

// data from 2014-09-12
val transfer = sc.textFile("hdfs://LDKJSERVER1046:8020/user/flume/transfer20/year=2014/month=09/day=12/*.tsv")
// keep only well-formed lines (exactly 33 tab-separated fields) and project the columns we need;
// flatMap with Option avoids the untyped RDD[Any] that an if-without-else plus asInstanceOf would produce
val structuredTransfer20RDD = transfer.flatMap(line => {
                                            val trunks = line.split("\t")
                                            if (trunks.length == 33)
                                                Some((trunks(6), trunks(8), trunks(9), trunks(14), trunks(12), trunks(3)))
                                            else
                                                None
                                            })
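
A quick sanity check of the parsing step can save debugging later; a minimal sketch (the printed tuples depend on your actual data):

structuredTransfer20RDD.take(3).foreach(println)  // inspect a few parsed 6-field tuples
println(structuredTransfer20RDD.count() + " well-formed lines")  // compare with transfer.count() for the raw total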
// case class that defines the table schema; note the deliberately short field names "f", "t", etc.
// field names must not collide with SQL keywords (e.g. "from", "to"), and upper-case names appear to be rejected
case class ZapyaTransfer(f: String, t: String, mdfive: String, name: String, size: String, time: String)
val ormTransfers = structuredTransfer20RDD.map(arg => ZapyaTransfer(arg._1, arg._2, arg._3, arg._4, arg._5, arg._6))
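
To see why the short names matter, here is a hypothetical counter-example (BadTransfer and the failing query are illustrations, not part of the original job):

// hypothetical: "from"/"to" compile fine as Scala field names, but the SQL parser
// treats them as keywords, so querying such columns is expected to fail:
case class BadTransfer(from: String, to: String)
// sqlContext.sql("SELECT from FROM bad")  // expected: parse error on the keyword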

val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.createSchemaRDD
ormTransfers.registerAsTable("transfers")
// this query is just a simple example
val temp = sqlContext.sql("SELECT count(*) FROM transfers WHERE f = '8677380150235314813F370E464'")
temp.map(c => "count(*): " + c(0)).collect().foreach(println)
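
The same table supports richer queries. Below is a hedged sketch of a follow-up aggregation (perSender is an invented name; GROUP BY worked in the Spark SQL of that era, but verify against your version):

// hypothetical follow-up: transfers per sender, top 10 by count
val perSender = sqlContext.sql("SELECT f, COUNT(*) FROM transfers GROUP BY f")
perSender.map(row => (row.getLong(1), row.getString(0)))
         .top(10)
         .foreach { case (cnt, sender) => println(sender + "\t" + cnt) }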
