spark笔记-SQL

Spark SQL 笔记(Spark 1.6)
spark-shell --master local[1]


创建SQLContext
// spark-shell already provides an existing SparkContext bound to `sc`, so it is
// not declared here: a bare `val sc: SparkContext` without a value is an abstract
// declaration and is rejected when typed into the REPL.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
// Brings the implicit conversions into scope that turn an RDD into a DataFrame
// (this is what makes `.toDF()` available below).
import sqlContext.implicits._




b.txt 的内容格式:每行为 name,age,例如 asda,18 和 asdwrf,20
// RDD demo: build a DataFrame from a comma-separated text file and query it with SQL.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._

// One input line ("name,age") mapped to a typed record.
case class Person(name: String, age: Int)

// Split every line on ",", turn the two fields into a Person and convert the RDD to a DataFrame.
val people = sc.textFile("hdfs://192.168.192.137:9000/data/b.txt").
  map(_.split(",")).
  map(p => Person(p(0), p(1).trim.toInt)).
  toDF()

// Register the DataFrame as a temporary table so SQL can refer to it as "people".
// DataFrame has no `registerAsTable`; `registerTempTable` is the method this file
// uses for the same purpose in the Parquet example.
people.registerTempTable("people")


val teenagers = sqlContext.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 39")
// Three ways to read values out of the result rows:
// 1. by column position
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
// 2. by column name
teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)
// 3. several columns at once, as a Map keyed by column name
teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)




val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
case class Person(name: String, age: Int)

// Create the DataFrame with .toDF(); alternatively it can be read from JSON:
//   val df = sqlContext.read.json("examples/src/main/resources/people.json")
val df = sc.textFile("hdfs://192.168.192.137:9000/data/b.txt").
  map(_.split(",")).
  map(p => Person(p(0), p(1).trim.toInt)).
  toDF()

// Print the rows as a table.
df.show()
// Print the schema as a tree.
df.printSchema()
// Select a single column.
df.select("name").show()
// Select all rows, with the age column incremented by 1.
df.select(df("name"), df("age") + 1).show()
// Select a column from the rows matching a condition, first with column
// expressions, then with string expressions. The same query can also be written
// in SQL after registering the DataFrame as a table.
df.where('age >= 10).where('age <= 39).select('name).show()
df.where("age >= 10").where("age <= 39").select("name").show()
// Keep only the rows matching a condition.
df.filter(df("age") > 21).show()
// Group by a column and count the rows in each group (also expressible in SQL).
df.groupBy("age").count().show()

// Left join (also expressible in SQL). Column equality is written with THREE
// equals signs (===). The join needs a second DataFrame, and none is defined
// before this point in the notes, so a small sample one is built here; replace
// it with the real right-hand table.
val df2 = Seq(("asda", "beijing")).toDF("name", "city")
df.join(df2, df("name") === df2("name"), "left").show()


创建Dataset(数据集)
// A Dataset built from a local collection of Ints.
val ds = Seq(1, 2, 3).toDS()
ds.map(n => n + 1).collect() // Returns: Array(2, 3, 4)


// A Dataset of case-class instances ("DS" is short for Dataset).
case class Person(name: String, age: Long)
val ds = Seq(Person("Andy", 32)).toDS()


// Read a JSON file as a DataFrame and view it as a Dataset of Person.
val path = "examples/src/main/resources/people.json"
val people = sqlContext.read.
  json(path).
  as[Person]


// DSL demo: the age-range query written with DataFrame operations instead of SQL.
val teenagers_dsl = df.
  where("age >= 10").
  where("age <= 39").
  select("name")
teenagers_dsl.map(row => "Name: " + row(0)).collect().foreach(println)


// Parquet export demo. Per the original note, Parquet is Spark's default data source format.
df.select("name", "age").
  write.
  format("parquet").
  save("hdfs://192.168.192.137:9000/data/namesAndAges.parquet")
// `people` here is meant to be a DataFrame (DF), e.g. one produced with toDF().
people.write.parquet("people.parquet")


// Import (load) demo.
// JSON through the generic format(...).load(...) API.
val df = sqlContext.read.
  format("json").
  load("examples/src/main/resources/people.json")
// JSON through the json(...) shortcut.
val people = sqlContext.read.json("examples/src/main/resources/people.json")
// load(...) with no explicit format, pointed at the Parquet output of the export demo.
val df2 = sqlContext.read.load("hdfs://192.168.192.137:9000/data/namesAndAges.parquet")
// Parquet through the parquet(...) shortcut.
val parquetFile = sqlContext.read.parquet("people.parquet")


df2.show()
// Run SQL directly against a file, without registering a table first:
// all columns, then only the name column.
val df3 = sqlContext.sql("SELECT * FROM parquet.`hdfs://192.168.192.137:9000/data/namesAndAges.parquet`")
val df3 = sqlContext.sql("SELECT name FROM parquet.`hdfs://192.168.192.137:9000/data/namesAndAges.parquet`")


// Full Parquet round trip: text file -> DataFrame -> Parquet -> temp table -> SQL.
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
case class Person(name: String, age: Int)

val people = sc.textFile("hdfs://192.168.192.137:9000/data/b.txt").
  map(line => line.split(",")).
  map(fields => Person(fields(0), fields(1).trim.toInt)).
  toDF()

// Relative output path; per the original note it ended up under
// hdfs://hadoop-master.dragon.org:9000/user/root/ on the author's cluster.
people.write.parquet("people.parquet")

// Read the Parquet data back and register it as a temporary table.
val parquetFile = sqlContext.read.parquet("people.parquet")
parquetFile.registerTempTable("parquetFile")

val teenagers = sqlContext.sql("SELECT name FROM parquetFile WHERE age >= 13 AND age <= 39")
teenagers.map(row => "Name: " + row(0)).collect().foreach(println)


分区
// Implicit conversions needed to turn an RDD into a DataFrame.
import sqlContext.implicits._

// First DataFrame; "single" and "double" are the column names.
val df1 = sc.makeRDD(1 to 5).
  map(n => (n, n * 2)).
  toDF("single", "double")
// Write it into the partition directory key=1. Per the original note the relative path
// resolved to hdfs://hadoop-master.dragon.org:9000/user/root/data/test_table/key=1.
df1.write.parquet("data/test_table/key=1")

// Second DataFrame with a different column set ("single", "triple"), written to partition key=2.
val df2 = sc.makeRDD(6 to 10).
  map(n => (n, n * 3)).
  toDF("single", "triple")
df2.write.parquet("data/test_table/key=2")

// Read the whole table directory. With "mergeSchema" set to "true" the differing
// column sets of the partitions are merged into a single schema.
val df3 = sqlContext.read.
  option("mergeSchema", "true").
  parquet("data/test_table")
// Print the resulting schema as a tree.
df3.printSchema()


利用Hive完成查询——如果出错,请更换为支持Hive的Spark版本
在后台启动Hive metastore:hive --service metastore > metastore.log 2>&1 &
启动spark-shell:spark-shell --master local[1]
创建HiveContext
// Create a HiveContext; it is bound to the name `sqlContext`, and every query
// below goes through that same value.
val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)


// Create the example table and load the sample key/value file shipped with Spark into it.
sqlContext.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
sqlContext.sql("LOAD DATA LOCAL INPATH '/opt/modules/spark-1.6.0-bin-hadoop2.6/examples/src/main/resources/kv1.txt' INTO TABLE src")


// Queries are expressed in HiveQL
sqlContext.sql("FROM src SELECT key, value").collect().foreach(println)


// Count the rows of SOGOUQ1. The table is not created in these notes and is
// assumed to exist in Hive already. The original called `hiveContext.hql(...)`,
// but no `hiveContext` is defined here; `sql` on the HiveContext created above
// is the query method used throughout this file.
sqlContext.sql("Select count(*) from SOGOUQ1").collect().foreach(println)


// Show the first 10 rows of SOGOUQ1.
sqlContext.sql("select * from SOGOUQ1 limit 10").collect().foreach(println)



你可能感兴趣的:(sql,spark)