Note:
A DataFrame can be saved as a Parquet file. There are two ways to do this:
df.write.mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet")
df.write.mode(SaveMode.Overwrite).parquet("./sparksql/parquet")
SaveMode specifies what to do when data already exists at the target path. The options are as follows (each one is illustrated in the sketch after this list):
Overwrite: overwrite the existing data
Append: append to the existing data
ErrorIfExists: throw an error if data already exists
Ignore: skip the write if data already exists
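A minimal sketch of the four modes applied to the same write (df and the path are the ones used in the examples below; the mode can also be passed as the strings "overwrite", "append", "error", and "ignore"):
df.write.mode(SaveMode.Overwrite).parquet("./sparksql/parquet")     //replace any existing output
df.write.mode(SaveMode.Append).parquet("./sparksql/parquet")        //add new files next to the existing ones
df.write.mode(SaveMode.ErrorIfExists).parquet("./sparksql/parquet") //the default: fail if the path already exists
df.write.mode(SaveMode.Ignore).parquet("./sparksql/parquet")        //do nothing if the path already exists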
Method 1: using the format method
package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SaveMode
object Parquet {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("parquet")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
//Read the JSON file; textFile returns an RDD of strings, one per line
val jsonRDD = sc.textFile("sparksql/json")
//Convert the RDD into a DataFrame
val df = sqlContext.read.json(jsonRDD)
//Save the DataFrame as a Parquet file
df.write.mode(SaveMode.Overwrite).format("parquet").save("./sparksql/parquet")
//Read the Parquet file back
val result = sqlContext.read.parquet("./sparksql/parquet")
result.show()
sc.stop()
}
}
Run result
After refreshing the project, a new parquet directory appears under the project's sparksql directory, as shown below.
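A rough sketch of what the generated ./sparksql/parquet directory typically contains (the exact part-file names vary by run and Spark version, so this is only an illustration):
_SUCCESS
_common_metadata
_metadata
part-r-00000-xxxxxxxx.gz.parquet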
Method 2: using the parquet method
package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SaveMode
object Parquet {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("parquet")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
//Read the JSON file; textFile returns an RDD of strings, one per line
val jsonRDD = sc.textFile("sparksql/json")
//Convert the RDD into a DataFrame
val df = sqlContext.read.json(jsonRDD)
//Save the DataFrame as a Parquet file
df.write.mode(SaveMode.Overwrite).parquet("./sparksql/parquet")
//Read the Parquet file back
val result = sqlContext.read.format("parquet").load("./sparksql/parquet")
result.show()
sc.stop()
}
}
Create a database named spark in MySQL, and create two tables in it: score and person.
Create the database: create database spark default charset utf8;
Switch to it: use spark;
Create the tables:
create table score(
id int primary key auto_increment,
name varchar(20),
score int
)engine=innodb default charset utf8;
create table person(
id int primary key auto_increment,
name varchar(20),
age int
)engine=innodb default charset utf8;
Insert some sample data:
insert into score values (1,'张三',98);
insert into score values (2,'李四',78);
insert into score values (3,'王五',68);
insert into score values (4,'赵六',88);
insert into person values (1,'张三',23);
insert into person values (2,'李四',33);
insert into person values (3,'王五',25);
insert into person values (4,'赵六',26);
Add the MySQL JDBC driver jar to the project (a dependency sketch follows).
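If the project is built with sbt, the driver can be declared like this (Maven works the same way with the mysql:mysql-connector-java coordinates; the version shown is only an assumption and should match your MySQL server):
libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.47"  //provides com.mysql.jdbc.Driver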
Requirement: connect to the database and read data from it.
Method 1:
package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import java.util.HashMap
object Mysql {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("mysql")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
//First way: read a MySQL table to create a DataFrame
//Collect the connection options in a map
val options = new HashMap[String,String]()
options.put("url", "jdbc:mysql://localhost:3306/spark")
options.put("driver","com.mysql.jdbc.Driver")
options.put("user","root")
options.put("password", "root")
options.put("dbtable","person")
val person = sqlContext.read.format("jdbc").options(options).load()
person.show()
sc.stop()
}
}
Run result
Method 2:
package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
object Mysql {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("mysql")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
//Second way: read a MySQL table to create a DataFrame
//Set the connection options on the reader
val reader = sqlContext.read.format("jdbc")
reader.option("url", "jdbc:mysql://localhost:3306/spark")
reader.option("driver","com.mysql.jdbc.Driver")
reader.option("user","root")
reader.option("password","root")
reader.option("dbtable", "score")
val score = reader.load()
score.show()
sc.stop()
}
}
Run result
Join query across the two tables
package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import java.util.HashMap
object Mysql {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("mysql")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
/**
* First way: read a MySQL table to create a DataFrame
*/
val options = new HashMap[String,String]()
options.put("url", "jdbc:mysql://localhost:3306/spark")
options.put("driver","com.mysql.jdbc.Driver")
options.put("user","root")
options.put("password", "root")
options.put("dbtable","person")
val person = sqlContext.read.format("jdbc").options(options).load()
person.registerTempTable("person")
/**
* Second way: read a MySQL table to create a DataFrame
*/
val reader = sqlContext.read.format("jdbc")
reader.option("url", "jdbc:mysql://localhost:3306/spark")
reader.option("driver","com.mysql.jdbc.Driver")
reader.option("user","root")
reader.option("password","root")
reader.option("dbtable", "score")
val score = reader.load()
score.registerTempTable("score")
//Join the two temporary tables
val result = sqlContext.sql("select person.id,person.age,score.name,score.score from person,score where person.name = score.name")
result.show()
sc.stop()
}
}
Run result
Writing data into the database
Requirement: write the join result into a result table.
package com.gw.sparksql
import java.util.HashMap
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SaveMode
import java.util.Properties
object Mysql {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local").setAppName("mysql")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
/**
* First way: read a MySQL table to create a DataFrame
*/
val options = new HashMap[String,String]()
options.put("url", "jdbc:mysql://localhost:3306/spark")
options.put("driver","com.mysql.jdbc.Driver")
options.put("user","root")
options.put("password", "root")
options.put("dbtable","person")
val person = sqlContext.read.format("jdbc").options(options).load()
person.registerTempTable("person")
/**
* Second way: read a MySQL table to create a DataFrame
*/
val reader = sqlContext.read.format("jdbc")
reader.option("url", "jdbc:mysql://localhost:3306/spark")
reader.option("driver","com.mysql.jdbc.Driver")
reader.option("user","root")
reader.option("password","root")
reader.option("dbtable", "score")
val score = reader.load()
score.registerTempTable("score")
//Join the two temporary tables
val result = sqlContext.sql("select person.id,person.age,score.name,score.score from person,score where person.name = score.name")
result.show()
/**
* Write the result DataFrame into a MySQL table
*/
val properties = new Properties()
properties.setProperty("user", "root")
properties.setProperty("password", "root")
result.write.mode(SaveMode.Append).jdbc("jdbc:mysql://localhost:3306/spark", "result", properties)
sc.stop()
}
}
Querying the database now shows a new result table holding the rows produced by the join query above; a quick way to check it from Spark is sketched below.
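A minimal sketch of reading the result table back for verification, reusing the sqlContext, JDBC URL and properties values from the example above (all assumed to still be in scope):
val check = sqlContext.read.jdbc("jdbc:mysql://localhost:3306/spark", "result", properties)
check.show()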
HiveContext is a subclass of SQLContext; use HiveContext when working with Hive.
Since there is no local Hive environment, the job has to be submitted to the cluster. Submit command:
./spark-submit --master spark://node01:7077,node02:7077 \
--executor-cores 1 --executor-memory 2G --total-executor-cores 1 \
--class com.gw.sparksql.Hive /root/test/HiveTest.jar
package com.gw.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.hive.HiveContext
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.spark.sql.SaveMode
object Hive {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("HiveSource")
val sc = new SparkContext(conf)
/**
* HiveContext is a subclass of SQLContext.
*/
val hiveContext = new HiveContext(sc)
hiveContext.sql("use spark")
hiveContext.sql("drop table if exists student_infos")
hiveContext.sql("create table if not exists student_infos (name string,age int) row format delimited fields terminated by 't'")
hiveContext.sql("load data local inpath '/root/test/student_infos' into table student_infos")
hiveContext.sql("drop table if exists student_scores")
hiveContext.sql("create table if not exists student_scores (name string,score int) row format delimited fields terminated by 't'")
hiveContext.sql("load data local inpath '/root/test/student_scores' into table student_scores")
val df = hiveContext.sql("select si.name,si.age,ss.score from student_infos si,student_scores ss where si.name = ss.name")
hiveContext.sql("drop table if exists good_student_infos")
/**
* Write the join result into a Hive table
*/
df.write.mode(SaveMode.Overwrite).saveAsTable("good_student_infos")
sc.stop()
}
}
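A minimal sketch of reading the new Hive table back for a quick check, assuming the same hiveContext as above (table() returns the Hive table as a DataFrame):
val saved = hiveContext.table("good_student_infos")
saved.show()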