Spark SQL is Apache Spark’s module for working with structured data.
// Spark SQL can read any external data source through one unified API
val dataFrame = sparkSession.read.<formatMethod>("<path to a file in that format>")
val rdd = dataFrame.rdd
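Concretely, each built-in format has its own reader method, but the call shape is always the same. A minimal sketch, assuming a SparkSession named spark and that the example files exist at the given paths (data.csv is only a placeholder):
val textDF = spark.read.text("person.txt")           // plain text, one value column
val jsonDF = spark.read.json("person.json")          // JSON lines
val parquetDF = spark.read.parquet("users.parquet")  // Parquet with embedded schema
val csvDF = spark.read.option("header", "true").csv("data.csv") // CSV (placeholder path)
// Generic form, equivalent to the shortcuts above:
val genericDF = spark.read.format("json").load("person.json")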
1 youyou 38
2 Tony 25
3 laowang 18
4 dali 30
object Case01_ReadText {
def main(args: Array[String]): Unit = {
// Create the SparkSession
val spark = SparkSession.builder().appName(this.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master("local[*]")
.getOrCreate()
val df: DataFrame = spark.read.text(this.getClass.getClassLoader.getResource("person.txt").getPath)
/**
* Print the schema:
* root
* |-- value: string (nullable = true)
*/
df.printSchema
println("----------------")
println(df.count()) // 4
/**
* +------------+
* | value|
* +------------+
* | 1 youyou 38|
* | 2 Tony 25|
* |3 laowang 18|
* | 4 dali 30|
* +------------+
*/
println("----------------")
df.show()
spark.stop()
}
}
case class Person(id: String, name: String, age: Int)
object Case02_ReadTextV2 {
def main(args: Array[String]): Unit = {
// Create the SparkSession
val spark = SparkSession.builder().appName(this.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master("local[*]")
.getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
// Import implicit conversions
import spark.implicits._
val rdd1: RDD[Array[String]] = sc.textFile(this.getClass.getClassLoader.getResource("person.txt").getPath)
.map(x => x.split(" "))
// Associate the RDD with the Person case class
val personRDD: RDD[Person] = rdd1.map(x => Person(x(0), x(1), x(2).toInt))
// Convert the RDD to a DataFrame
val df = personRDD.toDF
/**
* root
* |-- id: string (nullable = true)
* |-- name: string (nullable = true)
* |-- age: integer (nullable = false)
*/
df.printSchema()
/**
* +---+-------+---+
* | id| name|age|
* +---+-------+---+
* | 1| youyou| 38|
* | 2| Tony| 25|
* | 3|laowang| 18|
* | 4| dali| 30|
* +---+-------+---+
*/
df.show()
spark.stop()
}
}
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}
object Case03_ReadJson {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName(this.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master("local[*]")
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
val df: DataFrame = spark.read.json(this.getClass.getClassLoader.getResource("person.json").getPath)
/**
* root
* |-- age: long (nullable = true)
* |-- name: string (nullable = true)
*/
df.printSchema
println("--------------")
/**
* +----+-------+
* | age| name|
* +----+-------+
* |null|Michael|
* | 30| Andy|
* | 19| Justin|
* +----+-------+
*/
df.show()
spark.stop()
}
}
Copy spark-2.3.3-bin-hadoop2.7/examples/src/main/resources/users.parquet into your project's resources directory.
object Case04_ReadParquet {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName(this.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master("local[*]")
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
val df: DataFrame = spark.read.parquet(this.getClass.getClassLoader.getResource("users.parquet").getPath)
/**
* root
* |-- name: string (nullable = true)
* |-- favorite_color: string (nullable = true)
* |-- favorite_numbers: array (nullable = true)
* | |-- element: integer (containsNull = true)
*/
df.printSchema
/**
* +------+--------------+----------------+
* | name|favorite_color|favorite_numbers|
* +------+--------------+----------------+
* |Alyssa| null| [3, 9, 15, 20]|
* | Ben| red| []|
* +------+--------------+----------------+
*/
df.show
spark.stop()
}
}
object Case05_StructTypeSchema {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName(this.getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
val rdd: RDD[Array[String]] = sc.textFile(this.getClass.getClassLoader.getResource("person.txt").getPath)
.map(x => x.split(" "))
// Map the RDD to Row objects
val rowRDD: RDD[Row] = rdd.map(x => Row(x(0), x(1), x(2).toInt))
// Define the DataFrame schema; the number of fields and their types must match the Row objects
val schema = StructType(
StructField("id", StringType) ::
StructField("name", StringType) ::
StructField("age", IntegerType) :: Nil
)
// Build the DataFrame from the RDD and the schema
val df: DataFrame = spark.createDataFrame(rowRDD, schema)
/**
* root
* |-- id: string (nullable = true)
* |-- name: string (nullable = true)
* |-- age: integer (nullable = true)
*/
df.printSchema
/**
* +---+-------+---+
* | id| name|age|
* +---+-------+---+
* | 1| youyou| 38|
* | 2| Tony| 25|
* | 3|laowang| 18|
* | 4| dali| 30|
* +---+-------+---+
*/
df.show()
// Query the structured data with SQL
df.createTempView("person")
/**
* +---+-------+---+
* | id| name|age|
* +---+-------+---+
* | 1| youyou| 38|
* | 2| Tony| 25|
* | 3|laowang| 18|
* | 4| dali| 30|
* +---+-------+---+
*/
spark.sql("select * from person").show()
spark.stop()
}
}
// Convert the RDD to a DataFrame
val personDF = personRDD.toDF
personDF.printSchema()
personDF.show()
/************************** DSL-style syntax: start *************************/
// 1. Select specific columns
personDF.select("name").show
personDF.select($"name").show
// 2. Compute age + 1
personDF.select($"name", $"age", $"age" + 1).show
// 3. Filter rows where age > 30
personDF.filter($"age" > 30).show
// 4. Group by age and count
personDF.groupBy("age").count.show
// 5. Group by age, count, and sort by age in descending order
personDF.groupBy("age").count().sort($"age".desc).show
/************************** DSL-style syntax: end *************************/
/************************** SQL-style syntax: start *************************/
// 1. Register the DataFrame as a temporary view
personDF.createTempView("person")
// 2. Run queries through SparkSession.sql
spark.sql("select * from person").show
spark.sql("select name from person").show
spark.sql("select name, age from person").show
spark.sql("select * from person where age > 30").show
spark.sql("select count(*) from person where age > 30").show
spark.sql("select age, count(*) from person group by age").show
spark.sql("select age, count(*) as count from person group by age").show
spark.sql("select * from person order by age desc").show
/************************** SQL-style syntax: end *************************/
type DataFrame = Dataset[Row]
val ds = spark.createDataset(1 to 10) // from a Scala collection
ds.show
val ds = spark.createDataset(sc.textFile("/person.txt")) // from an RDD
ds.show
sc.textFile("/person.txt").toDS
List(1,2,3,4,5).toDS
val ds = dataFrame.as[Person] // as[T], where T is a strongly-typed case class
List(1,2,3,4,5).toDS.map(x => x * 10)
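Because a DataFrame is just a Dataset[Row], as[T] turns it back into a strongly-typed Dataset with compile-time field access. A small sketch, assuming the Person case class and import spark.implicits._ from the examples above:
val df: DataFrame = Seq(Person("1", "youyou", 38)).toDF()
df.select($"age" + 1).show()   // untyped: the column is resolved by name at runtime
val ds: Dataset[Person] = df.as[Person]
ds.map(p => p.age + 1).show()  // typed: p.age is checked at compile time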
How are RDD, DataFrame, and Dataset related?
How do the three convert to one another?
case class Person(id: String, name: String, age: Int)
object Case06_SparkConversion {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
val spark = SparkSession.builder().config(conf).getOrCreate()
val sc = spark.sparkContext
sc.setLogLevel("WARN")
// Implicit conversions
import spark.implicits._
val rdd = sc.textFile(this.getClass.getClassLoader.getResource("person.txt").getPath)
.map(x => x.split(" "))
// Associate the RDD with the Person case class
val personRDD = rdd.map(x => Person(x(0), x(1), x(2).toInt))
// 1. rdd -> df
val df1 = personRDD.toDF
df1.show
// 2. rdd -> ds
val ds1 = personRDD.toDS
ds1.show
// 3. df -> rdd
val rdd1 = df1.rdd
println(rdd1.collect.toList)
// 4. ds -> rdd
val rdd2 = ds1.rdd
println(rdd2.collect.toList)
// 5. ds -> df
val df2: DataFrame = ds1.toDF
df2.show
// 6. df -> ds
val ds2: Dataset[Person] = df2.as[Person]
ds2.show
spark.stop()
}
}
/**
* Read and write MySQL tables with Spark SQL
*/
object Case07_ReadMySQL {
def main(args: Array[String]): Unit = {
// 1. Create the SparkConf
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
// 2. Create the SparkSession
val spark = SparkSession.builder().config(conf).getOrCreate()
// 3. Create a DataFrame from the JDBC source
val url = "jdbc:mysql://192.168.254.132:3306/mydb?characterEncoding=UTF-8"
val tableName = "jobdetail"
val props = new Properties()
props.setProperty("user", "root")
props.setProperty("password", "123456")
val mysqlDF: DataFrame = spark.read.jdbc(url, tableName, props)
// 4. Work with the MySQL data
// 4.1 Print the schema
mysqlDF.printSchema()
// 4.2 Show the data
mysqlDF.show()
// 4.3 Register the DataFrame as a temp view
mysqlDF.createTempView("job_detail")
spark.sql("select * from job_detail where city = '广东'").show()
spark.stop()
}
}
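The same JDBC read can also be written in the generic format("jdbc") options style, which is handy when connection settings come from configuration. A sketch assuming the same MySQL instance and credentials:
val jdbcDF: DataFrame = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://192.168.254.132:3306/mydb?characterEncoding=UTF-8")
  .option("dbtable", "jobdetail")
  .option("user", "root")
  .option("password", "123456")
  .load()
jdbcDF.show()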
object Case08_ReadCsvWriteMySQL {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
spark.sparkContext.setLogLevel("WARN")
val df: DataFrame = spark.read.format("csv")
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") // 时间转换
.option("header", "true") // 第一行数据都是head(字段属性的意思)
// .option("multiLine", "true") // 数据可能换行
.load(this.getClass.getClassLoader.getResource("data").getPath)
df.createOrReplaceTempView("job_detail")
spark.sql("select job_name,job_url,job_location,job_salary,job_company,job_experience,job_class,job_given,job_detail,company_type,company_person,search_key,city from job_detail where job_company = '北京无极慧通科技有限公司'").show(80)
val props = new Properties()
props.put("user", "root")
props.put("password", "123456")
df.write.mode(SaveMode.Append).jdbc(
"jdbc:mysql://192.168.254.132:3306/mydb?useSSL=false&useUnicode=true&characterEncoding=UTF-8",
"mydb.jobdetail_copy", props
)
}
}
$ pwd
/bigdata/install/hive-3.1.2/conf
$ scp hive-site.xml node01:/bigdata/install/spark-2.3.3-bin-hadoop2.7/conf/
$ scp hive-site.xml node02:/bigdata/install/spark-2.3.3-bin-hadoop2.7/conf/
$ scp hive-site.xml node03:/bigdata/install/spark-2.3.3-bin-hadoop2.7/conf/
$ ll mysql-connector-java-5.1.38.jar
-rw-rw-r--. 1 hadoop hadoop 983911 Dec  6 2021 mysql-connector-java-5.1.38.jar
$ pwd
/bigdata/install/hive-3.1.2/lib
$ scp mysql-connector-java-5.1.38.jar node01:/bigdata/install/spark-2.3.3-bin-hadoop2.7/jars/
$ scp mysql-connector-java-5.1.38.jar node02:/bigdata/install/spark-2.3.3-bin-hadoop2.7/jars/
$ scp mysql-connector-java-5.1.38.jar node03:/bigdata/install/spark-2.3.3-bin-hadoop2.7/jars/
The default is spark.sql.warehouse.dir=spark-warehouse. Once Hive and Spark SQL are integrated, starting the spark-sql script creates a spark-warehouse directory under the current working directory; it holds the data for databases and tables created through spark-sql, which is a different path from Hive's existing data (the two can still access each other). Keeping spark-sql table data on the local file system is inconvenient and unsafe, so point the warehouse at HDFS instead: --conf spark.sql.warehouse.dir=hdfs://node01:8020/user/hive/warehouse
CREATE EXTERNAL TABLE `student`(
`ID` bigint COMMENT '',
`CreatedBy` string COMMENT 'created by',
`CreatedTime` string COMMENT 'creation time',
`UpdatedBy` string COMMENT 'updated by',
`UpdatedTime` string COMMENT 'update time',
`Version` int COMMENT 'version number',
`name` string COMMENT 'name'
) COMMENT 'student table'
PARTITIONED BY (`dt` String COMMENT 'partition')
row format delimited fields terminated by '\t'
location '/student';
INSERT INTO TABLE student partition(dt='2022-07-12') VALUES(1, "xxx", "2022-07-12", "", "", 1, "zhangsan");
INSERT INTO TABLE student partition(dt='2022-07-12') VALUES(2, "xxx", "2022-07-12", "", "", 2, "lisi");
$ spark-sql --master local[2] \
--executor-memory 512m --total-executor-cores 3 \
--conf spark.sql.warehouse.dir=hdfs://node01:8020/user/hive/warehouse
# Run the query
select * from student;
#!/bin/sh
# Define the spark-sql submit command
SUBMIT_INFO="spark-sql --master spark://node01:7077 --executor-memory 1g --total-executor-cores 4 --conf spark.sql.warehouse.dir=hdfs://node01:8020/user/hive/warehouse"
# Define a SQL statement
SQL="select * from student;"
# Run the SQL statement, similar to hive -e "sql"
echo "$SUBMIT_INFO"
echo "$SQL"
$SUBMIT_INFO -e "$SQL"
$ sh spark_on_hive.sh
<property>
  <name>hive.metastore.uris</name>
  <value>thrift://node03:9083</value>
  <description>Thrift URI for the remote metastore</description>
</property>
<property>
  <name>hive.server2.thrift.min.worker.threads</name>
  <value>5</value>
</property>
<property>
  <name>hive.server2.thrift.max.worker.threads</name>
  <value>500</value>
</property>
$ pwd
/bigdata/install/hive-3.1.2/conf
$ scp hive-site.xml node01:/bigdata/install/spark-2.3.3-bin-hadoop2.7/conf/
$ scp hive-site.xml node02:/bigdata/install/spark-2.3.3-bin-hadoop2.7/conf/
$ scp hive-site.xml node03:/bigdata/install/spark-2.3.3-bin-hadoop2.7/conf/
hive --service metastore
$ pwd
/bigdata/install/spark-2.3.3-bin-hadoop2.7/sbin
$ ./start-thriftserver.sh --master local[*] --executor-memory 2g --total-executor-cores 5
$ beeline --color=true
beeline> !connect jdbc:hive2://node03:10000
Connecting to jdbc:hive2://node03:10000
Enter username for jdbc:hive2://node03:10000: hadoop
Enter password for jdbc:hive2://node03:10000: ******
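Once the connection is established, SQL runs against the Spark thrift server just as it would in the spark-sql shell, for example (assuming the student table created earlier):
0: jdbc:hive2://node03:10000> select * from student;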
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-hive_2.11</artifactId>
  <version>2.3.3</version>
</dependency>
object Case09_SparkSQLOnHive {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]")
.enableHiveSupport() // enable Hive support
.config("spark.sql.warehouse.dir", "hdfs://node01:8020/user/hive/warehouse")
.getOrCreate()
val df: DataFrame = spark.sql("select * from student")
df.show()
// Write the DataFrame directly with saveAsTable, then insert rows via INSERT INTO
df.write.saveAsTable("student1")
spark.sql("insert into student1 select * from student")
}
}
<dependency>
  <groupId>org.json4s</groupId>
  <artifactId>json4s-jackson_2.11</artifactId>
  <version>3.3.0</version>
</dependency>
create 'spark_hbase','info'
put 'spark_hbase','0001','info:name','tangseng'
put 'spark_hbase','0001','info:age','30'
put 'spark_hbase','0001','info:sex','0'
put 'spark_hbase','0001','info:addr','beijing'
put 'spark_hbase','0002','info:name','sunwukong'
put 'spark_hbase','0002','info:age','508'
put 'spark_hbase','0002','info:sex','0'
put 'spark_hbase','0002','info:addr','shanghai'
put 'spark_hbase','0003','info:name','zhubajie'
put 'spark_hbase','0003','info:age','715'
put 'spark_hbase','0003','info:sex','0'
put 'spark_hbase','0003','info:addr','shenzhen'
put 'spark_hbase','0004','info:name','bailongma'
put 'spark_hbase','0004','info:age','1256'
put 'spark_hbase','0004','info:sex','0'
put 'spark_hbase','0004','info:addr','donghai'
put 'spark_hbase','0005','info:name','shaheshang'
put 'spark_hbase','0005','info:age','1008'
put 'spark_hbase','0005','info:sex','0'
put 'spark_hbase','0005','info:addr','tiangong'
create "spark_hbase_copy",'info'
object Case10_SparkSQLOnHBase {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
import spark.implicits._
val hconf: Configuration = HBaseConfiguration.create
hconf.set(HConstants.ZOOKEEPER_QUORUM, "node01:2181,node02:2181,node03:2181")
val hbaseContext = new HBaseContext(spark.sparkContext, hconf)
// Define the mapping catalog; the column types must match how the values were stored (all strings here)
val catalog: String =
  """{"table":{"namespace":"default","name":"spark_hbase"},"rowkey":"key","columns":{"f0":{"cf":"rowkey","col":"key","type":"string"},"f1":{"cf":"info","col":"addr","type":"string"},"f2":{"cf":"info","col":"age","type":"string"},"f3":{"cf":"info","col":"name","type":"string"}}}"""
// Read data from HBase
val ds: DataFrame = spark.read.format("org.apache.hadoop.hbase.spark")
.option(HBaseTableCatalog.tableCatalog, catalog)
.load()
ds.show(10)
val catalogCopy: String = catalog.replace("spark_hbase", "spark_hbase_copy")
// Write the data to HBase
ds.write.format("org.apache.hadoop.hbase.spark")
.option(HBaseTableCatalog.tableCatalog, catalogCopy)
.mode(SaveMode.Overwrite)
.save()
}
}
object Case11_SparkUDF {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
val spark = SparkSession.builder().config(conf).getOrCreate()
val df: DataFrame = spark.read.format("csv")
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.option("header", "true")
.option("multiLine", "true")
.load("/Volumes/F/MyGitHub/bigdata/spark-demo/src/main/resources/深圳链家二手房成交明细.csv")
df.createOrReplaceTempView("house_sale")
// Register the UDF
spark.udf.register("house_udf", new UDF1[String, String] {
val pattern: Pattern = Pattern.compile("^[0-9]*$")
override def call(input: String): String = {
val matcher = pattern.matcher(input)
if (matcher.matches()) input
else "1990"
}
}, DataTypes.StringType)
// Use the UDF in SQL
spark.sql("select house_udf(house_age) from house_sale limit 200").show()
spark.stop()
}
}
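In Scala the same logic can also be registered as a plain function, which is shorter than the Java UDF1 interface. A sketch assuming the same house_sale view (the name house_udf_v2 is only for illustration):
spark.udf.register("house_udf_v2", (input: String) =>
  if (input != null && input.matches("^[0-9]*$")) input else "1990"
)
spark.sql("select house_udf_v2(house_age) from house_sale limit 200").show()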
object Case12_SparkUDAF {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
val spark = SparkSession.builder().config(conf).getOrCreate()
val df: DataFrame = spark.read.format("csv")
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.option("header", "true")
.option("multiLine", "true")
.load("/Volumes/F/MyGitHub/bigdata/spark-demo/src/main/resources/深圳链家二手房成交明细.csv")
df.createOrReplaceTempView("house_sale")
spark.sql("select floor from house_sale limit 30").show()
spark.udf.register("udaf", new MyAverage)
spark.sql("select floor, udaf(house_sale_money) from house_sale group by floor").show()
df.printSchema()
spark.stop()
}
}
class MyAverage extends UserDefinedAggregateFunction {
// Data type of the aggregate function's input
override def inputSchema: StructType = StructType(StructField("floor", DoubleType) :: Nil)
// Data types of the values in the aggregation buffer
override def bufferSchema: StructType = {
StructType(StructField("sum", DoubleType) :: StructField("count", LongType) :: Nil)
}
// Return type
override def dataType: DataType = DoubleType
// Whether the same input always produces the same output
override def deterministic: Boolean = true
// Initialize the buffer
override def initialize(buffer: MutableAggregationBuffer): Unit = {
// Running total of sale amounts for each floor type
buffer(0) = 0D
// Running count of records for each floor type
buffer(1) = 0L
}
// Merge new input rows into the buffer on the same executor (intra-partition aggregation)
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
if (!input.isNullAt(0)) {
buffer(0) = buffer.getDouble(0) + input.getDouble(0)
buffer(1) = buffer.getLong(1) + 1
}
}
// Merge buffers coming from different executors (inter-partition aggregation)
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
buffer1(0) = buffer1.getDouble(0) + buffer2.getDouble(0)
buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
}
// Compute the final result
override def evaluate(buffer: Row): Any = buffer.getDouble(0) / buffer.getLong(1)
}
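Note that UserDefinedAggregateFunction is superseded by the typed Aggregator API in later Spark versions. The same average could be sketched roughly as follows (the Sale case class is an assumption for illustration, not part of the original code):
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}

case class Sale(floor: String, price: Double)
case class AvgBuffer(sum: Double, count: Long)

object TypedAverage extends Aggregator[Sale, AvgBuffer, Double] {
  override def zero: AvgBuffer = AvgBuffer(0.0, 0L)
  override def reduce(b: AvgBuffer, s: Sale): AvgBuffer = AvgBuffer(b.sum + s.price, b.count + 1)
  override def merge(b1: AvgBuffer, b2: AvgBuffer): AvgBuffer = AvgBuffer(b1.sum + b2.sum, b1.count + b2.count)
  override def finish(b: AvgBuffer): Double = b.sum / b.count
  override def bufferEncoder: Encoder[AvgBuffer] = Encoders.product[AvgBuffer]
  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
// Typed usage on a Dataset[Sale] (also works on Spark 2.x):
// ds.groupByKey(_.floor).agg(TypedAverage.toColumn).show()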
object Case13_SparkUDTF {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
val spark = SparkSession.builder().config(conf).getOrCreate()
import spark.implicits._
val df: DataFrame = spark.read.format("csv")
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.option("header", "true")
.option("multiLine", "true")
.load("/Volumes/F/MyGitHub/bigdata/spark-demo/src/main/resources/深圳链家二手房成交明细.csv")
df.createOrReplaceTempView("house_sale")
// Register the UDTF; sparkSession.udf.register() cannot be used here, so register it by its fully qualified class name
spark.sql("CREATE TEMPORARY FUNCTION MySplit as 'com.yw.spark.example.sql.cases.MySplit'")
spark.sql("select part_place, MySplit(part_place, ' ') from house_sale limit 50").show()
spark.stop()
}
}
class MySplit extends GenericUDTF {
override def initialize(args: Array[ObjectInspector]): StructObjectInspector = {
if (args.length != 2) {
throw new UDFArgumentLengthException("UserDefinedUDTF takes exactly two arguments")
}
// Check that the first argument is a primitive (string) type
if (args(0).getCategory() != ObjectInspector.Category.PRIMITIVE) {
throw new UDFArgumentException("UserDefinedUDTF takes string as a parameter")
}
// Output column name; it is overridden by the alias the user supplies
val fieldNames: ArrayList[String] = new ArrayList[String]()
fieldNames.add("col1")
// Output column types (string here); the number of inspectors must match the number of column names above
val fieldOIs: ArrayList[ObjectInspector] = new ArrayList[ObjectInspector]()
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector)
ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs)
}
override def process(objects: Array[AnyRef]): Unit = {
// Get the input value
val data: String = objects(0).toString
// Get the separator
val splitKey: String = objects(1).toString
// Split the value
val words: Array[String] = data.split(splitKey)
// Emit one output row per token
words.foreach(x => {
// Put the token into a one-element array (one output column)
val tmp: Array[String] = new Array[String](1)
tmp(0) = x
forward(tmp)
})
}
override def close(): Unit = {
// No resources to close
}
}
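For comparison, the same explode-into-rows effect is available without a custom UDTF through the built-in split and explode functions. A sketch assuming the same house_sale view:
spark.sql("select part_place, word from house_sale lateral view explode(split(part_place, ' ')) t as word limit 50").show()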