表和数据
/*
Navicat Premium Data Transfer
Source Server : pdm
Source Server Type : MySQL
Source Server Version : 50173
Source Host : 192.168.1.100:3306
Source Schema : test
Target Server Type : MySQL
Target Server Version : 50173
File Encoding : 65001
Date: 12/06/2019 11:05:31
*/
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for Course
-- ----------------------------
DROP TABLE IF EXISTS `Course`;
CREATE TABLE `Course` (
`CId` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
`Cname` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
`TId` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL
) ENGINE = MyISAM CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of Course
-- ----------------------------
INSERT INTO `Course` VALUES ('01', '语文', '02');
INSERT INTO `Course` VALUES ('02', '数学', '01');
INSERT INTO `Course` VALUES ('03', '英语', '03');
-- ----------------------------
-- Table structure for SC
-- ----------------------------
DROP TABLE IF EXISTS `SC`;
CREATE TABLE `SC` (
`SId` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
`CId` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
`score` decimal(18, 1) DEFAULT NULL
) ENGINE = MyISAM CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of SC
-- ----------------------------
INSERT INTO `SC` VALUES ('01', '01', 80.0);
INSERT INTO `SC` VALUES ('01', '02', 90.0);
INSERT INTO `SC` VALUES ('01', '03', 99.0);
INSERT INTO `SC` VALUES ('02', '01', 70.0);
INSERT INTO `SC` VALUES ('02', '02', 60.0);
INSERT INTO `SC` VALUES ('02', '03', 80.0);
INSERT INTO `SC` VALUES ('03', '01', 80.0);
INSERT INTO `SC` VALUES ('03', '02', 80.0);
INSERT INTO `SC` VALUES ('03', '03', 80.0);
INSERT INTO `SC` VALUES ('04', '01', 50.0);
INSERT INTO `SC` VALUES ('04', '02', 30.0);
INSERT INTO `SC` VALUES ('04', '03', 20.0);
INSERT INTO `SC` VALUES ('05', '01', 76.0);
INSERT INTO `SC` VALUES ('05', '02', 87.0);
INSERT INTO `SC` VALUES ('06', '01', 31.0);
INSERT INTO `SC` VALUES ('06', '03', 34.0);
INSERT INTO `SC` VALUES ('07', '02', 89.0);
INSERT INTO `SC` VALUES ('07', '03', 98.0);
-- ----------------------------
-- Table structure for Student
-- ----------------------------
DROP TABLE IF EXISTS `Student`;
CREATE TABLE `Student` (
`SId` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
`Sname` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
`Sage` datetime DEFAULT NULL,
`Ssex` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL
) ENGINE = MyISAM CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of Student
-- ----------------------------
INSERT INTO `Student` VALUES ('01', '赵雷', '1990-01-01 00:00:00', '男');
INSERT INTO `Student` VALUES ('02', '钱电', '1990-12-21 00:00:00', '男');
INSERT INTO `Student` VALUES ('03', '孙风', '1990-12-20 00:00:00', '男');
INSERT INTO `Student` VALUES ('04', '李云', '1990-12-06 00:00:00', '男');
INSERT INTO `Student` VALUES ('05', '周梅', '1991-12-01 00:00:00', '女');
INSERT INTO `Student` VALUES ('06', '吴兰', '1992-01-01 00:00:00', '女');
INSERT INTO `Student` VALUES ('07', '郑竹', '1989-01-01 00:00:00', '女');
INSERT INTO `Student` VALUES ('09', '张三', '2017-12-20 00:00:00', '女');
INSERT INTO `Student` VALUES ('10', '李四', '2017-12-25 00:00:00', '女');
INSERT INTO `Student` VALUES ('11', '李四', '2012-06-06 00:00:00', '女');
INSERT INTO `Student` VALUES ('12', '赵六', '2013-06-13 00:00:00', '女');
INSERT INTO `Student` VALUES ('13', '孙七', '2014-06-01 00:00:00', '女');
-- ----------------------------
-- Table structure for Teacher
-- ----------------------------
DROP TABLE IF EXISTS `Teacher`;
CREATE TABLE `Teacher` (
`TId` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
`Tname` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL
) ENGINE = MyISAM CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Records of Teacher
-- ----------------------------
INSERT INTO `Teacher` VALUES ('01', '张三');
INSERT INTO `Teacher` VALUES ('02', '李四');
INSERT INTO `Teacher` VALUES ('03', '王五');
SET FOREIGN_KEY_CHECKS = 1;
准备环境
val sc = new SparkSession
.Builder()
.appName("source_data_mysql001")
.master("local")
.getOrCreate()
val url = "jdbc:mysql://192.168.1.100:3306/Strengthen"
val SC = "SC"
val df_SC = sc.read.jdbc(url, SC, dbConnProperties())
df_SC.show() //使用一个action算子来检查是否能读取数据
1.1查询" 01 “课程比” 02 "课程成绩高的学生id及课程分数
val frame1 = sc.sql("select * from sc s1 inner join SC s2 " +
"on s1.SId = s2.SId " +
"where s1.CId = '01' and s2.CId = '02' and s1.score > s2.score")
frame1.show()
1.1 查询同时存在" 01 “课程和” 02 "课程的情况
spark2是支持子查询的
val frame = sc.sql("select * from sc s1 " +
"where s1.SId in " +
"(select distinct(s2.SId) from sc s2 where s2.CId = '02') " +
"and s1.CId = '01'")
frame.show()
+---+---+-----+
|SId|CId|score|
+---+---+-----+
| 01| 01| 80.0|
| 05| 01| 76.0|
| 03| 01| 80.0|
| 02| 01| 70.0|
| 04| 01| 50.0|
+---+---+-----+
2.0语法和mysql感觉没啥区别,where和inner join on也是效果一样
val frame = sc.sql(
"select * from sc s1,sc s2 " +
"where s1.SId = s2.SId " +
"and s1.CId = '01' " +
"and s2.CId = '02' " +
"order by s1.SId")
+---+---+-----+---+---+-----+
|SId|CId|score|SId|CId|score|
+---+---+-----+---+---+-----+
| 01| 01| 80.0| 01| 02| 90.0|
| 02| 01| 70.0| 02| 02| 60.0|
| 03| 01| 80.0| 03| 02| 80.0|
| 04| 01| 50.0| 04| 02| 30.0|
| 05| 01| 76.0| 05| 02| 87.0|
+---+---+-----+---+---+-----+
1.2 查询存在" 01 “课程但可能不存在” 02 "课程的情况(不存在时显示为 null )
重点在于之前的关于join on and 和where and的区别
on and 把02不存在的设置为null
where过滤只保留01课程
val frame = sc.sql("select * from sc s1 " +
"left join sc s2 " +
"on s1.SId = s2.SId " +
"and s2.CId = '02' " +
"where s1.CId = '01' order by s1.SId")
+---+---+-----+----+----+-----+
|SId|CId|score| SId| CId|score|
+---+---+-----+----+----+-----+
| 01| 01| 80.0| 01| 02| 90.0|
| 02| 01| 70.0| 02| 02| 60.0|
| 03| 01| 80.0| 03| 02| 80.0|
| 04| 01| 50.0| 04| 02| 30.0|
| 05| 01| 76.0| 05| 02| 87.0|
| 06| 01| 31.0|null|null| null|
+---+---+-----+----+----+-----+
1.3 查询不存在" 01 “课程但存在” 02 "课程的情况
join on and 和where and,然后过滤得到 is null
但是例如01,03都不存在,02存在,感觉有问题
val frame = sc.sql("select * from sc s1 " +
"left join sc s2 " +
"on s1.SId = s2.SId " +
"and s2.CId = '01' "+
"where s1.CId = '02' and s2.CId is null " +
"order by s1.SId")
+---+---+-----+----+----+-----+
|SId|CId|score| SId| CId|score|
+---+---+-----+----+----+-----+
| 07| 02| 89.0|null|null| null|
+---+---+-----+----+----+-----+
val frame = sc.sql("select * from sc s1 " +
"where s1.SId not in " +
"(select SId from sc where sc.CId = '01') " +
"and s1.CId = '02' " +
"order by s1.SId")
|SId|CId|score|
+---+---+-----+
| 07| 02| 89.0|
+---+---+-----+
2 查询平均成绩大于等于 60 分的同学的学生编号和学生姓名和平均成绩
先查询在join,因为spark不可以在使用group by时查询没有出现在grou by 后面的字段
select stu.*,scavg.avg from stu join
(selectSId,avg(sc.score) as avg from sc
group by SId having avg(sc.score) >= 60) scavg
on stu.SId = scavg.SId
+---+-----+-------------------+----+--------+
|SId|Sname| Sage|Ssex| avg|
+---+-----+-------------------+----+--------+
| 07| 郑竹|1989-01-01 00:00:00| 女|93.50000|
| 01| 赵雷|1990-01-01 00:00:00| 男|89.66667|
| 05| 周梅|1991-12-01 00:00:00| 女|81.50000|
| 03| 孙风|1990-12-20 00:00:00| 男|80.00000|
| 02| 钱电|1990-12-21 00:00:00| 男|70.00000|
+---+-----+-------------------+----+--------+
3 查询在 SC 表存在成绩的学生信息
子查询性能消耗太大,不建议,所以这样写
select stu.* from stu
inner join (select distinct(SID) as id
from sc where score is not null) sc
on stu.SId = sc.id
+---+-----+-------------------+----+
|SId|Sname| Sage|Ssex|
+---+-----+-------------------+----+
| 07| 郑竹|1989-01-01 00:00:00| 女|
| 01| 赵雷|1990-01-01 00:00:00| 男|
| 05| 周梅|1991-12-01 00:00:00| 女|
| 03| 孙风|1990-12-20 00:00:00| 男|
| 02| 钱电|1990-12-21 00:00:00| 男|
| 06| 吴兰|1992-01-01 00:00:00| 女|
| 04| 李云|1990-12-06 00:00:00| 男|
+---+-----+-------------------+----+
这样也行,建议第一种
select distinct(stu.*) from sc join stu
on sc.SId = stu.SId
where score is not null
4.1 查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩(没成绩的显示为 null )
select stu.*,s.count,s.sum from stu
left join
(select SId,count(DISTINCT CId) as count,sum(score) as sum
from sc group by SId) as s
on stu.SId = s.SId
order by stu.SId
+---+-----+-------------------+----+-----+-----+
|SId|Sname| Sage|Ssex|count| sum|
+---+-----+-------------------+----+-----+-----+
| 01| 赵雷|1990-01-01 00:00:00| 男| 3|269.0|
| 02| 钱电|1990-12-21 00:00:00| 男| 3|210.0|
| 03| 孙风|1990-12-20 00:00:00| 男| 3|240.0|
| 04| 李云|1990-12-06 00:00:00| 男| 3|100.0|
| 05| 周梅|1991-12-01 00:00:00| 女| 2|163.0|
| 06| 吴兰|1992-01-01 00:00:00| 女| 2| 65.0|
| 07| 郑竹|1989-01-01 00:00:00| 女| 2|187.0|
| 09| 张三|2017-12-20 00:00:00| 女| null| null|
| 10| 李四|2017-12-25 00:00:00| 女| null| null|
| 11| 李四|2012-06-06 00:00:00| 女| null| null|
| 12| 赵六|2013-06-13 00:00:00| 女| null| null|
| 13| 孙七|2014-06-01 00:00:00| 女| null| null|
+---+-----+-------------------+----+-----+-----+
4.2 没成绩显示为null
NVL(name,-1)若name为空,则返回-1,若name不为空值,则返回其自身
select stu.*,s.count,NVL(s.sum,0) as nvl from stu
left join
(select SId,count(DISTINCT CId) as count,sum(score) as sum
from sc group by SId) as s
on stu.SId = s.SId
order by stu.SId
+---+-----+-------------------+----+-----+-----+
|SId|Sname| Sage|Ssex|count| nvl|
+---+-----+-------------------+----+-----+-----+
| 01| 赵雷|1990-01-01 00:00:00| 男| 3|269.0|
| 02| 钱电|1990-12-21 00:00:00| 男| 3|210.0|
| 03| 孙风|1990-12-20 00:00:00| 男| 3|240.0|
| 04| 李云|1990-12-06 00:00:00| 男| 3|100.0|
| 05| 周梅|1991-12-01 00:00:00| 女| 2|163.0|
| 06| 吴兰|1992-01-01 00:00:00| 女| 2| 65.0|
| 07| 郑竹|1989-01-01 00:00:00| 女| 2|187.0|
| 09| 张三|2017-12-20 00:00:00| 女| null| 0.0|
| 10| 李四|2017-12-25 00:00:00| 女| null| 0.0|
| 11| 李四|2012-06-06 00:00:00| 女| null| 0.0|
| 12| 赵六|2013-06-13 00:00:00| 女| null| 0.0|
| 13| 孙七|2014-06-01 00:00:00| 女| null| 0.0|
+---+-----+-------------------+----+-----+-----+
spark用group by 不能用 ,因为 包含了未进行分组的字段,mysql可以用*
val frame = sc.sql(
"select SId,avg(score) from sc " +
"group by SId " +
"having avg(score) > 80 " +
"order by SId")
+---+----------+
|SId|avg(score)|
+---+----------+
| 01| 89.66667|
| 05| 81.50000|
| 07| 93.50000|
+---+----------+
支持limit,在order by 后使用
val frame = sc.sql(
"select * from sc " +
"order by SId "+
"limit 1")
+---+---+-----+
|SId|CId|score|
+---+---+-----+
| 01| 01| 80.0|
+---+---+-----+
但是不支持limit分页,mysql可以
select * from SC order by SId limit 1,10
row_number over 完成分页
以什么分组,组内按照那个字段排名
select *,Row_Number() over (partition by SId order by score) rank from SC where SId in('01','02') order by SId
+---+---+-----+----+
|SId|CId|score|rank|
+---+---+-----+----+
| 01| 02| 90.0| 2|
| 01| 01| 80.0| 1|
| 01| 03| 99.0| 3|
| 02| 02| 60.0| 1|
| 02| 01| 70.0| 2|
| 02| 03| 80.0| 3|
+---+---+-----+----+
+---+---+-----+----+
全局的排序分页
select *,Row_Number() over ( order by score) rank from SC where SId in('01','02') order by rank
+---+---+-----+----+
|SId|CId|score|rank|
+---+---+-----+----+
| 02| 02| 60.0| 1|
| 02| 01| 70.0| 2|
| 01| 01| 80.0| 3|
| 02| 03| 80.0| 4|
| 01| 02| 90.0| 5|
| 01| 03| 99.0| 6|
+---+---+-----+----+
spark 的 like
like '8%'
但是 like '李%' 就查不到
中文字符有问题
url我是这样写的
jdbc:mysql://192.168.1.100:3306/Strengthen
要加上
?useUnicode=true&characterEncoding=utf8