1
2
3
4
5
6
|
root@thor:~
/spark
# ./sbin/start-master.sh
less
..
/logs/spark-root-org
.apache.spark.deploy.master.Master-1-thor.out
15
/08/25
11:21:21 INFO Master: Starting Spark master at spark:
//thor
:7077
15
/08/25
11:21:21 INFO Utils: Successfully started service
'MasterUI'
on port 8080.
15
/08/25
11:21:21 INFO MasterWebUI: Started MasterWebUI at http://10.60.23.188:8080
root@thor:~
/spark
# ./sbin/start-slave.sh spark://thor:7077
|
1
2
|
# Add the MySQL Connector/J 5.1.39 jar to the classpath of both the Spark
# driver and the executors.
spark.driver.extraClassPath = /usr/local/spark/mysql-connector-java-5.1.39-bin.jar
spark.executor.extraClassPath = /usr/local/spark/mysql-connector-java-5.1.39-bin.jar
|
1
|
# Start the interactive Spark shell with 4G of driver memory, attached to the
# standalone master at spark://server1:7077.
$ ./bin/spark-shell \
    --driver-memory 4G \
    --master spark://server1:7077
|
1
2
3
4
5
6
7
|
// Read the MySQL table ontime.ontime_part through Spark's JDBC data source.
// The read is partitioned on the "yeard" column: bounds 1988..2016 split into
// 28 partitions, fetching 10000 rows per round trip.
// Trailing dots keep the chained expression open when pasted into the REPL.
val jdbcDF = spark.read.
  format("jdbc").
  option("url", "jdbc:mysql://localhost:3306/ontime?user=root&password=").
  option("dbtable", "ontime.ontime_part").
  option("fetchSize", "10000").
  option("partitionColumn", "yeard").
  option("lowerBound", "1988").
  option("upperBound", "2016").
  option("numPartitions", "28").
  load()

// Register the DataFrame as the temporary view "ontime" so it can be queried with SQL.
jdbcDF.createOrReplaceTempView("ontime")
|
1
2
|
// Per-carrier delay statistics over the "ontime" view: year span, flight count,
// flights with ArrDelayMinutes > 30 and the delayed share, restricted to rows
// with DayOfWeek not in (6,7), origin/destination state outside AK/HI/PR/VI,
// and origin or destination equal to 'RDU'. Top 10 by rate, then by count.
val sqlDF = {
  val delayQuery = "select min(year), max(year) as max_year, Carrier, count(*) as cnt, sum(if(ArrDelayMinutes>30, 1, 0)) as flights_delayed, round(sum(if(ArrDelayMinutes>30, 1, 0))/count(*),2) as rate FROM ontime WHERE DayOfWeek not in (6,7) and OriginState not in ('AK', 'HI', 'PR', 'VI') and DestState not in ('AK', 'HI', 'PR', 'VI') and (origin = 'RDU' or dest = 'RDU') GROUP by carrier HAVING cnt > 100000 and max_year > '1990' ORDER by rate DESC, cnt desc LIMIT 10"
  sql(delayQuery)
}

// Run the query and print the resulting rows.
sqlDF.show()
|
01
02
03
04
05
06
07
08
09
10
11
|
-- Per-carrier delay statistics: first and last year seen, number of flights,
-- number of flights with ArrDelayMinutes > 30, and the delayed share rounded
-- to two decimals. Only rows with DayOfWeek outside (6, 7) and with origin and
-- destination states outside AK/HI/PR/VI are counted. Groups are kept when
-- they have more than 100000 rows and max_year > '1990'; the 10 highest
-- rates (ties broken by flight count) are returned.
SELECT
    min(year),
    max(year) AS max_year,
    Carrier,
    count(*) AS cnt,
    sum(if(ArrDelayMinutes > 30, 1, 0)) AS flights_delayed,
    round(sum(if(ArrDelayMinutes > 30, 1, 0)) / count(*), 2) AS rate
FROM ontime
WHERE DayOfWeek NOT IN (6, 7)
    AND OriginState NOT IN ('AK', 'HI', 'PR', 'VI')
    AND DestState NOT IN ('AK', 'HI', 'PR', 'VI')
GROUP BY carrier
HAVING cnt > 100000
    AND max_year > '1990'
ORDER BY rate DESC, cnt DESC
LIMIT 10
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
|
mysql>
select
count
(*)
FROM
ontime
WHERE
DayOfWeek
not
in
(6,7)
and
OriginState
not
in
(
'AK'
,
'HI'
,
'PR'
,
'VI'
)
and
DestState
not
in
(
'AK'
,
'HI'
,
'PR'
,
'VI'
);
+
-----------+
|
count
(*) |
+
-----------+
| 108776741 |
+
-----------+
mysql>
select
count
(*)
FROM
ontime;
+
-----------+
|
count
(*) |
+
-----------+
| 152657276 |
+
-----------+
mysql>
select
round((108776741/152657276)*100, 2);
+
-------------------------------------+
| round((108776741/152657276)*100, 2) |
+
-------------------------------------+
| 71.26 |
+
-------------------------------------+
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
|
-- Definition of `ontime_part`: InnoDB table range-partitioned on YearD, one
-- partition per year from p1987 (< 1988) through p2015 (< 2016), plus the
-- catch-all partition p_new. The "..." line marks columns that this listing
-- leaves out; the statement is not runnable until they are restored.
CREATE TABLE `ontime_part` (
  `YearD` int(11) NOT NULL,
  `Quarter` tinyint(4) DEFAULT NULL,
  `MonthD` tinyint(4) DEFAULT NULL,
  `DayofMonth` tinyint(4) DEFAULT NULL,
  `DayOfWeek` tinyint(4) DEFAULT NULL,
  `FlightDate` date DEFAULT NULL,
  `UniqueCarrier` char(7) DEFAULT NULL,
  `AirlineID` int(11) DEFAULT NULL,
  `Carrier` char(2) DEFAULT NULL,
  `TailNum` varchar(50) DEFAULT NULL,
  ...
  `id` int(11) NOT NULL AUTO_INCREMENT,
  -- The primary key includes YearD, the partitioning column.
  PRIMARY KEY (`id`,`YearD`),
  -- Secondary index named `covered` over six columns, starting with DayOfWeek.
  KEY `covered` (`DayOfWeek`,`OriginState`,`DestState`,`Carrier`,`YearD`,`ArrDelayMinutes`)
) ENGINE=InnoDB AUTO_INCREMENT=162668935 DEFAULT CHARSET=latin1
/*!50100 PARTITION BY RANGE (YearD)
(PARTITION p1987 VALUES LESS THAN (1988) ENGINE = InnoDB,
 PARTITION p1988 VALUES LESS THAN (1989) ENGINE = InnoDB,
 PARTITION p1989 VALUES LESS THAN (1990) ENGINE = InnoDB,
 PARTITION p1990 VALUES LESS THAN (1991) ENGINE = InnoDB,
 PARTITION p1991 VALUES LESS THAN (1992) ENGINE = InnoDB,
 PARTITION p1992 VALUES LESS THAN (1993) ENGINE = InnoDB,
 PARTITION p1993 VALUES LESS THAN (1994) ENGINE = InnoDB,
 PARTITION p1994 VALUES LESS THAN (1995) ENGINE = InnoDB,
 PARTITION p1995 VALUES LESS THAN (1996) ENGINE = InnoDB,
 PARTITION p1996 VALUES LESS THAN (1997) ENGINE = InnoDB,
 PARTITION p1997 VALUES LESS THAN (1998) ENGINE = InnoDB,
 PARTITION p1998 VALUES LESS THAN (1999) ENGINE = InnoDB,
 PARTITION p1999 VALUES LESS THAN (2000) ENGINE = InnoDB,
 PARTITION p2000 VALUES LESS THAN (2001) ENGINE = InnoDB,
 PARTITION p2001 VALUES LESS THAN (2002) ENGINE = InnoDB,
 PARTITION p2002 VALUES LESS THAN (2003) ENGINE = InnoDB,
 PARTITION p2003 VALUES LESS THAN (2004) ENGINE = InnoDB,
 PARTITION p2004 VALUES LESS THAN (2005) ENGINE = InnoDB,
 PARTITION p2005 VALUES LESS THAN (2006) ENGINE = InnoDB,
 PARTITION p2006 VALUES LESS THAN (2007) ENGINE = InnoDB,
 PARTITION p2007 VALUES LESS THAN (2008) ENGINE = InnoDB,
 PARTITION p2008 VALUES LESS THAN (2009) ENGINE = InnoDB,
 PARTITION p2009 VALUES LESS THAN (2010) ENGINE = InnoDB,
 PARTITION p2010 VALUES LESS THAN (2011) ENGINE = InnoDB,
 PARTITION p2011 VALUES LESS THAN (2012) ENGINE = InnoDB,
 PARTITION p2012 VALUES LESS THAN (2013) ENGINE = InnoDB,
 PARTITION p2013 VALUES LESS THAN (2014) ENGINE = InnoDB,
 PARTITION p2014 VALUES LESS THAN (2015) ENGINE = InnoDB,
 PARTITION p2015 VALUES LESS THAN (2016) ENGINE = InnoDB,
 PARTITION p_new VALUES LESS THAN MAXVALUE ENGINE = InnoDB) */
|
01
02
03
04
05
06
07
08
09
10
11
12
13
|
mysql> explain
select
min
(yearD),
max
(yearD)
as
max_year, Carrier,
count
(*)
as
cnt,
sum
(if(ArrDelayMinutes>30, 1, 0))
as
flights_delayed, round(
sum
(if(ArrDelayMinutes>30, 1, 0))/
count
(*),2)
as
rate
FROM
ontime_part
WHERE
DayOfWeek
not
in
(6,7)
and
OriginState
not
in
(
'AK'
,
'HI'
,
'PR'
,
'VI'
)
and
DestState
not
in
(
'AK'
,
'HI'
,
'PR'
,
'VI'
)
GROUP
by
carrier
HAVING
cnt > 1000
and
max_year >
'1990'
ORDER
by
rate
DESC
, cnt
desc
LIMIT 10\G
*************************** 1. row ***************************
id: 1
select_type: SIMPLE
table
: ontime_part
type: range
possible_keys: covered
key
: covered
key_len: 2
ref:
NULL
rows
: 70483364
Extra: Using
where
; Using
index
; Using
temporary
; Using filesort
1 row
in
set
(0.00 sec)
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
|
mysql>
select
min
(yearD),
max
(yearD)
as
max_year, Carrier,
count
(*)
as
cnt,
sum
(if(ArrDelayMinutes>30, 1, 0))
as
flights_delayed, round(
sum
(if(ArrDelayMinutes>30, 1, 0))/
count
(*),2)
as
rate
FROM
ontime_part
WHERE
DayOfWeek
not
in
(6,7)
and
OriginState
not
in
(
'AK'
,
'HI'
,
'PR'
,
'VI'
)
and
DestState
not
in
(
'AK'
,
'HI'
,
'PR'
,
'VI'
)
GROUP
by
carrier
HAVING
cnt > 1000
and
max_year >
'1990'
ORDER
by
rate
DESC
, cnt
desc
LIMIT 10;
+
------------+----------+---------+----------+-----------------+------+
|
min
(yearD) | max_year | Carrier | cnt | flights_delayed | rate |
+
------------+----------+---------+----------+-----------------+------+
| 2003 | 2013 | EV | 2962008 | 464264 | 0.16 |
| 2003 | 2013 | B6 | 1237400 | 187863 | 0.15 |
| 2006 | 2011 | XE | 1615266 | 230977 | 0.14 |
| 2003 | 2005 | DH | 501056 | 69833 | 0.14 |
| 2001 | 2013 | MQ | 4518106 | 605698 | 0.13 |
| 2003 | 2013 | FL | 1692887 | 212069 | 0.13 |
| 2004 | 2010 | OH | 1307404 | 175258 | 0.13 |
| 2006 | 2013 | YV | 1121025 | 143597 | 0.13 |
| 2003 | 2006 | RU | 1007248 | 126733 | 0.13 |
| 1988 | 2013 | UA | 10717383 | 1327196 | 0.12 |
+
------------+----------+---------+----------+-----------------+------+
10
rows
in
set
(19
min
16.58 sec)
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
|
scala>
val
jdbcDF
=
spark.read.format(
"jdbc"
).options(
| Map(
"url"
->
"jdbc:mysql://localhost:3306/ontime?user=root&password=mysql"
,
|
"dbtable"
->
"ontime.ontime_sm"
,
|
"fetchSize"
->
"10000"
,
|
"partitionColumn"
->
"yeard"
,
"lowerBound"
->
"1988"
,
"upperBound"
->
"2015"
,
"numPartitions"
->
"48"
| )).load()
16
/
08
/
02
23
:
24
:
12
WARN JDBCRelation
:
The number of partitions is reduced because the specified number of partitions is less than the difference between upper bound and lower bound. Updated number of partitions
:
27
; Input number of partitions
:
48
; Lower bound
:
1988
; Upper bound
:
2015
.
jdbcDF
:
org.apache.spark.sql.DataFrame
=
[id
:
int, YearD
:
date ...
19
more fields]
scala> jdbcDF.createOrReplaceTempView(
"ontime"
)
scala>
val
sqlDF
=
sql(
"select min(yearD), max(yearD) as max_year, Carrier, count(*) as cnt, sum(if(ArrDelayMinutes>30, 1, 0)) as flights_delayed, round(sum(if(ArrDelayMinutes>30, 1, 0))/count(*),2) as rate FROM ontime WHERE OriginState not in ('AK', 'HI', 'PR', 'VI') and DestState not in ('AK', 'HI', 'PR', 'VI') GROUP by carrier HAVING cnt > 1000 and max_year > '1990' ORDER by rate DESC, cnt desc LIMIT 10"
)
sqlDF
:
org.apache.spark.sql.DataFrame
=
[min(yearD)
:
date, max
_
year
:
date ...
4
more fields]
scala> sqlDF.show()
+----------+--------+-------+--------+---------------+----+
|min(yearD)|max
_
year|Carrier| cnt|flights
_
delayed|rate|
+----------+--------+-------+--------+---------------+----+
|
2003
|
2013
| EV|
2962008
|
464264
|
0.16
|
|
2003
|
2013
| B
6
|
1237400
|
187863
|
0.15
|
|
2006
|
2011
| XE|
1615266
|
230977
|
0.14
|
|
2003
|
2005
| DH|
501056
|
69833
|
0.14
|
|
2001
|
2013
| MQ|
4518106
|
605698
|
0.13
|
|
2003
|
2013
| FL|
1692887
|
212069
|
0.13
|
|
2004
|
2010
| OH|
1307404
|
175258
|
0.13
|
|
2006
|
2013
| YV|
1121025
|
143597
|
0.13
|
|
2003
|
2006
| RU|
1007248
|
126733
|
0.13
|
|
1988
|
2013
| UA|
10717383
|
1327196
|
0.12
|
+----------+--------+-------+--------+---------------+----+
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
.
/bin/spark-sql
--driver-memory 4G --master spark:
//thor
:7077
spark-sql> CREATE TEMPORARY VIEW ontime
> USING org.apache.spark.sql.jdbc
> OPTIONS (
> url
"jdbc:mysql://localhost:3306/ontime?user=root&password="
,
> dbtable
"ontime.ontime_part"
,
> fetchSize
"1000"
,
> partitionColumn
"yearD"
, lowerBound
"1988"
, upperBound
"2014"
, numPartitions
"48"
> );
16
/08/04
01:44:27 WARN JDBCRelation: The number of partitions is reduced because the specified number of partitions is
less
than the difference between upper bound and lower bound. Updated number of partitions: 26; Input number of partitions: 48; Lower bound: 1988; Upper bound: 2014.
Time taken: 3.864 seconds
spark-sql>
select
min(yearD), max(yearD) as max_year, Carrier, count(*) as cnt,
sum
(
if
(ArrDelayMinutes>30, 1, 0)) as flights_delayed, round(
sum
(
if
(ArrDelayMinutes>30, 1, 0))
/count
(*),2) as rate FROM ontime WHERE DayOfWeek not
in
(6,7) and OriginState not
in
(
'AK'
,
'HI'
,
'PR'
,
'VI'
) and DestState not
in
(
'AK'
,
'HI'
,
'PR'
,
'VI'
) GROUP by carrier HAVING cnt > 1000 and max_year >
'1990'
ORDER by rate DESC, cnt desc LIMIT 10;
16
/08/04
01:45:13 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting
'spark.debug.maxToStringFields'
in
SparkEnv.conf.
2003 2013 EV 2962008 464264 0.16
2003 2013 B6 1237400 187863 0.15
2006 2011 XE 1615266 230977 0.14
2003 2005 DH 501056 69833 0.14
2001 2013 MQ 4518106 605698 0.13
2003 2013 FL 1692887 212069 0.13
2004 2010 OH 1307404 175258 0.13
2006 2013 YV 1121025 143597 0.13
2003 2006 RU 1007248 126733 0.13
1988 2013 UA 10717383 1327196 0.12
Time taken: 139.628 seconds, Fetched 10 row(s)
|
1
2
|
scala> sqlDF.show()
[Stage 4:> (0 + 26) / 26]
|
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
mysql>
select
id
, info from information_schema.processlist where info is not NULL and info not like
'%information_schema%'
;
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
id
| info |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 10948 | SELECT `YearD`,`ArrDelayMinutes`,`Carrier` FROM ontime.ontime_part WHERE (((NOT (DayOfWeek IN (6, 7)))) AND ((NOT (OriginState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
)))) AND ((NOT (DestState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
))))) AND (yearD >= 2001 AND yearD < 2002) |
| 10965 | SELECT `YearD`,`ArrDelayMinutes`,`Carrier` FROM ontime.ontime_part WHERE (((NOT (DayOfWeek IN (6, 7)))) AND ((NOT (OriginState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
)))) AND ((NOT (DestState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
))))) AND (yearD >= 2007 AND yearD < 2008) |
| 10966 | SELECT `YearD`,`ArrDelayMinutes`,`Carrier` FROM ontime.ontime_part WHERE (((NOT (DayOfWeek IN (6, 7)))) AND ((NOT (OriginState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
)))) AND ((NOT (DestState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
))))) AND (yearD >= 1991 AND yearD < 1992) |
| 10967 | SELECT `YearD`,`ArrDelayMinutes`,`Carrier` FROM ontime.ontime_part WHERE (((NOT (DayOfWeek IN (6, 7)))) AND ((NOT (OriginState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
)))) AND ((NOT (DestState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
))))) AND (yearD >= 1994 AND yearD < 1995) |
| 10968 | SELECT `YearD`,`ArrDelayMinutes`,`Carrier` FROM ontime.ontime_part WHERE (((NOT (DayOfWeek IN (6, 7)))) AND ((NOT (OriginState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
)))) AND ((NOT (DestState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
))))) AND (yearD >= 1998 AND yearD < 1999) |
| 10969 | SELECT `YearD`,`ArrDelayMinutes`,`Carrier` FROM ontime.ontime_part WHERE (((NOT (DayOfWeek IN (6, 7)))) AND ((NOT (OriginState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
)))) AND ((NOT (DestState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
))))) AND (yearD >= 2010 AND yearD < 2011) |
| 10970 | SELECT `YearD`,`ArrDelayMinutes`,`Carrier` FROM ontime.ontime_part WHERE (((NOT (DayOfWeek IN (6, 7)))) AND ((NOT (OriginState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
)))) AND ((NOT (DestState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
))))) AND (yearD >= 2002 AND yearD < 2003) |
| 10971 | SELECT `YearD`,`ArrDelayMinutes`,`Carrier` FROM ontime.ontime_part WHERE (((NOT (DayOfWeek IN (6, 7)))) AND ((NOT (OriginState IN (
'AK'
,
'HI'
,
'PR'
,
'VI'
)))) AND ((NOT (DestState IN (
'AK'
,
'HI'
|