添加 ClickHouse 的 yum 仓库(Altinity 源)
curl -s https://packagecloud.io/install/repositories/altinity/clickhouse/script.rpm.sh | sudo bash
查看clickhouse rpm包
[root@ipa clickhouse-server]# sudo yum list 'clickhouse*'
Loaded plugins: fastestmirror
Loading mirror speeds from cached hostfile
Installed Packages
clickhouse-client.x86_64 19.9.2.4-1.el7 @Altinity_clickhouse
clickhouse-common-static.x86_64 19.9.2.4-1.el7 @Altinity_clickhouse
clickhouse-server.x86_64 19.9.2.4-1.el7 @Altinity_clickhouse
clickhouse-server-common.x86_64 19.9.2.4-1.el7 @Altinity_clickhouse
Available Packages
clickhouse-compressor.x86_64 1.1.54336-3.el7 Altinity_clickhouse
clickhouse-debuginfo.x86_64 19.9.2.4-1.el7 Altinity_clickhouse
clickhouse-mysql.noarch 0.0.20180319-1 Altinity_clickhouse
clickhouse-odbc.x86_64 1.0.0.20190611-1 Altinity_clickhouse
clickhouse-test.x86_64
安装clickhouse
sudo yum install -y clickhouse-server clickhouse-client
验证安装
[root@ipa clickhouse-server]# sudo yum list installed 'clickhouse*'
Loaded plugins: fastestmirror
Loading mirror speeds from cached hostfile
Installed Packages
clickhouse-client.x86_64 19.9.2.4-1.el7 @Altinity_clickhouse
clickhouse-common-static.x86_64 19.9.2.4-1.el7 @Altinity_clickhouse
clickhouse-server.x86_64 19.9.2.4-1.el7 @Altinity_clickhouse
clickhouse-server-common.x86_64 19.9.2.4-1.el7 @Altinity_clickhouse
[root@ipa clickhouse-server]#
启动(若已在运行则重启)clickhouse-server 服务
sudo /etc/init.d/clickhouse-server restart
使用客户端连接
[root@ipa clickhouse-server]# clickhouse-client
ClickHouse client version 19.9.2.4.
Connecting to localhost:9000 as user default.
Connected to ClickHouse server version 19.9.2 revision 54421.
ipa.haohaozhu.hadoop :)
下载测试数据
vi download.sh
#!/usr/bin/env bash
# download.sh - fetch the monthly "On_Time_Reporting_Carrier_On_Time_Performance"
# zip archives for every year 1987..2018 and month 1..12 into the current directory.
# A failed download is reported on stderr and the loop moves on to the next month.
for s in {1987..2018}; do
  for m in {1..12}; do
    # -c continues a partially downloaded file instead of saving a second
    # copy as "<name>.zip.1" when the script is run again.
    wget -c "https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_${s}_${m}.zip" \
      || echo "download failed: year=${s} month=${m}" >&2
  done
done
chmod +x download.sh
./download.sh
建表
-- ontime: table that receives the downloaded on-time performance CSV data. \
-- The engine is declared with explicit PARTITION BY / ORDER BY / SETTINGS clauses; \
-- this is equivalent to the deprecated positional form \
-- MergeTree(FlightDate, (Year, FlightDate), 8192): monthly partitions on FlightDate, \
-- sorting key (Year, FlightDate), index granularity 8192. \
CREATE TABLE ontime ( \
Year UInt16, \
Quarter UInt8, \
Month UInt8, \
DayofMonth UInt8, \
DayOfWeek UInt8, \
FlightDate Date, \
UniqueCarrier FixedString(7), \
AirlineID Int32, \
Carrier FixedString(2), \
TailNum String, \
FlightNum String, \
OriginAirportID Int32, \
OriginAirportSeqID Int32, \
OriginCityMarketID Int32, \
Origin FixedString(5), \
OriginCityName String, \
OriginState FixedString(2), \
OriginStateFips String, \
OriginStateName String, \
OriginWac Int32, \
DestAirportID Int32, \
DestAirportSeqID Int32, \
DestCityMarketID Int32, \
Dest FixedString(5), \
DestCityName String, \
DestState FixedString(2), \
DestStateFips String, \
DestStateName String, \
DestWac Int32, \
CRSDepTime Int32, \
DepTime Int32, \
DepDelay Int32, \
DepDelayMinutes Int32, \
DepDel15 Int32, \
DepartureDelayGroups String, \
DepTimeBlk String, \
TaxiOut Int32, \
WheelsOff Int32, \
WheelsOn Int32, \
TaxiIn Int32, \
CRSArrTime Int32, \
ArrTime Int32, \
ArrDelay Int32, \
ArrDelayMinutes Int32, \
ArrDel15 Int32, \
ArrivalDelayGroups Int32, \
ArrTimeBlk String, \
Cancelled UInt8, \
CancellationCode FixedString(1), \
Diverted UInt8, \
CRSElapsedTime Int32, \
ActualElapsedTime Int32, \
AirTime Int32, \
Flights Int32, \
Distance Int32, \
DistanceGroup UInt8, \
CarrierDelay Int32, \
WeatherDelay Int32, \
NASDelay Int32, \
SecurityDelay Int32, \
LateAircraftDelay Int32, \
FirstDepTime String, \
TotalAddGTime String, \
LongestAddGTime String, \
DivAirportLandings String, \
DivReachedDest String, \
DivActualElapsedTime String, \
DivArrDelay String, \
DivDistance String, \
Div1Airport String, \
Div1AirportID Int32, \
Div1AirportSeqID Int32, \
Div1WheelsOn String, \
Div1TotalGTime String, \
Div1LongestGTime String, \
Div1WheelsOff String, \
Div1TailNum String, \
Div2Airport String, \
Div2AirportID Int32, \
Div2AirportSeqID Int32, \
Div2WheelsOn String, \
Div2TotalGTime String, \
Div2LongestGTime String, \
Div2WheelsOff String, \
Div2TailNum String, \
Div3Airport String, \
Div3AirportID Int32, \
Div3AirportSeqID Int32, \
Div3WheelsOn String, \
Div3TotalGTime String, \
Div3LongestGTime String, \
Div3WheelsOff String, \
Div3TailNum String, \
Div4Airport String, \
Div4AirportID Int32, \
Div4AirportSeqID Int32, \
Div4WheelsOn String, \
Div4TotalGTime String, \
Div4LongestGTime String, \
Div4WheelsOff String, \
Div4TailNum String, \
Div5Airport String, \
Div5AirportID Int32, \
Div5AirportSeqID Int32, \
Div5WheelsOn String, \
Div5TotalGTime String, \
Div5LongestGTime String, \
Div5WheelsOff String, \
Div5TailNum String \
) ENGINE = MergeTree \
PARTITION BY toYYYYMM(FlightDate) \
ORDER BY (Year, FlightDate) \
SETTINGS index_granularity = 8192
导入之前下载的数据。这里只下载了 1987-2000 区间的数据,压缩前文件大小约为 28GB;clickhouse server 服务器配置:2 core / 8GB
# Load every zip archive in the current directory into the ontime table:
# print the archive name, stream its *.csv members to stdout, and pipe them
# into clickhouse-client as a CSVWithNames INSERT.
for i in *.zip; do
  # If no archive matches, the glob stays as the literal '*.zip'; skip it.
  [ -e "$i" ] || continue
  printf '%s\n' "$i"
  # sed deletes every literal ".00" from the CSV text - presumably so values
  # such as "12.00" load into the Int32 columns; NOTE(review): confirm intent.
  unzip -cq "$i" '*.csv' \
    | sed 's/\.00//g' \
    | clickhouse-client --host=localhost --query="INSERT INTO ontime FORMAT CSVWithNames"
done
sql测试:
测试一
ipa.haohaozhu.hadoop :) select count(*) from ontime;
SELECT count(*)
FROM ontime
┌──count()─┐
│ 62077338 │
└──────────┘
1 rows in set. Elapsed: 0.887 sec. Processed 62.08 million rows, 62.08 MB (69.96 million rows/s., 69.96 MB/s.)
测试二
ipa.haohaozhu.hadoop :) SELECT DayOfWeek, count(*) AS c FROM ontime GROUP BY DayOfWeek ORDER BY c DESC;
SELECT
DayOfWeek,
count(*) AS c
FROM ontime
GROUP BY DayOfWeek
ORDER BY c DESC
┌─DayOfWeek─┬───────c─┐
│ 2 │ 9107705 │
│ 1 │ 9090812 │
│ 3 │ 9088271 │
│ 4 │ 9049647 │
│ 5 │ 9036578 │
│ 7 │ 8585662 │
│ 6 │ 8118663 │
└───────────┴─────────┘
7 rows in set. Elapsed: 0.231 sec. Processed 62.08 million rows, 62.08 MB (268.31 million rows/s., 268.31 MB/s.)
测试三
ipa.haohaozhu.hadoop :) SELECT DayOfWeek, count(*) AS c\
:-] FROM ontime\
:-] WHERE DepDelay>10 \
:-] GROUP BY DayOfWeek\
:-] ORDER BY c DESC;
SELECT
DayOfWeek,
count(*) AS c
FROM ontime
WHERE DepDelay > 10
GROUP BY DayOfWeek
ORDER BY c DESC
┌─DayOfWeek─┬───────c─┐
│ 5 │ 2008505 │
│ 4 │ 1885211 │
│ 3 │ 1645889 │
│ 7 │ 1611706 │
│ 1 │ 1534372 │
│ 2 │ 1496814 │
│ 6 │ 1372759 │
└───────────┴─────────┘
7 rows in set. Elapsed: 3.044 sec. Processed 62.08 million rows, 259.86 MB (20.39 million rows/s., 85.37 MB/s.)
ipa.haohaozhu.hadoop :)
测试四
ipa.haohaozhu.hadoop :) SELECT Origin, count(*) AS c\
:-] FROM ontime\
:-] WHERE DepDelay>10 \
:-] GROUP BY Origin\
:-] ORDER BY c DESC\
:-] LIMIT 10;
SELECT
Origin,
count(*) AS c
FROM ontime
WHERE DepDelay > 10
GROUP BY Origin
ORDER BY c DESC
LIMIT 10
┌─Origin─┬──────c─┐
│ ORD │ 751029 │
│ ATL │ 604497 │
│ DFW │ 578965 │
│ LAX │ 435993 │
│ PHX │ 405636 │
│ STL │ 363985 │
│ DEN │ 349509 │
│ SFO │ 338541 │
│ PIT │ 331315 │
│ CLT │ 327095 │
└────────┴────────┘
10 rows in set. Elapsed: 3.210 sec. Processed 62.08 million rows, 306.09 MB (19.34 million rows/s., 95.35 MB/s.)
测试五
ipa.haohaozhu.hadoop :) SELECT Carrier, count(*)\
:-] FROM ontime\
:-] WHERE DepDelay>10 \
:-] GROUP BY Carrier\
:-] ORDER BY count(*) DESC;
SELECT
Carrier,
count(*)
FROM ontime
WHERE DepDelay > 10
GROUP BY Carrier
ORDER BY count(*) DESC
┌─Carrier─┬─count()─┐
│ DL │ 1801581 │
│ UA │ 1685842 │
│ US │ 1683872 │
│ WN │ 1487246 │
│ AA │ 1307232 │
│ NW │ 898815 │
│ CO │ 854443 │
│ TW │ 580275 │
│ HP │ 463263 │
│ AS │ 267853 │
│ PI │ 209439 │
│ EA │ 155349 │
│ AL │ 99515 │
│ PA │ 36805 │
│ PS │ 17672 │
│ ML │ 5033 │
│ AQ │ 1021 │
└─────────┴─────────┘
17 rows in set. Elapsed: 1.287 sec. Processed 62.08 million rows, 271.43 MB (48.25 million rows/s., 210.96 MB/s.)
测试六
ipa.haohaozhu.hadoop :) SELECT Carrier, c, c2, c*100/c2 as c3\
:-] FROM\
:-] (\
:-] SELECT\
:-] Carrier,\
:-] count(*) AS c\
:-] FROM ontime\
:-] WHERE DepDelay>10\
:-] GROUP BY Carrier\
:-] )\
:-] ANY INNER JOIN\
:-] (\
:-] SELECT\
:-] Carrier,\
:-] count(*) AS c2\
:-] FROM ontime\
:-] GROUP BY Carrier\
:-] ) USING Carrier\
:-] ORDER BY c3 DESC;
SELECT
Carrier,
c,
c2,
(c * 100) / c2 AS c3
FROM
(
SELECT
Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay > 10
GROUP BY Carrier
)
ANY INNER JOIN
(
SELECT
Carrier,
count(*) AS c2
FROM ontime
GROUP BY Carrier
) USING (Carrier)
ORDER BY c3 DESC
┌─Carrier─┬───────c─┬───────c2─┬─────────────────c3─┐
│ PI │ 209439 │ 873957 │ 23.964451340283333 │
│ WN │ 1487246 │ 6769119 │ 21.971042317323718 │
│ AL │ 99515 │ 455873 │ 21.829544631947932 │
│ UA │ 1685842 │ 7946749 │ 21.21423490285147 │
│ PS │ 17672 │ 83617 │ 21.13445830393341 │
│ US │ 1683872 │ 8492064 │ 19.82877189809215 │
│ HP │ 463263 │ 2339125 │ 19.804969807085982 │
│ AS │ 267853 │ 1421935 │ 18.837218297601506 │
│ TW │ 580275 │ 3125082 │ 18.56831276747298 │
│ DL │ 1801581 │ 10211113 │ 17.643336235726704 │
│ CO │ 854443 │ 4970900 │ 17.18889939447585 │
│ EA │ 155349 │ 919785 │ 16.889707920872812 │
│ ML │ 5033 │ 31123 │ 16.17132024547762 │
│ AA │ 1307232 │ 8335280 │ 15.683120423069171 │
│ NW │ 898815 │ 5812478 │ 15.463542399644352 │
│ PA │ 36805 │ 278102 │ 13.234352863337913 │
│ AQ │ 1021 │ 11036 │ 9.251540413193187 │
└─────────┴─────────┴──────────┴────────────────────┘
17 rows in set. Elapsed: 0.973 sec. Processed 124.15 million rows, 395.59 MB (127.64 million rows/s., 406.69 MB/s.)
测试七
ipa.haohaozhu.hadoop :) SELECT Carrier, avg(DepDelay>10)*100 AS c3\
:-] FROM ontime\
:-] GROUP BY Carrier\
:-] ORDER BY Carrier;
SELECT
Carrier,
avg(DepDelay > 10) * 100 AS c3
FROM ontime
GROUP BY Carrier
ORDER BY Carrier ASC
┌─Carrier─┬─────────────────c3─┐
│ AA │ 15.683120423069171 │
│ AL │ 21.829544631947932 │
│ AQ │ 9.251540413193187 │
│ AS │ 18.837218297601506 │
│ CO │ 17.188899394475847 │
│ DL │ 17.643336235726704 │
│ EA │ 16.889707920872812 │
│ HP │ 19.804969807085985 │
│ ML │ 16.17132024547762 │
│ NW │ 15.46354239964435 │
│ PA │ 13.23435286333791 │
│ PI │ 23.964451340283333 │
│ PS │ 21.13445830393341 │
│ TW │ 18.56831276747298 │
│ UA │ 21.214234902851466 │
│ US │ 19.82877189809215 │
│ WN │ 21.971042317323715 │
└─────────┴────────────────────┘
17 rows in set. Elapsed: 0.697 sec. Processed 62.08 million rows, 372.47 MB (89.11 million rows/s., 534.67 MB/s.)
测试八(与测试六为同一查询,重复执行)
ipa.haohaozhu.hadoop :) SELECT Carrier, c, c2, c*100/c2 as c3\
:-] FROM\
:-] (\
:-] SELECT\
:-] Carrier,\
:-] count(*) AS c\
:-] FROM ontime\
:-] WHERE DepDelay>10\
:-] GROUP BY Carrier\
:-] )\
:-] ANY INNER JOIN\
:-] (\
:-] SELECT\
:-] Carrier,\
:-] count(*) AS c2\
:-] FROM ontime\
:-] GROUP BY Carrier\
:-] ) USING Carrier\
:-] ORDER BY c3 DESC;
SELECT
Carrier,
c,
c2,
(c * 100) / c2 AS c3
FROM
(
SELECT
Carrier,
count(*) AS c
FROM ontime
WHERE DepDelay > 10
GROUP BY Carrier
)
ANY INNER JOIN
(
SELECT
Carrier,
count(*) AS c2
FROM ontime
GROUP BY Carrier
) USING (Carrier)
ORDER BY c3 DESC
┌─Carrier─┬───────c─┬───────c2─┬─────────────────c3─┐
│ PI │ 209439 │ 873957 │ 23.964451340283333 │
│ WN │ 1487246 │ 6769119 │ 21.971042317323718 │
│ AL │ 99515 │ 455873 │ 21.829544631947932 │
│ UA │ 1685842 │ 7946749 │ 21.21423490285147 │
│ PS │ 17672 │ 83617 │ 21.13445830393341 │
│ US │ 1683872 │ 8492064 │ 19.82877189809215 │
│ HP │ 463263 │ 2339125 │ 19.804969807085982 │
│ AS │ 267853 │ 1421935 │ 18.837218297601506 │
│ TW │ 580275 │ 3125082 │ 18.56831276747298 │
│ DL │ 1801581 │ 10211113 │ 17.643336235726704 │
│ CO │ 854443 │ 4970900 │ 17.18889939447585 │
│ EA │ 155349 │ 919785 │ 16.889707920872812 │
│ ML │ 5033 │ 31123 │ 16.17132024547762 │
│ AA │ 1307232 │ 8335280 │ 15.683120423069171 │
│ NW │ 898815 │ 5812478 │ 15.463542399644352 │
│ PA │ 36805 │ 278102 │ 13.234352863337913 │
│ AQ │ 1021 │ 11036 │ 9.251540413193187 │
└─────────┴─────────┴──────────┴────────────────────┘
17 rows in set. Elapsed: 1.011 sec. Processed 124.15 million rows, 395.59 MB (122.82 million rows/s., 391.35 MB/s.)
测试九
ipa.haohaozhu.hadoop :) SELECT DestCityName, uniqExact(OriginCityName) AS u \
:-] FROM ontime\
:-] GROUP BY DestCityName\
:-] ORDER BY u DESC\
:-] LIMIT 10;
SELECT
DestCityName,
uniqExact(OriginCityName) AS u
FROM ontime
GROUP BY DestCityName
ORDER BY u DESC
LIMIT 10
┌─DestCityName──────────┬───u─┐
│ Chicago, IL │ 126 │
│ Atlanta, GA │ 114 │
│ Dallas/Fort Worth, TX │ 105 │
│ Pittsburgh, PA │ 104 │
│ Minneapolis, MN │ 99 │
│ Denver, CO │ 97 │
│ Newark, NJ │ 94 │
│ St. Louis, MO │ 94 │
│ Charlotte, NC │ 94 │
│ Detroit, MI │ 92 │
└───────────────────────┴─────┘
10 rows in set. Elapsed: 7.020 sec. Processed 62.08 million rows, 2.76 GB (8.84 million rows/s., 393.13 MB/s.)
测试十
SELECT
min(Year),
max(Year),
Carrier,
count(*) AS cnt,
sum(ArrDelayMinutes > 30) AS flights_delayed,
round(sum(ArrDelayMinutes > 30) / count(*), 2) AS rate
FROM ontime
WHERE (DayOfWeek NOT IN (6, 7)) AND (OriginState NOT IN ('AK', 'HI', 'PR', 'VI')) AND (DestState NOT IN ('AK', 'HI', 'PR', 'VI')) AND (FlightDate < '2010-01-01')
GROUP BY Carrier
HAVING (cnt > 100000) AND (max(Year) > 1990)
ORDER BY rate DESC
LIMIT 1000
┌─min(Year)─┬─max(Year)─┬─Carrier─┬─────cnt─┬─flights_delayed─┬─rate─┐
│ 1987 │ 2000 │ UA │ 5668532 │ 705834 │ 0.12 │
│ 1987 │ 2000 │ TW │ 2257896 │ 242250 │ 0.11 │
│ 1987 │ 2000 │ CO │ 3635353 │ 394498 │ 0.11 │
│ 1987 │ 2000 │ AA │ 5770469 │ 578915 │ 0.1 │
│ 1988 │ 2000 │ US │ 6231991 │ 593442 │ 0.1 │
│ 1987 │ 1991 │ PA │ 194060 │ 18923 │ 0.1 │
│ 1987 │ 2000 │ NW │ 4230638 │ 382152 │ 0.09 │
│ 1987 │ 2000 │ DL │ 7188301 │ 671722 │ 0.09 │
│ 1987 │ 2000 │ HP │ 1700319 │ 157018 │ 0.09 │
│ 1987 │ 2000 │ AS │ 720882 │ 61112 │ 0.08 │
│ 1987 │ 2000 │ WN │ 5051739 │ 424503 │ 0.08 │
└───────────┴───────────┴─────────┴─────────┴─────────────────┴──────┘
11 rows in set. Elapsed: 6.396 sec. Processed 62.08 million rows, 730.73 MB (9.71 million rows/s., 114.25 MB/s.)