spark-sql Test Summary

I've been playing with spark-sql lately. My earlier tests all used very small datasets, and since my cluster is only 6 virtual machines, resources are limited and the data can't be too large either. So I went looking and found this post:

http://colobu.com/2014/12/11/spark-sql-quick-start/
A First Look at Spark SQL: Analyzing 20 Million Records


############## Don't ask me how to get the data; search for it yourself. I deleted it as soon as I was done.
1. File checks: use wc and awk in the shell to verify row and column counts.

############ head the files: each one has a header row, comma-separated. Since the records contain names and other private information, only the header is printed here; actual records start on line 2.

[hue@snn 2000w]$ head -1 1-200W.csv
Name,CardNo,Descriot,CtfTp,CtfId,Gender,Birthday,Address,Zip,Dirty,District1,District2,District3,District4,District5,District6,FirstNm,LastNm,Duty,Mobile,Tel,Fax,EMail,Nation,Taste,Education,Company,CTel,CAddress,CZip,Family,Version,id
[hue@snn 2000w]$

############ Check line counts with wc

[hadoop@snn 2000w]$ cat 1000W-1200W.csv | wc -l
2000050
[hadoop@snn 2000w]$ cat 1200W-1400W.csv | wc -l
2000205
[hadoop@snn 2000w]$ cat 1-200W.csv | wc -l
2000094
[hadoop@snn 2000w]$
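
To check all the files at once, wc accepts a glob and prints per-file counts plus a grand total:

wc -l *.csv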

############ Check the column count with awk: 33 columns

[hadoop@snn 2000w]$ awk 'BEGIN {FS=","}END{print "Filename:" FILENAME ",Linenumber:" NR ",Columns:" NF}' 1000W-1200W.csv

Filename:1000W-1200W.csv,Linenumber:2000050,Columns:33
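
Note that the END block above only reports NF for the last line read. A sketch that verifies every row has 33 columns (assuming a plain comma delimiter; quoted fields with embedded commas would throw this off):

awk -F',' 'NF != 33 {bad++; if (bad <= 5) print "line " NR ": " NF " columns"} END {print "lines with != 33 columns: " bad+0}' 1000W-1200W.csv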


####################################

2. Create a directory in HDFS and put the files into it

[hue@snn ~]$ hadoop fs -mkdir /user/hue/external/2000w
[hue@snn ~]$ hadoop fs -put /opt/2000w/* /user/hue/external/2000w/
[hue@snn ~]$ hadoop fs -ls -R /user/hue/external/2000w/
-rw-r--r--   3 hue hue  348173735 2015-12-17 14:36 /user/hue/external/2000w/1-200W.csv
-rw-r--r--   3 hue hue  317365192 2015-12-17 14:36 /user/hue/external/2000w/1000W-1200W.csv
-rw-r--r--   3 hue hue  307266272 2015-12-17 14:36 /user/hue/external/2000w/1200W-1400W.csv
-rw-r--r--   3 hue hue  319828719 2015-12-17 14:36 /user/hue/external/2000w/1400W-1600W.csv
-rw-r--r--   3 hue hue  310125772 2015-12-17 14:37 /user/hue/external/2000w/1600w-1800w.csv
-rw-r--r--   3 hue hue  298454235 2015-12-17 14:37 /user/hue/external/2000w/1800w-2000w.csv
-rw-r--r--   3 hue hue  311349431 2015-12-17 14:38 /user/hue/external/2000w/200W-400W.csv
-rw-r--r--   3 hue hue  311013782 2015-12-17 14:38 /user/hue/external/2000w/400W-600W.csv
-rw-r--r--   3 hue hue  308703632 2015-12-17 14:38 /user/hue/external/2000w/600W-800W.csv
-rw-r--r--   3 hue hue  310797175 2015-12-17 14:38 /user/hue/external/2000w/800W-1000W.csv
-rw-r--r--   3 hue hue    7487744 2015-12-17 14:38 /user/hue/external/2000w/last_5000.csv
[hue@snn ~]$
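
A quick sanity check that everything landed, summing the 11 files:

hadoop fs -du -s -h /user/hue/external/2000w/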

####################################

3. Create an external table; the files can be queried in place without being moved.

Create external table IF NOT EXISTS external_2000w
(
Name String,
CardNo String,
Descriot String,
CtfTp String,
CtfId String,
Gender String,
Birthday String,
Address String,
Zip String,
Dirty String,
District1 String,
District2 String,
District3 String,
District4 String,
District5 String,
District6 String,
FirstNm String,
LastNm String,
Duty String,
Mobile String,
Tel String,
Fax String,
EMail String,
Nation String,
Taste String,
Education String,
Company String,
CTel String,
CAddress String,
CZip String,
Family String,
Version String,
id int
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 
LOCATION '/user/hue/external/2000w/';
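
Side note: instead of filtering the header rows in every query (the name != 'Name' trick used below), Hive 0.13+ can skip them at the table level. This is a sketch, not a drop-in fix: Spark SQL versions of this era may ignore the property and still return the header rows.

ALTER TABLE external_2000w SET TBLPROPERTIES ("skip.header.line.count"="1");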

##########################################################################################################
4. Analysis queries

############################ All records; the directory holds 11 files, all with the same structure

select count(1) as cnt from external_2000w;

spark-sql> select count(1) as cnt from external_2000w;
20051440
Time taken: 27.806 seconds, Fetched 1 row(s)
spark-sql>

############################ 11 files means 11 header rows to exclude.

select count(1) as cnt from external_2000w where name == 'Name';

spark-sql> select count(1) as cnt from external_2000w where name == 'Name';
11
Time taken: 29.432 seconds, Fetched 1 row(s)
spark-sql>

############################ Excluding the header rows: 20,051,440 − 11 = 20,051,429 records in total
select count(1) as cnt from external_2000w where name != 'Name';

spark-sql> select count(1) as cnt from external_2000w where name != 'Name';
20051429
Time taken: 34.129 seconds, Fetched 1 row(s)
spark-sql>

###########################  Filtering bad data: Gender is not one of (M, F)

select count(1) as cnt from external_2000w where name != 'Name' and Gender not in('M','F');

spark-sql> select count(1) as cnt from external_2000w where name != 'Name' and Gender not in('M','F');
802043
Time taken: 34.735 seconds, Fetched 1 row(s)
spark-sql>

###########################  Counts grouped by gender

select Gender,count(1) as cnt from external_2000w where name != 'Name' and Gender in('M','F') GROUP BY Gender;

spark-sql> select Gender,count(1) as cnt from external_2000w where name != 'Name' and Gender in('M','F') GROUP BY Gender;
F       6478121
M       12771211
Time taken: 41.875 seconds, Fetched 2 row(s)
spark-sql>
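
Side note: 12,771,211 + 6,478,121 + 802,043 = 20,051,375, which is 54 short of the 20,051,429 data rows. Both IN and NOT IN evaluate to NULL for a NULL Gender, so those 54 rows most likely have a NULL in that column (e.g., malformed rows with fewer than 33 fields, whose trailing columns come back NULL in a delimited text table). A quick check (a sketch; assumes the missing values really are NULL rather than empty strings):

select count(1) as cnt from external_2000w where name != 'Name' and Gender is null;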

########################### Counts grouped by zodiac sign

select XingZuo,count(1) as cnt from (
select
CASE
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 219 THEN "水瓶座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 220 and substring(Birthday,5) <= 320 THEN "双鱼座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 321 and substring(Birthday,5) <= 420 THEN "白羊座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 421 and substring(Birthday,5) <= 521 THEN "金牛座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 522 and substring(Birthday,5) <= 621 THEN "双子座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 622 and substring(Birthday,5) <= 722 THEN "巨蟹座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 723 and substring(Birthday,5) <= 823 THEN "狮子座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 824 and substring(Birthday,5) <= 923 THEN "处女座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 924 and substring(Birthday,5) <= 1023 THEN "天秤座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1024 and substring(Birthday,5) <= 1122 THEN "天蝎座"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1123 and substring(Birthday,5) <= 1222 THEN "射手座"
WHEN (length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 1231) 
or (length(Birthday) == 8 and substring(Birthday,5) >= 101 and substring(Birthday,5) <= 119) THEN "摩蝎座"
ELSE "未知"
END AS XingZuo
from external_2000w where name != 'Name' 
) as atable
group by XingZuo;

This spat out a pile of garbled errors. Could the Chinese string literals be the cause? Let's retry without Chinese.

########################

select XingZuo,count(1) as cnt from (
select
CASE
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 219 THEN "A"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 220 and substring(Birthday,5) <= 320 THEN "B"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 321 and substring(Birthday,5) <= 420 THEN "C"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 421 and substring(Birthday,5) <= 521 THEN "D"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 522 and substring(Birthday,5) <= 621 THEN "E"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 622 and substring(Birthday,5) <= 722 THEN "F"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 723 and substring(Birthday,5) <= 823 THEN "G"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 824 and substring(Birthday,5) <= 923 THEN "H"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 924 and substring(Birthday,5) <= 1023 THEN "I"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1024 and substring(Birthday,5) <= 1122 THEN "J"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1123 and substring(Birthday,5) <= 1222 THEN "K"
WHEN (length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 1231) 
or (length(Birthday) == 8 and substring(Birthday,5) >= 101 and substring(Birthday,5) <= 119) THEN "L"
ELSE "M"
END AS XingZuo
from external_2000w
where name != 'Name'
) as atable
group by XingZuo;


A       1636084
B       1510535
C       1410462
D       1406847
E       1406631
F       1498724
G       1614266
H       1666768
I       1897450
J       1820476
K       1615660
L       2406878
M       160648
Time taken: 91.985 seconds, Fetched 13 row(s)

[Image 1]

The results differ a bit from that post's. Perhaps the filter conditions aren't the same?


[Image 2]


[Image 3]
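
For reference, a tidier variant of the zodiac query (a sketch, assuming Birthday is yyyyMMdd text): compute the MMDD integer once in a subquery and use the standard = / <> operators, which also keeps Hive happy (see the follow-up at the end). The L branch is written as the explicit Dec 23 - Jan 19 window, which is what the original's two conditions reduce to once the earlier branches have matched:

select XingZuo, count(1) as cnt from (
select
CASE
WHEN mmdd >= 120 and mmdd <= 219 THEN "A"
WHEN mmdd >= 220 and mmdd <= 320 THEN "B"
WHEN mmdd >= 321 and mmdd <= 420 THEN "C"
WHEN mmdd >= 421 and mmdd <= 521 THEN "D"
WHEN mmdd >= 522 and mmdd <= 621 THEN "E"
WHEN mmdd >= 622 and mmdd <= 722 THEN "F"
WHEN mmdd >= 723 and mmdd <= 823 THEN "G"
WHEN mmdd >= 824 and mmdd <= 923 THEN "H"
WHEN mmdd >= 924 and mmdd <= 1023 THEN "I"
WHEN mmdd >= 1024 and mmdd <= 1122 THEN "J"
WHEN mmdd >= 1123 and mmdd <= 1222 THEN "K"
WHEN (mmdd >= 1223 and mmdd <= 1231) or (mmdd >= 101 and mmdd <= 119) THEN "L"
ELSE "M"
END AS XingZuo
from (
select CASE WHEN length(Birthday) = 8 THEN cast(substring(Birthday,5) as int) END as mmdd
from external_2000w
where name <> 'Name'
) t
) as atable
group by XingZuo;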

############################  Write the results straight into a table.

create table external_2000w_new as
select substring(Birthday,5) as born_day,
CASE
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 219 THEN "A"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 220 and substring(Birthday,5) <= 320 THEN "B"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 321 and substring(Birthday,5) <= 420 THEN "C"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 421 and substring(Birthday,5) <= 521 THEN "D"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 522 and substring(Birthday,5) <= 621 THEN "E"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 622 and substring(Birthday,5) <= 722 THEN "F"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 723 and substring(Birthday,5) <= 823 THEN "G"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 824 and substring(Birthday,5) <= 923 THEN "H"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 924 and substring(Birthday,5) <= 1023 THEN "I"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1024 and substring(Birthday,5) <= 1122 THEN "J"
WHEN length(Birthday) == 8 and substring(Birthday,5) >= 1123 and substring(Birthday,5) <= 1222 THEN "K"
WHEN (length(Birthday) == 8 and substring(Birthday,5) >= 120 and substring(Birthday,5) <= 1231) 
or (length(Birthday) == 8 and substring(Birthday,5) >= 101 and substring(Birthday,5) <= 119) THEN "L"
ELSE "M"
END AS XingZuo
from external_2000w
where name != 'Name';

############################  Inspect the CTAS output.

31 small files: the stage was split into 31 tasks, and each task wrote its own part file.

[Image 4]

[root@snn conf]# hadoop fs -ls -R /user/hive/warehouse/external_2000w_new
drwxrwxrwt   - hadoop hive          0 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/.hive-staging_hive_2015-12-17_17-18-32_719_3374007692051174329-1
drwxr-xr-x   - hadoop hive          0 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/.hive-staging_hive_2015-12-17_17-18-32_719_3374007692051174329-1/-ext-10000
-rw-r--r--   3 hadoop hive          0 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/.hive-staging_hive_2015-12-17_17-18-32_719_3374007692051174329-1/-ext-10000/_SUCCESS
-rwxrwxrwt   3 hadoop hive    6307372 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00000
-rwxrwxrwt   3 hadoop hive    4747600 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00001
-rwxrwxrwt   3 hadoop hive    2943508 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00002
-rwxrwxrwt   3 hadoop hive    5949216 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00003
-rwxrwxrwt   3 hadoop hive    5887275 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00004
-rwxrwxrwt   3 hadoop hive    2160089 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00005
-rwxrwxrwt   3 hadoop hive    5950706 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00006
-rwxrwxrwt   3 hadoop hive    6322605 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00007
-rwxrwxrwt   3 hadoop hive    1722862 2015-12-17 17:18 /user/hive/warehouse/external_2000w_new/part-00008
-rwxrwxrwt   3 hadoop hive    5927935 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00009
-rwxrwxrwt   3 hadoop hive    5839186 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00010
-rwxrwxrwt   3 hadoop hive    2229685 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00011
-rwxrwxrwt   3 hadoop hive    5907388 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00012
-rwxrwxrwt   3 hadoop hive    6142019 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00013
-rwxrwxrwt   3 hadoop hive    1869211 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00014
-rwxrwxrwt   3 hadoop hive    6119244 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00015
-rwxrwxrwt   3 hadoop hive    6200692 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00016
-rwxrwxrwt   3 hadoop hive    1399629 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00017
-rwxrwxrwt   3 hadoop hive    6045320 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00018
-rwxrwxrwt   3 hadoop hive    6044653 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00019
-rwxrwxrwt   3 hadoop hive    1906355 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00020
-rwxrwxrwt   3 hadoop hive    6024204 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00021
-rwxrwxrwt   3 hadoop hive    6035401 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00022
-rwxrwxrwt   3 hadoop hive    1936859 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00023
-rwxrwxrwt   3 hadoop hive    6101666 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00024
-rwxrwxrwt   3 hadoop hive    6075192 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00025
-rwxrwxrwt   3 hadoop hive    1819634 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00026
-rwxrwxrwt   3 hadoop hive    6058918 2015-12-17 17:20 /user/hive/warehouse/external_2000w_new/part-00027
-rwxrwxrwt   3 hadoop hive    6032423 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00028
-rwxrwxrwt   3 hadoop hive    1905099 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00029
-rwxrwxrwt   3 hadoop hive     341632 2015-12-17 17:19 /user/hive/warehouse/external_2000w_new/part-00030
[root@snn conf]#
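
Adding the sizes up, the 31 part files total only about 140 MB, so each file is far below a single HDFS block. A quick way to confirm:

hadoop fs -du -s -h /user/hive/warehouse/external_2000w_new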


###################  Open questions:

1. In the Chinese-label query, why were so many exceptions thrown? Even after switching to A/B/C labels, one exception still appeared.

2. Writing the external-table query results straight into a Hive table produces as many part files as there were tasks; that many small files is a real weak spot for HDFS.


###################  Follow-up on the open questions:

1. The first problem came down to the operators: equality is = and inequality is <>. Spark SQL will still execute == and !=, but Hive throws an exception outright.
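
For the record, the portable spelling of the header filter that both engines accept:

select count(1) as cnt from external_2000w where name <> 'Name';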

[Image 5]

After the fix, the results display correctly.

[Image 6]


2. As for the small-file problem: Spark doesn't support merging the output files, but Hive does.

Parameter: set hive.merge.mapredfiles to true in hive-site.xml,

or run in the CLI: set hive.merge.mapredfiles=true;
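
From the Spark side, a common workaround (a sketch only; exact behavior depends on the Spark version, and external_2000w_compact is a made-up table name) is to force a shuffle before the write, so the output file count follows spark.sql.shuffle.partitions rather than the number of input splits:

set spark.sql.shuffle.partitions=4;
create table external_2000w_compact as
select * from external_2000w_new
distribute by rand();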


[Image 7] [Image 8]

