本文通过使用Hadoop的数据仓库工具Hive中的不同存储格式,比较按行存储和按列存储的不同。按列存储使用的是企业中最常见的ORC和PARQUET。
这里不讲解对于Hive的使用。
优缺点比较
企业中更多的业务场景是对列数据进行操作,如果按行存储,需要先把每行数据完整加载到内存中,再从每行中取出所需的列;而按列存储只需读取查询用到的列。
CREATE (DATABASE|SCHEMA) [IF NOT EXISTS] database_name
[COMMENT database_comment]
[LOCATION hdfs_path]
[WITH DBPROPERTIES (property_name=property_value, ...)];
-- Create the web_analysis database at an explicit HDFS location;
-- the statement is a no-op when the database already exists.
CREATE DATABASE IF NOT EXISTS web_analysis
LOCATION "/user/hive/warehouse/web_analysis";
准备好存放了100000条数据的page_views.data数据文件。
-- Row-oriented baseline table: page-view records kept as tab-separated
-- plain text. Every column is declared as STRING.
CREATE TABLE web_analysis.page_views_txt (
    track_time   STRING,
    url          STRING,
    session_id   STRING,
    referer      STRING,
    ip           STRING,
    end_user_id  STRING,
    city_id      STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
-- Load the local data file into the text table. The table name is
-- database-qualified so the statement targets web_analysis.page_views_txt
-- (the name it was created under) regardless of the session's current
-- database.
LOAD DATA LOCAL INPATH '/opt/datas/page_views.data'
INTO TABLE web_analysis.page_views_txt;

-- Count the rows that were loaded.
SELECT COUNT(1)
FROM web_analysis.page_views_txt;
Hive通过跑MR之后获取到page_views_txt的条数是100000。
-- Column-oriented copy of the page-view table, stored as ORC.
-- No ROW FORMAT DELIMITED clause: a field delimiter describes text
-- serialization, whereas ORC is a binary format handled by its own SerDe,
-- so the delimiter would not describe how the data is stored.
CREATE TABLE web_analysis.page_views_orc (
    track_time   STRING,
    url          STRING,
    session_id   STRING,
    referer      STRING,
    ip           STRING,
    end_user_id  STRING,
    city_id      STRING
)
STORED AS ORC;
-- Column-oriented copy of the page-view table, stored as Parquet.
-- No ROW FORMAT DELIMITED clause: a field delimiter describes text
-- serialization, whereas Parquet is a binary format handled by its own
-- SerDe, so the delimiter would not describe how the data is stored.
CREATE TABLE web_analysis.page_views_parquet (
    track_time   STRING,
    url          STRING,
    session_id   STRING,
    referer      STRING,
    ip           STRING,
    end_user_id  STRING,
    city_id      STRING
)
STORED AS PARQUET;
-- Populate the Parquet table from the text table.
-- Table names are database-qualified to match the names the tables were
-- created under, and the columns are listed explicitly so the copy does
-- not depend on the physical column order of the source table.
INSERT INTO TABLE web_analysis.page_views_parquet
SELECT
    track_time,
    url,
    session_id,
    referer,
    ip,
    end_user_id,
    city_id
FROM web_analysis.page_views_txt;

-- Populate the ORC table from the same source, same column mapping.
INSERT INTO TABLE web_analysis.page_views_orc
SELECT
    track_time,
    url,
    session_id,
    referer,
    ip,
    end_user_id,
    city_id
FROM web_analysis.page_views_txt;
-- Report the on-disk size (human-readable) of the text table's and the
-- Parquet table's directories under the web_analysis database location.
dfs -du -h /user/hive/warehouse/web_analysis/page_views_txt;
dfs -du -h /user/hive/warehouse/web_analysis/page_views_parquet;
得到以下结果,说明page_views_parquet有13.1M
在hive命令行中输入
-- Report the on-disk size (human-readable) of the ORC table's directory.
dfs -du -h /user/hive/warehouse/web_analysis/page_views_orc;
ORC格式默认ZLIB压缩,这里设置为snappy
-- ORC copy of the page-view table with the compression codec set to
-- SNAPPY through the orc.compress table property.
-- The name is qualified with web_analysis, like the other page-view
-- tables, so the table is created in that database regardless of the
-- session's current database. No ROW FORMAT DELIMITED clause: a text
-- field delimiter does not describe how ORC data is stored.
CREATE TABLE web_analysis.page_views_orc_snappy (
    track_time   STRING,
    url          STRING,
    session_id   STRING,
    referer      STRING,
    ip           STRING,
    end_user_id  STRING,
    city_id      STRING
)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
-- Session-level Parquet codec setting; kept so that Hive versions which
-- only honour the session property still write SNAPPY-compressed files.
set parquet.compression=SNAPPY;

-- Parquet copy of the page-view table intended to be SNAPPY-compressed.
-- The name is qualified with web_analysis, like the other page-view
-- tables, so the table is created in that database regardless of the
-- session's current database. No ROW FORMAT DELIMITED clause: a text
-- field delimiter does not describe how Parquet data is stored.
-- The codec is also recorded as a table property, mirroring the ORC
-- variant, so it stays attached to the table across sessions.
-- NOTE(review): the table property is only honoured by Hive versions that
-- support parquet.compression in TBLPROPERTIES; confirm for the target
-- version.
CREATE TABLE web_analysis.page_views_parquet_snappy (
    track_time   STRING,
    url          STRING,
    session_id   STRING,
    referer      STRING,
    ip           STRING,
    end_user_id  STRING,
    city_id      STRING
)
STORED AS PARQUET
TBLPROPERTIES ("parquet.compression"="SNAPPY");