8.Hive系列之压缩与存储

1. 创建一个采用 ZLIB 压缩、以 ORC 格式存储的表
-- Log table stored as ORC with ZLIB compression.
-- No ROW FORMAT DELIMITED clause is declared: ORC is a binary columnar format
-- read and written by its own SerDe, so a text field delimiter would have no
-- effect on the stored data and would only mislead readers.
create table log_orc_zlib (
    track_time  string,
    url         string,
    session_id  string,
    referer     string,
    ip          string,
    end_user_id string,
    city_id     string
)
stored as orc
-- orc.compress selects the ORC compression codec for this table.
tblproperties ("orc.compress"="ZLIB");
-- 向表中插入数据后,查看数据文件大小
dfs -du -h /user/hive/warehouse/log_orc_zlib/ ;
2.78 M /user/hive/warehouse/log_orc_zlib/000000_0
2. 创建一个采用 SNAPPY 压缩、以 ORC 格式存储的表
-- Log table stored as ORC with SNAPPY compression.
-- No ROW FORMAT DELIMITED clause is declared: ORC is a binary columnar format
-- read and written by its own SerDe, so a text field delimiter would have no
-- effect on the stored data and would only mislead readers.
create table log_orc_snappy (
    track_time  string,
    url         string,
    session_id  string,
    referer     string,
    ip          string,
    end_user_id string,
    city_id     string
)
stored as orc
-- orc.compress selects the ORC compression codec for this table.
tblproperties ("orc.compress"="SNAPPY");
-- 向表中插入数据后,查看数据文件大小
dfs -du -h /user/hive/warehouse/log_orc_snappy/;
3.75 M /user/hive/warehouse/log_orc_snappy/000000_0
3. 创建一个采用 SNAPPY 压缩、以 Parquet 格式存储的表
-- Log table stored as Parquet with SNAPPY compression.
-- No ROW FORMAT DELIMITED clause is declared: Parquet is a binary columnar
-- format read and written by its own SerDe, so a text field delimiter would
-- have no effect on the stored data and would only mislead readers.
create table log_parquet_snappy (
    track_time  string,
    url         string,
    session_id  string,
    referer     string,
    ip          string,
    end_user_id string,
    city_id     string
)
stored as parquet
-- parquet.compression selects the Parquet compression codec for this table.
tblproperties ("parquet.compression"="SNAPPY");
dfs -du -h /user/hive/warehouse/log_parquet_snappy/;
6.39 M /user/hive/warehouse/log_parquet_snappy/000000_0

4. 存储方式和压缩总结

在实际的项目开发当中,Hive 表的数据存储格式一般选择 ORC 或 Parquet,压缩方式一般选择 Snappy 或 LZO。

你可能感兴趣的:(大数据,hive,hadoop,数据仓库)