输入格式:org.apache.hadoop.mapred.TextInputFormat;输出格式:org.apache.hadoop.mapred.TextOutputFormat。示例:
-- Create a table stored in TEXTFILE format
CREATE TABLE olympic (
    athelete STRING,
    age      INT,
    country  STRING,
    year     STRING,
    closing  STRING,
    sport    STRING,
    gold     INT,
    silver   INT,
    bronze   INT,
    total    INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- Load data into the table from a local file
LOAD DATA LOCAL INPATH '/home/kiran/Downloads/olympic_data.csv' INTO TABLE olympic;

-- Query the table
SELECT athelete
FROM olympic;
输入格式:org.apache.hadoop.mapred.SequenceFileInputFormat;输出格式:org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat。示例:
-- Create a table stored in SEQUENCEFILE format
CREATE TABLE olympic_sequencefile (
    athelete STRING,
    age      INT,
    country  STRING,
    year     STRING,
    closing  STRING,
    sport    STRING,
    gold     INT,
    silver   INT,
    bronze   INT,
    total    INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS SEQUENCEFILE;

-- Load data into the table.
-- Unlike TEXTFILE, SEQUENCEFILE is a binary format, so the rows have to be
-- inserted from another table instead of being loaded from a file.
INSERT OVERWRITE TABLE olympic_sequencefile
SELECT
    athelete,
    age,
    country,
    year,
    closing,
    sport,
    gold,
    silver,
    bronze,
    total
FROM olympic;

-- Query the table
SELECT athelete
FROM olympic_sequencefile;
输入格式:org.apache.hadoop.hive.ql.io.RCFileInputFormat;输出格式:org.apache.hadoop.hive.ql.io.RCFileOutputFormat。示例:
-- Create a table stored in RCFILE format
CREATE TABLE olympic_rcfile (
    athelete STRING,
    age      INT,
    country  STRING,
    year     STRING,
    closing  STRING,
    sport    STRING,
    gold     INT,
    silver   INT,
    bronze   INT,
    total    INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS RCFILE;  -- the terminating semicolon was missing in the original statement

-- Load data into the table.
-- Data cannot be loaded directly into an RCFILE table; the rows have to be
-- inserted from another table.
INSERT OVERWRITE TABLE olympic_rcfile
SELECT
    athelete,
    age,
    country,
    year,
    closing,
    sport,
    gold,
    silver,
    bronze,
    total
FROM olympic;

-- Query the table
SELECT athelete
FROM olympic_rcfile;
实现类所在的包:org.apache.hadoop.hive.ql.io.orc。示例:
-- Create a table stored in ORCFILE format
CREATE TABLE olympic_orcfile (
    athelete STRING,
    age      INT,
    country  STRING,
    year     STRING,
    closing  STRING,
    sport    STRING,
    gold     INT,
    silver   INT,
    bronze   INT,
    total    INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORCFILE;

-- Load data into the table.
-- Data cannot be loaded directly into an ORCFILE table; the rows have to be
-- inserted from another table.
INSERT OVERWRITE TABLE olympic_orcfile
SELECT
    athelete,
    age,
    country,
    year,
    closing,
    sport,
    gold,
    silver,
    bronze,
    total
FROM olympic;

-- Query the table
SELECT athelete
FROM olympic_orcfile;
对于多维数据仓库来说,需要处理SCD,必然要用到行级更新,所以所有TDS(转换后的数据存储)里的表,除日期维度表外,其它表都是用ORCFILE格式。日期维度表数据一旦生成就不会修改,所以使用TEXTFILE格式。RDS(原始数据存储)里的表使用缺省的TEXTFILE格式。
<!-- Add the following properties to enable transaction support -->
<property>
  <name>hive.support.concurrency</name>
  <value>true</value>
</property>
<property>
  <name>hive.exec.dynamic.partition.mode</name>
  <value>nonstrict</value>
</property>
<property>
  <name>hive.txn.manager</name>
  <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
  <name>hive.compactor.initiator.on</name>
  <value>true</value>
</property>
<property>
  <name>hive.compactor.worker.threads</name>
  <value>1</value>
</property>
-- Seed each of the three ID counter tables with an initial value of 1.
-- Per the note that follows, row-level updates fail with a lock-acquisition
-- error while these tables are empty.
INSERT INTO NEXT_LOCK_ID
VALUES (1);

INSERT INTO NEXT_COMPACTION_QUEUE_ID
VALUES (1);

INSERT INTO NEXT_TXN_ID
VALUES (1);

COMMIT;
说明:如果这三个表没有数据,执行行级更新时会报以下错误:org.apache.hadoop.hive.ql.lockmgr.DbTxnManager FAILED: Error in acquiring locks: Error communicating with the metastore
USE test;

-- Create the test table: bucketed on id, stored as ORC, flagged transactional
CREATE TABLE t1 (
    id   INT,
    name STRING
)
CLUSTERED BY (id) INTO 8 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');
说明:
-- Test INSERT: add two rows, then read the table back
INSERT INTO t1
VALUES (1, 'aaa');

INSERT INTO t1
VALUES (2, 'bbb');

SELECT id, name
FROM t1;
查询结果如下图所示。
-- Test UPDATE: rename the row with id 1, then read the table back
UPDATE t1
SET name = 'ccc'
WHERE id = 1;

SELECT id, name
FROM t1;
查询结果如下图所示。
-- Test DELETE: remove the row with id 2, then read the table back
DELETE FROM t1
WHERE id = 2;

SELECT id, name
FROM t1;
查询结果如下图所示。
1,a,US,CA
2,b,US,CB
3,c,CA,BB
4,d,CA,BC
-- Create a non-partitioned table and load data into it
USE test;

-- An earlier example in this document already creates test.t1 with a different
-- (two-column, transactional) schema, so drop it first; otherwise the
-- CREATE TABLE below fails because the table already exists.
DROP TABLE IF EXISTS t1;

CREATE TABLE t1 (
    id   INT,
    name STRING,
    cty  STRING,
    st   STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

LOAD DATA LOCAL INPATH '/root/a.txt' INTO TABLE t1;

SELECT id, name, cty, st
FROM t1;

-- Create an external, partitioned, transactional table and load it from t1
CREATE EXTERNAL TABLE t2 (
    id   INT,
    name STRING
)
PARTITIONED BY (country STRING, state STRING)
CLUSTERED BY (id) INTO 8 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');

-- Dynamic-partition insert: the last two selected columns feed the partition
-- columns (country, state) by position, so list them explicitly rather than
-- relying on the column order behind SELECT *.
INSERT INTO t2 PARTITION (country, state)
SELECT id, name, cty, st
FROM t1;

SELECT id, name, country, state
FROM t2;
查询结果如下图所示。
-- Modify data: insert one row, update one row, delete one row, then read back
INSERT INTO TABLE t2 PARTITION (country, state)
VALUES (5, 'e', 'DD', 'DD');

UPDATE t2
SET name = 'f'
WHERE id = 1;

DELETE FROM t2
WHERE name = 'b';

SELECT id, name, country, state
FROM t2;
查询结果如下图所示。
-- Create the source database
DROP DATABASE IF EXISTS source;
CREATE DATABASE source;

-- Create the source tables
USE source;

-- Customer table
CREATE TABLE customer (
    customer_number         INT NOT NULL AUTO_INCREMENT PRIMARY KEY COMMENT '客户编号,主键',
    customer_name           VARCHAR(50) COMMENT '客户名称',
    customer_street_address VARCHAR(50) COMMENT '客户住址',
    customer_zip_code       INT         COMMENT '邮编',
    customer_city           VARCHAR(30) COMMENT '所在城市',
    customer_state          VARCHAR(2)  COMMENT '所在省份'
);

-- Product table
CREATE TABLE product (
    product_code     INT NOT NULL AUTO_INCREMENT PRIMARY KEY COMMENT '产品编码,主键',
    product_name     VARCHAR(30) COMMENT '产品名称',
    product_category VARCHAR(30) COMMENT '产品类型'
);

-- Sales order table; both foreign keys cascade deletes and updates
CREATE TABLE sales_order (
    order_number    INT NOT NULL AUTO_INCREMENT PRIMARY KEY COMMENT '订单号,主键',
    customer_number INT            COMMENT '客户编号',
    product_code    INT            COMMENT '产品编码',
    order_date      DATETIME       COMMENT '订单日期',
    entry_date      DATETIME       COMMENT '登记日期',
    order_amount    DECIMAL(10, 2) COMMENT '销售金额',
    FOREIGN KEY (customer_number)
        REFERENCES customer (customer_number)
        ON DELETE CASCADE
        ON UPDATE CASCADE,
    FOREIGN KEY (product_code)
        REFERENCES product (product_code)
        ON DELETE CASCADE
        ON UPDATE CASCADE
);
(2)生成源库测试数据
USE source;

-- Generate test data for the customer table
INSERT INTO customer (
    customer_name,
    customer_street_address,
    customer_zip_code,
    customer_city,
    customer_state
)
VALUES
    ('Really Large Customers', '7500 Louise Dr.', 17050, 'Mechanicsburg', 'PA'),
    ('Small Stores', '2500 Woodland St.', 17055, 'Pittsburgh', 'PA'),
    ('Medium Retailers', '1111 Ritter Rd.', 17055, 'Pittsburgh', 'PA'),
    ('Good Companies', '9500 Scott St.', 17050, 'Mechanicsburg', 'PA'),
    ('Wonderful Shops', '3333 Rossmoyne Rd.', 17050, 'Mechanicsburg', 'PA'),
    ('Loyal Clients', '7070 Ritter Rd.', 17055, 'Pittsburgh', 'PA'),
    ('Distinguished Partners', '9999 Scott St.', 17050, 'Mechanicsburg', 'PA');

-- Generate test data for the product table
INSERT INTO product (
    product_name,
    product_category
)
VALUES
    ('Hard Disk Drive', 'Storage'),
    ('Floppy Drive', 'Storage'),
    ('LCD Panel', 'Monitor');

-- Generate 100 rows of test data for the sales order table
DROP PROCEDURE IF EXISTS generate_sales_order_data;

DELIMITER //

CREATE PROCEDURE generate_sales_order_data()
BEGIN
    -- Scratch table with the same columns as sales_order and no rows
    DROP TABLE IF EXISTS temp_sales_order_data;
    CREATE TABLE temp_sales_order_data AS
        SELECT *
        FROM sales_order
        WHERE 1 = 0;

    -- Order dates are drawn at random between these two instants
    SET @start_date := UNIX_TIMESTAMP('2016-03-01');
    SET @end_date   := UNIX_TIMESTAMP('2016-07-01');
    SET @i := 1;

    WHILE @i <= 100 DO
        -- NOTE(review): these ranges yield customer numbers 1..6 and product
        -- codes 1..2, while seven customers and three products are seeded
        -- above; confirm whether leaving the last of each without orders is
        -- intended.
        SET @customer_number := FLOOR(1 + RAND() * 6);
        SET @product_code    := FLOOR(1 + RAND() * 2);
        SET @order_date      := FROM_UNIXTIME(@start_date + RAND() * (@end_date - @start_date));
        SET @amount          := FLOOR(1000 + RAND() * 9000);

        -- The same timestamp is used for both order_date and entry_date
        INSERT INTO temp_sales_order_data (
            order_number,
            customer_number,
            product_code,
            order_date,
            entry_date,
            order_amount
        )
        VALUES (
            @i,
            @customer_number,
            @product_code,
            @order_date,
            @order_date,
            @amount
        );

        SET @i := @i + 1;
    END WHILE;

    -- Replace the contents of sales_order; the NULL order_number lets
    -- AUTO_INCREMENT assign new numbers for the rows selected in order_date order
    TRUNCATE TABLE sales_order;

    INSERT INTO sales_order (
        order_number,
        customer_number,
        product_code,
        order_date,
        entry_date,
        order_amount
    )
    SELECT
        NULL,
        customer_number,
        product_code,
        order_date,
        entry_date,
        order_amount
    FROM temp_sales_order_data
    ORDER BY order_date;

    COMMIT;
END //

DELIMITER ;

CALL generate_sales_order_data();
(3)建立RDS库表
-- Create the RDS (raw data store) database
DROP DATABASE IF EXISTS rds CASCADE;
CREATE DATABASE rds;

-- Create the RDS tables
USE rds;

-- Customer staging table.
-- customer_name and customer_street_address are VARCHAR(50) to match both the
-- source table and dw.customer_dim; the previous VARCHAR(30) staging columns
-- were narrower than either side and would truncate longer values.
CREATE TABLE customer (
    customer_number         INT         COMMENT 'number',
    customer_name           VARCHAR(50) COMMENT 'name',
    customer_street_address VARCHAR(50) COMMENT 'address',
    customer_zip_code       INT         COMMENT 'zipcode',
    customer_city           VARCHAR(30) COMMENT 'city',
    customer_state          VARCHAR(2)  COMMENT 'state'
);

-- Product staging table
CREATE TABLE product (
    product_code     INT         COMMENT 'code',
    product_name     VARCHAR(30) COMMENT 'name',
    product_category VARCHAR(30) COMMENT 'category'
);

-- Sales order staging table
CREATE TABLE sales_order (
    order_number    INT            COMMENT 'order number',
    customer_number INT            COMMENT 'customer number',
    product_code    INT            COMMENT 'product code',
    order_date      TIMESTAMP      COMMENT 'order date',
    entry_date      TIMESTAMP      COMMENT 'entry date',
    order_amount    DECIMAL(10, 2) COMMENT 'order amount'
);
(4)建立TDS库表
-- Create the data warehouse database
DROP DATABASE IF EXISTS dw CASCADE;
CREATE DATABASE dw;

-- Create the data warehouse tables.
-- Every table below is bucketed on its first surrogate-key column, stored as
-- ORC and flagged transactional.
USE dw;

-- Customer dimension
CREATE TABLE customer_dim (
    customer_sk             INT         COMMENT 'surrogate key',
    customer_number         INT         COMMENT 'number',
    customer_name           VARCHAR(50) COMMENT 'name',
    customer_street_address VARCHAR(50) COMMENT 'address',
    customer_zip_code       INT         COMMENT 'zipcode',
    customer_city           VARCHAR(30) COMMENT 'city',
    customer_state          VARCHAR(2)  COMMENT 'state',
    version                 INT         COMMENT 'version',
    effective_date          DATE        COMMENT 'effective date',
    expiry_date             DATE        COMMENT 'expiry date'
)
CLUSTERED BY (customer_sk) INTO 8 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');

-- Product dimension
CREATE TABLE product_dim (
    product_sk       INT         COMMENT 'surrogate key',
    product_code     INT         COMMENT 'code',
    product_name     VARCHAR(30) COMMENT 'name',
    product_category VARCHAR(30) COMMENT 'category',
    version          INT         COMMENT 'version',
    effective_date   DATE        COMMENT 'effective date',
    expiry_date      DATE        COMMENT 'expiry date'
)
CLUSTERED BY (product_sk) INTO 8 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');

-- Order dimension
CREATE TABLE order_dim (
    order_sk       INT  COMMENT 'surrogate key',
    order_number   INT  COMMENT 'number',
    version        INT  COMMENT 'version',
    effective_date DATE COMMENT 'effective date',
    expiry_date    DATE COMMENT 'expiry date'
)
CLUSTERED BY (order_sk) INTO 8 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');

-- Sales order fact table
CREATE TABLE sales_order_fact (
    order_sk      INT            COMMENT 'order surrogate key',
    customer_sk   INT            COMMENT 'customer surrogate key',
    product_sk    INT            COMMENT 'product surrogate key',
    order_date_sk INT            COMMENT 'date surrogate key',
    order_amount  DECIMAL(10, 2) COMMENT 'order amount'
)
CLUSTERED BY (order_sk) INTO 8 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');
(5)建立日期维度表并生成数据
# Run the date_dim_generate.sh script with two date arguments
# (presumably the first and last dates of the range to generate; confirm against the script).
./date_dim_generate.sh 2000-01-01 2020-12-31
date_dim_generate.sh shell脚本文件内容如下图所示:
create_table_date_dim.sql SQL脚本内容如下:
-- Recreate the date dimension table as a comma-delimited TEXTFILE table
DROP TABLE IF EXISTS date_dim;

CREATE TABLE date_dim (
    date_sk    INT        COMMENT 'surrogate key',
    -- DATE is a reserved keyword in newer Hive releases; back-quoting keeps the
    -- column name unchanged while letting the statement parse there as well.
    `date`     DATE       COMMENT 'date,yyyy-mm-dd',
    month      TINYINT    COMMENT 'month',
    month_name VARCHAR(9) COMMENT 'month name',
    quarter    TINYINT    COMMENT 'quarter',
    year       SMALLINT   COMMENT 'year'
)
COMMENT 'date dimension table'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
说明: