import spark.implicits._
import java.io.File
new File("/data/projects/tpcds/sql").listFiles.filter(_.isDirectory).map(_.listFiles.filter(_.isFile)).flatMap(_.toList).foreach(f => spark.read.options(Map("delimiter" -> "|")).csv(f"file://${f.getCanonicalPath}").write.options(Map("compression"->"gzip")).parquet(f"file:///data/projects/tpcds/parquet/${f.getParentFile.getName}/${f.getName.replace(".dat", "")}.parquet"))
desc formatted table_name;
下载数据生成工具:TPC-DS Tools Download
unzip ${uuid}-tpc-ds-tool.zip
cd DSGen-software-code-${version}/tools
cmake OS=LINUX
dsdgen -SCALE 1 -DIR /data/tpcds
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/customer_address
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/customer_demographics
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/date_dim
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/warehouse
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/ship_mode
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/time_dim
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/reason
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/income_band
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/item
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/store
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/call_center
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/customer
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/web_site
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/store_returns
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/household_demographics
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/web_page
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/promotion
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/catalog_page
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/inventory
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/catalog_returns
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/web_returns
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/web_sales
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/catalog_sales
hdfs dfs -mkdir -p /data/hive/warehouse/tpcds/1G/store_sales
hdfs dfs -put tpcds/customer_address.dat /data/hive/warehouse/tpcds/1G/customer_address
hdfs dfs -put tpcds/customer_demographics.dat /data/hive/warehouse/tpcds/1G/customer_demographics
hdfs dfs -put tpcds/date_dim.dat /data/hive/warehouse/tpcds/1G/date_dim
hdfs dfs -put tpcds/warehouse.dat /data/hive/warehouse/tpcds/1G/warehouse
hdfs dfs -put tpcds/ship_mode.dat /data/hive/warehouse/tpcds/1G/ship_mode
hdfs dfs -put tpcds/time_dim.dat /data/hive/warehouse/tpcds/1G/time_dim
hdfs dfs -put tpcds/reason.dat /data/hive/warehouse/tpcds/1G/reason
hdfs dfs -put tpcds/income_band.dat /data/hive/warehouse/tpcds/1G/income_band
hdfs dfs -put tpcds/item.dat /data/hive/warehouse/tpcds/1G/item
hdfs dfs -put tpcds/store.dat /data/hive/warehouse/tpcds/1G/store
hdfs dfs -put tpcds/call_center.dat /data/hive/warehouse/tpcds/1G/call_center
hdfs dfs -put tpcds/customer.dat /data/hive/warehouse/tpcds/1G/customer
hdfs dfs -put tpcds/web_site.dat /data/hive/warehouse/tpcds/1G/web_site
hdfs dfs -put tpcds/store_returns.dat /data/hive/warehouse/tpcds/1G/store_returns
hdfs dfs -put tpcds/household_demographics.dat /data/hive/warehouse/tpcds/1G/household_demographics
hdfs dfs -put tpcds/web_page.dat /data/hive/warehouse/tpcds/1G/web_page
hdfs dfs -put tpcds/promotion.dat /data/hive/warehouse/tpcds/1G/promotion
hdfs dfs -put tpcds/catalog_page.dat /data/hive/warehouse/tpcds/1G/catalog_page
hdfs dfs -put tpcds/inventory.dat /data/hive/warehouse/tpcds/1G/inventory
hdfs dfs -put tpcds/catalog_returns.dat /data/hive/warehouse/tpcds/1G/catalog_returns
hdfs dfs -put tpcds/web_returns.dat /data/hive/warehouse/tpcds/1G/web_returns
hdfs dfs -put tpcds/web_sales.dat /data/hive/warehouse/tpcds/1G/web_sales
hdfs dfs -put tpcds/catalog_sales.dat /data/hive/warehouse/tpcds/1G/catalog_sales
hdfs dfs -put tpcds/store_sales.dat /data/hive/warehouse/tpcds/1G/store_sales
以customer_address
表为例:
create external table customer_address
(
ca_address_sk int ,
ca_address_id char(16) ,
ca_street_number char(10) ,
ca_street_name varchar(60) ,
ca_street_type char(15) ,
ca_suite_number char(10) ,
ca_city varchar(60) ,
ca_county varchar(30) ,
ca_state char(2) ,
ca_zip char(10) ,
ca_country varchar(20) ,
ca_gmt_offset decimal(5,2) ,
ca_location_type char(20)
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS textfile
location '/data/hive/warehouse/tpcds/1G/customer_address/'
;
drop table customer_address;
drop table customer_demographics;
drop table date_dim;
drop table warehouse;
drop table ship_mode;
drop table time_dim;
drop table reason;
drop table income_band;
drop table item;
drop table store;
drop table call_center;
drop table customer;
drop table web_site;
drop table store_returns;
drop table household_demographics;
drop table web_page;
drop table promotion;
drop table catalog_page;
drop table inventory;
drop table catalog_returns;
drop table web_returns;
drop table web_sales;
drop table catalog_sales;
drop table store_sales;
create table customer_address like tpcds.customer_address using parquet;
insert into customer_address select * from tpcds.customer_address;
create table customer_demographics like tpcds.customer_demographics using parquet;
insert into customer_demographics select * from tpcds.customer_demographics;
create table date_dim like tpcds.date_dim using parquet;
insert into date_dim select * from tpcds.date_dim;
create table warehouse like tpcds.warehouse using parquet;
insert into warehouse select * from tpcds.warehouse;
create table ship_mode like tpcds.ship_mode using parquet;
insert into ship_mode select * from tpcds.ship_mode;
create table time_dim like tpcds.time_dim using parquet;
insert into time_dim select * from tpcds.time_dim;
create table reason like tpcds.reason using parquet;
insert into reason select * from tpcds.reason;
create table income_band like tpcds.income_band using parquet;
insert into income_band select * from tpcds.income_band;
create table item like tpcds.item using parquet;
insert into item select * from tpcds.item;
create table store like tpcds.store using parquet;
insert into store select * from tpcds.store;
create table call_center like tpcds.call_center using parquet;
insert into call_center select * from tpcds.call_center;
create table customer like tpcds.customer using parquet;
insert into customer select * from tpcds.customer;
create table web_site like tpcds.web_site using parquet;
insert into web_site select * from tpcds.web_site;
create table store_returns like tpcds.store_returns using parquet;
insert into store_returns select * from tpcds.store_returns;
create table household_demographics like tpcds.household_demographics using parquet;
insert into household_demographics select * from tpcds.household_demographics;
create table web_page like tpcds.web_page using parquet;
insert into web_page select * from tpcds.web_page;
create table promotion like tpcds.promotion using parquet;
insert into promotion select * from tpcds.promotion;
create table catalog_page like tpcds.catalog_page using parquet;
insert into catalog_page select * from tpcds.catalog_page;
create table inventory like tpcds.inventory using parquet;
insert into inventory select * from tpcds.inventory;
create table catalog_returns like tpcds.catalog_returns using parquet;
insert into catalog_returns select * from tpcds.catalog_returns;
create table web_returns like tpcds.web_returns using parquet;
insert into web_returns select * from tpcds.web_returns;
create table web_sales like tpcds.web_sales using parquet;
insert into web_sales select * from tpcds.web_sales;
create table catalog_sales like tpcds.catalog_sales using parquet;
insert into catalog_sales select * from tpcds.catalog_sales;
create table store_sales like tpcds.store_sales using parquet;
insert into store_sales select * from tpcds.store_sales;