Compatible with the MySQL protocol; connect with any MySQL client:
mysql -hdb01.doris.shyc2.qihoo.net -P9030 -udfs_shbt_logsget -pxxxxx -Ddp_db -N -A
The four data models (the first three are shown below; the Primary Key model is sketched after them):
CREATE TABLE site_access_duplicate
(
site_id INT DEFAULT '10',
city_code SMALLINT,
user_name VARCHAR(32) DEFAULT '',
pv BIGINT DEFAULT '0'
)
DUPLICATE KEY(site_id, city_code)
DISTRIBUTED BY HASH(site_id) BUCKETS 10;
CREATE TABLE site_access_aggregate
(
site_id INT DEFAULT '10',
city_code SMALLINT,
pv BIGINT SUM DEFAULT '0'
)
AGGREGATE KEY(site_id, city_code)
DISTRIBUTED BY HASH(site_id) BUCKETS 10;
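A quick illustration of the aggregate model (hypothetical values): rows loaded with the same key are merged, and pv is summed:
insert into site_access_aggregate values (10, 1, 5);
insert into site_access_aggregate values (10, 1, 7);
select * from site_access_aggregate where site_id = 10 and city_code = 1;
-- returns a single row: 10, 1, 12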
CREATE TABLE site_access_unique
(
site_id INT DEFAULT '10',
city_code SMALLINT,
user_name VARCHAR(32) DEFAULT '',
pv BIGINT DEFAULT '0'
)
UNIQUE KEY(site_id, city_code)
DISTRIBUTED BY HASH(site_id) BUCKETS 10;
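The fourth model is the Primary Key model; a minimal sketch along the same lines (key columns must be NOT NULL):
CREATE TABLE site_access_primary
(
site_id INT NOT NULL,
city_code SMALLINT NOT NULL,
user_name VARCHAR(32) DEFAULT '',
pv BIGINT DEFAULT '0'
)
PRIMARY KEY(site_id, city_code)
DISTRIBUTED BY HASH(site_id) BUCKETS 10;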
A complete CREATE TABLE statement, annotated:
create table if not exists rpt_qdas_show_activity_zone_daily(
-- column declarations are basically MySQL-compatible (but PRIMARY KEY / FOREIGN KEY constraints cannot be declared)
day_id int NOT NULL DEFAULT '19700101',
appkey varchar(50) NOT NULL DEFAULT '',
ver varchar(50) NOT NULL DEFAULT '',
zone_id int DEFAULT NULL,
zone_name varchar(50) NOT NULL,
times int DEFAULT NULL,
all_times int DEFAULT NULL
)
engine=olap -- engine: besides OLAP, MySQL, HDFS, etc. can also be used
-- key type declaration, one of:
-- ① AGGREGATE KEY (aggregate model)
-- ② UNIQUE KEY (unique model: a new row replaces the old row with the same key)
-- ③ DUPLICATE KEY (data is not merged; the declared columns are used only for sorting)
duplicate key (day_id,appkey,ver,zone_id,zone_name)
partition by range(day_id)
-- partition column (LIST partitioning can be declared instead)
-- the partition column must be a numeric or date/datetime column
(
partition p202104 values less than ('20210501'),
partition p202105 values less than ('20210601'),
partition p202106 values less than ('20210701')
-- create partitions (with LIST partitioning, use VALUES IN instead)
)
distributed by hash(day_id) buckets 10 -- bucketing column
properties
(
'replication_num' = '3', -- table properties: replica count, storage medium, etc. are declared here
'storage_medium' = 'SSD');
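After creation, the normalized DDL and the column list can be checked with:
SHOW CREATE TABLE rpt_qdas_show_activity_zone_daily;
DESC rpt_qdas_show_activity_zone_daily;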
CREATE TABLE `dim_infoflow_authorinfo` (
`zmt_id` varchar(65533) NULL COMMENT "author ID",
`z_type` varchar(65533) NULL COMMENT "registration type",
`z_status` varchar(65533) NULL COMMENT "account status",
`z_name` varchar(65533) NULL COMMENT "account name",
`z_realm` varchar(65533) NULL COMMENT "registration domain ID",
`z_pqid` varchar(65533) NULL COMMENT "parent account ID",
`z_category` varchar(65533) NULL COMMENT "whether whitewashed",
`z_realm_name` varchar(65533) NULL COMMENT "registration domain name",
`z_invite` varchar(65533) NULL COMMENT "whether invited",
`z_inviter` varchar(65533) NULL COMMENT "inviter",
`z_new_level` varchar(65533) NULL COMMENT "new author level",
`z_fansi` varchar(65533) NULL COMMENT "follower count",
`z_out_level` varchar(65533) NULL COMMENT "off-site level",
`z_verify_realm` varchar(65533) NULL COMMENT "verified interest domain",
`z_ytag` varchar(65533) NULL COMMENT "publishing domain",
`z_org_type` varchar(65533) NULL COMMENT "original type",
`z_verify_type` varchar(65533) NULL COMMENT "verification type"
) ENGINE=OLAP
DUPLICATE KEY(`zmt_id`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`zmt_id`) BUCKETS 10
PROPERTIES (
"replication_num" = "3",
"in_memory" = "false",
"storage_format" = "DEFAULT",
"storage_medium" = "SSD"
);
CREATE TABLE `dwd_infoflow_article_instore_d` (
`pday` int(11) NULL COMMENT "",
`day_id` int(11) NULL COMMENT "",
`rurl` varchar(65533) NULL COMMENT "news ID",
`instore_time` varchar(65533) NULL COMMENT "ingestion time",
`instore_type` varchar(65533) NULL COMMENT "ingestion type",
`instore_status` varchar(65533) NULL COMMENT "ingestion status",
`zmt_id` varchar(65533) NULL COMMENT "author ID"
) ENGINE=OLAP
DUPLICATE KEY(`pday`)
COMMENT "OLAP"
PARTITION BY RANGE(`pday`)
(
PARTITION p20220423 VALUES [("20220423"), ("20220424")),
PARTITION p20220424 VALUES [("20220424"), ("20220425")),
PARTITION p20220425 VALUES [("20220425"), ("20220426")),
PARTITION p20220426 VALUES [("20220426"), ("20220427")),
PARTITION p20220427 VALUES [("20220427"), ("20220428"))
)
DISTRIBUTED BY HASH(`pday`) BUCKETS 10
PROPERTIES (
"replication_num" = "3",
"dynamic_partition.enable" = "true",
"dynamic_partition.time_unit" = "day",
"dynamic_partition.time_zone" = "Asia/Shanghai",
"dynamic_partition.start" = "-2147483648",
"dynamic_partition.end" = "3",
"dynamic_partition.prefix" = "p",
"dynamic_partition.buckets" = "10",
"in_memory" = "false",
"storage_format" = "DEFAULT",
"storage_medium" = "SSD"
);
**************************************************************************************************************************************
The dynamic-partition parameters in PROPERTIES are explained below:
dynamic_partition.enable: whether the dynamic-partition feature is on; true to enable, false to disable; enabled by default.
dynamic_partition.time_unit: scheduling granularity, one of day/week/month. (The suffix of auto-created partition names differs per granularity: day gives yyyyMMdd, e.g. 20220423; week gives yyyy_ww, e.g. 2021_40 for the 40th week of 2021; month gives yyyyMM, e.g. 202204.)
dynamic_partition.start: start of the dynamic-partition window, counted backwards from today in time units; partitions older than this are dropped. If omitted, it defaults to Integer.MIN_VALUE, i.e. -2147483648.
dynamic_partition.end: end of the dynamic-partition window, counted forwards from today; this many time units of partitions are created in advance.
dynamic_partition.prefix: name prefix for auto-created partitions.
dynamic_partition.buckets: number of buckets for auto-created partitions.
These properties can also be modified after the table is created, e.g. to disable dynamic partitioning:
alter table tableName set ("dynamic_partition.enable"="false");
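The partitions actually present can be verified with SHOW PARTITIONS, and tables with dynamic partitioning enabled can be listed per database (db_name below is a placeholder):
SHOW PARTITIONS FROM dwd_infoflow_article_instore_d;
SHOW DYNAMIC PARTITION TABLES FROM db_name;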
partition by range(day)(
start ("20220421") end ("20220423") every (interval 1 day)
)
is equivalent to (END is exclusive):
partition p20220421 values [("20220421"), ("20220422")),
partition p20220422 values [("20220422"), ("20220423"))
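The same START/END EVERY shorthand also works when adding partitions to an existing table, which is handy for backfill (dates below are placeholders; dynamic partitioning must be disabled first):
alter table dwd_infoflow_article_instore_d add partitions
start ("20220418") end ("20220421") every (interval 1 day);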
alter table dim_infoflow_authorinfo_test modify COLUMN z_ytag varchar(655); -- a varchar column can only be made longer, never shorter
************************************************************************************************************************************
The following type conversions are currently supported:
TINYINT/SMALLINT/INT/BIGINT to TINYINT/SMALLINT/INT/BIGINT/DOUBLE;
TINYINT/SMALLINT/INT/BIGINT/LARGEINT/FLOAT/DOUBLE/DECIMAL to VARCHAR;
VARCHAR supports increasing its maximum length;
VARCHAR to TINYINT/SMALLINT/INT/BIGINT/LARGEINT/FLOAT/DOUBLE;
VARCHAR to DATE (currently six formats are supported: "%Y-%m-%d", "%y-%m-%d", "%Y%m%d", "%y%m%d", "%Y/%m/%d", "%y/%m/%d");
DATETIME to DATE (only the year-month-day part is kept, e.g. `2019-12-09 21:47:05` -> `2019-12-09`);
DATE to DATETIME (hours/minutes/seconds are zero-filled, e.g. `2019-12-09` -> `2019-12-09 00:00:00`);
FLOAT to DOUBLE;
INT to DATE (if an INT value is invalid the conversion fails; the original data is unchanged).
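For example, widening an INT column to BIGINT (the first rule above) runs as an asynchronous schema-change job; a minimal sketch against a hypothetical table t with column c:
alter table t modify column c bigint;
show alter table column; -- check schema-change job progress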
HDFS -> StarRocks (Spark Load). Run in the StarRocks CLI:
LOAD LABEL middle.test_ly_lable1
(
DATA INFILE("/home/logsget/output/s.cloud/20230512/split/safei18n_ins_err-2023051213-00005.gz")
INTO TABLE `test_ly_hdfstoSR`
PARTITION (p1, p2)
COLUMNS TERMINATED BY ","
(t1,t2,t3)
SET (
t3=t1+t2
)
)
-- available resources can be listed with SHOW RESOURCES
WITH RESOURCE 'spark_shbt'
PROPERTIES
(
-- timeout: load job timeout in seconds; defaults to 4 hours
"timeout" = "3600",
-- max_filter_ratio: maximum tolerated ratio of rows filtered out (malformed data, etc.); defaults to 0 (no tolerance)
"max_filter_ratio" = "0.1"
);
Hive -> StarRocks
${starrocks_conn} -e "LOAD LABEL qdas_adb.${tablename}_${timestamp}
(
DATA INFILE(\"hdfs://xxx:9000/home/logsget/hive/warehouse/rd_xinxiliu.db/${table}/pday=${system.task.day}/*\")
INTO TABLE ${tablename}
)
# available brokers can be listed with SHOW BROKER
WITH BROKER 'broker_hdfs'
(
\"username\"=\"xxx\",
\"password\"=\"***\"
)
PROPERTIES
(
\"timeout\"=\"7200\",
\"max_filter_ratio\"=\"0.1\"
);"
Local file / data stream -> StarRocks (Stream Load over HTTP).
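A minimal Stream Load sketch (host, credentials, and file are placeholders; 8030 is the default FE HTTP port):
curl --location-trusted -u user:password \
    -H "label:test_ly_streamload1" \
    -H "column_separator:," \
    -T local_file.csv \
    http://fe_host:8030/api/middle/test_ly_hdfstoSR/_stream_load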
Kafka -> StarRocks (Routine Load).
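A Routine Load sketch for continuous Kafka ingestion (broker list, topic, and job name are placeholders):
CREATE ROUTINE LOAD middle.test_ly_kafka1 ON test_ly_hdfstoSR
COLUMNS TERMINATED BY ",",
COLUMNS (ip, time, pday)
PROPERTIES
(
"desired_concurrent_number" = "1"
)
FROM KAFKA
(
"kafka_broker_list" = "broker1:9092,broker2:9092",
"kafka_topic" = "topic_name",
"property.kafka_default_offsets" = "OFFSET_END"
);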
insert into test_ly_hdfstoSR PARTITION(p20230511) -- or "insert overwrite" to replace the partition's contents
WITH LABEL iiilytest
(ip,time,pday)
values('10.0.0.0','20000',20230511)
;
------------------------------------------------------------------------------------
Note: if the partition column is a regular column of the table, it must be explicitly listed in the column permutation on load, otherwise the load fails with:
1064 - 'pday' must be explicitly mentioned in column permutation, Time: 0.024000s
Check the load job status:
SHOW LOAD WHERE label="iiilytest"
********************** target table schema *****************************
CREATE TABLE `global_dict_infoflow_rurl` (
`id` int(11) NULL COMMENT "id",
`rurl` varchar(60) NULL COMMENT "news ID"
) ENGINE=OLAP
DUPLICATE KEY(`id`, `rurl`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`id`) BUCKETS 10
PROPERTIES (
"replication_num" = "3",
"in_memory" = "false",
"storage_format" = "DEFAULT",
"storage_medium" = "SSD"
);
***************************************************
The statements below maintain a global dictionary: every rurl not yet present in global_dict_infoflow_rurl is assigned a new consecutive id by continuing row_number() from the current maximum id.
set exec_mem_limit = 2147483648*100; -- raise the session memory limit for the large join
set query_timeout = 60*60*3; -- allow up to 3 hours
insert into global_dict_infoflow_rurl
select
(row_number() over(order by t1.rurl) + t3.max_id) as id, t1.rurl
from
(
-- candidate keys; distinct so each rurl is assigned exactly one id
select distinct rurl from t_dim_infoflow_articleinfonew
where rurl is not null
) t1
left join
(
select rurl, id from global_dict_infoflow_rurl
) t2
on t1.rurl = t2.rurl
cross join
(
-- coalesce covers the first run, when the dictionary is still empty
select coalesce(max(id), 0) as max_id from global_dict_infoflow_rurl
) t3
where t2.id is null; -- keep only rurl values not yet in the dictionary
Dynamic partitioning means partitions are created automatically as time moves forward; it does not create a partition on demand when matching data arrives. So to backfill historical data, the historical partitions must be created manually (disable dynamic partitioning first, then re-enable it):
ALTER TABLE rpt_qhsdk_151514_logshare_frontmidres1 SET("dynamic_partition.enable"="false");
ALTER table rpt_qhsdk_151514_logshare_frontmidres1 add partition p202401 values [("20230509"),("20240109"));
ALTER TABLE rpt_qhsdk_151514_logshare_frontmidres1 SET("dynamic_partition.enable"="true");