## -Dsqoop.export.records.per.statement=500 is the key parameter here; it fixes the StarRocks "too many tablet versions" error
## Source: https://stackoverflow.com/questions/52453293/sqoop-export-for-100-million-records-faster
#!/bin/bash
hivedatabase="ods"
hivetable="nrms_catalogue"
#dt=`date -d ${ym} +%Y-%m`
#partition-values=""
#--hcatalog-partition-keys dt \
#--hcatalog-partition-values ${dt} \
starrocksip="192.168.174.62"
starrocksport="9030"
starrocksdatabase="ods"
starrocksusername="root"
starrockspassword="vcom123456"
starrockstable="t_urc_catalogue"
echo "-------------- 导出hive的${hivedatabase}下${hivetable} 表到starrocks的${starrocksdatabase}下的${starrockstable} ------------------------------"
nohup sqoop export \
-Dsqoop.export.records.per.statement=500 \
--hcatalog-database ${hivedatabase} \
--hcatalog-table ${hivetable} \
--connect "jdbc:mysql://${starrocksip}:${starrocksport}/${starrocksdatabase}" \
--username ${starrocksusername} \
--password ${starrockspassword} \
--table ${starrockstable}
echo "##执行完毕------------ 导出hive的${hivedatabase}下${hivetable} 表到starrocks的${starrocksdatabase}下的${starrockstable} ------------------------------"
#!/bin/bash
for s in `hdfs dfs -ls /data/hive/warehouse/ods.db/ucs_statistic_login_day_student2/|grep -v "items" |awk -F= '{print $NF}'`
do
echo "##SSSSSSSSS####开始同步ucs_statistic_login_day_student2/server_node=${s}/分区数据-----------------------------------"
for d in `hdfs dfs -ls /data/hive/warehouse/ods.db/ucs_statistic_login_day_student2/server_node="$s"/|grep -v "items" |awk -F= '{print $NF}'`
do
echo "######开始同步ucs_statistic_login_day_student2/server_node=${s}/dt=${d}/分区数据-----------------------------------"
\cp -rf /opt/module/datax/job/hive2starrocks.json /opt/module/datax/job/job/ucs_statistic_login_day_student2-${s}-${d}-hive2starrocks.json
sed -i -e "s/SSS/${s}/g" -e "s/DDD/${d}/g" /opt/module/datax/job/job/ucs_statistic_login_day_student2-${s}-${d}-hive2starrocks.json
chown 502:games /opt/module/datax/job/job/ucs_statistic_login_day_student2-${s}-${d}-hive2starrocks.json
python /opt/module/datax/bin/datax.py --jvm="-Xms3G -Xmx3G" --loglevel=debug /opt/module/datax/job/job/ucs_statistic_login_day_student2-${s}-${d}-hive2starrocks.json
echo "######结束同步ucs_statistic_login_day_student2/server_node=${s}/dt=${d}/分区数据-----------------------------------"
done
echo "##SSSSSSSSS####结束同步ucs_statistic_login_day_student2/server_node=${s}/分区数据-----------------------------------"
done
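## Hedged addition (not in the original loop): datax.py normally exits non-zero when a job fails, so the
## inner loop can check the exit code and stop instead of silently moving on to the next partition, e.g.:
#python /opt/module/datax/bin/datax.py --jvm="-Xms3G -Xmx3G" /opt/module/datax/job/job/ucs_statistic_login_day_student2-${s}-${d}-hive2starrocks.json
#if [ $? -ne 0 ]; then
#    echo "DataX job failed for server_node=${s}/dt=${d}, aborting" >&2
#    exit 1
#fi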
### 2.1 hadoopConfig must be filled in; its values come from hdfs-site.xml. Dropping hdfs-site.xml into datax/conf (even with owner and permissions fixed) does not work, and without hadoopConfig the job fails with "nameservice1 not found". (The values can also be read with hdfs getconf, see the sketch after these notes.)
### 2.2 hdfsreader can only read the files under a partition directory, not the Hive table itself; the partition columns can be supplied by adding constant entries (value/type) to column, as in the config below.
### 2.3 The Hive delimiter "\001" corresponds to "fieldDelimiter": "\u0001" in DataX.
### 2.4 DataX does not ship with hivereader or starrockswriter; download them from the location given on the StarRocks site and put them under the matching datax/plugin directories. The writer config must follow the StarRocks docs, not the DataX docs.
### 2.5
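## Sketch for note 2.1 (assumes the hdfs client command is configured on this node): the hadoopConfig
## values can be read from the live cluster config instead of opening hdfs-site.xml by hand.
hdfs getconf -confKey dfs.nameservices
hdfs getconf -confKey dfs.ha.namenodes.nameservice1
hdfs getconf -confKey dfs.namenode.rpc-address.nameservice1.namenode276
hdfs getconf -confKey dfs.namenode.rpc-address.nameservice1.namenode301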
{
"job": {
"setting": {
"speed": {
"channel": 3
}
},
"content": [
{
"reader": {
"name": "hdfsreader",
"parameter": {
"hadoopConfig":{
"dfs.client.failover.proxy.provider.nameservice1": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
"dfs.ha.namenodes.nameservice1": "namenode276,namenode301",
"dfs.namenode.rpc-address.nameservice1.namenode276": "cs-cdh61:8020",
"dfs.namenode.rpc-address.nameservice1.namenode301": "cs-cdh62:8020",
"dfs.nameservices": "nameservice1"
},
"path": "/data/hive/warehouse/ods.db/ucs_statistic_login_day_student2/server_node=SSS/dt=DDD/*",
"defaultFS": "hdfs://nameservice1",
"column": [
{
"value": "SSS",
"type": "string"
},
{
"value": "DDD",
"type": "string"
},
{
"index": 0,
"type": "string"
},
{
"index": 1,
"type": "string"
},
{
"index": 2,
"type": "string"
},
{
"index": 3,
"type": "string"
},
{
"index": 4,
"type": "string"
},
{
"index": 5,
"type": "string"
},
{
"index": 6,
"type": "string"
},
{
"index": 7,
"type": "string"
},
{
"index": 8,
"type": "string"
},
{
"index": 9,
"type": "string"
},
{
"index": 10,
"type": "string"
},
{
"index": 11,
"type": "string"
},
{
"index": 12,
"type": "string"
},
{
"index": 13,
"type": "long"
},
{
"index": 14,
"type": "long"
},
{
"index": 15,
"type": "long"
},
{
"index": 16,
"type": "date"
},
{
"index": 17,
"type": "long"
},
{
"index": 18,
"type": "long"
},
{
"index": 19,
"type": "long"
},
{
"index": 20,
"type": "long"
},
{
"index": 21,
"type": "long"
},
{
"index": 22,
"type": "long"
},
{
"index": 23,
"type": "long"
},
{
"index": 24,
"type": "date"
},
{
"index": 25,
"type": "date"
}
],
"fileType": "text",
"encoding": "UTF-8",
"fieldDelimiter": "\u0001"
}
},
"writer": {
"name": "starrockswriter",
"parameter": {
"username": "root",
"password": "vcom123456",
"database": "ods",
"table": "hive_ods_ucs_statistic_login_day_student2",
"column": ["server_node", "dt", "id", "area_id", "school_id", "school_name", "class_id", "study_stage_code", "grade_code", "grade_name", "class_name", "class_create_year", "real_name_class_teacher", "user_name", "real_name", "year", "month", "day", "date1", "week", "count_login", "online_time", "count_login_pc", "online_time_pc", "count_login_app", "online_time_app", "create_time", "update_time"],
"preSql": [],
"postSql": [],
"jdbcUrl": "jdbc:mysql://192.168.174.62:9030/",
"loadUrl": ["192.168.174.62:18030"],
"loadProps": {}
}
}
}
]
}
}
# The dependencies of this plugin are far too old; they can no longer be downloaded or packaged, so this approach was abandoned!!!
{
"job": {
"setting": {
"speed": {
"channel": 3
}
},
"content": [
{
"reader": {
"name": "hdfsreader",
"parameter": {
"hadoopConfig":{
"dfs.client.failover.proxy.provider.nameservice1": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
"dfs.ha.namenodes.nameservice1": "namenode276,namenode301",
"dfs.namenode.rpc-address.nameservice1.namenode276": "cs-cdh61:8020",
"dfs.namenode.rpc-address.nameservice1.namenode301": "cs-cdh62:8020",
"dfs.nameservices": "nameservice1"
},
"path": "/data/hive/warehouse/ods.db/ucs_statistic_login_day_teacher2/*",
"defaultFS": "hdfs://nameservice1",
"column": ["*"],
"fileType": "text",
"encoding": "UTF-8",
"fieldDelimiter": "\u0001"
}
},
"writer": {
"name": "starrockswriter",
"parameter": {
"username": "root",
"password": "vcom123456",
"database": "ods",
"table": "hive_ods_ucs_statistic_login_day_student2",
"column": ["server_node", "dt", "id", "area_id", "school_id", "school_name", "class_id", "study_stage_code", "grade_code", "grade_name", "class_name", "class_create_year", "real_name_class_teacher", "user_name", "real_name", "year", "month", "day", "date1", "week", "count_login", "online_time", "count_login_pc", "online_time_pc", "count_login_app", "online_time_app", "create_time", "update_time"],
"preSql": [],
"postSql": [],
"jdbcUrl": "jdbc:mysql://192.168.174.62:9030/",
"loadUrl": ["192.168.174.62:18030"],
"loadProps": {}
}
}
}
]
}
}
-- Flatten the partitioned table into a non-partitioned tmp table so that server_node and dt are
-- written into the data files themselves and hdfsreader's column ["*"] picks them up (see note 2.2).
CREATE TABLE `ods.ucs_statistic_login_day_teacher2_tmp`(
`server_node` string COMMENT 'platform partition',
`dt` string COMMENT 'daily partition',
`id` string COMMENT 'ID',
`area_id` string COMMENT 'region ID',
`school_id` string COMMENT 'school ID',
`school_name` string COMMENT 'school name',
`user_name` string COMMENT 'account',
`real_name` string COMMENT 'real name',
`year` int COMMENT 'year',
`month` int COMMENT 'month',
`day` int COMMENT 'day',
`date1` timestamp COMMENT 'field from the time-period wide table: the year-month-day of the current time, format yyyy-MM-dd, e.g. 2021-01-10',
`week` int COMMENT 'week',
`count_login` bigint COMMENT 'login count',
`online_time` bigint COMMENT 'online time (minutes)',
`count_login_pc` bigint COMMENT 'login count - PC',
`online_time_pc` bigint COMMENT 'online time (minutes) - PC',
`count_login_app` bigint COMMENT 'login count - APP',
`online_time_app` bigint COMMENT 'online time (minutes) - APP',
`count_login_teach_pc` bigint COMMENT 'login count - teaching client',
`online_time_teach_pc` bigint COMMENT 'online time (minutes) - teaching client',
`create_time` timestamp COMMENT 'creation time',
`update_time` timestamp COMMENT 'update time')
COMMENT 'teacher daily login counts (tmp)'
row format delimited fields terminated by "\001"
lines terminated by '\012'
;
insert into table ods.ucs_statistic_login_day_teacher2_tmp
select
server_node,
dt,
id,
area_id,
school_id,
school_name,
user_name,
real_name,
year,
month,
day,
date1,
week,
count_login,
online_time,
count_login_pc,
online_time_pc,
count_login_app,
online_time_app,
count_login_teach_pc,
online_time_teach_pc,
create_time,
update_time
from ods.ucs_statistic_login_day_teacher2;
SELECT COUNT(*) from ods.ucs_statistic_login_day_teacher2 usldt ; -- 29822916;
SELECT COUNT(*) from ods.ucs_statistic_login_day_teacher2_tmp usldtt ; -- 29822916;
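## Hedged example of driving the tmp-table step from the shell (assumes the hive CLI is available here;
## the file name ddl_and_insert.sql is hypothetical and would hold the CREATE TABLE / INSERT statements above):
#hive -f ddl_and_insert.sql
#hive -e "SELECT COUNT(*) FROM ods.ucs_statistic_login_day_teacher2_tmp;"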
{
"job": {
"setting": {
"speed": {
"channel": 7
}
},
"content": [
{
"reader": {
"name": "hdfsreader",
"parameter": {
"hadoopConfig":{
"dfs.client.failover.proxy.provider.nameservice1": "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider",
"dfs.ha.namenodes.nameservice1": "namenode276,namenode301",
"dfs.namenode.rpc-address.nameservice1.namenode276": "cs-cdh61:8020",
"dfs.namenode.rpc-address.nameservice1.namenode301": "cs-cdh62:8020",
"dfs.nameservices": "nameservice1"
},
"path": "/data/hive/warehouse/ods.db/ucs_statistic_login_day_teacher2/*",
"defaultFS": "hdfs://nameservice1",
"column": ["*"],
"fileType": "text",
"encoding": "UTF-8",
"fieldDelimiter": "\u0001"
}
},
"writer": {
"name": "starrockswriter",
"parameter": {
"username": "root",
"password": "vcom123456",
"database": "ods",
"table": "hive_ods_ucs_statistic_login_day_student2",
"column": [
"server_node",
"dt",
"id",
"area_id",
"school_id",
"school_name",
"user_name",
"real_name",
"year",
"month",
"day",
"date1",
"week",
"count_login",
"online_time",
"count_login_pc",
"online_time_pc",
"count_login_app",
"online_time_app",
"count_login_teach_pc",
"online_time_teach_pc",
"create_time",
"update_time"
],
"preSql": [],
"postSql": [],
"jdbcUrl": "jdbc:mysql://192.168.174.62:9030/",
"loadUrl": ["192.168.174.62:18030"],
"loadProps": {}
}
}
}
]
}
}
#!/bin/bash
python /opt/module/datax/bin/datax.py --jvm="-Xms10G -Xmx10G" --loglevel=debug /home/qy/hive2starrocks/datax-hivetable2tmp2starrocks/hivetmptable2starrocks.json
Job started at       : 2023-08-26 09:53:24
Job finished at      : 2023-08-26 09:55:56
Total elapsed time   : 151s
Average throughput   : 30.11MB/s
Record write speed   : 198819rec/s
Total records read   : 29822916
Read/write failures  : 0
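## Hedged follow-up check (assumes a mysql client is installed; StarRocks answers MySQL-protocol queries
## on the FE port 9030 already used in the jdbcUrl above): confirm the 29822916 rows landed in the target table.
mysql -h 192.168.174.62 -P 9030 -u root -pvcom123456 \
      -e "SELECT COUNT(*) FROM ods.hive_ods_ucs_statistic_login_day_student2;"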