**
**
1.准备spark2.4以上,并解压
2.准备waterdrop1.4以上,并解压
3.vim config/waterdrop-env.sh
#指定Spark安装路径
SPARK_HOME=${SPARK_HOME:-/usr/local/spark-2.4.3-bin-hadoop2.7}
4.把hive/config下的hive-site.xml复制到hadoop的conf目录之下(这步可能不需要)
5.把hive/config下的hive-site.xml复制到spark的conf目录之下
6.clickhouse中的测试表
先建一个库
-- NOTE(review): Ordinary is the legacy database engine; newer ClickHouse defaults to Atomic — confirm target version.
CREATE DATABASE cms ENGINE = Ordinary;
再建一个表
-- ClickHouse target table for the nginx access-log rows loaded from Hive.
-- NOTE(review): Int16 tops out at 32767 — confirm data_size fits; widen to Int32/UInt64 if responses can be larger.
CREATE TABLE cms.cms_msg
(
    date         Date,
    hostname     String,
    domain       String,
    remote_addr  String,
    request_time Float32,
    datetime     String,
    url          String,
    status       Int16,
    data_size    Int16,
    user_agent   String,
    minute       String
)
ENGINE = MergeTree
PARTITION BY date
ORDER BY (date, hostname)
SETTINGS index_granularity = 16384;
7.hive中的测试表
-- Hive source table, partitioned by calendar date.
-- `date` is a reserved keyword in Hive 1.2+, so it is quoted with backticks
-- in DDL and in every query that references the partition column.
CREATE TABLE test.nginx_msg_detail
(
    hostname     string,
    domain       string,
    remote_addr  string,
    request_time float,
    datetime     string,
    url          string,
    status       int,
    data_size    int,
    referer      string,
    cookie_info  string,
    user_agent   string,
    minute       string
)
PARTITIONED BY (`date` string);
#往hive表中插入一些数据
-- Seed row for partition 2020-03-09.
-- Fix: the original used curly typographic quotes ('…'), which Hive cannot
-- parse; string literals must use straight ASCII single quotes. The reserved
-- partition column `date` is backticked.
INSERT INTO TABLE test.nginx_msg_detail PARTITION (`date` = '2020-03-09')
VALUES (
    'host01',
    'net',
    '北京',
    12.00,
    '2020-02-09',
    'www.baidu.com',
    1,                -- status (original wrote 001; leading zeros are dropped)
    32,
    'zhangsan',
    'huswekiiflw1290',
    'lishi',
    '30'
);
-- Second seed row for partition 2020-03-09.
-- Fix: curly typographic quotes replaced with straight ASCII quotes (Hive
-- rejects the former); reserved partition column `date` backticked.
INSERT INTO TABLE test.nginx_msg_detail PARTITION (`date` = '2020-03-09')
VALUES (
    'host02',
    'com',
    '天津',
    12.00,
    '2020-02-10',
    'www.JD.com',
    2,                -- status (original wrote 002; leading zeros are dropped)
    35,
    'wangwu',
    'huswekiiflw1200',
    'zhaoliu',
    '40'
);
#读取测试是否有结果
配置文件vim batch.conf
# Smoke-test pipeline: read the Hive table and print a JSON sample to stdout.
# Fix: the original used curly typographic quotes ("…"), which are not valid
# HOCON string delimiters; all strings now use straight ASCII double quotes.
spark {
  # Use the Hive metastore as Spark's catalog so pre_sql can see test.*.
  spark.sql.catalogImplementation = "hive"
  spark.app.name = "Waterdrop"
  spark.executor.instances = 1
  spark.executor.cores = 1
  spark.executor.memory = "1g"
}
input {
  hive {
    pre_sql = "select * from test.nginx_msg_detail"
    table_name = "access_log"
  }
}
filter {
  # Drop columns the ClickHouse table does not carry.
  remove {
    source_field = ["cookie_info", "referer"]
  }
}
output {
  stdout {
    limit = 10
    serializer = "json"
  }
}
配置文件vim batch.conf
# Production pipeline: Hive -> ClickHouse (cms.cms_msg).
# Fixes: curly typographic quotes replaced with straight ASCII quotes (HOCON
# requirement); "data_size" added to fields — the ClickHouse table declares it,
# and omitting it would silently leave the column at its default value.
spark {
  spark.sql.catalogImplementation = "hive"
  spark.app.name = "Waterdrop"
  spark.executor.instances = 2
  spark.executor.cores = 2
  spark.executor.memory = "2g"
}
input {
  hive {
    pre_sql = "select * from test.nginx_msg_detail"
    table_name = "access_log"
  }
}
filter {
  # Drop columns the ClickHouse table does not carry.
  remove {
    source_field = ["cookie_info", "referer"]
  }
}
output {
  clickhouse {
    # 8123 is the ClickHouse HTTP port; host/credentials redacted in the post.
    host = "***.***.***.***:8123"
    database = "cms"
    table = "cms_msg"
    fields = ["date", "hostname", "domain", "remote_addr", "request_time", "datetime", "url", "status", "data_size", "user_agent", "minute"]
    username = "******"
    password = "******"
  }
}
#最后执行
sh /software/waterdrop/bin/start-waterdrop.sh --config /software/waterdrop/config/batch.conf -e client -m 'local[2]'
#如果想动态传参数,可以在input的pre_sql里加一个条件变量
input {
  hive {
    # ${dt} is substituted by start-waterdrop.sh from "-i dt=2020-05-01";
    # the string is split around the variable so the value ends up wrapped
    # in single quotes inside the SQL.
    # Fixes: curly quotes replaced with straight quotes, and the filter column
    # changed from dt to `date` — the table's partition column is `date`
    # (backticked because it is a Hive reserved keyword).
    pre_sql = "select * from test.nginx_msg_detail where `date` = '"${dt}"'"
    table_name = "access_log"
  }
}
#执行时
sh /software/waterdrop/bin/start-waterdrop.sh --config /software/waterdrop/config/batch.conf -e client -m 'local[2]' -i dt=2020-05-01
文章来源:https://blog.csdn.net/huochen1994/article/details/84594508