https://github.com/pascallin/user-behavior-docker
Create the following directory structure and files:
/usr/docker
|- confluent
|  |- docker-compose.yml
|  |- kafka-data
|  |  |- ...
|  |- schema-registry
|  |  |- connect-avro-standalone.properties
|  |- kafka-connect-elasticsearch
|  |  |- quickstart-elasticsearch.properties
|- elastic
|  |- docker-compose.yml
|  |- config
|  |  |- elasticsearch.yml
|  |  |- log4j2.properties
|  |- scripts
|  |  |- ...
|  |- esdata
|  |  |- ...
The deployment is split into two Docker Compose projects, Elasticsearch and Confluent, and each Compose project creates its own default network. The Confluent ES Connect service has to push data into the ES container that lives in the other Compose project, so the two networks are bridged by creating an external network that both projects join.
First, create a Docker network:
# list all networks
docker network ls
# create a bridge network
docker network create --driver bridge dev3network
# list networks again to confirm dev3network exists
docker network ls
The docker-compose.yml under /usr/docker/elastic:
version: '3'
services:
  elasticsearch:
    hostname: elasticsearch
    image: "elasticsearch:5"
    command: "elasticsearch -Enode.name='user-behavior'"
    volumes:
      - /usr/docker/elastic/esdata:/usr/share/elasticsearch/data
      - /usr/docker/elastic/config:/usr/share/elasticsearch/config
    ports:
      - "9201:9200"
      - "9301:9300"
    networks:
      - default
      - dev3network
networks:
  dev3network:
    external: true
The elasticsearch.yml file:
network.host: 0.0.0.0
The log4j2.properties file:
status = error
appender.console.type = Console
appender.console.name = console
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] %marker%m%n
rootLogger.level = info
rootLogger.appenderRef.console.ref = console
If Elasticsearch exits on startup with the error
max virtual memory areas vm.max_map_count [65530] likely too low, increase to at least [262144]
raise the limit on the host:
sudo sysctl -w vm.max_map_count=655360
The docker-compose.yml under /usr/docker/confluent:
version: '3'
services:
  zookeeper:
    image: confluentinc/cp-zookeeper:3.1.1
    hostname: zookeeper
    ports:
      - "2181:2181"
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000
  kafka:
    image: confluentinc/cp-kafka:3.1.1
    hostname: kafka
    ports:
      - "9092:9092"
    depends_on:
      - zookeeper
    volumes:
      - ./kafka-data:/var/lib/kafka/data
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      KAFKA_ADVERTISED_LISTENERS: 'PLAINTEXT://kafka:9092'
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
  schema_registry:
    image: confluentinc/cp-schema-registry:3.1.1
    hostname: schema_registry
    depends_on:
      - zookeeper
      - kafka
    ports:
      - "8081:8081"
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema_registry
      SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181'
  kafka_rest:
    image: confluentinc/cp-kafka-rest:3.1.1
    hostname: kafka_rest
    depends_on:
      - zookeeper
      - kafka
      - schema_registry
    ports:
      - "8082:8082"
    environment:
      KAFKA_REST_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      KAFKA_REST_SCHEMA_REGISTRY_URL: 'http://schema_registry:8081'
      KAFKA_REST_HOST_NAME: kafka_rest
      KAFKA_REST_DEBUG: 'true'
  connect:
    image: confluentinc/cp-kafka-connect:3.1.1
    hostname: connect
    depends_on:
      - zookeeper
      - kafka
      - schema_registry
      - kafka_rest
    external_links:
      - elastic_elasticsearch_1
    volumes:
      - ./kafka-connect-elasticsearch:/etc/kafka-connect-elasticsearch
      - ./schema-registry:/etc/schema-registry
    command: [
      "connect-standalone",
      "/etc/schema-registry/connect-avro-standalone.properties",
      "/etc/kafka-connect-elasticsearch/quickstart-elasticsearch.properties"
    ]
    environment:
      CONNECT_BOOTSTRAP_SERVERS: 'kafka:9092'
      CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      CONNECT_REST_ADVERTISED_HOST_NAME: connect
      CONNECT_REST_HOST_NAME: connect
      CONNECT_REST_PORT: 8083
      CONNECT_GROUP_ID: compose-connect-group
      CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs
      CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000
      CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets
      CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status
      CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
    networks:
      - default
      - dev3network
networks:
  dev3network:
    external: true
With key.ignore=true, the Elasticsearch sink derives each document's _id from topic+partition+offset. That is why the Kafka log directory is mapped to the host: if the container dies and the log is lost, offsets restart from zero, new records collide with _ids that already exist, and the data cannot be written. The host directory it maps to likewise needs chmod 777. The connector can run in two modes, standalone and distributed; here the simpler standalone mode is used.
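As a purely illustrative sketch (the helper name is made up, not part of the connector), the default document _id the sink writes can be thought of as:
// Illustration only: with key.ignore=true the Elasticsearch sink builds each
// document _id from the record's Kafka coordinates. If the broker's log is
// lost and offsets restart at 0, new records map onto _ids that already exist.
function defaultDocId(topic, partition, offset) {
  return `${topic}+${partition}+${offset}`
}
console.log(defaultDocId('user-behavior', 0, 42)) // "user-behavior+0+42"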
The file path is /usr/docker/confluent/schema-registry/connect-avro-standalone.properties:
# The converters specify the format of data in Kafka and how to translate it into Connect data.
# Every Connect user will need to configure these based on the format they want their data in
# when loaded from or stored into Kafka
key.converter=io.confluent.connect.avro.AvroConverter
key.converter.schema.registry.url=http://schema_registry:8081
value.converter=io.confluent.connect.avro.AvroConverter
value.converter.schema.registry.url=http://schema_registry:8081
# The internal converter used for offsets and config data is configurable and must be specified,
# but most users will always want to use the built-in default. Offset and config data is never
# visible outside of Connect in this format.
internal.key.converter=org.apache.kafka.connect.json.JsonConverter
internal.value.converter=org.apache.kafka.connect.json.JsonConverter
internal.key.converter.schemas.enable=false
internal.value.converter.schemas.enable=false
# Local storage file for offset data
offset.storage.file.filename=/tmp/connect.offsets
The path here is /usr/docker/confluent/kafka-connect-elasticsearch/quickstart-elasticsearch.properties:
name=elasticsearch-sink
connector.class=io.confluent.connect.elasticsearch.ElasticsearchSinkConnector
tasks.max=1
topics=user-behavior
key.ignore=true
connection.url=http://elastic_elasticsearch_1:9200
type.name=user-behavior
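Once both stacks are running (startup commands follow below), the state of this sink can be checked through the Connect worker's REST API. A hedged sketch: it assumes you also publish port 8083 on the connect service (e.g. "8083:8083"), which the compose file above does not do.
const rp = require('request-promise')

async function connectorStatus() {
  const base = 'http://localhost:8083'
  // List loaded connectors, then fetch the status of the elasticsearch-sink
  // connector defined in quickstart-elasticsearch.properties.
  console.log(await rp({ uri: `${base}/connectors`, json: true }))
  console.log(await rp({ uri: `${base}/connectors/elasticsearch-sink/status`, json: true }))
}

connectorStatus().catch(console.error)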
Start the two stacks:
cd /usr/docker/elastic
docker-compose up -d
cd /usr/docker/confluent
docker-compose up -d
# check the status of all containers
docker ps -a
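As a quick reachability check from the host (a sketch using request-promise and the port mappings above), Elasticsearch should answer on the remapped port 9201 and the REST proxy should list topics on 8082:
const rp = require('request-promise')

async function healthCheck() {
  // Elasticsearch banner with cluster and version info.
  console.log(await rp({ uri: 'http://localhost:9201', json: true }))
  // Topics currently known to the Kafka REST proxy.
  console.log(await rp({ uri: 'http://localhost:8082/topics', json: true }))
}

healthCheck().catch(console.error)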
Test sending data to the Kafka REST service with Node.js:
const rp = require('request-promise')
const co = require('co')

co(function* () {
  // Avro value schema for the user-behavior topic
  let schema = {
    "type": "record",
    "name": "UserBehavior",
    "fields": [
      { "name": "user_id", "type": "string" },
      { "name": "project_id", "type": "string" },
      { "name": "appid", "type": "string" },
      { "name": "entity_id", "type": "string" },
      { "name": "action", "type": "string" },
      { "name": "ip", "type": "string" },
      { "name": "options", "type": "string" },
      { "name": "date", "type": "long" }
    ]
  }
  // POST one record to the REST proxy using the Avro content type
  let opt = {
    method: 'POST',
    uri: "http://localhost:8082/topics/user-behavior",
    headers: {
      'Content-Type': 'application/vnd.kafka.avro.v1+json'
    },
    body: {
      value_schema: JSON.stringify(schema),
      records: [{
        "value": {
          "user_id": "5a0ea98d570d67c3ed85a73a",
          "project_id": "5a0ea98d570d67c3ed85a73b",
          "entity_id": "5a0ea98d570d67c3ed85a73d",
          "appid": "5a0ea98d570d67c3ed85a73c",
          "action": "view",
          "ip": "127.0.0.1",
          "options": "",
          "date": 1510910336855
        }
      }]
    },
    json: true
  }
  let result = yield rp(opt)
  console.log('-------------> result', result);
})
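After the first message is accepted, the Avro converter should have registered the value schema in the Schema Registry under the default subject name user-behavior-value. A small verification sketch:
const rp = require('request-promise')

async function inspectSchema() {
  const base = 'http://localhost:8081'
  // Subjects registered so far, e.g. [ 'user-behavior-value' ].
  console.log(await rp({ uri: `${base}/subjects`, json: true }))
  // Latest registered version of the value schema for the user-behavior topic.
  const latest = await rp({ uri: `${base}/subjects/user-behavior-value/versions/latest`, json: true })
  console.log(latest.schema)
}

inspectSchema().catch(console.error)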
Check the data in ES:
curl "http://localhost:9201/user-behavior/_search"
The SDK is bundled with webpack into a UMD build (ub-sdk.js). webpack.config.js:
const path = require('path');
const UglifyJSPlugin = require('uglifyjs-webpack-plugin');

module.exports = {
  entry: './src/index.js',
  output: {
    path: path.resolve(__dirname, 'dist'),
    filename: 'ub-sdk.js',
    library: 'UBSdk',
    libraryTarget: 'umd'
  },
  module: {
    rules: [
      { test: /\.js$/, exclude: /node_modules/, loader: "babel-loader" }
    ]
  },
  plugins: [
    new UglifyJSPlugin()
  ]
};
When the SDK submits data, it signs the request so that forged requests cannot inject dirty data. The signature algorithm: add secret and timestamp to the parameters, sort the parameter names, concatenate them into a key=value string, then compute an MD5 digest of that string. A JS example follows.
import MD5 from 'md5.js'

function setSignature(params) {
  params.projectId = ""  // project ID, assigned by the backend
  params.secret = ""     // secret for this project, assigned by the backend
  params.timestamp = Date.now()
  let sortedKey = Object.keys(params).sort();
  let str = "";
  for (let key of sortedKey) {
    str += key + "=" + params[key];
  }
  params.signature = new MD5().update(str).digest('hex')
  delete params.secret   // never send the secret itself
  return params
}
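On the receiving side the signature has to be recomputed and compared; that service is not shown in this post, so the following is only a sketch under assumptions: a secret store keyed by projectId and a five-minute replay window are both made up here. Node's built-in crypto produces the same MD5 digest as md5.js.
const crypto = require('crypto')

// Placeholder: a real service would look the secret up in a database.
const SECRETS = { 'some-project-id': 'some-project-secret' }

function verifySignature(params) {
  const rest = Object.assign({}, params)
  const signature = rest.signature
  delete rest.signature
  const secret = SECRETS[rest.projectId]
  if (!secret) return false
  // Reject stale requests to limit replay (the window is an assumption).
  if (Math.abs(Date.now() - Number(rest.timestamp)) > 5 * 60 * 1000) return false
  // Rebuild the signed string exactly as setSignature() does: add the secret,
  // sort the keys, concatenate key=value pairs, then MD5.
  const signed = Object.assign({}, rest, { secret: secret })
  const str = Object.keys(signed).sort().map(k => k + '=' + signed[k]).join('')
  return crypto.createHash('md5').update(str).digest('hex') === signature
}

module.exports = { verifySignature }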