ZooKeeper 是一个为分布式应用提供一致性服务的软件,提供的功能包括:配置维护、域名服务、分布式同步、组服务等。
# Install ZooKeeper 3.5.9 under /software/server/zookeeper.
# -p creates missing parent directories and does not fail when the directory
# already exists, so these steps can be re-run safely.
mkdir -p /software/server/zookeeper
tar -zxvf /software/package/apache-zookeeper-3.5.9-bin.tar.gz -C /software/server/zookeeper
mkdir -p /software/server/zookeeper/apache-zookeeper-3.5.9-bin/data
# Create/edit the server configuration file (interactive editor).
vim /software/server/zookeeper/apache-zookeeper-3.5.9-bin/conf/zoo.cfg
tickTime=2000
# ZooKeeper data directory: the "data" directory created inside the
# unpacked distribution during installation.
dataDir=/software/server/zookeeper/apache-zookeeper-3.5.9-bin/data
clientPort=2181
initLimit=5
syncLimit=2
# server.1=centos7:2888:3888
# Location of the unpacked ZooKeeper distribution.
ZK_HOME=/software/server/zookeeper/apache-zookeeper-3.5.9-bin

# Start the server.
# To stop it again: "${ZK_HOME}/bin/zkServer.sh" stop
"${ZK_HOME}/bin/zkServer.sh" start

# List running JVM processes. Sample output recorded in these notes:
#   3442 QuorumPeerMain
jps

# Open the interactive ZooKeeper command-line client. Sample session:
#   [zk: localhost:2181(CONNECTED) 0] ls /
#   [zookeeper]
#   quit    (leaves the client)
"${ZK_HOME}/bin/zkCli.sh"
# Install Kafka 2.4.1 (Scala 2.11 build) under /software/server/kafka.
# -p creates missing parent directories and does not fail when the directory
# already exists, so these steps can be re-run safely.
mkdir -p /software/server/kafka
tar -zxvf /software/package/kafka_2.11-2.4.1.tgz -C /software/server/kafka
mkdir -p /software/server/kafka/kafka_2.11-2.4.1/data
# Edit the broker configuration file (interactive editor).
vim /software/server/kafka/kafka_2.11-2.4.1/config/server.properties
zookeeper.connect=centos7:2181
advertised.listeners=PLAINTEXT://centos7:9092
# Must match the installed distribution (kafka_2.11-2.4.1) and the "data"
# directory created inside it during installation.
log.dirs=/software/server/kafka/kafka_2.11-2.4.1/data
# Location of the unpacked Kafka distribution.
KAFKA_HOME=/software/server/kafka/kafka_2.11-2.4.1

# Foreground: keeps the terminal attached so the broker log can be watched.
"${KAFKA_HOME}/bin/kafka-server-start.sh" "${KAFKA_HOME}/config/server.properties"

# Background.
nohup "${KAFKA_HOME}/bin/kafka-server-start.sh" "${KAFKA_HOME}/config/server.properties" 2>&1 &

# Sample process listing recorded in these notes:
#   3442 QuorumPeerMain
#   3832 Kafka
#   4236 Jps
# Create topic test01 with 3 partitions and a replication factor of 1.
/software/server/kafka/kafka_2.11-2.4.1/bin/kafka-topics.sh --create --topic test01 --zookeeper centos7:2181 --partitions 3 --replication-factor 1

# List all topics. Sample output:
#   test01
/software/server/kafka/kafka_2.11-2.4.1/bin/kafka-topics.sh --list --zookeeper centos7:2181

# Describe topic test01. Sample output (kept as comments so it is never
# executed as commands):
#   Topic: test01 PartitionCount: 3 ReplicationFactor: 1 Configs:
#   Topic: test01 Partition: 0 Leader: 0 Replicas: 0 Isr: 0
#   Topic: test01 Partition: 1 Leader: 0 Replicas: 0 Isr: 0
#   Topic: test01 Partition: 2 Leader: 0 Replicas: 0 Isr: 0
/software/server/kafka/kafka_2.11-2.4.1/bin/kafka-topics.sh --describe --zookeeper centos7:2181 --topic test01

# Delete topic test01. Sample output:
#   Topic test01 is marked for deletion.
# The topic's partition directories inside the broker's data directory
# (log.dirs) disappear shortly afterwards.
/software/server/kafka/kafka_2.11-2.4.1/bin/kafka-topics.sh --delete --zookeeper centos7:2181 --topic test01
# Directory holding the Kafka command-line tools.
KAFKA_BIN=/software/server/kafka/kafka_2.11-2.4.1/bin

# Create the topic.
"${KAFKA_BIN}/kafka-topics.sh" --create --topic test01 --zookeeper centos7:2181 --partitions 3 --replication-factor 1

# Simulated producer (node1).
"${KAFKA_BIN}/kafka-console-producer.sh" --broker-list centos7:9092 --topic test01

# Simulated consumer (node3).
"${KAFKA_BIN}/kafka-console-consumer.sh" --bootstrap-server centos7:9092 --topic test01
# Install the kafka-python client from the Tsinghua PyPI mirror.
python -m pip install kafka-python \
  -i https://pypi.tuna.tsinghua.edu.cn/simple

# Start a console consumer on topic test01.
/software/server/kafka/kafka_2.11-2.4.1/bin/kafka-console-consumer.sh \
  --bootstrap-server centos7:9092 \
  --topic test01
发送hello kafka...
from kafka import KafkaProducer

if __name__ == '__main__':
    # Create the producer. acks=-1 is the numeric form of acks='all'.
    producer = KafkaProducer(
        bootstrap_servers=['centos7:9092'],
        acks=-1
    )
    try:
        # Produce one record to topic 'test01' and block until the send
        # completes, or fail after 10 seconds instead of hanging forever.
        # ASCII text can be written as a bytes literal; for non-ASCII text
        # (e.g. Chinese) use str.encode('UTF-8').
        producer.send('test01', b'hello kafka...').get(timeout=10)
    finally:
        # Close the producer even when the send fails.
        producer.close()
下载地址:https://nowjava.com/jar/
kafka-clients-2.4.1.jar
spark-sql-kafka-0-10_2.11-2.4.1.jar
zstd-jni-0.4.3.jar
将以上三个 jar 包放入 Spark 安装目录下的 jars 目录(/software/server/spark/spark-2.4.5-bin-hadoop2.7/jars)
# Start a console producer that writes to topic test01.
/software/server/kafka/kafka_2.11-2.4.1/bin/kafka-console-producer.sh \
  --broker-list centos7:9092 \
  --topic test01
输出终端
import os

# Environment for the local Spark / Python / Java installations. These are
# assigned before pyspark is imported, preserving the original order.
os.environ['SPARK_HOME'] = '/software/server/spark/spark-2.4.5-bin-hadoop2.7'
os.environ['PYSPARK_PYTHON'] = '/software/server/miniconda3/bin/python3.7'
os.environ['JAVA_HOME'] = '/software/server/java/jdk1.8.0_221'

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

if __name__ == '__main__':
    print("kafka和spark的集成")

    # Build (or reuse) the Spark session with 4 shuffle partitions.
    spark = (
        SparkSession.builder
        .appName('wd')
        .config("spark.sql.shuffle.partitions", 4)
        .getOrCreate()
    )

    # Streaming source: subscribe to Kafka topic 'test01' on centos7:9092.
    df = (
        spark.readStream
        .format('kafka')
        .option('subscribe', 'test01')
        .option('kafka.bootstrap.servers', 'centos7:9092')
        .load()
    )

    # Cast the 'value' column to string and keep only that column.
    df = df.withColumn('value', F.expr("cast(value as string)")).select('value')

    # Print each appended row to the console and block until the query ends.
    df.writeStream.outputMode('append').format('console').start().awaitTermination()
java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.kafka010.KafkaSourceProvider could not be instantiated
原因:使用了 org.apache.spark:spark-sql-kafka-0-10_2.12-2.4.1(基于 Scala 2.12 构建),而 spark-2.4.5-bin-hadoop2.7 预编译包基于 Scala 2.11,两者的 Scala 版本不一致;
另有资料称该包在编写时似乎不稳定(https://www.cnpython.com/qa/149756)
解决:安装 kafka_2.11-2.4.1,并使用 spark-sql-kafka-0-10_2.11-2.4.1.jar