# Start ZooKeeper
zkServer.sh start
# Start Kafka
kafka-server-start.sh /opt/soft/kafka211/config/server.properties
cd /opt/flumeconf
vi conf_08011_kafka.properties
# conf_08011_kafka.properties configuration
a11.channels=c11
a11.sources=s11
a11.sinks=k11
a11.sources.s11.type=spooldir
a11.sources.s11.spoolDir=/opt/retail_db-csv
a11.sources.s11.interceptors=head_filter
a11.sources.s11.interceptors.head_filter.type=regex_filter
a11.sources.s11.interceptors.head_filter.regex=^user.*
a11.sources.s11.interceptors.head_filter.excludeEvents=true
a11.sources.s11.deserializer.maxLineLength=60000
a11.sinks.k11.type=org.apache.flume.sink.kafka.KafkaSink
a11.sinks.k11.kafka.bootstrap.servers=192.168.56.120:9092
a11.sinks.k11.kafka.topic=userfriedns
a11.channels.c11.type=memory
a11.channels.c11.capacity=60000
a11.channels.c11.transactionCapacity=60000
a11.sinks.k11.channel=c11
a11.sources.s11.channels=c11
kafka-topics.sh --create --zookeeper 192.168.56.120:2181 --topic userfriedns --replication-factor 1 --partitions 1
flume-ng agent -n a11 -c conf -f /opt/flumeconf/conf_08011_kafka.properties
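Optionally, the output topic that the streaming job later writes back to (kafkaSink) can be created the same way, and a console consumer is a quick way to confirm that Flume is delivering the CSV lines into userfriedns; host and port follow the settings used above.
# Optional: create the output topic the streaming job writes back to
kafka-topics.sh --create --zookeeper 192.168.56.120:2181 --topic kafkaSink --replication-factor 1 --partitions 1
# Check that Flume is delivering events into the input topic
kafka-console-consumer.sh --bootstrap-server 192.168.56.120:9092 --topic userfriedns --from-beginning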
Wrap the KafkaProducer, then broadcast it to each Executor.
package com.njbdqn.myspark.kafka_sparksteaming.demo1
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

class KafkaSinks[K,V](fc:()=>KafkaProducer[K,V]) extends Serializable {
  // Create the producer lazily on the executor to avoid a NotSerializableException at runtime
  lazy val producer = fc()

  // Write a keyed record to Kafka
  def send(topic:String,key:K,value:V)={
    producer.send(new ProducerRecord[K,V](topic,key,value))
  }

  // Write a record without a key to Kafka
  def send(topic:String,value:V)={
    producer.send(new ProducerRecord[K,V](topic,value))
  }
}

object KafkaSinks{
  // Import the automatic Scala <-> Java collection conversions
  import scala.collection.JavaConversions._

  // Here the scala.collection.Map is converted implicitly to java.util.Map
  def apply[K,V](conf:Map[String,String]): KafkaSinks[K,V] = {
    val func = ()=>{
      // Create the KafkaProducer (scala.collection.Map => java.util.Map, since the constructor needs a java.util.Map)
      val prod = new KafkaProducer[K,V](conf)
      // Hook that runs when the JVM exits: make sure the KafkaProducer writes everything still buffered
      // to Kafka before the executor's JVM shuts down; close() blocks until all previously sent requests complete
      sys.addShutdownHook{
        prod.close()
      }
      prod
    }
    new KafkaSinks[K,V](func)
  }

  // Implicit conversion java.util.Properties => scala.collection.mutable.Map[String, String],
  // then Map.toMap => scala.collection.immutable.Map
  def apply[K,V](conf:Properties): KafkaSinks[K,V] = apply(conf.toMap)
}
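For a quick sanity check of the wrapper on its own (outside Spark), a minimal sketch along these lines can be used; the object name KafkaSinksLocalTest is only illustrative, and the broker address and topic reuse the values from this walkthrough.
package com.njbdqn.myspark.kafka_sparksteaming.demo1

import org.apache.kafka.clients.producer.ProducerConfig
import org.apache.kafka.common.serialization.StringSerializer

object KafkaSinksLocalTest {
  def main(args: Array[String]): Unit = {
    // Producer config for a local smoke test; broker address as used above
    val conf = Map[String, String](
      ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.56.120:9092",
      ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer].getName,
      ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer].getName
    )
    val sink = KafkaSinks[String, String](conf)
    // Send one record without a key; the producer is created lazily on first use,
    // and the shutdown hook registered in KafkaSinks flushes and closes it when the JVM exits
    sink.send("kafkaSink", "hello from KafkaSinks")
  }
}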
package com.njbdqn.myspark.kafka_sparksteaming.demo1
import org.apache.kafka.clients.producer.ProducerConfig
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.SparkSession
object MySingleBaseDAO {
  /**
   * Double-checked locking: only one Broadcast instance is ever created
   */
  @volatile private var instance:Broadcast[KafkaSinks[String,String]]=null

  // Many threads can call this, but only the first one past both null checks creates the instance (it starts out null)
  def getInstance() = {
    if (instance == null) {
      // Get the SparkContext that will own the broadcast variable
      val sc = SparkSession.builder().appName("writerKafka")
        .master("local[2]").getOrCreate().sparkContext
      synchronized{
        if (instance == null){
          val kafkaParms=Map[String,String](
            ProducerConfig.BOOTSTRAP_SERVERS_CONFIG->"192.168.56.120:9092",
            ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG->classOf[StringSerializer].getName,
            ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG->classOf[StringSerializer].getName
          )
          // Wrap the producer config in KafkaSinks and broadcast it so every executor gets a copy
          instance = sc.broadcast(KafkaSinks[String,String](kafkaParms))
        }
      }
    }
    instance
  }
}
package com.njbdqn.myspark.kafka_sparksteaming.demo1
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object MyReadKafkaHandler {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("name")
    val sc = new SparkContext(conf)
    // Streaming context with a 10-second batch interval
    val ssc = new StreamingContext(sc,Seconds(10))
    // Connection parameters for the Kafka consumer
    val kafkaParam = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG->"192.168.56.120:9092",
      ConsumerConfig.GROUP_ID_CONFIG->"mykafka14",
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG->"true",
      ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG->"20000",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG->classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG->classOf[StringDeserializer],
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG->"earliest"
    )
    // Create the direct stream
    val streams = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set("userfriedns"), kafkaParam))
    // Read the data and do some simple processing: keep the first CSV field as the key
    // and pair it with each space-separated token of the second field
    val value = streams.map(_.value).filter(_.split(",").size > 1).flatMap(line => {
      val ids = line.split(",")
      ids(1).split(" ").map(word => (ids(0), word))
    })
    // value.foreachRDD(rdd=>rdd.foreach(println))
    // Write the processed data back to Kafka
    value.foreachRDD(rdd=>{
      // Get the serializable, broadcast producer wrapper
      val producer = MySingleBaseDAO.getInstance().value
      rdd.foreach(record=>{
        producer.send("kafkaSink",record.toString())
      })
    })
    // Start the Spark Streaming job
    ssc.start()
    ssc.awaitTermination()
  }
}
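Once the job is running and Flume has delivered data, the records written back can be inspected with a console consumer on the output topic (same broker as above); each line is the Scala tuple's toString, i.e. the two fields wrapped in parentheses and separated by a comma.
kafka-console-consumer.sh --bootstrap-server 192.168.56.120:9092 --topic kafkaSink --from-beginning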
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>myspark</groupId>
<artifactId>myspark</artifactId>
<version>1.0-SNAPSHOT</version>
<name>myspark</name>
<!-- FIXME change it to the project's website -->
<url>http://www.example.com</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<spark.version>2.3.4</spark.version>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-graphx -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.3.4</version>
</dependency>
</dependencies>
<build>
<finalName>myinterceptor</finalName>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</build>
</project>
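With this pom, one way to build and submit the job looks roughly like the sketch below; the jar name follows <finalName>, the main class follows the package above, and --packages is one option for providing the Kafka integration at runtime when the dependencies are not shaded into the jar.
# Build the project; the jar name comes from <finalName>
mvn clean package
# Submit the streaming job (a sketch; paths and options depend on your environment)
spark-submit --master local[*] \
  --class com.njbdqn.myspark.kafka_sparksteaming.demo1.MyReadKafkaHandler \
  --packages org.apache.spark:spark-streaming-kafka-0-10_2.11:2.3.4 \
  target/myinterceptor.jar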