Spark Streaming(制造数据到kafka,读取kafka并过滤数据写入到mysql练习)

Spark Streaming MakeData

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}
import scala.collection.mutable.ListBuffer
import scala.util.Random
//生产数据
/**
 * Test-data generator for the streaming exercises.
 *
 * Publishes an endless stream of records to the Kafka topic `sparktest`. Each
 * record value is a line of 1 to 6 randomly chosen words separated by single
 * spaces, and one record is sent roughly every 500 ms.
 */
object MakeDataDemo {
  def main(args: Array[String]): Unit = {
    // Kafka producer settings: broker list plus String serializers for key and value.
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    // Vector gives cheap indexed access for the random picks below.
    val words = Vector("zhangsan", "lisi", "wangwu", "spark", "sql", "Streaming", "xiaowang", "xiaohong")
    // One generator for the whole run instead of allocating a new Random per draw.
    val random = new Random()

    try {
      while (true) {
        // Between 1 and 6 words per record; the index bound follows the word list size,
        // so editing the list can no longer cause an out-of-range pick or unreachable words.
        val wordsInLine = 1 + random.nextInt(6)
        val line = Seq.fill(wordsInLine)(words(random.nextInt(words.length))).mkString(" ")

        // Topic name: sparktest
        producer.send(new ProducerRecord[String, String]("sparktest", line))

        Thread.sleep(500)
        // print(line)
      }
    } finally {
      // The loop only exits through an exception (e.g. interrupt or send failure);
      // release the producer's resources in that case.
      producer.close()
    }
  }
}

创建相应的表

mysql> create table wordcount (word varchar(200) primary key, counts int(11));
mysql> create table stopwords (word varchar(200) primary key);
mysql> insert into stopwords values('wangwu');
mysql> truncate table wordcount;

Spark Streaming 读取kafka数据并过滤写入到mysql中

package SparkTest.SparkStreaming
import java.sql.{Connection, PreparedStatement, ResultSet}
import SparkTest.SparkStreaming.StopWord
import SparkTest.util.DBUtil
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import scala.collection.mutable.ListBuffer

/**
 * Streaming word count with stop-word filtering.
 *
 * Reads record values from the Kafka topic `sparktest` in 5-second batches,
 * splits them into words on single spaces, drops every word returned by
 * `StopWords.getWords()`, counts the remaining words per batch and adds those
 * counts to the MySQL table `wordcount` (insert, or increment on duplicate key).
 */
object SparkStreamingkafkaDemo {
  def main(args: Array[String]): Unit = {
    // Create the SparkConf
    val conf = new SparkConf().setMaster("local[*]").setAppName(this.getClass.getName)
    // Create the StreamingContext with a 5-second batch interval
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("error")

    // Kafka consumer parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark-demo",
      "auto.offset.reset" -> "latest"
      //      "enable.auto.commit" -> (t: java.lang.Boolean)
    )

    // Create a direct DStream over the Kafka topic
    val topics = Array("sparktest")
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    // Filtering, approach 1: stop word held in a local buffer
    //    val result = stream.map(_.value).flatMap(_.split(" ")).transform(rdd => {
    //      val stopwords = new ListBuffer[String]()
    //      stopwords.append("wangwu")
    //      rdd.filter(!_.contains(stopwords(0)))
    //    })

    // Filtering, approach 2: hard-coded stop word
    //    val result = stream.map(_.value).flatMap(_.split(" ")).filter(!_.contains("wangwu"))

    // Filtering, approach 3: stop words supplied by the StopWords object
    val result = stream.map(_.value).flatMap(_.split(" ")).transform(rdd => {
      // Snapshot into an immutable Set: membership tests become constant-time and the
      // closure shipped to the tasks no longer carries a mutable buffer.
      val stopwords = StopWords.getWords().toSet // StopWords approach 1 / approach 2
      //val stopwords = StopWords.getWord() // StopWords approach 3
      rdd.filter(word => !stopwords.contains(word))
    })

    result.map((_, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
      // Write each partition to MySQL over its own connection
      rdd.foreachPartition(it => {
        // Do not open a connection for a partition that has nothing to write.
        if (it.nonEmpty) {
          val con = DBUtil.getConnection()
          try {
            val pstmt = con.prepareStatement("insert into wordcount values(?,?) ON DUPLICATE KEY UPDATE counts=counts+?")
            try {
              it.foreach { case (word, count) =>
                pstmt.setString(1, word)
                pstmt.setInt(2, count) // initial value when the word is new
                pstmt.setInt(3, count) // increment when the word already exists
                pstmt.addBatch()
              }
              pstmt.executeBatch()
            } catch {
              // Best effort, as before: a failed batch is reported but does not fail the task.
              case e: Exception => e.printStackTrace()
            } finally {
              pstmt.close()
            }
          } finally {
            // Runs even when prepareStatement or pstmt.close() throws, so the
            // connection is never leaked.
            con.close()
          }
        }
      })
    })
    //stream.map(_.value).print()

    // Start the streaming job and block until it terminates
    ssc.start()
    ssc.awaitTermination()
  }
}

mysql 连接类
DBUtil

package SparkTest.util
import java.sql.{Connection, DriverManager}
/** Factory for JDBC connections to the MySQL database `testdb`. */
object DBUtil {
  // Connection settings used by every call to getConnection().
  private val DriverClass = "com.mysql.jdbc.Driver"
  private val Url = "jdbc:mysql://192.168.58.203/testdb"
  private val User = "root"
  private val Password = "123"

  /**
   * Loads the MySQL JDBC driver class and opens a new connection.
   *
   * @return a freshly opened connection; it is not retained here, so the
   *         caller is the only one able to close it
   */
  def getConnection(): Connection = {
    Class.forName(DriverClass)
    DriverManager.getConnection(Url, User, Password)
  }
}

StopWords 过滤类

package SparkTest.SparkStreaming
import java.sql.{Connection, PreparedStatement, ResultSet}
import SparkTest.util.DBUtil
import scala.collection.mutable.ListBuffer

/**
 * Holds the list of stop words used to filter the word stream.
 *
 * The list is filled while this object is initialized, by reading the `word`
 * column of every row in the MySQL table `stopwords`. If that read fails, the
 * error is printed and the list stays empty (or partially filled).
 */
object StopWords {
  // Approach 1: the stop words live in the database
  val stopword = new ListBuffer[String]()

  var con: Connection = null
  var pstmt: PreparedStatement = null
  var rs: ResultSet = null
  try {
    con = DBUtil.getConnection()
    pstmt = con.prepareStatement("select * from stopwords") // read from the database
    //mysql> create table stopwords (word varchar(200) primary key);
    //mysql> insert into stopwords values('wangwu')

    rs = pstmt.executeQuery()

    while (rs.next()) {
      stopword.append(rs.getString("word"))
    }
  } catch {
    case e: Exception => e.printStackTrace()
  } finally {
    // Any of these is still null when an earlier step failed; closing them
    // unguarded would throw a NullPointerException that hides the real error
    // and aborts initialization of this object.
    closeQuietly(rs)
    closeQuietly(pstmt)
    closeQuietly(con)
  }

  // Approach 2: the stop words live in a local file
  //    val lines = scala.io.Source.fromFile("data/stopword.txt").getLines()  // read the file
  //    // data/stopword.txt contains: wangwu
  //    lines.foreach(line => {
  //      stopword.append(line)
  //    })

  /**
   * Returns the stop-word list built during initialization.
   *
   * @return the shared mutable buffer itself, not a copy
   */
  def getWords(): ListBuffer[String] = {
    stopword
  }

  // Approach 3: the stop word is set by hand
  //
  //  val words = "wangwu"
  //  def getWord(): String = {
  //    words
  //  }

  def main(args: Array[String]): Unit = {

//        val stopWord=new StopWord("wangwu")
//        print(stopWord.getWord())  // approach 1
//        println(getWords())
  }

  /** Closes `resource` if it was opened; a failure to close is printed, not rethrown. */
  private def closeQuietly(resource: AutoCloseable): Unit =
    if (resource != null) {
      try resource.close()
      catch {
        case e: Exception => e.printStackTrace()
      }
    }
}

OffSetDemo:打印每个批次读取的 topic、所读的分区,以及偏移量范围(从哪里读到哪里)

package SparkTest.SparkStreaming
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
//读批次 读topic 读的哪个分区 从哪里读到哪里
/**
 * Offset inspection demo.
 *
 * Consumes the Kafka topic `sparktest` in 5-second batches and, for every
 * partition of every batch, prints one line with the topic, the Kafka
 * partition, the starting offset and the end offset of the range that was read.
 */
object OffSetDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getName)
    val streamingContext = new StreamingContext(sparkConf, Seconds(5))
    streamingContext.sparkContext.setLogLevel("error")

    // Kafka consumer settings
    val brokerList = "192.168.58.201:9092,192.168.58.202:9092,192.168.58.203:9092"
    val consumerSettings = Map[String, Object](
      "bootstrap.servers" -> brokerList,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark-demo",
      "auto.offset.reset" -> "latest"
      //      "enable.auto.commit" -> (t: java.lang.Boolean)
    )

    val subscription = Subscribe[String, String](Array("sparktest"), consumerSettings)
    val kafkaStream =
      KafkaUtils.createDirectStream[String, String](streamingContext, PreferConsistent, subscription)

    kafkaStream.foreachRDD { batch =>
      // Array of OffsetRange, one entry per partition of this batch's RDD
      val ranges = batch.asInstanceOf[HasOffsetRanges].offsetRanges
      batch.foreachPartition { _ =>
        // The task's partition id selects the offset range for this partition.
        val range: OffsetRange = ranges(TaskContext.get.partitionId)
        println(s"${range.topic} ${range.partition} ${range.fromOffset} ${range.untilOffset}")
      }
    }

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

你可能感兴趣的:(kafka,spark,mysql,大数据,scala)