Link: https://pan.baidu.com/s/1YVvhqy1u9rILqQWzJnNoVA
Extraction code: twt3
1. The data below are the first-level comments under the apology Weibo posted by RNG's official account after the team was knocked out in the S8 quarter-finals.
1.1. Create a topic named rng_comment in Kafka with 2 partitions and 2 replicas (see the example command after this list).
1.2. Preprocess the data by filtering out empty lines.
1.3. Write the given file into Kafka, partitioned by the data id: records with an odd id go to one partition and records with an even id go to the other.
1.4. Connect Spark Streaming to Kafka.
1.5. After connecting Spark Streaming to Kafka, perform the following computations.
Create a database named rng_comment in MySQL.
In the rng_comment database, create a vip_rank table whose columns are all fields of the data.
In the rng_comment database, create a like_status table whose columns are all fields of the data.
In the rng_comment database, create a count_comment table with two columns: time and count.
1.5.1. Select the records whose Weibo VIP rank is 5 and write them into the vip_rank table in MySQL.
1.5.2. Select the comments with 10 or more likes and write them into the like_status table in MySQL.
1.5.3. Count the comments for each of the four days 2018/10/20, 2018/10/21, 2018/10/22 and 2018/10/23, and write the results into the count_comment table in MySQL.
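For step 1.1, the topic can be created with the kafka-topics tool. This is a sketch rather than a command from the original post: the ZooKeeper address hadoop01:2181 is an assumption, so replace it with your own quorum (on Kafka 2.2+ you would pass --bootstrap-server instead of --zookeeper).

kafka-topics.sh --create --zookeeper hadoop01:2181 --topic rng_comment --partitions 2 --replication-factor 2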
Data description:
Fields in the rng_comment.txt file:
Field | Meaning |
---|---|
index | data id |
child_comment | number of replies |
comment_time | comment time |
content | comment content |
da_v | Weibo personal verification |
like_status | number of likes |
pic | picture comment url |
user_id | Weibo user id |
user_name | Weibo user name |
vip_rank | Weibo VIP rank |
stamp | timestamp |
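The file is tab-separated, one comment per line. As a rough sketch (not from the original post; the object name FieldLayout is made up here), this is how one line maps onto the 11 field positions that the streaming job below relies on, e.g. fields(5) is like_status and fields(9) is vip_rank:

import scala.Array

object FieldLayout {
  // split one tab-separated line of rng_comment.txt into its fields
  def parse(line: String): Array[String] = {
    val fields = line.split("\t")
    // fields(0) = index       fields(1) = child_comment  fields(2) = comment_time
    // fields(3) = content     fields(4) = da_v           fields(5) = like_status
    // fields(6) = pic         fields(7) = user_id        fields(8) = user_name
    // fields(9) = vip_rank    fields(10) = stamp
    fields
  }
}

The project's pom.xml is shown next.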
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>spark01</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>aliyun</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>jboss</id>
            <url>http://repository.jboss.com/nexus/content/groups/public</url>
        </repository>
        <repository>
            <id>scala-tools.org</id>
            <name>Scala-tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
    </pluginRepositories>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <scala.compat.version>2.11</scala.compat.version>
        <hadoop.version>2.7.4</hadoop.version>
        <spark.version>2.2.0</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive-thriftserver_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.3.1</version>
        </dependency>
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.3.3</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
            </plugin>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.18.1</version>
                <configuration>
                    <useFile>false</useFile>
                    <disableXmlReport>true</disableXmlReport>
                    <includes>
                        <include>**/*Test.*</include>
                        <include>**/*Suite.*</include>
                    </includes>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass></mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
-- create the database
create database rng_comment character set utf8;
-- switch to the database
use rng_comment;
-- create the vip_rank table
create table vip_rank
(
    `index`       varchar(32) comment 'data id',
    child_comment varchar(32) comment 'number of replies',
    comment_time  varchar(255) comment 'comment time',
    content       varchar(255) comment 'comment content',
    da_v          varchar(32) comment 'Weibo personal verification',
    like_status   varchar(32) comment 'number of likes',
    pic           varchar(255) comment 'picture comment url',
    user_id       varchar(255) comment 'Weibo user id',
    user_name     varchar(32) comment 'Weibo user name',
    vip_rank      varchar(32) comment 'Weibo VIP rank',
    stamp         varchar(32) comment 'timestamp'
);
-- create the like_status table
create table like_status
(
    `index`       varchar(32) comment 'data id',
    child_comment varchar(32) comment 'number of replies',
    comment_time  varchar(255) comment 'comment time',
    content       varchar(255) comment 'comment content',
    da_v          varchar(32) comment 'Weibo personal verification',
    like_status   varchar(32) comment 'number of likes',
    pic           varchar(255) comment 'picture comment url',
    user_id       varchar(255) comment 'Weibo user id',
    user_name     varchar(32) comment 'Weibo user name',
    vip_rank      varchar(32) comment 'Weibo VIP rank',
    stamp         varchar(32) comment 'timestamp'
);
-- create the count_comment table
create table count_comment
(
    time  varchar(32) primary key comment 'time',
    count int comment 'number of comments'
);
-- inspect the table structures
describe vip_rank;
describe like_status;
describe count_comment;
-- inspect the column comments
select column_name,
       column_comment
from information_schema.columns
where table_schema = 'rng_comment'
  and table_name = 'vip_rank';
select column_name,
       column_comment
from information_schema.columns
where table_schema = 'rng_comment'
  and table_name = 'like_status';
select column_name,
       column_comment
from information_schema.columns
where table_schema = 'rng_comment'
  and table_name = 'count_comment';
-- upsert: update the row if it exists, insert it otherwise (requires a unique key, here the primary key on time)
INSERT INTO count_comment (time, count) VALUES (?, ?) ON DUPLICATE KEY UPDATE count = ?;
-- empty the tables
-- truncate table vip_rank;
-- truncate table like_status;
-- truncate table count_comment;
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.rdd.RDD
/**
* @author 红尘丶世界
* @version v 1.0
*/
class KafkaProducerConf(topic: String, servers: String) extends Serializable {

  // Kafka producer configuration
  def createKafkaConnection(): KafkaProducer[String, String] = {
    val props = new Properties()
    props.put("bootstrap.servers", servers)
    // note: pass the serializer class names (don't forget .getName)
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)
    props.put("retries", "1")              // number of retries
    props.put("batch.size", "71680")       // batch size for sending data
    props.put("buffer.memory", "33554432") // producer buffer size
    props.put("linger.ms", "1000")         // wait at most 1000 ms before sending a batch
    // custom partitioner: even ids to one partition, odd ids to the other
    props.put("partitioner.class", classOf[MyPartitioner].getName)
    new KafkaProducer[String, String](props)
  }

  // create the producer lazily, so it is only built when first used
  lazy val kafkaProducer: KafkaProducer[String, String] = createKafkaConnection()

  // make sure the producer is closed when the JVM exits
  Runtime.getRuntime.addShutdownHook(new Thread() {
    override def run(): Unit = {
      kafkaProducer.close()
    }
  })

  // save the data to Kafka, keyed by the data id (the first tab-separated field)
  def save(vs: RDD[String]): Unit = {
    try {
      vs.foreach(x => {
        val record = new ProducerRecord[String, String](topic, x.split("\t")(0), x)
        kafkaProducer.send(record)
      })
    } catch {
      case e: Exception => println("Failed to send data: " + e)
    }
  }
}
(Note: this is a Java class, not a Scala class; it can still be created under the scala source directory.)
import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;

import java.util.Map;

/**
 * @author 红尘丶世界
 * @version v 1.0
 */
public class MyPartitioner implements Partitioner {
    @Override
    public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
        // the key is the data id: even ids go to partition 0, odd ids to partition 1
        String s = key.toString();
        int i = Integer.parseInt(s);
        if (i % 2 == 0) {
            return 0;
        } else {
            return 1;
        }
    }

    @Override
    public void close() {
    }

    @Override
    public void configure(Map<String, ?> configs) {
    }
}
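If you would rather keep everything in Scala, the same parity-based routing can also be written as a Scala class. The sketch below is an alternative, not what the original post uses, and the name MyScalaPartitioner is made up here:

import java.util

import org.apache.kafka.clients.producer.Partitioner
import org.apache.kafka.common.Cluster

class MyScalaPartitioner extends Partitioner {
  // even data ids go to partition 0, odd ids to partition 1
  override def partition(topic: String, key: AnyRef, keyBytes: Array[Byte],
                         value: AnyRef, valueBytes: Array[Byte], cluster: Cluster): Int = {
    if (key.toString.toInt % 2 == 0) 0 else 1
  }

  override def close(): Unit = {}

  override def configure(configs: util.Map[String, _]): Unit = {}
}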
import org.apache.spark.{SparkConf, SparkContext}
/**
* @author 红尘丶世界
* @version v 1.0
*/
object Producer {
  def main(args: Array[String]): Unit = {
    // create the SparkContext
    val conf: SparkConf = new SparkConf().setAppName("sparkStream").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // read the file and filter out empty lines
    val lines = sc.textFile("D:\\dev\\大数据\\大数据资料\\spark\\4.14号练习题\\rng_comment.txt").filter(_.trim != "")
    // the target topic and the brokers to connect to
    val sink: KafkaProducerConf = new KafkaProducerConf("rng_comment", "hadoop01:9092,hadoop02:9092,hadoop03:9092")
    // send the data to Kafka
    sink.save(lines)
    // close the producer that was actually used for sending (not a newly created connection)
    sink.kafkaProducer.close()
  }
}
package com.czxy.exercise.exercise01
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
/**
* @author 红尘丶世界
* @version v 1.0
*/
object SparkStreamingKafka {
  def main(args: Array[String]): Unit = {
    // 1. create the SparkConf
    val conf = new SparkConf().setMaster("local[*]").setAppName("SparkStremingDemo1")
    // 2. create a SparkContext
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // 3. create the StreamingContext with a 5-second batch interval
    val ssc = new StreamingContext(sc, Seconds(5))
    // checkpoint directory, needed by updateStateByKey to keep its state
    ssc.checkpoint("./TmpCount")
    val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "hadoop01:9092,hadoop01:9092,hadoop01:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "SparkKafkaDemo",
//earliest:当各分区下有已提交的offset时,从提交的offset开始消费;无提交的offset时,从头开始消费
//latest:当各分区下有已提交的offset时,从提交的offset开始消费;无提交的offset时,消费新产生的该分区下的数据
//none:topic各分区都存在已提交的offset时,从offset后开始消费;只要有一个分区不存在已提交的offset,则抛出异常
//这里配置latest自动重置偏移量为最新的偏移量,即如果有偏移量从偏移量位置开始消费,没有偏移量从新来的数据开始消费
"auto.offset.reset" -> "earliest",
//false表示关闭自动提交.由spark帮你提交到Checkpoint或程序员手动维护
"enable.auto.commit" -> (false: java.lang.Boolean)
)
//4 接收kafka数据并根据业务逻辑进行计算
//设置位置策略 均衡,
//kafkaDatas 含有key和value
//key是kafka成产数据时指定的key(可能为空)
//value是真实的数据(100%有数据)
val data: InputDStream[ConsumerRecord[String, String]] =
KafkaUtils.createDirectStream[String, String](
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Array("rng_comment"), kafkaParams))
data.foreachRDD(_.foreach(row => {
//获取一行转化成list
val list: List[String] = row.value().split("\t").toList
//判断如果 微博会员等级为5 就写入到 vip_rank表
if (list(9) == "5") {
//掉用方法把数据写入mysql
saveDataToMysqlVipRankAndLikeStatus(list, "vip_rank")
}
if (list(5).toInt >= 10) {
//掉用方法把数据写入mysql
saveDataToMysqlVipRankAndLikeStatus(list, "like_status")
}
})
)
//按天进行计算
val count: DStream[(String, Int)] = data
.map(_.value().split("\t")(2).split(" ")(0)) //截取日期中的天
.map((_, 1)).updateStateByKey(updateFunc) //实时统计总数(历史累加)
//遍历统计结果
count.foreachRDD(_.foreach(row => {
//调用方法把数据存储到mysql
saveDataToMysqlCountComment(row._1, row._2)
}
))
//5 开启计算任务
ssc.start()
//6 等待关闭
ssc.awaitTermination()
}
  // create a JDBC connection to MySQL
  def mysqlConnection(): Connection = {
    DriverManager.getConnection("jdbc:mysql://hadoop01:3306/rng_comment?characterEncoding=UTF-8", "root", "123456")
  }
  /**
   * Write one record to MySQL (used for both vip_rank and like_status)
   *
   * @param data      the fields of one record
   * @param tableName the target table
   */
  def saveDataToMysqlVipRankAndLikeStatus(data: List[String], tableName: String): Unit = {
    // get a connection
    val connection: Connection = mysqlConnection()
    // the insert statement
    val sql = s"insert into ${tableName} (`index`, child_comment, comment_time, content, da_v, like_status, pic, user_id, user_name, vip_rank, stamp) values (?,?,?,?,?,?,?,?,?,?,?)"
    // bind the fields of one record
    val ps: PreparedStatement = connection.prepareStatement(sql)
    ps.setString(1, data.head)
    ps.setString(2, data(1))
    ps.setString(3, data(2))
    ps.setString(4, data(3))
    ps.setString(5, data(4))
    ps.setString(6, data(5))
    ps.setString(7, data(6))
    ps.setString(8, data(7))
    ps.setString(9, data(8))
    ps.setString(10, data(9))
    ps.setString(11, data(10))
    // execute and close
    ps.execute()
    connection.close()
  }
  /**
   * Write one per-day comment count to MySQL
   *
   * @param time  the day
   * @param count the number of comments
   */
  def saveDataToMysqlCountComment(time: String, count: Int): Unit = {
    println(s"${time}\t ${count}")
    // only the four days asked for in the exercise are written
    if (time.contains("2018/10/20") || time.contains("2018/10/21") || time.contains("2018/10/22") || time.contains("2018/10/23")) {
      // get a connection
      val connection: Connection = mysqlConnection()
      // upsert: insert the day, or update its count if the row already exists
      val sql = "INSERT INTO count_comment (time,count) VALUES (?,?) ON DUPLICATE KEY UPDATE count = ?"
      // bind the parameters
      val ps: PreparedStatement = connection.prepareStatement(sql)
      ps.setString(1, time)
      ps.setInt(2, count)
      ps.setInt(3, count)
      // execute and close
      ps.execute()
      connection.close()
    }
  }
  /**
   * Accumulate the running total for one key
   *
   * @param currentValues the counts from the current batch
   * @param historyValue  the previously accumulated total
   * @return the new total
   */
  def updateFunc(currentValues: Seq[Int], historyValue: Option[Int]): Option[Int] = {
    // add the current batch to the history
    val result: Int = currentValues.sum + historyValue.getOrElse(0)
    // return the new state
    Some(result)
  }
}
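As a quick sanity check of the accumulation logic, you can call updateFunc directly (an illustration only, e.g. after pasting the object into a Scala REPL; it is not part of the original post):

// first batch for a day: 3 comments, no history yet
SparkStreamingKafka.updateFunc(Seq(3), None)     // Some(3)
// next batch: 2 more comments are added to the stored total
SparkStreamingKafka.updateFunc(Seq(2), Some(3))  // Some(5)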