Hands-On Big Data Project: News Topic Statistics and Analysis

Preface: this is a complete hands-on big data project that performs real-time and offline statistical analysis of users' search topics and presents the results through a JavaEE web front end. These metrics are very helpful for a site's precision marketing and operations. The architecture roughly follows enterprise practice, covering the full pipeline from log collection, transformation, and real-time computation to Java back-end development and web front-end display; every node uses a high-availability design, with failover and fault tolerance taken into account. Frameworks used: Hadoop (HDFS + MapReduce + YARN) + Flume + Kafka + HBase + Hive + Spark (SQL, Streaming) + MySQL + SpringMVC + MyBatis + WebSocket + AngularJS + ECharts. Languages involved: Java, Scala, Shell.

Project deployment URL: http://120.79.35.74:443/Hot_News_Web/

For the project source code, contact: [email protected]

Project architecture diagram:

[Figure 1: project architecture diagram]

 

Part 1: Data Source Processing (news corpus from Sogou Labs, XML -> TXT: parsing large batches of XML files in Java; code below)

 

Approach: use SAXReader to read the XML file contents, build a News entity class so the records can be written to a txt file, then write a ReadWebLog class and a shell script that runs on Linux to simulate the generation of news-search logs.

Command to run the jar on Linux: java -jar <path to your uploaded jar> args0 args1  (args0 = source file to read, args1 = log file to write; see ReadWebLog below)

Or via a shell script:

#!/bin/bash
echo "start log"
java -jar <path to your uploaded jar> args0 args1

 

Code:

Converting the Sogou Labs source data: .xml -----> .txt

package cn.yusys.hotnews.datasource;

import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Parses the Sogou Labs news XML file into a txt file -- the project's data source.
 * @author Tangzhi mail:[email protected]
 * Created on 2018-11-12
 */
public class MyXMLReader2JDOM {
    public static void main(String[] args) {
        // Build the XML reader and set the character encoding
        SAXReader reader = new SAXReader();
        reader.setEncoding("utf-8");
        // Holds the parsed records before they are written to news.log
        List<News> list = new ArrayList<News>();
        try {
            Document document = reader.read(new File("D:\\Downloads\\大数据数据源\\news_tensite_xml.smarty.dat"));
            // Root element of the document
            Element rootElement = document.getRootElement();
            // All <doc> elements, one per news article
            List<Element> docList = rootElement.elements("doc");
            for (Element e : docList) {
                News news = new News();
                // Copy the child elements of each <doc> into a News entity
                if (e.element("url") != null && !e.element("url").getStringValue().trim().isEmpty()) {
                    news.setUrl(e.element("url").getStringValue().trim());
                }
                if (e.element("docno") != null && !e.element("docno").getStringValue().trim().isEmpty()) {
                    news.setDocno(e.element("docno").getStringValue().trim());
                }
                if (e.element("contenttitle") != null && !e.element("contenttitle").getStringValue().trim().isEmpty()) {
                    news.setContenttitle(e.element("contenttitle").getStringValue().trim());
                }
                if (e.element("content") != null && !e.element("content").getStringValue().trim().isEmpty()) {
                    news.setContent(e.element("content").getStringValue().trim());
                }
                list.add(news);
            }
            // Write all parsed records to the txt file
            writeToFile(list);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Writes the records to a txt file (shard the output later if the source file grows too large).
     * @throws IOException
     */
    public static void writeToFile(List<News> list) throws IOException {
        File file = new File("D:\\Downloads\\大数据数据源\\news2.log");
        if (!file.exists()) {
            file.createNewFile();
        }
        BufferedWriter bw = new BufferedWriter(new FileWriter(file));
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        for (News news : list) {
            String dateStr = sdf.format(new Date());
            // One pipe-delimited key=value record per line
            bw.write("datetime" + "=" + dateStr + "|");
            bw.write("url" + "=" + news.getUrl() + "|");
            bw.write("docno" + "=" + news.getDocno() + "|");
            bw.write("contenttitle" + "=" + news.getContenttitle() + "|");
            bw.write("content" + "=" + news.getContent());
            bw.write("\n");
            bw.flush();
        }
        bw.close();
    }
}
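Each line written to news2.log is a pipe-delimited key=value record; an illustrative example (the values below are made up) looks like this:

datetime=2018-11-12 10:30:00|url=http://news.sohu.com/example.shtml|docno=abc123-456|contenttitle=Example news title|content=Example body text of the news article

The Flume RegexHbaseEventSerializer and the Spark Streaming valueSplit function later in this article both parse exactly this layout.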

----------------------------------------------------------------------------------------------------------------

----------------------------------------------------------------------------------------------------------------

package cn.yusys.hotnews.datasource;

import java.io.Serializable;

/**
 * News entity used during XML parsing.
 */
public class News implements Serializable {
    // Implements Serializable so the data can be parsed on several machines in parallel

    private String url;
    private String docno;
    private String contenttitle;
    private String content;

    public News() {
    }

    public News(String url, String docno, String contenttitle, String content) {
        super();
        this.url = url;
        this.docno = docno;
        this.contenttitle = contenttitle;
        this.content = content;
    }

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }

    public String getDocno() { return docno; }
    public void setDocno(String docno) { this.docno = docno; }

    public String getContenttitle() { return contenttitle; }
    public void setContenttitle(String contenttitle) { this.contenttitle = contenttitle; }

    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }
}

-----------------------------------------------------------------------------------------

-----------------------------------------------------------------------------------------

Runs on Linux to simulate log generation; the output is then collected by Flume.

package cn.yusys.hotnews.datasource;

import java.io.*;

/**
 * Simulates a log server producing logs (reads lines from news.log / news1.log
 * and appends them to a target log file, which Flume then tails).
 * @author Tangzhi mail:[email protected]
 * @date 2018-11-12
 */
public class ReadWebLog {
    public static String readFileName;
    public static String writeFileName;

    public static void main(String[] args) {
        readFileName = args[0];
        writeFileName = args[1];
        readFile(readFileName);
    }

    /**
     * Reads log records from news.log / news1.log.
     */
    public static void readFile(String fileName) {
        try {
            FileInputStream fs = new FileInputStream(fileName);
            // Wrap the byte stream in a UTF-8 character reader
            InputStreamReader isr = new InputStreamReader(fs, "utf-8");
            BufferedReader br = new BufferedReader(isr);
            int count = 0;
            String line;
            while ((line = br.readLine()) != null) {
                count++;
                // Configurable delay (milliseconds) between records
                Thread.sleep(1000);
                System.out.println("row:" + count + ">>>>>>>>" + line);
                // Append the record to the target file (must match the Flume source configuration)
                writeFile(writeFileName, line);
            }
            br.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends one line to the target file.
     */
    public static void writeFile(String fileName, String line) {
        try {
            FileOutputStream fs = new FileOutputStream(fileName, true);
            OutputStreamWriter osw = new OutputStreamWriter(fs);
            BufferedWriter bw = new BufferedWriter(osw);
            bw.write(line);
            bw.write("\n");
            bw.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

 

 

Q&A

Q1:

Java exception: "Invalid byte 2 of 2-byte UTF-8 sequence."

A1: Open the file with a plain-text editor, re-save it with UTF-8 encoding, then open it again in Notepad++ (or any other editor).

Q2:

"Main class not found" when running the jar on Linux.

A2: When building with IDEA, add the maven-shade-plugin to pom.xml (standard shade configuration shown below) and put your main class's fully qualified name into <mainClass>:

<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-shade-plugin</artifactId>
    <version>2.4.3</version>
    <executions>
        <execution>
            <phase>package</phase>
            <goals>
                <goal>shade</goal>
            </goals>
            <configuration>
                <filters>
                    <filter>
                        <artifact>*:*</artifact>
                        <excludes>
                            <exclude>META-INF/*.SF</exclude>
                            <exclude>META-INF/*.DSA</exclude>
                            <exclude>META-INF/*.RSA</exclude>
                        </excludes>
                    </filter>
                </filters>
                <transformers>
                    <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                        <mainClass><!-- fully qualified name of your main class --></mainClass>
                    </transformer>
                </transformers>
            </configuration>
        </execution>
    </executions>
</plugin>

Screenshot on Linux:

[Figure 2: ReadWebLog running on Linux]

Note: when the source data file is very large, ReadWebLog can be set up to read from several log files chosen at random, which also makes the simulated data more realistic; a minimal sketch follows the Sogou Labs link below.

Sogou Labs (Chinese-language information processing; provider of part of the data set): http://www.sogou.com/labs/
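A minimal sketch of that multi-file idea (the class name, file names, and selection logic here are illustrative additions, not part of the original project):

import java.util.Random;

/**
 * Hypothetical helper: picks one of several source log files at random,
 * so ReadWebLog can replay from e.g. news.log or news1.log on each run.
 */
public class RandomLogSource {
    private static final Random RANDOM = new Random();

    /** Returns a randomly chosen file name from the given candidates. */
    public static String pickSourceFile(String[] candidateFiles) {
        return candidateFiles[RANDOM.nextInt(candidateFiles.length)];
    }

    // Usage sketch inside ReadWebLog.main:
    //   readFile(RandomLogSource.pickSourceFile(new String[]{"news.log", "news1.log"}));
}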

Note: pre-clean the data as required by the project.

At this point, acquiring the data source and performing simple cleaning (uid assignment, timestamp binding, filtering out part of the records) are complete.

 

 

Part 2: Wiring Flume to HBase and Kafka

Approach: Flume collects the source data into HBase -----> HBase is mapped into Hive -----> Hadoop MapReduce performs the offline analysis -----> ECharts front end

Flume collects the source data into Kafka -----> Kafka feeds Spark Streaming -----> Spark + MySQL perform the real-time analysis -----> ECharts front end

 

1. Flume and HBase

Tips: here the Flume sink writes to HBase (prerequisites: HBase installed on Linux and a basic understanding of how HBase works). Edit hbase-site.xml to set the HBase data directory and the ZooKeeper address.

Common HBase shell commands:

Start HBase: start-hbase.sh

Enter the interactive shell: ./hbase shell

Create a table: create 'table_name', 'column_family_1', 'column_family_2', ...
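For this project, the table and column family that the Flume configuration later in this part writes to would be created like this (run inside ./hbase shell):

create 'weblogs', 'info'
# check that the table exists and, later, how many rows Flume has written
list
count 'weblogs'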

The HBase configuration files are shown below:

 

##hbase-env.sh
export JAVA_HOME=/opt/jdk1.7.0_65   # JDK path on your own VM (see /etc/profile)
export JAVA_CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export HBASE_MANAGES_ZK=false       # do not use the ZooKeeper bundled with HBase; use the external ZooKeeper cluster

 

##hbase-site.xml
<property>
  <name>hbase.master</name>
  <value>192.168.25.136:60000</value>
</property>
<property>
  <name>hbase.master.maxclockskew</name>
  <value>180000</value>
</property>
<property>
  <name>hbase.rootdir</name>
  <value>hdfs://192.168.25.136:9000/hbase</value>
</property>
<property>
  <name>hbase.cluster.distributed</name>
  <value>true</value>
</property>
<property>
  <name>hbase.zookeeper.quorum</name>
  <value>192.168.25.136</value>
</property>
<property>
  <name>hbase.zookeeper.property.dataDir</name>
  <value>/tmp/hbase_data</value>
</property>

 

##regionservers
localhost   # replace with the IP addresses or host names of your cluster nodes, one per line

 

Q&A

Q1: Creating an HBase table fails with: ERROR: Can not get address from Zookeeper; znode data == null

A1: 1. Make sure the ZooKeeper cluster and the Hadoop cluster are available and the HBase services started normally.

2. Check hbase-site.xml (vi hbase-site.xml) and verify that the HBase data directory is readable and writable.

[Figure 3: hbase-site.xml data directory setting]

 

Q2: With HBase 0.9.4 started from the scripts, creating a table fails with

ERROR: org.apache.hadoop.hbase.MasterNotRunningException: Retried 7 times

A2:

[Figure 4: MasterNotRunningException error output]

1. Check that the Hadoop and ZooKeeper clusters are available.

2. HBase 0.9.4 has poor compatibility with Hadoop 2.x; the version mismatch prevents RPC communication. A recommended pairing is HBase 0.9.9.x + Hadoop 2.x, but note that Flume 1.7.x is then the better choice, for the reason shown below:

[Figure 5: Flume/HBase version compatibility]

 

2. Flume and Kafka

Tips: here the Flume sink writes to Kafka (prerequisites: Kafka installed on Linux and a basic understanding of how Kafka works).

Common Kafka shell commands:

Start Kafka: bin/kafka-server-start.sh ../config/server.properties  (start with a specific config file)

Create a topic: bin/kafka-topics.sh --create --zookeeper 192.168.25.136:2181 --replication-factor 1 --partitions 1 --topic weblogs

Delete a topic: bin/kafka-topics.sh --delete --zookeeper 192.168.25.136:2181 --topic weblogs

Consume a topic from the console: bin/kafka-console-consumer.sh --zookeeper 192.168.25.136:2181 --topic weblogs --from-beginning

Produce to a topic from the console: bin/kafka-console-producer.sh --broker-list 192.168.25.136:9092 --topic weblogs

Describe a topic: bin/kafka-topics.sh --zookeeper 192.168.25.136:2181 --describe --topic weblogs

Kafka configuration file (in big data projects Kafka mostly serves as a data buffer, in a producer-consumer pattern):

 

# Globally unique broker id; must not be duplicated
broker.id=0

# Port the broker listens on; producers and consumers connect here
port=9092

# Number of threads handling network requests
num.network.threads=3

# Number of threads handling disk I/O
num.io.threads=8

# Socket send buffer size
socket.send.buffer.bytes=102400

# Socket receive buffer size
socket.receive.buffer.bytes=102400

# Maximum size of a socket request
socket.request.max.bytes=104857600

# Directory where Kafka stores its log segments
log.dirs=/export/logs/kafka

# Default number of partitions per topic on this broker
num.partitions=2

# Number of threads used for log recovery and cleanup per data directory
num.recovery.threads.per.data.dir=1

# Maximum time a segment file is retained; older segments are deleted
log.retention.hours=168

# Maximum time before rolling over to a new segment file
log.roll.hours=168

# Size of each log segment; default 1 GB
log.segment.bytes=1073741824

# Interval for periodically checking segment sizes
log.retention.check.interval.ms=300000

# Whether log cleaning is enabled
log.cleaner.enable=true

# ZooKeeper connection string the broker uses to store metadata
zookeeper.connect=192.168.25.136:2181,192.168.25.136:2182,192.168.25.136:2183

# ZooKeeper connection timeout
zookeeper.connection.timeout.ms=6000

# Flush to disk once this many messages accumulate in the partition buffer
log.flush.interval.messages=10000

# Flush to disk once messages have been buffered for this long (ms)
log.flush.interval.ms=3000

# delete.topic.enable=true is required for real topic deletion; otherwise a deleted topic is only marked as deleted
delete.topic.enable=true

# host.name must be this machine's IP (important); otherwise clients fail with: Producer connection to localhost:9092 unsuccessful
host.name=192.168.25.136

        

When the number of records consumed from Kafka equals the total row count of the HBase weblogs table, the Flume-to-HBase and Flume-to-Kafka wiring is complete.
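One way to compare the two counts (a sketch; adjust paths to your own installation):

# total rows Flume has written to HBase
echo "count 'weblogs'" | ./hbase shell
# total messages in the Kafka topic (stop with Ctrl+C once it catches up)
bin/kafka-console-consumer.sh --zookeeper 192.168.25.136:2181 --topic weblogs --from-beginning | wc -l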

Flume startup log:

[Figure 6: Flume startup log]

Row count of the HBase table:

Data consumed from Kafka:

 

Appendix: Flume configuration that sinks the collected data into specific HBase columns and into Kafka (key point)

a1.sources = r1
a1.channels = kafkaC hbaseC
a1.sinks = kafkaS hbaseS

a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/hotnews/weblogs.log
a1.sources.r1.channels = kafkaC hbaseC

# flume + hbase
# sink: HBaseSink with RegexHbaseEventSerializer
a1.channels.hbaseC.type = memory
a1.channels.hbaseC.capacity = 10000
a1.channels.hbaseC.transactionCapacity = 10000
# HBase table name
a1.sinks.hbaseS.type = org.apache.flume.sink.hbase.HBaseSink
a1.sinks.hbaseS.table = weblogs
# HBase column family
a1.sinks.hbaseS.columnFamily = info
a1.sinks.hbaseS.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
# Regex that splits each news record (xxxx|xxxx|xxxx|xxxx|xxxx) into the columns of the column family
a1.sinks.hbaseS.serializer.regex = ^(.*)\\|(.*)\\|(.*)\\|(.*)\\|(.*)
a1.sinks.hbaseS.serializer.colNames = datatime,url,docno,contenttitle,content
#a1.sinks.hbaseS.serializer.rowKeyIndex = 0
# bind sink and channel
a1.sinks.hbaseS.channel = hbaseC

# flume + kafka
a1.channels.kafkaC.type = memory
a1.channels.kafkaC.capacity = 10000
a1.channels.kafkaC.transactionCapacity = 10000

a1.sinks.kafkaS.channel = kafkaC
a1.sinks.kafkaS.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.kafkaS.topic = weblogs
a1.sinks.kafkaS.brokerList = 192.168.25.136:9092
a1.sinks.kafkaS.zookeeperConnect = 192.168.25.136:2181
a1.sinks.kafkaS.requiredAcks = 1
a1.sinks.kafkaS.batchSize = 20
a1.sinks.kafkaS.serializer.class = kafka.serializer.StringEncoder

 

 

Part 3: Real-Time Analysis with Kafka + Spark Streaming + MySQL

1. Writing a database connection pool (plain Java version + Scala c3p0 version)

Note: MySQL is deployed on Linux; the connection settings are kept in db.properties.
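A sample db.properties with the keys the two pool classes read (all values below are placeholders, not the project's real settings):

driverName=com.mysql.jdbc.Driver
url=jdbc:mysql://192.168.25.136:3306/hotnews?useUnicode=true&characterEncoding=utf8
username=root
password=your_password
# the following keys are only read by the c3p0 version
maxPoolSize=20
minPoolSize=5
acquireIncrement=5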

Plain Java version:

package cn.yuysy.hotnews.realtime.db;

import java.io.File;
import java.io.FileInputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.LinkedList;
import java.util.Properties;

/**
 * Database connection pool.
 * Created on 2018-11-15
 * @author tangzhi mail:[email protected]
 */
public class ConnectionPool {
    private static LinkedList<Connection> connectionQueue;
    private static Properties prop;

    /**
     * Load the connection settings and register the JDBC driver.
     */
    static {
        try {
            prop = new Properties();
            prop.load(new FileInputStream(new File("C:\\Users\\Administrator\\Hot_News\\src\\main\\scala\\cn\\yuysy\\hotnews\\realtime\\db\\db.properties")));
            Class.forName(prop.getProperty("driverName").toString());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Get a connection from the pool, creating a small batch if the pool is empty.
     */
    public synchronized Connection getConnection() {
        if (connectionQueue == null || connectionQueue.size() == 0) {
            connectionQueue = new LinkedList<Connection>();
            for (int i = 0; i < 5; i++) {
                try {
                    Connection connection = DriverManager.getConnection(prop.getProperty("url").toString(),
                            prop.getProperty("username").toString(), prop.getProperty("password").toString());
                    connectionQueue.add(connection);
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
        return connectionQueue.poll();
    }

    /**
     * Return a connection to the pool.
     */
    public void returnConnection(Connection connection) {
        connectionQueue.add(connection);
    }
}

 

Scala c3p0 version:

 

package cn.yuysy.hotnews.realtime.db

import java.io.{File, FileInputStream, InputStream}
import java.sql.Connection
import java.util.Properties

import com.mchange.v2.c3p0.ComboPooledDataSource
import org.apache.spark.SparkFiles

/**
  * C3P0 database connection pool.
  * Created on 2018-11-15
  * @author tangzhi mail:[email protected]
  */
class c3p0ConnectionPool(isLocal: Boolean) extends Serializable {
  private val cpds: ComboPooledDataSource = new ComboPooledDataSource(true)
  private val prop = new Properties()
  private var in: InputStream = _

  isLocal match {
    case true  => in = getClass.getResourceAsStream("db.properties")                   // local IDEA mode
    case false => in = new FileInputStream(new File(SparkFiles.get("db.properties")))  // Spark cluster mode on Linux
  }

  // Configure the pooled data source from db.properties
  try {
    prop.load(in)
    cpds.setJdbcUrl(prop.getProperty("url").toString())
    cpds.setDriverClass(prop.getProperty("driverName").toString())
    cpds.setUser(prop.getProperty("username").toString())
    cpds.setPassword(prop.getProperty("password").toString())
    cpds.setMaxPoolSize(Integer.valueOf(prop.getProperty("maxPoolSize").toString()))
    cpds.setMinPoolSize(Integer.valueOf(prop.getProperty("minPoolSize").toString()))
    cpds.setAcquireIncrement(Integer.valueOf(prop.getProperty("acquireIncrement").toString()))
  } catch {
    case ex: Exception => ex.printStackTrace()
  }

  def getConnection: Connection = {
    try {
      cpds.getConnection()
    } catch {
      case ex: Exception => ex.printStackTrace()
        null
    }
  }

  object c3p0ConnectionPool {
    var connectionPool: c3p0ConnectionPool = _
    def getc3p0ConnectionPool(isLocal: Boolean): c3p0ConnectionPool = {
      synchronized {
        if (connectionPool == null) {
          connectionPool = new c3p0ConnectionPool(isLocal)
        }
      }
      connectionPool
    }
  }
}

 

Q&A

Q1: Running the Spark Streaming program locally against Kafka fails with:

Exception in thread "main" java.lang.AssertionError: assertion failed: No output streams registered, so nothing to execute at scala

A1: No output operation (action) was registered on the DStream; common output operations are shown below.

[Figure 7: common Spark Streaming output operations]

Q2: Data written to MySQL by the Spark Streaming job comes out garbled.

A2: Append ?useUnicode=true&characterEncoding=utf8 to the JDBC URL in the database configuration file.

2. Real-time analysis approach + partial code

Real-time analysis approach:

Read from Kafka (_._2) -----> news records -----> map each value into a Map[String,String] -----> split, group by key, aggregate -----> build the SQL per key -----> write into MySQL
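The job below keeps a running count per title in a testweblogs table. The original does not show its schema, but a minimal sketch would be:

CREATE TABLE testweblogs (
  contenttitle VARCHAR(255) NOT NULL PRIMARY KEY,
  `count` BIGINT DEFAULT 0
) DEFAULT CHARSET = utf8;

Note that the job issues UPDATE statements only, so rows for the titles must already exist; the commented-out INSERT in the code is the alternative for first-time writes.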

Partial code:

 

package cn.yuysy.hotnews.realtime

import java.io.{File, FileInputStream}
import java.sql.{Connection, Statement}
import java.util.Properties

import cn.yuysy.hotnews.realtime.db.c3p0ConnectionPool
import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Spark Streaming: pull data from Kafka, compute in real time, then sink to MySQL.
  * Created on 2018-11-15
  * @author tangzhi mail:[email protected]
  */
object HotnewsRealTimeStreaming {
  // Merge the counts of the current batch with the accumulated state per key
  val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => {
    iter.flatMap { case (x, y, z) => Some(y.sum + z.getOrElse(0)).map(i => (x, i)) }
  }

  def main(args: Array[String]): Unit = {
    // Set the log level (custom utility, not shown here)
    LoggerLevels.setStreamingLogLevels()
    val prop = new Properties()
    prop.load(new FileInputStream(new File("C:\\Users\\Administrator\\Hot_News\\src\\main\\scala\\cn\\yuysy\\hotnews\\realtime\\sparkstreaming-kafka.properties")))
    val sparkConf = new SparkConf().setAppName("HotnewsRealTimeStreaming").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(3))
    // Checkpoint directory (required by updateStateByKey)
    ssc.checkpoint("d://ck")
    // Kafka topics (possibly more than one)
    val topicMap = prop.getProperty("topics").split(",").map((_, prop.getProperty("numThreads").toInt)).toMap
    // Periodically pull batches of news records from Kafka; keep only the message value
    val line = KafkaUtils.createStream(ssc, prop.getProperty("zkQuorum"), prop.getProperty("group"), topicMap, StorageLevel.MEMORY_AND_DISK_SER).map(_._2)

    /**
      * Pull data from Kafka in real time and store the results in MySQL.
      */
    // Connection and statement used when writing to MySQL
    var conn: Connection = null
    var smt: Statement = null

    // 1. Parse one news record ("k1=v1|k2=v2|...") into a Map[String,String], e.g. ["datetime" -> "2018-11-18"]
    def valueSplit(value: String): Map[String, String] = {
      val x = value.split("\\|")
      var valueMap: Map[String, String] = Map()
      x.foreach(kvs => {
        val kv = kvs.split("=")
        if (kv != null && kv.length == 2) {
          valueMap += (kv(0) -> kv(1))
        }
      })
      valueMap
    }

    /**
      * Real-time computation: count titles and write the running totals to MySQL.
      */
    line.flatMap(_.split("\\|")).map(valueSplit).map(x => {
      (x.getOrElse("contenttitle", null), 1)
    }).updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true).map(data => {
      if (data._1 != null) {
        try {
          conn = c3p0ConnectionPool.getc3p0ConnectionPool(true).getConnection
          conn.setAutoCommit(false)
          smt = conn.createStatement()
          val updateSql = "UPDATE testweblogs SET count = '" + data._2 + "' where contenttitle = '" + data._1 + "'"
          // val insertSql = "INSERT INTO testweblogs(contenttitle,count) VALUES ('" + data._1 + "','" + data._2 + "')"
          smt.execute(updateSql)
          conn.commit()
        } catch {
          case ex: Exception => ex.printStackTrace()
        } finally {
          if (smt != null) {
            smt.close()
          }
          if (conn != null) {
            conn.close()
          }
        }
      }
    }).print()
    ssc.start()
    ssc.awaitTermination()
  }
}

object c3p0ConnectionPool {
  var connectionPool: c3p0ConnectionPool = _
  def getc3p0ConnectionPool(isLocal: Boolean): c3p0ConnectionPool = {
    synchronized {
      if (connectionPool == null) {
        connectionPool = new c3p0ConnectionPool(isLocal)
      }
    }
    connectionPool
  }
}

 

 

Part 4: Offline Analysis with Kafka + HBase + Hive (Hadoop)

1. The ingestion into HBase (via Flume, Part 2) is already complete.

2. Mapping HBase into Hive

Approach: create an external Hive table weblogs that maps onto the corresponding HBase table weblogs.

      

CREATE EXTERNAL TABLE weblogs(`id` string, `datatime` string, `url` string, `docno` string, `contenttitle` string, `content` string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'   -- storage handler for HBase-backed tables
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,info:datatime,info:url,info:docno,info:contenttitle,info:content")   -- column family / column mapping
TBLPROPERTIES ("hbase.table.name" = "weblogs", "hbase.mapred.output.outputtable" = "weblogs");
-- hbase.table.name declares the HBase table name; it is optional and defaults to the Hive table name.
-- hbase.mapred.output.outputtable names the table that inserts are written to; set it if you ever insert data through this table.

 

[Figure 8]

[Figure 9]

Offline analysis approach:

Data flows into HBase at high speed ----> it is exposed to Hive through the external table ----> SQL queries produce the offline statistics ----> results are written to MySQL at a chosen interval ----> front-end display
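For example, a per-title count along these lines could feed the offline statistics (a sketch; the project's actual queries depend on its reporting requirements):

-- top 10 most frequent news titles in the weblogs external table
SELECT contenttitle, COUNT(*) AS cnt
FROM weblogs
GROUP BY contenttitle
ORDER BY cnt DESC
LIMIT 10;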

Partial code:

Note: to access Hive remotely from an IDE, the hiveserver2 service must be running; otherwise the connection is refused.

Start command: ./hive --service hiveserver2
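The offline job below reads its Hive connection settings from another db.properties. A minimal sketch assuming the HiveServer2 JDBC driver (host, port, database, and credentials are placeholders):

driverName=org.apache.hive.jdbc.HiveDriver
url=jdbc:hive2://192.168.25.136:10000/default
user=hive
password=hive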

package cn.yusys.hotnews.offline;

import cn.yusys.hotnews.dataconnection.ConnectionPool;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.sql.*;
import java.util.Properties;

/**
 * @ClassName HotnewsOfflineStreaming
 * @Description Reads aggregated data from Hive and stores it in MySQL
 * @Author tangzhi mail:[email protected]
 * Created on 2018-11-20
 **/
public class HotnewsOfflineStreaming {
    public static void main(String[] args) throws IOException {
        Connection connection = null, connection1 = null;
        Statement statement = null, statement1 = null;
        Properties prop = new Properties();
        prop.load(new FileInputStream(new File("C:\\Users\\Administrator\\Hot_News\\src\\main\\java\\cn\\yusys\\hotnews\\offline\\db.properties")));
        /**
         * 1. Read from Hive
         */
        try {
            Class.forName(prop.getProperty("driverName").toString());
            connection = DriverManager.getConnection(prop.getProperty("url").toString(), prop.getProperty("user").toString(), prop.getProperty("password").toString());
            connection.setAutoCommit(false);
            statement = connection.createStatement();
            String hiveSql = "select count(1) from weblogs";
            ResultSet resultSet = statement.executeQuery(hiveSql);
            while (resultSet.next()) {
                // Adapt the SQL to the project's actual reporting needs
                int count = resultSet.getInt(1);
                /**
                 * 2. Store the aggregated value in MySQL
                 */
                ConnectionPool connPool = new ConnectionPool();
                connection1 = connPool.getConnection();
                connection1.setAutoCommit(false);
                statement1 = connection1.createStatement();
                String sql = "INSERT INTO testweblogs2(count) VALUES('" + count + "')";
                boolean execute = statement1.execute(sql);
                connection1.commit();
                if (!execute) {
                    // execute() returns false for an insert/update, true when a ResultSet is returned
                    System.out.println("Offline analysis database updated successfully");
                } else {
                    System.out.println("Offline analysis database update failed");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release resources
            if (connection != null && connection1 != null && statement != null && statement1 != null) {
                try {
                    statement.close();
                    statement1.close();
                    connection.close();
                    connection1.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

 

This concludes the big data analysis and processing part; what remains is to build the JavaEE project for the web front-end display.

 

 

Part 5: JavaEE Front-End Display (SSM)

1. Setting up the SSM project (omitted)

2. Use WebSocket full-duplex communication so the front end and the back end exchange data in real time.

Q&A

Q1: Data fails to load when testing the SSM setup.

A1: 1. If no explicit mapper.xml location is configured, keep the Mapper interface and its Mapper.xml file in the same package.

2. Spring may also be failing to load the mapper.xml files; in that case add the following to the project's pom.xml:

<resources>
    <resource>
        <directory>src/main/java</directory>
        <includes>
            <include>**/*.xml</include>
        </includes>
    </resource>
    <resource>
        <directory>src/main/resources</directory>
    </resource>
</resources>

Q2: Errors when communicating over WebSocket.

A2: 1. In Spring MVC mode, confirm that the endpoint class is annotated with @ServerEndpoint(value = "/websocket", configurator = SpringConfigurator.class).

2. Check the server version requirements:

[Figure 10: server version requirements for WebSocket]

3. The dependency should be declared in pom.xml as follows; if the provided scope is omitted it causes a runtime conflict, which can also show up as 404 Not Found:

<dependency>
    <groupId>javax.websocket</groupId>
    <artifactId>javax.websocket-api</artifactId>
    <version>1.0</version>
    <scope>provided</scope>
</dependency>

 

Part of the main code:

package service.impl;

import com.alibaba.fastjson.JSON;
import entity.Weblog;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.socket.server.standard.SpringConfigurator;
import service.WeblogService;

import javax.websocket.OnClose;
import javax.websocket.OnMessage;
import javax.websocket.OnOpen;
import javax.websocket.Session;
import javax.websocket.server.ServerEndpoint;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @ClassName WebSocket
 * @Description Full-duplex communication between the front end and the database
 * @Author Administrator
 * @Version 1.0
 **/
@ServerEndpoint(value = "/websocket", configurator = SpringConfigurator.class)
public class WebSocket {
    @Autowired
    private WeblogService weblogService;

    @OnMessage
    public void onMessage(String message, Session session) throws IOException, InterruptedException {
        String[] titleNames = new String[10];
        Long[] titleCounts = new Long[10];
        Long[] titleSum = new Long[1];
        while (true) {
            Map<String, Object> map = new HashMap<String, Object>();
            List<Weblog> weblogList = weblogService.webcount();
            System.out.println(weblogList);
            // Collect the top titles and their counts (at most 10)
            for (int i = 0; i < weblogList.size() && i < 10; i++) {
                titleNames[i] = weblogList.get(i).getContenttitle();
                titleCounts[i] = weblogList.get(i).getCount();
            }
            titleSum[0] = weblogService.websum();
            map.put("titleName", titleNames);
            map.put("titleCount", titleCounts);
            map.put("titleSum", titleSum);
            System.out.print(map);
            // Push the latest statistics to the client as JSON, once per second
            session.getBasicRemote().sendText(JSON.toJSONString(map));
            Thread.sleep(1000);
            map.clear();
        }
    }

    @OnOpen
    public void onOpen() {
        System.out.println("Client connected");
    }

    @OnClose
    public void onClose() {
        System.out.println("Connection closed");
    }
}

 

With that, the front-end display project is finished as well, which means phase one of the project is complete.

Note: the original project contributor's article on the Alibaba Cloud Yunqi community: https://yq.aliyun.com/articles/557454
