Flume + Kafka + Spark Streaming + HBase

Table of Contents

    • Crawler Code
      • Monitor
      • Catcher
      • pom.xml
      • Starting the Crawler
    • Flume Configuration File
    • Starting Flume
    • Kafka Commands
    • Creating the HBase-Backed Table in Hive
    • Spark Streaming
      • SparkStreamTest
      • pom.xml
      • Launch Command

Crawler Code

Monitor

package ln;

import java.io.File;

public class Monitor extends Thread{
    @Override
    public void run() {
        super.run();
        File file = new File("Data.txt");
        while (true) {
            System.out.println("File size: " + file.length() + " bytes");
            // file.length() returns the size in bytes (1 MB = 1,048,576 bytes)
            if (file.length() >= 100000000) {
                boolean deleted = file.delete();
                if (deleted)
                    System.out.println("Data.txt deleted");
            }
            try {
                sleep(2000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}

Catcher

package ln;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;

public class Catcher {
    public static void main(String[] args) throws IOException, InterruptedException {
        // URL of the full stock list (Eastmoney API)
        String StockListURL="http://43.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=10000&po=0&np=1&fltt=2&invt=2&fid=f12&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23&fields=f12,f13";
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        Monitor monitor = new Monitor();
        monitor.start();
        // Fetch the stock list as a JSON string
        String contentAsString = webClient.getPage(StockListURL).getWebResponse().getContentAsString();
        Object[] objects = JSONObject.parseObject(contentAsString).getJSONObject("data").getJSONArray("diff").toArray();
        for (Object object : objects) {
            String F12 = JSONObject.parseObject(object.toString()).getString("f12");
            String F13 = JSONObject.parseObject(object.toString()).getString("f13");
            String DataURL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?fields1=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13&fields2=f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61&beg=0&end=20500101&rtntype=6&secid="+F13+"."+F12+"&klt=101&fqt=1";
            // Raw daily-kline JSON for this stock
            String contentAsString1 = webClient.getPage(DataURL).getWebResponse().getContentAsString();
            // Throttle requests, then append one JSON document per line to the file Flume tails
            Thread.sleep(5000);
            FileUtils.writeStringToFile(new File("Data.txt"), contentAsString1 + "\n", "UTF-8", true);
        }
    }
}
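Both endpoints return plain JSON, so HtmlUnit's browser emulation is mostly overhead here. An optional tweak (not part of the original code) is to disable JavaScript and CSS processing on the WebClient before fetching:

// Optional: the Eastmoney endpoints serve plain JSON, so skip JS/CSS processing
WebClient webClient = new WebClient(BrowserVersion.CHROME);
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);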

pom.xml



<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>Spider</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>Spider</name>
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.49.1</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.76</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.8.0</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>

Starting the Crawler

nohup java -jar xxxx.jar > /opt/spiderlog.file  2>&1 &
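The crawler's stdout/stderr goes to the log file given above, so progress can be followed with:

tail -f /opt/spiderlog.file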

Flume Configuration File

a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/server/Data.txt
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = mytopic
a1.sinks.k1.kafka.bootstrap.servers = server1:9092,server2:9092,server3:9092
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
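The memory channel above is fast but drops any buffered events if the agent is killed. If durability matters more than throughput, a file channel can be swapped in; a minimal sketch, with placeholder directories:

a1.channels.c1.type = file
a1.channels.c1.checkpointDir = /opt/flume/checkpoint
a1.channels.c1.dataDirs = /opt/flume/data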

Starting Flume

cd /opt/flume
nohup bin/flume-ng agent --conf conf --conf-file conf/example.conf --name a1 -Dflume.root.logger=INFO,console > /opt/flumelog.file  2>&1 &

Kafka Commands

bin/kafka-server-start.sh config/server.properties &
bin/kafka-topics.sh --create --topic mytopic --bootstrap-server localhost:9092
bin/kafka-topics.sh --describe --topic mytopic --bootstrap-server localhost:9092
bin/kafka-console-producer.sh --topic mytopic --bootstrap-server localhost:9092
bin/kafka-console-consumer.sh --topic mytopic --from-beginning --bootstrap-server localhost:9092
bin/kafka-topics.sh --list --bootstrap-server localhost:9092

Creating the HBase-Backed Table in Hive

CREATE TABLE StockInfo(
stockkey string,
stockmarket string,
stockcode string,
stockname string,
stockdate string,
stockopen string,
stockend string,
highest string,
lowest string,
ts string,
tn string,
zf string,
zdf string,
zde string,
ch string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" =
":key,info:stockmarket,info:stockcode,info:stockname,info:stockdate,info:stockopen,info:stockend,info:highest,info:lowest,info:ts,info:tn,info:zf,info:zdf,info:zde,info:ch")
TBLPROPERTIES ("hbase.table.name" = "StockInfo");
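After the Spark Streaming job has written a few rows into HBase, the mapping can be sanity-checked from Hive; a minimal query against the table defined above:

SELECT stockkey, stockname, stockdate, stockopen, stockend
FROM StockInfo
LIMIT 10;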

Spark Streaming

SparkStreamTest

import com.alibaba.fastjson.JSON.parseObject
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming._
import org.apache.spark.SparkConf
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies, LocationStrategy}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

import scala.collection.mutable.ArrayBuffer

object SparkStreamTest {
  Logger.getLogger("org").setLevel(Level.ERROR);
  def main(args: Array[String]): Unit = {
    println("spark启动中")
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("StreamingKafkaTest")
    val ssc = new StreamingContext(conf,Seconds(1))
    ssc.checkpoint("hdfs://server1:9000/spark-checkpoint")
    val kafkaTopic = Array("mytopic")
    val kafkaParams = Map[String,Object](
      "bootstrap.servers" -> "server1:9092,server2:9092,server3:9092",
      "key.deserializer"->classOf[StringDeserializer],
      "value.deserializer"->classOf[StringDeserializer],
      "group.id"->"1",
      "enable.auto.commit"->(false : java.lang.Boolean)
    )


    val inputStream:InputDStream[ConsumerRecord[String,String]]= KafkaUtils.createDirectStream[String,String](ssc, LocationStrategies.PreferConsistent, Subscribe[String,String](kafkaTopic,kafkaParams))
    val dataDStream = inputStream.map(record =>(record.key,record.value)).map(_._2)
    // For each record, parse the JSON payload and write its kline entries into HBase
    dataDStream.foreachRDD{y=>y.foreach{x=>
      val data = parseObject(x).getJSONObject("data")
      val market = data.getString("market")
      val code = data.getString("code")
      val name = data.getString("name")
      val day_klines = data.getJSONArray("klines")
      val hbaseConf = HBaseConfiguration.create()
      hbaseConf.set("hbase.zookeeper.quorum","10.206.0.6")
      hbaseConf.set("hbase.zookeeper.property.clientPort","2181")
      val conn = ConnectionFactory.createConnection(hbaseConf)
      val tableName = TableName.valueOf("StockInfo")
      val table = conn.getTable(tableName)
      day_klines.forEach{data=>{
        // Each kline entry is a comma-separated string in the field order requested via fields2
        val s = data.toString.split(",")
        val date = s(0)
        val open = s(1)
        val end = s(2)
        val highest = s(3)
        val lowest  = s(4)
        val ts  = s(5)
        val tn	= s(6)
        val zf	= s(7)
        val zdf	= s(8)
        val zde	= s(9)
        val ch= s(10)

        val putin = new Put(Bytes.toBytes(market+"."+code+","+date))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockname"), Bytes.toBytes(name))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockmarket"), Bytes.toBytes(market))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockcode"), Bytes.toBytes(code))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockdate"), Bytes.toBytes(date))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockopen"), Bytes.toBytes(open))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockend"), Bytes.toBytes(end))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("highest"), Bytes.toBytes(highest))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("lowest"), Bytes.toBytes(lowest))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("ts"), Bytes.toBytes(ts))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("tn"), Bytes.toBytes(tn))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zf"), Bytes.toBytes(zf))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zdf"), Bytes.toBytes(zdf))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zde"), Bytes.toBytes(zde))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("ch"), Bytes.toBytes(ch))

        table.put(putin)
      }}
      // Release the per-record HBase handles
      table.close()
      conn.close()
    }}
    ssc.start()
    ssc.awaitTermination()
  }
}
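SparkStreamTest opens a new HBase connection for every Kafka record, which works but is expensive. A common refinement is to open one connection per partition and batch the Puts; the sketch below assumes the same ZooKeeper quorum and StockInfo table as above, and takes the record-to-Put parsing as a caller-supplied function:

import java.util.{ArrayList => JArrayList}

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.spark.streaming.dstream.DStream

object HBaseSinkSketch {
  // Write each micro-batch with one HBase connection per partition instead of one per record.
  // buildPuts turns one Kafka record (the JSON string) into the Puts to store; it must be serializable.
  def writePerPartition(dataDStream: DStream[String], buildPuts: String => Seq[Put]): Unit = {
    dataDStream.foreachRDD { rdd =>
      rdd.foreachPartition { records =>
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.zookeeper.quorum", "10.206.0.6")
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
        val conn = ConnectionFactory.createConnection(hbaseConf)
        val table = conn.getTable(TableName.valueOf("StockInfo"))
        try {
          val puts = new JArrayList[Put]()
          records.foreach(record => buildPuts(record).foreach(puts.add))
          if (!puts.isEmpty) table.put(puts) // one batched write per partition
        } finally {
          table.close()
          conn.close()
        }
      }
    }
  }
}

In SparkStreamTest, the per-record parsing block could be wrapped into such a buildPuts function and the foreach-per-record write replaced by this per-partition version.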

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>SparkProgram</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.12.10</scala.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.scala-tools</groupId>
      <artifactId>maven-scala-plugin</artifactId>
      <version>2.12</version>
    </dependency>
    <dependency>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-eclipse-plugin</artifactId>
      <version>2.5.1</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.4.13</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>1.4.13</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.76</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <!-- Scala 2.12 only targets Java 8 bytecode -->
            <arg>-target:jvm-1.8</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
    </plugins>
  </build>
  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>

Launch Command

nohup bin/spark-submit --class SparkStreamTest /home/server/SparkProgram.jar > /opt/sparkstreamlog 2>&1 &
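Note that the pom above does not shade dependencies into the jar, so non-Spark libraries (the HBase client, fastjson, the Kafka integration) are not on the classpath by default. If the jar is not built as a fat jar, one option is to let spark-submit resolve them by Maven coordinates; a sketch using the versions from the pom:

nohup bin/spark-submit --class SparkStreamTest \
  --packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.1.2,org.apache.hbase:hbase-client:1.4.13,com.alibaba:fastjson:1.2.76 \
  /home/server/SparkProgram.jar > /opt/sparkstreamlog 2>&1 &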
