Table of Contents
- Crawler code
  - Monitor
  - Catcher
  - pom.xml
- Starting the crawler
- Flume configuration file
- Flume startup command
- Kafka commands
- Creating the HBase-backed table in Hive
- Spark Streaming
  - SparkStreamTest
  - pom.xml
  - Startup command
Crawler code
Monitor
package ln;

import java.io.File;

// Background thread that watches Data.txt and deletes it once it exceeds ~100 MB,
// so the crawler's output file does not grow without bound.
public class Monitor extends Thread {
    @Override
    public void run() {
        super.run();
        File file = new File("Data.txt");
        while (true) {
            System.out.println("File size: " + file.length() + " bytes");
            if (file.length() >= 100000000) {
                boolean delete = file.delete();
                if (delete)
                    System.out.println("Deleted successfully");
            }
            try {
                sleep(2000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
Catcher
package ln;

import com.alibaba.fastjson.JSONObject;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;

public class Catcher {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Eastmoney list API: returns every stock's code (f12) and market flag (f13).
        String StockListURL = "http://43.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=10000&po=0&np=1&fltt=2&invt=2&fid=f12&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23&fields=f12,f13";
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        // Start the file-size monitor so Data.txt is removed once it grows too large.
        Monitor monitor = new Monitor();
        monitor.start();
        String contentAsString = webClient.getPage(StockListURL).getWebResponse().getContentAsString();
        Object[] objects = JSONObject.parseObject(contentAsString).getJSONObject("data").getJSONArray("diff").toArray();
        for (Object object : objects) {
            String F12 = JSONObject.parseObject(object.toString()).getString("f12"); // stock code
            String F13 = JSONObject.parseObject(object.toString()).getString("f13"); // market flag used in secid
            // Daily K-line history for one stock, identified by secid = market.code
            String DataURL = "http://push2his.eastmoney.com/api/qt/stock/kline/get?fields1=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13&fields2=f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61&beg=0&end=20500101&rtntype=6&secid=" + F13 + "." + F12 + "&klt=101&fqt=1";
            String contentAsString1 = webClient.getPage(DataURL).getWebResponse().getContentAsString();
            Thread.sleep(5000); // throttle requests
            // Append one JSON record per line; the Flume agent tails this file.
            FileUtils.writeStringToFile(new File("Data.txt"), contentAsString1 + "\n", "UTF-8", true);
        }
    }
}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>Spider</artifactId>
  <version>1.0-SNAPSHOT</version>
  <name>Spider</name>
  <url>http://www.example.com</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>
  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.49.1</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.76</version>
    </dependency>
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.8.0</version>
    </dependency>
  </dependencies>
  <build>
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
Starting the crawler
nohup java -jar xxxx.jar > /opt/spiderlog.file 2>&1 &
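This assumes xxxx.jar was built as an executable jar (for example with a Main-Class entry in its manifest); otherwise java -jar has no entry point. A minimal sanity check, assuming the crawler runs from /home/server so that Data.txt lands where the Flume agent below tails it:
# Hedged sanity check: crawler process alive, output file growing
ps -ef | grep "[j]ava -jar"
ls -lh /home/server/Data.txt
tail -n 1 /home/server/Data.txt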
Flume configuration file
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Source: tail the crawler's output file
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/server/Data.txt

# Sink: forward each line as a Kafka message on topic "mytopic"
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = mytopic
a1.sinks.k1.kafka.bootstrap.servers = server1:9092,server2:9092,server3:9092
a1.sinks.k1.kafka.flumeBatchSize = 20

# Channel: in-memory buffer between source and sink
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Flume startup command
cd /opt/flume
nohup bin/flume-ng agent --conf conf --conf-file conf/example.conf --name a1 -Dflume.root.logger=INFO,console > /opt/flumelog.file 2>&1 &
Kafka commands
# Start a broker (run on each Kafka node)
bin/kafka-server-start.sh config/server.properties &
# Create and inspect the topic
bin/kafka-topics.sh --create --topic mytopic --bootstrap-server localhost:9092
bin/kafka-topics.sh --describe --topic mytopic --bootstrap-server localhost:9092
# Console producer/consumer for manual testing
bin/kafka-console-producer.sh --topic mytopic --bootstrap-server localhost:9092
bin/kafka-console-consumer.sh --topic mytopic --from-beginning --bootstrap-server localhost:9092
# List all topics
bin/kafka-topics.sh --list --bootstrap-server localhost:9092
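Once the Spark Streaming job below is running with group.id 1, the consumer-groups tool gives a quick view of its read position; this is a hedged extra check, not part of the original command list:
# Show per-partition offsets for the streaming job's consumer group ("1")
bin/kafka-consumer-groups.sh --bootstrap-server localhost:9092 --describe --group 1
Note that because the job sets enable.auto.commit to false and never commits offsets manually, the CURRENT-OFFSET column may show "-".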
Creating the HBase-backed table in Hive
CREATE TABLE StockInfo(
  stockkey string,
  stockmarket string,
  stockcode string,
  stockname string,
  stockdate string,
  stockopen string,
  stockend string,
  highest string,
  lowest string,
  ts string,
  tn string,
  zf string,
  zdf string,
  zde string,
  ch string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" =
":key,info:stockmarket,info:stockcode,info:stockname,info:stockdate,info:stockopen,info:stockend,info:highest,info:lowest,info:ts,info:tn,info:zf,info:zdf,info:zde,info:ch")
TBLPROPERTIES ("hbase.table.name" = "StockInfo");
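A quick end-to-end check of the mapping, assuming the hbase and hive CLIs are on the PATH and the Spark job below has already written some rows (a sketch, not from the original post):
# Peek at raw HBase rows, then read the same data through the Hive table
echo "scan 'StockInfo', {LIMIT => 2}" | hbase shell
hive -e "SELECT stockkey, stockdate, stockopen, stockend FROM StockInfo LIMIT 10;"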
Spark Streaming
SparkStreamTest
import com.alibaba.fastjson.JSON.parseObject
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

object SparkStreamTest {

  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    println("Starting Spark Streaming job")
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("StreamingKafkaTest")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.checkpoint("hdfs://server1:9000/spark-checkpoint")

    val kafkaTopic = Array("mytopic")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "server1:9092,server2:9092,server3:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "1",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Direct stream from Kafka; each record value is one JSON document written by the crawler.
    val inputStream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        Subscribe[String, String](kafkaTopic, kafkaParams))
    val dataDStream = inputStream.map(record => (record.key, record.value)).map(_._2)

    dataDStream.foreachRDD { y => y.foreach { x =>
      val data = parseObject(x).getJSONObject("data")
      val market = data.getString("market")
      val code = data.getString("code")
      val name = data.getString("name")
      val day_klines = data.getJSONArray("klines")

      // One HBase connection per record keeps the example simple; batching per partition would be cheaper.
      val hbaseConf = HBaseConfiguration.create()
      hbaseConf.set("hbase.zookeeper.quorum", "10.206.0.6")
      hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
      val conn = ConnectionFactory.createConnection(hbaseConf)
      val tableName = TableName.valueOf("StockInfo")
      val table = conn.getTable(tableName)

      // Each kline entry is a comma-separated string: date, open, close, high, low, volume, turnover, ...
      day_klines.forEach { data => {
        val s = data.toString.split(",")
        val date = s(0)
        val open = s(1)
        val end = s(2)
        val highest = s(3)
        val lowest = s(4)
        val ts = s(5)
        val tn = s(6)
        val zf = s(7)
        val zdf = s(8)
        val zde = s(9)
        val ch = s(10)
        // Row key "market.code,date" matches the :key mapping of the Hive table.
        val putin = new Put(Bytes.toBytes(market + "." + code + "," + date))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockname"), Bytes.toBytes(name))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockmarket"), Bytes.toBytes(market))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockcode"), Bytes.toBytes(code))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockdate"), Bytes.toBytes(date))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockopen"), Bytes.toBytes(open))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockend"), Bytes.toBytes(end))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("highest"), Bytes.toBytes(highest))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("lowest"), Bytes.toBytes(lowest))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("ts"), Bytes.toBytes(ts))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("tn"), Bytes.toBytes(tn))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zf"), Bytes.toBytes(zf))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zdf"), Bytes.toBytes(zdf))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zde"), Bytes.toBytes(zde))
        putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("ch"), Bytes.toBytes(ch))
        table.put(putin)
      }}
      table.close()
      conn.close()
    }}

    ssc.start()
    ssc.awaitTermination()
  }
}
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>SparkProgram</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.12.10</scala.version>
  </properties>
  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
  </repositories>
  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>
  <dependencies>
    <dependency>
      <groupId>org.scala-tools</groupId>
      <artifactId>maven-scala-plugin</artifactId>
      <version>2.12</version>
    </dependency>
    <dependency>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-eclipse-plugin</artifactId>
      <version>2.5.1</version>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.4.13</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>1.4.13</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.76</version>
    </dependency>
  </dependencies>
  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.8</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
    </plugins>
  </build>
  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>
Startup command
nohup bin/spark-submit --class SparkStreamTest /home/server/SparkProgram.jar > /opt/sparkstreamlog 2>&1 &
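The command above assumes SparkProgram.jar bundles its non-Spark dependencies (fastjson, the HBase client and their transitive jars); with a plain application jar they would have to be supplied via --jars or added to the executors' classpath. To watch the job once it is submitted:
# Follow the streaming job's log (path taken from the command above)
tail -f /opt/sparkstreamlog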