Name | Version |
---|---|
Hadoop | 2.8.5 |
Hive | 2.1.0 |
Spark | 1.6.3 |
Kafka | 2.10-0.8.2.1 |
MariaDB (MySQL) | 5.5.64 |
Scala | 2.10.6 |
Java | 1.8.0_25 |
Zookeeper | 3.4.12 |
core-site.xml
<configuration>
  <!-- Fencing method for the two NameNodes: sshfence requires passwordless SSH between them -->
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>sshfence</value>
  </property>
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/root/.ssh/id_rsa</value>
  </property>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://ns1</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/programs/hadoop-2.8.5/data/tmp</value>
  </property>
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>hadoop1:2181,hadoop2:2181,hadoop3:2181</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>
</configuration>
hdfs-site.xml
<configuration>
  <!-- Logical name for the whole HDFS nameservice -->
  <property>
    <name>dfs.nameservices</name>
    <value>ns1</value>
  </property>
  <!-- The NameNodes that belong to this nameservice -->
  <property>
    <name>dfs.ha.namenodes.ns1</name>
    <value>nn1,nn2</value>
  </property>
  <!-- RPC address of nn1 -->
  <property>
    <name>dfs.namenode.rpc-address.ns1.nn1</name>
    <value>hadoop1:9000</value>
  </property>
  <!-- RPC address of nn2 -->
  <property>
    <name>dfs.namenode.rpc-address.ns1.nn2</name>
    <value>hadoop3:9000</value>
  </property>
  <!-- HTTP address of nn1 -->
  <property>
    <name>dfs.namenode.http-address.ns1.nn1</name>
    <value>hadoop1:50070</value>
  </property>
  <!-- HTTP address of nn2 -->
  <property>
    <name>dfs.namenode.http-address.ns1.nn2</name>
    <value>hadoop3:50070</value>
  </property>
  <!-- Shared edits directory exposed by the JournalNodes -->
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://hadoop1:8485;hadoop2:8485;hadoop3:8485/ns1</value>
  </property>
  <!-- Local directory where the JournalNodes store the edits -->
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/opt/programs/hadoop-2.8.5/data/tmp/dfs/jn</value>
  </property>
  <!-- Failover proxy provider used by HDFS clients -->
  <property>
    <name>dfs.client.failover.proxy.provider.ns1</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
</configuration>
yarn-site.xml
<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>
</configuration>
hadoop-env.sh
export JAVA_HOME=/usr/local/java/jdk1.8.0_25
export HADOOP_PREFIX=/opt/programs/hadoop-2.8.5
mapred-site.xml
<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>
slaves
hadoop1
hadoop2
hadoop3
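With the Hadoop configuration above distributed to all three nodes, the HA cluster needs a one-time format before the first start. A minimal sketch of that sequence, assuming ZooKeeper is already running and using the hostnames and paths from the files above:

# 1. On every node: start the JournalNode (ZooKeeper must already be up)
$HADOOP_HOME/sbin/hadoop-daemon.sh start journalnode

# 2. On hadoop1 (nn1): format HDFS and the ZKFC znode, then start the NameNode
$HADOOP_HOME/bin/hdfs namenode -format
$HADOOP_HOME/bin/hdfs zkfc -formatZK
$HADOOP_HOME/sbin/hadoop-daemon.sh start namenode

# 3. On hadoop3 (nn2): copy the metadata from nn1 instead of formatting again
$HADOOP_HOME/bin/hdfs namenode -bootstrapStandby

# 4. Back on hadoop1: bring up the rest of HDFS (DataNodes, ZKFCs) and YARN
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh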
hive-env.sh
export JAVA_HOME=/usr/local/java/jdk1.8.0_25
export HADOOP_HOME=/opt/programs/hadoop-2.8.5
export HIVE_HOME=/opt/programs/apache-hive-2.1.0-bin
hive-site.xml
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://hadoop3:3306/hive?createDatabaseIfNotExist=true</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
  </property>
  <property>
    <name>hive.exec.scratchdir</name>
    <value>/tmp/hive</value>
  </property>
  <property>
    <name>hive.exec.local.scratchdir</name>
    <value>/tmp/hive/local</value>
    <description>Local scratch space for Hive jobs</description>
  </property>
  <property>
    <name>hive.downloaded.resources.dir</name>
    <value>/tmp/hive/resources</value>
    <description>Temporary local directory for added resources in the remote file system.</description>
  </property>
  <property>
    <name>hive.querylog.location</name>
    <value>/tmp/hive/querylog</value>
    <description>Location of Hive run time structured log file</description>
  </property>
  <property>
    <name>hive.server2.logging.operation.log.location</name>
    <value>/tmp/hive/operation_logs</value>
    <description>Top level directory where operation logs are stored if logging functionality is enabled</description>
  </property>
  <property>
    <name>hive.execution.engine</name>
    <value>spark</value>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://hadoop3:9083</value>
    <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
  </property>
  <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
    <description>Port number of HiveServer2 Thrift interface. Can be overridden by setting $HIVE_SERVER2_THRIFT_PORT</description>
  </property>
</configuration>
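Before the first start, the metastore schema has to be created in the MariaDB instance on hadoop3 and the two Hive services brought up. A minimal sketch, assuming the MySQL JDBC driver jar is already in $HIVE_HOME/lib (the log file paths are illustrative):

# Initialize the metastore schema in MySQL/MariaDB
$HIVE_HOME/bin/schematool -dbType mysql -initSchema

# Start the metastore (thrift://hadoop3:9083) and HiveServer2 (port 10000)
nohup $HIVE_HOME/bin/hive --service metastore > /tmp/metastore.log 2>&1 &
nohup $HIVE_HOME/bin/hive --service hiveserver2 > /tmp/hiveserver2.log 2>&1 &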
spark-env.sh
export JAVA_HOME=/usr/local/java/jdk1.8.0_25
export SCALA_HOME=/opt/programs/scala-2.10.6
export HADOOP_HOME=/opt/programs/hadoop-2.8.5
export HADOOP_CONF_DIR=/opt/programs/hadoop-2.8.5/etc/hadoop
#export SPARK_MASTER_HOST=hadoop1
export SPARK_WORKER_MEMORY=1g
export SPARK_WORKER_CORES=2
export SPARK_HOME=/opt/programs/spark-1.6.3-bin-hadoop2.6
export SPARK_DIST_CLASSPATH=$(/opt/programs/hadoop-2.8.5/bin/hadoop classpath)
export SPARK_DAEMON_JAVA_OPTS="
-Dspark.deploy.recoveryMode=ZOOKEEPER
-Dspark.deploy.zookeeper.url=hadoop1,hadoop2,hadoop3
-Dspark.deploy.zookeeper.dir=/spark"
server.properties
zookeeper.connection.timeout.ms=6000
broker.id=3
log.dirs=/opt/programs/kafka_2.10-0.8.2.1/logs
listeners=PLAINTEXT://192.168.1.25:9092
zookeeper.connect=192.168.1.23:2181,192.168.1.24:2181,192.168.1.25:2181
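Each broker gets its own broker.id and listener address. Once every node is configured, start the brokers and create the topic that the producer and the streaming job use later (the partition and replication counts below are illustrative):

# On every Kafka node: start the broker in the background
$KAFKA_HOME/bin/kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties

# Create the test1213 topic used by the producer and the streaming job
$KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper hadoop1:2181,hadoop2:2181,hadoop3:2181 \
  --replication-factor 1 --partitions 1 --topic test1213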
zoo.cfg
# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
# do not use /tmp for storage, /tmp here is just
# example sakes.
dataDir=/opt/programs/zookeeper-3.4.12/data
dataLogDir=/opt/programs/zookeeper-3.4.12/logs
# the port at which the clients will connect
clientPort=2181
# the maximum number of client connections.
# increase this if you need to handle more clients
#maxClientCnxns=60
#
# Be sure to read the maintenance section of the
# administrator guide before turning on autopurge.
#
# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
#autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable auto purge feature
#autopurge.purgeInterval=1
server.1=hadoop1:2888:3888
server.2=hadoop2:2888:3888
server.3=hadoop3:2888:3888
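Each server also needs a myid file in dataDir whose number matches its server.N entry above. A minimal sketch of finishing the ZooKeeper setup:

# On hadoop1 / hadoop2 / hadoop3 respectively: write the matching id (1, 2 or 3)
echo 1 > /opt/programs/zookeeper-3.4.12/data/myid

# Start the quorum member on every node and check its role
/opt/programs/zookeeper-3.4.12/bin/zkServer.sh start
/opt/programs/zookeeper-3.4.12/bin/zkServer.sh status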
/etc/profile
export JAVA_HOME=/usr/local/java/jdk1.8.0_25
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin
export HADOOP_HOME=/opt/programs/hadoop-2.8.5
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
export ZOOKEEPER_HOME=/opt/programs/zookeeper-3.4.12
export PATH=$PATH:$ZOOKEEPER_HOME/bin
export SCALA_HOME=/opt/programs/scala-2.10.6
export PATH=$PATH:$SCALA_HOME/bin
export SPARK_HOME=/opt/programs/spark-1.6.3-bin-hadoop2.6
export PATH=$PATH:$SPARK_HOME/bin
export FLUME_HOME=/opt/programs/apache-flume-1.6.0-bin
export PATH=$PATH:$FLUME_HOME/bin
export KAFKA_HOME=/opt/programs/kafka_2.10-0.8.2.1
export PATH=$PATH:$KAFKA_HOME/bin
export HIVE_HOME=/opt/programs/apache-hive-2.1.0-bin
export PATH=$PATH:$HIVE_HOME/bin
movies
package main.scala.com.hopu.wash

import main.scala.com.hopu.caseclass.Movies
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext

object movies {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf and set the application name
    val conf1 = new SparkConf().setAppName("ratings data").setMaster("local[2]")
    // 2. Create the SparkContext, the entry point for submitting a Spark application
    val sc = new SparkContext(conf1)
    // Optionally lower the log level
    // sc.setLogLevel("warn")
    val hiveContext = new HiveContext(sc)
    hiveContext.sql("create table if not exists movies(movieId String,title String,genres String)")
    val miniPartitions = 6
    import hiveContext.implicits._
    // 3. Read the raw movies file
    val data: RDD[String] = sc.textFile("/data/movies.txt", miniPartitions)
    // Titles that contain commas are quoted, so split those lines on the quote instead
    val split = data.map(x => {
      var y = x.split(",")
      if (x.contains("\"")) {
        y = x.split("\"")
      }
      y
    })
    // Drop records with empty fields, map to the Movies case class and write into Hive
    val df = split.filter(x => {
      var tf = true
      for (y <- x) {
        if (y.equals(" ") || y.length == 0) {
          tf = false
        }
      }
      tf
    }).map(x => Movies(x(0).trim.replace(",", ""), x(1), x(2).replace(",", ""))).toDF()
    // df.write.mode(SaveMode.Overwrite).parquet("/tmp/movies")
    df.write.insertInto("movies")
    // hiveContext.sql("load data inpath '/tmp/movies' into table movies")
  }
}
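A sketch of submitting the cleaning job, assuming the project is packaged into a jar named recommend.jar (the jar name is illustrative):

# The job sets its own master (local[2]), so only the class and the jar are needed
$SPARK_HOME/bin/spark-submit --class main.scala.com.hopu.wash.movies recommend.jar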
-- Sample 60% of ratings into the training set ratx
create table ratx as select * from ratings tablesample(60 percent);
-- The remaining 40% (ratings minus ratx) becomes the test set ratc3
create table ratc3 like ratings;
insert overwrite table ratc3
select ratings.* from ratings
left join ratx on ratx.userid = ratings.userid and ratx.movieid = ratings.movieid and ratx.timestamp = ratings.timestamp
where ratx.timestamp is null;
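The real-time job further down also selects from a top50 table for new users, which is not defined here. One possible way to build it, assuming top50 means the 50 most-rated movies:

# Assumed definition of top50: the 50 movies with the most ratings
hive -e "
create table if not exists top50 as
select m.movieid, m.title, m.genres, count(*) as cnt
from movies m join ratings r on m.movieid = r.movieid
group by m.movieid, m.title, m.genres
order by cnt desc
limit 50;"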
package main.scala.com.hopu.myals

import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

// Model training
object train {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf and set the application name
    val conf1 = new SparkConf().setAppName("ALS train 3.0").setMaster("spark://hadoop1:7077")
    // 2. Create the SparkContext, the entry point for submitting a Spark application
    val sc = new SparkContext(conf1)
    val hiveContext = new HiveContext(sc)
    // Training set
    val dfratx = hiveContext.sql("select * from ratx")
    val ratx = sc.broadcast(dfratx) // broadcast variable
    // Test set
    val dfratc = hiveContext.sql("select * from ratc3")
    val ratc = sc.broadcast(dfratc)
    val ratings = ratx.value.map(r => Rating(r.getString(0).toInt, r.getString(1).toInt, r.getString(2).toDouble))
    val ratingsCe = ratc.value.map(r => Rating(r.getString(0).toInt, r.getString(1).toInt, r.getString(2).toDouble))
    // Build the recommendation model using ALS, with a small grid search over lambda and iterations
    val lambdas = Array(0.04, 0.2)
    val rank = 1
    val numIterations = Array(30)
    var numIter = 0
    var lamb: Double = 0
    var bestmodel: MatrixFactorizationModel = null
    var MSE: Double = 2019.1211 // sentinel value meaning "not evaluated yet"
    for (lam <- lambdas) {
      for (iter <- numIterations) {
        val model = ALS.train(ratings, rank, iter, lam)
        // Evaluate the model on the test set
        val usersProducts = ratingsCe.map { case Rating(user, product, rate) =>
          (user, product)
        }
        val predictions = model.predict(usersProducts).map { case Rating(user, product, rate) =>
          ((user, product), rate)
        }
        // Mean squared error on the test set
        val ratesAndPreds = ratingsCe.map { case Rating(user, product, rate) =>
          ((user, product), rate)
        }.join(predictions)
        val theMSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
          val err = (r1 - r2)
          err * err
        }.mean()
        if (MSE == 2019.1211 || theMSE <= MSE) {
          MSE = theMSE
          numIter = iter
          lamb = lam
          bestmodel = model
          model.save(sc, "/myals/Model" + MSE.toString + numIter.toString + lamb.toString)
        }
      }
    }
    // Save the best model; it can be reloaded later with MatrixFactorizationModel.load
    // val nowTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date())
    bestmodel.save(sc, "/myals3/bestModel" + MSE.toString + numIter.toString + lamb.toString)
    // val sameModel = MatrixFactorizationModel.load(sc, "/myals2/bestModel")
  }
}
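A sketch of submitting the training job to the standalone cluster; the jar name and resource sizes are illustrative:

$SPARK_HOME/bin/spark-submit \
  --class main.scala.com.hopu.myals.train \
  --master spark://hadoop1:7077 \
  --executor-memory 1g \
  --total-executor-cores 2 \
  recommend.jar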
package main.scala.com.hopu.myjdbc

import java.util.Properties
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object PredictToMysql {
  def main(args: Array[String]): Unit = {
    movieToUser()
  }

  // Write the top-5 recommendations for every known user into MariaDB
  def movieToUser(): Unit = {
    // Requires the MariaDB (or MySQL) JDBC driver on the classpath
    val url = "jdbc:mariadb://192.168.1.25:3306/test"
    val table = "recommend"
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")
    val conf1 = new SparkConf().setAppName("recommend to all user 1.0").setMaster("spark://hadoop1:7077")
    val sc = new SparkContext(conf1)
    val hiveContext = new HiveContext(sc)
    val sqlContext = new SQLContext(sc)
    val sameModel = MatrixFactorizationModel.load(sc, "/myals3/bestModel0.7675729805190136200.04")
    val alluser = hiveContext.sql("select distinct(userid) from ratings")
    val allusers = alluser.rdd.toLocalIterator
    import sqlContext.implicits._
    while (allusers.hasNext) {
      try {
        var userid = allusers.next().getString(0).toInt
        var ratings = sameModel.recommendProducts(userid, 5)
        for (r <- ratings) {
          var mname = hiveContext.sql(s"select * from movies where movieid='${r.product}'").first()
          var thedf = Seq((userid, mname.getString(0).toInt, mname.getString(1), mname.getString(2))).toDF("userid", "movieid", "title", "category")
          thedf.write.mode("Append").jdbc(url, table, properties)
        }
      } catch {
        case e: Exception => println("Something went wrong for this user: " + e.getMessage)
      }
    }
  }
}
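Because this job writes through JDBC, the connector jar has to be shipped along with it. A sketch, with an illustrative connector path/version and jar name:

$SPARK_HOME/bin/spark-submit \
  --class main.scala.com.hopu.myjdbc.PredictToMysql \
  --master spark://hadoop1:7077 \
  --jars /opt/jars/mariadb-java-client-1.5.9.jar \
  recommend.jar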
Producer
package main.scala.com.hopu.kfk

import java.util.Properties
import org.apache.kafka.clients.producer.KafkaProducer
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object MyProducer {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf and set the application name
    val conf1 = new SparkConf().setAppName("ALS").setMaster("local[2]")
    // 2. Create the SparkContext, the entry point for submitting a Spark application
    val sc = new SparkContext(conf1)
    val hiveContext = new HiveContext(sc)
    // Replay 2000 test-set ratings as "userid|movieid|rating" messages
    val ratc = hiveContext.sql("select * from ratc3 limit 2000")
    val topic = "test1213"
    val iter = ratc.map(x => (topic, x.getString(0) + "|" + x.getString(1) + "|" + x.getString(2))).toLocalIterator
    val props = new Properties()
    props.put("bootstrap.servers", "hadoop2:9092")
    props.put("acks", "all")
    props.put("delivery.timeout.ms", "30000")
    props.put("batch.size", "16384")
    props.put("linger.ms", "1")
    props.put("buffer.memory", "33554432")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    while (iter.hasNext) {
      // One message per second; the topic name doubles as the record key
      var msm = iter.next()
      producer.send(new ProducerRecord[String, String](topic, msm._1, msm._2))
      Thread.sleep(1000)
    }
    producer.close()
  }
}
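To check that the messages actually arrive before starting the streaming job, tail the topic with the console consumer:

$KAFKA_HOME/bin/kafka-console-consumer.sh --zookeeper hadoop1:2181 --topic test1213 --from-beginning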
Real-time recommendation
package main.scala.com.hopu.kfk

import java.util.Properties
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Duration, StreamingContext}

object Show {
  def main(args: Array[String]): Unit = {
    val url = "jdbc:mariadb://192.168.1.25:3306/test"
    val table = "recommend2"
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")
    // Duration wraps a time interval; its unit is milliseconds
    val conf = new SparkConf().setAppName("SparkDirectStream").setMaster("local[2]")
    val batchDuration = new Duration(1000)
    val ssc = new StreamingContext(conf, batchDuration)
    val hc = new HiveContext(ssc.sparkContext)
    val modelpath = "/myals3/bestModel0.7675729805190136200.04"
    val broker = "hadoop3:9092"
    val topics = "test1213".split(",").toSet
    val kafkaParams = Map("bootstrap.servers" -> "hadoop2:9092")
    val model = MatrixFactorizationModel.load(ssc.sparkContext, modelpath)
    val kafkaDirectStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    kafkaDirectStream.foreachRDD { rdd =>
      // Messages are "userid|movieid|rating"; "|" must be escaped because split takes a regex
      val userrdd = rdd.map(x => x._2.split("\\|")(0).toInt)
      // Iterate on the driver to avoid serializing the model and HiveContext into the executors;
      // recommendations are produced per element and written out (a cache such as Redis could be
      // used instead for lower latency).
      val validusersIter = userrdd.toLocalIterator
      import hc.implicits._
      while (validusersIter.hasNext) {
        val userid = validusersIter.next
        val us: DataFrame = hc.sql(s"select count(*) from ratings where userid=${userid}")
        if (us.first().getLong(0) == 0L) {
          // New user: sample roughly 5 movies from the top50 table
          val results = hc.sql("select * from top50 tablesample(10 percent)")
          println("Recommended movies:")
          results.show()
        } else {
          val recresult = model.recommendProducts(userid, 5)
          println("Recommended movies:")
          for (r <- recresult) {
            println("#####" + r.product + "#####")
            var thedf = Seq((userid, r.product, r.rating)).toDF("userid", "movieid", "rating")
            thedf.write.mode("Append").jdbc(url, table, properties)
          }
        }
      }
    }
    ssc.start()
    ssc.awaitTermination()