[Spark] A Spark-Based Movie Recommendation System (Detailed Code and Configuration)

A Spark-Based Movie Recommendation System

Project Architecture

(Project architecture diagram)

Component Versions and Configuration

Name Version
Hadoop 2.8.5
Hive 2.1.0
Spark 1.6.3
Kafka 2.10-0.8.2.1
MariaDB (MySQL) 5.5.64
Scala 2.10.6
Java 1.8.0_25
ZooKeeper 3.4.12

Hadoop Configuration

core-site.xml

<configuration>
  <!-- Fencing method for the two NameNodes: sshfence -->
  <!-- This method requires passwordless SSH login between the nodes -->
  <property>
    <name>dfs.ha.fencing.methods</name>
    <value>sshfence</value>
  </property>
  <property>
    <name>dfs.ha.fencing.ssh.private-key-files</name>
    <value>/root/.ssh/id_rsa</value>
  </property>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://ns1</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/programs/hadoop-2.8.5/data/tmp</value>
  </property>
  <property>
    <name>ha.zookeeper.quorum</name>
    <value>hadoop1:2181,hadoop2:2181,hadoop3:2181</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>
</configuration>

hdfs-site.xml

<configuration>
  <!-- Logical name for the whole HDFS nameservice -->
  <property>
    <name>dfs.nameservices</name>
    <value>ns1</value>
  </property>
  <!-- NameNodes that belong to this nameservice -->
  <property>
    <name>dfs.ha.namenodes.ns1</name>
    <value>nn1,nn2</value>
  </property>
  <!-- RPC address of nn1 -->
  <property>
    <name>dfs.namenode.rpc-address.ns1.nn1</name>
    <value>hadoop1:9000</value>
  </property>
  <!-- RPC address of nn2 -->
  <property>
    <name>dfs.namenode.rpc-address.ns1.nn2</name>
    <value>hadoop3:9000</value>
  </property>
  <!-- HTTP address of nn1 -->
  <property>
    <name>dfs.namenode.http-address.ns1.nn1</name>
    <value>hadoop1:50070</value>
  </property>
  <!-- HTTP address of nn2 -->
  <property>
    <name>dfs.namenode.http-address.ns1.nn2</name>
    <value>hadoop3:50070</value>
  </property>
  <!-- Shared edits log published by the JournalNodes -->
  <property>
    <name>dfs.namenode.shared.edits.dir</name>
    <value>qjournal://hadoop1:8485;hadoop2:8485;hadoop3:8485/ns1</value>
  </property>
  <!-- Local directory where the JournalNodes store the edits -->
  <property>
    <name>dfs.journalnode.edits.dir</name>
    <value>/opt/programs/hadoop-2.8.5/data/tmp/dfs/jn</value>
  </property>
  <!-- Failover proxy provider used by HDFS clients -->
  <property>
    <name>dfs.client.failover.proxy.provider.ns1</name>
    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
</configuration>

yarn-site.xml

<configuration>
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>
</configuration>

hadoop-env.sh

export JAVA_HOME=/usr/local/java/jdk1.8.0_25
export HADOOP_PREFIX=/opt/programs/hadoop-2.8.5

mapred-site.xml

<configuration>
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
</configuration>

slaves

hadoop1
hadoop2
hadoop3

Hive Configuration

hive-env.sh

export JAVA_HOME=/usr/local/java/jdk1.8.0_25
export HADOOP_HOME=/opt/programs/hadoop-2.8.5
export HIVE_HOME=/opt/programs/apache-hive-2.1.0-bin

hive-site.xml

<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://hadoop3:3306/hive?createDatabaseIfNotExist=true</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.jdbc.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
  </property>

  <property>
    <name>hive.exec.scratchdir</name>
    <value>/tmp/hive</value>
  </property>
  <property>
    <name>hive.exec.local.scratchdir</name>
    <value>/tmp/hive/local</value>
    <description>Local scratch space for Hive jobs</description>
  </property>
  <property>
    <name>hive.downloaded.resources.dir</name>
    <value>/tmp/hive/resources</value>
    <description>Temporary local directory for added resources in the remote file system.</description>
  </property>
  <property>
    <name>hive.querylog.location</name>
    <value>/tmp/hive/querylog</value>
    <description>Location of Hive run time structured log file</description>
  </property>
  <property>
    <name>hive.server2.logging.operation.log.location</name>
    <value>/tmp/hive/operation_logs</value>
    <description>Top level directory where operation logs are stored if logging functionality is enabled</description>
  </property>
  <property>
    <name>hive.execution.engine</name>
    <value>spark</value>
  </property>
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://hadoop3:9083</value>
    <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
  </property>
  <property>
    <name>hive.server2.thrift.port</name>
    <value>10000</value>
    <description>Port number of HiveServer2 Thrift interface. Can be overridden by setting $HIVE_SERVER2_THRIFT_PORT</description>
  </property>
</configuration>

Spark Configuration

spark-env.sh

export JAVA_HOME=/usr/local/java/jdk1.8.0_25
export SCALA_HOME=/opt/programs/scala-2.10.6
export HADOOP_HOME=/opt/programs/hadoop-2.8.5
export HADOOP_CONF_DIR=/opt/programs/hadoop-2.8.5/etc/hadoop
#export SPARK_MASTER_HOST=hadoop1
export SPARK_WORKER_MEMORY=1g
export SPARK_WORKER_CORES=2
export SPARK_HOME=/opt/programs/spark-1.6.3-bin-hadoop2.6
export SPARK_DIST_CLASSPATH=$(/opt/programs/hadoop-2.8.5/bin/hadoop classpath)
export SPARK_DAEMON_JAVA_OPTS="
-Dspark.deploy.recoveryMode=ZOOKEEPER 
-Dspark.deploy.zookeeper.url=hadoop1,hadoop2,hadoop3
-Dspark.deploy.zookeeper.dir=/spark"
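
With ZooKeeper recovery enabled like this, a driver can list every standby master in its master URL and fail over automatically. Below is a minimal sketch, assuming hadoop1 and hadoop3 run the two standalone masters (the file above only hints at hadoop1 via the commented SPARK_MASTER_HOST line, so treat the host list as an assumption):

import org.apache.spark.{SparkConf, SparkContext}

// Sketch only: hadoop1/hadoop3 as the two masters is an assumption, adjust to your cluster.
object HAConnectExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("ha connect example")
      .setMaster("spark://hadoop1:7077,hadoop3:7077") // comma-separated list of HA masters
    val sc = new SparkContext(conf)
    println(s"connected, defaultParallelism = ${sc.defaultParallelism}")
    sc.stop()
  }
}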

Kafka Configuration

server.properties

zookeeper.connection.timeout.ms=6000
broker.id=3
log.dirs=/opt/programs/kafka_2.10-0.8.2.1/logs
listeners=PLAINTEXT://192.168.1.25:9092
zookeeper.connect=192.168.1.23:2181,192.168.1.24:2181,192.168.1.25:2181

ZooKeeper Configuration

zoo.cfg

# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial 
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between 
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
# do not use /tmp for storage, /tmp here is just 
# example sakes.
dataDir=/opt/programs/zookeeper-3.4.12/data
dataLogDir=/opt/programs/zookeeper-3.4.12/logs
# the port at which the clients will connect
clientPort=2181
# the maximum number of client connections.
# increase this if you need to handle more clients
#maxClientCnxns=60
#
# Be sure to read the maintenance section of the 
# administrator guide before turning on autopurge.
#
# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
#autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable auto purge feature
#autopurge.purgeInterval=1
server.1=hadoop1:2888:3888
server.2=hadoop2:2888:3888
server.3=hadoop3:2888:3888

/etc/profile

export JAVA_HOME=/usr/local/java/jdk1.8.0_25
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=$PATH:$JAVA_HOME/bin

export HADOOP_HOME=/opt/programs/hadoop-2.8.5
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin

export ZOOKEEPER_HOME=/opt/programs/zookeeper-3.4.12
export PATH=$PATH:$ZOOKEEPER_HOME/bin

export SCALA_HOME=/opt/programs/scala-2.10.6
export PATH=$PATH:$SCALA_HOME/bin

export SPARK_HOME=/opt/programs/spark-1.6.3-bin-hadoop2.6
export PATH=$PATH:$SPARK_HOME/bin

export FLUME_HOME=/opt/programs/apache-flume-1.6.0-bin
export PATH=$PATH:$FLUME_HOME/bin

export KAFKA_HOME=/opt/programs/kafka_2.10-0.8.2.1
export PATH=$PATH:$KAFKA_HOME/bin

export HIVE_HOME=/opt/programs/apache-hive-2.1.0-bin
export PATH=$PATH:$HIVE_HOME/bin

Data Cleaning

movies

package main.scala.com.hopu.wash

import main.scala.com.hopu.caseclass.{Movies}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext

object movies {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf and set the application name
    val conf1 = new SparkConf().setAppName("ratings data").setMaster("local[2]").setSparkHome("/opt/programs/spark-1.6.3-bin-hadoop2.6")
    // 2. Create the SparkContext, the entry point for submitting a Spark application
    val sc = new SparkContext(conf1)
    // Create the HiveContext and the target Hive table if it does not exist yet
    val hiveContext=new HiveContext(sc)
    hiveContext.sql("create table if not exists movies(movieId String,title String,genres String)")

    val miniPartitions=6
    import hiveContext.implicits._
    //    sc.setLogLevel("warn")  // optionally reduce the log output level
    // 3. Read the raw movies data file
    val data: RDD[String] = sc.textFile("/data/movies.txt",miniPartitions)
    // Split each line on "," unless it contains quoted fields, in which case split on the quotes
    val split=data.map(x=>{
      var y=x.split(",")
      if(x.contains("\"")){
        y=x.split("\"")
      }
      y
    })
    // Drop rows with empty fields, map the rest to the Movies case class and build a DataFrame
    val df=split.filter(x=>{
      var tf=true
      for(y <- x){
        if(y.equals(" ")||y.length==0){
          tf=false
        }
      }
      tf
    }).map(x=>Movies(x(0).trim.replace(",",""),x(1),x(2).replace(",",""))).toDF()

//    df.write.mode(SaveMode.Overwrite).parquet("/tmp/movies")
    df.insertInto("movies")
//    hiveContext.sql("load data inpath '/tmp/movies' into table movies")
  }
}
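
The Movies case class imported above (main.scala.com.hopu.caseclass.Movies) is not shown in this post; a minimal sketch consistent with the movies table schema used here would be:

package main.scala.com.hopu.caseclass

// Sketch only: field names and types inferred from the Hive table definition above.
case class Movies(movieId: String, title: String, genres: String)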

Splitting the ratings table into a training set and a test set

-- Sample 60% of ratings into the training table ratx
create table ratx as select * from ratings tablesample(60 percent);

-- ratings minus ratx (the 60% sample) leaves the ~40% test set ratc3
create table ratc3 like ratings;
insert overwrite table ratc3 select ratings.* from ratings left join ratx on ratx.userid=ratings.userid and ratx.movieid=ratings.movieid and ratx.timestamp=ratings.timestamp where ratx.timestamp is null;
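
The same 60/40 split could also be done directly in Spark instead of Hive; below is a minimal sketch using DataFrame.randomSplit (the output table names ratx_spark and ratc_spark are hypothetical):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

// Sketch only: a randomSplit-based alternative to the Hive TABLESAMPLE approach above.
object SplitRatings {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("split ratings"))
    val hiveContext = new HiveContext(sc)
    val ratings = hiveContext.sql("select * from ratings")
    val Array(train, test) = ratings.randomSplit(Array(0.6, 0.4), seed = 42L)
    train.write.saveAsTable("ratx_spark") // hypothetical table name
    test.write.saveAsTable("ratc_spark")  // hypothetical table name
    sc.stop()
  }
}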

Training the Model

package main.scala.com.hopu.myals

import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.sql.hive.HiveContext
//训练
object train {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf and set the application name
    val conf1 = new SparkConf().setAppName("ALS train 3.0").setMaster("spark://hadoop1:7077")
    // 2. Create the SparkContext, the entry point for submitting a Spark application
    val sc = new SparkContext(conf1)

    val hiveContext=new HiveContext(sc)
    // Training set
    val dfratx=hiveContext.sql("select * from ratx")
    val ratx=sc.broadcast(dfratx) // broadcast variable
    // Test set
    val dfratc=hiveContext.sql("select * from ratc3")
    val ratc=sc.broadcast(dfratc)

    val ratings=ratx.value.map(r=>Rating(r.getString(0).toInt,r.getString(1).toInt,r.getString(2).toDouble))
    val ratingsCe=ratc.value.map(r=>Rating(r.getString(0).toInt,r.getString(1).toInt,r.getString(2).toDouble))
    // Build the recommendation model using ALS
    // Hyperparameter grid: regularization values and iteration counts to try
    val lambdas = Array(0.04, 0.2)
    val rank = 1
    val numIterations = Array(30)
    var numIter = 0
    var lamb: Double = 0
    var bestmodel: MatrixFactorizationModel = null
    var MSE: Double = 2019.1211 // sentinel value, replaced by the first real MSE
    for(lam<-lambdas){
      for(iter<-numIterations){
        val model = ALS.train(ratings, rank, iter, lam)
        // Evaluate the model on rating data
        val usersProducts = ratingsCe.map { case Rating(user, product, rate) =>
          (user, product)
        }
        val predictions =model.predict(usersProducts).map { case Rating(user, product, rate) =>
            ((user, product), rate)
          }
        // Compute the MSE (mean squared error) on the test set
        val ratesAndPreds = ratingsCe.map { case Rating(user, product, rate) =>
          ((user, product), rate)
        }.join(predictions)
        val theMSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
          val err = (r1 - r2)
          err * err
        }.mean()
        if(MSE==2019.1211||theMSE<=MSE){
          MSE=theMSE
          numIter=iter
          lamb=lam
          bestmodel=model
          model.save(sc,"/myals/Model"+MSE.toString+numIter.toString+lamb.toString)
        }
      }
    }
    // Save and load model
//    val nowTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date())
    bestmodel.save(sc, "/myals3/bestModel"+MSE.toString+numIter.toString+lamb.toString)
//    val sameModel = MatrixFactorizationModel.load(sc, "/myals2/bestModel")
  }
}
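
Once a model has been saved, it can be reloaded and re-evaluated without retraining; here is a minimal sketch that reuses the best-model path the offline job below also loads, and repeats the same MSE computation on ratc3:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}
import org.apache.spark.sql.hive.HiveContext

// Sketch only: reload a saved ALS model and recompute the MSE on the ratc3 test set.
object EvaluateSavedModel {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("evaluate saved ALS model"))
    val hiveContext = new HiveContext(sc)
    val model = MatrixFactorizationModel.load(sc, "/myals3/bestModel0.7675729805190136200.04")

    val test = hiveContext.sql("select * from ratc3")
      .map(r => Rating(r.getString(0).toInt, r.getString(1).toInt, r.getString(2).toDouble))
    val predictions = model.predict(test.map(r => (r.user, r.product)))
      .map { case Rating(user, product, rate) => ((user, product), rate) }
    val mse = test.map { case Rating(user, product, rate) => ((user, product), rate) }
      .join(predictions)
      .map { case (_, (actual, predicted)) => (actual - predicted) * (actual - predicted) }
      .mean()
    println(s"test MSE = $mse")
    sc.stop()
  }
}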

Offline Recommendation

package main.scala.com.hopu.myjdbc
import java.util.Properties

import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext}

import scala.util.control.Exception
import util.control.Breaks._

object PredictToMysql {
  def main(args: Array[String]): Unit = {
    movieToUser()
  }

  def movieToUser(): Unit ={
    val url = "jdbc:mariadb://192.168.1.25:3306/test"
    val table = "recommend"
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")

    val conf1 = new SparkConf().setAppName("recommend to all user 1.0").setMaster("spark://hadoop1:7077")
    val sc = new SparkContext(conf1)
    val hiveContext=new HiveContext(sc)
    val sqlContext=new SQLContext(sc)
    val sameModel = MatrixFactorizationModel.load(sc, "/myals3/bestModel0.7675729805190136200.04")

    val alluser=hiveContext.sql("select distinct(userid) from ratings")
    val allusers=alluser.rdd.toLocalIterator
    import sqlContext.implicits._

    // For every known user, recommend the top 5 movies and append them to MariaDB
    while (allusers.hasNext){
      try{
        var userid=allusers.next().getString(0).toInt
        var ratings=sameModel.recommendProducts(userid,5)
        for(r<-ratings){
            var mname=hiveContext.sql(s"select * from movies where movieid='${r.product}'").first()
            var thedf = Seq((userid, mname.getString(0).toInt,mname.getString(1),mname.getString(2))).toDF("userid", "movieid","title","category")
            thedf.write.mode("Append").jdbc(url,table,properties)
        }
      }catch {
        case e: Exception => println(s"Failed to write recommendations for this user: ${e.getMessage}")
      }
    }
  }

}
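
To check what the job wrote, the recommend table can be read back over the same JDBC connection; a minimal sketch (reusing the connection settings above, with the MariaDB JDBC driver on the classpath):

import java.util.Properties
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

// Sketch only: read the recommend table back from MariaDB and print a few rows.
object CheckRecommendTable {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("check recommend table").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")
    val recommendations = sqlContext.read.jdbc("jdbc:mariadb://192.168.1.25:3306/test", "recommend", properties)
    println(s"rows written: ${recommendations.count()}")
    recommendations.show(10)
    sc.stop()
  }
}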

Real-Time Recommendation

Producer

package main.scala.com.hopu.kfk

import java.util.Properties

import main.scala.com.hopu.wash.MyConf
import org.apache.kafka.clients.producer.KafkaProducer
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object MyProducer{
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkConf and set the application name
    val conf1 = new SparkConf().setAppName("ALS").setMaster("local[2]")
    // 2. Create the SparkContext, the entry point for submitting a Spark application
    val sc = new SparkContext(conf1)

    val hiveContext=new HiveContext(sc)
    val ratc=hiveContext.sql("select * from ratc3 limit 2000")
    val topic="test1213"
    val iter=ratc.map(x=>(topic,x.getString(0)+"|"+x.getString(1)+"|"+x.getString(2))).toLocalIterator

    val props = new Properties()
    props.put("bootstrap.servers", "hadoop2:9092")
    props.put("acks", "all")
    props.put("delivery.timeout.ms", "30000")
    props.put("batch.size", "16384")
    props.put("linger.ms", "1")
    props.put("buffer.memory", "33554432")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    while (iter.hasNext){
      var msm=iter.next()
      producer.send(new ProducerRecord[String, String](topic,msm._1,msm._2))
      Thread.sleep(1000)
    }
    producer.close()
  }
}

Consumer (real-time recommendation)

package main.scala.com.hopu.kfk

import java.util.Properties

import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}

object Show {
  def main(args: Array[String]): Unit = {
    val url = "jdbc:mariadb://192.168.1.25:3306/test"
    val table = "recommend2"
    val properties = new Properties()
    properties.setProperty("user", "root")
    properties.setProperty("password", "123456")
    // Duration wraps a time interval; its unit is milliseconds
    val conf = new SparkConf().setAppName("SparkDirectStream").setMaster("local[2]")
    val batchDuration = new Duration(1000)
    val ssc = new StreamingContext(conf, batchDuration)
    val hc = new HiveContext(ssc.sparkContext)

    val modelpath = "/myals3/bestModel0.7675729805190136200.04"
    val broker = "hadoop3:9092"
    val topics = "test1213".split(",").toSet
    val kafkaParams = Map("bootstrap.servers" -> "hadoop2:9092")
    val model = MatrixFactorizationModel.load(ssc.sparkContext, modelpath)
    val kafkaDirectStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    kafkaDirectStream.foreachRDD { rdd =>

      // Each message is "userid|movieid|rating", so split on the literal pipe character
      val userrdd = rdd.map(x => x._2.split("\\|")(0).toInt)

      // A local iterator avoids serialization problems with objects that cannot be serialized.
      // Recommendations are produced for each element of the RDD as it arrives and written out
      // (here to MariaDB; Redis or another fast cache would also work) to keep latency low.

      val validusersIter = userrdd.toLocalIterator

      import hc.implicits._

      while (validusersIter.hasNext) {
        val userid=validusersIter.next
        val us:DataFrame=hc.sql(s"select count(*) from ratings where userid=${userid}")
        if(us.first().getLong(0)==0){
          // New user: sample a handful of movies from the precomputed top50 table
          val results=hc.sql("select * from top50 tablesample(10 percent)")
          println("Recommended movies:")
          results.show()
        }else{
          val recresult = model.recommendProducts(userid, 5)
          println("推荐如下电影 :")
          for(r<-recresult){
            println("#####"+r.product+"#####")
            var thedf = Seq((userid, r.product,r.rating)).toDF("userid","movieid","rating")
            thedf.write.mode("Append").jdbc(url,table,properties)
          }
        }
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}