1、Spark机器学习库特点
[Estimator]
运行在包含了feature和label(结果)的dataFrame之上,对数据进行训练创建model。
该模型用于以后的预测。
[Transformer]
将包含feature的DataFrame变换成包含预测结果的DataFrame。由Estimator训练得到的model本身就是一个Transformer。
[Parameter]
Estimator和Transformer使用的参数,通常和具体的机器学习算法相关。Spark为各种算法的参数设置提供了统一的API。
[Pipeline]
将Estimators和Transformers组合在一起,形成机器学习工作流。
机器学习应用步骤
1.读取数据文件形成训练数据框
2.创建模型(LinearRegression)并设置参数
3.对训练数据进行模型拟合,完成评估管线.
4.创建包含测试数据的DataFrame,通常包含feature和label,可以通过比较预测标签和测试标签来确认model是否可用。
5.使用模型,对测试数据进行变换(应用模型),抽取feature ,label,prediction.
2、案例
在pom.xml中添加以下依赖
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>2.1.0</version>
</dependency>
2.1 线性回归模型对白酒质量进行预测
/**
* Created by Administrator on 2017/4/8.
*/
//评估酒的质量
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Demo: scores a few hand-written wine samples with a linear regression
 * model that is loaded from disk.
 */
object SparkMLDemo1 {

  /**
   * Loads the saved model, builds a three-row in-memory test DataFrame and
   * prints the features, label and prediction of every row.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ml").master("local[4]").getOrCreate()
    // Only needed by the disabled training step below.
    val sparkCtx = spark.sparkContext

    /*
    // ---- One-off training step (disabled) ----
    // Enable it once to fit the model and write it to disk; later runs only load it.

    // Location of the data file
    val dataDir = "file:///D:/downloads/bigdata/ml/winequality-white.csv"
    // Case class describing one record of the file
    case class Wine(FixedAcidity: Double, VolatileAcidity: Double,
                    CitricAcid: Double, ResidualSugar: Double, Chlorides: Double,
                    FreeSulfurDioxide: Double, TotalSulfurDioxide: Double, Density: Double,
                    PH: Double, Sulphates: Double, Alcohol: Double, Quality: Double)
    // Split every line on ';' and convert the 12 fields to a Wine
    val wineDataRDD = sparkCtx.textFile(dataDir).map(_.split(";")).map(w => Wine(w(0).toDouble, w(1).toDouble,
      w(2).toDouble, w(3).toDouble, w(4).toDouble, w(5).toDouble, w(6).toDouble, w(7).toDouble, w(8).toDouble
      , w(9).toDouble, w(10).toDouble, w(11).toDouble))
    import spark.implicits._
    // Turn the RDD into a DataFrame of (label, features)
    val trainingDF = wineDataRDD.map(w => (w.Quality,
      Vectors.dense(w.FixedAcidity, w.VolatileAcidity, w.CitricAcid,
        w.ResidualSugar, w.Chlorides, w.FreeSulfurDioxide, w.TotalSulfurDioxide,
        w.Density, w.PH, w.Sulphates, w.Alcohol))).toDF("label", "features")
    // Show the training data
    trainingDF.show()
    println("======================")
    // Create the linear regression estimator
    val lr = new LinearRegression()
    // Maximum number of iterations
    lr.setMaxIter(50)
    // Fit the training data, producing the model
    val model = lr.fit(trainingDF)
    // Persist the model (i.e. its parameters)
    model.save("file:///d:/scala/model");
    */

    // Load the model written by the training step above.
    val model = LinearRegressionModel.load("file:///d:/scala/model")

    // In-memory test rows: (label, 11-element feature vector).
    val samples = Seq(
      (5.0, Vectors.dense(7.4, 0.7, 0.0, 1.9, 0.076, 25.0, 67.0, 0.9968, 3.2, 0.68, 9.8)),
      (5.0, Vectors.dense(7.8, 0.88, 0.0, 2.6, 0.098, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4)),
      (7.0, Vectors.dense(7.3, 0.65, 0.0, 1.2, 0.065, 15.0, 18.0, 0.9968, 3.36, 0.57, 9.5))
    )
    val testDF = spark.createDataFrame(samples).toDF("label", "features")
    testDF.show()

    // Register the test rows as a temporary view named "test".
    testDF.createOrReplaceTempView("test")
    println("======================")

    // Apply the model and keep only the columns of interest.
    val scored = model.transform(testDF)
    val tested = scored.select("features", "label", "prediction")
    tested.show()
  }
}
2.2 逻辑回归模型用于白酒的分类
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
/**
 * Demo: binary classification of white wines with logistic regression.
 * A wine whose quality is below 7 gets label 0.0, any other wine gets 1.0.
 */
object LogicRegressWineClassifyDemo {

  /**
   * Trains a logistic regression model on the wine file and prints its
   * predictions for four in-memory samples, first with and then without labels.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ml").master("local[4]").getOrCreate()
    val sparkCtx = spark.sparkContext

    // Location of the data file
    val dataDir = "file:///D:/downloads/bigdata/ml/winequality-white.csv"

    // One record of the file: eleven measurements followed by the quality score.
    case class Wine(FixedAcidity: Double, VolatileAcidity: Double,
                    CitricAcid: Double, ResidualSugar: Double, Chlorides: Double,
                    FreeSulfurDioxide: Double, TotalSulfurDioxide: Double, Density: Double,
                    PH: Double, Sulphates: Double, Alcohol: Double, Quality: Double)

    // Split every line on ';' and convert the first 12 fields to doubles.
    val wineDataRDD = sparkCtx.textFile(dataDir).map { line =>
      val f = line.split(";")
      Wine(f(0).toDouble, f(1).toDouble, f(2).toDouble, f(3).toDouble,
        f(4).toDouble, f(5).toDouble, f(6).toDouble, f(7).toDouble,
        f(8).toDouble, f(9).toDouble, f(10).toDouble, f(11).toDouble)
    }

    import spark.implicits._

    // Build the (label, features) training DataFrame.
    val labelled = wineDataRDD.map { wine =>
      val label = if (wine.Quality < 7) 0D else 1D
      val features = Vectors.dense(wine.FixedAcidity, wine.VolatileAcidity, wine.CitricAcid,
        wine.ResidualSugar, wine.Chlorides, wine.FreeSulfurDioxide, wine.TotalSulfurDioxide,
        wine.Density, wine.PH, wine.Sulphates, wine.Alcohol)
      (label, features)
    }
    val trainingDF = labelled.toDF("label", "features")

    // Logistic regression with at most 10 iterations and regularisation 0.01.
    val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
    val model = lr.fit(trainingDF)

    // Hand-written test rows: (label, features).
    val samples = Seq(
      (1.0, Vectors.dense(6.1, 0.32, 0.24, 1.5, 0.036, 43, 140, 0.9894, 3.36, 0.64, 10.7)),
      (0.0, Vectors.dense(5.2, 0.44, 0.04, 1.4, 0.036, 38, 124, 0.9898, 3.29, 0.42, 12.4)),
      (0.0, Vectors.dense(7.2, 0.32, 0.47, 5.1, 0.044, 19, 65, 0.9951, 3.38, 0.36, 9)),
      (0.0, Vectors.dense(6.4, 0.595, 0.14, 5.2, 0.058, 15, 97, 0.991, 3.03, 0.41, 12.6))
    )
    val testDF = spark.createDataFrame(samples).toDF("label", "features")

    // Show the test data.
    testDF.show()
    println("========================")

    // Predict the labelled rows so label and prediction can be compared side by side.
    testDF.createOrReplaceTempView("test")
    val tested = model.transform(testDF).select("features", "label", "prediction")
    tested.show()
    println("========================")

    // Predict rows that carry no label: only the features column is selected.
    val predictDF = spark.sql("SELECT features FROM test")
    val predicted = model.transform(predictDF).select("features", "prediction")
    predicted.show()
  }
}
2.3 采用逻辑回归模型对垃圾邮件过滤
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, RegexTokenizer, StopWordsRemover, Tokenizer, Word2Vec}
/**
 * Demo: trains a small spam classifier as a Tokenizer -> HashingTF ->
 * LogisticRegression pipeline and applies it to messages it has not seen.
 */
object SpamFilterDemo1 {

  /**
   * Fits the pipeline on eight labelled messages, then prints the predicted
   * label for five unlabelled test messages.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sess = SparkSession.builder().appName("ml").master("local[4]").getOrCreate()

    // Labelled training messages (email, message, label).
    // The label is 0.0 for the first four rows and 1.0 for the last four;
    // presumably 1.0 marks spam.
    val training = sess.createDataFrame(Seq(
      ("[email protected]", "hope you are well", 0.0),
      ("[email protected]", "nice to hear from you", 0.0),
      ("[email protected]", "happy holidays", 0.0),
      ("[email protected]", "see you tomorrow", 0.0),
      ("[email protected]", "save loan money", 1.0),
      ("[email protected]", "save money", 1.0),
      ("[email protected]", "low interest rate", 1.0),
      ("[email protected]", "cheap loan", 1.0)
    )).toDF("email", "message", "label")

    // Tokenizer: reads the "message" column and writes the "words" column.
    val tokenizer = new Tokenizer().setInputCol("message").setOutputCol("words")
    // Hashing term frequency: turns "words" into a 1000-dimensional "features" vector.
    val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol("words").setOutputCol("features")

    // To inspect the intermediate columns, the two stages can be run by hand:
    //   val wordsDF = tokenizer.transform(training)
    //   val featurizedDF = hashingTF.transform(wordsDF)
    //   featurizedDF.show()

    // Logistic regression with at most 10 iterations and regularisation 0.01.
    val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
    // Chain the three stages into one pipeline.
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
    // Fit the whole pipeline, producing the model.
    val model = pipeline.fit(training)

    // Unlabelled test messages (email, message).
    val test = sess.createDataFrame(Seq(
      ("[email protected]", "ab how are you"),
      ("[email protected]", "ab hope doing well"),
      ("[email protected]", "ab want some money"),
      ("[email protected]", "ab secure loan"),
      ("[email protected]", "ab need loan")
    )).toDF("email", "message")

    // Score the TEST messages; scoring the training set again would say
    // nothing about how the model handles unseen input.
    val prediction = model.transform(test).select("email", "message", "prediction")
    // show() is the action that actually runs the pipeline and prints the result.
    prediction.show()
  }
}
2.4 使用交替最小二乘(ALS)算法实现推荐模型
import breeze.linalg.product
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Demo: collaborative filtering with the RDD-based ALS implementation.
 */
object RecommDemo {

  /**
   * Trains an ALS model on "user,item,rate" lines, prints the mean squared
   * error of the model on those same ratings, then saves and reloads the model.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("Recommend").setMaster("local[4]")
    val sc = new SparkContext(sparkConf)

    // Load the raw lines and parse each one into a Rating.
    val lines = sc.textFile("file:///F:\\test.data")
    val ratings = lines.map { line =>
      // A line that does not split into exactly three fields fails with a MatchError.
      val Array(user, item, rate) = line.split(",")
      Rating(user.toInt, item.toInt, rate.toDouble)
    }

    // Build the recommendation model with ALS.
    val rank = 10
    val numIterations = 10 // number of iterations
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    // (user, product) pairs of the rated data.
    val usersProducts = ratings.map(r => (r.user, r.product))

    // Model predictions keyed by (user, product).
    val predictions = model.predict(usersProducts).map(p => ((p.user, p.product), p.rating))

    // Actual ratings keyed the same way, joined with the predictions.
    val actuals = ratings.map(r => ((r.user, r.product), r.rating))
    val ratesAndPreds = actuals.join(predictions)

    // Mean of the squared differences between actual and predicted rating.
    val mse = ratesAndPreds.map { case (_, (actual, predicted)) =>
      val diff = actual - predicted
      diff * diff
    }.mean()
    println(s"Mean Squared Error=$mse")

    // Save the model, then load it back from the same path.
    val modelPath = "target/tmp/myCollaborativeFilter"
    model.save(sc, modelPath)
    val sameModel = MatrixFactorizationModel.load(sc, modelPath)
  }
}
Java版实现
package cn.ctgu.spark.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import scala.Tuple2;
public class JavaRecommendationExample {
public static void main(String[] args) {
SparkConf conf=new SparkConf().setAppName("Recommendation Example").setMaster("local[4]");
JavaSparkContext jsc=new JavaSparkContext(conf);
//load and parse the data
String path="data/mllib/als/test.data";
JavaRDDdata=jsc.textFile(path);
//训练数据集
JavaRDDratings=data.map(
new Function() {
public Rating call(String s) throws Exception {
String[] sarray=s.split(",");
return new Rating(Integer.parseInt(sarray[0]),Integer.parseInt(sarray[1]),
Double.parseDouble(sarray[2]));
}
}
);
//Build the recommendation model using ALS
int rank=10;
int numIterations=10;
MatrixFactorizationModel model= ALS.train(JavaRDD.toRDD(ratings),rank,numIterations,0.01);
//提取训练数据的(user,product)信息构成元组
JavaRDD>userProducts=ratings.map(
new Function>() {
public Tuple2
2.5 使用交替最小二乘(ALS)算法实现电影推荐
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.SparkSession
/**
 * Movie recommendation demo built on the DataFrame-based ALS estimator
 * (`org.apache.spark.ml.recommendation.ALS`).
 */
object MovieRecommDemo {

  /** One rating record: the four "::"-separated fields of an input line. */
  case class Rating0(userId: Int, movieId: Int, rating: Float, timestamp: Long)

  /**
   * Fits an ALS model on 99% of the ratings, prints the top 3 recommended
   * movies for every user, then prints the model's predictions for the
   * remaining 1% of the ratings.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("movieRecomm")
    conf.setMaster("local[4]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    // Parses one "userId::movieId::rating::timestamp" line.
    def parseRating(str: String): Rating0 = {
      val fields = str.split("::")
      assert(fields.size == 4)
      Rating0(fields(0).toInt, fields(1).toInt, fields(2).toFloat, fields(3).toLong)
    }

    // Read the text file and turn it into a DataFrame of Rating0 rows.
    val ratings = spark.sparkContext.textFile("file:///D:\\scala\\ml\\recomm\\sample_movielens_ratings.txt")
    val df = ratings.map(parseRating).toDF()

    // Random split into two DataFrames: the first is training, the second is test.
    val Array(training, test) = df.randomSplit(Array(0.99, 0.01))

    // Configure the ALS estimator.
    val als = new ALS().setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")

    // Fit the training data, producing the recommendation model.
    val model = als.fit(training)

    // Top 3 movies for every user.
    // The DataFrame-based ALSModel has no recommendProductsForUsers /
    // recommendProducts / recommendUsers; those belong to the RDD-based
    // mllib MatrixFactorizationModel. The ml equivalents are
    // recommendForAllUsers and recommendForAllItems, which need Spark 2.2 or later.
    val res = model.recommendForAllUsers(3)
    res.show(false)

    // Apply the model to the held-out test rows to obtain predictions.
    val predictions = model.transform(test)
    predictions.collect().foreach(println)
  }
}