:数据集:UCI Machine Learning Repository's Wine Quality data (red wine)
https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/
:数据集内容 The attributes are
1 - fixed acidity
2 - volatile acidity
3 - citric acid
4 - residual sugar
5 - chlorides
6 - free sulfur dioxide
7 - total sulfur dioxide
8 - density
9 - pH
10 - sulphates
11 - alcohol
Output variable (based on sensory data):
12 - quality (score between 0 and 10)
:maven项目中添加spark csv(https://github.com/databricks/spark-csv)依赖
<dependency>
  <groupId>com.databricks</groupId>
  <artifactId>spark-csv_2.10</artifactId>
  <version>1.4.0</version>
</dependency>
:spark shell启动命令添加spark csv依赖
bin/spark-shell --packages com.databricks:spark-csv_2.10:1.4.0
package com.bbw5.ml.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.regression.LinearRegressionModel
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SQLContext

/**
 * Linear regression on the UCI Wine Quality (red wine) dataset.
 *
 * Loads the semicolon-delimited CSV via spark-csv, standardizes the 11
 * physico-chemical features, then cross-validates a linear regression
 * (grid over regParam / elasticNetParam) to predict the quality score.
 * Prints training-summary metrics and held-out test metrics.
 *
 * author: baibaw5
 */
object LinearRegression4Wine {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("LinearRegression4Wine")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val filename = "I:/DM-dataset/wine/winequality-red.csv"

    // Load data. The file uses ';' as the delimiter and has a header row;
    // inferSchema gives numeric column types.
    val df = sqlContext.read.format("com.databricks.spark.csv").
      option("header", "true").
      option("inferSchema", "true").
      option("delimiter", ";").
      load(filename)

    // Summary stats for a few representative columns.
    df.describe("fixed acidity", "pH", "alcohol", "quality").show

    // Pearson correlation between two features.
    df.stat.corr("pH", "alcohol")

    // Inferred column types may be Int or Double depending on the data;
    // normalize any boxed numeric to Double. Pattern match replaces the
    // original isInstanceOf/asInstanceOf chain and also covers Long etc.
    def convert(value: Any): Double = value match {
      case d: Double => d
      case n: Number => n.doubleValue
    }

    // Columns 0-10 are features, column 11 is the quality label.
    val rdd = df.map { row =>
      (row.toSeq.take(11).map(convert), row.getInt(11))
    }.map(a => Vectors.dense(a._1.toArray) -> a._2)
    rdd.toDF("features", "label").show

    // Scale the points to zero mean / unit variance.
    // (LinearRegression.setStandardization already does this internally;
    // kept here to demonstrate mllib's StandardScaler.)
    val scaler = new StandardScaler(withMean = true, withStd = true).
      fit(rdd.map(dp => dp._1))
    val data = rdd.map(dp => scaler.transform(dp._1) -> dp._2.toDouble).
      toDF("features", "label")
    data.show

    // 80% of the data for training, the remaining 20% for testing.
    data.cache()
    val Array(training, testing) = data.randomSplit(Array(0.8, 0.2), 1234L)

    val lr = new LinearRegression().setSolver("l-bfgs")
    val paramGrid = new ParamGridBuilder().
      addGrid(lr.regParam, Array(0.0001, 0.01, 1.0)).
      addGrid(lr.maxIter, Array(100)).
      addGrid(lr.elasticNetParam, Array(0.1, 0.5, 1.0)).
      build()

    // 4-fold cross-validation over the parameter grid; default evaluator
    // metric is RMSE.
    val cv = new CrossValidator().
      setEstimator(lr).
      setEvaluator(new RegressionEvaluator).
      setEstimatorParamMaps(paramGrid).
      setNumFolds(4)

    // Run cross-validation and choose the best set of parameters.
    val model = cv.fit(training)

    // Persist the fitted cross-validation model.
    model.save("D:/Develop/Model/WINE-LIR-" + System.currentTimeMillis())

    // The estimator is a LinearRegression, so bestModel is a
    // LinearRegressionModel.
    val bestModel = model.bestModel.asInstanceOf[LinearRegressionModel]
    println("bestModel.params:" + bestModel.extractParamMap)

    // Training metrics.
    val trainingSummary = bestModel.summary
    println(s"Coefficients: ${bestModel.coefficients} Intercept: ${bestModel.intercept}")
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"train rmse: ${trainingSummary.rootMeanSquaredError}")
    println(s"train r2: ${trainingSummary.r2}")

    // Held-out test metrics; reuse one evaluator and switch the metric
    // instead of constructing four instances.
    val testDF = model.transform(testing)
    val evaluator = new RegressionEvaluator()
    println("test rmse:" + evaluator.setMetricName("rmse").evaluate(testDF))
    println("test mse:" + evaluator.setMetricName("mse").evaluate(testDF))
    println("test r2:" + evaluator.setMetricName("r2").evaluate(testDF))
    println("test mae:" + evaluator.setMetricName("mae").evaluate(testDF))

    // Release cluster resources (was missing in the original).
    sc.stop()
  }
}
:参考资料
《Mastering Machine Learning with scikit-learn》
《Scala_Data_Analysis_Cookbook》