trait BaseModel {
//0. Load the configuration
val config: Config = ConfigFactory.load()
val url: String = config.getString("jdbc.url")
val tableName: String = config.getString("jdbc.table")
val sourceClass: String = config.getString("hbase.source.class")
val zkHosts: String = config.getString("hbase.source.zkHosts")
val zkPort: String = config.getString("hbase.source.zkPort")
val hbaseTable: String = config.getString("hbase.source.hbaseTable")
val family: String = config.getString("hbase.source.family")
val selectFields: String = config.getString("hbase.source.selectFields")
val rowKey: String = config.getString("hbase.source.rowKey")
val hbaseMeta = HBaseMeta(
"",
zkHosts,
zkPort,
hbaseTable,
family,
selectFields,
rowKey
)
//0. Create the SparkSession
val spark = SparkSession.builder()
.appName("model")
.master("local[*]")
.config("spark.hadoop.validateOutputSpecs", "false")
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
import spark.implicits._
//Of the seven steps below, only step 5 is special, because the computation differs from tag to tag,
//so step 5 is made an abstract method implemented by subclasses, while the other steps are all encapsulated in this base model
/**
* execute wraps the overall flow of model computation; a subclass of BaseModel only needs to call execute() in its main method and implement the abstract methods
*/
def execute(): Unit = {
//1. Load the MySQL data
val mysqlDF: DataFrame = getMySQLDF()
//2. Get the level-4 tag (the rule for connecting to HBase)
val fourRule: Map[String, String] = getFourRule(mysqlDF)
//3. Get the level-5 tags
val fiveRule: DataFrame = getFiveRule(mysqlDF)
//4. Load the HBase data
val HBaseDF: DataFrame = getHBaseDF(fourRule)
//5. Compute (matching / statistics / algorithmic mining ...)
val computeResult: DataFrame = compute(fiveRule, HBaseDF)
//6. Merge
val result: DataFrame = merge(computeResult)
//7. Save
save(result)
}
/**
* Load the data from MySQL
* @return
*/
def getMySQLDF(): DataFrame = {
spark.read.jdbc(url,tableName,new Properties())
}
/**
* Abstract method implemented by subclasses to return the tag id
* @return
*/
def getTagID(): Int
/**
* Get the level-4 tag from MySQL
* @param mysqlDF
* @return
*/
def getFourRule(mysqlDF: DataFrame): Map[String, String] = {
//Query the level-4 rule by id; the id is only known once a subclass exists, so it is exposed through the abstract getTagID() method
val fourRuleDS: Dataset[Row] = mysqlDF.select('rule).where('id === getTagID())
//inType=HBase##zkHosts=192.168.10.20##zkPort=2181##hbaseTable=tbl_users##family=detail##selectFields=id,job
val fourRuleMap: Map[String, String] = fourRuleDS.rdd.map(row => {
val kvStrs: Array[String] = row.getAs[String]("rule").split("##")
val kvTuple: Array[(String, String)] = kvStrs.map(kvStr => {
val kv: Array[String] = kvStr.split("=")
(kv(0), kv(1))
})
kvTuple.toMap
}).collect()(0)
fourRuleMap
}
/**
* Get the level-5 tags from MySQL
* @param mysqlDF
* @return
*/
def getFiveRule(mysqlDF: DataFrame): DataFrame = {
mysqlDF.select('id,'rule).where('pid === getTagID())
}
/**
* Load data from HBase according to the level-4 tag rule
* @param fourRule inType=HBase##zkHosts=192.168.10.20##zkPort=2181##hbaseTable=tbl_users##family=detail##selectFields=id,job
* @return
*/
def getHBaseDF(fourRule: Map[String, String]): DataFrame = {
spark.read.format("cn.itcast.up.tools.HBaseSource").options(fourRule).load()
}
/**
* The concrete tag-computation logic, to be implemented by subclasses
* @param fiveRule
* @param HBaseDF
* @return
*/
def compute(fiveRule: DataFrame, HBaseDF: DataFrame): DataFrame
/**
* Merge the result of this computation with the previous result stored in HBase
* @param newDF
* @return
*/
def merge(newDF: DataFrame): DataFrame = {
//1. Load the previous result from HBase
val oldDF: DataFrame = spark.read.format("cn.itcast.up.tools.HBaseSource")
.option(HBaseMeta.ZKHOSTS, hbaseMeta.zkHosts)
.option(HBaseMeta.ZKPORT, hbaseMeta.zkPort)
.option(HBaseMeta.HBASETABLE, hbaseMeta.hbaseTable)
.option(HBaseMeta.FAMILY, hbaseMeta.family)
.option(HBaseMeta.SELECTFIELDS, hbaseMeta.selectFields)
.option(HBaseMeta.ROWKEY, hbaseMeta.rowKey)
.load()
//2. Merge oldDF and newDF
oldDF.createOrReplaceTempView("t_old")
newDF.createOrReplaceTempView("t_new")
spark.udf.register("mergeTagId",(newTagsId:String,oldTagsId:String)=>{
if(StringUtils.isBlank(newTagsId)){
oldTagsId
}else if(StringUtils.isBlank(oldTagsId)){
newTagsId
}else{
(newTagsId.split(",") ++ oldTagsId.split(",")).toSet.mkString(",")
}
})
//new 22: 10      old 22: 23,30
//result: 22: 10,23,30
//new 22: 10      old (none)
//result: 22: 10
//new (none)      old 22: 23,30
//result: 22: 23,30
val sql: String =
"""
|select n.userId userId, mergeTagId(n.tagIds, o.tagIds) as tagIds from t_new n
|left join t_old o
|on
|n.userId = o.userId
|""".stripMargin
val result: DataFrame = spark.sql(sql)
result
}
/**
* Save the merged tag result to HBase
* @param result
*/
def save(result: DataFrame) = {
result.show(10)
result.write.format("cn.itcast.up.tools.HBaseSource")
.option(HBaseMeta.ZKHOSTS, hbaseMeta.zkHosts)
.option(HBaseMeta.ZKPORT, hbaseMeta.zkPort)
.option(HBaseMeta.HBASETABLE, hbaseMeta.hbaseTable)
.option(HBaseMeta.FAMILY, hbaseMeta.family)
.option(HBaseMeta.SELECTFIELDS, hbaseMeta.selectFields)
.option(HBaseMeta.ROWKEY, hbaseMeta.rowKey)
.save()
}
/**
* Zip the cluster-center indices with the level-5 rules, then replace the cluster id in the clustering result with the tag from the zipped result
* @param fiveRule the level-5 rules
* @param model the model (it contains all the cluster-center information)
* @param clusterDF the clustering result
* @return newDF(userId,tagIds)
*/
def zipResult(fiveRule: DataFrame, model: KMeansModel, clusterDF: DataFrame): DataFrame = {
import spark.implicits._
//6. As the output above shows, the cluster ids assigned by the clustering do not match the order of the level-5 rules,
//so the cluster centers must be sorted by their psm value, giving a collection of (cluster id, psm value) pairs
//model.clusterCenters returns an Array[Vector]
//model.clusterCenters.indices returns the collection of all cluster-center indices
//model.clusterCenters(i) returns the Vector at index i of the Array[Vector]
//model.clusterCenters(i).toArray.sum takes the cluster center at index i (a Vector), converts it to an array and sums it
//IndexedSeq[(cluster id, psm value)]
val indexAndPSM: immutable.IndexedSeq[(Int, Double)] = model.clusterCenters.indices.map(i => (i, model.clusterCenters(i).toArray.sum))
val sortedIndexAndPSM: immutable.IndexedSeq[(Int, Double)] = indexAndPSM.sortBy(_._2).reverse
sortedIndexAndPSM.foreach(println)
/*
(1,0.5563226557645843)
(2,0.31754213552513205)
(4,0.21281283437093323)
(0,0.1320103555777084)
(3,0.08401071578741981)
*/
/*
+---+----+
|id |rule|
+---+----+
|51 |1 |
|52 |2 |
|53 |3 |
|54 |4 |
|55 |5 |
+---+----+
Desired result:
(1,51)
(2,52)
(4,53)
(0,54)
(3,55)
*/
//7. Zip sortedIndexAndPSM with fiveRule
//List[(tagId, rule)]
val ruleList: List[(String, String)] = fiveRule.as[(String, String)].collect().toList.sortBy(_._2)
//[((predict, psm), (tagId, rule))]
val tempSeq: immutable.IndexedSeq[((Int, Double), (String, String))] = sortedIndexAndPSM.zip(ruleList)
//[(predict, tagId)]
val predictAndTagId: immutable.IndexedSeq[(Int, String)] = tempSeq.map(t => (t._1._1, t._2._1))
val predictAndTagIdMap: Map[Int, String] = predictAndTagId.toMap
predictAndTagIdMap.foreach(println)
/*
(1,51)
(2,52)
(4,53)
(0,54)
(3,55)
*/
//8. Replace predict in clusterDF with the tagId
//spark.udf.register("functionName", () => {...})
val predict2TagId = functions.udf((predict: Int) => {
predictAndTagIdMap(predict)
})
val newDF: DataFrame = clusterDF.select('userId, predict2TagId('predict).as("tagIds"))
newDF.show(10, false)
newDF
}
}
1. Query the tag data from MySQL
2. Compute the level-5 tag data
3. Query the data in HBase
4. Build the RFM/RFE/PSM model data
Use Spark SQL functions; remember the required import:
import org.apache.spark.sql.functions._
//group by memberId
Days since the user's most recent purchase:
functions.datediff(date_sub(current_timestamp(),90), from_unixtime(max('finishTime))) as recencyStr
The user's recent purchase frequency (the time window is usually specified by operations or the product manager)
The user's total purchase amount
5. Normalize the data (a scoring scheme is used here)
6. Apply feature engineering to the normalized data (vectorization is used here)
7. Choose K (analyze the SSE values and settle on a value together with operations and the algorithm engineers)
8. Train a K-means model
9. Predict (the order of the predicted cluster ids does not match the defined rules, so the predictions must be sorted)
10. Match the sorted result against the rules by zipping them, pick out the needed values and save them to HBase.
How to determine K
1. Elbow method: compute the SSE for each candidate K and settle on the value together with operations.
Its main advantage is that it is simple to compute.
Code for computing the SSE over candidate K values:
//Candidate values of K
println("Start selecting the value of k")
val ks: List[Int] = List(3,4,5,6,7,8,9,10,11)
//A map to hold the SSE value for each k
//val means the reference cannot be reassigned
//mutable means the collection itself is mutable, i.e. its elements can change
val map = mutable.Map[Int,Double]()
//Loop over ks, compute the SSE for each k and put it into the map
for(k <- ks){
val kmodel : KMeansModel = new KMeans()
.setFeaturesCol(featureStr)
.setPredictionCol(predictStr)
.setK(k) //the current candidate K; in practice the algorithm engineers derive K from experiments and combine it with the actual business requirements
.setMaxIter(10)
.setSeed(10) //random seed, so repeated test runs produce the same result
.fit(VectorDF)
//compute the SSE for this k
val SSE: Double = kmodel.computeCost(VectorDF)
map.put(k,SSE)
}
println("SSE for each value of k:")
map.foreach(println)
/*
(8,13.365270018618354)
(2,218.8742481449105)
(5,22.774427527778087)
(4,49.90323450135087)
(7,16.11569736904577)
(3,61.42354755494108)
(6,20.87674069305692)
*/
2. Silhouette coefficient method: it is much more expensive to compute because it needs pairwise distances; with just 100 points that is already on the order of 10,000 distance computations.
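A minimal sketch of how the silhouette score could be computed with Spark ML's built-in ClusteringEvaluator (available since Spark 2.3) instead of a hand-rolled pairwise computation; it reuses ks, featureStr, predictStr and VectorDF from the SSE example above and is only an illustration, not part of the original project code.
import org.apache.spark.ml.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.evaluation.ClusteringEvaluator
//Silhouette is the default metric of ClusteringEvaluator; the k with the largest silhouette is preferred
val silhouetteMap = mutable.Map[Int, Double]()
for (k <- ks) {
val kmodel: KMeansModel = new KMeans()
.setFeaturesCol(featureStr)
.setPredictionCol(predictStr)
.setK(k)
.setMaxIter(10)
.setSeed(10)
.fit(VectorDF)
val evaluator = new ClusteringEvaluator()
.setFeaturesCol(featureStr)
.setPredictionCol(predictStr)
.setMetricName("silhouette")
silhouetteMap.put(k, evaluator.evaluate(kmodel.transform(VectorDF)))
}
silhouetteMap.foreach(println)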
User value model code - RFM
//1. Aggregate by member id to get each customer's R, F and M
//Customer value model - RFM:
//Recency: most recent purchase, time since the last order
//Frequency: purchase frequency, total number of orders
//Monetary: purchase amount, total order amount
//https://blog.csdn.net/liam08/article/details/79663018
val recencyAggColumn: Column = functions.datediff(date_sub(current_timestamp(),60), from_unixtime(max('finishTime))) as recencyStr
val frequencyAggColumn: Column = functions.count('orderSn) as frequencyStr
val monetaryAggColumn: Column = functions.sum('orderAmount) as monetaryStr
val RFMResult = hbaseDF
.groupBy('memberId)
.agg(recencyAggColumn, frequencyAggColumn, monetaryAggColumn)
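A minimal sketch of the score-based normalization and vectorization mentioned in steps 5 and 6 above, applied to RFMResult. The score boundaries are illustrative assumptions, not values from the original project, and the column-name constants recencyStr, frequencyStr, monetaryStr and featureStr are assumed to be defined as in the RFE section below.
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.functions._
//Score each metric on a 1~5 scale (boundaries are illustrative): recent buyers score high on R, heavy buyers score high on F and M
val recencyScore: Column = when(col(recencyStr) <= 7, 5.0)
.when(col(recencyStr) <= 30, 4.0)
.when(col(recencyStr) <= 90, 3.0)
.when(col(recencyStr) <= 180, 2.0)
.otherwise(1.0)
.as(recencyStr)
val frequencyScore: Column = when(col(frequencyStr) >= 200, 5.0)
.when(col(frequencyStr) >= 100, 4.0)
.when(col(frequencyStr) >= 50, 3.0)
.when(col(frequencyStr) >= 10, 2.0)
.otherwise(1.0)
.as(frequencyStr)
val monetaryScore: Column = when(col(monetaryStr) >= 100000, 5.0)
.when(col(monetaryStr) >= 50000, 4.0)
.when(col(monetaryStr) >= 10000, 3.0)
.when(col(monetaryStr) >= 1000, 2.0)
.otherwise(1.0)
.as(monetaryStr)
val scoreDF: DataFrame = RFMResult.select(col("memberId"), recencyScore, frequencyScore, monetaryScore)
//Assemble the three score columns into one feature vector; VectorDF is then fed to KMeans (and to the SSE loop above)
val VectorDF: DataFrame = new VectorAssembler()
.setInputCols(Array(recencyStr, frequencyStr, monetaryStr))
.setOutputCol(featureStr)
.transform(scoreDF)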
User engagement model code - RFE
//0. Define constant strings to avoid spelling mistakes later
val recencyStr = "recency"
val frequencyStr = "frequency"
val engagementsStr = "engagements"
val featureStr = "feature"
val scaleFeatureStr = "scaleFeature"
val predictStr = "predict"
//1. Aggregate by user id to get user engagement - RFE
//Recency: most recent visit, time since the user's last visit, current time - log_time
//Frequency: visit frequency, total number of pages the user visited in the period, count(loc_url)
//Engagements: page engagement, number of distinct pages the user visited in the period; it can also be defined as page views, downloads, video plays, etc., distinct count(loc_url)
val recencyAggColumn: Column = datediff(date_sub(current_timestamp(),60), max('log_time)) as recencyStr
val frequencyAggColumn: Column = count('loc_url) as frequencyStr
val engagementsAggColumn: Column = countDistinct('loc_url) as engagementsStr
val RFEResult = hbaseDF.groupBy('global_user_id)
.agg(recencyAggColumn, frequencyAggColumn, engagementsAggColumn)
Price sensitivity model - PSM
Meaning / composition of the model
PSM (Price Sensitivity Measurement)
//Price sensitivity model
//ra: receivableAmount, amount receivable
//da: discountAmount, discount amount
//pa: practicalAmount, amount actually paid
//tdon: total discount order num, number of discounted orders
//ton: total order num, total number of orders
//ada: average discountAmount, average discount amount
//ara: average receivable amount per order
//tda: total discount amount
//tra: total receivable amount
//tdonr: discounted-order ratio (number of discounted orders / total number of orders)
//adar: average-discount ratio (average discount amount / average receivable amount per order)
//tdar: total-discount ratio (total discount amount / total receivable amount)
//psm = discounted-order ratio + average-discount ratio + total-discount ratio
//psmScore = tdonr + adar + tdar
//>=1      extremely price sensitive
//0.4~1    fairly sensitive
//0.1~0.3  mildly sensitive
//0        not very sensitive
//<0       extremely insensitive
//0. String constants
val psmScoreStr: String = "psm"
val featureStr: String = "feature"
val predictStr: String = "predict"
//1. Compute the metrics the PSM score needs
//Price sensitivity model
//ra: receivableAmount, amount receivable, e.g. 100
//da: discountAmount, discount amount, e.g. 20
//pa: practicalAmount, amount actually paid, e.g. 80
val raColumn = 'orderAmount + 'couponCodeValue as "ra"
val daColumn = 'couponCodeValue as "da"
val paColumn = 'orderAmount as "pa"
//only orders whose discount amount is non-zero are discounted orders, marked as 1
val stateColumn= functions
.when('couponCodeValue =!= 0.0d,1)
.when('couponCodeValue === 0.0d,0)
.as("state")
val tempDF: DataFrame = HBaseDF.select('memberId as "userId",raColumn,daColumn,paColumn,stateColumn)
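A minimal sketch of how the per-user aggregation and the psm score could be derived from tempDF, following psmScore = tdonr + adar + tdar defined above; the intermediate column names (tdon, ton, tda, tra) are just the abbreviations from the comments, and spark.implicits._ and org.apache.spark.sql.functions._ are assumed to be imported.
//2. Aggregate the per-order metrics per user, then combine them into the psm score
val psmDF: DataFrame = tempDF
.groupBy('userId)
.agg(
sum('state).as("tdon"), //number of discounted orders
count('state).as("ton"), //total number of orders
sum('da).as("tda"), //total discount amount
sum('ra).as("tra") //total receivable amount
)
.select(
'userId,
//psmScore = tdonr + adar + tdar = tdon/ton + (tda/tdon)/(tra/ton) + tda/tra
//(users without any discounted order get null here because of the division by tdon; the real project may handle that case separately)
(('tdon / 'ton) + ('tda / 'tdon) / ('tra / 'ton) + ('tda / 'tra)).as(psmScoreStr)
)
//psmDF(userId, psm) can then be vectorized with a VectorAssembler and clustered with KMeans, like the RFM/RFE models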
Decision tree code
//1. Read the data
val source: DataFrame = spark.read
.csv("file:///D:\\data\\spark\\ml\\iris_tree.csv")
.toDF("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width", "Species")
.select(
'Sepal_Length cast DoubleType,
'Sepal_Width cast DoubleType,
'Petal_Length cast DoubleType,
'Petal_Width cast DoubleType,
'Species)
source.show(false)
/*
+------------+-----------+------------+-----------+-----------+
|Sepal_Length|Sepal_Width|Petal_Length|Petal_Width|Species |
+------------+-----------+------------+-----------+-----------+
|5.1 |3.5 |1.4 |0.2 |Iris-setosa|
|4.9 |3.0 |1.4 |0.2 |Iris-setosa|
|4.7 |3.2 |1.3 |0.2 |Iris-setosa|
|4.6 |3.1 |1.5 |0.2 |Iris-setosa|
|5.0 |3.6 |1.4 |0.2 |Iris-setosa|
|5.4 |3.9 |1.7 |0.4 |Iris-setosa|
|4.6 |3.4 |1.4 |0.3 |Iris-setosa|
|5.0 |3.4 |1.5 |0.2 |Iris-setosa|
|4.4 |2.9 |1.4 |0.2 |Iris-setosa|
|4.9 |3.1 |1.5 |0.1 |Iris-setosa|
|5.4 |3.7 |1.5 |0.2 |Iris-setosa|
|4.8 |3.4 |1.6 |0.2 |Iris-setosa|
|4.8 |3.0 |1.4 |0.1 |Iris-setosa|
|4.3 |3.0 |1.1 |0.1 |Iris-setosa|
|5.8 |4.0 |1.2 |0.2 |Iris-setosa|
|5.7 |4.4 |1.5 |0.4 |Iris-setosa|
|5.4 |3.9 |1.3 |0.4 |Iris-setosa|
|5.1 |3.5 |1.4 |0.3 |Iris-setosa|
|5.7 |3.8 |1.7 |0.3 |Iris-setosa|
|5.1 |3.8 |1.5 |0.3 |Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 20 rows
*/
//2. Feature engineering (2.1 index the label column into numbers)
val stringIndexer: StringIndexer = new StringIndexer()
.setInputCol("Species") //the string label
.setOutputCol("Species_Indexer")//the indexed label column
//2. Feature engineering (2.2 assemble the features into a vector)
val vectorAssembler: VectorAssembler = new VectorAssembler()
.setInputCols(Array("Sepal_Length", "Sepal_Width", "Petal_Length", "Petal_Width"))
.setOutputCol("features")//the vectorized feature column
//3. Build the decision tree
val decisionTreeClassifier: DecisionTreeClassifier = new DecisionTreeClassifier()
.setLabelCol("Species_Indexer")//the indexed label column
.setFeaturesCol("features")//the vectorized feature column
.setPredictionCol("predict")//the prediction column
.setImpurity("gini")
.setMaxDepth(5)
//Convert the predicted index back to the label string (only needed for easier inspection of the result; optional, and it should be applied to the prediction column)
val indexToString: IndexToString = new IndexToString()
.setLabels(Array("Iris-versicolor","Iris-setosa","Iris-virginica"))
.setInputCol("predict")//the numeric prediction to convert back
.setOutputCol("predict_String")//the restored string label column
//Split the data set
val Array(trainSet,testSet) = source.randomSplit(Array(0.8,0.2),10)
//4. Build the Pipeline and train the model
val pModel: PipelineModel = new Pipeline()
.setStages(Array(stringIndexer, vectorAssembler, decisionTreeClassifier, indexToString))
.fit(trainSet)
//5. Predict
val result: DataFrame = pModel.transform(testSet)
result.show(false)
//6. Inspect the result (the decision process of the tree)
val tmodel: DecisionTreeClassificationModel = pModel.stages(2).asInstanceOf[DecisionTreeClassificationModel]
println(tmodel.toDebugString)//print the decision process of the tree
//7. Evaluate the model
val evaluator: MulticlassClassificationEvaluator = new MulticlassClassificationEvaluator()//create the evaluator
.setLabelCol("Species_Indexer")//the indexed label column (the true labels in the data)
.setPredictionCol("predict")//compared against the predicted class
.setMetricName("accuracy")//accuracy
val acc: Double = evaluator.evaluate(result)//evaluate computes the metric
println(s"ERROR Ratio : ${1 - acc}")//error rate
User shopping gender tag code - USG (User Shopping Gender)
object USGModel extends BaseModel{
def main(args: Array[String]): Unit = {
execute()
}
/**
* Abstract method implemented by subclasses to return the tag id
* @return
*/
override def getTagID(): Int = 56
/**
* The concrete tag-computation logic, implemented by the subclass
* @param fiveRule
* @param HBaseDF
* @return
*/
override def compute(fiveRule: DataFrame, HBaseDF: DataFrame): DataFrame = {
//fiveRule.show(10,false)
//fiveRule.printSchema()
//What the level-4 tag actually pulls from HBase is the goods (order item) table
val goodsDF = HBaseDF
//goodsDF.show(10,false)
//goodsDF.printSchema()
//0. Additionally query the orders table
val ordersDF: DataFrame = spark.read
.format("cn.itcast.up.tools.HBaseSource")
.option(HBaseMeta.ZKHOSTS, "bd001")
.option(HBaseMeta.ZKPORT, "2181")
.option(HBaseMeta.HBASETABLE, "tbl_orders")
.option(HBaseMeta.FAMILY, "detail")
.option(HBaseMeta.SELECTFIELDS, "memberId,orderSn")
.load()
//ordersDF.show(10)
/*
+---+----+
|id |rule|
+---+----+
|57 |0 |
|58 |1 |
|59 |-1 |
+---+----+
root
|-- id: long (nullable = false)
|-- rule: string (nullable = true)
//Goods (order item) table
+----------------------+---------+-----------+
|cOrderSn |ogColor |productType|
+----------------------+---------+-----------+
|jd_14091818005983607 |白色 |烤箱 |
|jd_14091818005983607 |香槟金 |冰吧 |
|jd_14092012560709235 |香槟金色 |净水机 |
|rrs_15234137 |梦境极光【布朗灰】|烤箱 |
|suning_790750687478116|梦境极光【卡其金】|4K电视|
|rsq_805093707860210 |黑色 |烟灶套系 |
|jd_14090910361908941 |黑色 |智能电视 |
|jd_14091823464864679 |香槟金色 |燃气灶 |
|jd_14091817311906413 |银色 |滤芯 |
|suning_804226647488814|玫瑰金 |电饭煲 |
+----------------------+---------+-----------+
only showing top 10 rows
root
|-- cOrderSn: string (nullable = true)
|-- ogColor: string (nullable = true)
|-- productType: string (nullable = true)
//Orders table
+---------+-------------------+
| memberId| orderSn|
+---------+-------------------+
| 13823431| ts_792756751164275|
| 4035167| D14090106121770839|
| 4035291| D14090112394810659|
| 4035041| fx_787749561729045|
| 13823285| D14092120154435903|
| 4034219| D14092120155620305|
|138230939|top_810791455519102|
| 4035083| D14092120161884409|
|138230935| D14092120162313538|
| 13823231| D14092120162378713|
+---------+-------------------+
only showing top 10 rows
What we ultimately need:
+---------+----------------------+---------+-----------+
| memberId|cOrderSn |ogColor |productType|
+---------+----------------------+---------+-----------+
| 13823431|jd_14091818005983607 |白色 |烤箱 |
| 13823431|jd_14091818005983607 |香槟金 |冰吧 |
| 4035291|jd_14092012560709235 |香槟金色 |净水机 |
| 4035041|rrs_15234137 |梦境极光【布朗灰】|烤箱 |
| 13823285|suning_790750687478116|梦境极光【卡其金】|4K电视 |
| 4034219|rsq_805093707860210 |黑色 |烟灶套系 |
|138230939|jd_14090910361908941 |黑色 |智能电视 |
| 4035083|jd_14091823464864679 |香槟金色 |燃气灶 |
|138230935|jd_14091817311906413 |银色 |滤芯 |
| 13823231|suning_804226647488814|玫瑰金 |电饭煲 |
+---------+----------------------+---------+-----------+
*/
//0. Import implicit conversions
import org.apache.spark.sql.functions._
import spark.implicits._
//1. Feature selection
//For simplicity only the product color and the product type are used as features, and a decision tree models/predicts the user's shopping gender
//In practice there may be tens or even hundreds of features (color/type/size/brand/origin/price...); the computation then becomes large and PCA (principal component analysis) may be needed for dimensionality reduction
//The color id should come from a dictionary table; here it is simply assigned with case...when
//Note that this is not a proper numeric encoding of the feature (a real feature encoding should start from 0)
val color: Column = functions
.when('ogColor.equalTo("银色"), 1)
.when('ogColor.equalTo("香槟金色"), 2)
.when('ogColor.equalTo("黑色"), 3)
.when('ogColor.equalTo("白色"), 4)
.when('ogColor.equalTo("梦境极光【卡其金】"), 5)
.when('ogColor.equalTo("梦境极光【布朗灰】"), 6)
.when('ogColor.equalTo("粉色"), 7)
.when('ogColor.equalTo("金属灰"), 8)
.when('ogColor.equalTo("金色"), 9)
.when('ogColor.equalTo("乐享金"), 10)
.when('ogColor.equalTo("布鲁钢"), 11)
.when('ogColor.equalTo("月光银"), 12)
.when('ogColor.equalTo("时尚光谱【浅金棕】"), 13)
.when('ogColor.equalTo("香槟色"), 14)
.when('ogColor.equalTo("香槟金"), 15)
.when('ogColor.equalTo("灰色"), 16)
.when('ogColor.equalTo("樱花粉"), 17)
.when('ogColor.equalTo("蓝色"), 18)
.when('ogColor.equalTo("金属银"), 19)
.when('ogColor.equalTo("玫瑰金"), 20)
.otherwise(0)
.alias("color")
//The product-type id should also come from a dictionary table; simplified here
val productType: Column = functions
.when('productType.equalTo("4K电视"), 9)
.when('productType.equalTo("Haier/海尔冰箱"), 10)
.when('productType.equalTo("Haier/海尔冰箱"), 11)
.when('productType.equalTo("LED电视"), 12)
.when('productType.equalTo("Leader/统帅冰箱"), 13)
.when('productType.equalTo("冰吧"), 14)
.when('productType.equalTo("冷柜"), 15)
.when('productType.equalTo("净水机"), 16)
.when('productType.equalTo("前置过滤器"), 17)
.when('productType.equalTo("取暖电器"), 18)
.when('productType.equalTo("吸尘器/除螨仪"), 19)
.when('productType.equalTo("嵌入式厨电"), 20)
.when('productType.equalTo("微波炉"), 21)
.when('productType.equalTo("挂烫机"), 22)
.when('productType.equalTo("料理机"), 23)
.when('productType.equalTo("智能电视"), 24)
.when('productType.equalTo("波轮洗衣机"), 25)
.when('productType.equalTo("滤芯"), 26)
.when('productType.equalTo("烟灶套系"), 27)
.when('productType.equalTo("烤箱"), 28)
.when('productType.equalTo("燃气灶"), 29)
.when('productType.equalTo("燃气热水器"), 30)
.when('productType.equalTo("电水壶/热水瓶"), 31)
.when('productType.equalTo("电热水器"), 32)
.when('productType.equalTo("电磁炉"), 33)
.when('productType.equalTo("电风扇"), 34)
.when('productType.equalTo("电饭煲"), 35)
.when('productType.equalTo("破壁机"), 36)
.when('productType.equalTo("空气净化器"), 37)
.otherwise(0)
.alias("productType")
//2. Data labeling - the labeling here is fairly crude
//The data is labeled using statistics provided by the operations team
//From purchase records and survey information, operations derive a statistical rule relating shopping gender to the purchased goods
//That rule fits the existing data, but future data should not be judged by the rule alone:
//instead an algorithm should combine the rule with the data to mine more decision criteria and produce a classification model, so that
//new data can be predicted with the model rather than by applying the statistical rule directly
//The point of training is to learn the pattern from the labeled data so that any new record can then be predicted
val label: Column = functions
.when('ogColor.equalTo("樱花粉")
.or('ogColor.equalTo("白色"))
.or('ogColor.equalTo("香槟色"))
.or('ogColor.equalTo("香槟金"))
.or('productType.equalTo("料理机"))
.or('productType.equalTo("挂烫机"))
.or('productType.equalTo("吸尘器/除螨仪")), 1) //female
.otherwise(0)//male
.alias("gender")//the label the decision tree will predict
//
val sourceDF: DataFrame = goodsDF.join(ordersDF, 'cOrderSn === 'orderSn)
.select('memberId as "userId", color, productType, label)
sourceDF.show(10,false)
/*
This can be seen as the preprocessed source data set; the columns mean:
user id, product color, product type, shopping gender male/female
The data set contains feature columns and a label column, so the supervised decision-tree algorithm can be trained on it.
Once trained, the model can predict whether the shopping gender of a new record is male or female.
+---------+-----+-----------+------+
|userId |color|productType|gender|
+---------+-----+-----------+------+
|13823535 |16 |0 |0 |
|13823535 |1 |24 |0 |
|13823535 |7 |30 |0 |
|13823391 |10 |14 |0 |
|4034493 |9 |12 |0 |
|13823683 |8 |17 |0 |
|62 |9 |15 |0 |
|4035201 |8 |12 |0 |
|13823449 |10 |0 |0 |
|138230919|12 |15 |0 |
+---------+-----+-----------+------+
*/
//3. Index (encode) the label column
val labelIndexer: StringIndexerModel = new StringIndexer()
.setInputCol("gender")
.setOutputCol("label")
.fit(sourceDF)
//4. Assemble the features into a vector
val featureVectorAssembler: VectorAssembler = new VectorAssembler()
.setInputCols(Array("color", "productType"))
.setOutputCol("features")
//5. Index (encode) the features
//Index the features: a feature with more than 3 distinct values is treated as continuous
//VectorIndexer numbers the categorical (discrete) features inside the feature vectors of a data set.
//It automatically detects which features are categorical and re-encodes them; concretely, a maxCategories is set and
//a feature with fewer distinct values than maxCategories is re-encoded as 0~K (K<=maxCategories-1),
//while a feature with more distinct values than maxCategories is treated as continuous and left unchanged.
//Main purpose: improve the classification quality of ML methods such as decision trees and random forests.
val featureVectorIndexer: VectorIndexer = new VectorIndexer()
.setInputCol("features")
.setOutputCol("featureIndexed")
.setMaxCategories(3)
//6. Build the decision tree
val decisionTreeClassifier: DecisionTreeClassifier = new DecisionTreeClassifier()
.setFeaturesCol("featureIndexed")
.setPredictionCol("predict")
.setImpurity("gini") //Gini impurity
.setMaxDepth(5) //maximum depth of the tree
.setMaxBins(5)//maximum number of bins used to discretize continuous features
//7. Convert the indexed label back to the original value
val labelConverter: IndexToString = new IndexToString()
.setInputCol("label")
.setOutputCol("labelConverted")
.setLabels(labelIndexer.labels)
//8. Split the data set
val Array(trainData,testData) = sourceDF.randomSplit(Array(0.8,0.2))
//9. Build the Pipeline
val pipeline: Pipeline = new Pipeline()
.setStages(Array(labelIndexer, featureVectorAssembler, featureVectorIndexer, decisionTreeClassifier, labelConverter))
val pmodel: PipelineModel = pipeline.fit(trainData)
//10. Predict
val trainResult: DataFrame = pmodel.transform(trainData)
val testResult: DataFrame = pmodel.transform(testData)
//11. Evaluate the model
this.evaluateAUC(trainResult,testResult)
//12. Inspect the decision process
val tmodel: DecisionTreeClassificationModel = pmodel.stages(3).asInstanceOf[DecisionTreeClassificationModel]
println(tmodel.toDebugString)//print the decision process of the tree
//13. Inspect the result
//Data we have:
//userId,predict
//Data we want:
//user id, total number of predictions, times predicted male, times predicted female
//userid,total,male,female
//1234 , 10 ,8, 2
val tempDF:DataFrame = trainResult.union(testResult)
.select('userId,
when('predict === 0, 1).otherwise(0).as("male"), //flag records predicted as male goods with 1
when('predict === 1, 1).otherwise(0).as("female")) //flag records predicted as female goods with 1
.groupBy('userId)
.agg(
count('userId).cast(DoubleType).as("total"), //total number of records per user
sum('male).cast(DoubleType).as("male"), //number of records predicted male
sum('female).cast(DoubleType).as("female") //number of records predicted female
)
tempDF.show(10,false)
/*
+---------+-----+----+------+
|userId |total|male|female|
+---------+-----+----+------+
|138230919|5.0 |3.0 |2.0 |
|4033473 |13.0 |13.0|0.0 |
|13822725 |7.0 |7.0 |0.0 |
|13823083 |17.0 |16.0|1.0 |
|13823681 |3.0 |3.0 |0.0 |
|4034923 |6.0 |5.0 |1.0 |
|4033575 |9.0 |9.0 |0.0 |
|13823431 |8.0 |7.0 |1.0 |
|4033483 |5.0 |4.0 |1.0 |
|4034191 |6.0 |4.0 |2.0 |
+---------+-----+----+------+
*/
//The data we ultimately want:
//userId,gender,tagIds
//so the data above needs one more conversion. What is the conversion rule?
//Rule A: if >=80% of the goods in an order are male goods, the order's user is judged male; if the female-goods ratio reaches 80%, the user is judged female.
//Since these are home appliances, an order usually contains only one item, so rule A is adjusted to rule B:
//Rule B: if more than 60% of a user's orders in the last half year contain male goods, the user is judged male; if more than 60% contain female goods, the user is judged female.
//fiveRule looks like:
// |-- id: long (nullable = false)
// |-- rule: string (nullable = true)
//+---+----+
//|id |rule|
//+---+----+
//|57 |0 |
//|58 |1 |
//|59 |-1 |
//+---+----+
//Given conversion rule B and fiveRule, tag each userId in tempDF with the matching tagId
val ruleMap: Map[String, Long] = fiveRule.as[(Long,String)].map(t=>(t._2,t._1)).collect().toMap
val gender2tag = udf((total:Double,male:Double,female:Double)=>{
val maleRate = male / total
val femaleRate = female / total
if(maleRate >= 0.6){
ruleMap("0")
}else if(femaleRate >= 0.6){
ruleMap("1")
}else{
ruleMap("-1")
}
})
val newDF: DataFrame = tempDF.select('userId,gender2tag('total,'male,'female).as("tagIds"))
newDF
}
/**
* @param predictTrainDF
* @param predictTestDF
*/
def evaluateAUC(predictTrainDF: DataFrame,predictTestDF: DataFrame): Unit = {
// 1. ACC
val accEvaluator = new MulticlassClassificationEvaluator()
.setPredictionCol("predict")
.setLabelCol("label")
.setMetricName("accuracy")//accuracy
val trainAcc: Double = accEvaluator.evaluate(predictTrainDF)
val testAcc: Double = accEvaluator.evaluate(predictTestDF)
println(s"ACC on the training set : $trainAcc")
println(s"ACC on the test set : $testAcc")
//ACC on the training set : 0.7561343903359758
//ACC on the test set : 0.7417582417582418
// 2. AUC
val trainRdd: RDD[(Double, Double)] = predictTrainDF.select("label", "predict").rdd
.map(row => (row.getAs[Double](0), row.getAs[Double](1)))
val testRdd: RDD[(Double, Double)] = predictTestDF.select("label", "predict").rdd
.map(row => (row.getAs[Double](0), row.getAs[Double](1)))
val trainAUC: Double = new BinaryClassificationMetrics(trainRdd).areaUnderROC()
val testAUC: Double = new BinaryClassificationMetrics(testRdd).areaUnderROC()
println(s"AUC on the training set : $trainAUC")
println(s"AUC on the test set : $testAUC")
//AUC on the training set : 0.6731303868870732
//AUC on the test set : 0.6482208896596393
}
}
User shopping preference model - BP
object BPModel extends BaseModel{
def main(args: Array[String]): Unit = {
execute()
}
/**
* Abstract method implemented by subclasses to return the tag id
* @return
*/
override def getTagID(): Int = 60
/**
* The concrete tag-computation logic, implemented by the subclass
* @param fiveRule
* @param HBaseDF
* @return
*/
override def compute(fiveRule: DataFrame, HBaseDF: DataFrame): DataFrame = {
//HBaseDF.show(10,false)
//HBaseDF.printSchema()
/*
+--------------+-------------------------------------------------------------------+-------------------+
|global_user_id|loc_url |log_time |
+--------------+-------------------------------------------------------------------+-------------------+
|424 |http://m.eshop.com/mobile/coupon/getCoupons.html?couponsId=3377 |2019-08-13 03:03:55|
|619 |http://m.eshop.com/?source=mobile |2019-07-29 15:07:41|
|898 |http://m.eshop.com/mobile/item/11941.html |2019-08-14 09:23:44|
|642 |http://www.eshop.com/l/2729-2931.html |2019-08-11 03:20:17|
|130 |http://www.eshop.com/ |2019-08-12 11:59:28|
|515 |http://www.eshop.com/l/2723-0-0-1-0-0-0-0-0-0-0-0.html |2019-07-23 14:39:25|
|274 |http://www.eshop.com/ |2019-07-24 15:37:12|
|772 |http://ck.eshop.com/login.html |2019-07-24 07:56:49|
|189 |http://m.eshop.com/mobile/item/9673.html |2019-07-26 19:17:00|
|529 |http://m.eshop.com/mobile/search/_bplvbiwq_XQS75_btX_ZY1328-se.html|2019-07-25 23:18:37|
+--------------+-------------------------------------------------------------------+-------------------+
only showing top 10 rows
root
|-- global_user_id: string (nullable = true)
|-- loc_url: string (nullable = true)
|-- log_time: string (nullable = true)
*/
//0. Import implicit conversions
import spark.implicits._
import scala.collection.JavaConversions._
import org.apache.spark.sql.functions._
//1. Build the rating matrix (user id, product id, implicit rating = number of times the product was viewed)
//1.1 Parse the url to get the product id
//Custom udf that parses the url and extracts the productId
val url2productId = udf((url:String)=>{
var productId: String = null
if (url.contains("/product/") && url.contains(".html")) {
val start: Int = url.indexOf("/product/")
val end: Int = url.indexOf(".html")
if (end > start) {
productId = url.substring(start + 9, end)
}
}
productId
})
val tempDF: DataFrame = HBaseDF.select('global_user_id.as("userId"),url2productId('loc_url).as("productId"))
.filter('productId.isNotNull)//log records also include visits to pages where no product was viewed, so those records must be filtered out
//tempDF.show(10,false)
//userId,productId
/*
+------+---------+
|userId|productId|
+------+---------+
|81 |11013 |
|81 |11013 |
|302 |5353 |
|370 |9221 |
|405 |4167 |
|685 |9763 |
|733 |9501 |
|659 |11457 |
|642 |12231 |
|182 |9763 |
+------+---------+
only showing top 10 rows
e.g. user 81 viewed product 11013 twice, so its implicit rating is 2
*/
//1.2 Group and count to get (user id, product id, implicit rating = view count)
val tempDF2: DataFrame = tempDF.groupBy('userId, 'productId)
.agg(count('productId) as "rating")
//1.3 Cast to the types ALS needs (user id: Int, product id: Int, implicit rating (view count): Double)
val ratingDF: Dataset[Row] = tempDF2.select('userId.cast(IntegerType), 'productId.cast(IntegerType), 'rating.cast(DoubleType))
//Some user or product ids in the data may not be numeric strings;
//the cast above then fails and Spark SQL does not throw an error but returns null instead.
//The ML algorithm later cannot handle null, so such rows must be filtered out, i.e. keep only non-null rows
.filter('userId.isNotNull && 'productId.isNotNull && 'rating.isNotNull)
//ratingDF.show(10,false)
//ratingDF.printSchema()
//user id: Int, product id: Int, implicit rating (view count): Double
/*
+------+---------+------+
|userId|productId|rating|
+------+---------+------+
|533 |11455 |1.0 |
|322 |11949 |1.0 |
|258 |7467 |1.0 |
|558 |10937 |1.0 |
|555 |10333 |1.0 |
|24 |11111 |1.0 |
|601 |5214 |2.0 |
|756 |10795 |1.0 |
|501 |12233 |3.0 |
|395 |9499 |1.0 |
+------+---------+------+
This gives us the user-product rating matrix
*/
//2. Build and train the ALS model
val model: ALSModel = new ALS()
.setUserCol("userId")
.setItemCol("productId")
.setRatingCol("rating")
.setImplicitPrefs(true) //use implicit ratings; the default is false, which means explicit ratings
.setRank(10) //number of latent factors, default 10 (the k in: m rows * n cols = (m rows * k cols) X (k rows * n cols))
.setMaxIter(10) //maximum number of iterations
.setAlpha(1.0) //the default is 1.0
.fit(ratingDF)
//3. Take the predicted ratings and recommend 5 products each user may be interested in
val result: DataFrame = model.recommendForAllUsers(5)
//result.show(10,false)
//result.printSchema()
//userId,[list of 5 recommended products]
/*
+------+------------------------------------------------------------------------------------------------+
|userId|recommendations |
+------+------------------------------------------------------------------------------------------------+
|471 |[[10935,0.85480124], [6603,0.82611465], [9371,0.77806026], [6393,0.7115429], [6395,0.7073429]] |
|463 |[[6603,0.73332685], [10935,0.73117584], [9371,0.7171242], [7173,0.68159044], [6395,0.6529176]] |
|833 |[[6603,0.79149675], [10935,0.76704717], [9371,0.7628299], [7173,0.7460966], [6393,0.71079165]] |
|496 |[[6603,0.8117078], [9371,0.7372035], [10935,0.7206242], [6393,0.71912766], [6395,0.69081223]] |
|148 |[[6603,0.68078536], [10935,0.6766914], [9371,0.65072364], [7173,0.62549454], [6393,0.6091096]] |
|540 |[[6603,0.7680414], [10935,0.7268602], [9371,0.7239733], [6393,0.7048858], [6395,0.6673726]] |
|392 |[[9371,0.8268737], [10935,0.8252757], [6603,0.78230804], [10781,0.76121557], [7173,0.74036145]] |
|243 |[[10935,0.8475637], [10613,0.8446924], [6603,0.81898254], [9371,0.7813303], [6393,0.7759238]] |
|623 |[[9371,0.8200202], [6603,0.81787753], [10935,0.7876395], [7173,0.7756662], [5394,0.7284824]] |
|737 |[[6603,0.83411646], [9371,0.79150367], [11949,0.73539114], [10935,0.73003834], [6393,0.7108383]]|
+------+------------------------------------------------------------------------------------------------+
only showing top 10 rows
root
|-- userId: integer (nullable = false)
|-- recommendations: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- productId: integer (nullable = true)
| | |-- rating: float (nullable = true)
*/
//
val newDF: DataFrame = result.as[(Int, Array[(Int, Double)])].map(t => {
val userId = t._1
val tagIds: String = t._2.map(_._1).mkString(",")
(userId, tagIds)
}).toDF("userId", "tagIds")
newDF.show(10,false)
/*
+------+--------------------------+
|userId|tagIds |
+------+--------------------------+
|471 |10935,6603,9371,6393,6395 |
|463 |6603,10935,9371,7173,6395 |
|833 |6603,10935,9371,7173,6393 |
|496 |6603,9371,10935,6393,6395 |
|148 |6603,10935,9371,7173,6393 |
|540 |6603,10935,9371,6393,6395 |
|392 |9371,10935,6603,10781,7173|
|243 |10935,10613,6603,9371,6393|
|623 |9371,6603,10935,7173,5394 |
|737 |6603,9371,11949,10935,6393|
+------+--------------------------+
*/
newDF
}
}