When a feature is represented by continuous numeric values, those values usually get some mathematical preprocessing; the two main techniques are normalization (amplitude scaling) and discretization.
Amplitude scaling / normalization
Features should stand on equal footing with one another; the differences should show up within a feature, not between features.
For example, house price and floor area live on very different scales: prices might fall between 3,000,000 and 15,000,000 (yuan), while floor area falls between 40 and 300 (square meters). Two features that ought to be equals then contribute very differently once fed into the same model, purely because of their magnitudes, which is unreasonable.
Min-max normalization fixes this: subtract the feature's minimum from each value and divide by the difference between the maximum and minimum, giving a new feature that lies in [0, 1].
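A minimal sketch of min-max scaling in NumPy (the price and area values below are made up purely for illustration):
import numpy as np

price = np.array([3_000_000, 8_000_000, 15_000_000], dtype=float)
area = np.array([40, 120, 300], dtype=float)

def min_max(x):
    # (x - min) / (max - min) maps every value into [0, 1]
    return (x - x.min()) / (x.max() - x.min())

print(min_max(price))  # [0.         0.41666667 1.        ]
print(min_max(area))   # [0.         0.30769231 1.        ]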
# 1. Import dependencies
import numpy as np
import pandas as pd
# 2. Define the data and preprocess it
docA = "The cat sat on my bed"
docB = "The dog sat on my knees"
bowA = docA.split(" ")
bowB = docB.split(" ")
# 3. Build the vocabulary
wordSet = set(bowA).union(set(bowB))
wordSet # {'The', 'bed', 'cat', 'dog', 'knees', 'my', 'on', 'sat'}
# Count word occurrences
# Use a dictionary to record how many times each word appears
wordDictA = dict.fromkeys(wordSet, 0)
wordDictB = dict.fromkeys(wordSet, 0)
wordDictA
# {'The': 0,
# 'sat': 0,
# 'cat': 0,
# 'on': 0,
# 'dog': 0,
# 'my': 0,
# 'bed': 0,
# 'knees': 0}
# Walk through each document and count its words
for word in bowA:
    wordDictA[word] += 1
for word in bowB:
    wordDictB[word] += 1
pd.DataFrame([wordDictA, wordDictB])
# The sat cat on dog my bed knees
# 0 1 1 1 1 0 1 1 0
# 1 1 1 0 1 1 1 0 1
# 4. Compute the term frequency (TF)
def computeTF(wordDict, bow):
    # Record tf in a dictionary: compute tf for every word relative to the bow document
    tfDict = {}
    nbowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / nbowCount
    return tfDict
tfA = computeTF(wordDictA, bowA)
tfB = computeTF(wordDictB, bowB)
tfA
# {'The': 0.16666666666666666,
# 'sat': 0.16666666666666666,
# 'cat': 0.16666666666666666,
# 'on': 0.16666666666666666,
# 'dog': 0.0,
# 'my': 0.16666666666666666,
# 'bed': 0.16666666666666666,
# 'knees': 0.0}
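The TF computed above is just the raw count divided by the document length: TF(w, d) = count(w, d) / len(d). docA has 6 tokens and each of its words appears exactly once, so every present word gets 1/6 ≈ 0.1667 and absent words ('dog', 'knees') get 0, matching the output.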
# 5. Compute the inverse document frequency (IDF)
def computeIDF(wordDictList):
    import math
    # Store the idf results in a dictionary: each word is a key, initialized to 0
    idfDict = dict.fromkeys(wordDictList[0], 0)
    N = len(wordDictList)
    for wordDict in wordDictList:
        # Walk through every word in this document's dictionary
        for word, count in wordDict.items():
            if count > 0:
                # Increment Ni, the number of documents containing the word
                idfDict[word] += 1
    # Ni is now known for every word i; turn it into an idf value using the formula
    for word, ni in idfDict.items():
        idfDict[word] = math.log10((N + 1) / (ni + 1))
    return idfDict
idfs = computeIDF([wordDictA, wordDictB])
idfs
# {'The': 0.0,
# 'sat': 0.0,
# 'cat': 0.17609125905568124,
# 'on': 0.0,
# 'dog': 0.17609125905568124,
# 'my': 0.0,
# 'bed': 0.17609125905568124,
# 'knees': 0.17609125905568124}
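The IDF above uses the smoothed formula idf(w) = log10((N + 1) / (N_w + 1)), where N is the number of documents and N_w is the number of documents containing w. With N = 2, words present in both documents ('The', 'sat', 'on', 'my') get log10(3/3) = 0, while words unique to one document ('cat', 'dog', 'bed', 'knees') get log10(3/2) ≈ 0.176, matching the output.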
# 6. Compute TF-IDF
def computeTFIDF(tf, idfs):
    tfidf = {}
    for word, tfval in tf.items():
        tfidf[word] = tfval * idfs[word]
    return tfidf
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
pd.DataFrame([tfidfA, tfidfB])
# The sat cat on dog my bed knees
# 0 0.0 0.0 0.029349 0.0 0.000000 0.0 0.029349 0.000000
# 1 0.0 0.0 0.000000 0.0 0.029349 0.0 0.000000 0.029349
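For comparison, the same idea is available out of the box in scikit-learn (assuming it is installed); note that TfidfVectorizer uses a different IDF smoothing (natural log plus 1) and l2-normalizes each row, so the absolute numbers will not match the hand-rolled values above:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()                        # lowercases and tokenizes by default
tfidf_matrix = vectorizer.fit_transform([docA, docB])
print(vectorizer.get_feature_names_out())             # vocabulary learned from both documents
print(tfidf_matrix.toarray())                         # one l2-normalized TF-IDF row per document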
# 1. Import dependencies
import numpy as np
import pandas as pd
# 2. Prepare the data
# Rating matrix R
R = np.array([[4, 0, 2, 0, 1],
              [0, 2, 3, 0, 0],
              [1, 0, 2, 4, 0],
              [5, 0, 0, 3, 1],
              [0, 0, 1, 5, 1],
              [0, 3, 2, 4, 1]])
len(R[0]) # 5
# 3. Implement the algorithm
"""
输入参数:
R:M*N的评分矩阵
K:隐特征向量维度
max_iter:最大迭代次数
alpha:步长
lambda:正则化系数
输出:
分解之后的P,Q
P:初始化用户特征矩阵M*K
Q:初始化物品特征矩阵N*K
"""
# Hyperparameters
K = 2
max_iter = 5000
alpha = 0.0002
lamda = 0.004
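What the function below implements is gradient descent on the regularized squared error of the observed ratings. For each observed rating R[u][i] it computes the prediction error and nudges the corresponding latent vectors:
eui = P[u, :] · Q[:, i] - R[u][i]
cost = Σ over observed (u, i) of eui² + lamda * Σ_k (P[u][k]² + Q[k][i]²)
P[u][k] ← P[u][k] - alpha * (2 * eui * Q[k][i] + 2 * lamda * P[u][k])
Q[k][i] ← Q[k][i] - alpha * (2 * eui * P[u][k] + 2 * lamda * Q[k][i])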
# Core algorithm
def LFM_grad_desc(R, K=2, max_iter=1000, alpha=0.0001, lamda=0.002):
    # Basic dimensions
    M = len(R)
    N = len(R[0])
    # Initialize P and Q with random M*K and N*K matrices
    P = np.random.rand(M, K)
    Q = np.random.rand(N, K)
    # Transpose Q so that column i holds item i's feature vector
    Q = Q.T
    # Start iterating
    for step in range(max_iter):
        # Loop over every user u and item i and run gradient descent on the
        # corresponding feature vectors Pu and Qi
        for u in range(M):
            for i in range(N):
                # For every rating greater than 0, compute the prediction error
                if R[u][i] > 0:
                    eui = np.dot(P[u, :], Q[:, i]) - R[u][i]
                    # Plug into the formula and update Pu and Qi by gradient descent
                    for k in range(K):
                        P[u][k] = P[u][k] - alpha * (2 * eui * Q[k][i] + 2 * lamda * P[u][k])
                        Q[k][i] = Q[k][i] - alpha * (2 * eui * P[u][k] + 2 * lamda * Q[k][i])
        # All users and items visited, all feature vectors updated; P and Q give
        # the predicted rating matrix
        predR = np.dot(P, Q)
        # Compute the current value of the loss function
        cost = 0
        for u in range(M):
            for i in range(N):
                if R[u][i] > 0:
                    cost += (np.dot(P[u, :], Q[:, i]) - R[u][i]) ** 2
                    # Add the regularization term
                    for k in range(K):
                        cost += lamda * (P[u][k] ** 2 + Q[k][i] ** 2)
        if cost < 0.0001:
            break
    return P, Q.T, cost
# 4. Test
P, Q, cost = LFM_grad_desc(R, K, max_iter, alpha, lamda)
print(P)
print(Q)
print(cost)
print(R)
predR = P.dot(Q.T)
predR
"""
[[ 1.51434749 0.77443196]
[ 1.24971278 1.50631155]
[ 0.3242702 1.54690507]
[ 1.87077959 -0.08405844]
[ 1.71301986 0.71429246]
[ 1.74590798 0.55058081]]
[[2.60086925 0.1612641 ]
[1.39956377 0.38112632]
[0.59704728 1.20348814]
[1.77147573 2.1578591 ]
[0.56379486 0.07498884]]
2.1410251115602126
[[4 0 2 0 1]
[0 2 3 0 0]
[1 0 2 4 0]
[5 0 0 3 1]
[0 0 1 5 1]
[0 3 2 4 1]]
array([[4.06350791, 2.41458229, 1.83615673, 4.35374487, 0.91185508],
[3.49325351, 2.32314771, 2.55896571, 5.46424393, 0.81753819],
[1.09284464, 1.04340306, 2.05528654, 3.91243996, 0.29882248],
[4.85209752, 2.58623846, 1.01578053, 3.13265437, 1.04843247],
[4.57053041, 2.6697162 , 1.88239635, 4.57591558, 1.01935575],
[4.62966731, 2.65335041, 1.7050071 , 4.28090943, 1.02562136]])
"""
# Statistics: historically most-rated movies
select mid, count(mid) as count from ratings group by mid
# => RateMoreMovies
# Statistics: recently popular movies
select mid, score, changeDate(timestamp) as yearmonth from ratings
# => ratingOfMonth
select mid, count(mid) as count ,yearmonth from ratingOfMonth group by yearmonth,mid order by yearmonth desc,count desc
# => RateMoreRecentlyMovies
# Statistics: average rating per movie
select mid, avg(score) as avg from ratings group by mid
# => AverageMovies
# Statistics: top 10 rated movies per genre
select a.mid, genres, if(isnull(b.avg),0,b.avg) score from movies a left join averageMovies b on a.mid = b.mid
# => movieWithScore
spark.sql("select * from (select " +
  "mid, " +
  "gen, " +
  "score, " +
  "row_number() over(partition by gen order by score desc) rank " +
  "from " +
  "(select mid, score, explode(splitGe(genres)) gen from movieWithScore) genresMovies) rankGenresMovies " +
  "where rank <= 10")