Table of Contents
1. Attention module
2. Three-layer fully connected network
3. Logistic Regression
4. FM (Factorization Machine)
5. Hand-written K-means
6. word2vec encoding/decoding layers
7. Swing
8. CNN network
9. Transformer code
def attention(queries, keys, keys_length):
'''
queries: [B, H]
keys: [B, T, H]
keys_length: [B]
'''
queries_hidden_units = queries.get_shape().as_list()[-1]
queries = tf.tile(queries, [1, tf.shape(keys)[1]])
queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])
din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])
outputs = d_layer_3_all
# Mask
key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1]) # [B, T]
key_masks = tf.expand_dims(key_masks, 1) # [B, 1, T]
paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
outputs = tf.where(key_masks, outputs, paddings) # [B, 1, T]
# Scale
outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
# Activation
outputs = tf.nn.softmax(outputs) # [B, 1, T]
# Weighted sum
outputs = tf.matmul(outputs, keys) # [B, 1, H]
return outputs
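A minimal usage sketch of this DIN-style attention block follows (assuming TensorFlow 1.x is already imported as tf; the shapes, sizes and placeholder names below are illustrative, not from the original post):

# Illustrative only: feed a candidate-item embedding and a padded behavior sequence through attention().
H, T = 64, 20                                              # hidden size and max sequence length (assumed)
queries_ph = tf.placeholder(tf.float32, [None, H])         # [B, H]  candidate item embedding
keys_ph = tf.placeholder(tf.float32, [None, T, H])         # [B, T, H]  padded behavior embeddings
keys_length_ph = tf.placeholder(tf.int32, [None])          # [B]  true length of each behavior sequence

att_vec = attention(queries_ph, keys_ph, keys_length_ph)   # [B, 1, H]  attention-weighted behavior summary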
def dnn_process(self):
    # Three fully connected layers with batch normalization and dropout;
    # the last layer projects the user representation to the item embedding size.
    layer_normal_1 = tf.layers.batch_normalization(inputs=self.input_embed, name="batch_normal_1", trainable=self.global_normal)
    layer_1 = tf.layers.dense(layer_normal_1, 1024, activation=tf.nn.relu, name="layer_1")
    layer_dropout_1 = tf.nn.dropout(layer_1, keep_prob=self._keep_prob, name="dropout_1")
    layer_normal_2 = tf.layers.batch_normalization(inputs=layer_dropout_1, name="batch_normal_2", trainable=self.global_normal)
    layer_2 = tf.layers.dense(layer_normal_2, 512, activation=tf.nn.relu, name="layer_2")
    layer_dropout_2 = tf.nn.dropout(layer_2, keep_prob=self._keep_prob, name="dropout_2")
    layer_normal_3 = tf.layers.batch_normalization(inputs=layer_dropout_2, name="batch_normal_3", trainable=self.global_normal)
    layer_3 = tf.layers.dense(layer_normal_3, self._item_esize, activation=tf.nn.relu, name="layer_3")
    self._user_embed = tf.expand_dims(layer_3, 1)
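The method references several attributes that are defined elsewhere in the original model class. A hypothetical minimal setup is sketched below; only the attribute names come from the code above, the class name and default values are assumptions:

# Hypothetical container for the attributes dnn_process() relies on.
class UserTower(object):
    def __init__(self, input_embed, item_esize=64, keep_prob=0.8, global_normal=True):
        self.input_embed = input_embed      # [B, D] concatenated input features
        self._item_esize = item_esize       # output size, matched to the item embedding dimension
        self._keep_prob = keep_prob         # dropout keep probability
        self.global_normal = global_normal  # forwarded to batch_normalization's `trainable` flag

    dnn_process = dnn_process               # bind the standalone function above as the method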
import numpy as np
from LoadDataSet import loadDataset
from logistic_regression import gradientDescent, sigmoid  # sigmoid is assumed to live in the same module; a sketch of both helpers follows the script

train_dataMat, train_labelMat, test_dataMat, test_labelMat = loadDataset('I:\wangpengfei-D\DataSet\\two_classier\\testSet.txt')
mtrain, ntrain = np.shape(train_dataMat)
mtest, ntest = np.shape(test_dataMat)
numIterations = 100000  # number of gradient-descent iterations
alpha = 0.0005  # step size of each update
theta = np.ones(shape=(ntrain, 1))  # parameter vector theta
theta = gradientDescent(train_dataMat, train_labelMat.transpose(), theta, alpha, mtrain, numIterations)  # returns the trained theta
y_hat = np.dot(test_dataMat, theta)  # scores for the test set, stored in y_hat
mark = []
for i in range(30):  # the first 30 test samples
    res = sigmoid(y_hat[i])
    if res > 0.5:
        mark.append(1)
    else:
        mark.append(0)
print('predicted result:', mark)
print('real result:     ', test_labelMat)
right_sum = 0
for i in range(30):
    if mark[i] == test_labelMat[i]:
        right_sum += 1
print("right number: %d, right rate: %f" % (right_sum, right_sum * 1.0 / 30))
# -*- coding: utf-8 -*-
from __future__ import division
from math import exp
from numpy import *
from random import normalvariate # 正态分布
from sklearn import preprocessing
import numpy as np
'''
data : path to the data file
feature_potential : number of latent factor dimensions
alpha : learning rate
iter : number of iterations
_w, _w_0, _v : weights of the factorized sub-matrices
with_col : whether the file carries column names
first_col : index of the first meaningful feature column
'''
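For reference, the model this class implements is the second-order Factorization Machine, and the cross term in fit() below relies on the usual $O(kn)$ identity:

$$\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle\, x_i x_j$$

$$\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle\, x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\left[\Big(\sum_{i=1}^{n} v_{i,f}\, x_i\Big)^{2} - \sum_{i=1}^{n} v_{i,f}^{2}\, x_i^{2}\right]$$

In the code, `inter_1` holds $\sum_i v_{i,f} x_i$ for each factor $f$, `inter_2` holds $\sum_i v_{i,f}^2 x_i^2$, and `interaction` is the bracketed term summed over $f$ and divided by 2.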
class fm(object):
    def __init__(self):
        self.data = None
        self.feature_potential = None
        self.alpha = None
        self.iter = None
        self._w = None
        self._w_0 = None
        self._v = None
        self.with_col = None
        self.first_col = None
    def min_max(self, data):
        self.data = data
        min_max_scaler = preprocessing.MinMaxScaler()
        return min_max_scaler.fit_transform(self.data)

    def loadDataSet(self, data, with_col=True, first_col=2):
        # Plain file parsing; for small data pd.read_table() would read this directly.
        self.first_col = first_col
        dataMat = []
        labelMat = []
        fr = open(data)
        self.with_col = with_col
        if self.with_col:
            N = 0
            for line in fr.readlines():
                # skip the header row (N == 0)
                if N > 0:
                    currLine = line.strip().split()
                    lineArr = []
                    featureNum = len(currLine)
                    for i in range(self.first_col, featureNum):
                        lineArr.append(float(currLine[i]))
                    dataMat.append(lineArr)
                    labelMat.append(float(currLine[1]) * 2 - 1)  # map {0, 1} labels to {-1, +1}
                N = N + 1
        else:
            for line in fr.readlines():
                currLine = line.strip().split()
                lineArr = []
                featureNum = len(currLine)
                for i in range(2, featureNum):
                    lineArr.append(float(currLine[i]))
                dataMat.append(lineArr)
                labelMat.append(float(currLine[1]) * 2 - 1)
        return mat(self.min_max(dataMat)), labelMat

    def sigmoid(self, inx):
        # return 1.0/(1+exp(min(max(-inx,-10),10)))  # clipped variant that avoids overflow
        return 1.0 / (1 + exp(-inx))
    # Learn the feature weights (w_0, w and the factor matrix v)
    def fit(self, data, feature_potential=8, alpha=0.01, iter=100):
        # alpha is the learning rate
        self.alpha = alpha
        self.feature_potential = feature_potential
        self.iter = iter
        # dataMatrix is a numpy mat, classLabels is a list
        dataMatrix, classLabels = self.loadDataSet(data)
        print('dataMatrix:', dataMatrix.shape)
        print('classLabels:', classLabels)
        k = self.feature_potential
        m, n = shape(dataMatrix)
        # initialize the parameters
        w = zeros((n, 1))  # n is the number of features
        w_0 = 0.
        v = normalvariate(0, 0.2) * ones((n, k))
        for it in range(self.iter):  # number of passes over the data
            # optimize sample by sample (SGD)
            for x in range(m):
                # Note: a dot product is usually accompanied by a sum, an element-wise product usually is not;
                # see http://blog.csdn.net/google19890102/article/details/45532745 for the derivation used here.
                # xi . V : matrix product of the sample row and V
                inter_1 = dataMatrix[x] * v
                # (xi * xi) . (V * V) : element-wise squares followed by a matrix product
                inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # multiply is element-wise
                # cross term: ((xi . V)^2 - xi^2 . V^2) / 2
                interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
                # predicted output
                p = w_0 + dataMatrix[x] * w + interaction
                print('classLabels[x]:', classLabels[x])
                print('predicted output p:', p)
                # sigmoid(y * pred_y) - 1
                loss = self.sigmoid(classLabels[x] * p[0, 0]) - 1
                if loss >= -1:
                    loss_res = 'positive direction'
                else:
                    loss_res = 'negative direction'
                # update the parameters
                w_0 = w_0 - self.alpha * loss * classLabels[x]
                for i in range(n):
                    if dataMatrix[x, i] != 0:
                        w[i, 0] = w[i, 0] - self.alpha * loss * classLabels[x] * dataMatrix[x, i]
                        for j in range(k):
                            v[i, j] = v[i, j] - self.alpha * loss * classLabels[x] * (
                                dataMatrix[x, i] * inter_1[0, j] - v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])
            print('iteration %s finished, loss direction: %s' % (it, loss_res))
        self._w_0, self._w, self._v = w_0, w, v
    def predict(self, X):
        if (self._w_0 is None) or (self._w is None) or (self._v is None):
            raise NotFittedError("Estimator not fitted, call `fit` first")
        # type check
        if isinstance(X, np.ndarray):
            pass
        else:
            try:
                X = np.array(X)
            except:
                raise TypeError("numpy.ndarray required for X")
        w_0 = self._w_0
        w = self._w
        v = self._v
        m, n = shape(X)
        result = []
        for x in range(m):
            inter_1 = mat(X[x]) * v
            inter_2 = mat(multiply(X[x], X[x])) * multiply(v, v)  # multiply is element-wise
            # cross term
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
            p = w_0 + mat(X[x]) * w + interaction  # predicted output (wrap the row in mat() so this is a dot product, not broadcasting)
            pre = self.sigmoid(p[0, 0])
            result.append(pre)
        return result
    def getAccuracy(self, data):
        dataMatrix, classLabels = self.loadDataSet(data)
        w_0 = self._w_0
        w = self._w
        v = self._v
        m, n = shape(dataMatrix)
        allItem = 0
        error = 0
        result = []
        for x in range(m):
            allItem += 1
            inter_1 = dataMatrix[x] * v
            inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # multiply is element-wise
            # cross term
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
            p = w_0 + dataMatrix[x] * w + interaction  # predicted output
            pre = self.sigmoid(p[0, 0])
            result.append(pre)
            if pre < 0.5 and classLabels[x] == 1.0:
                error += 1
            elif pre >= 0.5 and classLabels[x] == -1.0:
                error += 1
            else:
                continue
        # print(result)
        value = 1 - float(error) / allItem
        return value
class NotFittedError(Exception):
    """
    Exception class to raise if the estimator is used before fitting.
    """
    pass


if __name__ == '__main__':
    fm()
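A hypothetical end-to-end run might look like the following; 'train.txt' and 'test.txt' are placeholders, and the files must match the format loadDataSet() expects (whitespace-separated columns with the {0, 1} label in column 1 and features from first_col onward):

# Hypothetical usage sketch, file names are placeholders.
model = fm()
model.fit('train.txt', feature_potential=8, alpha=0.01, iter=100)  # SGD training
X_test, y_test = model.loadDataSet('test.txt')                     # labels come back in {-1, +1}
scores = model.predict(np.asarray(X_test))                         # sigmoid scores in (0, 1)
print('test accuracy:', model.getAccuracy('test.txt'))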
# -*- coding: utf-8 -*-
"""
Reference: https://gist.github.com/iandanforth/5862470
"""
import random
from kmeans_tools import Cluster, get_distance, gen_random_sample
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors
def kmeans(samples, k, cutoff):
    """
    K-means clustering.
    """
    # Randomly pick k samples as the initial cluster centroids
    init_samples = random.sample(samples, k)
    # Create k clusters, each centered on one of the random initial samples
    clusters = [Cluster([sample]) for sample in init_samples]
    # Iterate until the partition is stable
    n_loop = 0
    while True:
        # One empty list per cluster to collect its assigned samples
        lists = [[] for _ in clusters]
        # Next iteration
        n_loop += 1
        # Assign every sample in the data set to its nearest centroid
        for sample in samples:
            # Distance from the sample to the first centroid
            smallest_distance = get_distance(sample, clusters[0].centroid)
            # Tentatively assign it to cluster 0
            cluster_index = 0
            # Compare against the remaining centroids
            for i in range(k - 1):
                distance = get_distance(sample, clusters[i + 1].centroid)
                # Keep the smallest distance seen so far
                if distance < smallest_distance:
                    smallest_distance = distance
                    cluster_index = i + 1
            # Record the sample under its nearest cluster
            lists[cluster_index].append(sample)
        # Largest centroid shift in this iteration
        biggest_shift = 0.0
        # Update each centroid and measure how far it moved
        for i in range(k):
            shift = clusters[i].update(lists[i])
            biggest_shift = max(biggest_shift, shift)
        # Converged once the largest shift falls below the cutoff
        if biggest_shift < cutoff:
            print("Clustering converged after {} iterations.".format(n_loop))
            break
    # Return the clustering result
    return clusters
def run_main():
    """
    Main entry point.
    """
    # Number of samples
    n_samples = 1000
    # Number of features (dimensionality)
    n_feat = 2
    # Feature value range
    lower = 0
    upper = 200
    # Number of clusters
    n_cluster = 3
    # Generate random samples
    samples = [gen_random_sample(n_feat, lower, upper) for _ in range(n_samples)]
    # Convergence threshold
    cutoff = 0.2
    clusters = kmeans(samples, n_cluster, cutoff)
    # Print the result
    for i, c in enumerate(clusters):
        for sample in c.samples:
            print('cluster--{}, sample--{}'.format(i, sample))
    # Visualize the result
    plt.subplot()
    color_names = list(mcolors.cnames)
    for i, c in enumerate(clusters):
        x = []
        y = []
        color = [color_names[i]] * len(c.samples)
        for sample in c.samples:
            x.append(sample.coords[0])
            y.append(sample.coords[1])
        plt.scatter(x, y, c=color)
    plt.show()


if __name__ == '__main__':
    run_main()
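The kmeans_tools module imported at the top of this script is not shown in the post. A minimal sketch of the interface the code relies on (a Sample with a coords attribute, a Cluster with samples/centroid/update(), plus get_distance and gen_random_sample), inferred from how they are used above:

# kmeans_tools.py -- minimal sketch; the interface is an assumption based on the usage above.
import math
import random

class Sample(object):
    def __init__(self, coords):
        self.coords = coords          # list of feature values, e.g. [x, y]
        self.n_dim = len(coords)

    def __repr__(self):
        return str(self.coords)

class Cluster(object):
    def __init__(self, samples):
        self.samples = samples
        self.n_dim = samples[0].n_dim
        self.centroid = self.calc_centroid()

    def calc_centroid(self):
        # Mean of the member samples along each dimension
        coords = [sum(s.coords[d] for s in self.samples) / len(self.samples)
                  for d in range(self.n_dim)]
        return Sample(coords)

    def update(self, samples):
        # Replace the members (if any), recompute the centroid, return how far it moved
        old_centroid = self.centroid
        if samples:
            self.samples = samples
            self.centroid = self.calc_centroid()
        return get_distance(old_centroid, self.centroid)

def get_distance(a, b):
    # Euclidean distance between two samples
    return math.sqrt(sum((a.coords[d] - b.coords[d]) ** 2 for d in range(a.n_dim)))

def gen_random_sample(n_dim, lower, upper):
    return Sample([random.uniform(lower, upper) for _ in range(n_dim)])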
#!usr/bin/env python
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import random
import jieba
import numpy as np
from six.moves import xrange
import tensorflow as tf
# Step 1: read the file contents into a single list of tokens
def read_data():
    """
    Preprocess the training text and collect all of its words into one list.
    """
    # Read the text, clean it, segment it with jieba, and build the word list
    raw_word_list = []
    with open('test.txt', "r", encoding='UTF-8') as f:
        line = f.readline()
        while line:
            while '\n' in line:
                line = line.replace('\n', '')
            while ' ' in line:
                line = line.replace(' ', '')
            if len(line) > 0:  # non-empty sentence
                raw_words = list(jieba.cut(line, cut_all=False))
                raw_word_list.extend(raw_words)
            line = f.readline()
    return raw_word_list


words = read_data()
print('Data size', len(words))
# Step 2: build the word-to-id mapping and count out-of-vocabulary words as UNK
vocabulary_size = 50000


def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(words)
data_index = 0  # global cursor used by generate_batch below
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # sliding-window buffer
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        # the target label sits at the center of the buffer
        target = skip_window  # index of the center word within the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)  # pick a random context position
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]  # the buffer implements the sliding window
            labels[i * num_skips + j, 0] = buffer[target]   # each (center, context) pair uses a different context position
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
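A quick sanity check (not in the original post) that prints a few (center, context) id pairs produced by generate_batch, using the data and reverse_dictionary built above:

# Illustrative check of the skip-gram batching.
demo_batch, demo_labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(demo_batch[i], reverse_dictionary[demo_batch[i]],
          '->', demo_labels[i, 0], reverse_dictionary[demo_labels[i, 0]])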
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2
valid_size = 8       # number of validation words used to inspect nearest neighbours
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # assumed definition (the excerpt omits it), following the standard skip-gram tutorial
num_sampled = 64     # number of negative examples to sample (NCE)

# Build the computation graph
graph = tf.Graph()
with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]), dtype=tf.float32)
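    # --- The post omits Steps 3-4 here; the training loop below references `loss`, `optimizer`,
    # --- `similarity`, `normalized_embeddings` and `init`, so definitions along the lines of the
    # --- standard TensorFlow skip-gram tutorial are assumed. A sketch:
    # NCE loss averaged over the batch
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
    # SGD optimizer
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Cosine similarity between the validation words and all embeddings
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
    # Variable initializer
    init = tf.global_variables_initializer()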
# Step 5: Begin training.
num_steps = 3000000

# Run the session
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print("Initialized")
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run())
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0
        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # skip position 0, the word itself
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
package com.vs.mllib.rec.online.graph
import java.util.Properties
import com.alibaba.fastjson.JSON
import com.constants.Constants
import com.utils.HBaseUtils
import com.utils.ConnUtils
import com.utils.HttpUtils
import kafka.javaapi.producer.Producer
import kafka.producer.{KeyedMessage, ProducerConfig}
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.{Bytes, MD5Hash}
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.streaming.dstream.DStream
import redis.clients.jedis.{ShardedJedis, ShardedJedisPipeline}
import org.apache.spark.mllib.recommendation.Rating
import scala.util.Try
object Swing {
/**
 * Step 1 of the graph-based recommendation algorithm. The main input is user-to-item rating data produced by
 * upstream processing. This job is responsible for quickly (ideally within 10 seconds, so that a user's browsing
 * behavior is collected promptly and recommendations can react fast) recording each user's click/browse history
 * (stored in HBase and Redis) and the set of users who viewed each item (stored in HBase), and for forwarding the
 * rating data to Kafka so that Steps 2 and 3 can update the model.
 *
 * @param ratings stream of (userID, productID, rating) tuples
 * @param redisNum number of recent browse records to keep per user in Redis (10-15 recommended)
 * @param prefix key prefix for a user's recent browse records in Redis
 * @param expire expiration time of a user's recent browse records in Redis (one hour recommended)
 * @param num1 number of records to keep in the user browse-history table (30 recommended)
 * @param table1 user browse-history table name (HBase by default)
 * @param family1 column family of the user browse-history table
 * @param column1 column name of the user browse-history table
 * @param kafkaBrokers Kafka brokers (format x.x.x.x:xxxx,x.x.x.x:xxxx)
 * @param topic Kafka topic consumed by Step 2 and Step 3
 * @param hBaseZk HBase ZooKeeper quorum (format node-2,node-3,node-4,node-5,node-6)
 * @param zkParent1 HBase znode parent (default /hbase)
 * @param redisBrokers Redis brokers (format x.x.x.x:xxxx,x.x.x.x:xxxx)
 * @param redisPwd Redis password, if any
 */
def runStep1(ratings:DStream[Rating],
redisNum:Int,prefix:String,expire:Int,num1:Int,table1:String,family1:String,column1:String,
kafkaBrokers:String,topic:String,
hBaseZk:String,zkParent1:String,redisBrokers:String,redisPwd:String) {
val props =new Properties()
props.put("metadata.broker.list", kafkaBrokers)
props.put("serializer.class", "kafka.serializer.StringEncoder")
ratings.foreachRDD(rdd=>{
rdd.groupBy(_.user).foreachPartition(part => {
ConnUtils.setHBaseZk(hBaseZk)
ConnUtils.setZkParent1(zkParent1)
val conn=ConnUtils.HBaseClient.conn
val updateHistory =new ArrayBuffer[Put]()
ConnUtils.setRedisBrokers(redisBrokers)
ConnUtils.setRedisPwd(redisPwd)
val redis = ConnUtils.CommonRedisClient.pool.getResource
val pipeline=redis.pipelined()
part.foreach(row => {
val userHistories =updateUserBrowserHistory(row,pipeline, redisNum, num1, conn,
prefix, expire, table1, family1, column1)
updateHistory.append(HBaseUtils.getPutAction(MD5Hash.getMD5AsHex(Bytes.toBytes(row._1.toString)), family1,
Array(column1), Array(userHistories.mkString(","))))
})
HBaseUtils.addDataBatchEx(table1, updateHistory.asJava, conn)
pipeline.sync()
ConnUtils.CommonRedisClient.pool.returnResourceObject(redis)
HBaseUtils.addDataBatchEx(table1, updateHistory.asJava, conn)
})
})
ratings.foreachRDD(rdd=>{
rdd.foreachPartition(part => {
val sends=new java.util.ArrayList[KeyedMessage[String, String]]()
val kafkaConfig =new ProducerConfig(props)
val producer =new Producer[String, String](kafkaConfig)
part.foreach(row => {
sends.add(new KeyedMessage[String, String](topic, row.user+"_"+row.product+"_"+row.rating))
})
producer.send(sends)
producer.close
})
})
}
/**
 * Step 2 of the graph-based recommendation algorithm: update the set of users who have viewed each item,
 * form all pairs of users within that set (each pair means two people viewed or bought the same item),
 * and accumulate in HBase the number of items each user pair has co-viewed or co-purchased.
 *
 * @param stream stream received from Step 1
 * @param partitionNum number of partitions to repartition the incoming data into
 * @param hBaseZk1 HBase ZooKeeper quorum (format node-2,node-3,node-4,node-5,node-6)
 * @param zkParent1 HBase znode parent (default /hbase)
 * @param table1 name of the per-item user-set table
 * @param family1 column family of the per-item user-set table
 * @param column1 column name of the per-item user-set table
 * @param num1 number of entries to keep in the per-item user-set table
 * @param hBaseZk2 HBase ZooKeeper quorum of the user-pair co-occurrence count table (userPairCount)
 * @param zkParent2 HBase znode parent (default /hbase)
 * @param userPairCountTable userPairCount table name
 * @param userPairCountFamily userPairCount column family
 * @param userPairCountColumn userPairCount column name
 */
def runStep2(stream:DStream[(String,String)],partitionNum:Int,
hBaseZk1:String,zkParent1:String,table1:String,family1:String,column1:String,num1:Int,
hBaseZk2:String,zkParent2:String,userPairCountTable:String,userPairCountFamily:String,userPairCountColumn:String) = {
val data = stream.map(tuple => {
val row = tuple._2.split("_")
val userId = row(0)
val itemId = row(1)
(userId, itemId)
}).repartition(partitionNum)
val finalData=data.transform(rdd => rdd.groupBy(_._2).mapPartitions(part => {
ConnUtils.setHBaseZk(hBaseZk1)
ConnUtils.setZkParent1(zkParent1)
val conn = ConnUtils.HBaseClient.conn
part.map(x=>{
val users = x._2.map(y => y._1)
val itemHistory=updateItemBrowserHistory(x._1,users,conn,table1,family1,column1,num1)
(itemHistory, users)
})
}))
finalData.filter(_._1 !=null)
.flatMap(x => x._1.map(y => (y, x._2)))
.flatMap(x =>
x._2.map(y => (x._1, y))
.filter(x => x._1 != x._2 && StringUtils.isNotBlank(x._1) && StringUtils.isNotBlank(x._2))
.map(x => (x._1.toInt, x._2.toInt))
.map(x => if (x._1 > x._2) (x._1, x._2) else (x._2, x._1)).toArray.distinct
)
.map(x => ((x._1, x._2), 1)).reduceByKey(_ + _).repartition(partitionNum)
.foreachRDD(rdd => {
rdd.foreachPartition(part => {
ConnUtils.setHBaseZk2(hBaseZk2)
ConnUtils.setZkParent2(zkParent2)
val conn = ConnUtils.HBaseClient2.conn
val batch=new java.util.ArrayList[Increment]()
part.foreach(row => {
val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(row._1._1 +"_" + row._1._2)).substring(0,8)
val increment=HBaseUtils.getIncrAction(hBaseKey, userPairCountFamily,userPairCountColumn,row._2.toLong)
batch.add(increment)
})
HBaseUtils.incrDataBatchEx(userPairCountTable, batch, conn)
})
})
}
/**
 * Step 3 of the graph-based recommendation algorithm: pair the item a user has just viewed with the items they
 * viewed before and compute pairwise similarities, using the userPairCount recorded in Step 2. In the graph-based
 * formulation the similarity is accumulated over time, so we keep an accumulated similarity score per item pair,
 * called itemPairSim here: each time two items are compared, userPairCount is used to update itemPairSim, the pair
 * similarity is derived from it, and the result is written to Redis.
 *
 * @param stream stream received from Step 1
 * @param partitionNum number of partitions to repartition the incoming data into
 * @param redisNum number of similar items to keep in each similarity list (20-30 is fine)
 * @param redisPrefix key prefix for items in Redis
 * @param redisExpire item expiration time in Redis (business dependent; for news feeds about 7 days is enough)
 * @param hBaseZk1 HBase ZooKeeper quorum (format node-2,node-3,node-4,node-5,node-6)
 * @param zkParent1 HBase znode parent (default /hbase)
 * @param userHistoryTable user browse-history table name (HBase by default)
 * @param userHistoryFamily column family of the user browse-history table
 * @param userHistoryColumn column name of the user browse-history table
 * @param historyNum how many of the user's browse records to use for similarity computation (no more than the number kept in Step 1)
 * @param hBaseZk2 HBase ZooKeeper quorum of the user-pair co-occurrence count table (userPairCount)
 * @param zkParent2 HBase znode parent (default /hbase)
 * @param userPairCountTable userPairCount table name
 * @param userPairCountFamily userPairCount column family
 * @param userPairCountColumn userPairCount column name
 * @param hBaseZk3 HBase ZooKeeper quorum of the accumulated item-pair similarity table (itemPairSim)
 * @param zkParent3 HBase znode parent (default /hbase)
 * @param itemPairSimTable itemPairSim table name
 * @param itemPairSimFamily itemPairSim column family
 * @param itemPairSimColumn itemPairSim column name
 * @param redisBrokers Redis brokers
 * @param redisPwd Redis password, if any
 * @param isNormal whether to normalize the similarity (the graph-based sim is accumulated; normalize it if it must be ranked together with other signals such as CTR)
 * @param normalFlag if normalizing, the maximum itemPairSim is kept in Redis under this key and used for normalization
 */
def runStep3(stream:DStream[(String,String)], partitionNum:Int,
redisNum:Int, redisPrefix:String, redisExpire:Int,
hBaseZk1:String,zkParent1:String, userHistoryTable:String, userHistoryFamily:String, userHistoryColumn:String, historyNum:Int,
hBaseZk2:String,zkParent2:String, userPairCountTable:String, userPairCountFamily:String, userPairCountColumn:String,
hBaseZk3:String,zkParent3:String, itemPairSimTable:String, itemPairSimFamily:String, itemPairSimColumn:String,
redisBrokers:String, redisPwd:String, isNormal:Boolean, normalFlag:String) ={
val data=stream.map(tuple => {
val row=tuple._2.split("_")
val userId= row(0)
val itemId=row(1)
(userId,itemId)
}).repartition(partitionNum)
val finalData=data.transform(rdd=>rdd.groupBy(_._1).mapPartitions(part=>{
ConnUtils.setHBaseZk(hBaseZk1)
ConnUtils.setZkParent1(zkParent1)
val conn = ConnUtils.HBaseClient.conn
part.map(x=> (getUserBrowserHistory(x._1,conn,userHistoryTable,userHistoryFamily,userHistoryColumn,historyNum),x._2.map(y=>y._2),x._1))
}))
.filter(_._1!=null)
.flatMap(x=> x._1.map(y=>(y,x._2,x._3)))
.flatMap(x => x._2.map(y => (x._1, y,x._3)))
.filter(x => x._1 != x._2 && StringUtils.isNotBlank(x._1) && StringUtils.isNotBlank(x._2))
.map(x => (x._1.toInt, x._2.toInt,x._3))
.map(x => if (x._1 > x._2) (x._1, x._2, x._3) else (x._2, x._1, x._3))
.map(x=>((x._1,x._2),x._3))
.reduceByKey((x,y)=>x+","+y)
.map(x=>(x._1,x._2.split(",").distinct))
.filter(x=>x._2.length>1)
.repartition(partitionNum)
.mapPartitions(part=>{
ConnUtils.setHBaseZk2(hBaseZk2)
ConnUtils.setZkParent2(zkParent2)
val conn = ConnUtils.HBaseClient2.conn
part.map(x=>(x._1,getUserPairSim(x._2,conn,userPairCountTable,userPairCountFamily,userPairCountColumn)))
}).filter(_._2>0)
finalData.foreachRDD(rdd=>{
rdd.foreachPartition(part => {
ConnUtils.setHBaseZk3(hBaseZk3)
ConnUtils.setZkParent3(zkParent3)
val conn = ConnUtils.HBaseClient3.conn
val array =new ArrayBuffer[Put]()
var maxSim=0.0d
ConnUtils.setRedisBrokers(redisBrokers)
ConnUtils.setRedisPwd(redisPwd)
val redis = ConnUtils.CommonRedisClient.pool.getResource
if(isNormal) maxSim=Try(redis.get(normalFlag).toDouble).getOrElse(0.0d)
part.foreach(row => {
try {
var sim = row._2
val itemI = row._1._1.toString
val itemJ = row._1._2.toString
val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(itemI +"_" + itemJ)).substring(0,8)
val hBaseRow = HBaseUtils.getResult(itemPairSimTable, hBaseKey, conn)
val simBefore =Try(Bytes.toString(hBaseRow.getValue(Bytes.toBytes(itemPairSimFamily),
Bytes.toBytes(itemPairSimColumn))).toDouble).getOrElse(0.0d)
sim = simBefore + sim
array.append(HBaseUtils.getPutAction(hBaseKey, itemPairSimFamily, Array(itemPairSimColumn), Array(sim.toString)))
sim = Math.log(sim +1)
if(sim>maxSim){
maxSim=sim
redis.set(normalFlag,maxSim.toString)
println(maxSim)
}
if (isNormal && maxSim >0) sim = sim * (1.0 / maxSim)
// update the similarity lists in Redis
if(sim<1.0d) {
updateSim(itemI, itemJ, sim, redisNum, redis, redisExpire, redisPrefix)
updateSim(itemJ, itemI, sim, redisNum, redis, redisExpire, redisPrefix)
}
array.append(HBaseUtils.getPutAction(hBaseKey, itemPairSimFamily, Array(itemPairSimColumn), Array(sim.toString)))
}catch {
case e:Exception => e.printStackTrace()
}
})
ConnUtils.CommonRedisClient.pool.returnResourceObject(redis)
HBaseUtils.addDataBatchEx(itemPairSimTable, array.asJava, conn)
})
})
}
private def getUserBrowserHistory(userId:String,conn:HConnection,tableName:String,family:String,column:String,historyNum:Int) ={
val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(userId))
val hBaseRow =Try(HBaseUtils.getResult(tableName, hBaseKey, conn)).getOrElse(null)
if(hBaseRow !=null) {
val history = Bytes.toString(hBaseRow.getValue(Bytes.toBytes(family), Bytes.toBytes(column)))
if (history != null) history.split(",").map(x => x.split("_")(0)).takeRight(historyNum) else null
}
else null
}
private def updateUserBrowserHistory(row: (Int,Iterable[Rating]),pipeline: ShardedJedisPipeline,
redisNum:Int,hBaseNum:Int,conn: HConnection,
redisPrefix:String,redisTime:Int,tableName:String,family:String,column1:String)={
val redisKey = redisPrefix+row._1
val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(row._1.toString))
var result = ArrayBuffer[String]()
val hBaseRow = HBaseUtils.getResult(tableName, hBaseKey,conn)
val temp = Bytes.toString(hBaseRow.getValue(Bytes.toBytes(family), Bytes.toBytes(column1)))
if (StringUtils.isNotBlank(temp)) result ++=temp.split(",")
row._2.foreach(x=>{
pipeline.lpush(redisKey, x.product+"_"+x.rating )
result += x.product+"_"+x.rating
})
pipeline.ltrim(redisKey, 0, redisNum -1)
pipeline.expire(redisKey, redisTime)
result=result.distinct.takeRight(hBaseNum)
result
}
private def updateItemBrowserHistory(itemId:String,row:Iterable[String], conn: HConnection, table2:String, family2:String, column2:String,num:Int) = {
val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(itemId)).substring(0,8)
var result = ArrayBuffer[String]()
val hBaseRow = HBaseUtils.getResult(table2, hBaseKey,conn)
val temp = Bytes.toString(hBaseRow.getValue(Bytes.toBytes(family2), Bytes.toBytes(column2)))
if (StringUtils.isNotBlank(temp)) result ++=temp.split(",")
result ++=row
val back=result.distinct
HBaseUtils.addData(hBaseKey,table2,family2,Array(column2),Array(back.takeRight(num).mkString(",")),conn)
back
}
private def getUserPairSim(rows:Array[String],conn:HConnection,table:String,family:String,column:String):Double ={
val list=rows.map(x => (x, rows.filter(_ != x)))
.filter(_._2.nonEmpty)
.flatMap(x => x._2.map(y => (x._1.toInt, y.toInt)))
.map(x => if (x._1 > x._2) x._1 + "_" + x._2 else x._2 + "_" + x._1)
.map(x=>new Get(Bytes.toBytes(MD5Hash.getMD5AsHex(Bytes.toBytes(x)).substring(0,8)))).toList.asJava
val sim=HBaseUtils.getDataBatch(table,list,conn).map(x=>{
Try(Bytes.toLong(x.getValue(Bytes.toBytes(family), Bytes.toBytes(column)))).getOrElse(0l)
}).filter(_ >0).map(x=>1.0 /10.0 + x.toDouble).sum
sim
}
private def updateSim(itemI:String,itemJ:String,sim:Double,simNum:Int,jedis: ShardedJedis,expire:Int,prefix:String)={
val key = prefix + itemI
val simList = jedis.lrange(key, 0, -1)
val lastValue=Try(simList.get(simList.size()-1).split("_")(1).toDouble).getOrElse(0.0d)
if(sim > lastValue) {
simList.add(itemJ +"_" + sim)
val list = simList.asScala.map(x => (x.split("_")(0), x.split("_")(1))).filter(_._2 !="Infinity")
.map(x => (x._1, x._2.toDouble))
.groupBy(_._1)
.map(x => x._2.sortWith(_._2 > _._2).head).toArray.sortWith(_._2 < _._2).takeRight(simNum)
if (list.nonEmpty) {
val simArray = list.map(x => x._1 +"_" + x._2)
jedis.lpush(key, simArray: _*)
jedis.ltrim(key, 0, simArray.length -1)
jedis.expire(key, expire)
}
}
}
}
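For intuition, the batch form of the Swing similarity that the three streaming steps above approximate is sim(i, j) = sum over user pairs (u, v) who both interacted with i and j of 1 / (alpha + |I_u ∩ I_v|). A minimal offline sketch in Python (plain dictionaries, alpha = 1; this is the textbook formula, not the author's streaming variant above):

# Minimal offline Swing sketch (textbook formula); user_items maps user id -> set of item ids.
from itertools import combinations
from collections import defaultdict

def swing_similarity(user_items, alpha=1.0):
    # invert to item -> set of users
    item_users = defaultdict(set)
    for u, items in user_items.items():
        for i in items:
            item_users[i].add(u)
    sim = defaultdict(float)
    for i, j in combinations(item_users, 2):
        common_users = item_users[i] & item_users[j]
        for u, v in combinations(common_users, 2):
            # each user pair contributes less the more items they share overall
            overlap = len(user_items[u] & user_items[v])
            sim[(i, j)] += 1.0 / (alpha + overlap)
    return sim

# toy example
user_items = {'u1': {'a', 'b', 'c'}, 'u2': {'a', 'b'}, 'u3': {'b', 'c'}}
print(swing_similarity(user_items))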
8. CNN network: see the post "深度学习之卷积神经网络(CNN)详解与代码实现(一)" (CNN explained with code, part 1) by w_x_w1985 on 博客园 (cnblogs).
9. Transformer code: see "Transformer 代码详解" (Transformer code walkthrough) on the 得克特 CSDN blog.