Recommender Systems & Neural Networks: Common Code Implementations

Contents

1. Attention module

2. Three-layer fully connected network

3. Logistic Regression

4. FM

5. K-means from scratch

6. word2vec encoder/decoder layers

7. Swing

8. CNN network

9. Transformer code


1. Attention module


def attention(queries, keys, keys_length):
  '''
    queries:     [B, H]
    keys:        [B, T, H]
    keys_length: [B]
  '''
  queries_hidden_units = queries.get_shape().as_list()[-1]
  queries = tf.tile(queries, [1, tf.shape(keys)[1]])
  queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])
  din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)
  d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
  d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
  d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
  d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])
  outputs = d_layer_3_all 
  # Mask
  key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1])   # [B, T]
  key_masks = tf.expand_dims(key_masks, 1) # [B, 1, T]
  paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
  outputs = tf.where(key_masks, outputs, paddings)  # [B, 1, T]

  # Scale
  outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)

  # Activation
  outputs = tf.nn.softmax(outputs)  # [B, 1, T]

  # Weighted sum
  outputs = tf.matmul(outputs, keys)  # [B, 1, H]

  return outputs
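
A minimal usage sketch for the function above (TF 1.x graph mode); the placeholder names and shapes here are illustrative assumptions, not part of the original snippet:

import tensorflow as tf

# Hypothetical placeholders: B = batch, T = max behavior length, H = embedding size
queries = tf.placeholder(tf.float32, [None, 64])        # [B, H] candidate item embedding
keys = tf.placeholder(tf.float32, [None, 50, 64])       # [B, T, H] user behavior embeddings
keys_length = tf.placeholder(tf.int32, [None])          # [B] true length of each behavior sequence

att_out = attention(queries, keys, keys_length)         # [B, 1, H] weighted sum of behaviors
att_out = tf.squeeze(att_out, axis=1)                   # [B, H], ready to concat with other features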

2. Three-layer fully connected network

def dnn_process(self):
    # Three fully connected layers with batch norm + dropout. Note that
    # tf.layers.batch_normalization switches between batch and moving statistics
    # via `training=`; `trainable=` only controls whether its variables are trainable.
    layer_normal_1 = tf.layers.batch_normalization(inputs=self.input_embed, name="batch_normal_1", trainable=self.global_normal)
    layer_1 = tf.layers.dense(layer_normal_1, 1024, activation=tf.nn.relu, name="layer_1")
    layer_dropout_1 = tf.nn.dropout(layer_1, keep_prob=self._keep_prob, name="dropout_1")
    layer_normal_2 = tf.layers.batch_normalization(inputs=layer_dropout_1, name="batch_normal_2", trainable=self.global_normal)
    layer_2 = tf.layers.dense(layer_normal_2, 512, activation=tf.nn.relu, name="layer_2")
    layer_dropout_2 = tf.nn.dropout(layer_2, keep_prob=self._keep_prob, name="dropout_2")
    layer_normal_3 = tf.layers.batch_normalization(inputs=layer_dropout_2, name="batch_normal_3", trainable=self.global_normal)
    layer_3 = tf.layers.dense(layer_normal_3, self._item_esize, activation=tf.nn.relu, name="layer_3")
    self._user_embed = tf.expand_dims(layer_3, 1)  # [B, 1, item_embedding_size]
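
The last layer projects to the item embedding size and expands to [B, 1, E], which suggests a two-tower style match against item embeddings. A hedged sketch of how the output might be consumed inside the same class (the attribute/argument names below are assumptions, not taken from the original code):

def match_items(self, item_embed):
    # item_embed: [B, N, E] candidate item embeddings (hypothetical input)
    logits = tf.matmul(self._user_embed, item_embed, transpose_b=True)  # [B, 1, N]
    logits = tf.squeeze(logits, axis=1)                                 # [B, N]
    return tf.nn.softmax(logits)                                        # scores over the N candidates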

3. Logistic Regression

import numpy as np
from LoadDataSet import loadDataset
from logistic_regression import gradientDescent

def sigmoid(x):
    # sigmoid is not exported by the helper modules above, so define it here
    return 1.0 / (1.0 + np.exp(-x))

train_dataMat, train_labelMat, test_dataMat, test_labelMat = loadDataset('I:\wangpengfei-D\DataSet\\two_classier\\testSet.txt')
mtrain, ntrain = np.shape(train_dataMat)
mtest, ntest = np.shape(test_dataMat)
numIterations = 100000  # number of gradient descent iterations
alpha = 0.0005  # step size of each update
theta = np.ones(shape=(ntrain, 1))  # parameter theta
theta = gradientDescent(train_dataMat, train_labelMat.transpose(), theta, alpha, mtrain, numIterations)  # trained parameter theta
y_hat = np.dot(test_dataMat, theta)  # raw predictions on the test set

mark = []
for i in range(30):
    res = sigmoid(y_hat[i])
    if res > 0.5:
        mark.append(1)
    else:
        mark.append(0)
print('predict result:', mark)
print('real result:  ', test_labelMat)
right_sum = 0
for i in range(30):
    if mark[i] == test_labelMat[i]:
        right_sum += 1
print("right number: %d, right rate: %lf" % (right_sum, right_sum * 1.0 / 30))
 
 

4. FM

# -*- coding: utf-8 -*-

from __future__ import division
from math import exp
from numpy import *
from random import normalvariate  # normal distribution
from sklearn import preprocessing
import numpy as np

'''
    data : path to the data file
    feature_potential : number of latent factor dimensions
    alpha : learning rate
    iter : number of iterations
    _w, _w_0, _v : weights of the factorized sub-matrices
    with_col : whether the file has a header row of column names
    first_col : index of the first meaningful feature column
'''


class fm(object):
    def __init__(self):
        self.data = None
        self.feature_potential = None
        self.alpha = None
        self.iter = None
        self._w = None
        self._w_0 = None
        self._v = None  # fit/predict read self._v
        self.with_col = None
        self.first_col = None

    def min_max(self, data):
        self.data = data
        min_max_scaler = preprocessing.MinMaxScaler()
        return min_max_scaler.fit_transform(self.data)

    def loadDataSet(self, data, with_col=True, first_col=2):
        # For small data this could simply be a pd.read_table(); kept here as a manual parser.
        self.first_col = first_col
        dataMat = []
        labelMat = []
        fr = open(data)
        self.with_col = with_col
        if self.with_col:
            N = 0
            for line in fr.readlines():
                # skip the header row (N == 0 is the column-name line)
                if N > 0:
                    currLine = line.strip().split()
                    lineArr = []
                    featureNum = len(currLine)
                    for i in range(self.first_col, featureNum):
                        lineArr.append(float(currLine[i]))
                    dataMat.append(lineArr)
                    labelMat.append(float(currLine[1]) * 2 - 1)
                N = N + 1
        else:
            for line in fr.readlines():
                currLine = line.strip().split()
                lineArr = []
                featureNum = len(currLine)
                for i in range(2, featureNum):
                    lineArr.append(float(currLine[i]))
                dataMat.append(lineArr)
                labelMat.append(float(currLine[1]) * 2 - 1)
        return mat(self.min_max(dataMat)), labelMat

    def sigmoid(self, inx):
        # return 1.0/(1+exp(min(max(-inx,-10),10)))
        return 1.0 / (1 + exp(-inx))

    # learn the weight matrices for the features
    def fit(self, data, feature_potential=8, alpha=0.01, iter=100):
        # alpha is the learning rate
        self.alpha = alpha
        self.feature_potential = feature_potential
        self.iter = iter
        # dataMatrix is a numpy mat, classLabels is a list
        dataMatrix, classLabels = self.loadDataSet(data)
        print('dataMatrix:', dataMatrix.shape)
        print('classLabels:', classLabels)
        k = self.feature_potential
        m, n = shape(dataMatrix)
        # initialize parameters
        w = zeros((n, 1))  # n is the number of features
        w_0 = 0.
        # initialize each latent factor independently; a single normalvariate() scalar
        # times ones((n, k)) would leave every factor column identical
        v = np.random.normal(0, 0.2, (n, k))
        for it in range(self.iter):  # number of iterations
            # optimize over every sample
            for x in range(m):
                # note on the math: a dot product carries a sum, an element-wise product does not;
                # see http://blog.csdn.net/google19890102/article/details/45532745 for the derivation
                # xi . v : matrix product of the sample row and the factor matrix
                inter_1 = dataMatrix[x] * v
                # (xi * xi) . (v * v) : element-wise squares, then a matrix product
                inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # multiply is element-wise
                # interaction term: ((xi.v)^2 - (xi^2 . v^2)) / 2
                interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
                # compute the predicted output
                p = w_0 + dataMatrix[x] * w + interaction
                print('classLabels[x]:', classLabels[x])
                print('predicted output p:', p)
                # compute sigmoid(y * pred_y) - 1
                loss = self.sigmoid(classLabels[x] * p[0, 0]) - 1
                if loss >= -1:
                    loss_res = 'positive direction'
                else:
                    loss_res = 'negative direction'
                # update parameters
                w_0 = w_0 - self.alpha * loss * classLabels[x]
                for i in range(n):
                    if dataMatrix[x, i] != 0:
                        w[i, 0] = w[i, 0] - self.alpha * loss * classLabels[x] * dataMatrix[x, i]
                        for j in range(k):
                            v[i, j] = v[i, j] - self.alpha * loss * classLabels[x] * (
                                    dataMatrix[x, i] * inter_1[0, j] - v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])
            print('iteration %s, loss direction: %s' % (it, loss_res))
        self._w_0, self._w, self._v = w_0, w, v

    def predict(self, X):
        if self._w_0 is None or self._w is None or self._v is None:
            raise NotFittedError("Estimator not fitted, call `fit` first")
        # type check
        if isinstance(X, np.ndarray):
            pass
        else:
            try:
                X = np.array(X)
            except:
                raise TypeError("numpy.ndarray required for X")
        w_0 = self._w_0
        w = self._w
        v = self._v
        m, n = shape(X)
        result = []
        for x in range(m):
            inter_1 = mat(X[x]) * v
            inter_2 = mat(multiply(X[x], X[x])) * multiply(v, v)  # multiply is element-wise
            # interaction term
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
            p = w_0 + X[x] * w + interaction  # compute the predicted output
            pre = self.sigmoid(p[0, 0])
            result.append(pre)
        return result

    def getAccuracy(self, data):
        dataMatrix, classLabels = self.loadDataSet(data)
        w_0 = self._w_0
        w = self._w
        v = self._v
        m, n = shape(dataMatrix)
        allItem = 0
        error = 0
        result = []
        for x in range(m):
            allItem += 1
            inter_1 = dataMatrix[x] * v
            inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # multiply is element-wise
            # interaction term
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
            p = w_0 + dataMatrix[x] * w + interaction  # compute the predicted output
            pre = self.sigmoid(p[0, 0])
            result.append(pre)
            if pre < 0.5 and classLabels[x] == 1.0:
                error += 1
            elif pre >= 0.5 and classLabels[x] == -1.0:
                error += 1
            else:
                continue
        # print(result)
        value = 1 - float(error) / allItem
        return value


class NotFittedError(Exception):
    """
    Exception class to raise if estimator is used before fitting
    """
    pass


if __name__ == '__main__':
    fm()
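
The interaction term above relies on the standard FM identity: sum over pairs i<j of <v_i, v_j> * x_i * x_j equals 0.5 * sum over factors f of [ (sum_i v_{i,f} x_i)^2 - sum_i v_{i,f}^2 x_i^2 ]. A small numpy check of that identity (illustrative only, independent of the class above):

import numpy as np

n, k = 5, 3
rng = np.random.default_rng(0)
x = rng.normal(size=n)
v = rng.normal(size=(n, k))

# naive pairwise form: sum_{i<j} <v_i, v_j> x_i x_j
naive = sum(np.dot(v[i], v[j]) * x[i] * x[j] for i in range(n) for j in range(i + 1, n))

# the O(n*k) form used in fit()/predict()
fast = 0.5 * np.sum(np.dot(x, v) ** 2 - np.dot(x ** 2, v ** 2))

print(naive, fast)  # the two values agree up to floating-point error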

5. K-means from scratch

# -*- coding: utf-8 -*-
"""
    Reference: https://gist.github.com/iandanforth/5862470
"""

import random
from kmeans_tools import Cluster, get_distance, gen_random_sample
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors


def kmeans(samples, k, cutoff):
    """
        K-means clustering.
    """

    # randomly pick k samples as the initial cluster centroids
    init_samples = random.sample(samples, k)

    # create k clusters, each centered on one of the random initial samples
    clusters = [Cluster([sample]) for sample in init_samples]

    # iterate until the cluster assignment is stable
    n_loop = 0
    while True:
        # one empty list per cluster to collect the samples assigned to it
        lists = [[] for _ in clusters]

        # start this iteration
        n_loop += 1
        # go through every sample in the data set
        for sample in samples:
            # distance between the sample and the first cluster centroid
            smallest_distance = get_distance(sample, clusters[0].centroid)
            # initially assign it to cluster 0
            cluster_index = 0

            # compare against the remaining centroids
            for i in range(k - 1):
                # distance between the sample and this centroid
                distance = get_distance(sample, clusters[i + 1].centroid)
                # keep the smaller distance
                if distance < smallest_distance:
                    smallest_distance = distance
                    cluster_index = i + 1

            # assign the sample to the nearest cluster
            lists[cluster_index].append(sample)

        # largest centroid shift in this iteration
        biggest_shift = 0.0

        # update each centroid and record how far it moved
        for i in range(k):
            shift = clusters[i].update(lists[i])
            # keep the largest shift
            biggest_shift = max(biggest_shift, shift)

        # if the centroids moved less than the convergence threshold, clustering is stable
        if biggest_shift < cutoff:
            print("Clustering converged after {} iterations.".format(n_loop))
            break
    # return the clusters
    return clusters


def run_main():
    """
        Main entry point.
    """
    # number of samples
    n_samples = 1000

    # number of features (feature dimension)
    n_feat = 2

    # value range of the features
    lower = 0
    upper = 200

    # number of clusters
    n_cluster = 3

    # generate random samples
    samples = [gen_random_sample(n_feat, lower, upper) for _ in range(n_samples)]

    # convergence threshold
    cutoff = 0.2

    clusters = kmeans(samples, n_cluster, cutoff)

    # print the result
    for i, c in enumerate(clusters):
        for sample in c.samples:
            print('cluster--{}, sample--{}'.format(i, sample))

    # visualize the result
    plt.subplot()
    color_names = list(mcolors.cnames)
    for i, c in enumerate(clusters):
        x = []
        y = []
        color = [color_names[i]] * len(c.samples)
        for sample in c.samples:
            x.append(sample.coords[0])
            y.append(sample.coords[1])
        plt.scatter(x, y, c=color)
    plt.show()

if __name__ == '__main__':
    run_main()
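
The kmeans_tools module (Cluster, get_distance, gen_random_sample) is imported but not shown. Below is a minimal sketch of what those helpers might look like, so the section can run standalone; this is an assumption about the original module, not its actual contents:

# kmeans_tools.py (hypothetical minimal version)
import math
import random


class Sample(object):
    def __init__(self, coords):
        self.coords = coords          # list of feature values
        self.n_dim = len(coords)

    def __repr__(self):
        return str(self.coords)


class Cluster(object):
    def __init__(self, samples):
        self.samples = samples
        self.n_dim = samples[0].n_dim
        self.centroid = self.compute_centroid()

    def compute_centroid(self):
        coords = [sum(s.coords[i] for s in self.samples) / len(self.samples)
                  for i in range(self.n_dim)]
        return Sample(coords)

    def update(self, samples):
        """Replace the cluster's samples and return how far the centroid moved."""
        old_centroid = self.centroid
        self.samples = samples if samples else self.samples
        self.centroid = self.compute_centroid()
        return get_distance(old_centroid, self.centroid)


def get_distance(a, b):
    # Euclidean distance between two samples
    return math.sqrt(sum((a.coords[i] - b.coords[i]) ** 2 for i in range(a.n_dim)))


def gen_random_sample(n_feat, lower, upper):
    # a random sample with n_feat features drawn uniformly from [lower, upper]
    return Sample([random.uniform(lower, upper) for _ in range(n_feat)])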

6. word2vec encoder/decoder layers

#!usr/bin/env python
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import math
import random
import jieba
import numpy as np
from six.moves import xrange
import tensorflow as tf

# step 1: read the file contents into a list of words
def read_data():
    """
    Preprocess the training text and put every token of the corpus into one list.
    """
    # read the text, clean it, segment it with jieba, and collect the words
    raw_word_list = []
    with open('test.txt', "r", encoding='UTF-8') as f:
        line = f.readline()
        while line:
            while '\n' in line:
                line = line.replace('\n', '')
            while ' ' in line:
                line = line.replace(' ', '')
            if len(line) > 0:  # skip empty lines
                raw_words = list(jieba.cut(line, cut_all=False))
                raw_word_list.extend(raw_words)
            line = f.readline()
    return raw_word_list

words = read_data()
print('Data size', len(words))

# Step 2: build the vocabulary mapping and count out-of-vocabulary words
vocabulary_size = 50000

def build_dataset(words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)

data_index = 0  # global cursor into `data`; generate_batch advances it across calls


def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # sliding-window buffer over the data

    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(batch_size // num_skips):
        # the target word sits at the center of the buffer
        target = skip_window  # index of the center word in the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)  # pick a context position at random
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]  # the deque acts as a sliding window
            labels[i * num_skips + j, 0] = buffer[target]  # each (center, context) pair uses a distinct context
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels

batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2
valid_size = 8      # must match len(valid_word) if a fixed validation word list is used
valid_window = 100
num_sampled = 64    # number of negative examples to sample (negative sampling)

# build the computation graph
graph = tf.Graph()
with graph.as_default():
    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # validation words: sample `valid_size` ids from the `valid_window` most frequent words
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]), dtype=tf.float32)

    # NCE loss, optimizer, validation similarity and init op, as required by the
    # training loop below (standard skip-gram/NCE setup this snippet follows)
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                       labels=train_labels, inputs=embed,
                       num_sampled=num_sampled, num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # cosine similarity between the validation words and all embeddings
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    init = tf.global_variables_initializer()

# Step 5: Begin training.
num_steps = 3000000
# run the session
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print("Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print("Average loss at step ", step, ": ", average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[:top_k]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()



7. Swing

package com.vs.mllib.rec.online.graph

import java.util.Properties

import com.alibaba.fastjson.JSON
import com.constants.Constants
import com.utils.HBaseUtils
import com.utils.ConnUtils
import com.utils.HttpUtils
import kafka.javaapi.producer.Producer
import kafka.producer.{KeyedMessage, ProducerConfig}
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.{Bytes, MD5Hash}
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.streaming.dstream.DStream
import redis.clients.jedis.{ShardedJedis, ShardedJedisPipeline}
import org.apache.spark.mllib.recommendation.Rating
import scala.util.Try

object Swing {

  /**
   * Step 1 of the graph-based recommendation algorithm. The main input is the users' rating data for
   * items (already converted from raw behavior). This job should finish quickly (ideally within about
   * 10 seconds, so that fresh browsing behavior feeds the recommendations promptly). It collects each
   * user's browse/click history (stored in HBase and Redis) and each item's set of viewing users
   * (stored in HBase), and forwards the rating data to Kafka for steps 2 and 3 to update the model.
   *
   * @param ratings       stream of (userID, productID, rating) tuples
   * @param redisNum      number of recent browse records to keep per user in Redis (10-15 recommended)
   * @param prefix        Redis key prefix for a user's recent browse records
   * @param expire        expiry time of a user's recent browse records in Redis (about one hour recommended)
   * @param num1          number of records to keep in the user browse-history table (30 recommended)
   * @param table1        name of the user browse-history table (HBase by default)
   * @param family1       column family of the user browse-history table
   * @param column1       column name of the user browse-history table
   * @param kafkaBrokers  Kafka brokers (format x.x.x.x:xxxx,x.x.x.x:xxxx)
   * @param topic         Kafka topic consumed by Step2 and Step3
   * @param hBaseZk       HBase ZooKeeper quorum (format node-2,node-3,node-4,node-5,node-6)
   * @param zkParent1     HBase znode parent (default /hbase)
   * @param redisBrokers  Redis brokers (format x.x.x.x:xxxx,x.x.x.x:xxxx)
   * @param redisPwd      Redis password, if any
   */
  def runStep1(ratings: DStream[Rating],
               redisNum: Int, prefix: String, expire: Int, num1: Int, table1: String, family1: String, column1: String,
               kafkaBrokers: String, topic: String,
               hBaseZk: String, zkParent1: String, redisBrokers: String, redisPwd: String) {
    val props = new Properties()
    props.put("metadata.broker.list", kafkaBrokers)
    props.put("serializer.class", "kafka.serializer.StringEncoder")
    ratings.foreachRDD(rdd => {
      rdd.groupBy(_.user).foreachPartition(part => {
        ConnUtils.setHBaseZk(hBaseZk)
        ConnUtils.setZkParent1(zkParent1)
        val conn = ConnUtils.HBaseClient.conn
        val updateHistory = new ArrayBuffer[Put]()
        ConnUtils.setRedisBrokers(redisBrokers)
        ConnUtils.setRedisPwd(redisPwd)
        val redis = ConnUtils.CommonRedisClient.pool.getResource
        val pipeline = redis.pipelined()
        part.foreach(row => {
          val userHistories = updateUserBrowserHistory(row, pipeline, redisNum, num1, conn,
            prefix, expire, table1, family1, column1)
          updateHistory.append(HBaseUtils.getPutAction(MD5Hash.getMD5AsHex(Bytes.toBytes(row._1.toString)), family1,
            Array(column1), Array(userHistories.mkString(","))))
        })
        HBaseUtils.addDataBatchEx(table1, updateHistory.asJava, conn)
        pipeline.sync()
        ConnUtils.CommonRedisClient.pool.returnResourceObject(redis)
      })
    })
    ratings.foreachRDD(rdd => {
      rdd.foreachPartition(part => {
        val sends = new java.util.ArrayList[KeyedMessage[String, String]]()
        val kafkaConfig = new ProducerConfig(props)
        val producer = new Producer[String, String](kafkaConfig)
        part.foreach(row => {
          sends.add(new KeyedMessage[String, String](topic, row.user + "_" + row.product + "_" + row.rating))
        })
        producer.send(sends)
        producer.close
      })
    })
  }

  /**
   * Step 2 of the graph-based recommendation algorithm. It updates the set of users who have viewed
   * each item and pairs those users up; each pair means two users viewed or bought the same item.
   * The co-occurrence count of every user pair is accumulated in HBase.
   *
   * @param stream               stream received from step1
   * @param partitionNum         number of partitions to repartition the incoming data into
   * @param hBaseZk1             HBase ZooKeeper quorum (format node-2,node-3,node-4,node-5,node-6)
   * @param zkParent1            HBase znode parent (default /hbase)
   * @param table1               table holding the set of users per item
   * @param family1              column family of that table
   * @param column1              column name of that table
   * @param num1                 number of entries to keep in the per-item user set
   * @param hBaseZk2             ZooKeeper quorum of the user-pair co-occurrence table (userPairCount)
   * @param zkParent2            HBase znode parent (default /hbase)
   * @param userPairCountTable   userPairCount table name
   * @param userPairCountFamily  userPairCount column family
   * @param userPairCountColumn  userPairCount column name
   */
  def runStep2(stream: DStream[(String, String)], partitionNum: Int,
               hBaseZk1: String, zkParent1: String, table1: String, family1: String, column1: String, num1: Int,
               hBaseZk2: String, zkParent2: String, userPairCountTable: String, userPairCountFamily: String, userPairCountColumn: String) = {
    val data = stream.map(tuple => {
      val row = tuple._2.split("_")
      val userId = row(0)
      val itemId = row(1)
      (userId, itemId)
    }).repartition(partitionNum)
    val finalData = data.transform(rdd => rdd.groupBy(_._2).mapPartitions(part => {
      ConnUtils.setHBaseZk(hBaseZk1)
      ConnUtils.setZkParent1(zkParent1)
      val conn = ConnUtils.HBaseClient.conn
      part.map(x => {
        val users = x._2.map(y => y._1)
        val itemHistory = updateItemBrowserHistory(x._1, users, conn, table1, family1, column1, num1)
        (itemHistory, users)
      })
    }))
    finalData.filter(_._1 != null)
      .flatMap(x => x._1.map(y => (y, x._2)))
      .flatMap(x =>
        x._2.map(y => (x._1, y))
          .filter(x => x._1 != x._2 && StringUtils.isNotBlank(x._1) && StringUtils.isNotBlank(x._2))
          .map(x => (x._1.toInt, x._2.toInt))
          .map(x => if (x._1 > x._2) (x._1, x._2) else (x._2, x._1)).toArray.distinct
      )
      .map(x => ((x._1, x._2), 1)).reduceByKey(_ + _).repartition(partitionNum)
      .foreachRDD(rdd => {
        rdd.foreachPartition(part => {
          ConnUtils.setHBaseZk2(hBaseZk2)
          ConnUtils.setZkParent2(zkParent2)
          val conn = ConnUtils.HBaseClient2.conn
          val batch = new java.util.ArrayList[Increment]()
          part.foreach(row => {
            val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(row._1._1 + "_" + row._1._2)).substring(0, 8)
            val increment = HBaseUtils.getIncrAction(hBaseKey, userPairCountFamily, userPairCountColumn, row._2.toLong)
            batch.add(increment)
          })
          HBaseUtils.incrDataBatchEx(userPairCountTable, batch, conn)
        })
      })
  }

  /**
   * Step 3 of the graph-based recommendation algorithm. It pairs the item a user has just viewed with
   * the items that user viewed before and computes their similarity, using the userPairCount recorded
   * in step 2. In this graph formulation similarity accumulates over time, so an accumulated score per
   * item pair (itemPairSim) is kept in HBase; each time a pair is seen, userPairCount is used to compute
   * the increment, itemPairSim is updated, and the resulting similarity is written to Redis.
   *
   * @param stream              stream received from step1
   * @param partitionNum        number of partitions to repartition the incoming data into
   * @param redisNum            number of similar items to keep per item in the similarity list (20-30 is fine)
   * @param redisPrefix         Redis key prefix for an item
   * @param redisExpire         expiry time of an item's similarity list (business dependent; about 7 days for news feeds)
   * @param hBaseZk1            HBase ZooKeeper quorum (format node-2,node-3,node-4,node-5,node-6)
   * @param zkParent1           HBase znode parent (default /hbase)
   * @param userHistoryTable    user browse-history table name (HBase by default)
   * @param userHistoryFamily   user browse-history column family
   * @param userHistoryColumn   user browse-history column name
   * @param historyNum          how many of the user's browse records to use for similarity (no more than what step1 keeps)
   * @param hBaseZk2            ZooKeeper quorum of the user-pair co-occurrence table (userPairCount)
   * @param zkParent2           HBase znode parent (default /hbase)
   * @param userPairCountTable  userPairCount table name
   * @param userPairCountFamily userPairCount column family
   * @param userPairCountColumn userPairCount column name
   * @param hBaseZk3            ZooKeeper quorum of the accumulated item-pair similarity table (itemPairSim)
   * @param zkParent3           HBase znode parent (default /hbase)
   * @param itemPairSimTable    itemPairSim table name
   * @param itemPairSimFamily   itemPairSim column family
   * @param itemPairSimColumn   itemPairSim column name
   * @param redisBrokers        Redis brokers
   * @param redisPwd            Redis password, if any
   * @param isNormal            whether to normalize the similarity (the graph sim is accumulated; normalize it if it must be ranked together with other signals such as CTR)
   * @param normalFlag          Redis key that stores the maximum itemPairSim used for normalization
   */
  def runStep3(stream: DStream[(String, String)], partitionNum: Int,
               redisNum: Int, redisPrefix: String, redisExpire: Int,
               hBaseZk1: String, zkParent1: String, userHistoryTable: String, userHistoryFamily: String, userHistoryColumn: String, historyNum: Int,
               hBaseZk2: String, zkParent2: String, userPairCountTable: String, userPairCountFamily: String, userPairCountColumn: String,
               hBaseZk3: String, zkParent3: String, itemPairSimTable: String, itemPairSimFamily: String, itemPairSimColumn: String,
               redisBrokers: String, redisPwd: String, isNormal: Boolean, normalFlag: String) = {
    val data = stream.map(tuple => {
      val row = tuple._2.split("_")
      val userId = row(0)
      val itemId = row(1)
      (userId, itemId)
    }).repartition(partitionNum)
    val finalData = data.transform(rdd => rdd.groupBy(_._1).mapPartitions(part => {
      ConnUtils.setHBaseZk(hBaseZk1)
      ConnUtils.setZkParent1(zkParent1)
      val conn = ConnUtils.HBaseClient.conn
      part.map(x => (getUserBrowserHistory(x._1, conn, userHistoryTable, userHistoryFamily, userHistoryColumn, historyNum), x._2.map(y => y._2), x._1))
    }))
      .filter(_._1 != null)
      .flatMap(x => x._1.map(y => (y, x._2, x._3)))
      .flatMap(x => x._2.map(y => (x._1, y, x._3)))
      .filter(x => x._1 != x._2 && StringUtils.isNotBlank(x._1) && StringUtils.isNotBlank(x._2))
      .map(x => (x._1.toInt, x._2.toInt, x._3))
      .map(x => if (x._1 > x._2) (x._1, x._2, x._3) else (x._2, x._1, x._3))
      .map(x => ((x._1, x._2), x._3))
      .reduceByKey((x, y) => x + "," + y)
      .map(x => (x._1, x._2.split(",").distinct))
      .filter(x => x._2.length > 1)
      .repartition(partitionNum)
      .mapPartitions(part => {
        ConnUtils.setHBaseZk2(hBaseZk2)
        ConnUtils.setZkParent2(zkParent2)
        val conn = ConnUtils.HBaseClient2.conn
        part.map(x => (x._1, getUserPairSim(x._2, conn, userPairCountTable, userPairCountFamily, userPairCountColumn)))
      }).filter(_._2 > 0)
    finalData.foreachRDD(rdd => {
      rdd.foreachPartition(part => {
        ConnUtils.setHBaseZk3(hBaseZk3)
        ConnUtils.setZkParent3(zkParent3)
        val conn = ConnUtils.HBaseClient3.conn
        val array = new ArrayBuffer[Put]()
        var maxSim = 0.0d
        ConnUtils.setRedisBrokers(redisBrokers)
        ConnUtils.setRedisPwd(redisPwd)
        val redis = ConnUtils.CommonRedisClient.pool.getResource
        if (isNormal) maxSim = Try(redis.get(normalFlag).toDouble).getOrElse(0.0d)
        part.foreach(row => {
          try {
            var sim = row._2
            val itemI = row._1._1.toString
            val itemJ = row._1._2.toString
            val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(itemI + "_" + itemJ)).substring(0, 8)
            val hBaseRow = HBaseUtils.getResult(itemPairSimTable, hBaseKey, conn)
            val simBefore = Try(Bytes.toString(hBaseRow.getValue(Bytes.toBytes(itemPairSimFamily),
              Bytes.toBytes(itemPairSimColumn))).toDouble).getOrElse(0.0d)
            sim = simBefore + sim
            array.append(HBaseUtils.getPutAction(hBaseKey, itemPairSimFamily, Array(itemPairSimColumn), Array(sim.toString)))
            sim = Math.log(sim + 1)
            if (sim > maxSim) {
              maxSim = sim
              redis.set(normalFlag, maxSim.toString)
              println(maxSim)
            }
            if (isNormal && maxSim > 0) sim = sim * (1.0 / maxSim)
            // update sim in Redis for both directions
            if (sim < 1.0d) {
              updateSim(itemI, itemJ, sim, redisNum, redis, redisExpire, redisPrefix)
              updateSim(itemJ, itemI, sim, redisNum, redis, redisExpire, redisPrefix)
            }
            array.append(HBaseUtils.getPutAction(hBaseKey, itemPairSimFamily, Array(itemPairSimColumn), Array(sim.toString)))
          } catch {
            case e: Exception => e.printStackTrace()
          }
        })
        ConnUtils.CommonRedisClient.pool.returnResourceObject(redis)
        HBaseUtils.addDataBatchEx(itemPairSimTable, array.asJava, conn)
      })
    })
  }

  private def getUserBrowserHistory(userId: String, conn: HConnection, tableName: String, family: String, column: String, historyNum: Int) = {
    val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(userId))
    val hBaseRow = Try(HBaseUtils.getResult(tableName, hBaseKey, conn)).getOrElse(null)
    if (hBaseRow != null) {
      val history = Bytes.toString(hBaseRow.getValue(Bytes.toBytes(family), Bytes.toBytes(column)))
      if (history != null) history.split(",").map(x => x.split("_")(0)).takeRight(historyNum) else null
    }
    else null
  }

  private def updateUserBrowserHistory(row: (Int, Iterable[Rating]), pipeline: ShardedJedisPipeline,
                                       redisNum: Int, hBaseNum: Int, conn: HConnection,
                                       redisPrefix: String, redisTime: Int, tableName: String, family: String, column1: String) = {
    val redisKey = redisPrefix + row._1
    val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(row._1.toString))
    var result = ArrayBuffer[String]()
    val hBaseRow = HBaseUtils.getResult(tableName, hBaseKey, conn)
    val temp = Bytes.toString(hBaseRow.getValue(Bytes.toBytes(family), Bytes.toBytes(column1)))
    if (StringUtils.isNotBlank(temp)) result ++= temp.split(",")
    row._2.foreach(x => {
      pipeline.lpush(redisKey, x.product + "_" + x.rating)
      result += x.product + "_" + x.rating
    })
    pipeline.ltrim(redisKey, 0, redisNum - 1)
    pipeline.expire(redisKey, redisTime)
    result = result.distinct.takeRight(hBaseNum)
    result
  }

  private def updateItemBrowserHistory(itemId: String, row: Iterable[String], conn: HConnection, table2: String, family2: String, column2: String, num: Int) = {
    val hBaseKey = MD5Hash.getMD5AsHex(Bytes.toBytes(itemId)).substring(0, 8)
    var result = ArrayBuffer[String]()
    val hBaseRow = HBaseUtils.getResult(table2, hBaseKey, conn)
    val temp = Bytes.toString(hBaseRow.getValue(Bytes.toBytes(family2), Bytes.toBytes(column2)))
    if (StringUtils.isNotBlank(temp)) result ++= temp.split(",")
    result ++= row
    val back = result.distinct
    HBaseUtils.addData(hBaseKey, table2, family2, Array(column2), Array(back.takeRight(num).mkString(",")), conn)
    back
  }

  private def getUserPairSim(rows: Array[String], conn: HConnection, table: String, family: String, column: String): Double = {
    val list = rows.map(x => (x, rows.filter(_ != x)))
      .filter(_._2.nonEmpty)
      .flatMap(x => x._2.map(y => (x._1.toInt, y.toInt)))
      .map(x => if (x._1 > x._2) x._1 + "_" + x._2 else x._2 + "_" + x._1)
      .map(x => new Get(Bytes.toBytes(MD5Hash.getMD5AsHex(Bytes.toBytes(x)).substring(0, 8)))).toList.asJava
    val sim = HBaseUtils.getDataBatch(table, list, conn).map(x => {
      Try(Bytes.toLong(x.getValue(Bytes.toBytes(family), Bytes.toBytes(column)))).getOrElse(0L)
    }).filter(_ > 0).map(x => 1.0 / 10.0 + x.toDouble).sum
    sim
  }

  private def updateSim(itemI: String, itemJ: String, sim: Double, simNum: Int, jedis: ShardedJedis, expire: Int, prefix: String) = {
    val key = prefix + itemI
    val simList = jedis.lrange(key, 0, -1)
    val lastValue = Try(simList.get(simList.size() - 1).split("_")(1).toDouble).getOrElse(0.0d)
    if (sim > lastValue) {
      simList.add(itemJ + "_" + sim)
      val list = simList.asScala.map(x => (x.split("_")(0), x.split("_")(1))).filter(_._2 != "Infinity")
        .map(x => (x._1, x._2.toDouble))
        .groupBy(_._1)
        .map(x => x._2.sortWith(_._2 > _._2).head).toArray.sortWith(_._2 < _._2).takeRight(simNum)
      if (list.nonEmpty) {
        val simArray = list.map(x => x._1 + "_" + x._2)
        jedis.lpush(key, simArray: _*)
        jedis.ltrim(key, 0, simArray.length - 1)
        jedis.expire(key, expire)
      }
    }
  }
}
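
To make the three streaming steps above easier to follow, here is a small offline Python sketch of the same idea: collect each item's viewer set, count user-pair co-occurrences, and accumulate an item-pair score from those counts. It is a simplified illustration, not a translation of the Scala job:

from collections import defaultdict
from itertools import combinations

# toy input: (user_id, item_id) view events
events = [("u1", "a"), ("u1", "b"), ("u2", "a"), ("u2", "b"), ("u3", "a"), ("u3", "c")]

user_items = defaultdict(set)   # items viewed by each user (step 1's user history)
item_users = defaultdict(set)   # users who viewed each item (step 2's per-item user set)
for u, i in events:
    user_items[u].add(i)
    item_users[i].add(u)

# step 2: count how many items each user pair has co-viewed (userPairCount)
user_pair_count = defaultdict(int)
for users in item_users.values():
    for u, v in combinations(sorted(users), 2):
        user_pair_count[(u, v)] += 1

# step 3: accumulate an item-pair score from the co-viewing user pairs (itemPairSim)
item_pair_sim = defaultdict(float)
for i, j in combinations(sorted(item_users), 2):
    common = item_users[i] & item_users[j]
    for u, v in combinations(sorted(common), 2):
        # same flavor as getUserPairSim: a small constant plus the pair's co-view count
        item_pair_sim[(i, j)] += 0.1 + user_pair_count[(u, v)]

print(dict(item_pair_sim))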

8. CNN network

Reference: "Convolutional Neural Networks (CNN) Explained with Code, Part 1" (深度学习之卷积神经网络(CNN)详解与代码实现(一)) by w_x_w1985 on cnblogs.
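
The linked article covers CNNs in detail; as a quick in-document reference, here is a minimal TF 1.x-style convolutional classifier sketch (not taken from the linked post; shapes and layer sizes are illustrative):

import tensorflow as tf

def simple_cnn(images, n_classes=10):
    # images: [B, 28, 28, 1], e.g. MNIST-sized inputs
    conv1 = tf.layers.conv2d(images, filters=32, kernel_size=5, padding='same', activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=2)   # [B, 14, 14, 32]
    conv2 = tf.layers.conv2d(pool1, filters=64, kernel_size=5, padding='same', activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(conv2, pool_size=2, strides=2)   # [B, 7, 7, 64]
    flat = tf.layers.flatten(pool2)                                  # [B, 7*7*64]
    dense = tf.layers.dense(flat, 1024, activation=tf.nn.relu)
    logits = tf.layers.dense(dense, n_classes)                       # unnormalized class scores
    return logits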

9. Transformer code

Reference: "Transformer 代码详解" (Transformer code explained) by 得克特 on CSDN.
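
The core building block of the Transformer is scaled dot-product attention, softmax(Q K^T / sqrt(d_k)) V. A minimal numpy sketch of that block (illustrative only, not taken from the linked article):

import numpy as np

def scaled_dot_product_attention(Q, K, V, mask=None):
    """Q: [T_q, d_k], K: [T_k, d_k], V: [T_k, d_v] -> output [T_q, d_v]."""
    d_k = Q.shape[-1]
    scores = np.dot(Q, K.T) / np.sqrt(d_k)                    # [T_q, T_k] similarity scores
    if mask is not None:
        scores = np.where(mask, scores, -1e9)                 # mask out invalid positions
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)   # row-wise softmax
    return np.dot(weights, V)                                 # weighted sum of the values

# usage: self-attention over a toy sequence of 4 tokens with d_model = 8
x = np.random.randn(4, 8)
out = scaled_dot_product_attention(x, x, x)
print(out.shape)  # (4, 8)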
