推荐算法之关联规则实例

利用的知识

深度分箱
Apriori算法
数据连接、聚合等处理

数据说明

本数据来源于last.fm的数据
数据包含:


1892 users
17632 artists

12717 bi-directional user friend relations, i.e. 25434 (user_i, user_j) pairs
avg. 13.443 friend relations per user

92834 user-listened artist relations, i.e. tuples [user, artist, listeningCount]
avg. 49.067 artists most listened by each user
avg. 5.265 users who listened each artist

11946 tags

186479 tag assignments (tas), i.e. tuples [user, tag, artist]
avg. 98.562 tas per user
avg. 14.891 tas per artist
avg. 18.930 distinct tags used by each user
avg. 8.764 distinct tags used for each artist


数据集
  • artists.dat

This file contains information about music artists listened and tagged by the users.

  • tags.dat

    This file contains the set of tags available in the dataset.

  • user_artists.dat

    This file contains the artists listened by each user.

    It also provides a listening count for each [user, artist] pair.

  • user_taggedartists.dat - user_taggedartists-timestamps.dat

    These files contain the tag assignments of artists provided by each particular user.

    hey also contain the timestamps when the tag assignments were done.

  • user_friends.dat

    These files contain the friend relations between users in the database.

数据处理以及算法

将以上数据转化为csv再读取,否则由于有些数据较为混乱用read.table() 可能读不了数据

library(data.table)
library(sqldf)
library(dplyr)
library(arules)
library(Matrix)
library(xml2)
library(rvest)
library(arulesViz)
library(caret) 

getwd()
setwd('C:\\R\\working\\music\\data')
# read data
artists <- fread('artists.csv')
tags <- fread('tags.csv')
user_artists <- fread('user_artists.csv')
user_friends <- fread('user_friends.csv')
user_taggedartists <- fread('user_taggedartists.csv')
user_taggedartists_timestamps <- fread('user_taggedartists-timestamps.csv')

# 查看数据关于用户/艺术家/tag的类别数信息
# 结果表明:所有用户参加了打标签,而并不是所有的艺术家都被打了标签
# 29%的艺术家没被打过标签


length(table(artists$id))  # 17632
length(table(user_artists$artistID))   # 17632
length(table(user_taggedartists$artistID))     # 12523
length(table(user_taggedartists_timestamps$artistID))    # 12523


length(table(user_artists$userID))  # 1892
length(table(user_friends$userID))  # 1892
length(table(user_taggedartists$userID))   # 1892
length(table(user_taggedartists_timestamps$userID))  # 1892

length(table(tags$tagID))  # 11946
length(table(user_taggedartists$tagID))  # 9749
length(table(user_taggedartists_timestamps$tagID)) # 9749

# 可用的数据维度有限,关联规则推荐算法中可以利用
# 艺术家歌曲被播放次数

artists_sum_weight <- sqldf::sqldf('select artistID, sum(weight) as hit from user_artists group by artistID order by hit desc')

# 构建等深分箱函数(Equal frequency intervals)

EFI <- function(data,parts,Min){  
  parts <- parts         # 分几个箱  
  Min <- Min             # 最小值  
  value<-quantile(data,probs = seq(0,1,1/parts))  #这里以data等比分为4段,步长为1/4  
  number<-mapply(function(x){  
    for (i in 1:(parts-1))   
    {  
      if(x>=(value[i]-Min)&x1])  
      {  
        return(i)  
      }  
    }  
    if(x+Min>value[parts])  
    {  
      return(parts)  
    }  
    return(-1)  
  },data)  
  #打标签L1 L2  
  return(list(degree=paste("L",number,sep=""),degreevalue=number,value=table(value),number=table(number)))  #将连续变量转化成定序变量,此时为L1,L2,L3,L4...根据parts  
} 

artists_sum_weight_EFI <- EFI(artists_sum_weight$hit,10,min(artists_sum_weight$hit))

artists_sum_weight_EFI$value

# 根据划分区间将艺术家划分为下众(占10%)、中下(占40%)、中上(40%)、大众(10%)等4类

artists_sum_weight$hit <- ifelse(artists_sum_weight$hit<28,'小众',
       ifelse(artists_sum_weight$hit<350,'中下',
              ifelse(artists_sum_weight$hit<4646,'中上','大众')))


# user_taggedartists与artists_sum_weight左连接得艺术家对应的类型
user_taggedartists<- as.data.table(user_taggedartists)
artists_sum_weight <- as.data.table(artists_sum_weight)
temp_data0 <- artists_sum_weight[user_taggedartists,on='artistID']
head(temp_data0)
temp_data1 <- tags[temp_data0,on='tagID']
head(temp_data1)
table(temp_data1$tagValue)
temp_data1_0 <- sqldf::sqldf('select artistID,count(1) as artist_diversity from temp_data1 
                group by artistID order by  artist_diversity desc')


temp_data1_0_EFI <- EFI(temp_data1_0$artist_diversity,10,min(temp_data1_0$artist_diversity))
temp_data1_0_EFI$value
# 根据二六二分
temp_data1_0$artist_diversity <- ifelse(temp_data1_0$artist_diversity<=2,'单调',
                                        ifelse(temp_data1_0$artist_diversity<=17,'一般','多样'))
temp_data1_0 <- as.data.table(temp_data1_0)
temp_data2 <- temp_data1_0[temp_data1,on='artistID'] 

# 关联规则数据构建

names(temp_data2)[5] <- 'artist_type'

apriori_data <- temp_data2[,c('userID','artistID','artist_diversity','artist_type','tagValue')]
head(apriori_data)

# write.csv(apriori_data,'apriori_data.csv',fileEncoding = 'utf-8')

## Apriori算法
# 训练集与测试集划分(7:3)
set.seed(12301)
train_number <-createDataPartition(y= apriori_data$userID,p=0.70,list=FALSE) # 测试数据序号
train_apriori <- apriori_data[train_number, ] #训练数据集
test_apriori  <- apriori_data[-train_number, ] #测试集

train_apriori <- as.data.frame(train_apriori)

for(i in 1:5){
  train_apriori[,i] <- as.factor(train_apriori[,i]) 
}
train_apriori <- as(train_apriori, "transactions") 
inspect(head(train_apriori,100))


# 探究目标规则
# 根据那些用户喜爱的艺术家类型、歌曲风格等,为用户推荐相似的用户喜爱的艺术家
rules<-apriori(train_apriori,parameter=list(support=0.0003,confidence=0.0002,minlen=4),
               appearance=list(lhs=c("artist_diversity=多样","artist_type=大众","tagValue=rock"),default="rhs"))

rules

apriori_result <- inspect(sort(rules,by='support'))
# 选择出推荐的artistID
apriori_result$rhs


# 测试 
test_apriori <- as.data.frame(test_apriori)

for(i in 1:5){
  test_apriori[,i] <- as.factor(test_apriori[,i]) 
}
test_apriori <- as(test_apriori, "transactions") 
inspect(head(test_apriori,100))

rules<-apriori(train_apriori,parameter=list(support=0.000003,confidence=0.0002,minlen=4),
               appearance=list(lhs=c("artist_diversity=单调","artist_type=小众","tagValue=funk"),default="rhs"))

rules

apriori_result <- inspect(sort(rules,by='support'))

你可能感兴趣的:(R,机器学习)