深度分箱
Apriori算法
数据连接、聚合等处理
本数据来源于last.fm的数据
数据包含:
1892 users
17632 artists
12717 bi-directional user friend relations, i.e. 25434 (user_i, user_j) pairs
avg. 13.443 friend relations per user
92834 user-listened artist relations, i.e. tuples [user, artist, listeningCount]
avg. 49.067 artists most listened by each user
avg. 5.265 users who listened each artist
11946 tags
186479 tag assignments (tas), i.e. tuples [user, tag, artist]
avg. 98.562 tas per user
avg. 14.891 tas per artist
avg. 18.930 distinct tags used by each user
avg. 8.764 distinct tags used for each artist
This file contains information about music artists listened and tagged by the users.
tags.dat
This file contains the set of tags available in the dataset.
user_artists.dat
This file contains the artists listened by each user.
It also provides a listening count for each [user, artist] pair.
user_taggedartists.dat - user_taggedartists-timestamps.dat
These files contain the tag assignments of artists provided by each particular user.
hey also contain the timestamps when the tag assignments were done.
user_friends.dat
These files contain the friend relations between users in the database.
将以上数据转化为csv再读取,否则由于有些数据较为混乱用read.table() 可能读不了数据
library(data.table)
library(sqldf)
library(dplyr)
library(arules)
library(Matrix)
library(xml2)
library(rvest)
library(arulesViz)
library(caret)
getwd()
setwd('C:\\R\\working\\music\\data')
# read data
artists <- fread('artists.csv')
tags <- fread('tags.csv')
user_artists <- fread('user_artists.csv')
user_friends <- fread('user_friends.csv')
user_taggedartists <- fread('user_taggedartists.csv')
user_taggedartists_timestamps <- fread('user_taggedartists-timestamps.csv')
# 查看数据关于用户/艺术家/tag的类别数信息
# 结果表明:所有用户参加了打标签,而并不是所有的艺术家都被打了标签
# 29%的艺术家没被打过标签
length(table(artists$id)) # 17632
length(table(user_artists$artistID)) # 17632
length(table(user_taggedartists$artistID)) # 12523
length(table(user_taggedartists_timestamps$artistID)) # 12523
length(table(user_artists$userID)) # 1892
length(table(user_friends$userID)) # 1892
length(table(user_taggedartists$userID)) # 1892
length(table(user_taggedartists_timestamps$userID)) # 1892
length(table(tags$tagID)) # 11946
length(table(user_taggedartists$tagID)) # 9749
length(table(user_taggedartists_timestamps$tagID)) # 9749
# 可用的数据维度有限,关联规则推荐算法中可以利用
# 艺术家歌曲被播放次数
artists_sum_weight <- sqldf::sqldf('select artistID, sum(weight) as hit from user_artists group by artistID order by hit desc')
# 构建等深分箱函数(Equal frequency intervals)
EFI <- function(data,parts,Min){
parts <- parts # 分几个箱
Min <- Min # 最小值
value<-quantile(data,probs = seq(0,1,1/parts)) #这里以data等比分为4段,步长为1/4
number<-mapply(function(x){
for (i in 1:(parts-1))
{
if(x>=(value[i]-Min)&x1])
{
return(i)
}
}
if(x+Min>value[parts])
{
return(parts)
}
return(-1)
},data)
#打标签L1 L2
return(list(degree=paste("L",number,sep=""),degreevalue=number,value=table(value),number=table(number))) #将连续变量转化成定序变量,此时为L1,L2,L3,L4...根据parts
}
artists_sum_weight_EFI <- EFI(artists_sum_weight$hit,10,min(artists_sum_weight$hit))
artists_sum_weight_EFI$value
# 根据划分区间将艺术家划分为下众(占10%)、中下(占40%)、中上(40%)、大众(10%)等4类
artists_sum_weight$hit <- ifelse(artists_sum_weight$hit<28,'小众',
ifelse(artists_sum_weight$hit<350,'中下',
ifelse(artists_sum_weight$hit<4646,'中上','大众')))
# user_taggedartists与artists_sum_weight左连接得艺术家对应的类型
user_taggedartists<- as.data.table(user_taggedartists)
artists_sum_weight <- as.data.table(artists_sum_weight)
temp_data0 <- artists_sum_weight[user_taggedartists,on='artistID']
head(temp_data0)
temp_data1 <- tags[temp_data0,on='tagID']
head(temp_data1)
table(temp_data1$tagValue)
temp_data1_0 <- sqldf::sqldf('select artistID,count(1) as artist_diversity from temp_data1
group by artistID order by artist_diversity desc')
temp_data1_0_EFI <- EFI(temp_data1_0$artist_diversity,10,min(temp_data1_0$artist_diversity))
temp_data1_0_EFI$value
# 根据二六二分
temp_data1_0$artist_diversity <- ifelse(temp_data1_0$artist_diversity<=2,'单调',
ifelse(temp_data1_0$artist_diversity<=17,'一般','多样'))
temp_data1_0 <- as.data.table(temp_data1_0)
temp_data2 <- temp_data1_0[temp_data1,on='artistID']
# 关联规则数据构建
names(temp_data2)[5] <- 'artist_type'
apriori_data <- temp_data2[,c('userID','artistID','artist_diversity','artist_type','tagValue')]
head(apriori_data)
# write.csv(apriori_data,'apriori_data.csv',fileEncoding = 'utf-8')
## Apriori算法
# 训练集与测试集划分(7:3)
set.seed(12301)
train_number <-createDataPartition(y= apriori_data$userID,p=0.70,list=FALSE) # 测试数据序号
train_apriori <- apriori_data[train_number, ] #训练数据集
test_apriori <- apriori_data[-train_number, ] #测试集
train_apriori <- as.data.frame(train_apriori)
for(i in 1:5){
train_apriori[,i] <- as.factor(train_apriori[,i])
}
train_apriori <- as(train_apriori, "transactions")
inspect(head(train_apriori,100))
# 探究目标规则
# 根据那些用户喜爱的艺术家类型、歌曲风格等,为用户推荐相似的用户喜爱的艺术家
rules<-apriori(train_apriori,parameter=list(support=0.0003,confidence=0.0002,minlen=4),
appearance=list(lhs=c("artist_diversity=多样","artist_type=大众","tagValue=rock"),default="rhs"))
rules
apriori_result <- inspect(sort(rules,by='support'))
# 选择出推荐的artistID
apriori_result$rhs
# 测试
test_apriori <- as.data.frame(test_apriori)
for(i in 1:5){
test_apriori[,i] <- as.factor(test_apriori[,i])
}
test_apriori <- as(test_apriori, "transactions")
inspect(head(test_apriori,100))
rules<-apriori(train_apriori,parameter=list(support=0.000003,confidence=0.0002,minlen=4),
appearance=list(lhs=c("artist_diversity=单调","artist_type=小众","tagValue=funk"),default="rhs"))
rules
apriori_result <- inspect(sort(rules,by='support'))