【数据挖掘】基于R语言的Apriori算法应用案例

基于R语言的Apriori算法应用案例

# 加载相关包
library(arules)

# Load the Groceries transactions and print a summary of them.
data("Groceries")
summary(Groceries)

# ---- Inspect the data set ----
class(Groceries)
Groceries
dim(Groceries)
# First five column names and first five row names.
colnames(Groceries)[seq_len(5)]
rownames(Groceries)[seq_len(5)]

# basketSize is the number of items in each transaction (row level).
# itemFreq is the support of each item (column level).
basketSize <- size(Groceries)
summary(basketSize)
sum(basketSize) # count of all 1s in the sparse matrix

itemFreq <- itemFrequency(Groceries)
itemFreq[1:5]
sum(itemFreq)

# TODO: plot the distribution of basketSize (density curve).
# itemCount is the number of times each item occurs.
# Support(X) = Xs / N, where N is the number of transactions and Xs is the
# count of item X, so
# itemXCount = N * itemXFreq = (itemXFreq / sum(itemFreq)) * sum(basketSize)
itemCount <- (itemFreq / sum(itemFreq)) * sum(basketSize)
summary(itemCount)
orderedItem <- sort(itemCount, decreasing = TRUE)
orderedItem[1:10]

# Sort the supports to see the most frequent items.
orderedItemFreq <- sort(itemFrequency(Groceries), decreasing = TRUE)
orderedItemFreq[1:10]
itemFrequency(Groceries[100:800, 1:3])
itemFrequencyPlot(Groceries, support = 0.1) # filter by minimum support
itemFrequencyPlot(Groceries, topN = 10, horiz = TRUE) # ten most frequent items

# Business-driven filter: keep only the transactions that contain more than
# one item; this is the data set intended for further rule mining.
groceries_use <- Groceries[basketSize > 1]
dim(groceries_use)

# Look at the first five transactions.
inspect(Groceries[seq_len(5)])
# A picture shows the sparsity more directly: each dot marks an item bought
# in a transaction.
image(Groceries[seq_len(10)])
# Display a random sample of 100 transactions.
image(sample(Groceries, 100))


library(arules) # association rules
library(arulesViz) # data visualization of association rules
library(RColorBrewer) # color palettes for plots

# ---- Rule mining ----
# Set a minimum support explicitly.
# apriori defaults: support = 0.1, confidence = 0.8, minlen = 1, maxlen = 10
groceryrules <- apriori(
  Groceries,
  parameter = list(support = 0.006, confidence = 0.25, minlen = 2)
)

# ---- Evaluate the model ----
summary(groceryrules)
# inspect() prints the individual rules.
inspect(groceryrules[seq_len(5)])

# ---- Evaluate the rules ----
# Rules fall into three categories:
# 1. Actionable: clear, useful insight that can be applied to the business
#    directly.
# 2. Trivial: obvious and clear but of little use; common sense, e.g.
#    {diapers} => {baby food}.
# 3. Inexplicable: unclear and hard to explain; extra research is needed to
#    decide whether the rule is useful.
# Rank the rules by a chosen measure.
ordered_groceryrules <- sort(groceryrules, by = "lift")
inspect(ordered_groceryrules[seq_len(5)])


# ---- Searching rules ----
# `items %in% c("A", "B")` keeps a rule when the union of its lhs and rhs
# contains at least one of the listed items (item = A or item = B).
# To search only one side, replace `items` with `lhs` or `rhs`,
# e.g. `lhs %in% c("yogurt")`.
yogurtrules <- subset(groceryrules, items %in% "yogurt")
inspect(yogurtrules)

# %in%  is an exact match.
# %pin% is a partial match: item like '%A%' or item like '%B%'.
# %ain% is a match on all: the itemset has 'A' and the itemset has 'B'.
# Conditions on support, confidence and lift can be added with the logical
# operators (&, |, !).

fruitrules <- subset(groceryrules, items %pin% "fruit")
inspect(fruitrules)

byrules <- subset(groceryrules, items %ain% c("berries", "yogurt"))
inspect(byrules)

fruitrules <- subset(groceryrules, items %pin% "fruit" & lift > 2)
inspect(fruitrules)

# Compute additional quality measures for the mined rules.
# The measure names are passed positionally, and the transactions argument is
# the Groceries data set (R is case-sensitive; `groceries` does not exist).
qualityMeasures <- interestMeasure(
  groceryrules,
  c("coverage", "fishersExactTest", "conviction", "chiSquared"),
  transactions = Groceries
)
summary(qualityMeasures)

# Append only the measures that are not already quality columns, so that no
# duplicated column names are created (e.g. when this section is run twice).
newMeasures <- setdiff(colnames(qualityMeasures),
                       colnames(quality(groceryrules)))
quality(groceryrules) <- cbind(
  quality(groceryrules),
  qualityMeasures[, newMeasures, drop = FALSE]
)
# Rules with the lowest conviction first.
inspect(head(sort(groceryrules, by = "conviction", decreasing = FALSE)))

# Restrict which items may appear: "berries" on the lhs, every other item
# defaults to the rhs.
berriesInLHS <- apriori(
  Groceries,
  parameter = list(support = 0.001, confidence = 0.1),
  appearance = list(lhs = "berries", default = "rhs")
)
summary(berriesInLHS)
inspect(berriesInLHS)
inspect(head(rhs(berriesInLHS), n = 5))

# Drop the rules whose rhs is "root vegetables" or "whole milk", then show the
# rhs of the five most confident remaining rules.
berrySub <- subset(
  berriesInLHS,
  subset = !(rhs %in% c("root vegetables", "whole milk"))
)
inspect(head(rhs(sort(berrySub, by = "confidence")), n = 5))
berrySub

# Persist the mining result.
# Option 1: write the rules to a CSV file so that other programs can use them.
write(
  groceryrules,
  file = "groceryrules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)

# Option 2: convert the rules to a data frame for further processing.
groceryrules_df <- as(groceryrules, "data.frame")
str(groceryrules_df)

# Advanced part: the item hierarchy stored in itemInfo().
print(levels(itemInfo(Groceries)[["level1"]]))
print(levels(itemInfo(Groceries)[["level2"]]))

# When items carry category information it is worth mining rules on the
# categories first, because mining rules among hundreds or thousands of
# individual items is much harder. Start with the coarse categories, then
# move on to the individual items.
inspect(Groceries[1:3])
# Keep the aggregated transactions in their own variable so the original
# Groceries data set is not overwritten and earlier sections can be re-run.
groceries_level2 <- aggregate(Groceries, itemInfo(Groceries)[["level2"]])
inspect(groceries_level2[1:3])

itemFrequencyPlot(groceries_level2, support = 0.025, cex.names = 0.8,
                  xlim = c(0, 0.3),
                  type = "relative", horiz = TRUE, col = "dark red", las = 1,
                  xlab = paste("Proportion of Market Baskets Containing Item",
                               "\n(Item Relative Frequency or Support)"))

# Plot the rules mined on the level-2 categories.
second.rules <- apriori(groceries_level2,
                        parameter = list(support = 0.025, confidence = 0.05))
print(summary(second.rules))
# Scatter plot
plot(second.rules,
     control = list(jitter = 2, col = rev(brewer.pal(9, "Greens")[4:9])),
     shading = "lift")
# Grouped matrix
plot(second.rules, method = "grouped",
     control = list(col = rev(brewer.pal(9, "Greens")[4:9])))
# Graph of the first ten item-level rules after sorting by support and lift.
top.vegie.rules <- sort(groceryrules, by = c("support", "lift"))[1:10]
plot(top.vegie.rules, measure = "confidence", method = "graph",
     control = list(type = "items"), shading = "lift")

你可能感兴趣的:(Data,Mining&Analysis,我的学习历程)