从数据集频繁出现的商品中选出共同出现的商品组成频繁项集,生成关联规则
如果通过遍历每一种组合,那么数据量太大了,我们这里引入支持度的概念
例如:
要生成包含商品A、B的频繁项集(A,B),要求支持度至少为30
那么A和B都必须至少在数据集中出现30次,(A,B)至少也要出现30次
现有100k有关电影打分的数据
目标:输出“如果用户喜欢A,那么他有可能喜欢B”
数据下载地址:http://grouplens.org/datasets/movielens/
下载100k的那一个数据
import os
import pandas as pd
from collections import defaultdict
import sys
from operator import itemgetter
ratings_filename='C:/Users/F.S.Z/Desktop/数据挖掘学习ing/Data/ml-100k/u.data'
all_ratings=pd.read_csv(ratings_filename,delimiter="\t",header=None,names=["UserID","MovieId","Rating","Datetime"],engine='python')
all_ratings["Favorable"]=all_ratings["Rating"]>3#评分大于3认为喜欢这部电影
ratings=all_ratings[all_ratings["UserID"].isin(range(200))]
favorable_ratings=ratings[ratings["Favorable"]]#只要喜欢的电影
favorable_reviews_by_users=dict((k,frozenset(v.values))for k,v in favorable_ratings.groupby("UserID")["MovieId"])
num_favorable_by_movie=ratings[["MovieId","Favorable"]].groupby("MovieId").sum()
frequent_itemsets={}#频繁项集的字典
min_support=50#最小支持度
frequent_itemsets[1]=dict((frozenset((movie_id,)),row["Favorable"])for movie_id,row in num_favorable_by_movie.iterrows() if row["Favorable"]>min_support)#包含一个元素的平凡项集
def find_frequent_itemsets(favorable_reviews_by_users,k_i_itemsets,min_support):#在前一个频繁项集的基础上使频繁项集包含的元素个数加一
counts=defaultdict(int)#计数字典
for user,reviews in favorable_reviews_by_users.items():#遍历每一个用户喜欢的所有电影
for itemset in k_i_itemsets:#遍历上一个频繁项集
if itemset.issubset(reviews):#如果频繁项集中的一个集合是这个用户喜欢所有电影集合的子集
for other_reviewed_movie in reviews-itemset:#把这个用户喜欢的其他电影单个遍历
current_superset=itemset|frozenset((other_reviewed_movie,))#与前一个频繁项集进行合并,生成新的频繁项集
counts[current_superset]+=1#该个频繁项集计数加一
return dict([(itemset,frequency)for itemset,frequency in counts.items() if frequency>=min_support])#返回新的频繁项集
for k in range(2,20):#生成含2-20个元素的频繁项集
cur_frequent_itemsets=find_frequent_itemsets(favorable_reviews_by_users,frequent_itemsets[k-1],min_support)
frequent_itemsets[k]=cur_frequent_itemsets
if len(cur_frequent_itemsets)==0:
print("Did not find any frequent itemsets of length{}".format(k))
sys.stdout.flush()#这个是确保代码还在运行时,把缓冲区内容输出到终端
else:
print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets),k))
sys.stdout.flush()
del frequent_itemsets[1]#删除只含一个元素的频繁项集
candidate_rules=[]
for itemset_length,itemset_counts in frequent_itemsets.items():
for itemset in itemset_counts.keys():
for conclusion in itemset:
premise=itemset-set((conclusion,))
candidate_rules.append((premise,conclusion))
correct_counts=defaultdict(int)
incorrect_counts=defaultdict(int)
for user,reviews in favorable_reviews_by_users.items():
for candidate_rule in candidate_rules:
premise,conclusion=candidate_rule
if premise.issubset(reviews):
if conclusion in reviews:
correct_counts[candidate_rule]+=1
else:
incorrect_counts[candidate_rule]+=1
rule_confidence={candidate_rule:correct_counts[candidate_rule]/float(correct_counts[candidate_rule]+incorrect_counts[candidate_rule])for candidate_rule in candidate_rules}
sorted_confidence=sorted(rule_confidence.items(),key=itemgetter(1),reverse=True)#按置信度排序
test_dataset=all_ratings[~all_ratings["UserID"].isin(range(200))]#选取除训练数据中200位用户外的用户
test_favorable=test_dataset[test_dataset["Favorable"]]
test_favorable_by_users=dict((k,frozenset(v.values))for k,v in test_favorable.groupby("UserID")["MovieId"])
correct_counts=defaultdict(int)
incorrect_counts=defaultdict(int)
for user,reviews in test_favorable_by_users.items():
for candidate_rule in candidate_rules:
premise,conclusion=candidate_rule
if premise.issubset(reviews):
if conclusion in reviews:
correct_counts[candidate_rule]+=1
else:
incorrect_counts[candidate_rule]+=1
test_confidence={candidate_rule:correct_counts[candidate_rule]/float(correct_counts[candidate_rule]+incorrect_counts[candidate_rule])for candidate_rule in rule_confidence}
for index in range(5):
print("#Rule #{}".format(index+1))
(premise,conclusion)=sorted_confidence[index][0]
print("Rule:If a person recommends{},they will also recommend {}".format(premise,conclusion))
print("- Confidence:{0:.3f}".format(rule_confidence[(premise,conclusion)]))
print("- test Confidence:{0:.3f}".format(test_confidence[(premise, conclusion)]))
print('\n')