Apriori算法是常用的用于挖掘出数据关联规则的算法,它用来找出数据值中频繁出现的数据集合,找出这些集合的模式有助于我们做一些决策。
对于关联分析,第一步,我们用Apriori算法生成频繁项集;第二步,我们从这些频繁项集中取出前提和结论组成候选规则,并检验其置信度,进而得到关联规则。
import numpy as np
import pandas as pd
# Load the ratings file: tab-separated, no header row, four columns.
file = "ml-100k/u.data"
data = pd.read_csv(
    file,
    sep="\t",
    header=None,
    names=["UserID", "MovieID", "Rating", "Datetime"],
)
# The raw column is parsed as a count of seconds (unit="s").
data["Datetime"] = pd.to_datetime(data["Datetime"], unit="s")
# Preview the first five rows (shown when run as a notebook cell).
data.head()
# A rating strictly above 3 is treated as the user recommending the movie.
data["Favor"] = data["Rating"].gt(3)
# Spot-check rows 10-14, then the first five ratings given by user 1.
data.iloc[10:15]
data.loc[data["UserID"] == 1].head()
# Training subset: rows whose UserID falls in range(200), i.e. IDs 0-199.
# NOTE(review): if user IDs start at 1 this keeps 199 users, not 200 -- confirm.
data_200 = data.loc[data["UserID"].isin(range(200))]
# Keep only the favorable reviews of those users.
favor_data = data_200.loc[data_200["Favor"]]
favor_data.head()
# Map each user to the set of movies they favored.
# frozenset is used so the sets are immutable and support set algebra
# against the frozenset itemsets built later.
favor_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favor_data.groupby("UserID")["MovieID"]
}
print(favor_reviews_by_users[1])
print(len(favor_reviews_by_users))
# Number of favorable reviews each movie received within the training subset
# (summing the boolean Favor column counts the True values).
num_favor_by_movie = data_200.groupby("MovieID")[["Favor"]].sum()
# Show the five most-favored movies.
num_favor_by_movie.sort_values("Favor", ascending=False).head()
算法流程:
from collections import defaultdict
import sys
def find_frequent_itemsets(favor_reviews_by_users,k_itemsets,min_support):
    """Extend frequent itemsets by one item and keep the frequent results.

    Args:
        favor_reviews_by_users: mapping of user -> frozenset of favored items.
        k_itemsets: iterable of frozensets (e.g. a dict keyed by itemset)
            holding the frequent itemsets of the previous length.
        min_support: minimum number of users (inclusive) whose reviews must
            contain an itemset for it to be kept.

    Returns:
        dict mapping each frequent superset (frozenset) to its support, i.e.
        the number of users whose reviews contain the whole superset.
    """
    counts = defaultdict(int)
    for review in favor_reviews_by_users.values():
        # The same superset can be reached from several of its sub-itemsets
        # (e.g. {1, 2} from both {1} and {2}). Gather this user's supersets
        # in a set first so each user contributes at most 1 to the support.
        supersets_for_user = set()
        for itemset in k_itemsets:
            if itemset.issubset(review):
                for other_reviewed_movie in review - itemset:
                    supersets_for_user.add(itemset | frozenset((other_reviewed_movie,)))
        for current_superset in supersets_for_user:
            counts[current_superset] += 1
    return {itemset: frequency for itemset, frequency in counts.items() if frequency >= min_support}
# Frequent itemsets found so far, keyed by itemset length.
frequent_itemsets = {}
# Minimum support (inclusive) an itemset needs to count as frequent.
min_support = 50

# Frequent itemsets of length 1: every movie with enough favorable reviews.
# The threshold is inclusive so it agrees with the `>= min_support` test
# applied to the longer itemsets in find_frequent_itemsets.
frequent_itemsets[1] = {
    frozenset((movie_id,)): favor_count
    for movie_id, favor_count in num_favor_by_movie["Favor"].items()
    if favor_count >= min_support
}
print("There are {0} movies with at least {1} favor reviews.".format(len(frequent_itemsets[1]),min_support))

for k in range(2, 20):
    # Grow the frequent itemsets of length k-1 into those of length k.
    cur_frequent_itemsets = find_frequent_itemsets(
        favor_reviews_by_users, frequent_itemsets[k - 1], min_support
    )
    if not cur_frequent_itemsets:
        # Nothing frequent at this length, so nothing longer can be built.
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets),k))
    sys.stdout.flush()
    frequent_itemsets[k] = cur_frequent_itemsets

# Itemsets of length 1 are not needed for rule generation.
del frequent_itemsets[1]
print("Found a total of {} frequent itemsets.".format(sum(len(frequent_item) for frequent_item in frequent_itemsets.values())))
# Build the association rules. Until their confidence has been tested they
# are only candidate rules, stored as (premise, conclusion) pairs.
candidate_rules = []
for itemset_counts in frequent_itemsets.values():
    for itemset in itemset_counts:
        # Each item takes a turn as the conclusion; the rest form the premise.
        candidate_rules.extend(
            (itemset - {conclusion}, conclusion) for conclusion in itemset
        )
print("There are {} candidate rules in total.".format(len(candidate_rules)))
print(candidate_rules[:5])
# Compute the confidence of every candidate rule on the training users.
# A rule "applies" to a user when the premise is contained in that user's
# favored movies; it is correct when the conclusion is favored as well.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for reviews in favor_reviews_by_users.values():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if not premise.issubset(reviews):
            continue
        tally = correct_counts if conclusion in reviews else incorrect_counts
        tally[candidate_rule] += 1

# confidence = correct / (correct + incorrect)
rule_confidence = {}
for candidate_rule in candidate_rules:
    hits = correct_counts[candidate_rule]
    misses = incorrect_counts[candidate_rule]
    rule_confidence[candidate_rule] = hits / float(hits + misses)

# Minimum confidence a rule must exceed to be kept.
min_confidence = 0.9
# Discard the weak rules.
rule_confidence = dict(
    (rule, confidence)
    for rule, confidence in rule_confidence.items()
    if confidence > min_confidence
)
print(len(rule_confidence))
from operator import itemgetter

# Rank the retained rules from highest to lowest training confidence.
sort_confidence = sorted(rule_confidence.items(),key=itemgetter(1),reverse = True)
# Slice instead of indexing 0..4 so that fewer than five retained rules
# does not raise IndexError.
for index, (rule, confidence) in enumerate(sort_confidence[:5]):
    print("Rule #{0}:".format(index+1))
    premise, conclusion = rule
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise,conclusion))
    # Confidence is stored as a fraction; scale it so the "%" label is right.
    print("- Confidence: {0:.1f}%".format(confidence * 100))
# Load the movie metadata so rules can be shown with titles instead of IDs.
movie_name_file = "ml-100k/u.item"
movie_name_data = pd.read_csv(movie_name_file, sep="|", header=None, encoding="mac-roman")
# Name the columns: six leading columns (the sixth is given an empty name),
# followed by one column per listed genre name.
movie_name_data.columns = (
    ["MovieID", "Title", "Release Date", "Video Release", "IMDB", ""]
    + ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime"]
    + ["Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical"]
    + ["Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
)
def get_movie_name(movie_id):
    """Return the title of the movie whose ``MovieID`` equals *movie_id*.

    The lookup runs against the module-level ``movie_name_data`` frame and
    returns the first matching title. An IndexError is raised when no row
    matches, because the first element of an empty array is requested.
    """
    matching_titles = movie_name_data.loc[movie_name_data["MovieID"] == movie_id, "Title"]
    return matching_titles.values[0]
# Quick check of the title lookup (displayed when run as a notebook cell).
get_movie_name(1)
# Print the top rules again, this time with movie titles.
# Slice instead of indexing 0..4 so that fewer than five retained rules
# does not raise IndexError.
for index, (rule, confidence) in enumerate(sort_confidence[:5]):
    print("Rule #{0}:".format(index+1))
    premise, conclusion = rule
    premise_name = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_name,conclusion_name))
    # Confidence is stored as a fraction; scale it so the "%" label is right.
    print("- Confidence: {0:.1f}%".format(confidence * 100))
# Evaluation data: every user whose ID is NOT in range(200).
test_data = data.loc[~data["UserID"].isin(range(200))]
# Keep only the favorable reviews and group them into one set per user.
test_favor = test_data.loc[test_data["Favor"]]
test_favor_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in test_favor.groupby("UserID")["MovieID"]
}
test_data.head()
# Re-tally every candidate rule, this time against the test users.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for reviews in test_favor_by_users.values():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

# Test confidence for the rules that survived the training filter.
test_confidence = {}
for candidate_rule in rule_confidence:
    correct = correct_counts[candidate_rule]
    applicable = correct + incorrect_counts[candidate_rule]
    # A premise that no test user satisfies gives 0/0; leave such a rule out
    # instead of raising ZeroDivisionError.
    if applicable:
        test_confidence[candidate_rule] = correct / float(applicable)
print(len(test_confidence))
sort_test_confidence = sorted(test_confidence.items(),key=itemgetter(1),reverse = True)
print(sort_test_confidence[:5])
# Report train vs. test confidence for the top training rules.
# Slice instead of indexing 0..4 so that fewer than five retained rules
# does not raise IndexError.
for index, (rule, _) in enumerate(sort_confidence[:5]):
    print("Rule #{0}:".format(index+1))
    premise, conclusion = rule
    premise_name = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_name,conclusion_name))
    # A rule missing from either dict falls back to -1, printed as -100.0%.
    print("- Train Confidence: {0:.1f}%".format(rule_confidence.get((premise,conclusion),-1)*100))
    print("- Test Confidence: {0:.1f}%".format(test_confidence.get((premise,conclusion),-1)*100))