import pandas as pd
# Ratings file (per the original note: 1000 users and 1700 movies).
# Tab-separated, no header row.
all_ratings = pd.read_csv(
    'u.data',
    delimiter='\t',
    header=None,
    names=['UserID', 'MovieID', 'Rating', 'Datetime'],
)
# The timestamp column holds seconds since the epoch.
all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'], unit='s')

# A rating above 3 is treated as "the user likes this movie".
all_ratings['Favorable'] = all_ratings['Rating'] > 3

# Training set: rows whose UserID falls in range(200).
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]

# Only the rows where the user liked the movie.
favorable_ratings = ratings[ratings['Favorable']]

# Which movies each user likes: UserID -> frozenset of MovieIDs.
favorable_reviews_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in favorable_ratings.groupby('UserID')['MovieID']
}

# Number of fans per movie (sum of the boolean Favorable flag).
num_favorable_by_movie = ratings.groupby('MovieID')[['Favorable']].sum()

# The five most liked movies. The result is not stored, so this line only
# shows something when run interactively.
num_favorable_by_movie.sort_values(by='Favorable', ascending=False)[:5]

# Minimum support threshold.
min_support = 50

# Frequent itemsets keyed by itemset length. Length 1 is seeded with every
# movie that has strictly more than min_support fans.
frequent_itemsets = {}
frequent_itemsets[1] = {
    frozenset((movie_id,)): row['Favorable']
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row['Favorable'] > min_support
}
from collections import defaultdict
from os import sys
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, minsupport):
    """Grow frequent (k-1)-itemsets into frequent k-itemsets (one Apriori step).

    Args:
        favorable_reviews_by_users: mapping of user id -> set/frozenset of the
            movie ids that user rated favorably.
        k_1_itemsets: iterable of frozensets holding the frequent itemsets of
            the previous length (a dict keyed by itemset works as well).
        minsupport: minimum number of users (inclusive) that must like every
            movie of an itemset for it to be kept.

    Returns:
        dict mapping each frequent superset (frozenset) to its support, i.e.
        the number of users whose favorable reviews contain the whole set.
    """
    counts = defaultdict(int)
    for reviews in favorable_reviews_by_users.values():
        # The same superset can be reached from several of its (k-1)-subsets.
        # Collect the supersets per user first so each user contributes at
        # most 1 to a superset's support instead of once per subset.
        user_supersets = set()
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    user_supersets.add(itemset | frozenset((other_reviewed_movie,)))
        for current_superset in user_supersets:
            counts[current_superset] += 1
    # Filter on the threshold passed by the caller, not on a module global.
    return {
        itemset: frequency
        for itemset, frequency in counts.items()
        if frequency >= minsupport
    }
# Build frequent itemsets of increasing length until a length yields none
# (or the upper bound of the range is reached).
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(
        favorable_reviews_by_users,
        frequent_itemsets[k - 1],
        min_support,
    )
    frequent_itemsets[k] = cur_frequent_itemsets
    if cur_frequent_itemsets:
        print('I find {0} frequent itemsets of length {1}'.format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()
        continue
    print('Did not find any frequent itemsets of length {0}'.format(k))
    sys.stdout.flush()  # push buffered output to the terminal
    break

# Itemsets of length 1 are dropped before rule generation.
del frequent_itemsets[1]
# Every movie of every frequent itemset takes a turn as the conclusion; the
# remaining movies of that itemset form the premise.
candidate_rules = [
    (itemset.difference((conclusion,)), conclusion)
    for itemset_counts in frequent_itemsets.values()
    for itemset in itemset_counts
    for conclusion in itemset
]
print(candidate_rules[:5])
# For each rule, count the training users who satisfy the premise, split by
# whether they also like the conclusion.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for reviews in favorable_reviews_by_users.values():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if not premise.issubset(reviews):
            continue
        tally = correct_counts if conclusion in reviews else incorrect_counts
        tally[candidate_rule] += 1

# Confidence = correct / (correct + incorrect) for every candidate rule.
rule_confidence = {}
for candidate_rule in candidate_rules:
    hits = correct_counts[candidate_rule]
    misses = incorrect_counts[candidate_rule]
    rule_confidence[candidate_rule] = hits / float(hits + misses)
from operator import itemgetter
# Rules ordered from highest to lowest training confidence.
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

# Show the five best rules. Slicing (rather than indexing 0..4) keeps this
# from raising IndexError when fewer than five rules were generated.
for index, ((premise, conclusion), confidence) in enumerate(sorted_confidence[:5]):
    print('Rule #{0}'.format(index + 1))
    print('Rule: if a person recommends {0} they will also recommend {1}'.format(premise, conclusion))
    print(' - Confidence: {0:.3f}'.format(confidence))
    print('')
# Movie metadata: pipe-separated, no header row, decoded as mac_roman.
movie_name_data = pd.read_csv('u.item', header=None, delimiter='|', encoding='mac_roman')
# Descriptive columns first, then one column per genre flag.
movie_name_data.columns = (
    ['MovieID', 'Title', 'Release Date', 'Video Release', 'IMDB', '']
    + ['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime']
    + ['Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical']
    + ['Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
)
def get_movie_name(movie_id):
    """Return the title of the first row in movie_name_data whose MovieID equals movie_id.

    Raises IndexError when no row matches.
    """
    is_requested_movie = movie_name_data['MovieID'] == movie_id
    matching_titles = movie_name_data['Title'][is_requested_movie]
    return matching_titles.values[0]
# Print the five most confident rules again, with movie titles instead of ids.
# Slicing keeps this from raising IndexError when fewer than five rules exist.
for index, (rule, _) in enumerate(sorted_confidence[:5]):
    print('Rule #{0}'.format(index + 1))
    premise, conclusion = rule
    premise_names = ', '.join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print('Rule: if a person recommends {0} they will also recommend {1}'.format(premise_names, conclusion_name))
    print(' - Confidence: {0:.3f}'.format(rule_confidence[rule]))
    print('')
# Test set: every rating whose UserID is NOT in range(200), i.e. the
# complement of the training selection.
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset['Favorable']]
# UserID -> frozenset of the MovieIDs that test user likes.
test_favorable_by_users = {
    user_id: frozenset(movie_ids.values)
    for user_id, movie_ids in test_favorable.groupby('UserID')['MovieID']
}

# Re-count rule hits and misses, this time over the test users.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for reviews in test_favorable_by_users.values():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

# Confidence on the test set. A rule whose premise no test user satisfies has
# an undefined confidence; record NaN for it instead of dividing by zero.
test_confidence = {}
for candidate_rule in candidate_rules:
    num_correct = correct_counts[candidate_rule]
    num_applicable = num_correct + incorrect_counts[candidate_rule]
    if num_applicable:
        test_confidence[candidate_rule] = num_correct / float(num_applicable)
    else:
        test_confidence[candidate_rule] = float('nan')
# Compare training and test confidence for the five rules that scored best on
# the training set. Slicing keeps this from raising IndexError when fewer than
# five rules exist.
for index, (rule, _) in enumerate(sorted_confidence[:5]):
    print('Rule #{0}'.format(index + 1))
    premise, conclusion = rule
    premise_names = ', '.join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print('Rule: if a person recommends {0} they will also recommend {1}'.format(premise_names, conclusion_name))
    print(' - Train Confidence: {0:.3f}'.format(rule_confidence[rule]))
    print(' - Test Confidence: {0:.3f}'.format(test_confidence[rule]))
    print('')