# Movie recommendation example from "Learning Data Mining with Python", chapter 4.

import pandas as pd

# MovieLens 100k dataset: ~1000 users rating ~1700 movies.
all_ratings = pd.read_csv('u.data', delimiter='\t', header=None,
                          names=['UserID', 'MovieID', 'Rating', 'Datetime'])
# Timestamps are seconds since the Unix epoch.
all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'], unit='s')
# A rating strictly greater than 3 is treated as "the user likes the movie".
all_ratings['Favorable'] = all_ratings['Rating'] > 3
# Training set: users with ids in the first 200.
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]

# Keep only the favorable ratings.
favorable_ratings = ratings[ratings['Favorable']]
# Map each user id -> frozenset of the movie ids they liked.
favorable_reviews_by_users = dict(
    (k, frozenset(v.values))
    for k, v in favorable_ratings.groupby('UserID')['MovieID'])
# Fan count per movie (sum over booleans counts the True values).
num_favorable_by_movie = ratings[['MovieID', 'Favorable']].groupby('MovieID').sum()
# Show the five most popular movies.
# BUG FIX: the original computed this expression and discarded the result,
# which does nothing in a plain script — print it instead.
print(num_favorable_by_movie.sort_values(by='Favorable', ascending=False)[:5])

# Frequent itemsets, keyed by itemset length (k -> {itemset: support}).
frequent_itemsets = {}
# Minimum support threshold for the Apriori search.
min_support = 50

# Length-1 itemsets: every single movie liked by more than min_support users.
frequent_itemsets[1] = {
    frozenset((movie_id,)): row['Favorable']
    for movie_id, row in num_favorable_by_movie.iterrows()
    if row['Favorable'] > min_support
}
 

from collections import defaultdict
from os import sys
def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, minsupport):
    """Grow frequent itemsets of length k from frequent itemsets of length k-1.

    Args:
        favorable_reviews_by_users: dict mapping user id -> frozenset of the
            movie ids that user liked.
        k_1_itemsets: iterable of frozensets, the frequent itemsets of
            length k-1.
        minsupport: minimum number of supporting users an itemset needs to
            be kept.

    Returns:
        dict mapping each frequent k-itemset (frozenset) to its support count.
    """
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            # Only a user who liked every movie in the (k-1)-itemset can
            # support one of its supersets.
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    # BUG FIX: the original filtered on the global `min_support`, silently
    # ignoring the `minsupport` parameter passed by the caller.
    return {itemset: frequency for itemset, frequency in counts.items()
            if frequency >= minsupport}
# Iteratively grow longer frequent itemsets until none survive the threshold.
for length in range(2, 20):
    found = find_frequent_itemsets(favorable_reviews_by_users,
                                   frequent_itemsets[length - 1],
                                   min_support)
    frequent_itemsets[length] = found
    if len(found) > 0:
        print('I find {0} frequent itemsets of length {1}'.format(len(found), length))
        sys.stdout.flush()  # push buffered output to the terminal immediately
    else:
        print('Did not find any frequent itemsets of length {0}'.format(length))
        sys.stdout.flush()
        break
# Length-1 itemsets cannot generate rules; drop them.
del frequent_itemsets[1]
    

# Build candidate association rules: for every frequent itemset, each member
# in turn plays the conclusion and the remaining movies form the premise.
candidate_rules = []
for length, counts_for_length in frequent_itemsets.items():
    for itemset in counts_for_length:
        for conclusion in itemset:
            candidate_rules.append((itemset - {conclusion}, conclusion))
print(candidate_rules[:5])

# For each rule, count training users that confirm vs contradict it.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for rule in candidate_rules:
        premise, conclusion = rule
        if not premise.issubset(reviews):
            continue  # rule does not apply to this user
        if conclusion in reviews:
            correct_counts[rule] += 1
        else:
            incorrect_counts[rule] += 1
# Confidence = correct / (correct + incorrect). The denominator is never
# zero here: every premise is itself frequent on the training data.
rule_confidence = {}
for rule in candidate_rules:
    total = correct_counts[rule] + incorrect_counts[rule]
    rule_confidence[rule] = correct_counts[rule] / float(total)

from operator import itemgetter

# Rank the rules by training confidence, best first, and show the top five.
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print('Rule #{0}'.format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    # BUG FIX: the original message read 'RuleL if a person...' (typo).
    print('Rule: if a person recommends {0} they will also recommend {1}'.format(premise, conclusion))
    print(' - Confidence: {0:.3f}'.format(rule_confidence[(premise, conclusion)]))
    print('')
        
# Movie metadata; the column list matches the MovieLens u.item layout.
movie_name_columns = ['MovieID', 'Title', 'Release Date', 'Video Release', 'IMDB', '',
                      'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
                      'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
                      'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movie_name_data = pd.read_csv('u.item', delimiter='|', header=None,
                              encoding='mac_roman', names=movie_name_columns)
def get_movie_name(movie_id):
    """Return the title of the movie with the given MovieID.

    Assumes `movie_id` exists in the global `movie_name_data`; an unknown
    id raises IndexError (same as the original lookup).
    """
    matches = movie_name_data['MovieID'] == movie_id
    return movie_name_data.loc[matches, 'Title'].values[0]
# Show the top five rules again, this time with human-readable movie titles.
for rank, (rule, _) in enumerate(sorted_confidence[:5], start=1):
    premise, conclusion = rule
    print('Rule #{0}'.format(rank))
    premise_names = ', '.join(get_movie_name(movie_id) for movie_id in premise)
    print('Rule: if a person recommends {0} they will also recommend {1}'.format(
        premise_names, get_movie_name(conclusion)))
    print(' - Confidence: {0:.3f}'.format(rule_confidence[rule]))
    print('')
# Hold-out set: every user NOT in the training id range.
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]
test_favorable = test_dataset[test_dataset['Favorable']]
# Map each held-out user id -> frozenset of the movie ids they liked.
test_favorable_by_users = {
    user_id: frozenset(movies.values)
    for user_id, movies in test_favorable.groupby('UserID')['MovieID']
}

# Re-evaluate every candidate rule on the held-out users.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
# BUG FIX: a rule's premise may never occur among the test users, in which
# case the original divided by zero. Such unseen rules get confidence 0.0.
test_confidence = {}
for candidate_rule in candidate_rules:
    total = correct_counts[candidate_rule] + incorrect_counts[candidate_rule]
    test_confidence[candidate_rule] = (
        correct_counts[candidate_rule] / float(total) if total else 0.0)
# Compare training vs test confidence for the five strongest rules.
for rank, (rule, _) in enumerate(sorted_confidence[:5], start=1):
    premise, conclusion = rule
    print('Rule #{0}'.format(rank))
    premise_names = ', '.join(get_movie_name(movie_id) for movie_id in premise)
    print('Rule: if a person recommends {0} they will also recommend {1}'.format(
        premise_names, get_movie_name(conclusion)))
    print(' - Train Confidence: {0:.3f}'.format(rule_confidence[rule]))
    print(' - Test Confidence: {0:.3f}'.format(test_confidence[rule]))
    print('')
                              

# Related topics: machine learning, Apriori, algorithms.