PCY(Park-Chen-Yu)算法是对 A-Priori 算法的改进,主要通过在第一遍扫描中创建哈希表来优化项对计数的过程。其基本机制包括以下几个步骤:(注:下面的示例代码实现的是基础的 A-Priori 算法;PCY 是在其第一遍扫描阶段上的改进。)
数据集中包含了多笔交易记录,每笔交易记录由顾客购买的商品组成。具体交易如下:
消费者偏好分析:
交叉销售机会:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project :catkin_ws
@File :A_priori.py
@IDE :PyCharm
@Author :maosw
@Date :2025/1/22 22:19
'''
import csv
from itertools import chain, combinations
from collections import defaultdict
# Sample transaction data: each inner list is one customer's basket.
data = [
    ['Milk', 'Bread', 'Diaper'],
    ['Milk', 'Diaper', 'Beer', 'Eggs'],
    ['Milk', 'Bread', 'Diaper', 'Beer'],
    ['Bread', 'Diaper'],
    ['Milk', 'Bread', 'Diaper', 'Cola'],
    ['Diaper', 'Beer'],
    ['Milk', 'Bread', 'Cola'],
    ['Milk', 'Diaper', 'Cola'],
    ['Bread', 'Diaper', 'Cola'],
]

# Persist the baskets to a CSV file, one transaction per row.
with open('transactions.csv', 'w', newline='') as outfile:
    csv.writer(outfile).writerows(data)
def subsets(arr):
    """Return an iterator over every non-empty subset of *arr* (including
    *arr* itself), each subset yielded as a tuple."""
    return chain.from_iterable(
        combinations(arr, size) for size in range(1, len(arr) + 1)
    )
def get_support(itemset, transaction_list):
    """Return the support of *itemset*: the fraction of transactions that
    contain every item of *itemset*.

    Args:
        itemset: a set-like collection of items (typically a frozenset).
        transaction_list: list of transactions, each a set-like of items.

    Returns:
        Support in [0.0, 1.0]. Returns 0.0 for an empty transaction list
        (the original raised ZeroDivisionError on that edge case).
    """
    if not transaction_list:  # guard: avoid division by zero
        return 0.0
    count = sum(1 for transaction in transaction_list if itemset.issubset(transaction))
    return count / len(transaction_list)
def apriori(transactions, min_support):
    """Mine frequent itemsets with the Apriori algorithm.

    Args:
        transactions: list of set-like transactions (e.g. frozensets of items).
        min_support: minimum support threshold in [0, 1].

    Returns:
        dict mapping k -> set of frequent k-itemsets (each a frozenset).
        Returns {} for an empty transaction list.
    """
    if not transactions:  # guard: support computation would divide by zero
        return {}
    total = len(transactions)

    def _support(itemset):
        # Fraction of transactions containing every item of *itemset*.
        return sum(1 for t in transactions if itemset.issubset(t)) / total

    # Candidate 1-itemsets: every distinct item observed in the data.
    singles = {frozenset([item]) for transaction in transactions
               for item in transaction}

    freq_itemsets = {}
    current = {s for s in singles if _support(s) >= min_support}
    k = 1
    while current:
        freq_itemsets[k] = current
        k += 1
        # Join step: unions of frequent (k-1)-itemsets that have exactly k
        # items. The union is computed once per pair (the original evaluated
        # i.union(j) three times per candidate).
        candidates = set()
        for i in current:
            for j in current:
                u = i | j
                if len(u) == k:
                    candidates.add(u)
        current = {c for c in candidates if _support(c) >= min_support}
    return freq_itemsets
def generate_association_rules(freq_itemsets, min_confidence, transactions=None):
    """Generate association rules from frequent itemsets.

    Args:
        freq_itemsets: dict mapping k -> set of frequent k-itemsets.
        min_confidence: minimum confidence threshold in [0, 1].
        transactions: list of set-like transactions used to compute supports.
            Defaults to the module-level ``transactions`` global for backward
            compatibility (the original implicitly read that global, which was
            a hidden dependency — pass it explicitly in new code).

    Returns:
        List of ((antecedent_tuple, consequent_tuple), confidence) pairs.
    """
    if transactions is None:
        # Backward compatibility: the original read the module-level global.
        transactions = globals()['transactions']
    total = len(transactions)

    def _support(itemset):
        # Fraction of transactions containing every item of *itemset*.
        return sum(1 for t in transactions if itemset.issubset(t)) / total

    rules = []
    for k, itemsets in freq_itemsets.items():
        if k < 2:  # rules need at least two items (antecedent + consequent)
            continue
        for itemset in itemsets:
            itemset_support = _support(itemset)  # hoisted out of inner loops
            # Every proper, non-empty subset is a candidate antecedent.
            for size in range(1, len(itemset)):
                for antecedent in combinations(itemset, size):
                    consequent = itemset.difference(antecedent)
                    confidence = itemset_support / _support(frozenset(antecedent))
                    if confidence >= min_confidence:
                        rules.append(((tuple(antecedent), tuple(consequent)), confidence))
    return rules
def print_frequent_itemsets(freq_itemsets):
    """Pretty-print each frequent k-itemset, grouped by size k."""
    for size, level in freq_itemsets.items():
        print(f"频繁 {size}-项集:")
        for members in level:
            print(f" {set(members)}")
def print_association_rules(rules):
    """Print each rule as 'antecedent ==> consequent' with its confidence."""
    print("\n生成的关联规则:")
    for (antecedent, consequent), confidence in rules:
        print(f"规则: {set(antecedent)} ==> {set(consequent)} , 置信度: {confidence:.3f}")
# Data loading
def load_transactions(filename):
    """Load transactions from a CSV file, one basket per row.

    Args:
        filename: path to the CSV file.

    Returns:
        List of frozensets, one per non-empty row. Blank rows are skipped —
        they would otherwise become empty itemsets that are subsets of every
        transaction and distort support counts. ``newline=''`` is passed to
        open() as the csv module documentation requires.
    """
    transactions = []
    with open(filename, 'r', newline='') as file:
        for row in csv.reader(file):
            if row:  # skip blank lines
                transactions.append(frozenset(row))
    return transactions
# Script entry point
if __name__ == "__main__":
    # NOTE: ``transactions`` is intentionally a module-level global here —
    # generate_association_rules reads it implicitly; do not rename or move it.
    transactions = load_transactions('transactions.csv')
    min_support = 0.2  # minimum support threshold (fraction of transactions)
    min_confidence = 0.6  # minimum confidence threshold for emitted rules
    freq_itemsets = apriori(transactions, min_support)
    print_frequent_itemsets(freq_itemsets)
    # Generate and print the association rules
    rules = generate_association_rules(freq_itemsets, min_confidence)
    print_association_rules(rules)