python购物篮分析_关联规则-购物篮分析(basket analysis)

数据集:网上下载的公共数据

环境:Python3.5

实现:计算出了候选 1 项集 C1、频繁 1 项集 L1、候选 2 项集 C2 和频繁 2 项集 L2

存在的不足:只能找到指定长度(K=1、2)的候选 K 项集和频繁 K 项集,没有实现对所有长度 K 项集的迭代查找,但实现原理相同

数据格式如下:

9900000988 07004 水晶梨 1

9900000989 03001 300g壶瓶枣 1

9900000989 03002 484g壶瓶枣 1

9900000989 03004 2000g壶瓶枣礼盒一 1

...

9900000989 06002 夹子 2

9900000989 06003 蜻蜓扑克 1

9900000989 04002 800g沁州黄 1

9900000989 04004 5斤布袋沁州黄 1

上代码:

'''
Created on 2018年4月12日

@author: yqm

'''

import os

# Location of the transaction data file, relative to the working directory.
file_dir = "./data/data.txt"

# Abort early with a clear error when the data file is missing.
if not os.path.exists(file_dir):
    raise FileNotFoundError("找不到文件" + file_dir)

# Read the data file into a list of whitespace-split records.
# Later sections read field 0 as the receipt number, field 2 as the item
# name and field 3 as the purchased quantity.
data_list = []  # every line item of every order
with open(file_dir, encoding="utf-8") as f:
    for line in f:
        fields = line.split()
        # Blank lines split into an empty record, which would crash the
        # later field lookups, so they are skipped here.
        if fields:
            data_list.append(fields)

# Group consecutive records that share a receipt number into one order.
# Each order is stored as a dict {item name: quantity}, giving
# All_Data_List the shape [{...}, {...}, ...].
All_Data_List = []  # one dict per order
data_one = None     # receipt number of the order currently being collected
item = {}           # items of the order currently being collected

for data in data_list:
    if data[0] != data_one:
        # A new receipt number starts a new order. Flush the previous one,
        # but only if it has content, so no empty order is emitted before
        # the very first record.
        if item:
            All_Data_List.append(item)
        data_one = data[0]
        item = {}
    item[data[2]] = int(data[3])

# The loop only flushes when the receipt number changes, so the final
# order has to be stored explicitly.
if item:
    All_Data_List.append(item)

# C1: count how many records mention each item name.
# The loop variable is deliberately not called `list`, so the builtin
# stays usable for the rest of the module.
goods_num_Statistics = {}
for record in data_list:
    name = record[2]
    goods_num_Statistics[name] = goods_num_Statistics.get(name, 0) + 1

print("C1 = " + str(goods_num_Statistics))

# L1: drop every item whose C1 count is below 2.
# The names are snapshotted first because the dict is pruned in place.
keys_list = [name for name in goods_num_Statistics]
for name in keys_list:
    if goods_num_Statistics[name] < 2:
        goods_num_Statistics.pop(name)

print("L1 = " + str(goods_num_Statistics))

# Build the C2 candidates from L1: every unordered pair of two different
# frequent items, each pair listed exactly once.
#
# Pairing each name only with the names that follow it avoids creating
# self-pairs ([x, x]) and mirrored duplicates ([a, b] / [b, a]) in the
# first place, so no delete-while-iterating clean-up pass is needed.
C2_keys = [name for name in goods_num_Statistics]  # item names kept in L1

# Pairs are kept as lists (not tuples) because the support count is
# appended to each pair later on.
C2_item = [
    [name1, name2]
    for position, name1 in enumerate(C2_keys)
    for name2 in C2_keys[position + 1:]
]

# Count, for every candidate pair in C2_item, how many orders contain both
# items, and append that count to the pair as its third element.
C2_result = {}  # reserved for the C2 result; not filled in by this section

# Item names of each order, one inner list per order.
C2_item_key = [[name for name in order] for order in All_Data_List]

for pair in C2_item:
    wanted = set(pair)
    support = sum(1 for basket in C2_item_key if wanted.issubset(basket))
    pair.append(support)

print("C2 = " + str(C2_item))

# L2: keep the C2 entries whose count (third element) is at least 2.
L2_result = [entry for entry in C2_item if entry[2] >= 2]

print("L2 = " + str(L2_result))

运行结果部分展示:

C1 = {'宁化府十二珍醋': 2, '软中华': 4, '中南海0.8': 5, '大豆': 1, '800g*2壶瓶醉枣礼盒(桶)': 2,...}

L1 = {'宁化府十二珍醋': 2, '软中华': 4, '中南海0.8': 5, '800g*2壶瓶醉枣礼盒(桶)': 2, '牛肉258g': 2,...}

C2 = [['宁化府十二珍醋', '800g*2壶瓶醉枣礼盒(桶)', 0], ['宁化府十二珍醋', '散核桃仁', 0],...]

L2 = [['中南海0.8', '散大核桃', 2], ['牛肉258g', '牛肉258g', 2], ['牛肉258g', '软云', 2], ...]

你可能感兴趣的:(python购物篮分析)