数据挖掘实验之Apriori算法

本文是数据挖掘实验中 Apriori 算法的 Python 实现,参照实验指导书编写,仅供参考。

数据集如下:

购物单号    购物项目
T1    1 3 4
T2    2 3 5
T3    1 2 3 5
T4    2 5
 

#!/usr/bin/env python
#coding=utf-8
'''
@author  : Eikken
#@file   : ApiTest.py
#@time   : 2019-05-21 11:57:04
'''
import pandas as pd
import numpy as np

def getDataSet(path=r'Apriori实验数据.xlsx'):
    """Load the transactions from an Excel sheet.

    The value in the second column (index 1) of every row is converted to a
    string and split on whitespace into a list of item strings.

    Args:
        path: Excel file to read. Defaults to the workbook name used by the
            original experiment.

    Returns:
        A ``(data, columns)`` tuple: ``data`` is a list with one list of item
        strings per row, ``columns`` is the list of column headers.
    """
    # read_excel has no ``encoding`` keyword; passing one raises TypeError
    # on current pandas releases, so it is not passed here.
    frame = pd.read_excel(path)
    rows = np.array(frame).tolist()
    columns = np.array(frame.columns).tolist()
    # split() without an argument collapses repeated/trailing whitespace, so
    # "1  3 " does not yield empty-string items the way split(' ') would.
    data = [str(row[1]).split() for row in rows]
    return data, columns

def createItems(dataSet):
    """Build the candidate 1-itemsets from the transactions.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list of single-element frozensets, one per distinct item, ordered
        by ascending item value. A list (rather than a lazy ``map`` object)
        is returned so the result can be iterated more than once.
    """
    # A set gives O(1) de-duplication instead of scanning a list per item.
    distinct = {item for transaction in dataSet for item in transaction}
    return [frozenset([item]) for item in sorted(distinct)]

def createSupportItem(D, Items, MinSupport):
    """Count candidate itemsets and keep those meeting the minimum support.

    Args:
        D: iterable of transactions (each a set or other iterable of items).
            It is materialised once, so a one-shot iterator is accepted.
        Items: iterable of candidate itemsets (frozensets). Repeated
            candidates are counted only once.
        MinSupport: minimum fraction of transactions that must contain a
            candidate for it to be reported as frequent.

    Returns:
        A ``(supportItems, supportData)`` tuple. ``supportItems`` is the list
        of frequent candidates, in reverse order of first appearance in the
        transactions. ``supportData`` maps every candidate that occurs in at
        least one transaction to its support.
    """
    transactions = list(D)
    # De-duplicate while keeping order: a candidate listed twice would
    # otherwise be counted twice per transaction and report support > 1.
    candidates = list(dict.fromkeys(Items))
    total = float(len(transactions))

    counts = {}
    for transaction in transactions:
        for candidate in candidates:
            if candidate.issubset(transaction):
                counts[candidate] = counts.get(candidate, 0) + 1

    supportItems = []
    supportData = {}
    # counts is only non-empty when total > 0, so the division is safe.
    for candidate, count in counts.items():
        support = count / total
        if support >= MinSupport:
            supportItems.append(candidate)
        supportData[candidate] = support
    # Same ordering as prepending each frequent candidate, without the
    # O(n) cost of list.insert(0, ...) on every hit.
    supportItems.reverse()
    return supportItems, supportData

def AprioriConf(Lk, k):
    """Generate candidate k-itemsets by joining frequent (k-1)-itemsets.

    Two itemsets are joined when the first ``k - 2`` items of their sorted
    contents are equal.

    Args:
        Lk: indexable sequence of the frequent itemsets from the previous
            level; items within an itemset must be mutually orderable.
        k: size of the itemsets to create.

    Returns:
        A list with the union of every joinable pair, in pair order.
    """
    # Sort BEFORE slicing: set iteration order is arbitrary, so taking the
    # first k-2 elements of an unsorted set compares unrelated "prefixes"
    # and can miss valid joins. Computed once per itemset, not once per pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    size = len(Lk)
    retList = []
    for i in range(size):
        for j in range(i + 1, size):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList

def Apriori(dataSet, minSupport):
    """Run level-wise frequent-itemset mining over the transactions.

    Args:
        dataSet: re-iterable collection of transactions (each an iterable
            of items).
        minSupport: minimum support passed on to createSupportItem.

    Returns:
        A ``(L, supportData)`` tuple. ``L[i]`` holds the frequent itemsets
        found at level ``i + 1``; the last element of ``L`` is always the
        empty list that ended the search. ``supportData`` merges the support
        dictionaries returned for every level.
    """
    singles = createItems(dataSet)
    current, supportData = createSupportItem(map(set, dataSet), singles, minSupport)
    L = [current]
    k = 2
    # Keep growing the itemset size until a level yields nothing frequent.
    while len(current) > 0:
        candidates = AprioriConf(current, k)
        current, levelSupport = createSupportItem(
            map(set, dataSet), candidates, MinSupport=minSupport)
        supportData.update(levelSupport)
        L.append(current)
        k += 1
    return L, supportData

def main():
    """Load the data set, mine it with a minimum support of 0.5, and print
    the frequent itemsets per level followed by each itemset's support."""
    # Only the item lists are used; the column headers are ignored.
    transactions, _columns = getDataSet()
    levels, supports = Apriori(transactions, 0.5)

    print('所有频繁项集L:')
    for level in levels:
        print(level)

    print('对应支持度Support:')
    for itemset in supports:
        print('项目集:', itemset, '的支持度为:', supports[itemset])

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

 

你可能感兴趣的:(Python)