The Apriori Algorithm

Concepts in association analysis:

  • Association analysis is the task of finding interesting relationships in large-scale datasets; its two goals are discovering frequent itemsets and discovering association rules.
  • A frequent itemset is a collection of items that often appear together.
  • An association rule suggests that a strong relationship may exist between two itemsets.
  • Support is the fraction of records in the dataset that contain the itemset: support(A=>B) = P(A∪B), where A∪B denotes the itemset containing every item of A and of B.
  • Confidence is a conditional probability: confidence(A=>B) = P(B|A) = support(A∪B)/support(A). A small worked example follows below.
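
To make the two measures concrete, here is a minimal sketch that computes both directly on the four sample transactions used throughout this post (the helper name support and the variable names are illustrative, not part of the code below):

def support(itemset, transactions):
    #fraction of transactions that contain every item of itemset
    hits = sum(1 for t in transactions if itemset.issubset(t))
    return hits / len(transactions)

transactions = [set(t) for t in [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]]
s = support({2, 5}, transactions)      #3 of 4 records -> 0.75
c = s / support({2}, transactions)     #confidence({2} => {5}) = 0.75/0.75 = 1.0
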
Discovering Frequent Itemsets with the Apriori Algorithm
def loadDataSet():
    return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]

#build C1: the list of candidate single-item sets
def createC1(dataSet):
    '''
    [frozenset({1}),
     frozenset({2}),
     frozenset({3}),
     frozenset({4}),
     frozenset({5})]
    '''
    C1 = []
    for transaction in dataSet:
        for item in transaction:
            if not [item] in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset,C1))

#count support and filter: C1 --> L1
def scanD(D, Ck, minSupport):
    ssCnt = {}
    
    #count how many records contain each candidate itemset
    for tid in D: #iterate over every record
        for can in Ck: #iterate over the candidate itemsets
            if can.issubset(tid): #the candidate is contained in this record
                ssCnt[can] = ssCnt.get(can,0) + 1
    
    #compute support values
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key in ssCnt: #iterate over every counted candidate
        support = ssCnt[key] / numItems #support of this candidate
        
        #filter: keep only candidates whose support is at least minSupport
        if support >= minSupport:
            retList.insert(0,key) #insert at the head of the list
        supportData[key] = support
    return retList,supportData

dataSet = loadDataSet()
C1 = createC1(dataSet)
L1,supportData0 = scanD(dataSet,C1,0.5)
L1
[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})]
supportData0
{frozenset({4}): 0.25,
 frozenset({5}): 0.75,
 frozenset({2}): 0.75,
 frozenset({3}): 0.75,
 frozenset({1}): 0.5}
#the complete Apriori algorithm

#L_{k-1} --> C_k: the join step
def aprioriGen(Lk,k):
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk): #first itemset of the pair
        L1 = list(Lk[i])[:k-2] #first k-2 items (the slice is half-open)
        L1.sort()
        for j in range(i+1,lenLk): #second itemset of the pair
            L2 = list(Lk[j])[:k-2]
            L2.sort()
            if L1==L2: #joining only pairs whose first k-2 items agree generates each candidate exactly once
                retList.append(Lk[i] | Lk[j])
    return retList
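
For example, joining the four frequent single itemsets found earlier is easy to verify by hand: with k = 2 the compared prefixes are empty, so every pair is merged.

C2 = aprioriGen([frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})], 2)
# [frozenset({1, 3}), frozenset({1, 2}), frozenset({1, 5}),
#  frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5})]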

#the Apriori driver function
def apriori(dataSet,minSupport = 0.5):
    #build C1, then filter it down to L1
    C1 = createC1(dataSet)
    D = list(map(set,dataSet)) #each transaction as a set
    L1, supportData = scanD(D,C1,minSupport)
    L = [L1]
    k = 2
    
    #Ck --> Lk: prune by support
    while (len(L[k-2]) > 0): #stop once the newest level is empty
        Ck = aprioriGen(L[k-2],k) #k-2 because L is 0-indexed
        Lk,supK = scanD(D,Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L,supportData

apriori(dataSet,minSupport = 0.5)
([[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})],
  [frozenset({3, 5}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 3})],
  [frozenset({2, 3, 5})],
  []],
 {frozenset({5}): 0.75,
  frozenset({3}): 0.75,
  frozenset({2, 3, 5}): 0.5,
  frozenset({1, 2}): 0.25,
  frozenset({1, 5}): 0.25,
  frozenset({3, 5}): 0.5,
  frozenset({4}): 0.25,
  frozenset({2, 3}): 0.5,
  frozenset({2, 5}): 0.75,
  frozenset({1}): 0.5,
  frozenset({1, 3}): 0.5,
  frozenset({2}): 0.75})
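
The support values above illustrate the Apriori (anti-monotone) property that justifies the pruning loop: a superset can never be more frequent than any of its subsets. A quick check against the returned support data:

L, supportData = apriori(dataSet, minSupport = 0.5)
#support can only shrink as an itemset grows: {2,3,5} <= {2,5} <= {2}
assert supportData[frozenset({2,3,5})] <= supportData[frozenset({2,5})] <= supportData[frozenset({2})]
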
Mining Association Rules from Frequent Itemsets
#generate association rules
def generateRules(L,supportData,minConf =0.7):
    bigRuleList = []

    for i in range(1,len(L)): #start from the 2-item sets; a single item cannot form a rule
        for freqSet in L[i]:
            '''
            Iterate over the levels of L:
                    [[frozenset({1}), frozenset({3}), frozenset({2}), frozenset({5})],
                     [frozenset({3, 5}), frozenset({1, 3}), frozenset({2, 5}), frozenset({2, 3})],
                     [frozenset({2, 3, 5})],
                     []]
            L[0]: 1-item itemsets;
            L[1]: 2-item itemsets;
            L[2]: 3-item itemsets
            '''
            H1 = [frozenset([item]) for item in freqSet] #candidate single-item consequents (rule right-hand sides)
            if (i > 1):
                rulesFromConseq(freqSet,H1,supportData,bigRuleList,minConf)
            else: #2-item sets: the consequent holds a single item, so compute confidence directly
                calcConf(freqSet,H1,supportData,bigRuleList,minConf)
    return bigRuleList

#compute rule confidence
def calcConf(freqSet,H,supportData,brList,minConf=0.7):
    '''
    freqSet:frozenset({3, 5})
    H:[frozenset({3}), frozenset({5})]
    frozenset({5}) --> frozenset({3}) conf: 0.6666666666666666
    frozenset({3}) --> frozenset({5}) conf: 0.6666666666666666
    '''
    prunedH = []
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq]
        if conf >= minConf:
            print(freqSet-conseq,'-->',conseq,'conf:',conf)
            brList.append((freqSet-conseq,conseq,conf))
            prunedH.append(conseq) #consequents that survive pruning
    return prunedH

def rulesFromConseq(freqSet,H,supportData,brList,minConf=0.7):
#     print(freqSet) #frozenset({2, 3, 5})
#     print(H) #[frozenset({2}), frozenset({3}), frozenset({5})]
    m = len(H[0]) #size of the current consequents (1 on the first call)
    if (len(freqSet) > (m+1)): #extend only if an (m+1)-item consequent still leaves at least one item on the left
        Hmp1 = aprioriGen(H,m+1) #join step: build candidate consequents of size m+1
        Hmp1 = calcConf(freqSet,Hmp1,supportData,brList,minConf)
        if (len(Hmp1) > 1): #if more than one consequent survives, try merging them further
            rulesFromConseq(freqSet,Hmp1,supportData,brList,minConf)
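
Note that for freqSet = frozenset({2, 3, 5}) the recursion starts from the three single-item consequents and immediately joins them into 2-item consequents, so the output below contains rules such as frozenset({5}) --> frozenset({2, 3}) but never evaluates single-item consequents like frozenset({3, 5}) --> frozenset({2}) for 3-item sets; this is a known quirk of the Machine Learning in Action implementation. The join itself is easy to verify:

aprioriGen([frozenset([2]), frozenset([3]), frozenset([5])], 2)
# [frozenset({2, 3}), frozenset({2, 5}), frozenset({3, 5})]
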
L,supportData = apriori(dataSet,minSupport = 0.5)
generateRules(L,supportData,minConf =0.5)
frozenset({5}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({5}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({1}) conf: 0.6666666666666666
frozenset({1}) --> frozenset({3}) conf: 1.0
frozenset({5}) --> frozenset({2}) conf: 1.0
frozenset({2}) --> frozenset({5}) conf: 1.0
frozenset({3}) --> frozenset({2}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3}) conf: 0.6666666666666666
frozenset({5}) --> frozenset({2, 3}) conf: 0.6666666666666666
frozenset({3}) --> frozenset({2, 5}) conf: 0.6666666666666666
frozenset({2}) --> frozenset({3, 5}) conf: 0.6666666666666666
[(frozenset({5}), frozenset({3}), 0.6666666666666666),
 (frozenset({3}), frozenset({5}), 0.6666666666666666),
 (frozenset({3}), frozenset({1}), 0.6666666666666666),
 (frozenset({1}), frozenset({3}), 1.0),
 (frozenset({5}), frozenset({2}), 1.0),
 (frozenset({2}), frozenset({5}), 1.0),
 (frozenset({3}), frozenset({2}), 0.6666666666666666),
 (frozenset({2}), frozenset({3}), 0.6666666666666666),
 (frozenset({5}), frozenset({2, 3}), 0.6666666666666666),
 (frozenset({3}), frozenset({2, 5}), 0.6666666666666666),
 (frozenset({2}), frozenset({3, 5}), 0.6666666666666666)]

Example: Association Rule Mining on TCM Syndrome Data

  • First run k-means clustering on each feature to discretize it, then perform association analysis on the discretized data.
# %load 8-1_discretization.py
import warnings
warnings.filterwarnings("ignore")
'''
Discretization by clustering. The final result frame has the form:
        1           2           3           4
A       0    0.178698    0.257724    0.351843
Anum  240  356.000000  281.000000   53.000000
i.e. the interval (0, 0.178698] contains 240 samples, (0.178698, 0.257724] contains 356, and so on.
'''

import pandas as pd
from sklearn.cluster import KMeans #导入K均值聚类算法

datafile = '/home/ubuntu/jason/Python数据分析与挖掘实战/图书配套数据、代码/chapter8/demo/data/data.xls' #raw data to be clustered
processedfile = '/home/ubuntu/jason/Python数据分析与挖掘实战/图书配套数据、代码/chapter8/demo/tmp/data_processed.xls' #output file for the processed data
typelabel ={'肝气郁结证型系数':'A', '热毒蕴结证型系数':'B', '冲任失调证型系数':'C', '气血两虚证型系数':'D', '脾胃虚弱证型系数':'E', '肝肾阴虚证型系数':'F'}
k = 4 #number of clusters per feature

#read the data and cluster each feature
data = pd.read_excel(datafile) #read the data

keys = list(typelabel.keys())
result = pd.DataFrame()

if __name__ == '__main__': #needed when the code is saved as a .py script and run; unnecessary when pasted directly into an interactive session
    for i in range(len(keys)):
        #run k-means to discretize this feature
        print('正在进行“%s”的聚类...' % keys[i])
        kmodel = KMeans(n_clusters = k, n_jobs = 4) #n_jobs is the parallelism, ideally the CPU count; note this parameter was removed in scikit-learn 1.0
        kmodel.fit(data[[keys[i]]].values) #fit the model

        r1 = pd.DataFrame(kmodel.cluster_centers_, columns = [typelabel[keys[i]]]) #cluster centers
        r2 = pd.Series(kmodel.labels_).value_counts() #size of each cluster
        r2 = pd.DataFrame(r2, columns = [typelabel[keys[i]]+'num']) #as a DataFrame recording the per-cluster counts
        r = pd.concat([r1, r2], axis = 1).sort_values(typelabel[keys[i]]) #align each center with its cluster size
        r.index = [1, 2, 3, 4]
        '''
        print(r)
                  A  Anum
        1  0.138327   248
        2  0.221695   351
        3  0.295406   278
        4  0.408679    53
        '''
        r[typelabel[keys[i]]] = r[typelabel[keys[i]]].rolling(2).mean() #average each pair of adjacent cluster centers (rows); these means become the interval boundaries
        '''
        print(r)
                  A  Anum
        1       NaN   248
        2  0.180011   351
        3  0.258551   278
        4  0.352043    53
        '''
        r[typelabel[keys[i]]][1] = 0.0 #together with the rolling mean above, this replaces the cluster centers with boundary points (the first boundary is 0)
        result = result.append(r.T) #DataFrame.append was removed in pandas 2.0; pd.concat([result, r.T]) is the modern equivalent
    result = result.sort_index() #sort by index, i.e. in A, B, C, D, E, F order
    result.to_excel(processedfile)
print(result)
正在进行“肝气郁结证型系数”的聚类...
正在进行“热毒蕴结证型系数”的聚类...
正在进行“冲任失调证型系数”的聚类...
正在进行“气血两虚证型系数”的聚类...
正在进行“脾胃虚弱证型系数”的聚类...
正在进行“肝肾阴虚证型系数”的聚类...
          1           2           3           4
A       0.0    0.178698    0.257724    0.351843
Anum  240.0  356.000000  281.000000   53.000000
B       0.0    0.153543    0.298217    0.489954
Bnum  342.0  380.000000  179.000000   29.000000
C       0.0    0.202149    0.289061    0.423537
Cnum  297.0  394.000000  204.000000   35.000000
D       0.0    0.172281    0.251683    0.359353
Dnum  283.0  375.000000  228.000000   44.000000
E       0.0    0.152698    0.257762    0.375661
Enum  273.0  319.000000  244.000000   94.000000
F       0.0    0.179143    0.261386    0.354643
Fnum  200.0  237.000000  265.000000  228.000000
print(data.describe())
         肝气郁结证型系数    热毒蕴结证型系数    冲任失调证型系数    气血两虚证型系数    脾胃虚弱证型系数    肝肾阴虚证型系数
count  930.000000  930.000000  930.000000  930.000000  930.000000  930.000000
mean     0.232154    0.214438    0.247039    0.217702    0.227043    0.271739
std      0.078292    0.131887    0.087779    0.079210    0.108064    0.099186
min      0.026000    0.000000    0.067000    0.059000    0.003000    0.016000
25%      0.176250    0.127000    0.185000    0.160000    0.140000    0.188250
50%      0.231000    0.186000    0.236500    0.208500    0.200000    0.273000
75%      0.281750    0.274000    0.291000    0.264000    0.318000    0.352000
max      0.504000    0.780000    0.610000    0.552000    0.526000    0.607000
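
The transaction file consumed by the next script is assumed to be already labeled (A1..A4, B1..B4, ...). For reference, here is a minimal sketch of how such labels could be derived from the boundaries computed above with pd.cut; the helper name discretize and the hard-coded boundary list are illustrative, not part of the book's code:

def discretize(series, bounds, prefix):
    #bounds: the boundary points from the k-means step, starting at 0;
    #the last bin is closed at the column maximum
    edges = list(bounds) + [series.max()]
    labels = ['%s%d' % (prefix, i) for i in range(1, len(edges))]
    return pd.cut(series, bins=edges, labels=labels, include_lowest=True)

#e.g. discretize(data['肝气郁结证型系数'], [0.0, 0.178698, 0.257724, 0.351843], 'A')
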
# %load apriori.py

import pandas as pd

#custom join function: builds C_k from L_{k-1}
def connect_string(x, ms):
    x = list([sorted(i.split(ms)) for i in x])
    l = len(x[0])
    r = []
    for i in range(len(x)):
        for j in range(i,len(x)):
            if x[i][:l-1] == x[j][:l-1] and x[i][l-1] != x[j][l-1]:
                r.append(x[i][:l-1]+sorted([x[j][l-1],x[i][l-1]]))
    return r
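
For instance, the join is easy to check by hand (prefixes of length 0 always match, so all pairs with distinct last items are connected):

connect_string(['A1', 'A3', 'B2'], '--')
# [['A1', 'A3'], ['A1', 'B2'], ['A3', 'B2']]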

#searches for association rules
def find_rule(d, support, confidence, ms = '--'):
    result = pd.DataFrame(index=['support', 'confidence']) #output frame: one column per rule

    support_series = 1.0*d.sum()/len(d) #support of each single item
    column = list(support_series[support_series > support].index) #initial filter by support
    k = 0

    while len(column) > 1:
        k = k+1
        print('\n正在进行第%s次搜索...' %k)
        column = connect_string(column, ms)
        print('数目:%s...' %len(column))
        sf = lambda i: d[i].prod(axis=1, numeric_only = True) #support indicator of a joined itemset: the product of its 0-1 columns

        #build the joined columns; the most time- and memory-intensive step. Consider parallelizing for larger datasets.
        d_2 = pd.DataFrame(list(map(sf,column)), index = [ms.join(i) for i in column]).T

        support_series_2 = 1.0*d_2[[ms.join(i) for i in column]].sum()/len(d) #support of the joined itemsets
        column = list(support_series_2[support_series_2 > support].index) #filter by support again
        support_series = support_series.append(support_series_2) #Series.append was removed in pandas 2.0; pd.concat is the modern equivalent
        column2 = []

        for i in column: #enumerate candidate rules: for {A,B,C}, is it A+B-->C, B+C-->A, or C+A-->B?
            i = i.split(ms)
            for j in range(len(i)):
                column2.append(i[:j]+i[j+1:]+i[j:j+1])

            confidence_series = pd.Series(index=[ms.join(i) for i in column2]) #confidence of each candidate rule

        for i in column2: #compute the confidence of every candidate rule
            confidence_series[ms.join(i)] = support_series[ms.join(sorted(i))]/support_series[ms.join(i[:len(i)-1])]

        for i in confidence_series[confidence_series > confidence].index: #filter by confidence
            result[i] = 0.0
            result[i]['confidence'] = confidence_series[i]
            result[i]['support'] = support_series[ms.join(sorted(i.split(ms)))]

    result = result.T.sort_values(['confidence','support'], ascending = False) #tidy the results for output
    print('\n结果为:')
    print(result)

    return result
# %load 8-2_apriori_rules.py

import pandas as pd
# from apriori import * #import the apriori functions defined above
import time #for timing the run

inputfile = '/home/ubuntu/jason/Python数据分析与挖掘实战/图书配套数据、代码/chapter8/demo/data/apriori.txt' #input transaction file
data = pd.read_csv(inputfile, header=None, dtype = object)

#one-hot encoding
start = time.perf_counter() #start timer (time.clock() was removed in Python 3.8)
data = pd.get_dummies(data)
end = time.perf_counter() #stop timer
print('\n转换完毕,用时:%0.2f秒' %(end-start))

# start = time.clock() #start timer
# print('\n转换原始数据至0-1矩阵...')
# ct = lambda x : pd.Series(1, index = x[pd.notnull(x)]) #helper for the 0-1 matrix conversion: put a 1 at every non-null item of a row
# b = list(map(ct, data.as_matrix())) #apply row by row (as_matrix was removed in pandas 1.0; use .values)
# data = pd.DataFrame(b).fillna(0) #assemble the matrix, filling the gaps with 0
# end = time.clock() #stop timer
# print('\n转换完毕,用时:%0.2f秒' %(end-start))
# del b #free the intermediate variable to save memory

support = 0.06 #minimum support
confidence = 0.75 #minimum confidence
ms = '---' #separator between joined items, e.g. A---B (default '--'); it must not occur anywhere in the raw data

start = time.perf_counter() #start timer
print('\n开始搜索关联规则...')
find_rule(data, support, confidence, ms)
end = time.perf_counter() #stop timer
print('\n搜索完成,用时:%0.2f秒' %(end-start))
转换完毕,用时:0.01秒

开始搜索关联规则...

正在进行第1次搜索...
数目:276...

正在进行第2次搜索...
数目:947...

正在进行第3次搜索...
数目:41...

结果为:
                            support  confidence
0_A3---5_F4---6_H4         0.078495    0.879518
2_C3---5_F4---6_H4         0.075269    0.875000
1_B2---5_F4---6_H4         0.062366    0.794521
2_C2---4_E3---3_D2         0.092473    0.754386
3_D2---5_F3---6_H4---0_A2  0.062366    0.753247

搜索完成,用时:1.41秒
  • The rule A3, F4 --> H4 has the highest support, 7.85%, with a confidence of 87.96%. That is, when the liver-qi stagnation syndrome coefficient (A) lies in (0.258, 0.35] and the liver-kidney yin deficiency syndrome coefficient (F) lies in (0.353, 0.607], the probability that the TNM stage is diagnosed as H4 is 87.96%, and this combination occurs in 7.85% of the records.
