作者:金良([email protected]) csdn博客:http://blog.csdn.net/u012176591
# -*- coding: UTF8 -*-
"""Apriori frequent-itemset mining (runs unchanged on Python 2 and Python 3).

Worked example used by the demo at the bottom of the file: given the
transactions

    [['A','B','C','D'], ['B','C','E'], ['A','B','C','E'],
     ['B','D','E'], ['A','B','C','D']]

find every itemset whose support is at least 0.3.

Itemsets are represented as lists of items in ascending order; the join step
in ``candidate_gen`` relies on that ordering.
"""
import sys
import copy  # kept from the original import block; other code may rely on it


def single_count(T):
    """Count how many times each item occurs across the transactions.

    :param T: iterable of transactions, each an iterable of hashable items.
    :return: dict mapping item -> number of occurrences.
    """
    counts = {}
    for transaction in T:
        for item in transaction:
            counts[item] = counts.get(item, 0) + 1
    return counts


def is_equal_lists(lst1, lst2):
    """Return True when both sequences have equal length and equal elements."""
    if len(lst1) != len(lst2):
        return False
    return all(a == b for a, b in zip(lst1, lst2))


def candidate_gen(k_itemsets):
    """Generate candidate k-itemsets from the frequent (k-1)-itemsets.

    Join step: two (k-1)-itemsets that share their first k-2 items and whose
    last items satisfy ``f1[-1] < f2[-1]`` are merged into one k-itemset.
    Prune step: a candidate is dropped if any of its (k-1)-subsets is not
    frequent. Only subsets obtained by removing one of the first k-2 items
    are checked, because the other two subsets are the joined parents.

    :param k_itemsets: list of frequent (k-1)-itemsets, each a sorted list of
        hashable items, all of the same length.
    :return: list of candidate k-itemsets (new lists; inputs are not mutated),
        in join order and without duplicates. Empty input yields ``[]``.
    """
    if not k_itemsets:
        # The original indexed k_itemsets[0] unconditionally and crashed here.
        return []

    last = len(k_itemsets[0]) - 1  # index of the final item of a (k-1)-itemset
    frequent = set(tuple(itemset) for itemset in k_itemsets)
    candidates = []
    seen = set()
    for f1 in k_itemsets:
        for f2 in k_itemsets:
            # Ordering on the last item prevents generating a pair twice.
            if not f1[last] < f2[last]:
                continue
            if not is_equal_lists(f1[:last], f2[:last]):
                continue
            candidate = list(f1) + [f2[last]]
            key = tuple(candidate)
            if key in seen:
                continue
            if all(key[:i] + key[i + 1:] in frequent for i in range(last)):
                seen.add(key)
                candidates.append(candidate)
    return candidates


def compare_list(A, B):
    """Return True when every element of A is contained in B."""
    return all(a in B for a in A)


def apriori(T, minsup):
    """Find all itemsets whose support in T is at least ``minsup``.

    :param T: list of transactions, each a list of hashable, orderable items.
    :param minsup: minimum support as a fraction of ``len(T)``.
    :return: flat list of frequent itemsets (each a sorted list), ordered by
        itemset size and, within a size, by generation order.
    """
    item_counts = single_count(T)
    n = len(T)
    # Build the membership sets once instead of scanning lists per candidate.
    transactions = [set(t) for t in T]

    # sorted() replaces the Python-2-only ``keys(); keys.sort()`` idiom.
    # When T is empty there are no items, so the division is never evaluated.
    levels = [[[item] for item in sorted(item_counts)
               if item_counts[item] / float(n) >= minsup]]

    while levels[-1]:
        next_level = []
        for candidate in candidate_gen(levels[-1]):
            count = sum(1 for t in transactions if compare_list(candidate, t))
            if count / float(n) >= minsup:
                next_level.append(candidate)
        # The trailing empty level is what terminates the loop.
        levels.append(next_level)

    return [itemset for level in levels for itemset in level]


def printlist(T):
    """Write every element of a 2-D list to stdout, each followed by a space.

    This stands in for the Python 2 ``print x,`` statement, which has no
    syntax that is valid on both Python 2 and Python 3.
    """
    for row in T:
        for item in row:
            sys.stdout.write("%s " % (item,))


T = [['A', 'B', 'C', 'D'], ['B', 'C', 'E'], ['A', 'B', 'C', 'E'],
     ['B', 'D', 'E'], ['A', 'B', 'C', 'D']]
F = apriori(T, 0.3)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print('\n所有的频繁项集:\n')
print(F)
print('\n频繁项集的个数:\n')
print(len(F))
\noindent{\bf INSERT INTO} $C_k$\\ {\bf SELECT } $p.fitItemset_1,p.fitItemset_2,...,p.fitItemset_{k-1},q.fitItemset_{k-1}$\\ {\bf FROM } $F_{k-1}p,F_{k-1}q$\\ {\bf WHERE } $p.fitItemset_1=q.fitItemset_1,...,p.fitItemset_{k-2}=q.fitItemset_{k-2},$\\ \indent $p.fitItemset_{k-1}<q.fitItemset_{k-1}$
\documentclass[11pt]{ctexart} \usepackage[top=2cm, bottom=2cm, left=2cm, right=2cm]{geometry} \usepackage{algorithm} \usepackage{algorithmicx} \usepackage{algpseudocode} \usepackage{amsmath} \floatname{algorithm}{} \renewcommand{\algorithmicrequire}{\textbf{Input:}} \renewcommand{\algorithmicensure}{\textbf{Output:}} \newcommand{\myoutput}{\textbf{output }} \newcommand{\myadd}{\textbf{add }} \newcommand{\mycall}{\textbf{call }} \newcommand{\mydelete}{\textbf{delete }} \begin{document} \begin{algorithm}[h] \caption{Apriori algorithm} \begin{algorithmic}[1] \Require $F_1$:一阶频繁集;$\mathcal{D}$:数据库,包含所有transaction \Ensure $Answer$:各阶频繁项集 \State $F_1=\{$large 1-itemsets$\}$; \For {$k=2;F_{k-1}\neq \phi;k++$} \State $//$generate $k$-candidates from $F_{k-1}$; \State $C_k=$apriori-gen$(F_{k-1})$; \ForAll {transactions $t\in \mathcal{D}$} \State $//$given $t$,find candidates contained in $t$ from $C_k$ \State $C_t=$subset$(C_k,t)$; \ForAll {candidates $c \in C_t$} \State $c.count$++ \EndFor \EndFor \State $F_k=\{c\in C_k|c.count\geq minsup\}$ \EndFor \State $Answer=\cup_k\{F_k\}$ \end{algorithmic} \end{algorithm} \end{document}
\begin{algorithm}[h] \caption{Association Rule Generating algorithm} \begin{algorithmic}[1] \ForAll{frequent k-itemset $f_k(k\geq 2)$} \State $H_1=\phi$ \State $A=$ (k-1)-itemsets $ a_{k-1} \mathrm{~such~that~} a_{k-1} \subset f_k$ \ForAll{$a_{k-1}\in A$} \State $conf = support(f_k)/support(a_{k-1})$ \If {$conf \geq minconf$} \State \myoutput the rule $a_{k-1}\Rightarrow(f_k-a_{k-1})$ \State \myadd $(f_k-a_{k-1})$ to $H_1$; \EndIf \EndFor \State \mycall AP-GENRULES($f_k,H_1$) \EndFor \Function{ap-genrules}{$f_k:$ frequent k-itemset,$H_m:$ set of m-itemset consequents} \If {$k > m+1$} \State $H_{m+1}=$ apriori-gen($H_m$); \ForAll {$h_{m+1}\in H_{m+1}$} \State $conf=support(f_k)/support(f_k-h_{m+1})$; \If{$conf \geq minconf$} \State \myoutput the rule $f_k-h_{m+1}\Rightarrow h_{m+1}$ \Else \State \mydelete $h_{m+1}$ from $H_{m+1}$ \EndIf \EndFor \State \mycall AP-GENRULES$(f_k,H_{m+1})$; \EndIf \EndFunction \end{algorithmic} \end{algorithm}
\begin{algorithm}[h] \caption{AprioriTid algorithm} \begin{algorithmic}[1] \State $F_1=\{ $ frequent~1-itemsets $ \}$; \State $//~\overline{C}_k$中每个元素都具有$<TID,\{ID\}>$的形式,其中$TID$是事务的标识符,$\{ID\}$(标记为$C_t$)是事务$TID$中的一个潜在频繁$k$-项集的标识符,它的一个元素对应于一个事务$t$, 即$<t.TID,\{c\in C_k|c$ 在$t$中$\}>$ \State $\overline{C}_1=\mathrm{~database~} D $; \For {$k=2;F_{k-1}\neq \phi;k++$} \State $//~$由$k-1$-阶频繁项集$F_{k-1}$生成$k$-阶候选集$C_k$ \State $C_k=$apriori-gen($F_{k-1}$); \State $\overline{C}_k=\phi$; \ForAll {entry $t \in \overline{C}_{k-1}$} \State $//~$$C_t$和$t.set-of-itemsets$就是上面的事务$t$的$\{ID\}$所代表的候选频繁项集,只是阶数不同.对每个候选项$c\in C_k$,$(c-c[k])$和$(c-c[k-1])$是生成$c$的两个项集,当生成$c\in C_k$的两个项集都是事务$t$所包含的项集时,则$c\in C_k$也是事务$t$所包含的潜在的$k$-阶频繁项集 \State $C_t=\{c\in C_k|(c-c[k])\in $ t.set-of-itemsets $ \wedge(c-c[k-1])\in $ t.set-of-itemsets $\}$; \If {$C_t\neq \phi$} \State $\overline{C}_k=\overline{C}_k\cup\{<t.TID,C_t>\}$; \State $//~$事务$t$包含的$k$-阶候选项集$c_k$的支持度增加$1$ \ForAll {candidate $c\in C_t$} \State $c.count++$; \EndFor \EndIf \EndFor \State $//~$根据支持度$count$的大小从候选集$C_k$找出$k$-阶频繁项集$F_k$ \State $F_k=\{c\in C_k|c.count\geq minsup\}$; \EndFor \State $Answer=\cup_k\{F_k\}$; \end{algorithmic} \end{algorithm}