FP-Growth算法:
首先,把所有的事务集的汉字词语都转化成数字,好处如下:
数字之间的比较远比字符串快;
减少内存的使用;
所以前提就是把所有的汉字词语转化为数字
eg:汉字词语
牛奶,鸡蛋,面包,薯片 鸡蛋,爆米花,薯片,啤酒 鸡蛋,面包,薯片
转化为一一对应的数字
1,2,3,4 2,5,4,6 2,3,4
FP-Growth 树节点(TreeNode)的 Java 代码如下,节点名称已由 String 改为 int:
import java.util.HashSet; import java.util.Set; public class TreeNode { private TreeNode parent; private int nameNO; private int count; private Set<TreeNode> children; public TreeNode(TreeNode parent, int nameNO, int count) { this.count = count; this.parent = parent; this.nameNO = nameNO; } public TreeNode(int nameNO, int count) { this.nameNO = nameNO; this.count = count; } /** * 当前节点计数+i * * @param i */ public void incrementCount(int i) { this.count = count + i; } /** * 父节点是否包含子节点包含则返回,否则返回null * * @param key * @return */ public TreeNode findChild(int key) { if (this.children == null) { return null; } for (TreeNode child : this.children) { if (child.nameNO == key) { return child; } } return null; } /** * 给父节点增加一个子节点 * * @param child * @return */ public TreeNode addChild(TreeNode child) { if (this.children == null) { this.children = new HashSet<TreeNode>(); } this.children.add(child); return child; } public boolean isEmpty() { return this.children == null || this.children.size() == 0; } public TreeNode getParent() { return parent; } public void setParent(TreeNode parent) { this.parent = parent; } public int getNameCount() { return nameNO; } public void setNameCount(int nameNO) { this.nameNO = nameNO; } public int getCount() { return count; } public void setCount(int count) { this.count = count; } public Set<TreeNode> getChildren() { return children; } public void setChildren(Set<TreeNode> children) { this.children = children; } }
FP-Growth 算法的主要实现如下:
import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.OutputStreamWriter; import java.util.*; import java.util.Map.Entry; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; public class FrequentItemSets { private static Logger logger = Logger.getLogger(FrequentItemSets.class .getName()); //设定支持度 private static int ABSOLUTE_SUPPORT = 3; //用于输出频繁一项的标志位 private static int flag = 1; private static BufferedWriter bw = null; private static File tempFile = null; private static File fileSortMap = null; private static BufferedWriter bwSortMap = null; private static List<Integer> frequentItemsData = new ArrayList<Integer>(); public static void main(String[] args) throws Exception { FrequentItemSets frequentItemSets = new FrequentItemSets(); frequentItemSets.beginFrequentItemSets( "C:\\Users\\angelo\\Desktop\\test\\1.txt", "C:\\Users\\angelo\\Desktop\\test\\OneItem.txt", "C:\\Users\\angelo\\Desktop\\test\\Items.txt"); } @SuppressWarnings("unchecked") public void beginFrequentItemSets(String fromFilePath, String toFrequentOneItemFilePath, String frequentItemsSetDataFile) throws Exception { fileSortMap = new File(toFrequentOneItemFilePath); // 从文件中读取事物数据集,这个里面的汉字是经过UTF-8排序过的汉字 FileOutputStream fos = new FileOutputStream(fileSortMap); bwSortMap = new BufferedWriter(new OutputStreamWriter(fos)); Iterator<String> lineIte = FileUtils .lineIterator(new File(fromFilePath)); List<List<Integer>> transactions = new ArrayList<List<Integer>>(); while (lineIte.hasNext()) { String line = lineIte.next(); if (StringUtils.isNotEmpty(line) && line.length() != 33) { String[] subjects = line.split(","); List<String> list = new ArrayList<String>( Arrays.asList(subjects)); List<Integer> intList = new ArrayList<Integer>(); for (String temp : list) { intList.add(Integer.parseInt(temp)); } transactions.add(intList); } } // 初始一个频繁模式集 List<Integer> frequences = new 
LinkedList<Integer>(); tempFile = new File("C:\\Users\\angelo\\Desktop\\test\\Items2.txt"); if (tempFile.exists()) { tempFile.delete(); } tempFile.createNewFile(); bw = new BufferedWriter(new FileWriter(tempFile)); digTree(transactions, frequences); // set转list 在用UTF-8排序 List<Integer> frequentItemsDataList = new ArrayList<Integer>( frequentItemsData); bw.write(listToString(frequentItemsDataList, ",") + "\n"); bw.flush(); bw.close(); } public void digTree(List<List<Integer>> transactions, List<Integer> frequences) throws Exception { // 扫描事物数据集,排序 final Map<Integer, Integer> sortedMap = scanAndSort(transactions); // 没有数据是支持最小支持度了,可以停止了 if (sortedMap.size() == 0) { return; } Map<Integer, List<TreeNode>> index = new HashMap<Integer, List<TreeNode>>(); TreeNode root = buildTree(transactions, index, sortedMap); // 否则开始从排序最低的项开始 抽出条件模式基,递归挖掘 List<Integer> headTable = new ArrayList<Integer>(sortedMap.keySet()); Collections.sort(headTable, new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { int i = sortedMap.get(o2) - sortedMap.get(o1); return i != 0 ? 
i : o1.compareTo(o2); } }); //输出频繁一项集数据 if (flag == 1) { for (Integer keyWord : headTable) { bwSortMap.write(keyWord + ","); } bwSortMap.flush(); bwSortMap.close(); flag++; } // 从项头表最后一项开始挖掘 for (int i = headTable.size() - 1; i >= 0; i--) { Integer subject = headTable.get(i); List<List<Integer>> frequentModeBases = extract(index.get(subject), root); LinkedList<Integer> nextFrequences = new LinkedList<Integer>( frequences); nextFrequences.add(subject); if (nextFrequences.size() > 1) { try { //重点:数据的压缩在这里 List<Integer> tempList = new ArrayList<Integer>(); for (Integer temp : nextFrequences) { tempList.add(temp); } if (frequentItemsData.size() == 0) { frequentItemsData.addAll(tempList); } else { List<Integer> tempFrequentList = new ArrayList<Integer>(); tempFrequentList.addAll(frequentItemsData); List<Integer> saveTempList = new ArrayList<Integer>(); saveTempList.addAll(tempList); tempFrequentList.removeAll(tempList); tempList.removeAll(frequentItemsData); if (tempFrequentList.size() == 0) { frequentItemsData.clear(); frequentItemsData.addAll(saveTempList); } else if (tempList.size() == 0) { continue; } else { List<String> frequentItemsDataList = new ArrayList<String>(); for (Integer tempInt : frequentItemsData) { frequentItemsDataList.add(tempInt + ""); } Collections.sort(frequentItemsDataList, new SortChineseKeywords()); bw.write(listToString(frequentItemsDataList, ",") + "\n"); bw.flush(); frequentItemsData.clear(); frequentItemsData.addAll(saveTempList); } } } catch (Exception ex) { logger.error(ex.getMessage()); } } digTree(frequentModeBases, nextFrequences); } } public List<List<Integer>> extract(List<TreeNode> list, TreeNode root) { List<List<Integer>> returnList = new ArrayList<List<Integer>>(); for (TreeNode node : list) { TreeNode parent = node.getParent(); if (parent.getCount() != -1) { ArrayList<Integer> tranc = new ArrayList<Integer>(); while (parent.getCount() != -1) { tranc.add(parent.getNameCount()); parent = parent.getParent(); } for (int i = 0; i < 
node.getCount(); i++) { returnList.add(tranc); } } } return returnList; } public TreeNode buildTree(List<List<Integer>> transactions, Map<Integer, List<TreeNode>> index, final Map<Integer, Integer> sortedMap) { TreeNode root = new TreeNode(null, -1, -1); for (List<Integer> subjects : transactions) { Iterator<Integer> ite = subjects.iterator(); while (ite.hasNext()) { Integer subject = ite.next(); if (!sortedMap.containsKey(subject)) { ite.remove(); } } Collections.sort(subjects, new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { int i = sortedMap.get(o2) - sortedMap.get(o1); return i != 0 ? i : o1.compareTo(o2); } }); TreeNode current = root; for (int i = 0; i < subjects.size(); i++) { Integer subject = subjects.get(i); TreeNode next = current.findChild(subject); if (next == null) { TreeNode newNode = new TreeNode(current, subject, 1); current.addChild(newNode); current = newNode; List<TreeNode> thisIndex = index.get(subject); if (thisIndex == null) { thisIndex = new ArrayList<TreeNode>(); index.put(subject, thisIndex); } thisIndex.add(newNode); } else { next.incrementCount(1); current = next; } } } return root; } public Map<Integer, Integer> scanAndSort(List<List<Integer>> transactions) { Map<Integer, Integer> map = new HashMap<Integer, Integer>(); // 空的就不扫了 if (transactions.size() == 0) { return map; } for (List<Integer> basket : transactions) { for (Integer subject : basket) { Integer count = map.get(subject); if (count == null) { map.put(subject, 1); } else { map.put(subject, count + 1); } } } Iterator<Entry<Integer, Integer>> ite = map.entrySet().iterator(); while (ite.hasNext()) { Entry<Integer, Integer> entry = ite.next(); if (entry.getValue() < ABSOLUTE_SUPPORT) { ite.remove(); } } return map; } public static String listToString(List list, String reg) { // 默认用,符号 if (reg == null || "".equals(reg)) { reg = ","; } StringBuffer sb = new StringBuffer(); if (null == list || list.size() == 0) { return null; } for (Iterator iter = 
list.iterator(); iter.hasNext();) { sb.append(iter.next()).append(reg); } int length = sb.length(); if (length > 0) { sb = sb.delete(length - 1, length); } return sb.toString(); } }
中文字符串排序比较器(基于 java.text.Collator 与 Locale.CHINA,并非按 UTF-8 编码排序):
import java.text.Collator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

/**
 * Compares strings with the {@link Collator} obtained for
 * {@code Locale.CHINA} and normalises the result to -1, 0 or 1.
 */
public class SortChineseKeywords implements Comparator<String> {

    Collator cmp = Collator.getInstance(java.util.Locale.CHINA);

    /**
     * @param o1 the first string
     * @param o2 the second string
     * @return 1 if the collator orders {@code o1} after {@code o2}, -1 if it
     *         orders it before, 0 if it treats them as equal
     */
    @Override
    public int compare(String o1, String o2) {
        int order = cmp.compare(o1, o2);
        if (order == 0) {
            return 0;
        }
        return order > 0 ? 1 : -1;
    }
}
如有可改进之处,欢迎指正,非常感谢!