SimHash去重

今天学习了网页去重的一些方法,其中我尤为关注SimHash算法,下面详细介绍一下这个算法。
首先,介绍一下SimHash:如果两个相似文档的语义指纹只相差几个位或更少,这样的语义指纹叫做SimHash
计算海明距离的两种方法:
1,逐位比较两个指纹,统计取值不同的位数
2,两个长整型异或后,计算结果中1的个数
取得每个特征的64位hash值
public static int hamming(long l1, long l2) {
  long lxor = l1 ^ l2;
  return BitUtil.pop(lxor);
 }
SimHash计算过程:
初始化长度为64位的向量,该向量的每个维度都是0
循环处理:取每个特征的64位hash值,如果这个hash值的第i位是1,则将向量的第i个数加上特征权重,反之,如果为0,则减去相应的权重
完成所有特征的处理,向量中某些数为正,某些数为负,正数对应的位为1,负数为0,得到最终64位的SimHash
在写入文件过程中,可以把SimHash值使用差分编码进行压缩后保存,下面是一个简单实现代码
View Code
 1 package com.test;
 2 
 3 import java.io.BufferedInputStream;
 4 import java.io.BufferedOutputStream;
 5 import java.io.DataInputStream;
 6 import java.io.FileInputStream;
 7 import java.io.FileNotFoundException;
 8 import java.io.FileOutputStream;
 9 import java.io.IOException;
10 
/**
 * Delta compression for an array of 64-bit SimHash values.
 *
 * <p>File layout produced by {@link #write} and consumed by {@link #read}: the first value is
 * stored as 8 big-endian bytes; every following value is stored as the difference to its
 * predecessor, encoded as a variable-length integer (7 payload bits per byte, least-significant
 * group first, high bit set on every byte except the last).
 */
public class DetaCompress {

    /**
     * Converts a long into its 8-byte big-endian representation.
     *
     * @param n value to convert
     * @return a new 8-element array, most significant byte first
     */
    public static byte[] longToBytes(long n) {
        byte[] buf = new byte[8];
        for (int i = buf.length - 1; i >= 0; i--) {
            buf[i] = (byte) (n & 0xFF); // take the low 8 bits
            n >>>= 8; // move on to the next byte
        }
        return buf;
    }

    /**
     * Writes a long in variable-length form: 7 bits per byte, low group first.
     *
     * <p>The value is treated as unsigned, so a negative value always takes 10 bytes.
     *
     * @param i value to write
     * @param dos destination stream
     * @throws IOException if the underlying stream fails
     */
    public static void writeVLong(long i, BufferedOutputStream dos)
            throws IOException {
        while ((i & ~0x7FL) != 0) {
            dos.write((byte) ((i & 0x7F) | 0x80)); // low 7 bits plus continuation flag
            i >>>= 7;
        }
        dos.write((byte) i); // final byte, continuation flag clear
    }

    /**
     * Reads a value previously written by {@link #writeVLong}.
     *
     * @param dis source stream
     * @return the decoded value
     * @throws IOException if the stream ends in the middle of a value (surfaced by
     *         {@code readByte}), if the encoding is longer than 10 bytes, or on any other
     *         read failure
     */
    static long readVLong(DataInputStream dis) throws IOException {
        byte b = dis.readByte();
        // Accumulate in a long: an int accumulator silently drops everything above bit 31.
        long value = b & 0x7FL;
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
            if (shift > 63) {
                // A 64-bit value never needs more than 10 bytes; a longer run is corrupt input.
                throw new IOException("Malformed variable-length long: more than 10 bytes");
            }
            // readByte fails at end of stream, so a truncated value can no longer cause an
            // endless loop waiting for bytes that never arrive.
            b = dis.readByte();
            value |= (b & 0x7FL) << shift;
        }
        return value;
    }

    /**
     * Writes the array to {@code fileName} using the layout described on the class.
     *
     * <p>An empty array produces an empty file. I/O failures are printed to standard error and
     * reported through the return value rather than thrown.
     *
     * @param simHashSet values to store
     * @param fileName path of the file to create or overwrite
     * @return number of values written, or 0 if an I/O error occurred
     */
    static int write(long[] simHashSet, String fileName) {
        int written = 0;
        try {
            BufferedOutputStream dos = new BufferedOutputStream(
                    new FileOutputStream(fileName));
            try {
                if (simHashSet.length > 0) {
                    dos.write(longToBytes(simHashSet[0])); // first value is stored in full
                    for (int i = 1; i < simHashSet.length; i++) {
                        // later values are stored as the difference to the previous one
                        writeVLong(simHashSet[i] - simHashSet[i - 1], dos);
                    }
                }
            } finally {
                dos.close(); // always release the file handle, even on failure
            }
            written = simHashSet.length;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return written;
    }

    /**
     * Reads {@code len} values from a file produced by {@link #write}.
     *
     * @param len number of values to read; must not be negative
     * @param fileName path of the file to read
     * @return the decoded values (an empty array when {@code len} is 0), or {@code null} if an
     *         I/O error occurred; the error is printed to standard error
     * @throws IllegalArgumentException if {@code len} is negative
     */
    static long[] read(int len, String fileName) {
        if (len < 0) {
            throw new IllegalArgumentException("len must not be negative: " + len);
        }
        if (len == 0) {
            return new long[0];
        }
        try {
            DataInputStream dis = new DataInputStream(new BufferedInputStream(
                    new FileInputStream(fileName)));
            try {
                long[] simHashSet = new long[len];
                simHashSet[0] = dis.readLong(); // first value was stored in full
                for (int i = 1; i < len; i++) {
                    // undo the delta: value = previous value + stored difference
                    simHashSet[i] = simHashSet[i - 1] + readVLong(dis);
                }
                return simHashSet;
            } finally {
                dis.close(); // always release the file handle, even on failure
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}
排重的总体思想是:
先把要检索的f 位指纹集合缩小,将集合f位划分几块,
精确匹配高d'位,候选集合的容量缩小为 $|S'| = |S| / 2^{d'}$
然后在小集合中检索f-d'位的海明距离
下面是实现的例子:
View Code
 1 package com.lietu.simhash;  2 
 3 import java.io.BufferedReader;  4 import java.io.BufferedWriter;  5 import java.io.File;  6 import java.io.FileInputStream;  7 import java.io.FileNotFoundException;  8 import java.io.FileOutputStream;  9 import java.io.FileWriter;  10 import java.io.IOException;  11 import java.io.InputStream;  12 import java.io.InputStreamReader;  13 import java.io.OutputStream;  14 import java.io.OutputStreamWriter;  15 import java.io.UnsupportedEncodingException;  16 import java.util.ArrayList;  17 import java.util.Collections;  18 import java.util.Comparator;  19 import java.util.HashMap;  20 import java.util.HashSet;  21 import java.util.Iterator;  22 import java.util.StringTokenizer;  23 import java.util.Map.Entry;  24 
 25 /**
 26  * 64位分四块,最多找出有3位差别的simhash  27  *  28  * @author lg  29  *  30  */
 31 // TODO: 保存排序后的中间状态
 32 public class SimHashSet4 implements Iterable<SimHashData> {  33     ArrayList<SimHashData> t1 = new ArrayList<SimHashData>();  34     ArrayList<SimHashData> t2 = new ArrayList<SimHashData>();  35     ArrayList<SimHashData> t3 = new ArrayList<SimHashData>();  36     ArrayList<SimHashData> t4 = new ArrayList<SimHashData>();  37 
 38     public ArrayList<SimHashData> getT1(){  39         return t1;  40  }  41     static Comparator<SimHashData> comp = new Comparator<SimHashData>() {  42         public int compare(SimHashData o1, SimHashData o2) {  43             if (o1.q == o2.q)  44                 return 0;  45             return (isLessThanUnsigned(o1.q, o2.q)) ? 1 : -1;  46  }  47     }; // 比较无符号64位
 48     static Comparator<Long> compHigh = new Comparator<Long>() {  49         public int compare(Long o1, Long o2) {  50             o1 |= 0xFFFFFFFFFFFFL;  51             o2 |= 0xFFFFFFFFFFFFL;  52             // System.out.println(Long.toBinaryString(o1));  53             // System.out.println(Long.toBinaryString(o2));  54             // System.out.println((o1 == o2));
 55             if (o1.equals(o2))  56                 return 0;  57             return (isLessThanUnsigned(o1, o2)) ? 1 : -1;  58  }  59     }; // 比较无符号64位中的高16位
 60 
 61     public void load(String fileName) {  62         String line = null;  63 
 64         try {  65             InputStream is = new FileInputStream(new File(fileName));  66 
 67             BufferedReader br = new BufferedReader(new InputStreamReader(is));  68 
 69             while ((line = br.readLine()) != null) {  70  addSimHash(line.trim());  71  }  72  br.close();  73 
 74         } catch (FileNotFoundException e) {  75  e.printStackTrace();  76         } catch (UnsupportedEncodingException e) {  77  e.printStackTrace();  78         } catch (IOException e) {  79  e.printStackTrace();  80  }  81  }  82 
 83     public static boolean isLessThanUnsigned(long n1, long n2) {  84         return (n1 < n2) ^ ((n1 < 0) != (n2 < 0));  85  }  86 
 87     public void sort() {  88  t2.clear();  89  t3.clear();  90  t4.clear();  91         for (SimHashData simHash : t1)  92  {  93             long t = Long.rotateLeft(simHash.q, 16);  94             t2.add(new SimHashData(t, simHash.no));  95 
 96             t = Long.rotateLeft(t, 16);  97             t3.add(new SimHashData(t, simHash.no));  98 
 99             t = Long.rotateLeft(t, 16); 100             t4.add(new SimHashData(t, simHash.no)); 101  } 102 
103  Collections.sort(t1, comp); 104  Collections.sort(t2, comp); 105  Collections.sort(t3, comp); 106  Collections.sort(t4, comp); 107  } 108 
109     public boolean contains(SimHashData key) { 110         int low = 0; 111         int high = t1.size() - 1; 112 
113         while (low <= high) { 114             int mid = (low + high) >>> 1; 115             SimHashData midVal = t1.get(mid); 116             int cmp = comp.compare(midVal, key); 117 
118             if (cmp < 0) 119                 low = mid + 1; 120             else if (cmp > 0) 121                 high = mid - 1; 122             else
123                 return true; // key found
124  } 125         return false; // key not found
126  } 127 
128     /**
129  * probe exact match 130  * 131  * @param t 132  * @return
133      */
134     public Span probe(ArrayList<SimHashData> t, long key) { 135         // System.out.println("t:"+t.size());
136         int low = 0; 137         int high = t.size() - 1; 138 
139         while (low <= high) { 140             int mid = (low + high) >>> 1; 141             Long midVal = t.get(mid).q; 142             int cmp = compHigh.compare(midVal, key); 143 
144             if (cmp < 0) 145                 low = mid + 1; 146             else if (cmp > 0) 147                 high = mid - 1; 148             else { 149                 // key found
150                 int matchStart = mid; 151                 int matchEnd = mid; 152                 while (matchStart > 0) { 153                     midVal = t.get(matchStart - 1).q; 154                     if (compHigh.compare(midVal, key) == 0) { 155                         --matchStart; 156                     } else { 157                         break; 158  } 159  } 160 
161                 while (matchEnd < (t.size() - 1)) { 162                     midVal = t.get(matchEnd + 1).q; 163                     if (compHigh.compare(midVal, key) == 0) { 164                         ++matchEnd; 165                     } else { 166                         break; 167  } 168  } 169                 return new Span(matchStart, matchEnd); 170  } 171  } 172         return null; // key not found
173  } 174 
175     /**
176  * get most 3 bit difference. 177  * 178  * @param fingerPrint 179  * @param k 180  * @return
181      */
182     public HashSet<SimHashData> getSimSet(long fingerPrint, int k) { 183 
184         HashSet<SimHashData> retAll = new HashSet<SimHashData>(); 185         Span s1 = probe(t1, fingerPrint); 186         if (s1 != null) { 187             // System.out.println("s1:"+s1);
188             ArrayList<SimHashData> ret1 = getSim(t1, s1, fingerPrint, k); 189  retAll.addAll(ret1); 190  } 191         long q2 = Long.rotateLeft(fingerPrint, 16); 192         Span s2 = probe(t2, q2); 193         if (s2 != null) { 194             // System.out.println("s2:"+s2);
195             ArrayList<SimHashData> ret2 = getSim(t2, s2, q2, k); 196             // rotateRight(ret2, 16);
197  retAll.addAll(ret2); 198  } 199 
200         long q3 = Long.rotateLeft(q2, 16); 201         Span s3 = probe(t3, q3); 202         if (s3 != null) { 203             // System.out.println("s3:"+s3);
204             ArrayList<SimHashData> ret3 = getSim(t3, s3, q3, k); 205             // rotateRight(ret3, 32);
206  retAll.addAll(ret3); 207  } 208 
209         long q4 = Long.rotateLeft(q3, 16); 210         Span s4 = probe(t4, q4); 211         if (s4 != null) { 212         // System.out.println("s4:" + s4);
213             ArrayList<SimHashData> ret4 = getSim(t4, s4, q4, k); 214             // rotateRight(ret4, 48);
215  retAll.addAll(ret4); 216  } 217         // System.out.println("o:"+Long.toBinaryString(fingerPrint));
218         return retAll; 219  } 220 
221     /**
222  * 从Span找出部分相等的,取出最多差k位的 223  * 224  * @param t 225  * @param s 226  * @param fingerPrint 227  * @param k 228  * @return
229      */
230     public ArrayList<SimHashData> getSim(ArrayList<SimHashData> t, Span s, 231             long fingerPrint, int k) { 232         ArrayList<SimHashData> result = new ArrayList<SimHashData>(); 233 
234         for (int i = s.getStart(); i <= s.getEnd(); ++i) { 235             SimHashData data = t.get(i); 236             long q = data.q; 237             if (BitUtil.diffIn(fingerPrint, q, k)) { 238  result.add(data); 239  } 240  } 241 
242         return result; 243  } 244 
245     public void addSimHash(String line) { 246         StringTokenizer st = new StringTokenizer(line, ":"); 247         String key = st.nextToken(); 248         long t = BitUtil.decodeLong(key); 249         long no = Long.parseLong(st.nextToken()); 250         // Long.parseLong(key,2); 251         // System.out.println(t);
252         t1.add(new SimHashData(t, no)); 253  } 254     
255     public void addSimHash(SimHashData key) { 256  t1.add(key); 257  } 258 
259     public void addInc(String key) { 260         long t = BitUtil.decodeLong(key); 261         // Long.parseLong(key,2); 262         // System.out.println(t);
263         SimHashData element = new SimHashData(t); 264         int insertionPoint = findInsertionPoint(t1, element); 265  t1.add(insertionPoint, element); 266 
267         long q2 = Long.rotateLeft(t, 16); 268         element = new SimHashData(q2); 269         insertionPoint = findInsertionPoint(t2, element); 270  t2.add(insertionPoint, element); 271 
272         long q3 = Long.rotateLeft(q2, 16); 273         element = new SimHashData(q3); 274         insertionPoint = findInsertionPoint(t3, element); 275  t3.add(insertionPoint, element); 276 
277         long q4 = Long.rotateLeft(q3, 16); 278         element = new SimHashData(q4); 279         insertionPoint = findInsertionPoint(t4, element); 280  t4.add(insertionPoint, element); 281  } 282 
283     /**
284  * Find the insertion point for the argument in a sorted list. 285  * 286  * @param element 287  * find this object's insertion point in the sorted list 288  * @return the index of the insertion point 289      */
290     int findInsertionPoint(ArrayList<SimHashData> list, SimHashData element) { 291         // Find the new element's insertion point.
292         int insertionPoint = Collections.binarySearch(list, element, comp); 293         if (insertionPoint < 0) { 294             insertionPoint = -(insertionPoint + 1); 295  } 296         return insertionPoint; 297  } 298 
299     public Iterator<SimHashData> iterator() { 300         return t1.iterator(); 301  } 302 
303     public void save(String fileName) { 304  BufferedWriter writer; 305         try { 306             writer = new BufferedWriter(new FileWriter(fileName)); 307             for (SimHashData simhash : t1) { 308                 //String str=BitUtil.encodeLong(simhash.q).substring(8);
309                 String str=BitUtil.encodeLong(simhash.q); 310  writer.write(str); 311 // writer.write(simhash.q+"");
312                 writer.write(":"); 313  writer.write(String.valueOf(simhash.no)); 314                 writer.write("\r\n"); 315  } 316  writer.flush(); 317  writer.close(); 318         } catch (Exception e) { 319  e.printStackTrace(); 320  } 321  } 322 
323     public void save(String fileName, String[] newStr) { 324  BufferedWriter writer; 325         try { 326             OutputStream out = new FileOutputStream(fileName, true); 327             OutputStreamWriter outWriter = new OutputStreamWriter(out); 328             writer = new BufferedWriter(outWriter); 329             for (int i = 0; i < newStr.length; i++) { 330                 if (newStr[i] != null) { 331  writer.append(newStr[i]); 332                     writer.append("\r\n"); 333                     if (i % 10000 == 0) 334                         System.out.println(i + ":" + newStr[i]); 335                 } else { 336                     break; 337  } 338  } 339  writer.flush(); 340  writer.close(); 341             System.out.println("结束!"); 342         } catch (Exception e) { 343  e.printStackTrace(); 344  } 345  } 346 
347     // 将数据读成SimHashData对象型集合
348     public ArrayList<SimHashData> readData(String path) { 349         ArrayList<SimHashData> list = new ArrayList<SimHashData>(); 350 
351         try { 352             InputStream input = new FileInputStream(new File(path)); 353             BufferedReader br = new BufferedReader(new InputStreamReader(input)); 354             String line = ""; 355             while ((line = br.readLine()) != null) { 356                 StringTokenizer st = new StringTokenizer(line, ":"); 357                 long key = BitUtil.decodeLong(st.nextToken()); 358                 long no = Long.parseLong(st.nextToken()); 359                 list.add(new SimHashData(key, no)); 360  } 361  br.close(); 362         } catch (FileNotFoundException e) { 363  e.printStackTrace(); 364         } catch (IOException e) { 365  e.printStackTrace(); 366  } 367         return list; 368  } 369 
370 
371 
372 }

介绍一篇论文:Google 的《Detecting Near-Duplicates for Web Crawling》,论文介绍了把SimHash用于爬虫抓取过程的网页去重。

最后,说一下分布式文档排重:利用分布式系统框架如hadoop等,使用MapReduce进行文档排重,提高了效率和节省了时间,这已经成为了常用的大数据量的排重方式

以上,是我对SimHash的一些总结,请大家指教!大家共勉

你可能感兴趣的:(hash)