package com.test;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Stores a {@code long[]} compactly: the first element is written as 8 raw
 * big-endian bytes, every following element is written as the difference to its
 * predecessor, encoded as a variable-length integer (7 payload bits per byte,
 * least-significant group first, high bit set on every byte except the last).
 */
public class DetaCompress {

    /**
     * Converts a long into its 8-byte big-endian representation.
     *
     * @param n the value to convert
     * @return a new 8-element array, most significant byte at index 0
     */
    public static byte[] longToBytes(long n) {
        byte[] buf = new byte[8];
        for (int i = buf.length - 1; i >= 0; i--) {
            buf[i] = (byte) (n & 0xFF); // lowest 8 bits go to the current (rightmost free) slot
            n >>>= 8;
        }
        return buf;
    }

    /**
     * Writes a long as a variable-length integer.
     *
     * <p>Each output byte carries 7 bits of the value, lowest bits first; the high
     * bit of a byte signals that more bytes follow. A negative value has its top bit
     * set and therefore always takes 10 bytes.
     *
     * @param i   the value to encode
     * @param dos the stream to write to
     * @throws IOException if the underlying stream fails
     */
    public static void writeVLong(long i, BufferedOutputStream dos)
            throws IOException {
        while ((i & ~0x7FL) != 0) {
            dos.write((byte) ((i & 0x7F) | 0x80)); // low 7 bits plus continuation flag
            i >>>= 7;
        }
        dos.write((byte) i); // final byte, continuation flag clear
    }

    /**
     * Reads a value previously written by {@link #writeVLong}.
     *
     * <p>The value is accumulated in a {@code long} with a {@code long} shift, so
     * encodings longer than four bytes decode correctly. Truncated input makes
     * {@code readByte} throw {@link java.io.EOFException} instead of spinning.
     *
     * @param dis the stream to read from
     * @return the decoded value
     * @throws IOException if the stream ends early, fails, or the encoding is longer
     *                     than the 10 bytes a 64-bit value can need
     */
    static long readVLong(DataInputStream dis) throws IOException {
        byte b = dis.readByte();
        long value = b & 0x7FL;
        // Every further byte contributes 7 more bits, i.e. is scaled by another 2^7.
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
            if (shift > 63) {
                throw new IOException("Malformed variable-length long: more than 10 bytes");
            }
            b = dis.readByte();
            value |= (b & 0x7FL) << shift;
        }
        return value;
    }

    /**
     * Writes the array to {@code fileName} in the delta/variable-length format
     * described on the class.
     *
     * <p>I/O failures are reported with {@code printStackTrace()} and yield 0. A
     * {@code null} or empty array writes nothing and also yields 0.
     *
     * @param simHashSet the values to store
     * @param fileName   the path of the file to create or overwrite
     * @return the number of values written, or 0 if nothing was written
     */
    static int write(long[] simHashSet, String fileName) {
        if (simHashSet == null || simHashSet.length == 0) {
            return 0;
        }
        BufferedOutputStream dos = null;
        try {
            dos = new BufferedOutputStream(new FileOutputStream(fileName));
            dos.write(longToBytes(simHashSet[0])); // first value is stored verbatim
            for (int i = 1; i < simHashSet.length; i++) {
                // Only the difference to the previous element is stored.
                writeVLong(simHashSet[i] - simHashSet[i - 1], dos);
            }
            dos.flush(); // surface buffered write errors before reporting success
            return simHashSet.length;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(dos);
        }
        return 0;
    }

    /**
     * Reads {@code len} values from a file produced by {@link #write}.
     *
     * <p>I/O failures (including a file that holds fewer than {@code len} values)
     * are reported with {@code printStackTrace()} and yield {@code null}.
     *
     * @param len      the number of values to read, normally the value returned by
     *                 {@link #write}
     * @param fileName the path of the file to read
     * @return the decoded array; an empty array when {@code len <= 0}; {@code null}
     *         on I/O failure
     */
    static long[] read(int len, String fileName) {
        if (len <= 0) {
            return new long[0];
        }
        DataInputStream dis = null;
        try {
            dis = new DataInputStream(new BufferedInputStream(
                    new FileInputStream(fileName)));
            long[] simHashSet = new long[len];
            simHashSet[0] = dis.readLong(); // first value was stored verbatim
            for (int i = 1; i < len; i++) {
                // Undo the delta encoding: value = previous value + stored difference.
                simHashSet[i] = simHashSet[i - 1] + readVLong(dis);
            }
            return simHashSet;
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(dis);
        }
        return null;
    }

    /** Closes the resource if it is non-null, reporting (not propagating) a failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
1 package com.lietu.simhash; 2
3 import java.io.BufferedReader; 4 import java.io.BufferedWriter; 5 import java.io.File; 6 import java.io.FileInputStream; 7 import java.io.FileNotFoundException; 8 import java.io.FileOutputStream; 9 import java.io.FileWriter; 10 import java.io.IOException; 11 import java.io.InputStream; 12 import java.io.InputStreamReader; 13 import java.io.OutputStream; 14 import java.io.OutputStreamWriter; 15 import java.io.UnsupportedEncodingException; 16 import java.util.ArrayList; 17 import java.util.Collections; 18 import java.util.Comparator; 19 import java.util.HashMap; 20 import java.util.HashSet; 21 import java.util.Iterator; 22 import java.util.StringTokenizer; 23 import java.util.Map.Entry; 24
25 /**
26 * 64位分四块,最多找出有3位差别的simhash 27 * 28 * @author lg 29 * 30 */
31 // TODO: 保存排序后的中间状态
32 public class SimHashSet4 implements Iterable<SimHashData> { 33 ArrayList<SimHashData> t1 = new ArrayList<SimHashData>(); 34 ArrayList<SimHashData> t2 = new ArrayList<SimHashData>(); 35 ArrayList<SimHashData> t3 = new ArrayList<SimHashData>(); 36 ArrayList<SimHashData> t4 = new ArrayList<SimHashData>(); 37
38 public ArrayList<SimHashData> getT1(){ 39 return t1; 40 } 41 static Comparator<SimHashData> comp = new Comparator<SimHashData>() { 42 public int compare(SimHashData o1, SimHashData o2) { 43 if (o1.q == o2.q) 44 return 0; 45 return (isLessThanUnsigned(o1.q, o2.q)) ? 1 : -1; 46 } 47 }; // 比较无符号64位
48 static Comparator<Long> compHigh = new Comparator<Long>() { 49 public int compare(Long o1, Long o2) { 50 o1 |= 0xFFFFFFFFFFFFL; 51 o2 |= 0xFFFFFFFFFFFFL; 52 // System.out.println(Long.toBinaryString(o1)); 53 // System.out.println(Long.toBinaryString(o2)); 54 // System.out.println((o1 == o2));
55 if (o1.equals(o2)) 56 return 0; 57 return (isLessThanUnsigned(o1, o2)) ? 1 : -1; 58 } 59 }; // 比较无符号64位中的高16位
60
61 public void load(String fileName) { 62 String line = null; 63
64 try { 65 InputStream is = new FileInputStream(new File(fileName)); 66
67 BufferedReader br = new BufferedReader(new InputStreamReader(is)); 68
69 while ((line = br.readLine()) != null) { 70 addSimHash(line.trim()); 71 } 72 br.close(); 73
74 } catch (FileNotFoundException e) { 75 e.printStackTrace(); 76 } catch (UnsupportedEncodingException e) { 77 e.printStackTrace(); 78 } catch (IOException e) { 79 e.printStackTrace(); 80 } 81 } 82
83 public static boolean isLessThanUnsigned(long n1, long n2) { 84 return (n1 < n2) ^ ((n1 < 0) != (n2 < 0)); 85 } 86
87 public void sort() { 88 t2.clear(); 89 t3.clear(); 90 t4.clear(); 91 for (SimHashData simHash : t1) 92 { 93 long t = Long.rotateLeft(simHash.q, 16); 94 t2.add(new SimHashData(t, simHash.no)); 95
96 t = Long.rotateLeft(t, 16); 97 t3.add(new SimHashData(t, simHash.no)); 98
99 t = Long.rotateLeft(t, 16); 100 t4.add(new SimHashData(t, simHash.no)); 101 } 102
103 Collections.sort(t1, comp); 104 Collections.sort(t2, comp); 105 Collections.sort(t3, comp); 106 Collections.sort(t4, comp); 107 } 108
109 public boolean contains(SimHashData key) { 110 int low = 0; 111 int high = t1.size() - 1; 112
113 while (low <= high) { 114 int mid = (low + high) >>> 1; 115 SimHashData midVal = t1.get(mid); 116 int cmp = comp.compare(midVal, key); 117
118 if (cmp < 0) 119 low = mid + 1; 120 else if (cmp > 0) 121 high = mid - 1; 122 else
123 return true; // key found
124 } 125 return false; // key not found
126 } 127
128 /**
129 * probe exact match 130 * 131 * @param t 132 * @return
133 */
134 public Span probe(ArrayList<SimHashData> t, long key) { 135 // System.out.println("t:"+t.size());
136 int low = 0; 137 int high = t.size() - 1; 138
139 while (low <= high) { 140 int mid = (low + high) >>> 1; 141 Long midVal = t.get(mid).q; 142 int cmp = compHigh.compare(midVal, key); 143
144 if (cmp < 0) 145 low = mid + 1; 146 else if (cmp > 0) 147 high = mid - 1; 148 else { 149 // key found
150 int matchStart = mid; 151 int matchEnd = mid; 152 while (matchStart > 0) { 153 midVal = t.get(matchStart - 1).q; 154 if (compHigh.compare(midVal, key) == 0) { 155 --matchStart; 156 } else { 157 break; 158 } 159 } 160
161 while (matchEnd < (t.size() - 1)) { 162 midVal = t.get(matchEnd + 1).q; 163 if (compHigh.compare(midVal, key) == 0) { 164 ++matchEnd; 165 } else { 166 break; 167 } 168 } 169 return new Span(matchStart, matchEnd); 170 } 171 } 172 return null; // key not found
173 } 174
175 /**
176 * get most 3 bit difference. 177 * 178 * @param fingerPrint 179 * @param k 180 * @return
181 */
182 public HashSet<SimHashData> getSimSet(long fingerPrint, int k) { 183
184 HashSet<SimHashData> retAll = new HashSet<SimHashData>(); 185 Span s1 = probe(t1, fingerPrint); 186 if (s1 != null) { 187 // System.out.println("s1:"+s1);
188 ArrayList<SimHashData> ret1 = getSim(t1, s1, fingerPrint, k); 189 retAll.addAll(ret1); 190 } 191 long q2 = Long.rotateLeft(fingerPrint, 16); 192 Span s2 = probe(t2, q2); 193 if (s2 != null) { 194 // System.out.println("s2:"+s2);
195 ArrayList<SimHashData> ret2 = getSim(t2, s2, q2, k); 196 // rotateRight(ret2, 16);
197 retAll.addAll(ret2); 198 } 199
200 long q3 = Long.rotateLeft(q2, 16); 201 Span s3 = probe(t3, q3); 202 if (s3 != null) { 203 // System.out.println("s3:"+s3);
204 ArrayList<SimHashData> ret3 = getSim(t3, s3, q3, k); 205 // rotateRight(ret3, 32);
206 retAll.addAll(ret3); 207 } 208
209 long q4 = Long.rotateLeft(q3, 16); 210 Span s4 = probe(t4, q4); 211 if (s4 != null) { 212 // System.out.println("s4:" + s4);
213 ArrayList<SimHashData> ret4 = getSim(t4, s4, q4, k); 214 // rotateRight(ret4, 48);
215 retAll.addAll(ret4); 216 } 217 // System.out.println("o:"+Long.toBinaryString(fingerPrint));
218 return retAll; 219 } 220
221 /**
222 * 从Span找出部分相等的,取出最多差k位的 223 * 224 * @param t 225 * @param s 226 * @param fingerPrint 227 * @param k 228 * @return
229 */
230 public ArrayList<SimHashData> getSim(ArrayList<SimHashData> t, Span s, 231 long fingerPrint, int k) { 232 ArrayList<SimHashData> result = new ArrayList<SimHashData>(); 233
234 for (int i = s.getStart(); i <= s.getEnd(); ++i) { 235 SimHashData data = t.get(i); 236 long q = data.q; 237 if (BitUtil.diffIn(fingerPrint, q, k)) { 238 result.add(data); 239 } 240 } 241
242 return result; 243 } 244
245 public void addSimHash(String line) { 246 StringTokenizer st = new StringTokenizer(line, ":"); 247 String key = st.nextToken(); 248 long t = BitUtil.decodeLong(key); 249 long no = Long.parseLong(st.nextToken()); 250 // Long.parseLong(key,2); 251 // System.out.println(t);
252 t1.add(new SimHashData(t, no)); 253 } 254
255 public void addSimHash(SimHashData key) { 256 t1.add(key); 257 } 258
259 public void addInc(String key) { 260 long t = BitUtil.decodeLong(key); 261 // Long.parseLong(key,2); 262 // System.out.println(t);
263 SimHashData element = new SimHashData(t); 264 int insertionPoint = findInsertionPoint(t1, element); 265 t1.add(insertionPoint, element); 266
267 long q2 = Long.rotateLeft(t, 16); 268 element = new SimHashData(q2); 269 insertionPoint = findInsertionPoint(t2, element); 270 t2.add(insertionPoint, element); 271
272 long q3 = Long.rotateLeft(q2, 16); 273 element = new SimHashData(q3); 274 insertionPoint = findInsertionPoint(t3, element); 275 t3.add(insertionPoint, element); 276
277 long q4 = Long.rotateLeft(q3, 16); 278 element = new SimHashData(q4); 279 insertionPoint = findInsertionPoint(t4, element); 280 t4.add(insertionPoint, element); 281 } 282
283 /**
284 * Find the insertion point for the argument in a sorted list. 285 * 286 * @param element 287 * find this object's insertion point in the sorted list 288 * @return the index of the insertion point 289 */
290 int findInsertionPoint(ArrayList<SimHashData> list, SimHashData element) { 291 // Find the new element's insertion point.
292 int insertionPoint = Collections.binarySearch(list, element, comp); 293 if (insertionPoint < 0) { 294 insertionPoint = -(insertionPoint + 1); 295 } 296 return insertionPoint; 297 } 298
299 public Iterator<SimHashData> iterator() { 300 return t1.iterator(); 301 } 302
303 public void save(String fileName) { 304 BufferedWriter writer; 305 try { 306 writer = new BufferedWriter(new FileWriter(fileName)); 307 for (SimHashData simhash : t1) { 308 //String str=BitUtil.encodeLong(simhash.q).substring(8);
309 String str=BitUtil.encodeLong(simhash.q); 310 writer.write(str); 311 // writer.write(simhash.q+"");
312 writer.write(":"); 313 writer.write(String.valueOf(simhash.no)); 314 writer.write("\r\n"); 315 } 316 writer.flush(); 317 writer.close(); 318 } catch (Exception e) { 319 e.printStackTrace(); 320 } 321 } 322
323 public void save(String fileName, String[] newStr) { 324 BufferedWriter writer; 325 try { 326 OutputStream out = new FileOutputStream(fileName, true); 327 OutputStreamWriter outWriter = new OutputStreamWriter(out); 328 writer = new BufferedWriter(outWriter); 329 for (int i = 0; i < newStr.length; i++) { 330 if (newStr[i] != null) { 331 writer.append(newStr[i]); 332 writer.append("\r\n"); 333 if (i % 10000 == 0) 334 System.out.println(i + ":" + newStr[i]); 335 } else { 336 break; 337 } 338 } 339 writer.flush(); 340 writer.close(); 341 System.out.println("结束!"); 342 } catch (Exception e) { 343 e.printStackTrace(); 344 } 345 } 346
347 // 将数据读成SimHashData对象型集合
348 public ArrayList<SimHashData> readData(String path) { 349 ArrayList<SimHashData> list = new ArrayList<SimHashData>(); 350
351 try { 352 InputStream input = new FileInputStream(new File(path)); 353 BufferedReader br = new BufferedReader(new InputStreamReader(input)); 354 String line = ""; 355 while ((line = br.readLine()) != null) { 356 StringTokenizer st = new StringTokenizer(line, ":"); 357 long key = BitUtil.decodeLong(st.nextToken()); 358 long no = Long.parseLong(st.nextToken()); 359 list.add(new SimHashData(key, no)); 360 } 361 br.close(); 362 } catch (FileNotFoundException e) { 363 e.printStackTrace(); 364 } catch (IOException e) { 365 e.printStackTrace(); 366 } 367 return list; 368 } 369
370
371
372 }
介绍一篇论文:Google 的《Detecting Near-Duplicates for Web Crawling》。该论文介绍了如何把 SimHash 用于爬虫抓取过程中的网页去重。
最后,说一下分布式文档排重:利用 Hadoop 等分布式系统框架,使用 MapReduce 进行文档排重,既提高了效率,又节省了时间,这已经成为大数据量下常用的排重方式。
以上是我对 SimHash 的一些总结,请大家指教,与大家共勉!