昨天看Larbin源代码,觉得Larbin根本没有用bloom filter算法,他只hash了一次。不过他的按位保存的那段代码还是蛮精辟的。今天上网,发现了这位仁兄的博客,他也觉得Larbin没有用bloom Filter算法,而且他的blog对Larbin的重要之处都做了解释。现在转载过来。 原文出处是:http://quweiprotoss.blog.163.com/blog/static/4088288320103190243558/
在larbin里判断一个URL是否被抓取过,用的是bloom filter算法(至少网上的人这么说),但是我感觉它与《数学之美系列二十一-布隆过滤器(Bloom Filter)》中所介绍的算法有着很大的不同,因为larbin中只是简单地用了一次hash,它更像位图法。不过我的算法水平很一般,意见仅供参考。
/* Bit table used to remember which urls have already been seen.
 * A url is mapped to a single bit of `table` through url::hashCode().
 */
class hashTable {
private:
  ssize_t size;
  char *table; /* the bit array, 8 urls slots per byte */

public:
  /* constructor
   * create == true  : start with an empty (all zero) table
   * create == false : reload the table from "hashtable.bak"
   */
  hashTable(bool create);

  /* destructor */
  ~hashTable();

  /* save the hashTable in a file ("hashtable.bak") */
  void save();

  /* test if this url is already in the hashtable
   * return true if the bit of this url is set (url already recorded)
   * return false if the bit is not set
   * the table is not modified
   */
  bool test(url *U);

  /* set a url as present in the hashtable */
  void set(url *U);

  /* add a new url in the hashtable
   * return true if it has been added
   * return false if it has already been seen
   */
  bool testSet(url *U);
};
/* constructor
 *
 * Allocates the bit table (hashSize / 8 bytes).
 * create == true  : the table starts empty (all bits cleared).
 * create == false : the table is reloaded from "hashtable.bak"; if that file
 *                   cannot be opened we restart from an empty table, if it
 *                   can be opened but not read completely we exit(1).
 *
 * NOTE(review): hashSize / 8 truncates; this presumably relies on hashSize
 * being a multiple of 8 -- confirm where hashSize is defined.
 * NOTE(review): the `size` member is not initialised here -- confirm it is
 * unused or set elsewhere.
 */
hashTable::hashTable(bool create) {
  const ssize_t total = hashSize / 8;
  table = new char[total];

  if (create) {
    memset(table, 0, total);
    return;
  }

  int fds = open("hashtable.bak", O_RDONLY);
  if (fds < 0) {
    cerr << "Cannot find hashtable.bak, restart from scratch\n";
    memset(table, 0, total);
    return;
  }

  /* read() may return fewer bytes than asked for: loop until we have all */
  ssize_t sr = 0;
  while (sr < total) {
    ssize_t tmp = read(fds, table + sr, total - sr);
    if (tmp > 0) {
      sr += tmp;
    } else if (tmp == 0) {
      /* end of file before the table is complete: errno is meaningless
       * here, so do not print strerror(errno) */
      cerr << "Cannot read hashtable.bak : unexpected end of file" << endl;
      exit(1);
    } else if (errno != EINTR) {
      cerr << "Cannot read hashtable.bak : " << strerror(errno) << endl;
      exit(1);
    }
    /* tmp < 0 with EINTR: interrupted before any data was read, retry */
  }
  close(fds);
}
/* save the hashTable in a file */ void hashTable::save() { rename("hashtable.bak", "hashtable.old"); int fds = creat("hashtable.bak", 00600); if (fds >= 0) { ecrireBuff(fds, table, hashSize / 8); close(fds); } unlink("hashtable.old"); }
/* Look up a url without modifying the table.
 * The hash code selects one bit: byte number code / 8, bit number code % 8.
 * return true if that bit is set
 * return false if that bit is clear
 */
bool hashTable::test(url *U) {
  const unsigned int code = U->hashCode();
  const unsigned int mask = 1u << (code & 7); /* bit inside the byte */
  return (table[code >> 3] & mask) != 0;      /* code >> 3 == byte index */
}
这里先计算得到URL的hash code,然后计算它落在哪个字节上,再看它在该字节的哪个bit上,最后判断这个bit是否已经被置为1。
/* return a hashcode for this url
 *
 * Starts from the port, then folds in every character of host followed by
 * every character of file with h = 31 * h + c. The result is reduced
 * modulo hashSize, so it is always smaller than hashSize.
 */
uint url::hashCode() {
  unsigned int h = port;
  for (unsigned int k = 0; host[k] != 0; ++k) {
    h = 31 * h + host[k];
  }
  for (unsigned int k = 0; file[k] != 0; ++k) {
    h = 31 * h + file[k];
  }
  return h % hashSize;
}
代码在utils/url.c。这里是将host name和后面部分做hash,nutch里算这个hash code值的时候,是反过来算的,因为这样同一host的url的hashcode的值就会有更大的差异,是不是会更好些呢?这里乘31是经典做法,java中也是这样的。
/* Mark a url as present: set the bit selected by its hash code
 * (byte number code / 8, bit number code % 8).
 */
void hashTable::set(url *U) {
  const unsigned int code = U->hashCode();
  const unsigned int mask = 1u << (code & 7); /* bit inside the byte */
  table[code >> 3] |= mask;                   /* code >> 3 == byte index */
}
/** check if an url is already known
 * if not send it
 * if it is already known, the url object is deleted
 * @param u the url to check
 */
void check(url *u);

/** Decide whether an url is worth fetching, from its host and file name
 * Rejects hosts outside the configured domains (when a domain list is set)
 * and files ending with a forbidden extension
 * @param host the host name of the url
 * @param file the file part of the url
 * @return true if it might be interesting, false otherwise
 */
bool filter1(char *host, char *file);
/** Record an url in the seen-table and queue it if it is new
 * A new url is handed to one of the url queues; a duplicate is counted
 * and deleted.
 * @param u the url to check
 */
void check(url *u) {
  if (!global::seen->testSet(u)) {
    // This url has already been seen
    answers(urlDup);
    delete u;
    return;
  }

  hashUrls();  // stat
  // where should this link go ?
#ifdef SPECIFICSEARCH
  const bool privileged =
      privilegedExts[0] != NULL && matchPrivExt(u->getFile());
  if (privileged) {
    interestingExtension();
    global::URLsPriority->put(u);
  } else {
    global::URLsDisk->put(u);
  }
#else // not a SPECIFICSEARCH
  global::URLsDisk->put(u);
#endif
}
/** Decide whether an url is worth fetching, from its host and file name
 * 1. if a domain list is configured, the host must match at least one entry
 * 2. files ending with "html", "htm" or '/' are always accepted
 * 3. files ending with a forbidden extension are rejected
 * 4. everything else is accepted
 * @param host the host name of the url
 * @param file the file part of the url (may be empty)
 * @return true if it might be interesting, false otherwise
 */
bool filter1(char *host, char *file) {
  int i = 0;
  if (global::domains != NULL) {
    bool ok = false;
    while ((*global::domains)[i] != NULL) {
      ok = ok || endWith((*global::domains)[i], host);
      i++;
    }
    if (!ok) {
      return false;
    }
  }

  i = 0;
  int l = strlen(file);
  // l > 0 : never look at file[-1] when the file part is empty
  if (endWithIgnoreCase("html", file, l)
      || (l > 0 && file[l - 1] == '/')
      || endWithIgnoreCase("htm", file, l)) {
    return true;
  }
  while (global::forbExt[i] != NULL) {
    if (endWithIgnoreCase(global::forbExt[i], file, l)) {
      return false;
    }
    i++;
  }
  return true;
}
/** Decide whether an url is worth fetching, from its host and file name
 * Same logic as the plain version; the domain test is written with explicit
 * parentheses to make the "keep ok once it is true" intent visible.
 * @param host the host name of the url
 * @param file the file part of the url (may be empty)
 * @return true if it might be interesting, false otherwise
 */
bool filter1(char *host, char *file) {
  int i = 0;
  if (global::domains != NULL) {
    bool ok = false;
    while ((*global::domains)[i] != NULL) {
      // once ok is true, || short-circuits and endWith is no longer called
      ok = ( ok || endWith((*global::domains)[i], host) );
      i++;
    }
    if (!ok) {
      return false;
    }
  }

  i = 0;
  int l = strlen(file);
  // html / htm pages and directories are always kept;
  // l > 0 guards the file[l - 1] access against an empty file part
  if (endWithIgnoreCase("html", file, l)
      || (l > 0 && file[l - 1] == '/')
      || endWithIgnoreCase("htm", file, l)) {
    return true;
  }
  while (global::forbExt[i] != NULL) {
    if (endWithIgnoreCase(global::forbExt[i], file, l)) {
      return false;
    }
    i++;
  }
  return true;
}
// Variant of the domain loop of filter1.
// NOTE(review): this returns false as soon as endWith() reports a match for
// one of the listed domains, whereas filter1 returns false only when NO
// listed domain matches -- i.e. this form presumably treats the list as
// domains to exclude rather than domains to keep; confirm the intent.
while ((*global::domains)[i] != NULL) {
  if( endWith((*global::domains)[i], host) ) return false;
  i++;
}
# Do you want to limit your search to a specific domain ?
# if yes, uncomment the following line
#limitToDomain .fr .dk .uk end

# What are the extensions you surely don't want
# never forbid .html, .htm and so on : larbin needs them
forbiddenExtensions
.tar .gz .tgz .zip .Z .rpm .deb
.ps .dvi .pdf
.png .jpg .jpeg .bmp .smi .tiff .gif
.mov .avi .mpeg .mpg .mp3 .qt .wav .ram .rm
.jar .java .class .diff
.doc .xls .ppt .mdb .rtf .exe .pps .so .psd
end