Heritrix: ELFHash multi-threaded crawling

1. Added org.archive.crawler.frontier.ELFHashQueueAssignmentPolicy.java

package org.archive.crawler.frontier;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;

public class ELFHashQueueAssignmentPolicy extends QueueAssignmentPolicy {

    private static final Logger logger = Logger
            .getLogger(ELFHashQueueAssignmentPolicy.class.getName());

    private static String DEFAULT_CLASS_KEY = "default...";

    private static final String DNS = "dns";

    /**
     * Map a candidate URI to a queue key. DNS URIs keep the host:port of the
     * URI that triggered them; every other URI is hashed into one of 100
     * buckets ("0".."99"), so a single host is spread over many queues.
     */
    @Override
    public String getClassKey(CrawlController controller, CandidateURI cauri) {
        String uri = cauri.getUURI().toString();
        String scheme = cauri.getUURI().getScheme();
        String candidate = null;

        try {
            if (scheme.equals(DNS)) {
                if (cauri.getVia() != null) {
                    // Special handling for DNS: treat as being
                    // of the same class as the triggering URI.
                    // When a URI includes a port, this ensures
                    // the DNS lookup goes atop the host:port
                    // queue that triggered it, rather than
                    // some other host queue.
                    UURI viaUuri = UURIFactory.getInstance(cauri.flattenVia());
                    candidate = viaUuri.getAuthorityMinusUserinfo();
                    // adopt scheme of triggering URI
                    scheme = viaUuri.getScheme();
                } else {
                    candidate = cauri.getUURI().getReferencedHost();
                }
            } else {
                // Hash the full URI and take it modulo 100: the queue key is
                // a bucket number rather than the hostname.
                long hash = ELFHash(uri);
                candidate = Long.toString(hash % 100);
            }

            if (candidate == null || candidate.length() == 0) {
                candidate = DEFAULT_CLASS_KEY;
            }
        } catch (URIException e) {
            logger.log(Level.INFO,
                    "unable to extract class key; using default", e);
            candidate = DEFAULT_CLASS_KEY;
        }

        return candidate.replace(':', '#');
    }

    /** Convenience overload for plain URI strings (used by main() below). */
    public String getClassKey(String uri) {
        long hash = ELFHash(uri);
        return Long.toString(hash % 100);
    }

    /** Classic ELF hash, constrained to a non-negative 31-bit value. */
    public static long ELFHash(String str) {
        long hash = 0;
        long x = 0;
        for (int i = 0; i < str.length(); i++) {
            hash = (hash << 4) + str.charAt(i);
            if ((x = hash & 0xF0000000L) != 0) {
                hash ^= (x >> 24);
                hash &= ~x;
            }
        }
        return (hash & 0x7FFFFFFF);
    }

    public static void main(String[] args) {
        ELFHashQueueAssignmentPolicy el = new ELFHashQueueAssignmentPolicy();
        String a = el.getClassKey("http://www.chinanews.com.cn/");
        System.out.println(a);
    }

}
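
To see the effect, compare a few URLs from the same host: the stock HostnameQueueAssignmentPolicy would put them all into one host queue, whereas the ELFHash key spreads them over up to 100 buckets ("0".."99", from hash % 100). A minimal sketch using the getClassKey(String) helper above (the paths are made-up examples):

public class ELFHashDemo {
    public static void main(String[] args) {
        ELFHashQueueAssignmentPolicy policy = new ELFHashQueueAssignmentPolicy();
        // All three URIs share the host www.chinanews.com.cn; the paths are
        // hypothetical and only serve to show how the keys are distributed.
        String[] uris = {
                "http://www.chinanews.com.cn/",
                "http://www.chinanews.com.cn/gn/news-1.shtml",
                "http://www.chinanews.com.cn/gj/news-2.shtml"
        };
        for (String uri : uris) {
            // Prints a bucket number between 0 and 99 for each URI.
            System.out.println(policy.getClassKey(uri) + "  <-  " + uri);
        }
    }
}

Because different URIs from the same host usually land in different buckets, the frontier can work that host with many queues, and therefore many threads, at once.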

2. Modified the AbstractFrontier constructor of org.archive.crawler.frontier.AbstractFrontier
        /*
         * Change made for this setup: ELFHashQueueAssignmentPolicy is placed
         * at the head of the default queue-assignment-policy list, replacing
         * HostnameQueueAssignmentPolicy as the default choice.
         */
        String queueStr = System.getProperty(AbstractFrontier.class.getName() +
                "." + ATTR_QUEUE_ASSIGNMENT_POLICY,
                ELFHashQueueAssignmentPolicy.class.getName() + " " +
                IPQueueAssignmentPolicy.class.getName() + " " +
                BucketQueueAssignmentPolicy.class.getName() + " " +
                SurtAuthorityQueueAssignmentPolicy.class.getName() + " " +
                TopmostAssignedSurtQueueAssignmentPolicy.class.getName());
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        /* ---------------------- end of change ---------------------- */

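The Pattern in that snippet is what later splits queueStr into individual class names, so the policy list may be separated by commas or by whitespace; that is why the value in heritrix.properties below is simply a space-separated list. A standalone sketch of that split, shortened to two class names:

import java.util.regex.Pattern;

public class PolicyListSplitDemo {
    public static void main(String[] args) {
        // Same separator pattern as in the AbstractFrontier constructor:
        // commas and/or any run of whitespace.
        Pattern p = Pattern.compile("\\s*,\\s*|\\s+");
        String queueStr = "org.archive.crawler.frontier.ELFHashQueueAssignmentPolicy "
                + "org.archive.crawler.frontier.HostnameQueueAssignmentPolicy";
        for (String className : p.split(queueStr)) {
            System.out.println(className);
        }
    }
}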

3. Modified heritrix.properties
The FRONTIER section was changed to:


#############################################################################
# FRONTIER
#############################################################################

# List here all queue assignment policies you'd have show as a
# queue-assignment-policy choice in AbstractFrontier derived Frontiers
# (e.g. BdbFrontier).
org.archive.crawler.frontier.AbstractFrontier.queue-assignment-policy = org.archive.crawler.frontier.ELFHashQueueAssignmentPolicy org.archive.crawler.frontier.IPQueueAssignmentPolicy org.archive.crawler.frontier.BucketQueueAssignmentPolicy org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy org.archive.crawler.frontier.TopmostAssignedSurtQueueAssignmentPolicy
org.archive.crawler.frontier.BdbFrontier.level = INFO

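The property value is nothing more than a whitespace-separated list of fully qualified class names, and a typo only surfaces once the frontier tries to instantiate the policy. A small, hypothetical sanity check (run with the Heritrix jars on the classpath) that each listed class can actually be loaded:

public class PolicyClasspathCheck {
    public static void main(String[] args) {
        String policies = "org.archive.crawler.frontier.ELFHashQueueAssignmentPolicy "
                + "org.archive.crawler.frontier.IPQueueAssignmentPolicy "
                + "org.archive.crawler.frontier.BucketQueueAssignmentPolicy "
                + "org.archive.crawler.frontier.SurtAuthorityQueueAssignmentPolicy "
                + "org.archive.crawler.frontier.TopmostAssignedSurtQueueAssignmentPolicy";
        for (String name : policies.split("\\s+")) {
            try {
                Class.forName(name);              // present on the classpath
                System.out.println("OK      " + name);
            } catch (ClassNotFoundException e) {  // typo or missing jar
                System.out.println("MISSING " + name);
            }
        }
    }
}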


The change only takes effect for a newly created job; an existing job keeps using the default HostnameQueueAssignmentPolicy.
Also, a crawl sometimes shuts down after only about 30 DNS lookups have been hashed out.
Workarounds: 1. add more seed (entry) URLs;
             2. recreate the job from the existing one ("Based on existing job").

Heritrix 1.14.4 assumes by default that a site is crawled per domain (one queue per hostname). Turning that into multiple threads per site causes some problems, for example the same DNS lookup being executed many times. It is recommended to use Heritrix 3, which supports this feature.
