网上的某些资料也说了,对Heritrix进行扩展定制,即可个性化地从网上爬取自己需要的内容,为建立垂直搜索做好准备。
这里主要扩展FrontierScheduler和Extractor两个class。由于是个demo,所以比较简单;这些代码仅仅是周日下午和今天晚上抽时间写的,故效率上没有过多考虑(比如使用了太多的正则、反射得到的方法构造后应该缓存等),仅供参考。
扩展如下:
class FrontierSchedulerFor360Buy extends FrontierScheduler
//至于为什么要扩展schedule大家应该都知道:CandidateURI表示待选的链接,那么意思就是对将要进入候选的链接进行业务上的过滤
注意:
这里有个问题,就是如果需要过滤符合某个格式的链接,但是这个格式的链接不是全部过滤,而且也找不到一定的业务规则去过滤,那么就不能在这里过滤了,后面在Extractor中讲解
/**
 * Applies the 360buy business filter to a candidate URI and forwards it to the
 * frontier only when it passes.
 *
 * <p>A URI is accepted when its string form
 * <ul>
 *   <li>contains {@code dns:www.360buy.com}, or</li>
 *   <li>contains {@code http://www.360buy.com/robots.txt}, or</li>
 *   <li>contains {@code PATTERN_360_NOTEBOOK} as a literal substring, or</li>
 *   <li>fully matches the regex {@code PATTERN_360_NOTEBOOK_BRAND} and does not
 *       contain {@code 670-671-672-0}.</li>
 * </ul>
 * Every other URI is dropped silently.
 *
 * <p>The rules are evaluated as one decision so that a URI satisfying several
 * of them is handed to the frontier exactly once instead of once per matching rule.
 *
 * @param caUri the candidate URI to filter
 */
protected void schedule(CandidateURI caUri) {
    String url = caUri.toString();
    System.err.println(">>>++++++ " + url);

    boolean accepted = url.indexOf("dns:www.360buy.com") != -1
            || url.indexOf("http://www.360buy.com/robots.txt") != -1
            || url.indexOf(PATTERN_360_NOTEBOOK) != -1;

    if (!accepted && url.matches(this.PATTERN_360_NOTEBOOK_BRAND)) {
        // Brand listings are wanted, except those whose URL contains "670-671-672-0".
        accepted = url.indexOf("670-671-672-0") == -1;
    }

    if (!accepted) {
        return;
    }

    System.err.println(">>>>>>++++++ " + url);
    this.getController().getFrontier().schedule(caUri);
}
class ExtractorFor360Buy extends Extractor
这里扩展的话,基本是对crawlUri的解析,以及把其中一部分符合业务规则的链接加入到候选链接中(一旦加入候选链接就会被FrontierScheduler过滤,这就遇到了上面红色注意部分说到的问题)。这样我们需要直接将链接交由 BdbFrontier 处理。根据源码查看,在schedule候选链接CandidateURI的时候会调用getPathFromSeed(),而直接构造的CandidateURI没有设置pathFromSeed,所以需要通过反射设置pathFromSeed
以下为具体的异常:
java.lang.NullPointerException
at org.archive.crawler.datamodel.CandidateURI.getTransHops(CandidateURI.java:382)
at org.archive.crawler.frontier.AbstractFrontier.applySpecialHandling(AbstractFrontier.java:727)
at org.archive.crawler.frontier.WorkQueueFrontier.receive(WorkQueueFrontier.java:442)
at org.archive.crawler.util.SetBasedUriUniqFilter.add(SetBasedUriUniqFilter.java:90)
at org.archive.crawler.frontier.WorkQueueFrontier.schedule(WorkQueueFrontier.java:427)
at com.awen.heritrix.ExtractorFor360Buy.extract(ExtractorFor360Buy.java:124)
at org.archive.crawler.extractor.Extractor.innerProcess(Extractor.java:67)
at org.archive.crawler.framework.Processor.process(Processor.java:109)
at org.archive.crawler.framework.ToeThread.processCrawlUri(ToeThread.java:306)
at org.archive.crawler.framework.ToeThread.run(ToeThread.java:154)
/**
 * Extracts product data and follow-up links from a fetched 360buy page.
 *
 * <p>When the URI fully matches {@code PATTERN_360_NOTEBOOK_BRAND_ABSOLUTE} the page is
 * treated as a notebook brand listing: every product entry is parsed, its detail and
 * image links are scheduled on the frontier, and its data is written to a text file.
 * After that (and for every other page as well) the recorded response body is scanned
 * for anchors that match the notebook / notebook-brand patterns.
 *
 * @param curi the URI whose fetched content is to be processed
 */
protected void extract(CrawlURI curi) {
    String currentUrl = curi.toString();
    if (currentUrl.matches(this.PATTERN_360_NOTEBOOK_BRAND_ABSOLUTE)) {
        extractBrandPage(curi, currentUrl);
    }
    // The anchor scan is not skipped for brand listings; it runs for every page.
    extractNotebookLinks(curi);
}

/**
 * Parses a notebook brand listing and processes each product entry found on it.
 * A failure while processing one entry is reported and does not stop the others.
 */
private void extractBrandPage(CrawlURI curi, String currentUrl) {
    NodeFilter reputationFilter = new AndFilter(new TagNameFilter("span"),
            new HasAttributeFilter("class", "reputation"));
    // A product entry is a node whose child has a <span class="reputation"> child;
    // it carries the basic product information: title, price, review link, image.
    NodeFilter productFilter = new HasChildFilter(new HasChildFilter(reputationFilter));

    // The patterns do not change between entries, so compile them once per page.
    Pattern productPattern = Pattern.compile(this.PATTERN_360_PRODUCT, Pattern.CASE_INSENSITIVE);
    Pattern imagePattern = Pattern.compile(this.PATTERN_360_PRODUCT_IMG, Pattern.CASE_INSENSITIVE);
    Pattern reviewPattern = Pattern.compile(this.PATTERN_360_PRODUCT_EVALUATE, Pattern.CASE_INSENSITIVE);

    try {
        // NOTE(review): `parser` is declared outside this method; setURL() appears to
        // fetch the page again rather than reuse the recorded response - confirm.
        parser.setURL(currentUrl);

        // Brand code: the number that follows "670-671-672-" in a URL of the form
        // http://www.360buy.com/products/670-671-672-<brand>-0-0-0-0-0-0-1-1-<page>.html
        String brandNo = currentUrl.substring(
                currentUrl.indexOf("670-671-672-") + 12,
                currentUrl.indexOf("-", "http://www.360buy.com/products/670-671-672-".length()));

        NodeList products = parser.parse(productFilter);
        for (int i = 0; i < products.size(); ++i) {
            try {
                extractProduct(curi, brandNo, products.elementAt(i).toHtml(),
                        productPattern, imagePattern, reviewPattern);
            } catch (ParserException e) {
                e.printStackTrace();
            } catch (URIException e) {
                e.printStackTrace();
            } catch (SecurityException e) {
                e.printStackTrace();
            } catch (NoSuchMethodException e) {
                e.printStackTrace();
            } catch (IllegalArgumentException e) {
                e.printStackTrace();
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            } catch (InvocationTargetException e) {
                e.printStackTrace();
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}

/**
 * Processes the HTML of one product entry: schedules its detail and image links,
 * reads the review link, price and name, and writes the collected values to a file.
 */
private void extractProduct(CrawlURI curi, String brandNo, String content,
        Pattern productPattern, Pattern imagePattern, Pattern reviewPattern)
        throws ParserException, URIException, NoSuchMethodException,
        IllegalAccessException, InvocationTargetException {
    String detail = "";
    String img = "";
    String review = "";

    // Product detail link.
    Matcher productMatcher = productPattern.matcher(content);
    if (productMatcher.find()) {
        String productUrl = productMatcher.group();
        scheduleDirectly(productUrl);
        detail = productUrl.substring(productUrl.lastIndexOf("/") + 1, productUrl.indexOf(".html"));
        System.err.println(detail);
    }

    // Product image link.
    Matcher imageMatcher = imagePattern.matcher(content);
    if (imageMatcher.find()) {
        String imageUrl = imageMatcher.group();
        this.addLindFromString(curi, imageUrl, "", Link.EMBED_HOP);
        img = imageUrl.substring(imageUrl.lastIndexOf("/") + 1, imageUrl.indexOf(".jpg"));
        System.err.println(img);
        // Scheduled with the same pathFromSeed workaround as the detail link; a bare
        // CandidateURI here is what produced the getTransHops() NullPointerException.
        scheduleDirectly(imageUrl);
    }

    // Product review link: recorded only, not scheduled.
    Matcher reviewMatcher = reviewPattern.matcher(content);
    if (reviewMatcher.find()) {
        review = reviewMatcher.group();
    }

    String price = firstNodeText(content, new AndFilter(new TagNameFilter("div"),
            new HasAttributeFilter("class", "p-price")));
    System.err.println("price = " + price);

    String remark = firstNodeText(content, new AndFilter(new TagNameFilter("div"),
            new HasAttributeFilter("class", "p-name")));
    System.err.println("name = " + remark);

    writeProductFile(brandNo, detail, img, price, review, remark);
}

/**
 * Hands a URL straight to the frontier.
 *
 * <p>The path-from-seed of the new {@code CandidateURI} is set through reflection
 * (the setter is not accessible from here) before scheduling, because scheduling a
 * {@code CandidateURI} without it fails with a NullPointerException in
 * {@code CandidateURI.getTransHops()}. The URL itself is used as the value.
 */
private void scheduleDirectly(String url) throws URIException, NoSuchMethodException,
        IllegalAccessException, InvocationTargetException {
    CandidateURI candidate = new CandidateURI(UURIFactory.getInstance(url));
    Method setter = candidate.getClass().getDeclaredMethod("setPathFromSeed", String.class);
    setter.setAccessible(true);
    try {
        setter.invoke(candidate, url);
    } finally {
        setter.setAccessible(false);
    }
    this.getController().getFrontier().schedule(candidate);
}

/**
 * Returns the text of the first node in {@code html} accepted by {@code filter},
 * or an empty string when no node matches.
 */
private String firstNodeText(String html, NodeFilter filter) throws ParserException {
    Parser fragmentParser = new Parser();
    fragmentParser.setInputHTML(html);
    NodeList matches = fragmentParser.parse(filter);
    if (matches.size() == 0 || matches.elementAt(0) == null) {
        return "";
    }
    TextExtractingVisitor visitor = new TextExtractingVisitor();
    new Parser(matches.elementAt(0).toHtml()).visitAllNodesWith(visitor);
    return visitor.getExtractedText();
}

/**
 * Writes one product record, one value per line, to
 * {@code /home/awen/360/<brandNo>_<detail>.txt}, replacing any existing file.
 * I/O failures are reported and otherwise ignored; the writer is always closed.
 */
private void writeProductFile(String brandNo, String detail, String img,
        String price, String review, String remark) {
    File file = new File("/home/awen/360/" + brandNo + "_" + detail + ".txt");
    BufferedWriter writer = null;
    try {
        writer = new BufferedWriter(new FileWriter(file));
        writer.write(brandNo + ls); // brand code
        writer.write(detail + ls);  // detail link suffix
        writer.write(img + ls);     // image link suffix
        writer.write(price + ls);   // price
        writer.write(review + ls);  // review link
        writer.write(remark + ls);  // description
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

/**
 * Scans the recorded response body for anchors and adds those matching the notebook
 * or notebook-brand patterns as navigation links of {@code curi}.
 */
private void extractNotebookLinks(CrawlURI curi) {
    HttpRecorder recorder = curi.getHttpRecorder();
    if (recorder == null) {
        return;
    }

    String content;
    ReplayCharSequence rcs = null;
    try {
        rcs = recorder.getReplayCharSequence();
        if (rcs == null) {
            return;
        }
        content = rcs.toString();
    } catch (IOException e) {
        e.printStackTrace();
        return;
    } finally {
        // The text has been copied into a String, so the sequence can be released now.
        if (rcs != null) {
            try {
                rcs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    Pattern notebookPattern = Pattern.compile(this.PATTERN_360_NOTEBOOK);
    Pattern brandPattern = Pattern.compile(this.PATTERN_360_NOTEBOOK_BRAND);
    Matcher anchors = Pattern.compile(this.PATTERN_A_HREF, Pattern.CASE_INSENSITIVE).matcher(content);
    while (anchors.find()) {
        String newUrl = anchors.group(2);
        if (newUrl == null) {
            // Group 2 did not take part in this match; nothing to test.
            continue;
        }
        // Notebook category link.
        if (notebookPattern.matcher(newUrl).matches()) {
            System.err.println("————————————————————————————————符合笔记本——————————————————————————");
            newUrl = "http://www.360buy.com/products" + newUrl;
            this.addLindFromString(curi, newUrl, "", Link.NAVLINK_HOP);
        }
        // Notebook brand link.
        // NOTE(review): after a category match this tests the already-prefixed URL,
        // not the raw href - kept as in the original; confirm that is intended.
        if (brandPattern.matcher(newUrl).matches()) {
            System.err.println("————————————————————————————————符合笔记本品牌——————————————————————————");
            newUrl = "http://www.360buy.com/products/" + newUrl;
            this.addLindFromString(curi, newUrl, "", Link.NAVLINK_HOP);
        }
    }
}
/**
 * Adds a link, resolved relative to the base of {@code curi}, to that URI's links.
 *
 * <p>Author's note: if the frontier scheduler also has to filter links of this
 * format but cannot tell which of them to drop, while the links still must be
 * crawled, they can be handed directly to the BdbFrontier instead of going
 * through this method.
 *
 * @param curi    the URI the link was found on
 * @param uri     the link to add
 * @param context the context passed along with the link
 * @param hopType the hop type passed along with the link
 */
private void addLindFromString(CrawlURI curi, String uri, CharSequence context, char hopType) {
    try {
        curi.createAndAddLinkRelativeToBase(uri, context, hopType);
    } catch (URIException invalidUri) {
        // A link that cannot be turned into a URI is reported and skipped.
        invalidUri.printStackTrace();
    }
}