Nutch爬虫解决页面相对路径问题

修改 LinkDb.java 的 map 方法，将页面中取到的 outlink（可能是相对路径）以来源页面的 URL 为基准解析成绝对路径：
import com.sun.org.apache.xml.internal.utils.URI.MalformedURIException;
import com.sun.org.apache.xml.internal.utils.URI;


    Inlinks inlinks = new Inlinks();
    URI baseUri = new URI(fromUrl);
    URI absoluteUri = null;

    for (int i = 0; i < outlinks.length; i++) {
      Outlink outlink = outlinks[i];
      String toUrl = outlink.getToUrl();
      if (ignoreInternalLinks) {
        String toHost = getHost(toUrl);
        if (toHost == null || toHost.equals(fromHost)) { // internal link
          continue;                               // skip it
        }
      }
      if (urlNormalizers != null) {
        try {
          toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url
        } catch (Exception e) {
          LOG.warn("Skipping " + toUrl + ":" + e);
          toUrl = null;
        }
      }
      if (toUrl != null && urlFilters != null) {
        try {
          toUrl = urlFilters.filter(toUrl); // filter the url
        } catch (Exception e) {
          LOG.warn("Skipping " + toUrl + ":" + e);
          toUrl = null;
        }
      }
      if (toUrl == null) continue;
      inlinks.clear();
      String anchor = outlink.getAnchor();        // truncate long anchors
      if (anchor.length() > maxAnchorLength) {
        anchor = anchor.substring(0, maxAnchorLength);
      }
      inlinks.add(new Inlink(fromUrl, anchor));   // collect inverted link
      try {
          absoluteUri = new URI(baseUri, toUrl);
      } catch (MalformedURIException e) {
          continue;
      }

    
//      output.collect(new Text(toUrl), inlinks);
      output.collect(new Text(absoluteUri.toString()), inlinks);
    }


还有一种方法：使用 JDK 自带的 java.net.URI 的 resolve 方法，把相对 URL 解析成绝对 URL：

import java.net.*;  
import java.io.*;  
/**
 * Small demo showing how a relative link found on a web page can be turned
 * into an absolute URL with {@link java.net.URI#resolve(String)}.
 */
public class Test {

  /**
   * Resolves a (possibly relative) link against the URL of the page it appears on.
   *
   * @param baseUrl absolute URL of the page that contains the link
   * @param relativeUrl the link as written in the page; may be relative or absolute
   * @return the absolute URL, as a string
   * @throws URISyntaxException if {@code baseUrl} is not a valid URI
   * @throws MalformedURLException if the resolved URI cannot be converted to a URL
   * @throws IllegalArgumentException if {@code relativeUrl} is not a valid URI
   *     reference, or the resolved URI is not absolute
   */
  public static String toAbsoluteUrl(String baseUrl, String relativeUrl)
      throws URISyntaxException, MalformedURLException {
    URI base = new URI(baseUrl);
    URI resolved = base.resolve(relativeUrl);
    // Going through URL checks that the result is absolute and has a known protocol.
    return resolved.toURL().toString();
  }

  /**
   * Resolves a sample relative image path against a sample page URL and prints
   * the resulting absolute URL to standard output.
   */
  public static void main(String args[]) throws Exception {
    String absoluteUrl = toAbsoluteUrl(
        "http://www.pep.com.cn/xe/jszx/tbjxzy/pepxe/pepsa/dzkb/200703/t20070308_303223.htm",
        "../../../pepwa/dzkb/200703/W020070308571116931595.jpg");
    System.out.println(absoluteUrl);
  }
}

你可能感兴趣的:(相对路径,Nutch爬虫,LinkDb)