Nutch 2.3.1 执行 update job(DbUpdaterJob)时因错误 URL 导致崩溃

原因可能是:从格式错误的 HTML 页面中解析出了不合法的外链 URL。
解决办法:在 DbUpdateMapper.java 的 map 方法中加一个 try-catch,遇到无法被 TableUtil.reverseUrl 处理的外链 URL 时记录日志并跳过:

 55  @Override
 56  public void map(String key, WebPage page, Context context)
 57      throws IOException, InterruptedException {
 58    if (Mark.GENERATE_MARK.checkMark(page) == null) {
 59      if (LOG.isDebugEnabled()) {
 60        LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
 61            + "; not generated yet");
 62      }
 63      return;
 64    }
 65
 66    String url = TableUtil.unreverseUrl(key);
 67
 68    scoreData.clear();
 69    Map outlinks = page.getOutlinks();
 70    if (outlinks != null) {
 71      for (Entry e : outlinks.entrySet()) {
 72        int depth = Integer.MAX_VALUE;
 73        CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
 74        if (depthUtf8 != null)
 75          depth = Integer.parseInt(depthUtf8.toString());
           // add here to filter error url
 76        try {
 77            String testUrl = TableUtil.reverseUrl(e.getKey().toString());
 78        } catch (MalformedURLException ex) {
 79            LOG.warn("dbupdate,error url:" + e.getKey().toString());
 80            continue;
 81        }
 82        scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue()
 83            .toString(), depth));
 84      }
 85    }
 86
 87    // TODO: Outlink filtering (i.e. "only keep the first n outlinks")
 88    try {
 89      scoringFilters.distributeScoreToOutlinks(url, page, scoreData,
 90          (outlinks == null ? 0 : outlinks.size()));
 91    } catch (ScoringFilterException e) {
 92      LOG.warn("Distributing score failed for URL: " + key + " exception:"
 93          + StringUtils.stringifyException(e));
 94    }
 95
 96    urlWithScore.setUrl(key);
 97    urlWithScore.setScore(Float.MAX_VALUE);
 98    pageWritable.setWebPage(page);
 99    nutchWritable.set(pageWritable);
100    context.write(urlWithScore, nutchWritable);
101
102    for (ScoreDatum scoreDatum : scoreData) {
103      String reversedOut = TableUtil.reverseUrl(scoreDatum.getUrl());
104      scoreDatum.setUrl(url);
105      urlWithScore.setUrl(reversedOut);
106      urlWithScore.setScore(scoreDatum.getScore());
107      nutchWritable.set(scoreDatum);
108      context.write(urlWithScore, nutchWritable);
109    }
110  }

你可能感兴趣的:(爬虫)