用nutch-1.3抓取了大量的页面后,最后用solrindex索引到solr中,发现出现了如下错误:
LinkDb: finished at 2011-08-10 06:14:32, elapsed: 03:13:16
SolrIndexer: starting at 2011-08-10 06:14:32
java.io.IOException: Job failed!
SolrDeleteDuplicates: starting at 2011-08-10 09:37:59
SolrDeleteDuplicates: Solr url: http://10.42.31.96:880
SolrDeleteDuplicates: finished at 2011-08-10 09:41:38, elapsed: 00:03:38
crawl finished: test
查看logs\hadoop.log文件,错误提示如下:
2011-08-10 09:37:58,888 WARN mapred.LocalJobRunner - job_local_0020
org.apache.solr.common.SolrException: [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1021643, byte #2821736) java.lang.RuntimeException: [w
as class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1021643, byte #2821736) at com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(Exceptio
nUtil.java:18) at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:266) at org.apache.sol
r.handler.XMLLoader.processUpdate(XMLLoader.java:126) at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at org.apache.solr.handler.ContentStreamHandlerBase.handl
eRequestBody(ContentStreamHandlerBase.java:67) at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129) at org.apache.solr.core.SolrCore.
execute(SolrCore.java:1310) at org.rivu.web.servlet.RivuUpdateServlet.doPost(RivuUpdateServlet.java:49) at javax.servlet.http.HttpServlet.service(HttpServlet.java:710) at javax.servlet.http.HttpServlet.service(HttpServlet.java:803) at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:290) a
t org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206) at org.rivu.web.servlet.RivuFilter.doFilter(RivuFilter.java:294) at org.ap
ache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235) at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChai
n.java:206) at org.springframework.orm.hibernate3.support.OpenSessionInViewFilter.doFilterInternal(OpenSessionInViewFilter.java:198) at org.springframework.web.filter
.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:76) at org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235) at org.ap
ache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206) at org.springframework.web.filter.CharacterEncodingFilter.doFilterInternal(CharacterEncod
ingFilter.java:96) at org.springframework.web.filter.OncePerRequestFilter.doFilter(OncePerRequestFilter.java:76) at org.apache.catalina.core.ApplicationFilterChain.intern
alDoFilter(ApplicationFilterChain.java:235) at org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206) at org.apache.catalina.core.Stand
ardWrapperValve.invoke(StandardWrapperValve.java:233) at org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:175) at org.apache.catalina.core.Stand
ardHostValve.invoke(StandardHostValve.java:128) at org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102) at org.apache.catalina.core.Stand
ardEngineValve.invoke(StandardEngineValve.java:109) at org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:263) at org.apache.coyote.http11.Http11Process
or.process(Http11Processor.java:844) at org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:584) at org.apache.tomcat.util.net.JIo
Endpoint$Worker.run(JIoEndpoint.java:447) at java.lang.Thread.run(Thread.java:662) Caused by: java.io.CharConversionException: Invalid UTF-8 character 0xffff at char #1021
643, byte #2821736) at com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249) at com.ctc.wstx.i
o.MergedReader.read(MergedReader.java:101) at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84) at com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReader
Source.java:57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992) at com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java:462
8) at com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java:4126) at com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701) a
t com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3649) ... 33 more
solrindex的功能是将nutch抓取的页面内容及相关信息生成xml格式的文档,再通过update方式(参数为wt=javabin&version=2)提交到solr索引库中。从错误提示看,xml采用UTF-8字符集,而抓取的数据中存在0xffff等非法字符,导致solr在解析该xml时(XMLLoader.readDoc)抛出异常,索引失败。
通过google搜索,找到 https://issues.apache.org/jira/browse/NUTCH-1016 ,发现这是nutch-1.3的一个Bug。该页面提供了补丁NUTCH1016-1.4-4.patch,解决了这一问题。下载该补丁,并给SolrWriter.java打上该补丁即可。修改后的SolrWriter.java如下:
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.indexer.solr; import java.io.IOException; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Map.Entry; //Commons Logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.mapred.JobConf; import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.indexer.NutchField; import org.apache.nutch.indexer.NutchIndexWriter; import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.util.DateUtil; public class SolrWriter implements NutchIndexWriter { public static Logger LOG = LoggerFactory.getLogger(SolrWriter.class); private SolrServer solr; private SolrMappingReader solrMapping; private final List<SolrInputDocument> inputDocs = new ArrayList<SolrInputDocument>(); private int commitSize; public void open(JobConf job, String name) throws IOException { solr = new CommonsHttpSolrServer(job.get(SolrConstants.SERVER_URL)); commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000); solrMapping = 
SolrMappingReader.getInstance(job); } public void write(NutchDocument doc) throws IOException { final SolrInputDocument inputDoc = new SolrInputDocument(); for(final Entry<String, NutchField> e : doc) { for (final Object val : e.getValue().getValues()) { // normalise the string representation for a Date Object val2 = val; if (val instanceof Date){ val2 = DateUtil.getThreadLocalDateFormat().format(val); } if (e.getKey().equals("content")||e.getKey().equals("title")|| e.getKey().equals("anchor")||e.getKey().equals("url")) { val2 = stripNonCharCodepoints((String)val); } inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e.getValue().getWeight()); String sCopy = solrMapping.mapCopyKey(e.getKey()); if (sCopy != e.getKey()) { inputDoc.addField(sCopy, val); } } } inputDoc.setDocumentBoost(doc.getWeight()); inputDocs.add(inputDoc); if (inputDocs.size() >= commitSize) { try { LOG.info("Adding " + Integer.toString(inputDocs.size()) + " documents"); solr.add(inputDocs); } catch (final SolrServerException e) { throw makeIOException(e); } inputDocs.clear(); } } public void close() throws IOException { try { if (!inputDocs.isEmpty()) { LOG.info("Adding " + Integer.toString(inputDocs.size()) + " documents"); solr.add(inputDocs); inputDocs.clear(); } // solr.commit(); } catch (final SolrServerException e) { throw makeIOException(e); } } public static IOException makeIOException(SolrServerException e) { final IOException ioe = new IOException(); ioe.initCause(e); return ioe; } public static String stripNonCharCodepoints(String input) { StringBuilder retval = new StringBuilder(); char ch; for (int i = 0; i < input.length(); i++) { ch = input.charAt(i); // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] // and non-printable control characters except tabulator, new line and carriage return if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000 ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range (ch <= 
0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) { retval.append(ch); } } return retval.toString(); } }