bin/nutch crawl urls -dir crawl -depth 3 -topN 5 说明: -dir 抓取结果目录名; -depth 抓取的深度; -topN 每一层的最大抓取个数。一般抓取完成后会看到如下的目录: crawl/crawldb crawl/linkdb crawl/segments
bin/nutch solrindex http://127.0.0.1:8983/solr/ crawl/crawldb -linkdb crawl/linkdb crawl/segments/* 使用这个命令的前提是你已经开启了默认的solr服务。开启默认solr服务的命令如下: cd ${APACHE_SOLR_HOME}/example && java -jar start.jar 这个时候服务就开启了,你可以在浏览器中输入如下地址进行测试: http://localhost:8983/solr/admin/ http://localhost:8983/solr/admin/stats.jsp
但是要结合Nutch来使用solr,还要在solr中加一个相应的策略配置,在nutch的conf目录中有一个默认的配置,把它复制到solr的相应目录中就可以使用了 cp ${NUTCH_RUNTIME_HOME}/conf/schema.xml ${APACHE_SOLR_HOME}/example/solr/conf/ 这个时候要重新启动一下solr
索引建立完成以后,你就可以用关键词进行查询,solr默认返回的是一个xml文件
/**
 * Indexing extension point.
 *
 * <p>Implementations may contribute additional metadata fields to a document
 * before it is indexed. Every plugin that implements this extension point is
 * invoked, one after another, on the parse.
 */
public interface IndexingFilter extends Pluggable, Configurable {

  /** Name under which this extension point is registered. */
  String X_POINT_ID = IndexingFilter.class.getName();

  /**
   * Adds fields to, or otherwise alters, the document that is about to be
   * indexed for a parse. Returning {@code null} removes the document from
   * indexing.
   *
   * @param doc     document instance that collects the fields
   * @param parse   parse data instance
   * @param url     url of the page
   * @param datum   crawl datum of the page
   * @param inlinks inlinks of the page
   * @return the modified document (or a new one), or {@code null} when the
   *         document should be discarded
   * @throws IndexingException declared for implementations to signal an
   *         indexing failure
   */
  NutchDocument filter(NutchDocument doc,
                       Parse parse,
                       Text url,
                       CrawlDatum datum,
                       Inlinks inlinks) throws IndexingException;
}
package org.apache.nutch.indexer.metadata;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import java.util.Date;
import org.apache.hadoop.conf.Configuration;

/**
 * Indexing filter that adds two fields to every document it is given:
 * <ul>
 *   <li>{@code metadata} - the string form of the page's parse metadata;</li>
 *   <li>{@code fetchTime} - the crawl datum's fetch time as a {@link Date}.</li>
 * </ul>
 * The filter always returns the document it received, so it never discards a
 * document from indexing.
 *
 * @author Lemo lu
 */
public class MetaDataIndexingFilter implements IndexingFilter {

  public static final Logger LOG = LoggerFactory
      .getLogger(MetaDataIndexingFilter.class);

  private Configuration conf;

  /**
   * Adds the {@code metadata} and {@code fetchTime} fields to {@code doc}.
   *
   * @param doc     document instance for collecting fields
   * @param parse   parse of the page; its parse metadata is indexed
   * @param url     page url (not used by this filter)
   * @param datum   crawl datum for the page; its fetch time is indexed
   * @param inlinks page inlinks (not used by this filter)
   * @return the same {@code doc} instance, with the two fields added
   * @throws IndexingException declared by the interface; not thrown here
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    addMetaData(doc, parse.getData());
    addFetchTime(doc, datum);
    return doc;
  }

  /** Adds the datum's fetch time to {@code doc} as the {@code fetchTime} field. */
  private void addFetchTime(NutchDocument doc, CrawlDatum datum) {
    doc.add("fetchTime", new Date(datum.getFetchTime()));
  }

  /** Adds the string form of the parse metadata to {@code doc} as the {@code metadata} field. */
  private void addMetaData(NutchDocument doc, ParseData data) {
    doc.add("metadata", data.getParseMeta().toString());
  }

  /** Stores the configuration handed to this filter. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously stored by {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return this.conf;
  }
}
<?xml version="1.0" encoding="UTF-8"?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<!-- Plugin descriptor for the index-metadata indexing filter. -->
<plugin
   id="index-metadata"
   name="Metadata Indexing Filter"
   version="1.0.0"
   provider-name="nutch.org">

   <runtime>
      <library name="index-metadata.jar">
         <export name="*"/>
      </library>
   </runtime>

   <requires>
      <import plugin="nutch-extensionpoints"/>
   </requires>

   <!-- The extension id is named after this plugin's own package rather than
        reusing "org.apache.nutch.indexer.more", which belongs to index-more. -->
   <extension id="org.apache.nutch.indexer.metadata"
              name="Nutch MetaData Indexing Filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="MetaDataIndexingFilter"
                      class="org.apache.nutch.indexer.metadata.MetaDataIndexingFilter"/>
   </extension>

</plugin>
<!-- metadata fields --> <field name="fetchTime" type="date" stored="true" indexed="true"/> <field name="metadata" type="string" stored="true" indexed="true"/>
<field dest="fetchTime" source="fetchTime"/> <field dest="metadata" source="metadata"/>
bin/nutch solrindex http://127.0.0.1:8983/solr/ crawl/crawldb -linkdb crawl/linkdb crawl/segments/*
This XML file does not appear to have any style information associated with it. The document tree is shown below. <response> <lst name="responseHeader"> <int name="status">0</int> <int name="QTime">1</int> <lst name="params"> <str name="indent">on</str> <str name="start">0</str> <str name="q">a</str> <str name="version">2.2</str> <str name="rows">10</str> </lst> </lst> <result name="response" numFound="1" start="0"> <doc> <float name="boost">1.1090536</float> <str name="digest">da3aefc69d8a5a7c1ea5447f9680d66d</str> <date name="fetchTime">2012-04-11T03:19:33.088Z</date> <str name="id">http://nutch.apache.org/</str> <str name="metadata"> CharEncodingForConversion=utf-8 OriginalCharEncoding=utf-8 </str> <str name="segment">20120410231836</str> <str name="title">Welcome to Apache Nutch®</str> <date name="tstamp">2012-04-11T03:19:33.088Z</date> <str name="url">http://nutch.apache.org/</str> </doc> </result> </response>