Ubuntu nutch

只用于备忘!

0.NutchAnalysis.jj


0.1
import org.wltea.analyzer.lucene.IKTokenizer;


0.2
PARSER_END(NutchAnalysis)


TOKEN_MGR_DECLS : {


  /** Constructs a token manager for the provided Reader. */
  public NutchAnalysisTokenManager(Reader reader) {
    this(new FastCharStream(reader));
  }


  // IK tokenizer handle; unused on the active path — kept for the
  // commented-out Analyzer.tokenStream(...) alternative in the SIGRAM action.
  IKTokenizer Analyzer;
  TermAttribute termAtt = null;// current CJK term produced by the IK tokenizer for the matched run
  OffsetAttribute offAtt = null;// start/end offsets of the current CJK term within the run
  TokenStream stream = null;// active IK token stream for the CJK run being re-segmented
  private int cjkStartOffset = 0;// column in the input where the current CJK run begins


}


0.3
// chinese, japanese and korean characters
//| <SIGRAM: <CJK> >
| <SIGRAM: (<CJK>)+  >
{
// Lexical action: a run of CJK characters is matched as one SIGRAM token,
// then re-segmented with the IK tokenizer. State (stream/termAtt/offAtt/
// cjkStartOffset) lives in TOKEN_MGR_DECLS and persists across entries so
// each re-entry emits the next IK term of the same run.
if(stream == null){
// First entry for this run: open an IK stream over the matched image and
// remember where the run starts so IK offsets can be rebased onto columns.
stream = new IKTokenizer(new StringReader(image.toString()),true);
//stream = Analyzer.tokenStream("",new StringReader(image.toString()));
cjkStartOffset = matchedToken.beginColumn;
try{
stream.reset();
}catch(IOException e){
e.printStackTrace();
}


termAtt = (TermAttribute)stream.addAttribute(TermAttribute.class);
offAtt = (OffsetAttribute)stream.addAttribute(OffsetAttribute.class);


// Pull the first IK term; termAtt == null marks an empty stream.
try{
if(stream.incrementToken() == false){
termAtt = null;
}
}catch(IOException e){
e.printStackTrace();
}
}
if(termAtt != null && !termAtt.term().equals("")){
// Rewrite the matched token in place with the current IK term, rebasing
// the term's offsets against the start of the CJK run.
matchedToken.image = termAtt.term();
matchedToken.beginColumn = cjkStartOffset + offAtt.startOffset();
matchedToken.endColumn = cjkStartOffset + offAtt.endOffset();


// Advance to the next IK term. If one exists, back the char stream up one
// position — presumably so this SIGRAM rule fires again and emits it on
// the next entry; NOTE(review): confirm against FastCharStream semantics.
try{
if(stream.incrementToken() != false){
input_stream.backup(1);
}else{
termAtt = null;
}
}catch(IOException e){
e.printStackTrace();
}


}


// Run exhausted (no term left, or empty term): reset state so the next
// CJK run starts with a fresh IK stream.
if(termAtt == null || termAtt.term().equals("")){
stream = null;
cjkStartOffset = 0;
}


}


   // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >




3.NutchDocumentAnalyzer.java  
  /** Returns a new token stream for text from the named field. */
  public TokenStream tokenStream(String fieldName, Reader reader) {


 return (new org.wltea.analyzer.lucene.IKAnalyzer().tokenStream(fieldName, reader));
  }




4.Query.java
  /**
   * Parse a query from a string using a language specific analyzer.
   *
   * @param queryString is the raw query string to parse
   * @param queryLang is a two-letters language code used to identify which
   *        {@link org.apache.nutch.analysis.NutchAnalyzer} should be used
   *        to parse the query string.
   * @see org.apache.nutch.analysis.AnalyzerFactory
   */
  public static Query parse(String queryString, String queryLang, Configuration conf)
  throws IOException {


 Query query = null;


 try{
 query = NutchAnalysis.parseQuery(queryString,AnalyzerFactory.get(conf).get(queryLang),conf);
 }catch(Exception e){
 return null;
 }


    return fixup(query,conf);
  }


2.NutchAnalysis.java


  /** Construct a query parser for the text in a reader. */
  public static Query parseQuery(String queryString, Configuration conf) throws IOException,ParseException {
    return parseQuery(queryString, null, conf);
  }


  /** Construct a query parser for the text in a reader. */
  public static Query parseQuery(String queryString, Analyzer analyzer, Configuration conf)
    throws IOException,ParseException {
    NutchAnalysis parser = new NutchAnalysis(
          queryString, (analyzer != null) ? analyzer : new NutchDocumentAnalyzer(conf));
    parser.queryString = queryString;
    parser.queryFilters = new QueryFilters(conf);
    return parser.parse(conf);
  }


1.NutchAnalysisTokenManager.java
  import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;


5.build.xml
  <include name="IKAnalyzer3.2.8.jar"/>
  <target name="job" depends="compile,war">


6.crawl-urlfilter.txt
# skip file:, ftp:, & mailto: urls
-^(file|ftp|mailto):


# skip image and other suffixes we can't yet parse
-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$


# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]
#-[?*!@=]   //表示过滤包含指定字符的URL,改为: -[~]


# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-.*(/[^/]+)/[^/]+\1/[^/]+\1/
#-.*(/.+?)/.*?/1/.*?/1/


# accept hosts in MY.DOMAIN.NAME
#+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
#+^http://([a-z0-9]*/.)*tianya.cn/[/s/S]*    // 过滤正则表达式,([a-z0-9]*/.)*表示任意数字和字母,[/s/S]*表示任意字符
#下面是摘自别人的示例
            +^http://mysite.com/discuz/index.php$
            +^http://mysite.com/discuz/forumdisplay.php\?fid=\d+$
            +^http://mysite.com/discuz/forumdisplay.php\?fid=\d+&page=\d+$
            +^http://mysite.com/discuz/viewthread.php\?tid=\d+&extra=page%3D\d+$
            +^http://mysite.com/discuz/viewthread.php\?tid=\d+&extra=page%3D\d+&page=\d+$


# skip everything else


7.nutch-default.xml 
7.1
<property>
  <name>http.agent.name</name>
  <value>xcodes</value>
  <description>HTTP 'User-Agent' request header. MUST NOT be empty - 
  please set this to a single word uniquely related to your organization.


  NOTE: You should also check other related properties:


http.robots.agents
http.agent.description
http.agent.url
http.agent.email
http.agent.version


  and set their values appropriately.


  </description>
</property>
7.2
<property>
  <name>indexer.mergeFactor</name>
  <value>500</value>
  <description>The factor that determines the frequency of Lucene segment
  merges. This must not be less than 2, higher values increase indexing
  speed but lead to increased RAM usage, and increase the number of
  open file handles (which may lead to "Too many open files" errors).
  NOTE: the "segments" here have nothing to do with Nutch segments, they
  are a low-level data unit used by Lucene.
  </description>
</property>


<property>
  <name>indexer.minMergeDocs</name>
  <value>500</value>
  <description>This number determines the minimum number of Lucene
  Documents buffered in memory between Lucene segment merges. Larger
  values increase indexing speed and increase RAM usage.
  </description>
</property>


8.regex-urlfilter.txt 
建议注释掉其中的,-[?*!@=]


9.nutch-site.xml
<configuration>
    <property>
        <name>http.agent.name</name>
        <value>xcodes</value>
    </property>
</configuration>


10.运行
Nutch-1.2 configuration
org.apache.nutch.crawl.Crawl作为主类
Program Arguments:urls -dir crawl -depth 3 -topN 50
VM arguments:-Dhadoop.log.dir=logs -Dhadoop.log.file=hadoop.log
-Xms800m -Xmx800m




附:crawl-urlfilter.txt
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.




# The url filter file used by the crawl command.


# Better for intranet crawling.
# Be sure to change MY.DOMAIN.NAME to your domain name.


# Each non-comment, non-blank line contains a regular expression
# prefixed by '+' or '-'.  The first matching pattern in the file
# determines whether a URL is included or ignored.  If no pattern
# matches, the URL is ignored.


# skip file:, ftp:, & mailto: urls
-^(file|ftp|mailto):


# skip image and other suffixes we can't yet parse
+^http://fmn.rrfmn.com/[\s\S]*\.jpg$
#-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$


# skip URLs containing certain characters as probable queries, etc.
-[~]


# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-.*(/.+?)/.*?\1/.*?\1/


# accept hosts in MY.DOMAIN.NAME
#+^http://page.renren.com/600038849/$
+^http://([a-z0-9]*\.)*page\.renren\.com/600038849/[\s\S]*
#http://page.renren.com/600038849/
# skip everything else
-.


#http://fmn.rrfmn.com/fmn058/20121009/1000/p_large_HTnz_75b6000049741261.jpg
#http://fmn.rrfmn.com/fmn058/20121009/1000/p_large_nNNF_7ef1000022341263.jpg
#http://fmn.rrimg.com/fmn056/20120927/2310/original_bf56_6d2e00006760125e.jpg
#http://fmn.rrimg.com/fmn060/20120927/2310/original_sRjX_31fa00006776125c.jpg
#http://fmn.rrimg.com/fmn063/20121008/1745/p_large_hqQ5_260c00002d251262.jpg








你可能感兴趣的:(Ubuntu nutch)