order.xml是整个Heritrix的核心,里面的每一个配置都关系到Heritrix的运行情况,没读源码之前我只能从有限的渠道去获知这些配置的运用.读完之后才知道Heritrix竟然有如此灵活的运用,如可以控制抓取速度,可以优化电脑性能,可以在某一次的抓取上继续抓取.当然整个order.xml里我也没有全部掌握,只知道大部分配置的作用,希望大家指点改正以及补充,谢谢!
1.<meta></meta> 代表着该抓取JOB的元数据,相当于Html的meta
- <meta>
- <name>myheritrix</name>
- <description>my heritrix</description>
- <operator>Admin</operator>
- <organization></organization>
- <audience></audience>
- <date>20090520051654</date>
- </meta>
2.<controller></controller> 跟抓取有关的所有参数,由于内容较多,并且Heritrix也已将他们分成不同模块,所以这里我也将他们拆分来说明.
- <controller>
- <string name="settings-directory">settings</string>
- <string name="disk-path"></string>
- <string name="logs-path">logs</string>
- <string name="checkpoints-path">checkpoints</string>
- <string name="state-path">state</string>
- <string name="scratch-path">scratch</string>
- <long name="max-bytes-download">0</long>
- <long name="max-document-download">0</long>
- <long name="max-time-sec">0</long>
- <integer name="max-toe-threads">30</integer>
- <integer name="recorder-out-buffer-bytes">4096</integer>
- <integer name="recorder-in-buffer-bytes">65536</integer>
- <integer name="bdb-cache-percent">0</integer>
- <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- </newObject>
- <map name="http-headers">
- </map>
- <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- </newObject>
- <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier"> <!-- Frontier 调度器,等下拆分来说明 -->
- </newObject>
- <map name="uri-canonicalization-rules">
- </map>
- <map name="pre-fetch-processors">
- </map>
- <map name="fetch-processors">
- </map>
- <map name="extract-processors">
- </map>
- <map name="write-processors">
- </map>
- <map name="post-processors">
- </map>
- <map name="loggers">
- </map>
- <newObject name="credential-store" class="org.archive.crawler.datamodel.CredentialStore">
- </newObject>
- </controller>
3.接下来拆分每个组件的配置文件一一进行说明,最后对Heritrix主要的配置也就是我们可以影响抓取的配置进行说明。
3.1:抓取范围<newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- <newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
- <boolean name="enabled">false</boolean>
- <string name="seedsfile">seeds.txt</string>
- <boolean name="reread-seeds-on-config">true</boolean>
- <newObject name="decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- <newObject name="rejectByDefault" class="org.archive.crawler.deciderules.RejectDecideRule">
- </newObject>
- <newObject name="acceptIfSurtPrefixed" class="org.archive.crawler.deciderules.SurtPrefixedDecideRule">
- <string name="decision">ACCEPT</string>
- <string name="surts-source-file"></string>
- <boolean name="seeds-as-surt-prefixes">true</boolean>
- <string name="surts-dump-file"></string>
- <boolean name="also-check-via">false</boolean>
- <boolean name="rebuild-on-reconfig">true</boolean>
- </newObject>
- <newObject name="rejectIfTooManyHops" class="org.archive.crawler.deciderules.TooManyHopsDecideRule">
- <integer name="max-hops">20</integer>
- </newObject>
- <newObject name="acceptIfTranscluded" class="org.archive.crawler.deciderules.TransclusionDecideRule">
- <integer name="max-trans-hops">3</integer>
- <integer name="max-speculative-hops">1</integer>
- </newObject>
- <newObject name="rejectIfPathological" class="org.archive.crawler.deciderules.PathologicalPathDecideRule">
- <integer name="max-repetitions">2</integer>
- </newObject>
- <newObject name="rejectIfTooManyPathSegs" class="org.archive.crawler.deciderules.TooManyPathSegmentsDecideRule">
- <integer name="max-path-depth">20</integer>
- </newObject>
- <newObject name="acceptIfPrerequisite" class="org.archive.crawler.deciderules.PrerequisiteAcceptDecideRule">
- </newObject>
- </map>
- </newObject>
- </newObject>
3.2: HTTP协议<map name="http-headers">
- <map name="http-headers">
- <string name="user-agent">Mozilla/5.0 (compatible; heritrix/1.14.3 +http://127.0.0.1)</string>
- <string name="from">guoyunsky@hotmail.com</string>
- </map>
3.3:爬虫协议 <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- <newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
- <string name="type">classic</string>
- <boolean name="masquerade">false</boolean>
- <text name="custom-robots"></text>
- <stringList name="user-agents">
- </stringList>
- </newObject>
3.4:Frontier 调度器<newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier"><!-- Frontier 调度器-->
- <newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier">
- <float name="delay-factor">4.0</float>
- <integer name="max-delay-ms">20000</integer>
- <integer name="min-delay-ms">2000</integer>
- <integer name="respect-crawl-delay-up-to-secs">300</integer>
- <integer name="max-retries">30</integer>
- <long name="retry-delay-seconds">900</long>
- <integer name="preference-embed-hops">1</integer>
- <integer name="total-bandwidth-usage-KB-sec">0</integer>
- <integer name="max-per-host-bandwidth-usage-KB-sec">0</integer>
- <string name="queue-assignment-policy">org.archive.crawler.frontier.HostnameQueueAssignmentPolicy</string>
- <string name="force-queue-assignment"></string>
- <boolean name="pause-at-start">false</boolean>
- <boolean name="pause-at-finish">false</boolean>
- <boolean name="source-tag-seeds">false</boolean>
- <boolean name="recovery-log-enabled">true</boolean>
- <boolean name="hold-queues">true</boolean>
- <integer name="balance-replenish-amount">3000</integer>
- <integer name="error-penalty-amount">100</integer>
- <long name="queue-total-budget">-1</long>
- <string name="cost-policy">org.archive.crawler.frontier.ZeroCostAssignmentPolicy</string>
- <long name="snooze-deactivate-ms">300000</long>
- <integer name="target-ready-backlog">50</integer>
- <string name="uri-included-structure">org.archive.crawler.util.BdbUriUniqFilter</string>
- <boolean name="dump-pending-at-close">false</boolean>
- </newObject>
3.5:URL规范化规则<map name="uri-canonicalization-rules">,主要用来规范化每个URL,其实也是通过各种规则对URL进行处理.用Heritrix默认的就好了,这里不做说明了.
3.6:预先处理链组件: <map name="pre-fetch-processors">
- <map name="pre-fetch-processors">
- <newObject name="Preselector" class="org.archive.crawler.prefetch.Preselector">
- <boolean name="enabled">true</boolean>
- <newObject name="Preselector#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <boolean name="override-logger">false</boolean>
- <boolean name="recheck-scope">true</boolean>
- <boolean name="block-all">false</boolean>
- <string name="block-by-regexp"></string>
- <string name="allow-by-regexp"></string>
- </newObject>
- <newObject name="Preprocessor" class="org.archive.crawler.prefetch.PreconditionEnforcer">
- <boolean name="enabled">true</boolean>
- <newObject name="Preprocessor#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
- <map name="rules">
- </map>
- </newObject>
- <integer name="ip-validity-duration-seconds">86400</integer>
- <integer name="robot-validity-duration-seconds">86400</integer>
- <boolean name="calculate-robots-only">false</boolean>
- </newObject>
- </map>
3.7:获取组件:<map name="fetch-processors">
3.8:抽取组件<map name="extract-processors"> <!-- 抽取链 -->
<map name="extract-processors"> <!-- 抽取链 -->
<newObject name="ExtractorHTTP" class="org.archive.crawler.extractor.ExtractorHTTP"><!-- 抽取HTTP-->
<boolean name="enabled">true</boolean><!-- 是否启用该组件 -->
<newObject name="ExtractorHTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence"><!-- 规则,用于忽略不符合规则的URL -->
<map name="rules"><!-- -->
</map>
</newObject>
</newObject>
<newObject name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML"><!-- 抽取HTML,主要的抽取类 -->
<boolean name="enabled">true</boolean><!-- 是否启用该组件 -->
<newObject name="ExtractorHTML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence"><!-- 规则,用于忽略不符合规则的URL -->
<map name="rules"><!-- -->
</map>
</newObject>
<boolean name="extract-javascript">true</boolean><!-- 是否在Javascript里找链接,默认为true -->
<boolean name="treat-frames-as-embed-links">true</boolean><!-- 如果以上值为true,FRAME/IFRAME被当做嵌入式链接(像图片,hop-type是E),否则就把他们当做导航链接,默认为true -->
<boolean name="ignore-form-action-urls">true</boolean><!-- 如果为true,HTML FORM中Action属性里出现的URI将被忽略,默认为false -->
<boolean name="extract-only-form-gets">true</boolean><!-- 如果为true,则uri中HTML FORM中只抽取Method为get的URL,Method为post的将被忽略-->
<boolean name="extract-value-attributes">true</boolean><!--如果为true,则抽取那些像链接的字符串,这种操作可能会抽取到有效的和无效的链接,默认为true-->
<boolean name="ignore-unexpected-html">true</boolean><!-- 如果为true,则那种特殊格式的URL,比如图片将不会被扫描,默认为true -->
</newObject>
</map>
3.9:写组件<map name="write-processors">
<map name="write-processors"> <!--写链 -->
<newObject name="Archiver" class="com.steel.heritrix.extend.MyWriterMirror"><!--这里我是用的自己的写链-->
<boolean name="enabled">true</boolean><!-- 是否启用该组件 -->
<newObject name="Archiver#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence"><!--规则,用于忽略不符合规则的URL -->
<map name="rules"><!-- -->
</map>
</newObject>
<boolean name="case-sensitive">true</boolean> <!--true表示操作系统区分大小写 -->
<stringList name="character-map" /> <!--这是一个键值对组,用value代替key.-->
<stringList name="content-type-map" /> <!--这是一个键值对组,用value代替key -->
<string name="directory-file">index.html</string> <!-- 如果给定的URL以/结尾(即指向一个目录而不是明确的文件),则在其后隐式追加这个文件名-->
<string name="dot-begin">%2E</string> <!--如果一个段以.开头,则用这个值替换它。 -->
<string name="dot-end">.</string> <!--如果一个目录以.结尾,则用这个值替换它.所有的操作系统除了Windows,.是建议使用的.但Windows,%%2E才是建议的 -->
<stringList name="host-map" /> <!--这是一个键值对组,如果一个host名字里匹配该key,则用value值替换它。当一个host使用多个name时这个可以保持一致性,如:[12.34.56.78 www42.foo.com] -->
<boolean name="host-directory">true</boolean> <!--是否创建以url中host命名的子目录.如www.baidu.com创建www.baidu.com这个目录,而www.baidu.com/zhidao,则在www.baidu.com目录下面再创建zhidao这个子目录 -->
<string name="path">mirror</string> <!-- 用于下载html文件的头目录-->
<integer name="max-path-length">1023</integer> <!--文件系统路径最大长度 -->
<integer name="max-segment-length">255</integer> <!-- 文件系统路径中段路径的最大长度-->
<boolean name="port-directory">false</boolean> <!--在url中是否创建一个以port命名的子目录 -->
<boolean name="suffix-at-end">true</boolean> <!--如果为true,则后缀放在url中查询段的后面.如果为false则放在前面 -->
<string name="too-long-directory">LONG</string> <!--如果url中目录都超过或者接近超过文件系统最大长度,超过部分它们都将用这个代替. -->
<stringList name="underscore-set" /> <!--如果一个目录名在列表里忽略大小写,那么_将放在它前面.所有的文件系统除了Windows,这个是不需要的.Windows里需要注意的是:[com1 com2 com3 com4 com5 com6 com7 com8 com9 lpt1 lpt2 lpt3 lpt4 lpt5 lpt6 lpt7 lpt8 lpt9 con nul prn] -->
</newObject>
</map>
3.10:后处理链组件<map name="post-processors">里面可以配置自己的调度器
<map name="post-processors"> <!-- 后处理链:清理URI和在URI范围内填充新的URI -->
<newObject name="Updater" class="org.archive.crawler.postprocessor.CrawlStateUpdater"><!-- -->
<boolean name="enabled">true</boolean><!-- -->
<newObject name="Updater#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence"><!-- -->
<map name="rules"><!-- -->
</map>
</newObject>
</newObject>
<newObject name="LinksScoper" class="org.archive.crawler.postprocessor.LinksScoper"><!-- -->
<boolean name="enabled">true</boolean><!-- -->
<newObject name="LinksScoper#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence"><!-- -->
<map name="rules">
</map>
</newObject>
<boolean name="override-logger">false</boolean><!-- 如果启用则覆盖这个类的默认日志器,默认日志器将日志打印在控制台.覆盖的日志器将把所有日志发送到
在日志目录下的以本类命名的日志文件中。在heritrix.properties中设置好日志等级和日志格式,这个属性在重启后只获取一次. -->
<boolean name="seed-redirects-new-seed">true</boolean><!-- 如果为true,任何种子重定向的URL,同样当做一个种子对待 -->
<integer name="preference-depth-hops">-1</integer><!-- 种子重定向url hop等级设置.-->
<newObject name="scope-rejected-url-rules" class="org.archive.crawler.deciderules.DecideRuleSequence"><!-- -->
<map name="rules">
</map>
</newObject>
</newObject>
<newObject name="Scheduler" class="com.steel.heritrix.extend.MyFrontierScheduler"><!-- 我自己的调度器 -->
<boolean name="enabled">true</boolean><!-- -->
<newObject name="Scheduler#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence"><!-- -->
<map name="rules"><!-- -->
</map>
</newObject>
</newObject>
</map>
3.11:统计跟踪链组件<map name="loggers">
<map name="loggers"> <!-- 统计跟踪链.统计跟踪模块,指定用于监视抓取和写日志,以及报告和提供信息给用户接口-->
<newObject name="crawl-statistics" class="org.archive.crawler.admin.StatisticsTracker"><!--统计类 -->
<integer name="interval-seconds">20</integer><!--写日志消息的时间间隔(秒) -->
</newObject>
</map>