larbin的详细配置

配置英文原版说明:

http://larbin.sourceforge.net/custom-eng.html#larbin.conf

 

larbin.conf

###############################################
# Who are you ?
# mail of the one who launched larbin (YOUR mail)
From larbin2.6.3@unspecified.mail
# name of the bot (sent with http headers)
UserAgent larbin_2.6.3

############################################
# What are the inputs and outputs of larbin
# port on which is launched the http statistic webserver
# if unset or set to 0, no webserver is launched//larbin在运行时可以通过  http://localhost:8081查看运行情况;如果值为0,则不启动web服务器。
httpPort 8081
# port on which you can submit urls to fetch
# no input is possible if you comment this line or use port 0
#inputPort 1976

############################################
# parameters to adapt depending on your network
# Number of connexions in parallel (to adapt depending of your network speed)//并行获取网页的数量
pagesConnexions 100
# Number of dns calls in parallel//并行解析dns的数量
dnsConnexions 5
# How deep do you want to go in a site//网页抓取深度
depthInSite 5
# do you want to follow external links//是否允许抓取域名外连接
#noExternalLinks
# time between 2 calls on the same server (in sec) : NEVER less than 30//对同一个服务器获取网页的间隔时间
waitDuration 60
# Make requests through a proxy (use with care)
#proxy www 8080

##############################################
# now, let's customize the search

# first page to fetch (you can specify several urls)
# 抓取网页的起始URL,可指定多值(注释须单独成行并以 # 开头,不能写在配置项后面)
startUrl http://www.csdn.net/

# Do you want to limit your search to a specific domain ?
# if yes, uncomment the following line//限制爬虫抓取的网址域名后缀。
#limitToDomain .fr .dk .uk end

# What are the extensions you surely don't want//限制不被下载的对象的后缀,可通过注释或者增加后缀控制下载
# never forbid .html, .htm and so on : larbin needs them
forbiddenExtensions
#.tar .gz .tgz .zip .Z .rpm .deb
#.ps .dvi .pdf
#.png .jpg .jpeg .bmp .smi .tiff .gif
#.mov .avi .mpeg .mpg .mp3 .qt .wav .ram .rm
#.jar .java .class .diff
#.doc .xls .ppt .mdb .rtf .exe .pps .so .psd
end

 

 

 

option.h

 

// Larbin
// Sebastien Ailleret
// 27-05-01 -> 09-03-02

#ifndef LARBIN_CONFIG
#define LARBIN_CONFIG

#include "config.h"

/* This files allows a lot of customizations of larbin
 * see doc/custom-eng.html for more details
 */

/////////////////////////////////////////////////////////////
// Select the output module you want to use
// (the related code is in src/interf/useroutput.cc)

// DEFAULT_OUTPUT: per the original annotation, only statistics are kept and
// pages are not downloaded to disk.
//#define DEFAULT_OUTPUT   // do nothing.
// SIMPLE_SAVE: per the original annotation, each save/dxxxxxx directory also
// holds an index file that records the URLs already downloaded.
#define SIMPLE_SAVE      // save in files named save/dxxxxxx/fyyyyyy
// MIRROR_SAVE: per the original annotation, pages go under save/dxxxxx/url,
// where url is the start URL.
//#define MIRROR_SAVE      // save in files (respect sites hierarchy)
// STATS_OUTPUT: per the original annotation, the result can be viewed at
// http://localhost:8081/output.html
//#define STATS_OUTPUT     // do some stats on pages

////////////////////////////////////////////////////////////
// Set up a specific search
// (the related code is in src/fetch/specbuf.cc)

//#define SPECIFICSEARCH    // look for a specific kind of document
//#define contentTypes ((char *[]) { "audio/mpeg", NULL })    // content types to look for
//#define privilegedExts ((char *[]) { ".mp3", NULL })        // file extensions to look for

// how do you want to manage specific pages (select one of the followings)
//#define DEFAULT_SPECIFIC    // default: handle them the same way as html pages
//#define SAVE_SPECIFIC        // save the specific pages on disk
//#define DYNAMIC_SPECIFIC    // use a dynamically allocated buffer (for big files)


//////////////////////////////////////////////////////////
// What do you want the crawler to do

// do you want to follow links in pages
// (per the original annotation: if this option is not set, html pages are
// not parsed, so their links are never followed)
#define FOLLOW_LINKS

// do you want the crawler to associate to each page the list of its sons
// (i.e. the list of links contained in the page)
//#define LINKS_INFO

// do you want to associate a tag to pages (given in input)
// this allows to follow a page from input to output (and follow redirection)
//#define URL_TAGS

// do you want to suppress duplicate pages
// (per the original annotation: when this option is set, a page whose content
// is identical to an already seen one is not reported as a success)
#define NO_DUP

// do you want larbin to stop when everything has been fetched
//#define EXIT_AT_END

// do you want to fetch images
// if you enable this option, update forbiddenExtensions in larbin.conf
//#define IMAGES

// download everything (ie no check of content type in http headers)
//#define ANYTYPE

// do you want to manage cookies
//#define COOKIES


//////////////////////////////////////////////////////////
// Various options

// do you want to get cgi
// 0 : yes ; 1 : no ; 2 : NO !
// (per the original annotation: 0 accepts every cgi, 1 rejects urls that
// contain '?' or '=', 2 forbids every cgi)
#define CGILEVEL 0

// limit bandwidth usage (in octets/sec); no limit applies when this is unset
// be careful, larbin might use 10 to 20% more
//#define MAXBANDWIDTH 200000

// the depth is initialized each time a link goes to another site
// (per the original annotation: without this option the depth is never
// reset when a link leaves the current site)
#define DEPTHBYSITE


//////////////////////////////////////////////////////////
// Efficiency vs feature

// do we need a special thread for output
// (per the original annotation: when unset, the program runs with a single thread)
// This is compulsory if it can block
// (not needed if you did not add code yourself)
//#define THREAD_OUTPUT

// if this option is set, larbin saves the hashtable from time to time
// this way it can restart from where it last stopped
// by reloading the table
//#define RELOAD


//////////////////////////////////////////////////////////
// now it's just if you need to know how it works

// do not launch the webserver
// (defining NOWEBSERVER disables it; leaving it commented keeps it enabled)
// this can be useful in order to launch no thread at all
//#define NOWEBSERVER

// do you want nice graphs in the stats page
// (the original annotation describes them as live histograms)
#define GRAPH

// uncomment if you are not interested in debugging information
//#define NDEBUG    // per the original annotation: hides debugging information on the web page

// enable this if you really dislike stats (in the webserver)
// (i.e. statistics are not shown on the web page)
//#define NOSTATS

// enable this if you really like stats (on stdout)
#define STATS        // per the original annotation: statistics are printed every 8 seconds
//#define BIGSTATS    // per the original annotation: prints every fetched page, but slows larbin down

// Please enable this option if you want to report a crash
// then compile with "make debug"
//#define CRASH

#endif // LARBIN_CONFIG

 

 

types.h

 

// Larbin
// Sebastien Ailleret
// 12-01-00 -> 10-12-01

#ifndef TYPES_H
#define TYPES_H

// Size of the hash table (max number of urls that can be fetched)
#define hashSize 64000000

// Size of the duplicate hashTable (same size as the url hash table)
// and name of the file associated with it
#define dupSize hashSize
#define dupFile "dupfile.bak"

// Size of the arrays of Sites in main memory
#define namedSiteListSize 20000
#define IPSiteListSize 10000

// Max number of urls in ram
#define ramUrls 100000
#define maxIPUrls 80000  // this should allow less dns call

// Max number of urls per site in Url
#define maxUrlsBySite 254  // must fit in uint8_t

// time out when reading a page (in sec)
#define timeoutPage 30   // default time out
#define timeoutIncr 2000 // number of bytes for 1 more sec

// How long do we keep dns answers and robots.txt
// (2*24*3600, i.e. two days if the unit is seconds -- the unit is not stated here).
// The replacement list is parenthesized so the macro acts as a single value
// inside larger expressions such as `x / dnsValidTime` or `x % dnsValidTime`;
// without the parentheses those would expand to `x / 2 * 24 * 3600`.
#define dnsValidTime (2*24*3600)

// Maximum size of a page that can be downloaded
// NOTE(review): nearlyFullPage (90000) is far below maxPageSize (1000000)
// here -- confirm that this gap is intended.
#define maxPageSize    1000000
#define nearlyFullPage  90000

// Maximum size of a robots.txt that is read
// the value used is min(maxPageSize, maxRobotsSize)
#define maxRobotsSize 10000

// How many forbidden items do we accept in a robots.txt
#define maxRobotsItem 100

// file name used for storing urls on disk
#define fifoFile "fifo"
#define fifoFileWait "fifowait"

// number of urls per file on disk
// should be equal to ramUrls for good interaction with restart
#define urlByFile ramUrls

// Size of the buffer used to read sockets
#define BUF_SIZE 16384
#define STRING_SIZE 1024

// Max size for a url
#define maxUrlSize 512
#define maxSiteSize 40    // max size for the name of a site

// max size for cookies
#define maxCookieSize 128

// Standard size of a fifo in a Site
#define StdVectSize maxRobotsItem

// maximum number of input connections
#define maxInput 5

// if we save files, how many files per directory and where
#define filesPerDir 2000    // number of pages saved per directory
#define saveDir "save/"        // directory where downloaded pages are saved
#define indexFile "index.html"    // for MIRROR_SAVE
#define nbDir 1000                // for MIRROR_SAVE

// options for SPECIFICSEARCH (except with DEFAULT_SPECIFIC)
#define specDir "specific/"    // directory where specific files are saved
#define maxSpecSize 5000000    // maximum size of a specific file

// Outcome codes for a page fetch: `success` plus the various reasons a fetch
// can fail. nbAnswers is the number of enumerators below and must be kept in
// sync with the list (last enumerator + 1).
// NOTE(review): the individual meanings are only suggested by the names;
// confirm them against the fetch code before relying on them.
#define nbAnswers 16
enum FetchError
{
  success         = 0,
  noDNS           = 1,
  noConnection    = 2,
  forbiddenRobots = 3,
  timeout         = 4,
  badType         = 5,
  tooBig          = 6,
  err30X          = 7,
  err40X          = 8,
  earlyStop       = 9,
  duplicate       = 10,
  fastRobots      = 11,
  fastNoConn      = 12,
  fastNoDns       = 13,
  tooDeep         = 14,
  urlDup          = 15
};

// standard types: `uint` is shorthand for the plain unsigned integer type
typedef unsigned uint;

#endif // TYPES_H

你可能感兴趣的:(网络爬虫)