配置英文原版说明:
http://larbin.sourceforge.net/custom-eng.html#larbin.conf
larbin.conf
###############################################
# Who are you ?
# mail of the one who launched larbin (YOUR mail)
From [email protected]
# name of the bot (sent with http headers)
UserAgent larbin_2.6.3
############################################
# What are the inputs and outputs of larbin
# port on which is launched the http statistic webserver
# if unset or set to 0, no webserver is launched//larbin在运行时可以通过 http://localhost:8081查看运行情况;如果值为0,则不启动web服务器。
httpPort 8081
# port on which you can submit urls to fetch
# no input is possible if you comment this line or use port 0
#inputPort 1976
############################################
# parameters to adapt depending on your network
# Number of connexions in parallel (to adapt depending on your network speed)//并行获取网页的数量
pagesConnexions 100
# Number of dns calls in parallel//并行解析dns的数量
dnsConnexions 5
# How deep do you want to go in a site//网页抓取深度
depthInSite 5
# do you want to follow external links//是否允许抓取域名外连接
#noExternalLinks
# time between 2 calls on the same server (in sec) : NEVER less than 30//对同一个服务器获取网页的间隔时间
waitDuration 60
# Make requests through a proxy (use with care)
#proxy www 8080
##############################################
# now, let's customize the search
# first page to fetch (you can specify several urls)
# 抓取网页的起始URL,可指定多值
startUrl http://www.csdn.net/
# Do you want to limit your search to a specific domain ?
# if yes, uncomment the following line//限制爬虫抓取的网址域名后缀。
#limitToDomain .fr .dk .uk end
# What are the extensions you surely don't want//限制不被下载的对象的后缀,可通过注释或者增加后缀控制下载
# never forbid .html, .htm and so on : larbin needs them
forbiddenExtensions
#.tar .gz .tgz .zip .Z .rpm .deb
#.ps .dvi .pdf
#.png .jpg .jpeg .bmp .smi .tiff .gif
#.mov .avi .mpeg .mpg .mp3 .qt .wav .ram .rm
#.jar .java .class .diff
#.doc .xls .ppt .mdb .rtf .exe .pps .so .psd
end
option.h
// Larbin
// Sebastien Ailleret
// 27-05-01 -> 09-03-02
#ifndef LARBIN_CONFIG
#define LARBIN_CONFIG
#include "config.h"
/* This file allows a lot of customizations of larbin
* see doc/custom-eng.html for more details
*/
/////////////////////////////////////////////////////////////
// Select the output module you want to use
// (the related code is in src/interf/useroutput.cc)
//#define DEFAULT_OUTPUT // do nothing: apart from statistics nothing is done, pages are not saved
#define SIMPLE_SAVE // save in files named save/dxxxxxx/fyyyyyy; each dxxxxxx directory also holds an index file listing the urls already downloaded
//#define MIRROR_SAVE // save in files (respect sites hierarchy): pages go under save/dxxxxx/url, where url is the start url
//#define STATS_OUTPUT // do some stats on pages; the result can be viewed at http://localhost:8081/output.html
////////////////////////////////////////////////////////////
// Set up a specific search
// (the related code is in src/fetch/specbuf.cc)
//#define SPECIFICSEARCH // look for specific documents
//#define contentTypes ((char *[]) { "audio/mpeg", NULL }) // content types to look for
//#define privilegedExts ((char *[]) { ".mp3", NULL }) // file extensions to look for
// how do you want to manage specific pages (select one of the following)
//#define DEFAULT_SPECIFIC // default: save them the same way as html pages
//#define SAVE_SPECIFIC // save the specific pages on disk
//#define DYNAMIC_SPECIFIC // use dynamically allocated buffers, for big files
//////////////////////////////////////////////////////////
// What do you want the crawler to do
// do you want to follow links in pages
// (if this option is not set, html pages are not parsed and links are not followed)
#define FOLLOW_LINKS
// do you want the crawler to associate to each page the list of its sons
// (i.e. the list of links contained in the page)
//#define LINKS_INFO
// do you want to associate a tag to pages (given in input)
// this allows to follow a page from input to output (and follow redirection)
//#define URL_TAGS
// do you want to suppress duplicate pages
// (if set, larbin does not report success for a page whose content is
// identical to a page already seen)
#define NO_DUP
// do you want larbin to stop when everything has been fetched
//#define EXIT_AT_END
// do you want to fetch images
// if you enable this option, update forbiddenExtensions in larbin.conf
//#define IMAGES
// download everything (ie no check of content type in http headers)
//#define ANYTYPE
// do you want to manage cookies
//#define COOKIES
//////////////////////////////////////////////////////////
// Various options
// do you want to get cgi
// 0 : yes ; 1 : no ; 2 : NO !
// (0 accepts all cgi, 1 rejects urls containing '?' or '=', 2 forbids all cgi)
#define CGILEVEL 0
// limit bandwidth usage (in octets/sec); if left undefined there is no limit
// be careful, larbin might use 10 to 20% more
//#define MAXBANDWIDTH 200000
// the depth is initialized each time a link goes to another site
// (if this option is not set, the depth of a new url is never reset)
#define DEPTHBYSITE
//////////////////////////////////////////////////////////
// Efficiency vs feature
// do we need a special thread for output
// NOTE(review): the original annotation says that without this option the
// program runs with a single thread -- confirm against the webserver options
// This is compulsory if it can block
// (not needed if you did not add code yourself)
//#define THREAD_OUTPUT
// if this option is set, larbin saves the hashtable from time to time
// this way it can restart from where it last stopped
// by reloading the table
//#define RELOAD
//////////////////////////////////////////////////////////
// now it's just if you need to know how it works
// do not launch the webserver
// this can be useful in order to launch no thread at all
//#define NOWEBSERVER
// do you want nice graphs in the stats page (histograms shown live on that page)
#define GRAPH
// uncomment if you are not interested in debugging information
//#define NDEBUG // do not show debugging information in the webserver
// enable this if you really dislike stats (in the webserver):
// no statistics are shown on the web pages
//#define NOSTATS
// enable this if you really like stats (on stdout)
#define STATS // per the original annotation, statistics are printed every 8 seconds
//#define BIGSTATS // print every fetched page on stdout; this slows larbin down
// Please enable this option if you want to report a crash
// then compile with "make debug"
//#define CRASH
#endif // LARBIN_CONFIG
types.h
// Larbin
// Sebastien Ailleret
// 12-01-00 -> 10-12-01
#ifndef TYPES_H
#define TYPES_H
// Size of the url hash table (max number of urls that can be fetched)
#define hashSize 64000000
// Size of the duplicate hashTable (same as hashSize) and the file it is backed up to
#define dupSize hashSize
#define dupFile "dupfile.bak"
// Size of the arrays of Sites in main memory
#define namedSiteListSize 20000
#define IPSiteListSize 10000
// Max number of urls in ram
#define ramUrls 100000
#define maxIPUrls 80000 // this should allow less dns call
// Max number of urls per site in Url
#define maxUrlsBySite 254 // must fit in uint8_t
// time out when reading a page (in sec)
#define timeoutPage 30 // default time out
#define timeoutIncr 2000 // number of bytes for 1 more sec
// How long do we keep dns answers and robots.txt
// (2*24*3600, i.e. two days if the unit is seconds -- presumably it is,
// like timeoutPage; confirm against the code using it).
// The expansion is parenthesized so that the macro behaves as a single value
// inside larger expressions such as x / dnsValidTime or x % dnsValidTime.
#define dnsValidTime (2*24*3600)
// Maximum size of a page that can be downloaded
#define maxPageSize 1000000
// NOTE(review): nearlyFullPage is 90000 while maxPageSize is 1000000;
// check whether the two values are meant to be this far apart
#define nearlyFullPage 90000
// Maximum size of a robots.txt that is read
// the value used is min(maxPageSize, maxRobotsSize)
#define maxRobotsSize 10000
// How many forbidden items do we accept in a robots.txt
#define maxRobotsItem 100
// file name used for storing urls on disk
#define fifoFile "fifo"
#define fifoFileWait "fifowait"
// number of urls per file on disk
// should be equal to ramUrls for good interaction with restart
#define urlByFile ramUrls
// Size of the buffer used to read sockets
#define BUF_SIZE 16384
#define STRING_SIZE 1024
// Max size for a url
#define maxUrlSize 512
#define maxSiteSize 40 // max size for the name of a site
// max size for cookies
#define maxCookieSize 128
// Standard size of a fifo in a Site
#define StdVectSize maxRobotsItem
// maximum number of input connections
#define maxInput 5
// if we save files, how many files per directory and where
#define filesPerDir 2000 // number of pages saved per directory
#define saveDir "save/" // directory where downloaded pages are saved
#define indexFile "index.html" // for MIRROR_SAVE
#define nbDir 1000 // for MIRROR_SAVE
// options for SPECIFICSEARCH (except with DEFAULT_SPECIFIC)
#define specDir "specific/" // directory where specific files are saved
#define maxSpecSize 5000000 // max size of a specific file
// Outcome of a page fetch: success, or the reason why the fetch failed.
// nbAnswers (16) equals the number of FetchError enumerators listed here;
// presumably it sizes per-outcome tables, so keep the two in sync.
#define nbAnswers 16
enum FetchError {
    success         = 0,
    noDNS           = 1,
    noConnection    = 2,
    forbiddenRobots = 3,
    timeout         = 4,
    badType         = 5,
    tooBig          = 6,
    err30X          = 7,
    err40X          = 8,
    earlyStop       = 9,
    duplicate       = 10,
    fastRobots      = 11,
    fastNoConn      = 12,
    fastNoDns       = 13,
    tooDeep         = 14,
    urlDup          = 15
};
// shorthand for the plain unsigned integer type
typedef unsigned uint;
#endif // TYPES_H