larbin源码分析---NamedSite

此类是用来存储已经访问过的网站,每一个NamedSite都对应着相应的IPSite

 

/** A crawled web site identified by its host name.
 * Each NamedSite corresponds to an IPSite (selected via ipHash once the
 * DNS answer is known); urls queued here are transferred to that IPSite
 * after name resolution and the robots.txt fetch. */
class NamedSite {

 private:

  /* string used for following CNAME chains (just one jump) */

  char *cname;

  /** We've got a good dns answer:
   * go and fetch the robots.txt for this site. */

  void dnsOK ();

  /** Cannot get the inet addr.
   * dnsState must have been set properly before the call. */

  void dnsErr ();

  /** Delete the old identity of the site */

  void newId ();     // discard the site's previous identity

  /** put this url in its IPSite */

  void transfer (url *u);  // hand the url over to its corresponding IPSite

  /** forget this url for this reason (it will not be fetched) */

  void forgetUrl (url *u, FetchError reason);

 public:

  /** Constructor */

  NamedSite ();

  /** Destructor : never used */

  ~NamedSite ();

  /* name of the site (host name) */

  char name[maxSiteSize];       // site host name

  /* port of the site */

  uint16_t port;

  /* numbers of urls in ram for this site */

  uint16_t nburls;

  /* fifo of urls waiting to be fetched */

  url *fifo[maxUrlsBySite];         // urls of this site waiting in ram

  uint8_t inFifo;   // index where the next url is enqueued

  uint8_t outFifo;  // index of the next url to dequeue

  void putInFifo(url *u);               // enqueue a url

  url *getInFifo();                    // dequeue a url

  short fifoLength();                  // current number of queued urls

  /** Is this Site in a dnsSites */

  bool isInFifo;

  /** internet addr of this server */

  char dnsState;    // state of the DNS resolution for this host

  struct in_addr addr;

  uint ipHash;     // hash used to select the IPSite backing this NamedSite

  /* Date of expiration of dns call and robots.txt fetch */

  time_t dnsTimeout;

  /** test if a file can be fetched thanks to the robots.txt */

  bool testRobots(char *file);

  /* forbidden paths : given by robots.txt */

  Vector<char> forbidden;           // paths disallowed by robots.txt

  /** Put an url in the fifo.
   * If there are too much, put it back in UrlsInternal.
   * Never fill totally the fifo => call at least with 1.
   * @param u     url to enqueue
   * @param limit free slots that must remain in the fifo
   * @param prio  true for priority urls */

  void putGenericUrl(url *u, int limit, bool prio);

  inline void putUrl (url *u) { putGenericUrl(u, 15, false); }

  inline void putUrlWait (url *u) { putGenericUrl(u, 10, false); }

  inline void putPriorityUrl (url *u) { putGenericUrl(u, 5, true); }

  inline void putPriorityUrlWait (url *u) { putGenericUrl(u, 1, true); }

  /** Init a new dns query */

  void newQuery ();

  /** The dns query ended with success */

  void dnsAns (adns_answer *ans);

  /** we got the robots.txt, transfer what must be in IPSites */

  void robotsResult (FetchError res);

};

 

你可能感兴趣的:(爬虫,搜索引擎,spider,休闲,larbin)