爬虫Larbin解析(二)——sequencer()

分析的函数: void sequencer() 

//位置:larbin-2.6.3/src/fetch/sequencer.cc
void
sequencer() { bool testPriority = true; if (space == 0) //unit space = 0 { space = global::inter->putAll(); } int still = space; if (still > maxPerCall) //#define maxPerCall 100 still = maxPerCall; while (still) { if (canGetUrl(&testPriority)) { --space; --still; } else { still = 0; } } }

所在的文件

larbin-2.6.3/src/fetch/sequencer.h、larbin-2.6.3/src/fetch/sequencer.cc

// Larbin
// Sebastien Ailleret
// 15-11-99 -> 15-11-99

#ifndef SEQUENCER_H
#define SEQUENCER_H

/** Remaining url budget shared with sequencer() (granted by global::inter).
 * only for debugging, handle with care */
extern uint space;

/** Call the sequencer */
void sequencer ();

#endif
View Code
// Larbin
// Sebastien Ailleret
// 15-11-99 -> 04-01-02

#include 

#include "options.h"

#include "global.h"
#include "types.h"
#include "utils/url.h"
#include "utils/debug.h"
#include "fetch/site.h"

// Forward declaration: fetches the next url according to priorities (below).
static bool canGetUrl (bool *testPriority);
// Number of urls that may still be moved into ram (budget from global::inter).
uint space = 0;

// Upper bound on the number of urls handled by one sequencer() call.
#define maxPerCall 100

/** Start the sequencer: move urls (at most maxPerCall per run) from the
 * global fifos towards the per-site queues, honouring the shared "space"
 * budget managed by global::inter. */
void sequencer()
{
    bool testPriority = true;
    // Renew the budget once it is exhausted.
    if (space == 0)
        space = global::inter->putAll();
    // Never handle more than maxPerCall urls in a single invocation.
    int todo = (int) space;
    if (todo > maxPerCall)
        todo = maxPerCall;
    while (todo > 0)
    {
        if (!canGetUrl(&testPriority))
            break;  // nothing available right now
        --space;
        --todo;
    }
}

/* Get the next url
 * here is defined how priorities are handled
 * Takes urls, by priority, from the queues
 * (URLsDisk, URLsDiskWait or URLsPriority, URLsPriorityWait)
 * and hands each one to a NamedSite chosen by the url's host hash.
 */
static bool canGetUrl (bool *testPriority) 
{
    url *u;
    if (global::readPriorityWait)  // initialised to 0 in global.cc; refreshed by cron()
    {
        global::readPriorityWait--;
        u = global::URLsPriorityWait->get();
        global::namedSiteList[u->hostHashCode()].putPriorityUrlWait(u);
        return true;
    } 
    else if (*testPriority && (u=global::URLsPriority->tryGet()) != NULL) 
    {
        // We've got one url (priority)
        global::namedSiteList[u->hostHashCode()].putPriorityUrl(u);
        return true;
    } 
    else 
    {
        // Priority queue empty: skip testing it again during this run.
        *testPriority = false;
        // Try to get an ordinary url
        if (global::readWait) 
        {
          global::readWait--;
          u = global::URLsDiskWait->get();
          global::namedSiteList[u->hostHashCode()].putUrlWait(u);
          return true;
        } 
        else 
        {
            u = global::URLsDisk->tryGet();
            if (u != NULL) 
            {
                global::namedSiteList[u->hostHashCode()].putUrl(u);
                return true;
            }
            else 
            {
                return false;
            }
        }
    }
}
View Code

 

一、 对于space = global::inter->putAll();

1. inter 在global.cc(位置:/larbin-2.6.3/src/global.cc)中的定义为

inter = new Interval(ramUrls);   //#define ramUrls 100000  (位置:larbin-2.6.3/src/types.h)

批注:区别 inter = new Interval(ramUrls);  和 inter = new Interval[ramUrls];  前一个()内是参数,要传入构造函数的;后一个[]内是开辟数组的个数。

2. 类 Interval定义(位置:/larbin-2.6.3/src/fetch/site.h)

/** This class is intended to make sure the sum of the
 * sizes of the fifo included in the different sites
 * are not too big
 */
/** This class is intended to make sure the sum of the
 * sizes of the fifo included in the different sites
 * are not too big
 */
class Interval 
{
    public:
        /** Build an interval allowing at most `sizes` urls in ram. */
        Interval (uint sizes) : size(sizes), pos(0) {}
        ~Interval () {}
        /** How many urls can we put. Answer 0: if no urls can be put.
         * Reserves every remaining slot at once. */
        inline uint putAll ()
        {
            uint res = size - pos;  // remaining capacity
            pos = size;
            return res;
        }
        /** Warn an url has been retrieved (frees one slot). */
        inline void getOne ()
        {
            pos--;
        }
        /** only for debugging, handle with care */
        inline uint getPos ()
        {
            return pos;
        }
    private:
        /** Size of the interval */
        uint size;
        /** Position in the interval */
        uint pos;
};
View Code

批注:类内的函数定义为inline。对内联函数的几点说明:

  • 内联函数避免函数调用的开销。将函数指定为内联函数,(通常)就是将它在程序的每个调用点上“内联地”展开,消除调用函数进行的额外开销(调用前先保存寄存器,并在返回时恢复)。内联说明(在函数返回值前加inline)对编译器来说只是一个建议,编译器可以选择忽略。一般内联函数适用于优化小的、只有几行、经常被调用的函数。大多数编译器不支持递归函数的内联。
  • 把内联函数放在头文件。以便编译器能够在调用点展开同一个函数(保证编译器可见、所有的定义相同)。
  • 编译器隐式地将在类内定义的成员函数当作为内联函数.

 

二、 对于canGetUrl(&testPriority)

函数定义(位置larbin-2.6.3/src/fetch/sequencer.cc)

/* Get the next url
 * here is defined how priorities are handled
 按优先级从各个URL队列
 (比如URLsDisk,URLsDiskWait或URLsPriority,URLsPriorityWait)
 获取url保存到某个NameSite(通过url的hash值)

at "global.cc"
// FIFOs
URLsDisk         = new PersistentFifo(reload, fifoFile);
URLsDiskWait     = new PersistentFifo(reload, fifoFileWait);
URLsPriority     = new SyncFifo;
URLsPriorityWait = new SyncFifo;

 */
/* Pick the next url to process, by priority
 * (duplicate listing of canGetUrl, with the author's annotations). */
static bool canGetUrl (bool *testPriority) 
{
    url *u;
    if (global::readPriorityWait != 0)  // declared in global.cc: uint global::readPriorityWait=0;
    {
        global::readPriorityWait--;
        u = global::URLsPriorityWait->get();
        global::namedSiteList[u->hostHashCode()].putPriorityUrlWait(u);
        return true;
    } 
    else if (*testPriority && (u=global::URLsPriority->tryGet()) != NULL) 
    {
        // We've got one url (priority)
        global::namedSiteList[u->hostHashCode()].putPriorityUrl(u);
        return true;
    } 
    else 
    {
        // Priority queues exhausted: stop testing them this run.
        *testPriority = false;
        // Try to get an ordinary url
        if (global::readWait) 
        {
          global::readWait--;
          u = global::URLsDiskWait->get();
          global::namedSiteList[u->hostHashCode()].putUrlWait(u);
          return true;
        } 
        else 
        {
            u = global::URLsDisk->tryGet();
            if (u != NULL) 
            {
                global::namedSiteList[u->hostHashCode()].putUrl(u);
                return true;
            }
            else 
            {
                return false;
            }
        }
    }
}

1. 为什么 disk 和 priority 的队列都是成对出现的?因为可以认为每个 site 在 namedSiteList 当中都有一个小的队列来保存它的 url,这个 url 的个数是有限制的:当超过这个限制的时候就不能再把该 site 下的 url 放入,但也不能丢弃,而是放入对应的 wait 队列。Larbin 会控制一段时间在 disk 队列中取 url,一段时间在 diskWait 当中取 url。disk 和 priority 的区别只是优先级的区别。namedSiteList 的作用是实现了 DNS 缓存。

          爬虫Larbin解析(二)——sequencer()_第1张图片

2. global::readPriorityWait 的值由main.cc的cron()函数中变化得知

// see if we should read again urls in fifowait
// (fragment of cron() in main.cc: every 300 time units the wait queues'
//  lengths are latched, and 150 units later they are reset to 0)
if ((global::now % 300) == 0) {
    global::readPriorityWait = global::URLsPriorityWait->getLength();
    global::readWait = global::URLsDiskWait->getLength();
}
if ((global::now % 300) == 150) {
    global::readPriorityWait = 0;
    global::readWait = 0;
}

这里 global::now % 300 是判断这次是对 wait 队列里的 url 进行处理,还是对非 wait 队列里的进行处理。%300 等于 0 和等于 150 的概率各是 1/300,所以大约每 300 次切换一次。readPriorityWait 是 URLsPriorityWait 的长度(也就是其中 url 的数量);readWait 是 URLsDiskWait 中 url 的个数。

3. 在canGetUrl中,在对于每个站点,将相应的url放进去。putPriorityUrlWait, putPriorityUrl, putUrlWait, putUrl在site.h的定义如下

/** Put an url in the fifo
 * If there are too much, put it back in UrlsInternal
 * Never fill totally the fifo => call at least with 1 */
void putGenericUrl(url *u, int limit, bool prio);
// The four wrappers differ only by the headroom they require in the
// site's fifo (limit) and the overflow destination they select (prio).
inline void putUrl(url *u) {
    putGenericUrl(u, 15, false);
}
inline void putUrlWait(url *u) {
    putGenericUrl(u, 10, false);
}
inline void putPriorityUrl(url *u) {
    putGenericUrl(u, 5, true);
}
inline void putPriorityUrlWait(url *u) {
    putGenericUrl(u, 1, true);
}

 可以发现,每次都是调用函数putGenericUrl,其定义如下

/* Put an url in the fifo if there are not too many
 * (excerpt: the branch taken when the site's quota is already full).
 * NOTE(review): in the original listing the extraction had collapsed several
 * statements onto `//` comment lines, commenting the code out; reconstructed
 * here to match the complete listing of putGenericUrl further below. */
void NamedSite::putGenericUrl(url *u, int limit, bool prio) 
{
    if (nburls > maxUrlsBySite - limit)
    {
        // Already enough Urls in memory for this Site
        // first check if it can already be forgotten
        if (!strcmp(name, u->getHost()))
        {
            if (dnsState == errorDns)
            {
                nburls++;
                forgetUrl(u, noDNS);
                return;
            }
            if (dnsState == noConnDns)
            {
                nburls++;
                forgetUrl(u, noConnection);
                return;
            }
            if (u->getPort() == port && dnsState == doneDns
                && !testRobots(u->getFile()))
            {
                nburls++;
                forgetUrl(u, forbiddenRobots);
                return;
            }
        }
        // else put it back in the wait fifos (URLsDiskWait / URLsPriorityWait)
        refUrl();
        global::inter->getOne();
        if (prio)
        {
            global::URLsPriorityWait->put(u);
        }
        else
        {
            global::URLsDiskWait->put(u);
        }
    }
}

如果已经有足够多的 url 在内存里,执行这里 if 中的代码。strcmp(name, u->getHost()) 是判断这个主机是不是已经做过 DNS 方面的判断——对于一个站点只做一次 DNS 解析的判断,以后就按这个结果进行处理:dnsState 为 errorDns、noConnDns,还有 robots.txt 不允许的情况,都直接丢弃该 url;如果没有这些问题,就把它放回 URLsDiskWait(优先级 url 则放回 URLsPriorityWait)。

// (second half of putGenericUrl: the site still has room for this url)
else {
    nburls++;
    if (dnsState == waitDns || strcmp(name, u->getHost()) || port
           != u->getPort() || global::now > dnsTimeout) {
       // dns not done or other site (or dns info expired):
       // queue the url on this site and make sure a dns query is scheduled
       putInFifo(u);
       addNamedUrl();
       // Put Site in fifo if not yet in
       if (!isInFifo) {
           isInFifo = true;
           global::dnsSites->put(this);
       }
    } else
       switch (dnsState) {
       case doneDns:
           transfer(u);
           break;
       case errorDns:
           forgetUrl(u, noDNS);
           break;
       default: // noConnDns
           forgetUrl(u, noConnection);
       }
}

 如果需要判断dns能不能解析,就将它放到dnsSites里,这个会在fetchDns中判断。或是如果还能放到内存里,并且又是doneDns,表示可以解析,就调用transfer

/** Hand an url over to the IPSite selected by this site's ip hash,
 * unless robots.txt forbids its path. */
void NamedSite::transfer(url *u) {
    if (!testRobots(u->getFile())) {
        forgetUrl(u, forbiddenRobots);
        return;
    }
    // Without a proxy, cache the resolved address inside the url itself.
    if (global::proxyAddr == NULL) {
        memcpy(&u->addr, &addr, sizeof(struct in_addr));
    }
    global::IPSiteList[ipHash].putUrl(u);
}

这里是将url放入到IPSiteList的相应ipHash中。

 

附类的定义

类url定义(larbin-2.6.3/src/utils/url.h  larbin-2.6.3/src/utils/url.cc

// Larbin
// Sebastien Ailleret
// 15-11-99 -> 14-03-02

/* This class describes an URL */

#ifndef URL_H
#define URL_H

#include in.h>
#include 
#include 
#include 

#include "types.h"

/* normalize a file path in place; false when the url must be rejected */
bool fileNormalize (char *file);

/** An http url: host, port, file path and crawl depth.
 * Parsing and normalisation are implemented in url.cc. */
class url {
 private:
  char *host;
  char *file;
  uint16_t port; // the order of variables is important for physical size
  int8_t depth;
  /* parse the url */
  void parse (char *s);
  /** parse a file with base */
  void parseWithBase (char *u, url *base);
  /* normalize file name */
  bool normalize (char *file);
  /* Does this url starts with a protocol name */
  bool isProtocol (char *s);
  /* constructor used by giveBase */
  url (char *host, uint port, char *file);

 public:
  /* Constructor : Parses an url (u is deleted) */
  url (char *u, int8_t depth, url *base);

  /* constructor used by input */
  url (char *line, int8_t depth);

  /* Constructor : read the url from a file (cf serialize) */
  url (char *line);

  /* Destructor */
  ~url ();

  /* inet addr (once calculated) */
  struct in_addr addr;

  /* Is it a valid url ? */
  bool isValid ();

  /* print an URL */
  void print ();

  /* return the host */
  inline char *getHost () { return host; }

  /* return the port */
  inline uint getPort () { return port; }

  /* return the file */
  inline char *getFile () { return file; }

  /** Depth in the Site */
  inline int8_t getDepth () { return depth; }

  /* Set depth to max if we are at an entry point in the site
   * try to find the ip addr
   * answer false if forbidden by robots.txt, true otherwise */
  bool initOK (url *from);

  /** return the base of the url
   * give means that you have to delete the string yourself
   */
  url *giveBase ();

  /** return a char * representation of the url
   * give means that you have to delete the string yourself
   */
  char *giveUrl ();

  /** write the url in a buffer
   * buf must be at least of size maxUrlSize
   * returns the size of what has been written (not including '\0')
   */
  int writeUrl (char *buf);

  /* serialize the url for the Persistent Fifo */
  char *serialize ();

  /* very thread unsafe serialisation in a static buffer */
  char *getUrl();

  /* return a hashcode for the host of this url */
  uint hostHashCode ();

  /* return a hashcode for this url */
  uint hashCode ();

#ifdef URL_TAGS
  /* tag associated to this url */
  uint tag;
#endif // URL_TAGS

#ifdef COOKIES
  /* cookies associated with this page */
  char *cookie;
  void addCookie(char *header);
#else // COOKIES
  inline void addCookie(char *header) {}
#endif // COOKIES
};

#endif // URL_H
View Code
// Larbin
// Sebastien Ailleret
// 15-11-99 -> 16-03-02

/* This class describes an URL */

#include 
#include 
#include 
#include <string.h>
#include 
#include 
#include 

#include "options.h"

#include "types.h"
#include "global.h"
#include "utils/url.h"
#include "utils/text.h"
#include "utils/connexion.h"
#include "utils/debug.h"

#ifdef COOKIES
#define initCookie() cookie=NULL
#else // COOKIES
#define initCookie() ((void) 0)
#endif // COOKIES

/* small functions used later */
/* Hash a host name into an index of the namedSiteList table. */
static uint siteHashCode (char *host) {
  uint h = 0;
  for (uint i = 0; host[i] != 0; i++) {
    h = 37*h + host[i];
  }
  return h % namedSiteListSize;
}

/* return the int with correspond to a char
 * -1 if not an hexa char */
static int int_of_hexa (char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  return -1;
}

/* normalize a file name : also called by robots.txt parser
 * return true if it is ok, false otherwise (cgi-bin)
 * Rewrites `file` in place: collapses "/./" and "//", resolves "/../",
 * and decodes %XX escapes that stand for printable characters.
 */
bool fileNormalize (char *file) {
  int i=0;
  while (file[i] != 0 && file[i] != '#') {
    if (file[i] == '/') {
      if (file[i+1] == '.' && file[i+2] == '/') {
        // suppress /./
        int j=i+3;
        while (file[j] != 0) {
          file[j-2] = file[j];
          j++;
        }
        file[j-2] = 0;
      } else if (file[i+1] == '/') {
        // replace // by /
        int j=i+2;
        while (file[j] != 0) {
          file[j-1] = file[j];
          j++;
        }
        file[j-1] = 0;
      } else if (file[i+1] == '.' && file[i+2] == '.' && file[i+3] == '/') {
        // suppress /../
        if (i == 0) {
          // the file name starts with /../ : error
          return false;
        } else {
          int j = i+4, dec;
          i--;
          // back up to the previous path component
          while (file[i] != '/') { i--; }
          dec = i+1-j; // dec < 0
          while (file[j] != 0) {
            file[j+dec] = file[j];
            j++;
          }
          file[j+dec] = 0;
        }
      } else if (file[i+1] == '.' && file[i+2] == 0) {
        // suppress /.
        file[i+1] = 0;
        return true;
      } else if (file[i+1] == '.' && file[i+2] == '.' && file[i+3] == 0) {
        // suppress /..
        if (i == 0) {
          // the file name starts with /.. : error
          return false;
        } else {
          i--;
          while (file[i] != '/') {
            i--;
          }
          file[i+1] = 0;
          return true;
        }
      } else { // nothing special, go forward
        i++;
      }
    } else if (file[i] == '%') {
      int v1 = int_of_hexa(file[i+1]);
      int v2 = int_of_hexa(file[i+2]);
      if (v1 < 0 || v2 < 0) return false;
      char c = 16 * v1 + v2;
      if (isgraph(c)) {
        // decode %XX to the character and shift the tail left by 2
        file[i] = c;
        int j = i+3;
        while (file[j] != 0) {
          file[j-2] = file[j];
          j++;
        }
        file[j-2] = 0;
        i++;
      } else if (c == ' ' || c == '/') { // keep it with the % notation
        i += 3;
      } else { // bad url
        return false;
      }
    } else { // nothing special, go forward
      i++;
    }
  }
  // truncate at '#' (fragment) or keep the terminator in place
  file[i] = 0;
  return true;
}

/**************************************/
/* definition of methods of class url */
/**************************************/

/* Constructor : Parses an url
 * Accepts absolute "http://..." urls, or relative urls resolved
 * against `base`; on failure host and file stay NULL (url invalid). */
url::url (char *u, int8_t depth, url *base) {
  newUrl();
  this->depth = depth;
  host = NULL;
  port = 80;
  file = NULL;
  initCookie();
#ifdef URL_TAGS
  tag = 0;
#endif // URL_TAGS
  if (startWith("http://", u)) {
    // absolute url
    parse (u + 7);
    // normalize file name
    if (file != NULL && !normalize(file)) {
      delete [] file;
      file = NULL;
      delete [] host;
      host = NULL;
    }
  } else if (base != NULL) {
    if (startWith("http:", u)) {
      parseWithBase(u+5, base);
    } else if (isProtocol(u)) {
      // Unknown protocol (mailto, ftp, news, file, gopher...)
    } else {
      parseWithBase(u, base);
    }
  }
}

/* constructor used by input
 * `line` may be prefixed by a numeric tag when URL_TAGS is set;
 * only absolute "http://" urls are accepted. */
url::url (char *line,  int8_t depth) {
  newUrl();
  this->depth = depth;
  host = NULL;
  port = 80;
  file = NULL;
  initCookie();
  int i=0;
#ifdef URL_TAGS
  // read the decimal tag preceding the url
  tag = 0;
  while (line[i] >= '0' && line[i] <= '9') {
    tag = 10*tag + line[i] - '0';
    i++;
  }
  i++;
#endif // URL_TAGS
  if (startWith("http://", line+i)) {
    parse(line+i+7);
    // normalize file name
    if (file != NULL && !normalize(file)) {
      delete [] file;
      file = NULL;
      delete [] host;
      host = NULL;
    }
  }
}

/* Constructor : read the url from a file (cf serialize)
 * Format: "<depth> [<tag> ]<host>:<port><file>[ <cookie>]"
 * NOTE(review): `line` is modified in place (':' overwritten) — matches
 * serialize() output, not for arbitrary input.
 */
url::url (char *line) {
  newUrl();
  int i=0;
  // Read depth
  depth = 0;
  while (line[i] >= '0' && line[i] <= '9') {
    depth = 10*depth + line[i] - '0';
    i++;
  }
#ifdef URL_TAGS
  // read tag
  tag = 0; i++;
  while (line[i] >= '0' && line[i] <= '9') {
    tag = 10*tag + line[i] - '0';
    i++;
  }
#endif // URL_TAGS
  int deb = ++i;
  // Read host
  while (line[i] != ':') {
    i++;
  }
  line[i] = 0;
  host = newString(line+deb);
  i++;
  // Read port
  port = 0;
  while (line[i] >= '0' && line[i] <= '9') {
    port = 10*port + line[i] - '0';
    i++;
  }
#ifndef COOKIES
  // Read file name
  file = newString(line+i);
#else // COOKIES
  char *cpos = strchr(line+i, ' ');
  if (cpos == NULL) {
    cookie = NULL;
  } else {
    *cpos = 0;
    // read cookies
    cookie = new char[maxCookieSize];
    strcpy(cookie, cpos+1);
  }
  // Read file name
  file = newString(line+i);
#endif // COOKIES
}

/* constructor used by giveBase */
// Takes ownership of the host and file buffers (no copy is made).
url::url (char *host, uint port, char *file) {
  newUrl();
  initCookie();
  this->host = host;
  this->port = port;
  this->file = file;
}

/* Destructor */
url::~url () {
  delUrl();
  delete [] host;
  delete [] file;
#ifdef COOKIES
  delete [] cookie;
#endif // COOKIES
}

/* Is it a valid url ?
 * valid = has host and file, and the serialized form fits the limits */
bool url::isValid () {
  if (host == NULL) return false;
  int lh = strlen(host);
  return file!=NULL && lh < maxSiteSize
    && lh + strlen(file) + 18 < maxUrlSize;
}

/* print an URL */
void url::print () {
  printf("http://%s:%u%s\n", host, port, file);
}

/* Set depth to max if necessary
 * try to find the ip addr
 * answer false if forbidden by robots.txt, true otherwise
 * Fast-path rejection: reuses the cached dns/robots state of the
 * NamedSite bucket when it already matches this host:port. */
bool url::initOK (url *from) {
#if defined(DEPTHBYSITE) || defined(COOKIES)
  if (strcmp(from->getHost(), host)) { // different site
#ifdef DEPTHBYSITE
    depth = global::depthInSite;
#endif // DEPTHBYSITE
  } else { // same site
#ifdef COOKIES
    // inherit the referrer's cookie within the same site
    if (from->cookie != NULL) {
      cookie = new char[maxCookieSize];
      strcpy(cookie, from->cookie);
    }
#endif // COOKIES
  }
#endif // defined(DEPTHBYSITE) || defined(COOKIES)
  if (depth < 0) {
    errno = tooDeep;
    return false;
  }
  NamedSite *ns = global::namedSiteList + (hostHashCode());
  if (!strcmp(ns->name, host) && ns->port == port) {
    switch (ns->dnsState) {
    case errorDns:
      errno = fastNoDns;
      return false;
    case noConnDns:
      errno = fastNoConn;
      return false;
    case doneDns:
      if (!ns->testRobots(file)) {
        errno = fastRobots;
        return false;
      }
    }
  }
  return true;
}

/* return the base of the url
 * (everything up to and including the last '/'; caller owns the result) */
url *url::giveBase () {
  int i = strlen(file);
  // file always starts with '/', so the backward scan terminates
  assert (file[0] == '/');
  while (file[i] != '/') {
    i--;
  }
  char *newFile = new char[i+2];
  memcpy(newFile, file, i+1);
  newFile[i+1] = 0;
  return new url(newString(host), port, newFile);
}

/** return a char * representation of the url
 * give means that you have to delete the string yourself
 */
char *url::giveUrl () {
  char *tmp;
  int i = strlen(file);
  int j = strlen(host);

  tmp = new char[18+i+j];  // 7 + j + 1 + 9 + i + 1
                           // http://(host):(port)(file)\0
  strcpy(tmp, "http://");
  strcpy (tmp+7, host);
  j += 7;
  if (port != 80) {
    // non-default port: append ":<port>"
    j += sprintf(tmp + j, ":%u", port);
  }
  // Copy file name (backwards, including the terminating '\0' at file[i])
  while (i >= 0) {
    tmp [j+i] = file[i];
    i--;
  }
  return tmp;
}

/** write the url in a buffer
 * buf must be at least of size maxUrlSize
 * returns the size of what has been written (not including '\0')
 */
int url::writeUrl (char *buf) {
  // The default http port (80) is omitted from the representation.
  if (port != 80)
    return sprintf(buf, "http://%s:%u%s", host, port, file);
  return sprintf(buf, "http://%s%s", host, file);
}

/* serialize the url for the Persistent Fifo
 * Format matches the url(char *line) constructor above. */
char *url::serialize () {
  // this buffer is protected by the lock of PersFifo
  static char statstr[maxUrlSize+40+maxCookieSize];
  int pos = sprintf(statstr, "%u ", depth);
#ifdef URL_TAGS
  pos += sprintf(statstr+pos, "%u ", tag);
#endif // URL_TAGS
  pos += sprintf(statstr+pos, "%s:%u%s", host, port, file);
#ifdef COOKIES
  if (cookie != NULL) {
    pos += sprintf(statstr+pos, " %s", cookie);
  }
#endif // COOKIES
  statstr[pos] = '\n';
  statstr[pos+1] = 0;
  return statstr;
}

/* very thread unsafe serialisation in a static buffer */
char *url::getUrl() {
  static char statstr[maxUrlSize+40];
  sprintf(statstr, "http://%s:%u%s", host, port, file);
  return statstr;
}

/* return a hashcode for the host of this url */
uint url::hostHashCode () {
  return siteHashCode (host);
}

/* return a hashcode for this url
 * (mixes port, host and file; reduced modulo hashSize) */
uint url::hashCode () {
  unsigned int h=port;
  unsigned int i=0;
  while (host[i] != 0) {
    h = 31*h + host[i];
    i++;
  }
  i=0;
  while (file[i] != 0) {
    h = 31*h + file[i];
    i++;
  }
  return h % hashSize;
}

/* parses a url : 
 * at the end, arg must have its initial state, 
 * http:// has allready been suppressed
 */
void url::parse (char *arg) {
  int deb = 0, fin = deb;
  // Find the end of host name (put it into lowerCase)
  while (arg[fin] != '/' && arg[fin] != ':' && arg[fin] != 0) {
    fin++;
  }
  if (fin == 0) return;

  // get host name, copied in lower case
  // NOTE(review): the published listing had a garbled loop header
  // ("for (int  i=0; i)"); restored to iterate over the host bytes.
  host = new char[fin+1];
  for (int i=0; i<fin; i++) {
    host[i] = lowerCase(arg[i]);
  }
  host[fin] = 0;

  // get port number
  if (arg[fin] == ':') {
    port = 0;
    fin++;
    while (arg[fin] >= '0' && arg[fin] <= '9') {
      port = port*10 + arg[fin]-'0';
      fin++;
    }
  }

  // get file name
  if (arg[fin] != '/') {
    // www.inria.fr => add the final /
    file = newString("/");
  } else {
    file = newString(arg + fin);
  }
}

/** parse a file with base
 * Resolves a relative url against `base`: absolute paths replace the
 * base's file, others are appended to the base's directory part. */
void url::parseWithBase (char *u, url *base) {
  // cat filebase and file
  if (u[0] == '/') {
    file = newString(u);
  } else {
    uint lenb = strlen(base->file);
    char *tmp = new char[lenb + strlen(u) + 1];
    memcpy(tmp, base->file, lenb);
    strcpy(tmp + lenb, u);
    file = tmp;
  }
  if (!normalize(file)) {
    // bad path: leave the url invalid (host stays NULL)
    delete [] file;
    file = NULL;
    return;
  }
  host = newString(base->host);
  port = base->port;
}

/** normalize file name
 * return true if it is ok, false otherwise (cgi-bin)
 */
bool url::normalize (char *file) {
  return fileNormalize(file);
}

/* Does this url starts with a protocol name
 * (a run of alphanumeric characters followed by ':') */
bool url::isProtocol (char *s) {
  uint i = 0;
  while (isalnum(s[i])) {
    i++;
  }
  return s[i] == ':';
}

#ifdef COOKIES
// Append `s` to the cookie buffer, truncating at maxCookieSize.
#define addToCookie(s) len = strlen(cookie); \
    strncpy(cookie+len, s, maxCookieSize-len); \
    cookie[maxCookieSize-1] = 0;

/* see if a header contain a new cookie
 * Only the part of "Set-Cookie:" before the first ';' is kept. */
void url::addCookie(char *header) {
  if (startWithIgnoreCase("set-cookie: ", header)) {
    char *pos = strchr(header+12, ';');
    if (pos != NULL) {
      int len;
      if (cookie == NULL) {
        cookie = new char[maxCookieSize];
        cookie[0] = 0;
      } else {
        addToCookie("; ");
      }
      // temporarily terminate the header at ';' while copying
      *pos = 0;
      addToCookie(header+12);
      *pos = ';';
    }
  }
}
#endif // COOKIES
View Code

global::namedSiteList

// Global table of sites, indexed by host hash (see siteHashCode):
NamedSite *global::namedSiteList;
namedSiteList = new NamedSite[namedSiteListSize];
// One entry per (hashed) host name; holds the dns state, the robots.txt
// rules and a small fifo of urls waiting for dns/robots resolution.
class NamedSite 
{
    private:
        /* string used for following CNAME chains (just one jump) */
        char *cname;
        /** we've got a good dns answer
        * get the robots.txt */
        void dnsOK ();
        /** Cannot get the inet addr
        * dnsState must have been set properly before the call */
        void dnsErr ();
        /** Delete the old identity of the site */
        void newId ();
        /** put this url in its IPSite */
        void transfer (url *u);
        /** forget this url for this reason */
        void forgetUrl (url *u, FetchError reason);
    public:
        /** Constructor */
        NamedSite ();
        /** Destructor : never used */
        ~NamedSite ();
        /* name of the site */
        char name[maxSiteSize];
        /* port of the site */
        uint16_t port;
        /* numbers of urls in ram for this site */
        uint16_t nburls;
        /* fifo of urls waiting to be fetched */
        url *fifo[maxUrlsBySite];
        uint8_t inFifo;
        uint8_t outFifo;
        void putInFifo(url *u);
        url *getInFifo();
        short fifoLength();
        /** Is this Site in a dnsSites */
        bool isInFifo;
        /** internet addr of this server */
        char dnsState;
        struct in_addr addr;
        uint ipHash;
        /* Date of expiration of dns call and robots.txt fetch */
        time_t dnsTimeout;
        /** test if a file can be fetched thanks to the robots.txt */
        bool testRobots(char *file);
        /* forbidden paths : given by robots.txt */
        Vector<char> forbidden;
        /** Put an url in the fifo
        * If there are too much, put it back in UrlsInternal
        * Never fill totally the fifo => call at least with 1 */
        void putGenericUrl(url *u, int limit, bool prio);
        inline void putUrl (url *u) { putGenericUrl(u, 15, false); }
        inline void putUrlWait (url *u) { putGenericUrl(u, 10, false); }
        inline void putPriorityUrl (url *u) { putGenericUrl(u, 5, true); }
        inline void putPriorityUrlWait (url *u) { putGenericUrl(u, 1, true); }
        /** Init a new dns query */
        void newQuery ();
        /** The dns query ended with success */
        void dnsAns (adns_answer *ans);
        /** we got the robots.txt, transfer what must be in IPSites */
        void robotsResult (FetchError res);
};
View Code
///////////////////////////////////////////////////////////
// class NamedSite
///////////////////////////////////////////////////////////

/** Constructor : initiate fields used by the program
 */
NamedSite::NamedSite () 
{
  name[0] = 0;           // no identity yet
  nburls = 0;
  inFifo = 0; outFifo = 0;  // empty circular fifo
  isInFifo = false;
  dnsState = waitDns;
  cname = NULL;
}

/** Destructor : This one is never used
 */
NamedSite::~NamedSite () {
  assert(false);
}

/* Management of the Fifo (circular buffer of maxUrlsBySite slots) */
void NamedSite::putInFifo(url *u) {
  fifo[inFifo] = u;
  inFifo = (inFifo + 1) % maxUrlsBySite;
  // the fifo must never become completely full (see putGenericUrl limits)
  assert(inFifo!=outFifo);
}

url *NamedSite::getInFifo() {
  assert (inFifo != outFifo);
  url *tmp = fifo[outFifo];
  outFifo = (outFifo + 1) % maxUrlsBySite;
  return tmp;
}

short NamedSite::fifoLength() {
  return (inFifo + maxUrlsBySite - outFifo) % maxUrlsBySite;
}

/* Put an url in the fifo if their are not too many
 * When the site quota (maxUrlsBySite - limit) is exceeded, the url is
 * either forgotten (known-bad dns/robots) or pushed back to a wait fifo;
 * otherwise it is queued on the site or transferred straight to its IPSite. */
void NamedSite::putGenericUrl(url *u, int limit, bool prio) {
  if (nburls > maxUrlsBySite-limit) {
    // Already enough Urls in memory for this Site
    // first check if it can already be forgotten
    if (!strcmp(name, u->getHost())) {
      if (dnsState == errorDns) {
        nburls++;
        forgetUrl(u, noDNS);
        return;
      }
      if (dnsState == noConnDns) {
        nburls++;
        forgetUrl(u, noConnection);
        return;
      }
      if (u->getPort() == port
          && dnsState == doneDns && !testRobots(u->getFile())) {
        nburls++;
        forgetUrl(u, forbiddenRobots);
        return;
      }
    }
    // else put it back in URLsDisk
    refUrl();
    global::inter->getOne();
    if (prio) {
      global::URLsPriorityWait->put(u);
    } else {
      global::URLsDiskWait->put(u);
    }
  } else {
    nburls++;
    if (dnsState == waitDns
        || strcmp(name, u->getHost())
        || port != u->getPort()
        || global::now > dnsTimeout) {
      // dns not done or other site
      putInFifo(u);
      addNamedUrl();
      // Put Site in fifo if not yet in
      if (!isInFifo) {
        isInFifo = true;
        global::dnsSites->put(this);
      }
    } else switch (dnsState) {
    case doneDns:
      transfer(u);
      break;
    case errorDns:
      forgetUrl(u, noDNS);
      break;
    default: // noConnDns
      forgetUrl(u, noConnection);
    }
  }
}

/** Init a new dns query
 * Shortcuts: with a proxy no resolution is needed; a dotted-quad name
 * is converted directly; otherwise an adns query is submitted. */
void NamedSite::newQuery () 
{
    // Update our stats
    newId();
    if (global::proxyAddr != NULL) 
    {
        // we use a proxy, no need to get the sockaddr
        // give anything for going on
        siteSeen();
        siteDNS();
        // Get the robots.txt
        dnsOK();
    } 
    else if (isdigit(name[0])) 
    {
        // the name already in numbers-and-dots notation
        siteSeen();
        if (inet_aton(name, &addr)) 
        {
              // Yes, it is in numbers-and-dots notation
              siteDNS();
              // Get the robots.txt
              dnsOK();
        } 
        else 
        {
            // No, it isn't : this site is a non sense
            dnsState = errorDns;
            dnsErr();
        }
    } 
    else 
    {
        // submit an adns query
        global::nbDnsCalls++;
        adns_query quer = NULL;
        adns_submit(global::ads, name,
                    (adns_rrtype) adns_r_addr,
                    (adns_queryflags) 0,
                    this, &quer);
    }
}

/** The dns query ended with success
 * assert there is a freeConn
 * Follows at most one CNAME jump; on success stores the address and
 * proceeds to fetch robots.txt, otherwise flags a dns error. */
void NamedSite::dnsAns (adns_answer *ans) 
{
    if (ans->status == adns_s_prohibitedcname) 
    {
        if (cname == NULL) 
        {
            // try to find ip for cname of cname
            cname = newString(ans->cname);
            global::nbDnsCalls++;
            adns_query quer = NULL;
            adns_submit(global::ads, cname,
                  (adns_rrtype) adns_r_addr,
                  (adns_queryflags) 0,
                  this, &quer);
        } 
        else 
        {
            // dns chains too long => dns error
            // cf nslookup or host for more information
            siteSeen();
            delete [] cname; cname = NULL;
            dnsState = errorDns;
            dnsErr();
        }
    } 
    else 
    {
        siteSeen();
        if (cname != NULL) 
        { 
            delete [] cname; 
            cname = NULL; 
        }
        if (ans->status != adns_s_ok) 
        {
          // No addr inet
          dnsState = errorDns;
          dnsErr();
        } 
        else 
        {
          siteDNS();
          // compute the new addr
          memcpy (&addr,
                  &ans->rrs.addr->addr.inet.sin_addr,
                  sizeof (struct in_addr));
          // Get the robots.txt
          dnsOK();
        }
    }
}

/** we've got a good dns answer
 * get the robots.txt
 * assert there is a freeConn
 * Builds the HTTP request for /robots.txt (directly or via the proxy)
 * on a free connection; on socket failure records a connection error. */
void NamedSite::dnsOK () {
  Connexion *conn = global::freeConns->get();
  char res = getFds(conn, &addr, port);
  if (res != emptyC) {
    conn->timeout = timeoutPage;
    if (global::proxyAddr != NULL) {
      // use a proxy: request the absolute url
      conn->request.addString("GET http://");
      conn->request.addString(name);
      char tmp[15];
      sprintf(tmp, ":%u", port);
      conn->request.addString(tmp);
      conn->request.addString("/robots.txt HTTP/1.0\r\nHost: ");
    } else {
      // direct connection
      conn->request.addString("GET /robots.txt HTTP/1.0\r\nHost: ");
    }
    conn->request.addString(name);
    conn->request.addString(global::headersRobots);
    conn->parser = new robots(this, conn);
    conn->pos = 0;
    conn->err = success;
    conn->state = res;
  } else {
    // Unable to get a socket
    global::freeConns->put(conn);
    dnsState = noConnDns;
    dnsErr();
  }
}

/** Cannot get the inet addr
 * dnsState must have been set properly before the call
 */
/** Cannot get the inet addr: flush every queued url of this site.
 * dnsState must have been set properly before the call.
 */
void NamedSite::dnsErr () {
  FetchError theErr;
  if (dnsState == errorDns) {
    theErr = noDNS;
  } else {
    theErr = noConnection;
  }
  int ss = fifoLength();
  // scan the queue once
  // (fixed: the loop bound and increment "i<ss; i++" had been lost)
  for (int i=0; i<ss; i++) {
    url *u = getInFifo();
    if (!strcmp(name, u->getHost())) {
      // this url belongs to the failing site: drop it
      delNamedUrl();
      forgetUrl(u, theErr);
    } else { // different name: keep it queued
      putInFifo(u);
    }
  }
  // where should now lie this site
  if (inFifo==outFifo) {
    // queue empty: the site leaves the dns fifo
    isInFifo = false;
  } else {
    // urls for another name remain: requeue for dns resolution
    global::dnsSites->put(this);
  }
}

/** test if a file can be fetched thanks to the robots.txt */
bool NamedSite::testRobots(char *file) {
  uint pos = forbidden.getLength();
  for (uint i=0; i) {
    if (robotsMatch(forbidden[i], file))
      return false;
  }
  return true;
}

/** Delete the old identity of the site */
void NamedSite::newId () {
  // ip expires or new name or just new port
  // Change the identity of this site
#ifndef NDEBUG
  if (name[0] == 0) {
    // slot was unused so far: debug/stat hook counting a new site
    addsite();
  }
#endif // NDEBUG
  // take host and port from the next queued url
  url *u = fifo[outFifo];
  strcpy(name, u->getHost());
  port = u->getPort();
  // the dns answer will be considered valid until this date
  dnsTimeout = global::now + dnsValidTime;
  dnsState = waitDns;
}

/** we got the robots.txt,
 * compute ipHashCode
 * transfer what must be in IPSites
 */
/** We got the robots.txt: compute ipHash and transfer the queued urls
 * of this site to the corresponding IPSite.
 * @param res fetch status of the robots.txt request
 */
void NamedSite::robotsResult (FetchError res) {
  bool ok = res != noConnection;
  if (ok) {
    dnsState = doneDns;
    // compute ip hashcode
    if (global::proxyAddr == NULL) {
      ipHash=0;
      char *s = (char *) &addr;
      for (uint i=0; i<sizeof(struct in_addr); i++) {
        ipHash = ipHash*31 + s[i];
      }
    } else {
      // no ip and need to avoid rapidFire => use hostHashCode
      ipHash = this - global::namedSiteList;
    }
    ipHash %= IPSiteListSize;
  } else {
    dnsState = noConnDns;
  }
  int ss = fifoLength();
  // scan the queue once
  // (fixed: the loop bound and increment "i<ss; i++" had been lost)
  for (int i=0; i<ss; i++) {
    url *u = getInFifo();
    if (!strcmp(name, u->getHost())) {
      delNamedUrl();
      if (ok) {
        if (port == u->getPort()) {
          // same identity: hand the url over to the IPSite layer
          transfer(u);
        } else {
          // same host but other port: keep for a later identity change
          putInFifo(u);
        }
      } else {
        forgetUrl(u, noConnection);
      }
    } else {
      putInFifo(u);
    }
  }
  // where should now lie this site
  if (inFifo==outFifo) {
    isInFifo = false;
  } else {
    global::dnsSites->put(this);
  }  
}

/** Hand an url over to the IPSite layer, unless robots.txt forbids it. */
void NamedSite::transfer (url *u) {
  if (!testRobots(u->getFile())) {
    // the file matches a forbidden prefix: drop the url
    forgetUrl(u, forbiddenRobots);
    return;
  }
  if (global::proxyAddr == NULL) {
    // direct connection: store the resolved address inside the url
    memcpy(&u->addr, &addr, sizeof(struct in_addr));
  }
  global::IPSiteList[ipHash].putUrl(u);
}

void NamedSite::forgetUrl (url *u, FetchError reason) {
  // Drop an url definitively: record the failure, free the url and
  // release one slot in the global inter-module counter.
  urls();                  // debug/stat hook (utils/debug.h)
  fetchFail(u, reason);    // user callback for a failed fetch
  answers(reason);         // debug/stat hook for this kind of answer
  nburls--;
  delete u;
  global::inter->getOne(); // free a slot for the sequencer
}
View Code

 

其中两个类的定义

larbin-2.6.3/src/utils/PersistentFifo.h、larbin-2.6.3/src/utils/PersistentFifo.cc

// Larbin
// Sebastien Ailleret
// 06-01-00 -> 12-06-01

/* this fifo is stored on disk */

#ifndef PERSFIFO_H
#define PERSFIFO_H

#include 
#include 
#include 
#include 
#include 
#include 
#include <string.h>

#include "types.h"
#include "utils/url.h"
#include "utils/text.h"
#include "utils/connexion.h"
#include "utils/mypthread.h"

/** A fifo of urls stored on disk as a set of numbered files sharing a
 *  common prefix: urls are appended to the newest file (wfds) and read
 *  back from the oldest one (rfds); fully consumed files are deleted.
 */
class PersistentFifo 
{
    protected:
        // logical positions: in counts puts, out counts gets (rebased per file)
        uint in, out;
        #ifdef THREAD_OUTPUT
        pthread_mutex_t lock;
        #endif
        // strlen(baseName) + 5: offset of the last digit of the file number
        uint fileNameLength;
        // numbers of the file used for writing (fin) and reading (fout)
        int fin, fout;
        // current file name: prefix followed by a 6-digit number
        char *fileName;

    protected:
        // Make fileName fit with this number
        void makeName(uint nb);
        // Parse the number suffix of a fifo file name
        int getNumber(char *file);
        // Change the file used for reading
        void updateRead ();
        // Change the file used for writing
        void updateWrite ();

    protected:
        // buffer used for writeUrl (buffered output)
        char outbuf[BUF_SIZE];
        // number of chars used in this buffer
        uint outbufPos;
        // buffer used for readLine
        char buf[BUF_SIZE];
        // read position and end of valid data in buf
        uint bufPos, bufEnd;
        // file descriptors (not sockets) for reading and writing
        int rfds, wfds;
    protected:
        // read a line on rfds
        char *readLine ();
        // write an url in the out file (buffered write)
        void writeUrl (char *s);
        // Flush the out Buffer in the outFile
        void flushOut ();

    public:
        PersistentFifo (bool reload, char *baseName);
        ~PersistentFifo ();

        /* get the first object (non totally blocking)
        * return NULL if there is none
        */
        url *tryGet ();

        /* get the first object (non totally blocking)
        * probably crash if there is none
        */
        url *get ();

        /* add an object in the fifo (the fifo takes ownership and deletes it) */
        void put (url *obj);

        /* how many items are there inside ? */
        int getLength ();
};

#endif // PERSFIFO_H
View Code
// Larbin
// Sebastien Ailleret
// 27-05-01 -> 04-01-02

#include <string.h>
#include 
#include 
#include 
#include <string.h>
#include 
#include 

#include "types.h"
#include "global.h"
#include "utils/mypthread.h"
#include "utils/PersistentFifo.h"

/** Build the fifo: either reload the files left by a previous crawl
 *  (reload == true) or start from scratch, deleting any old fifo files.
 *  @param baseName common prefix of the on-disk fifo files
 */
PersistentFifo::PersistentFifo (bool reload, char *baseName) 
{
  // room for the 6-digit number suffix written by makeName (+ terminator)
  fileNameLength = strlen(baseName)+5;
  fileName = new char[fileNameLength+2];
  strcpy(fileName, baseName);
  fileName[fileNameLength+1] = 0;
  outbufPos = 0;
  bufPos = 0;
  bufEnd = 0;
  mypthread_mutex_init(&lock, NULL);
  if (reload) 
  {
    // scan the working directory for files left by the previous crawl
    DIR *dir = opendir(".");
    struct dirent *name;

    fin = -1;
    fout = -1;
    name = readdir(dir);
    while (name != NULL) 
    {
      if (startWith(fileName, name->d_name)) 
      {
        int tmp = getNumber(name->d_name);
        if (fin == -1) 
        {
          fin = tmp;
          fout = tmp;
        } 
        else 
        {
          // fin tracks the highest number (write side),
          // fout the lowest (read side)
          if (tmp > fin)  { fin = tmp; }
          if (tmp < fout) { fout = tmp; }
        }
      }
      name = readdir(dir);
    }
    if (fin == -1) 
    {
      // no previous fifo file found: start numbering at 0
      fin = 0;
      fout = 0;
    }
    if (fin == fout && fin != 0) 
    {
      // only one non-initial file left: not enough state to reload
      cerr << "previous crawl was too little, cannot reload state\n"
           << "please restart larbin with -scratch option\n";
      exit(1);
    }
    closedir(dir);
    // each fully written intermediate file holds urlByFile urls
    in = (fin - fout) * urlByFile;
    out = 0;
    makeName(fin);
    wfds = creat (fileName, S_IRUSR | S_IWUSR);
    makeName(fout);
    rfds = open (fileName, O_RDONLY);
  } 
  else 
  {
    // Delete old fifos
    DIR *dir = opendir(".");
    struct dirent *name;
    name = readdir(dir);
    while (name != NULL) 
    {
      if (startWith(fileName, name->d_name)) 
      {
        unlink(name->d_name);
      }
      name = readdir(dir);
    }
    closedir(dir);

    fin = 0;
    fout = 0;
    in = 0;
    out = 0;
    makeName(0);
    wfds = creat (fileName, S_IRUSR | S_IWUSR);
    rfds = open (fileName, O_RDONLY);
  }
}

/** Release the fifo's resources: both file descriptors and the mutex. */
PersistentFifo::~PersistentFifo () 
{
  close(rfds);
  close(wfds);
  mypthread_mutex_destroy (&lock);
}

/** Pop the oldest url, or return NULL when the fifo is empty. */
url *PersistentFifo::tryGet () 
{
  mypthread_mutex_lock(&lock);
  url *res = NULL;
  if (in != out) 
  {
    // at least one url is stored: read and deserialize it
    res = new url(readLine());
    out++;
    updateRead();
  }
  mypthread_mutex_unlock(&lock);
  return res;
}

/** Pop the oldest url. Assumes the fifo is non-empty
 *  (readLine keeps retrying on an empty file — see header comment). */
url *PersistentFifo::get () 
{
  mypthread_mutex_lock(&lock);
  url *res = new url(readLine());
  ++out;
  updateRead();
  mypthread_mutex_unlock(&lock);
  return res;
}

/** Put something in the fifo
 * The objet is then deleted
 */
/** Append an url to the fifo.
 *  The fifo takes ownership: the object is deleted after serialization. */
void PersistentFifo::put (url *obj) 
{
  mypthread_mutex_lock(&lock);
  // serialize() returns a statically allocated string
  writeUrl(obj->serialize());
  ++in;
  updateWrite();
  mypthread_mutex_unlock(&lock);
  delete obj;
}
/** Number of urls currently stored: puts minus gets. */
int PersistentFifo::getLength () 
{
  int stored = in - out;
  return stored;
}

/** Write the zero-padded 6-digit file number into the last six
 *  characters of fileName (positions fileNameLength-5 .. fileNameLength).
 *  Fixed: the index is now signed — with a uint counter, i-- past 0
 *  would wrap around and loop forever whenever fileNameLength < 5
 *  (impossible with a non-empty baseName, but a latent trap).
 */
void PersistentFifo::makeName (uint nb) 
{
  for (int i=(int)fileNameLength; i>=(int)fileNameLength-5; i--) 
  {
    fileName[i] = (nb % 10) + '0';
    nb /= 10;
  }
}

/** Parse the 6-digit numeric suffix of a fifo file name.
 *  Fixed: len is now signed and guarded — with uint arithmetic,
 *  len-6 would wrap for a name shorter than six characters and the
 *  loop would read far out of bounds.
 */
int PersistentFifo::getNumber (char *file) 
{
  int len = strlen(file);
  assert(len >= 6);
  int res = 0;
  for (int i=len-6; i<=len-1; i++) 
  {
    res = (res * 10) + file[i] - '0';
  }
  return res;
}

/** Called after each get: once a whole file (urlByFile urls) has been
 *  consumed, delete it and switch reading to the next numbered file.
 */
void PersistentFifo::updateRead () 
{
  if ((out % urlByFile) == 0) 
  {
    close(rfds);
    makeName(fout);
    unlink(fileName);          // the consumed file is no longer needed
    makeName(++fout);
    rfds = open(fileName, O_RDONLY);
    in -= out;                 // rebase counters so out restarts at 0
    out = 0;
    assert(bufPos == bufEnd);  // read buffer must be empty at a file boundary
  }
}

/** Called after each put: once the current output file holds urlByFile
 *  urls, flush it and switch writing to a fresh numbered file.
 */
void PersistentFifo::updateWrite () 
{
  if ((in % urlByFile) == 0) 
  {
    flushOut();
    close(wfds);
    makeName(++fin);
    wfds = creat(fileName, S_IRUSR | S_IWUSR);
#ifdef RELOAD
    // checkpoint the hashtables so a later crawl can reload this state
    global::seen->save();
#ifdef NO_DUP
    global::hDuplicate->save();
#endif
#endif
  }
}

/* read a line from the file
 * uses a buffer
 */
/* Read one '\n'-terminated line from rfds through an internal buffer.
 * Returns a pointer into buf, NUL-terminated at the newline; the data
 * is only valid until the next call.
 */
char *PersistentFifo::readLine () {
  if (bufPos == bufEnd) {
    // buffer fully consumed: restart from the beginning
    bufPos = 0; bufEnd = 0; buf[0] = 0;
  }
  char *posn = strchr(buf + bufPos, '\n');
  while (posn == NULL) {
    if (!(bufEnd - bufPos < maxUrlSize + 40 + maxCookieSize)) {
      // line longer than any legal serialized url: report it.
      // Fixed: the data was previously passed as the printf FORMAT
      // string — a format-string bug when a url contains '%'.
      printf("%s", fileName);
      printf("%s", buf+bufPos);
    }
    if (bufPos*2 > BUF_SIZE) {
      // reclaim the consumed front half of the buffer
      bufEnd -= bufPos;
      memmove(buf, buf+bufPos, bufEnd);
      bufPos = 0;
    }
    int postmp = bufEnd;
    bool noRead = true;
    while (noRead) {
      int rd = read(rfds, buf+bufEnd, BUF_SIZE-1-bufEnd);
      switch (rd) {
      case 0 :
        // We need to flush the output in order to read it
        flushOut();
        break;
      case -1 :
        // We have a trouble here
        if (errno != EINTR) {
          cerr << "Big Problem while reading (persistentFifo.h)\n";
          perror("reason");
          assert(false);
        } else {
          perror("Warning in PersistentFifo: ");
        }
        break;
      default:
        noRead = false;
        bufEnd += rd;
        buf[bufEnd] = 0;   // keep buf NUL-terminated for strchr
        break;
      }
    }
    posn = strchr(buf + postmp, '\n');
  }
  *posn = 0;               // cut the line at the newline
  char *res = buf + bufPos;
  bufPos = posn + 1 - buf;
  return res;
}

// write an url in the out file (buffered write)
// write an url in the out file (buffered write)
void PersistentFifo::writeUrl (char *s) {
  size_t len = strlen(s);
  assert(len < maxUrlSize + 40 + maxCookieSize);
  if (outbufPos + len >= BUF_SIZE) {
    // not enough room left: drain the buffer first (resets outbufPos to 0)
    flushOut ();
  }
  memcpy(outbuf + outbufPos, s, len);
  outbufPos += len;
}

// Flush the out Buffer in the outFile
// Flush the out Buffer in the outFile
void PersistentFifo::flushOut () {
  // ecrireBuff (project utils) writes outbufPos bytes of outbuf to wfds
  ecrireBuff (wfds, outbuf, outbufPos);
  outbufPos = 0;
}
View Code

Larbin-2.6.3/src/utils/syncFifo.h

// Larbin
// Sebastien Ailleret
// 09-11-99 -> 07-12-01

/* fifo in RAM with synchronisations */

#ifndef SYNCFIFO_H
#define SYNCFIFO_H

#define std_size 100

#include "utils/mypthread.h"

template <class T>
class SyncFifo {
 protected:
  // ring-buffer positions: in = next write slot, out = next read slot
  uint in, out;
  // current capacity of tab (doubled by put() when the fifo fills up)
  uint size;
  // circular array of (non-owned) pointers
  T **tab;
#ifdef THREAD_OUTPUT
  pthread_mutex_t lock;
  pthread_cond_t nonEmpty;
#endif

 public:
  /* Specific constructor */
  SyncFifo (uint size = std_size);

  /* Destructor (does not delete the stored objects) */
  ~SyncFifo ();

  /* get the first object (blocks while the fifo is empty) */
  T *get ();

  /* get the first object (non totally blocking)
   * return NULL if there is none
   */
  T *tryGet ();

  /* add an object in the Fifo */
  void put (T *obj);

  /* how many items are there inside ? */
  int getLength ();
};

/** Build an empty fifo with room for `size` pointers.
 *  Fixed: out-of-class template definitions require the SyncFifo<T>::
 *  qualifier, which had been lost (the bare SyncFifo:: does not compile).
 */
template <class T>
SyncFifo<T>::SyncFifo (uint size) {
  tab = new T*[size];
  this->size = size;
  in = 0;
  out = 0;
  mypthread_mutex_init (&lock, NULL);
  mypthread_cond_init (&nonEmpty, NULL);
}

/** Free the pointer array (the stored objects are NOT deleted).
 *  Fixed: restored the SyncFifo<T>:: qualifier lost in extraction.
 */
template <class T>
SyncFifo<T>::~SyncFifo () {
  delete [] tab;
  mypthread_mutex_destroy (&lock);
  mypthread_cond_destroy (&nonEmpty);
}

template <class T>
T *SyncFifo::get () {
  T *tmp;
  mypthread_mutex_lock(&lock);
  mypthread_cond_wait(in == out, &nonEmpty, &lock);
  tmp = tab[out];
  out = (out + 1) % size;
  mypthread_mutex_unlock(&lock);
  return tmp;
}

template <class T>
T *SyncFifo::tryGet () {
  T *tmp = NULL;
  mypthread_mutex_lock(&lock);
  if (in != out) {
    // The stack is not empty
    tmp = tab[out];
    out = (out + 1) % size;
  }
  mypthread_mutex_unlock(&lock);
  return tmp;
}

template <class T>
void SyncFifo::put (T *obj) {
  mypthread_mutex_lock(&lock);
  tab[in] = obj;
  if (in == out) {
    mypthread_cond_broadcast(&nonEmpty);
  }
  in = (in + 1) % size;
  if (in == out) {
    T **tmp;
    tmp = new T*[2*size];
    for (uint i=out; i) {
      tmp[i] = tab[i];
    }
    for (uint i=0; i<in; i++) {
      tmp[i+size] = tab[i];
    }
    in += size;
    size *= 2;
    delete [] tab;
    tab = tmp;
  }
  mypthread_mutex_unlock(&lock);
}

template <class T>
int SyncFifo::getLength () {
  int tmp;
  mypthread_mutex_lock(&lock);
  tmp = (in + size - out) % size;
  mypthread_mutex_unlock(&lock);
  return tmp;
}

#endif // SYNCFIFO_H
View Code

 

你可能感兴趣的:(爬虫Larbin解析(二)——sequencer())