// Larbin // Sebastien Ailleret // 07-03-00 -> 07-03-00 #include <string.h> #include <unistd.h> #include <iostream.h> #include <netdb.h> #include <sys/socket.h> #include <sys/types.h> #include <sys/wait.h> #include <arpa/inet.h> #include <fcntl.h> #include <sys/time.h> #include <sys/resource.h> #include "global.h" #include "xfetcher/fetchOpen.h" #include "xutils/text.h" #include "xutils/connexion.h" #include "xutils/debug.h" static void readAll (int fds); static void writeRes (int fds); static char *startDate; /////////////////////////////////////////////////////////////////////////////////////// // //函数功能:提供远程通过web查询,获取爬虫状态的信息 //参数:无 //返回值:void // /////////////////////////////////////////////////////////////////////////////////////// void webserver () { // bind the socket int fds; int nAllowReuse = 1; struct sockaddr_in addr; time_t now = time(NULL); startDate = newString(ctime(&now)); //获得当前的时间 bzero (&addr, sizeof(addr)); addr.sin_addr.s_addr = INADDR_ANY; addr.sin_family = AF_INET; addr.sin_port = htons(global::httpPort); //httpPort 用通过web看Larbin抓取的统计情况的接口 //以服务器方式建立TCP连接 if ((fds = socket(AF_INET, SOCK_STREAM, 0)) == -1 || setsockopt(fds, SOL_SOCKET, SO_REUSEADDR, (char*)&nAllowReuse, sizeof(nAllowReuse)) || bind(fds, (struct sockaddr *) &addr, sizeof(addr)) != 0 || listen(fds, 4) != 0) { cerr << "Unable to get the socket/n"; exit(1); } // answer requests for (;;) //这里是一个循环,来支持多次的web查询 { struct sockaddr_in addrc; int fdc; uint len = sizeof(addr); fdc = accept(fds, (struct sockaddr *) &addrc, &len); if (fdc == -1) { //accept 失败 cerr << "Trouble with web server.../n"; } else { readAll(fdc); writeRes(fdc); }//end if_else }//end for } /////////////////////////////////////////////////////////////// // //函数功能:处理发送来的请求 //参数:int fds 创建的套接字 //返回值:void //注:没看出该函数实现了什么功能 // //////////////////////////////////////////////////////////////////// static void readAll (int fds) { char c; int cont = 2; while (cont) { if (read(fds, &c, 1) == 1) { //如果收到的数据等于1,做下面处理 switch (c) { case '/r' : break; case '/n' : cont--; break; default : cont = 2; break; }//end switch } else { //将所有接收的数据都读取完毕后,cont置零,readAll函数结束,接着调用writeRes cont = 0; }//end if_else }//end while } /////////////////////////////////////////////////////////////////////////// // //函数功能:向远程发送数据 //参数:int fds 套接字 //返回值:void // /////////////////////////////////////////////////////////////////////////// static void writeRes (int fds) { crash("Answer to a web query"); // headers and html tags ecrire(fds, "HTTP/1.0 200 OK/r/nServer: Larbin/r/nContent-type: text/html/r/n/r/n<html>/n<head>/n<title>Larbin real time statistic</title>/n</head>/n<body bgcolor=/"#FFFFFF/">/n<center><h1>Larbin is up and running !</h1></center>/n"); ecrire(fds, "/nStart date : "); ecrire(fds, startDate); ecrire(fds, "/n<br>Current date : "); time_t now = time(NULL); ecrire(fds, newString(ctime(&now))); #ifndef NOSTATS if (global::isSpecific) { ecrire(fds, "/n<h2>Interesting pages ("); ecrire(fds, global::contentType); ecrire(fds, ") :</h2>/ntotal Fetched (success) : "); ecrireInt(fds, interestingPage); ecrire(fds, "/n<br>total Fetched (error or success) : "); ecrireInt(fds, interestingSeen); if (global::privilegedExt != NULL) { ecrire(fds, "/n<br>privileged links seen ("); ecrire(fds, global::privilegedExt); ecrire(fds, ") : "); ecrireInt(fds, interestingExtension); ecrire(fds, "/n<br>privileged links fetched : "); ecrireInt(fds, extensionTreated); } } ecrire(fds, "/n<h2>Pages :</h2>/nurls treated : "); ecrireInt(fds, urls); ecrire(fds, "/n<br>forbiddenRobots : "); ecrireInt(fds, answers[forbiddenRobots]); ecrire(fds, "/n<br>noDNS : "); ecrireInt(fds, answers[noDNS]); ecrire(fds, "/n<br>/n<br>Pages : "); ecrireInt(fds, pages); ecrire(fds, "/n<br>Success : "); ecrireInt(fds, answers[success]); ecrire(fds, "/n<br>no Connection : "); ecrireInt(fds, answers[noConnection]); ecrire(fds, "/n<br>early stop : "); ecrireInt(fds, answers[earlyStop]); ecrire(fds, "/n<br>timeout : "); ecrireInt(fds, answers[timeout]); ecrire(fds, "/n<br>badType : "); ecrireInt(fds, answers[badType]); ecrire(fds, "/n<br>tooBig : "); ecrireInt(fds, answers[tooBig]); ecrire(fds, "/n<br>err40X : "); ecrireInt(fds, answers[err40X]); ecrire(fds, "/n<br>/n<br>urls accepted : "); ecrireInt(fds, hashUrls); ecrire(fds, " / "); ecrireInt(fds, hashSize); ecrire(fds, "/n<h2>Sites seen (dns call done) :</h2>/ntotal number : "); ecrireInt(fds, siteSeen); ecrire(fds, " (+"); ecrireInt(fds, nbCalls); ecrire(fds, ")/n<br>with dns : "); ecrireInt(fds, siteDNS); ecrire(fds, "/n<br>with robots.txt : "); ecrireInt(fds, siteRobots); ecrire(fds, "/n<br>with good robots.txt : "); ecrireInt(fds, robotsOK); ecrire(fds, "/n<h2>Fifos :</h2>/nurls on disk : "); ecrireInt(fds, global::URLsInternal->getLength()); ecrire(fds, "/n<br>sites with ip addr and something to fetch : "); ecrireInt(fds, global::okSites->getLength() + global::nb_conn - global::freeConns->getLength()); ecrire(fds, "/n<br>sites without ip addr yet : "); ecrireInt(fds, global::dnsSites->getLength()); #endif // NOSTATS #ifndef NDEBUG ecrire(fds, "/n<h2>Ressources Sharing :</h2>/nconnexions in use : "); ecrireInt(fds, global::nb_conn - global::freeConns->getLength() - global::userConns->getLength()); ecrire(fds, "/n<br>connexions waiting user processing : "); ecrireInt(fds, global::userConns->getLength()); ecrire(fds, "/n<br>free connexions : "); ecrireInt(fds, global::freeConns->getLength()); ecrire(fds, "/n<br>parsers : "); ecrireInt(fds, debPars); ecrire(fds, "/n<br>sites in ram : "); ecrireInt(fds, sites); ecrire(fds, "/n<br>urls in ram : "); ecrireInt(fds, debUrl); ecrire(fds, "/n<h2>State of threads :</h2>/nstateBlock : "); ecrireInt(fds, stateBlock); ecrire(fds, "/n<br>stateNonBlock : "); ecrireInt(fds, stateNonBlock); ecrire(fds, "/n<br>statePipe : "); ecrireInt(fds, statePipe); ecrire(fds, "/n<h2>/proc/self/status :</h2>/n<pre>/n"); int status = open("/proc/self/status", O_RDONLY); char *file = readfile(status); ecrire(fds, file); delete [] file; close(status); ecrire(fds, "</pre>"); #endif // NDEBUG // end of page and kill the connexion ecrire(fds, "/n<hr>/n<A HREF="/" mce_HREF="/""http://pauillac.inria.fr/~ailleret//"><img SRC="/" mce_SRC="/""http://pauillac.inria.fr/~ailleret/seb.gif/" ALT=/"ma photo/"></A>/n<A HREF="/" mce_HREF="/""mailto:[email protected]/">[email protected]</A>/n</body>/n</html>"); shutdown(fds, 2); close(fds); }