// Writing a web crawler in C++ code

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <winsock2.h>
#include <windows.h>
#pragma comment(lib, "ws2_32.lib")
/*
 * Return a pseudo-random integer in the range [1, n].
 *
 * n: upper bound (inclusive) of the result; expected to be positive.
 *
 * For n <= 0 the function returns 1 instead of evaluating rand() % n,
 * because a modulo by zero is undefined behaviour.
 *
 * The value comes from rand(); this function never calls srand().
 */
int getRand(int n)
{
	if (n <= 0)
	{
		return 1;
	}
	return rand() % n + 1;
}
int main()
{
	int j = 1;
	for(j = 1; j <= 12; j++) // 博主的博文有12个列表
	{
		int flag = 0;
		WSADATA wsaData;
		WSAStartup(MAKEWORD(1,1), &wsaData);
		char szWeb[] = "blog.sina.com.cn";  // 新浪博客
		HOSTENT *pHost = gethostbyname(szWeb);
		const char* pIPAddr = inet_ntoa(*((struct in_addr *)pHost->h_addr)) ;
		printf("web server ip is : %s\n", pIPAddr);
	SOCKADDR_IN  webServerAddr;
	webServerAddr.sin_family = AF_INET;
	webServerAddr.sin_addr.S_un.S_addr=inet_addr(pIPAddr);
	webServerAddr.sin_port = htons(80);
	SOCKET sockClient = socket(AF_INET, SOCK_STREAM, 0);
	int nRet = connect(sockClient ,(struct sockaddr*)&webServerAddr, sizeof(webServerAddr));
	if(nRet < 0)
	{
		printf("connect error\n");
		return 1;
	}
	// 该博主博客列表访问格式/s/articlelist_5890965060_0_%d.html
	char szHttpRest[1024] = {0};
	sprintf(szHttpRest, "GET /s/articlelist_5890965060_0_%d.html HTTP/1.1\r\nHost:%s\r\nConnection: Keep-Alive\r\n\r\n", j, szWeb);
	printf("send buf is:\n");
	printf("%s\n", szHttpRest);
	nRet = send(sockClient , szHttpRest, strlen(szHttpRest) + 1, 0);
	if(nRet < 0)
	{
		printf("send error\n");
		return 1;
	}
	FILE *fp = fopen("test.txt", "a+");
	while(1)
	{
	    char szRecvBuf[2] = {0};
		nRet = recv(sockClient ,szRecvBuf, 1 ,0);
		if(nRet < 0)
		{
			printf("recv error\n");
			goto LABEL;
		}			
		if(0 == nRet)
		{
			printf("connection has been closed by web server\n");
			goto LABEL;
		}
		if(0 == flag)
		{
			printf("writing data to file...\n");
			flag = 1;
		}
		fputc(szRecvBuf[0], fp);
	}
LABEL: 
	fclose(fp);
	closesocket(sockClient);  
	WSACleanup();  
	printf("list index is ------------------------------------> %d\n\n\n", j);
	Sleep(1000 * getRand(1));
}
printf("\n\n\ndone!!!!!!\n\n\n");
return 0;

}

// You may also be interested in: (Writing a web crawler in C++ code)