给定一个入口网站:先发送 HTTP 请求头并接收返回的内容,把内容放入以 URL 命名的 txt 文件中;然后再把该 URL 加入到已搜索过的链表中,并放入已搜索 URL 的 txt 文件中;接着分析 HTML 内容,找出其中的超链接,把超链接放入待搜索队列中;最后循环以上步骤,直到待搜索队列没有内容。
编译环境:Visual Studio
#include
#include
#include
#include
#include
#include
#include
#include
#pragma comment(lib,"ws2_32.lib")
using namespace std;
// TCP port used when connecting to the target host (passed to htons in SendHttp).
#define DEFUAL_PORT 80
// Not referenced by live code in the visible source; only a commented-out
// fixed-size array declaration in SendHttp mentions it.
#define DEFUAL_ARRAY_SIZE 1048576
// Entry URL that main() starts from.
#define URL "http://www.baidu.com/"
#define SAVE_HTML_DATA_DIRECTORY "./html"// Directory in which the downloaded HTML text files are stored
// Copies the characters of HostUrl up to (but not including) the first
// occurrence of Sign, or up to the terminating '\0' if Sign does not occur.
//
// HostUrl : in/out cursor. On return it points at the Sign character that
//           stopped the copy, or at the terminating '\0'.
// Sign    : delimiter character that ends the copy.
// Returns : a NUL-terminated buffer allocated with new[] (release it with
//           delete[]), or NULL when HostUrl is NULL.
char *InterceptString(char *&HostUrl,char Sign)
{
    if(HostUrl == NULL)
        return NULL;

    const size_t GrowBy = 100;
    size_t capacity = GrowBy;
    size_t length = 0;
    char *buffer = new char[capacity];

    while(*HostUrl != Sign && *HostUrl != '\0')
    {
        // Always keep one free slot for the terminating '\0'.
        if(length + 1 == capacity)
        {
            const size_t biggerCapacity = capacity + GrowBy;
            char *bigger = new char[biggerCapacity];
            memcpy(bigger,buffer,length);
            // The buffer came from new[], so it must be released with delete[]
            // (the previous free() call was undefined behaviour).
            delete[] buffer;
            buffer = bigger;
            capacity = biggerCapacity;
        }
        buffer[length++] = *HostUrl++;
    }
    buffer[length] = '\0';
    return buffer;
}
// Splits a URL into its host part and its resource (path) part.
//
// HostUrl  : URL text, optionally starting with "http://".
// resource : receives a new[]-allocated copy of everything from the first '/'
//            after the host to the end of the URL, or "/" when the URL has no
//            path. Release with delete[].
// host     : receives a new[]-allocated copy of the host name. Release with
//            delete[].
// Returns  : true when both outputs were filled in; false when HostUrl is
//            NULL or the host does not start with 'w'. On false the outputs
//            hold nothing the caller has to release.
bool ParseUrl(char *HostUrl,char *&resource,char *&host)
{
    if(HostUrl == NULL)
        return false;

    // Skip the scheme only when it really is a prefix; strstr() would also
    // match "http://" in the middle of the URL and skip the wrong 7 characters.
    if(strncmp(HostUrl,"http://",7) == 0)
        HostUrl += 7;

    // Only hosts that begin with 'w' (e.g. "www...") are accepted; anything
    // else is treated as malformed.
    if(*HostUrl != 'w')
        return false;

    // Separate host and resource. InterceptString leaves HostUrl on the '/'
    // that ended the host, or on the terminating '\0'.
    host = InterceptString(HostUrl,'/');
    if(host == NULL)
        return false;

    if(*HostUrl == '\0'){
        // No path given: request the site root.
        resource = new char[2];
        resource[0] = '/';
        resource[1] = '\0';
    }
    else{
        resource = InterceptString(HostUrl,'\0');
        if(resource == NULL){
            delete[] host;
            host = NULL;
            return false;
        }
    }
    // The success path previously fell off the end of the function without
    // returning a value, which is undefined behaviour.
    return true;
}
//发送http请求报文
bool SendHttp(char *HostUrl,char *&Htmlresource,int &ByteRead)
{
char *resource = NULL;
char *host = NULL;
if( !ParseUrl(HostUrl,resource,host) ){
cout<< "Parse Url fail !" << endl;
return false;
}
//创建套接字
SOCKET sock = socket(AF_INET,SOCK_STREAM,IPPROTO_TCP);
if(sock == -1 || sock == -2){
cout<<"socket error ! error number === "<
}
//通过域名得到对应ip地址(此处需要联网)
hostent *p_TargetHost_ip = gethostbyname(host);
if(p_TargetHost_ip == NULL){
cout<<"gethostbyname error ! error number === "<
}
sockaddr_in TargetHost;
TargetHost.sin_family = AF_INET;
TargetHost.sin_port = htons(DEFUAL_PORT);
TargetHost.sin_addr.s_addr = *(u_long*)p_TargetHost_ip->h_addr_list[0];
if( connect(sock,(sockaddr*)&TargetHost,sizeof(TargetHost)) < 0){
cout<<"connect error ! error number === "<
}
//定义发送报文内容
char message[1024] = {0};//此处不应该是固定大小--------------------
char *StringInputFormat = "GET %s HTTP/1.1\r\nHost:%s\r\nConnection:Close\r\n\r\n" ;
sprintf_s(message,1024,StringInputFormat,resource,host);
//发送请求包
size_t sent = 0;
int tmpress;
while(sent < strlen(message)){
tmpress = send(sock,message+sent,strlen(message)-sent,0);
if(tmpress == SOCKET_ERROR){
cout<< "send message fail ! error ===" << GetLastError() << endl;
return false;
}
sent += tmpress;
}
//接收回复包
//char arr[DEFUAL_ARRAY_SIZE] = {0};//错误 数组不能定义这么大,所以采用new的方式
//另外直接开固定数组也容易越界
int Message_Byte = 100000;//这里应该变成可变的大小-----------
char *ReplyMsg = new char[Message_Byte];
memset(ReplyMsg , 0 ,Message_Byte);
ByteRead = 0;
int rempress = 1;
cout << "Read :" ;
while(rempress > 0){
rempress = recv(sock,ReplyMsg+ByteRead,Message_Byte-ByteRead,0);
if(rempress > 0){
ByteRead += rempress;
}
cout << rempress << " ";
}
ReplyMsg[ByteRead] = 0;
//cout<
Htmlresource = ReplyMsg;
free(resource);
resource = NULL;
free(host);
host = NULL;
closesocket(sock);
return true;
}
// Converts a URL into a file name by dropping every character that is not
// allowed in a Windows file name and appending ".txt".
//
// Url     : NUL-terminated URL text; it is not modified.
// Returns : a new[]-allocated, NUL-terminated file name (release it with
//           delete[]), or NULL when Url is NULL.
//
// Besides ':' and '/', the characters < > " \ | ? * and control characters
// are removed too; a URL with a query string ("...?a=b") would otherwise
// yield a name that cannot be created on Windows.
// TODO (from the original author): take an extra flag that decides whether
// the ".txt" extension is appended.
char *UrlTranformateFilename(char *Url)
{
    if(Url == NULL)
        return NULL;

    const char *ForbiddenChars = "<>:\"/\\|?*";
    const char *Extension = ".txt";
    const size_t ExtensionLength = strlen(Extension);

    // Worst case: every URL character is kept, plus extension and terminator.
    char *Filename = new char[strlen(Url) + ExtensionLength + 1];
    char *out = Filename;
    for(const char *in = Url; *in != '\0'; ++in){
        const bool isControl = static_cast<unsigned char>(*in) < 0x20;
        if(!isControl && strchr(ForbiddenChars,*in) == NULL)
            *out++ = *in;
    }
    // Copies the extension together with its terminating '\0'.
    memcpy(out,Extension,ExtensionLength + 1);
    return Filename;
}
//将得到的内容放入txt文件中
bool InputFile(char *&Url,char *&Htmlresource)
{
char *Filename = NULL;
if(!(Filename = UrlTranformateFilename(Url))){//将URL转化为文件名
cout<< "UrlTranformateFilename fail ! Url ====" <
}
//通过文件名加上路径创建txt文件
char *path = SAVE_HTML_DATA_DIRECTORY;
char *Filepath = new char[strlen(Filename) +strlen(path)+2];
strcpy_s(Filepath,strlen(Filename)+strlen(path)+2,path);
strcat_s(Filepath,strlen(Filename)+strlen(path)+2,"/");
strcat_s(Filepath,strlen(Filename)+strlen(path)+2,Filename);
cout<
ofstream File(Filepath);//创建txt文件
if(File.is_open()){//打开文件,这种打开方式会清空原文件内容,重新写入
File<
}
//及时删除,防止内存泄漏
free(Filepath);
Filepath = NULL;
return true;
}
//解析HTML内容
bool ParseHtml(char *Htmlresource,queue
{
//查找html中body 得到其中的body内容
char *str_body = " char *Html = NULL;
if(!(Html = strstr(Htmlresource,str_body))){
cout<< "this Html error ! " << endl;
return false;
}
//分析html内容 找到其中的超链 如果不在已搜索队列中,加入待搜索队列
char *str_target = "href=\"";
char *str_hyperlink = NULL;
if(!(str_hyperlink = strstr(Html,str_target))){
cout<< "this html no hyperlink !" << endl;
}
while(str_hyperlink){
str_hyperlink += strlen(str_target);
char *str = strstr(str_hyperlink,"\"");
char *Url = InterceptString(str_hyperlink,'"');
NotSearchQueue.push(Url);//此处应该判断是否遍历过
str_hyperlink = strstr(str_hyperlink,str_target);
}
return true;
}
void BFS(char *Url,queue
{
//先发送http得到其中资源
char *Htmlresource = NULL;
int Byte = 0;
if(!SendHttp(Url,Htmlresource,Byte)){
cout<< "SendHttp fail ! return !" <
}
if(!InputFile(Url,Htmlresource)){//输入到文件失败不需要返回false 还可以继续爬
cout<< "InputFile fail ! ignore !"<
//cout<
//解析html
if(!ParseHtml(Htmlresource,SearchQueue,NotSearchQueue)){
cout<<"ParseHtml fail ! return "<
}
}
bool Init()
{
WSADATA wsaData;
if( WSAStartup( MAKEWORD(2,2),&wsaData) != 0)
{
printf("WSAStartup error ! error number === %d \n",GetLastError());
return false;
}
//创建存储html文本和html中图片的文件夹
CreateDirectory(TEXT(SAVE_HTML_DATA_DIRECTORY),NULL);
return true;
}
void Print(queue
{
while(!NotSearchQueue.empty()){
cout<
}
}
int main(int argc,char *argv[])
{
if(!Init()){
cout<< "Init() fail ! " <
}
char *UrlStart = URL;
cout<
queue
BFS(UrlStart,SearchQueue,NotSearchQueue);
SearchQueue.push(UrlStart);
//Print(NotSearchQueue);
//当未搜索队列不为空时候,继续
while(!NotSearchQueue.empty()){
cout<
SearchQueue.push(NotSearchQueue.front());
NotSearchQueue.pop();
}
WSACleanup();
return 0;
}