网络爬虫需要从指定的URL通过HTTP协议来获得HTML文件信息,以此从一个URL爬到另一个URL。在Windows平台上,这往往通过WinINet接口实现。
但是,如果对HTTP协议熟悉的话,也可以通过Winsock接口实现。代码如下。
1
// Silence MSVC warning C4996 (raised for the CRT calls such as strcpy/sprintf
// that this file uses).
#pragma warning (disable: 4996)

// URL used when the user submits an empty line at the prompt.
// A preprocessor directive must fit on one logical line, so the macro name
// and its replacement text are kept together here.
#define DEFAULT_URL "http://www.google.com"
4
5
/**
 * Starts up Winsock for this process.
 *
 * The requested version word is built as MAKEWORD(highVer, lowVer), i.e.
 * highVer lands in the low-order byte and lowVer in the high-order byte
 * (NOTE(review): per the WSAStartup documentation that makes highVer the
 * major and lowVer the minor version - confirm against callers).
 *
 * @return TRUE when WSAStartup returns 0, FALSE otherwise.
 */
BOOL WinsockStartup(BYTE highVer, BYTE lowVer)
{
    WSADATA startupInfo;
    const WORD wRequested = MAKEWORD(highVer, lowVer);

    if (WSAStartup(wRequested, &startupInfo) != 0)
    {
        return FALSE;
    }
    return TRUE;
}
10
11
/**
 * Sends the NUL-terminated string `data` (without its terminator) on the
 * connected socket `s`.
 *
 * send() is allowed to accept fewer bytes than requested, so the call is
 * repeated until the whole string has been handed to the stack.
 *
 * @param s     connected socket.
 * @param data  NUL-terminated text to transmit; must not be NULL.
 * @return number of bytes sent (the string length on full success), or
 *         SOCKET_ERROR if send() fails; details via WSAGetLastError().
 */
int SendData(SOCKET s, char *data)
{
    int remaining = (int)strlen(data);
    int total = 0;

    while (remaining > 0)
    {
        int sent = send(s, data + total, remaining, 0);
        if (sent == SOCKET_ERROR)
        {
            return SOCKET_ERROR;
        }
        if (sent == 0)
        {
            // No progress; stop instead of spinning forever.
            break;
        }
        total += sent;
        remaining -= sent;
    }
    return total;
}
15
16
/**
 * Extracts the host part of a URL.
 *
 * A leading "http://" (case-sensitive) is skipped if present; the host is
 * everything from there up to, but not including, the first '/' - or the
 * rest of the string when there is no '/'. Any ":port" suffix is left in
 * the result.
 *
 * @param pszURL       NUL-terminated URL; must not be NULL. Not modified.
 * @param pszHostName  receives the host as a NUL-terminated string. The
 *                     caller must supply room for strlen(pszURL) + 1 bytes.
 */
void ParseTheURL(char *pszURL, char *pszHostName)
{
    static const char szScheme[] = "http://";
    const size_t cchScheme = sizeof(szScheme) - 1;
    const char *pHostStart = pszURL;
    const char *pHostEnd;
    size_t cchHost;

    if (strncmp(pszURL, szScheme, cchScheme) == 0)
    {
        pHostStart += cchScheme;
    }

    pHostEnd = strchr(pHostStart, '/');
    if (pHostEnd != NULL)
    {
        cchHost = (size_t)(pHostEnd - pHostStart);
    }
    else
    {
        cchHost = strlen(pHostStart);
    }

    memcpy(pszHostName, pHostStart, cchHost);
    // Terminate explicitly so the result is a valid string even when the
    // destination was not zero-filled or previously held a longer host.
    pszHostName[cchHost] = '\0';
}
39
40
/**
 * Entry point: fetches one URL over a raw Winsock TCP connection and prints
 * the HTTP response.
 *
 * Steps: start Winsock 2.2, read a URL from stdin (DEFAULT_URL when the line
 * is empty), resolve the host on port 80, connect to the first address that
 * accepts, send a GET request with "Connection:Close", echo the response
 * headers, then echo the body and print the number of body bytes received.
 *
 * Each exit path waits for one line of input before returning so the
 * console output stays visible.
 *
 * @return 0 on success; -1 if Winsock start-up, name resolution, connecting
 *         or sending the request fails.
 */
int _tmain()
{
    char szURL[256] = {0};       // URL as typed by the user (or DEFAULT_URL)
    char szHostName[256] = {0};  // host part extracted from szURL
    char szPortName[] = "80";    // service/port handed to getaddrinfo

    if (!WinsockStartup(2, 2))
    {
        _tcprintf(TEXT("初始化Windows Sockets失败!"));
        cin.getline(szURL, 255);
        return -1;
    }

    addrinfo aiHints = {0};
    addrinfo *aiList = NULL;

    aiHints.ai_family = AF_INET;
    aiHints.ai_socktype = SOCK_STREAM;
    aiHints.ai_protocol = IPPROTO_TCP;

    cout << "输入URL:";
    cin.getline(szURL, 255);

    if (strcmp(szURL, "") == 0)
    {
        strcpy(szURL, DEFAULT_URL);
        cout << DEFAULT_URL << endl;
    }

    ParseTheURL(szURL, szHostName);

    // Request target in origin-form: the part of the URL from the first '/'
    // after the host, or "/" when the URL names only a host. Sending the raw
    // input would produce an invalid request line for scheme-less input such
    // as "www.example.com/page".
    const char *pszPath = szURL;
    if (strncmp(pszPath, "http://", 7) == 0)
    {
        pszPath += 7;
    }
    pszPath = strchr(pszPath, '/');
    if (pszPath == NULL)
    {
        pszPath = "/";
    }

    // Build the request line now: the pause-before-exit reads below reuse
    // szURL as scratch space, and pszPath may point into it.
    // Fits: at most 255 path bytes plus 15 bytes of fixed text.
    char requestData[512] = {0};
    sprintf(requestData, "GET %s HTTP/1.1\r\n", pszPath);

    // Pass the hints prepared above so only IPv4/TCP results come back.
    if (getaddrinfo(szHostName, szPortName, &aiHints, &aiList) != 0)
    {
        _tcprintf_s(TEXT("getaddrinfo失败:%d"), WSAGetLastError());
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Try each resolved address in turn until one connects.
    SOCKET s = INVALID_SOCKET;
    for (addrinfo *aiPtr = aiList; aiPtr != NULL; aiPtr = aiPtr->ai_next)
    {
        // Use the current candidate's parameters, not the list head's.
        s = socket(aiPtr->ai_family, aiPtr->ai_socktype, aiPtr->ai_protocol);
        if (s == INVALID_SOCKET)
        {
            _tcprintf_s(TEXT("socket创建失败:%d"), WSAGetLastError());
            continue;
        }

        if (connect(s, aiPtr->ai_addr, (int)aiPtr->ai_addrlen) == SOCKET_ERROR)
        {
            // Capture the error before closesocket() can overwrite it.
            int iConnectError = WSAGetLastError();
            closesocket(s);
            s = INVALID_SOCKET;
            _tcprintf_s(TEXT("connect失败:%d"), iConnectError);
            continue;
        }
        break;
    }

    freeaddrinfo(aiList);

    if (s == INVALID_SOCKET)
    {
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Send the request line, then the headers, then exactly one empty line
    // to terminate the header section. Stop at the first failed send.
    BOOL bSent = (SendData(s, requestData) != SOCKET_ERROR);
    if (bSent)
    {
        // Fits: at most 255 host bytes plus 7 bytes of fixed text.
        sprintf(requestData, "Host:%s\r\n", szHostName);
        bSent = (SendData(s, requestData) != SOCKET_ERROR)
             && (SendData(s, "Accept: */*\r\n") != SOCKET_ERROR)
             && (SendData(s, "User-Agent: Mozilla/4.0(compatible; MSIE 5.00; Windows NT)\r\n") != SOCKET_ERROR)
             && (SendData(s, "Connection:Close\r\n") != SOCKET_ERROR)
             && (SendData(s, "\r\n") != SOCKET_ERROR);
    }
    if (!bSent)
    {
        closesocket(s);
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    char buffer[1024] = {0};
    int l = 0;
    int chars = 0;       // characters seen on the current header line
    BOOL done = FALSE;

    // Print the response headers one byte at a time; an empty line (a '\n'
    // reached with no characters on the line) marks the end of the headers.
    while (!done)
    {
        l = recv(s, buffer, 1, 0);
        if (l <= 0)
        {
            // Connection closed or failed: nothing new in buffer to process.
            break;
        }
        switch (*buffer)
        {
        case '\r':
            break;
        case '\n':
            if (chars == 0)
            {
                done = TRUE;
            }
            chars = 0;   // start counting a new line
            break;
        default:
            ++chars;
            break;
        }
        printf("%c", *buffer);
    }

    // Receive and print the body until the peer closes the connection.
    // The data is written verbatim; it must never be used as a printf
    // format string because it comes from the network.
    int sum = 0;
    while ((l = recv(s, buffer, (int)sizeof(buffer), 0)) > 0)
    {
        sum += l;
        fwrite(buffer, 1, (size_t)l, stdout);
    }

    // Per the original author's observation, this count matches the
    // Content-Length response header, so it can be used to check that the
    // body was received completely.
    printf("\n\n大小 = %d字节\n", sum);

    closesocket(s);
    WSACleanup();

    cin.getline(szURL, 255);
    return 0;
}