[转]利用Winsock模拟HTTP的GET请求
网络爬虫需要从指定的URL通过HTTP协议来获得HTML文件信息,以此从一个URL爬到另一个URL。在Windows平台上,这往往通过WinINet接口实现。
但是,如果对HTTP协议熟悉的话,也可以通过Winsock接口实现。代码如下。
1
// Silence MSVC warning C4996 (deprecation notices for CRT calls such as strcpy/sprintf).
#pragma warning(disable: 4996)

// URL used when the user enters an empty line at the prompt.
#define DEFAULT_URL "http://www.google.com"
4
5 BOOL WinsockStartup(BYTE highVer, BYTE lowVer)
6 {
7 WSADATA wsaData;
8 return WSAStartup(MAKEWORD(highVer, lowVer), & wsaData) == 0 ;
9 }
10
11 int SendData(SOCKET s, char * data)
12 {
13 return send(s, data, strlen(data), 0 );
14 }
15
/*
 * Extracts the host name from a URL.
 *
 * A leading "http://" prefix is skipped when present. The host is everything
 * from that point up to (not including) the first '/', or the rest of the
 * string when there is no '/'.
 *
 * pszURL      - NUL-terminated URL; it is only read.
 * pszHostName - receives the NUL-terminated host name. The caller must provide
 *               room for at least strlen(pszURL) + 1 bytes.
 *
 * Does nothing when either pointer is NULL.
 */
void ParseTheURL(char *pszURL, char *pszHostName)
{
    static const char szScheme[] = "http://";
    const size_t cchScheme = sizeof szScheme - 1;

    if (pszURL == NULL || pszHostName == NULL)
    {
        return;
    }

    const char *pHostStart = pszURL;
    if (strncmp(pszURL, szScheme, cchScheme) == 0)
    {
        pHostStart += cchScheme;
    }

    const char *pHostEnd = strchr(pHostStart, '/');
    const size_t cchHost = (pHostEnd != NULL) ? (size_t)(pHostEnd - pHostStart)
                                              : strlen(pHostStart);

    memcpy(pszHostName, pHostStart, cchHost);
    /* Terminate explicitly so the result does not depend on a pre-zeroed buffer. */
    pszHostName[cchHost] = '\0';
}
39
/*
 * Console entry point.
 *
 * Reads a URL from standard input (DEFAULT_URL when the line is empty),
 * resolves its host for TCP port 80, connects, sends an HTTP/1.1 GET request
 * and prints the response: first the header section byte by byte, then the
 * body, then the number of body bytes received.
 *
 * Returns 0 after a completed exchange and -1 when Winsock start-up, name
 * resolution or connecting fails. Every exit path waits for one more input
 * line before returning.
 */
int _tmain()
{
    char szURL[256] = { 0 };        // URL entered by the user
    char szHostName[256] = { 0 };   // host part of the URL
    char szPortName[] = "80";       // service/port handed to getaddrinfo

    if (!WinsockStartup(2, 2))
    {
        _tcprintf(TEXT(" 初始化Windows Sockets失败! "));
        cin.getline(szURL, 255);
        return -1;
    }

    cout << " 输入URL: ";
    cin.getline(szURL, 255);

    if (strcmp(szURL, "") == 0)
    {
        strcpy(szURL, DEFAULT_URL);
        cout << DEFAULT_URL << endl;
    }

    ParseTheURL(szURL, szHostName);

    // Restrict resolution to IPv4 TCP endpoints; the hints must actually be
    // passed to getaddrinfo to take effect.
    addrinfo aiHints = { 0 };
    addrinfo *aiList = NULL;
    aiHints.ai_family = AF_INET;
    aiHints.ai_socktype = SOCK_STREAM;
    aiHints.ai_protocol = IPPROTO_TCP;

    if (getaddrinfo(szHostName, szPortName, &aiHints, &aiList) != 0)
    {
        _tcprintf_s(TEXT(" getaddrinfo失败:%d "), WSAGetLastError());
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Try each resolved address in turn until one accepts the connection.
    SOCKET s = INVALID_SOCKET;
    for (addrinfo *aiPtr = aiList; aiPtr != NULL; aiPtr = aiPtr->ai_next)
    {
        // Create the socket from the entry being tried, not from the list head.
        s = socket(aiPtr->ai_family, aiPtr->ai_socktype, aiPtr->ai_protocol);
        if (s == INVALID_SOCKET)
        {
            _tcprintf_s(TEXT(" socket创建失败:%d "), WSAGetLastError());
            continue;
        }

        if (connect(s, aiPtr->ai_addr, (int)aiPtr->ai_addrlen) == SOCKET_ERROR)
        {
            // Capture the error before closesocket() can replace it.
            const int iConnectError = WSAGetLastError();
            closesocket(s);
            s = INVALID_SOCKET;
            _tcprintf_s(TEXT(" connect失败:%d "), iConnectError);
            continue;
        }
        break;
    }

    freeaddrinfo(aiList);

    if (s == INVALID_SOCKET)
    {
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Request target in origin-form: the part of the URL after the host,
    // or "/" when the URL has no path.
    const char *pszAfterScheme = (strncmp(szURL, "http://", 7) == 0) ? szURL + 7 : szURL;
    const char *pszPath = strchr(pszAfterScheme, '/');
    if (pszPath == NULL)
    {
        pszPath = "/";
    }

    // 512 bytes is enough: the path and the host are each shorter than 256 bytes.
    char requestData[512] = { 0 };
    sprintf(requestData, "GET %s HTTP/1.1\r\n", pszPath);
    SendData(s, requestData);
    sprintf(requestData, "Host:%s\r\n", szHostName);
    SendData(s, requestData);
    SendData(s, "Accept: */*\r\n");
    SendData(s, "User-Agent: Mozilla/4.0(compatible; MSIE 5.00; Windows NT)\r\n");
    SendData(s, "Connection:Close\r\n");
    SendData(s, "\r\n");    // the empty line that ends the header section

    char buffer[1024] = { 0 };
    int l = 0;
    int chars = 0;          // non-CR/LF characters seen on the current header line
    BOOL done = FALSE;

    // Print the HTTP response headers one byte at a time; an empty line ends them.
    while (!done)
    {
        l = recv(s, buffer, 1, 0);
        if (l <= 0)
        {
            break;          // connection closed or failed: buffer holds no new byte
        }
        switch (*buffer)
        {
        case '\r':
            break;
        case '\n':
            if (chars == 0)
            {
                done = TRUE;
            }
            chars = 0;      // a new line starts
            break;
        default:
            ++chars;
            break;
        }
        printf("%c", *buffer);
    }

    // Receive and print the body until the peer closes the connection.
    int sum = 0;
    for (;;)
    {
        l = recv(s, buffer, sizeof(buffer) - 1, 0);
        if (l <= 0)
        {
            break;
        }
        sum += l;
        buffer[l] = 0;
        // Received data must never be used as the format string itself.
        printf("%s", buffer);
    }

    // The original author observed that this body size matches the
    // Content-Length response header, so it can be used to check that the
    // whole body arrived.
    // NOTE(review): presumably only holds when the server sends Content-Length
    // (not for chunked responses) - confirm.
    printf(" \n\n大小 = %d字节\n ", sum);

    closesocket(s);
    WSACleanup();

    cin.getline(szURL, 255);
    return 0;
}
2
#define DEFAULT_URL "http://www.google.com"  // URL used when the user enters an empty line
4
5 BOOL WinsockStartup(BYTE highVer, BYTE lowVer)
6 {
7 WSADATA wsaData;
8 return WSAStartup(MAKEWORD(highVer, lowVer), & wsaData) == 0 ;
9 }
10
11 int SendData(SOCKET s, char * data)
12 {
13 return send(s, data, strlen(data), 0 );
14 }
15
/*
 * Extracts the host name from a URL.
 *
 * A leading "http://" prefix is skipped when present. The host is everything
 * from that point up to (not including) the first '/', or the rest of the
 * string when there is no '/'.
 *
 * pszURL      - NUL-terminated URL; it is only read.
 * pszHostName - receives the NUL-terminated host name. The caller must provide
 *               room for at least strlen(pszURL) + 1 bytes.
 *
 * Does nothing when either pointer is NULL.
 */
void ParseTheURL(char *pszURL, char *pszHostName)
{
    static const char szScheme[] = "http://";
    const size_t cchScheme = sizeof szScheme - 1;

    if (pszURL == NULL || pszHostName == NULL)
    {
        return;
    }

    const char *pHostStart = pszURL;
    if (strncmp(pszURL, szScheme, cchScheme) == 0)
    {
        pHostStart += cchScheme;
    }

    const char *pHostEnd = strchr(pHostStart, '/');
    const size_t cchHost = (pHostEnd != NULL) ? (size_t)(pHostEnd - pHostStart)
                                              : strlen(pHostStart);

    memcpy(pszHostName, pHostStart, cchHost);
    /* Terminate explicitly so the result does not depend on a pre-zeroed buffer. */
    pszHostName[cchHost] = '\0';
}
39
/*
 * Console entry point.
 *
 * Reads a URL from standard input (DEFAULT_URL when the line is empty),
 * resolves its host for TCP port 80, connects, sends an HTTP/1.1 GET request
 * and prints the response: first the header section byte by byte, then the
 * body, then the number of body bytes received.
 *
 * Returns 0 after a completed exchange and -1 when Winsock start-up, name
 * resolution or connecting fails. Every exit path waits for one more input
 * line before returning.
 */
int _tmain()
{
    char szURL[256] = { 0 };        // URL entered by the user
    char szHostName[256] = { 0 };   // host part of the URL
    char szPortName[] = "80";       // service/port handed to getaddrinfo

    if (!WinsockStartup(2, 2))
    {
        _tcprintf(TEXT(" 初始化Windows Sockets失败! "));
        cin.getline(szURL, 255);
        return -1;
    }

    cout << " 输入URL: ";
    cin.getline(szURL, 255);

    if (strcmp(szURL, "") == 0)
    {
        strcpy(szURL, DEFAULT_URL);
        cout << DEFAULT_URL << endl;
    }

    ParseTheURL(szURL, szHostName);

    // Restrict resolution to IPv4 TCP endpoints; the hints must actually be
    // passed to getaddrinfo to take effect.
    addrinfo aiHints = { 0 };
    addrinfo *aiList = NULL;
    aiHints.ai_family = AF_INET;
    aiHints.ai_socktype = SOCK_STREAM;
    aiHints.ai_protocol = IPPROTO_TCP;

    if (getaddrinfo(szHostName, szPortName, &aiHints, &aiList) != 0)
    {
        _tcprintf_s(TEXT(" getaddrinfo失败:%d "), WSAGetLastError());
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Try each resolved address in turn until one accepts the connection.
    SOCKET s = INVALID_SOCKET;
    for (addrinfo *aiPtr = aiList; aiPtr != NULL; aiPtr = aiPtr->ai_next)
    {
        // Create the socket from the entry being tried, not from the list head.
        s = socket(aiPtr->ai_family, aiPtr->ai_socktype, aiPtr->ai_protocol);
        if (s == INVALID_SOCKET)
        {
            _tcprintf_s(TEXT(" socket创建失败:%d "), WSAGetLastError());
            continue;
        }

        if (connect(s, aiPtr->ai_addr, (int)aiPtr->ai_addrlen) == SOCKET_ERROR)
        {
            // Capture the error before closesocket() can replace it.
            const int iConnectError = WSAGetLastError();
            closesocket(s);
            s = INVALID_SOCKET;
            _tcprintf_s(TEXT(" connect失败:%d "), iConnectError);
            continue;
        }
        break;
    }

    freeaddrinfo(aiList);

    if (s == INVALID_SOCKET)
    {
        WSACleanup();
        cin.getline(szURL, 255);
        return -1;
    }

    // Request target in origin-form: the part of the URL after the host,
    // or "/" when the URL has no path.
    const char *pszAfterScheme = (strncmp(szURL, "http://", 7) == 0) ? szURL + 7 : szURL;
    const char *pszPath = strchr(pszAfterScheme, '/');
    if (pszPath == NULL)
    {
        pszPath = "/";
    }

    // 512 bytes is enough: the path and the host are each shorter than 256 bytes.
    char requestData[512] = { 0 };
    sprintf(requestData, "GET %s HTTP/1.1\r\n", pszPath);
    SendData(s, requestData);
    sprintf(requestData, "Host:%s\r\n", szHostName);
    SendData(s, requestData);
    SendData(s, "Accept: */*\r\n");
    SendData(s, "User-Agent: Mozilla/4.0(compatible; MSIE 5.00; Windows NT)\r\n");
    SendData(s, "Connection:Close\r\n");
    SendData(s, "\r\n");    // the empty line that ends the header section

    char buffer[1024] = { 0 };
    int l = 0;
    int chars = 0;          // non-CR/LF characters seen on the current header line
    BOOL done = FALSE;

    // Print the HTTP response headers one byte at a time; an empty line ends them.
    while (!done)
    {
        l = recv(s, buffer, 1, 0);
        if (l <= 0)
        {
            break;          // connection closed or failed: buffer holds no new byte
        }
        switch (*buffer)
        {
        case '\r':
            break;
        case '\n':
            if (chars == 0)
            {
                done = TRUE;
            }
            chars = 0;      // a new line starts
            break;
        default:
            ++chars;
            break;
        }
        printf("%c", *buffer);
    }

    // Receive and print the body until the peer closes the connection.
    int sum = 0;
    for (;;)
    {
        l = recv(s, buffer, sizeof(buffer) - 1, 0);
        if (l <= 0)
        {
            break;
        }
        sum += l;
        buffer[l] = 0;
        // Received data must never be used as the format string itself.
        printf("%s", buffer);
    }

    // The original author observed that this body size matches the
    // Content-Length response header, so it can be used to check that the
    // whole body arrived.
    // NOTE(review): presumably only holds when the server sends Content-Length
    // (not for chunked responses) - confirm.
    printf(" \n\n大小 = %d字节\n ", sum);

    closesocket(s);
    WSACleanup();

    cin.getline(szURL, 255);
    return 0;
}