是的,你没有听错,就是用 C++(或者说 C 语言)写爬虫。
其实不难,虽然没有Python写起来那么简单。但是也不是那么复杂啦,毕竟好多大佬都写了那么多库,我们只要会用大佬写的库就行。
网址:https://acm.sjtu.edu.cn/OnlineJudge/status
我们要爬取的,就是这个页面上评测状态表格里的所有内容。
代码如下:
#include <cstdlib>
#include <iostream>
#include <string>

#include "gumbo/Document.h"
#include "gumbo/Node.h"
#include "MyStringFormat.h"
#include "curl/curl.h"
using namespace std;
// Value HttpRequest passes to CURLOPT_REFERER for every request it makes.
#define URL_REFERER "https://acm.sjtu.edu.cn/OnlineJudge/"
// Parses the HTML text in `page`, selects every node matching the CSS
// selector "#status tr", and prints them to stdout: one output line per
// matched node, containing the text of each of its children (converted with
// MyStringFormat::UTF_82ASCII) followed by a single space.
void printFunc(string page)
{
    CDocument document;
    document.parse(page.c_str());

    CSelection rows = document.find("#status tr");
    for (int rowIdx = 0; rowIdx < rows.nodeNum(); ++rowIdx)
    {
        // Fetch the matched node once per output line instead of on every
        // inner-loop step.
        CNode row = rows.nodeAt(rowIdx);
        for (int cellIdx = 0; cellIdx < row.childNum(); ++cellIdx)
        {
            CNode cell = row.childAt(cellIdx);
            cout << MyStringFormat::UTF_82ASCII(cell.text()).c_str() << " ";
        }
        cout << endl;
    }
}
// libcurl CURLOPT_WRITEFUNCTION callback: appends each received chunk to the
// std::string that was registered through CURLOPT_WRITEDATA.
//
//   buffer - start of the received data (not NUL-terminated).
//   size   - size in bytes of one element.
//   nmemb  - number of elements; the chunk is size * nmemb bytes long.
//   lpVoid - the CURLOPT_WRITEDATA pointer; must point to a std::string.
//
// Returns the number of bytes consumed (size * nmemb) on success. libcurl
// aborts the transfer when the returned value differs from size * nmemb, so
// (size_t)-1 is returned when either pointer is NULL.
static size_t OnWriteData(void* buffer, size_t size, size_t nmemb, void* lpVoid)
{
    std::string* str = static_cast<std::string*>(lpVoid);
    if (NULL == str || NULL == buffer)
    {
        return static_cast<size_t>(-1);
    }

    // The callback contract is expressed in bytes, not in elements, so report
    // size * nmemb rather than nmemb alone.
    const size_t byteCount = size * nmemb;
    str->append(static_cast<const char*>(buffer), byteCount);
    return byteCount;
}
bool HttpRequest(const char* url,
string& strResponse,
bool get/* = true*/,
const char* headers/* = NULL*/,
const char* postdata/* = NULL*/,
bool bReserveHeaders/* = false*/,
int timeout/* = 10*/)
{
CURLcode res;
CURL* curl = curl_easy_init();
if (NULL == curl)
{
return false;
}
curl_easy_setopt(curl, CURLOPT_URL, url);
//响应结果中保留头部信息
if (bReserveHeaders)
curl_easy_setopt(curl, CURLOPT_HEADER, 1);
curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&strResponse);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
//设定为不验证证书和HOST
//curl_easy_setopt(curl, CURLOPT_PROXY, "127.0.0.1:8888");//设置代理
//curl_easy_setopt(curl, CURLOPT_PROXYPORT, 9999); //代理服务器端口
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false);
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false);
//设置超时时间
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, timeout);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
curl_easy_setopt(curl, CURLOPT_REFERER, URL_REFERER);
curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
//不设置接收的编码格式或者设置为空,libcurl会自动解压压缩的格式,如gzip
//curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip, deflate, br");
//设置hostConnection: Keep-Alive
struct curl_slist *chunk = NULL;
chunk = curl_slist_append(chunk, "Host: acm.sjtu.edu.cn");
chunk = curl_slist_append(chunk, "Connection: Keep-Alive");
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
//添加自定义头信息
if (headers != NULL)
{
chunk = curl_slist_append(chunk, headers);
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
}
if (!get && postdata != NULL)
{
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postdata);
}
res = curl_easy_perform(curl);
bool bError = false;
if (res == CURLE_OK)
{
int code;
res = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &code);
if (code != 200 && code != 302)
{
bError = true;
}
}
else
{
bError = true;
}
curl_easy_cleanup(curl);
return !bError;
}
// Entry point: downloads the status page and prints the rows selected by
// printFunc. Returns 0 when the page was fetched, 1 when the request failed.
int main(int argc, char * argv[])
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    std::string response;
    const bool fetched = HttpRequest("https://acm.sjtu.edu.cn/OnlineJudge/status", response, true, NULL, NULL, false, 10);
    if (fetched)
    {
        printFunc(response);
    }
    else
    {
        // Do not parse a missing/partial response as if it were the page.
        std::cerr << "Failed to fetch https://acm.sjtu.edu.cn/OnlineJudge/status" << std::endl;
    }

    // Runs the shell's "pause" command so the console window stays open
    // (available in the Windows command interpreter).
    system("pause");
    return fetched ? 0 : 1;
}
我知道,只贴出这些代码大家也没法直接运行(还需要 libcurl、gumbo 等依赖库和工程配置),所以我把完整的工程文件也发出来。为了不被说成骗积分,所有文件都通过百度云链接分享。
链接:https://pan.baidu.com/s/1jBZ-6tT-4ne0uTMw4jFvKA
提取码:pmg6
喜欢的欢迎关注我的公众号