C/C++实现简单爬虫(基于libcurl、gumbo-parser)

初步的计划是先给出关键的实现步骤,细节和排版后期会慢慢调整


首先介绍一下libcurl和gumbo-parser

  • libcurl是一个免费易用的客户端URL传输库,支持DICT, FILE, FTP, FTPS, Gopher, HTTP, HTTPS, IMAP, IMAPS, LDAP, LDAPS, POP3, POP3S, RTMP, RTSP, SCP, SFTP, SMTP, SMTPS, Telnet and TFTP协议,支持SSL证书,支持HTTP POST, HTTP PUT, FTP上传,支持基于表单的上传,支持跨平台,支持...,反正支持很多,很强大就是了!这是官方网址,感兴趣的同学可以查看(https://curl.haxx.se/libcurl/)。
  • gumbo-parser是谷歌开源的一个HTML5解析库,github地址(https://github.com/google/gumbo-parser)。

因此我们将使用libcurl获取网页和下载,使用gumbo-parser来解析网页获取感兴趣内容,全部基于Windows平台(我的环境为Windows10专业版+Visual Studio 2015)。

编译libcurl和gumbo-parser

  • libcurl参考另一篇文章 https://www.jianshu.com/p/181c96e5156d
  • gumbo-parser从github下载源代码,源代码中附带有Visual Studio的工程文件,直接打开编译即可。
    Visual Studio工程文件

编码粗糙,还请谅解-_-''(码云项目地址:https://gitee.com/liarmaiq/SpiderCxx)

// SpiderCxx.cpp
// 获取一篇文章(https://www.jianshu.com/p/9d7e83d16bd7)中的图片

#include "stdafx.h"

// C++ standard library
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <list>
#include <string>
#include <thread>
#include <vector>

#include "gumbo.h"
#define CURL_STATICLIB
#include <curl/curl.h>

#pragma comment(lib,"ws2_32.lib")
#pragma comment(lib,"wldap32.lib")

#ifdef _DEBUG
#pragma comment(lib,"..\\lib\\win32_debug\\libcurld.lib")
#pragma comment(lib,"..\\lib\\win32_debug\\libcrypto.lib")
#pragma comment(lib,"..\\lib\\win32_debug\\libssl.lib")
#pragma comment(lib,"..\\lib\\win32_debug\\gumbo.lib")
#else
#pragma comment(lib,"..\\lib\\win32_release\\libcurl.lib")
#pragma comment(lib,"..\\lib\\win32_release\\libcrypto.lib")
#pragma comment(lib,"..\\lib\\win32_release\\libssl.lib")
#pragma comment(lib,"..\\lib\\win32_release\\gumbo.lib")
#endif

// 递归HTML网页的节点,获取文章中图片的连接
void search_for_links(GumboNode* node, std::list &urls)
{
    if (node->type != GUMBO_NODE_ELEMENT) 
    {
        return;
    }
    GumboAttribute* href;
    if (node->v.element.tag == GUMBO_TAG_IMG &&
        (href = gumbo_get_attribute(&node->v.element.attributes, "data-original-src"))) 
    {
        std::cout << href->value << std::endl;
        urls.push_back(href->value);
    }

    GumboVector* children = &node->v.element.children;
    for (unsigned int i = 0; i < children->length; ++i) 
    {
        search_for_links(static_cast(children->data[i]), urls);
    }
}

// curl CURLOPT_WRITEFUNCTION callback used when fetching a page:
// appends the received chunk to the std::string passed via CURLOPT_WRITEDATA.
//   ptr/size/nmemb: curl's buffer and its size (size * nmemb bytes)
//   stream: user pointer — must be a std::string*
// Returns the number of bytes consumed (anything else aborts the transfer).
// Improvement: one bulk append instead of a byte-by-byte loop (the loop was
// O(n) calls into std::string per chunk for no benefit).
static size_t write_html(void *ptr, size_t size, size_t nmemb, void *stream)
{
    std::string* strHtml = (std::string*)stream;
    const size_t total = size * nmemb;
    strHtml->append((const char*)ptr, total);
    return total;
}

// 获取图片的curl CURLOPT_WRITEFUNCTION 回调函数
size_t write_pic(void *ptr, size_t size, size_t nmemb, void *stream)
{
    return fwrite(ptr, size, nmemb, (FILE *)stream);
}

// 发起curl请求
void request(std::string url, void* write_func, void* container)
{
    CURL *curl_handle;
    //struct myprogress prog;

    curl_global_init(CURL_GLOBAL_ALL);

    /* init the curl session */
    curl_handle = curl_easy_init();

    //prog.lastruntime = 0;
    //prog.curl = curl_handle;

    /* set URL to get here */
    curl_easy_setopt(curl_handle, CURLOPT_URL, url.c_str());

    /* Switch on full protocol/debug output while testing */
    //curl_easy_setopt(curl_handle, CURLOPT_VERBOSE, 1L);

    /* disable progress meter, set to 0L to enable and 1L to disable debug output */
    curl_easy_setopt(curl_handle, CURLOPT_NOPROGRESS, 1L);

    /* send all data to this function  */
    curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, write_func);

    // 不要检查证书
    curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYPEER, 0);

    //curl_easy_setopt(curl_handle, CURLOPT_XFERINFOFUNCTION, xferinfo);
    /* pass the struct pointer into the xferinfo function, note that this is
    an alias to CURLOPT_PROGRESSDATA */
    //curl_easy_setopt(curl_handle, CURLOPT_XFERINFODATA, &prog);

    /* write the page body to this file handle */
    curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, container);

    /* get it! */
    CURLcode errorCode = curl_easy_perform(curl_handle);

    /* cleanup curl stuff */
    curl_easy_cleanup(curl_handle);
}

// Fetch the page at `url` and accumulate the raw HTML body into `html`
// (thin wrapper: wires the write_html string-appending callback into request).
void get_html(std::string& html, std::string url)
{
    request(url, write_html, &html);
}

// Download one image into directory `path`.
//   path: target directory (with or without a trailing slash)
//   url:  protocol-relative image URL as scraped (e.g. "//upload-images...");
//         an "https:" prefix is prepended — adjust to match your actual URLs.
// The file name is the URL's last path segment with a forced ".jpg" suffix.
void get_pic(std::string path, std::string url)
{
    // Prepend the scheme; tune this to the concrete URLs you scraped.
    url = "https:" + url;

    // File name = last path segment + ".jpg"; tune as needed.
    std::string picName = url.substr(url.rfind('/') + 1);
    picName += ".jpg";

    // Join directory and file name. Guard path.empty(): std::string::back()
    // on an empty string is undefined behavior (bug in the original).
    std::string picPath;
    if (path.empty())
        picPath = picName;
    else if (path.back() == '/' || path.back() == '\\')
        picPath = path + picName;
    else
        picPath = path + "/" + picName;

    // Create the file and stream the image into it via write_pic.
    FILE *pagefile = nullptr;
    fopen_s(&pagefile, picPath.c_str(), "wb");
    if (pagefile)
    {
        request(url, write_pic, pagefile);
        fclose(pagefile);
    }
}


int main(int argc, const char** argv)
{
    // 获取网页,一篇文章
    std::string strUrl = "https://www.jianshu.com/p/9d7e83d16bd7";
    std::string strHtml;
    get_html(strHtml, strUrl);

    // 解析网页
    const char* input = strHtml.data();
    int input_length = strHtml.length();
    GumboOutput* output = gumbo_parse_with_options(&kGumboDefaultOptions, input, input_length);

    // 获取图片url
    std::list urls;
    search_for_links(output->root, urls);
    gumbo_destroy_output(&kGumboDefaultOptions, output);

    // 多线程获取图片
    std::thread *ts = new std::thread[urls.size()];
    int index = 0;
    std::list::iterator iter = urls.begin();
    for (; iter != urls.end(); iter++)
    {
        ts[index++] = std::thread(get_pic, "../../pics", *iter);
    }

    for (size_t i = 0; i < urls.size(); i++)
    {
        ts[i].join();
    }

    delete[] ts;

    return 0;
}

你可能感兴趣的:(C/C++实现简单爬虫(基于libcurl、gumbo-parser))