C语言多线程爬虫代码示例

使用C语言编写多线程爬虫,可以同时处理多条数据,从而提高爬虫的并发度和效率。不过,编写多线程爬虫时仍需注意线程安全性和错误处理机制,并根据系统资源和目标网站的特点调整线程数、优化并发策略,以提高程序的效率和稳定性。

C语言多线程爬虫代码示例_第1张图片

以下是一个使用C语言多线程编写的简单爬虫示例,实现了并发爬取多个页面的功能:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <curl/curl.h>

#define MAX_URL_NUM 10      // Maximum number of URLs
#define MAX_URL_LENGTH 256  // Maximum URL length, including the terminating NUL

// URL list: the pages to crawl; main() starts one thread per entry.
const char *url_list[MAX_URL_NUM] = {
    "https://www.example.com/page1.html",
    "https://www.example.com/page2.html",
    "https://www.example.com/page3.html",
    "https://www.example.com/page4.html",
    "https://www.example.com/page5.html",
    "https://www.example.com/page6.html",
    "https://www.example.com/page7.html",
    "https://www.example.com/page8.html",
    "https://www.example.com/page9.html",
    "https://www.example.com/page10.html"
};

// Thread argument structure: the per-thread input that main() fills in and
// hands to CrawlThreadFunc through pthread_create().
typedef struct {
    char url[MAX_URL_LENGTH];  // URL this thread downloads (copied from url_list)
    int thread_id;  // index of the thread; used to label its log lines
} ThreadArgs;

// Data buffer structure: a growable byte buffer that accumulates a response body.
typedef struct {
    char *data;   // heap buffer; WriteMemoryCallback keeps it NUL-terminated
    size_t size;  // number of payload bytes stored, not counting the terminator
} MemoryStruct;

/*
 * Callback that receives page response data and appends it to a MemoryStruct.
 *
 * contents  pointer to the newly received bytes
 * size      size of one element
 * nmemb     number of elements; size * nmemb bytes are appended
 * userp     pointer to the MemoryStruct being filled
 *
 * Returns the number of bytes appended, or 0 if the buffer could not be grown.
 * On failure the existing buffer is left untouched, so the owner can still
 * free it.
 */
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    size_t realsize = size * nmemb;
    MemoryStruct *mem = (MemoryStruct *) userp;

    /* Grow through a temporary so a failed realloc() does not overwrite (and
     * thereby leak) the only pointer to the data collected so far. */
    char *grown = realloc(mem->data, mem->size + realsize + 1);
    if (grown == NULL) {
        /* out of memory! */
        printf("not enough memory (realloc returned NULL)\n");
        return 0;
    }
    mem->data = grown;

    /* Skip the copy for an empty delivery; memcpy() requires valid pointers
     * even when the length is zero. */
    if (realsize > 0) {
        memcpy(&(mem->data[mem->size]), contents, realsize);
        mem->size += realsize;
    }
    mem->data[mem->size] = 0;

    return realsize;
}

// Thread function
static void *CrawlThreadFunc(void *args) {
    ThreadArgs *targs = (ThreadArgs *) args;
    char *url = targs->url;
    int thread_id = targs->thread_id;
    CURL *curl;
    CURLcode res;
    MemoryStruct chunk;

    printf("Thread %d: Downloading %s\n", thread_id, url);

    chunk.data = malloc(1);  /* will be grown as needed by the realloc above */
    chunk.size = 0;           /* no data at this point */

    curl = curl_easy_init();
    if (curl) {
        curl_easy_setopt(curl, CURLOPT_URL, url);
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
        res = curl_easy_perform(curl);

        if (res != CURLE_OK) {
            printf("Thread %d: Download failed: %s\n", thread_id, curl_easy_strerror(res));
        } else {
            printf("Thread %d: Download succeeded, fetched %lu bytes of data\n", thread_id, (unsigned long)chunk.size);
        }

        curl_easy_cleanup(curl);
    }
    
    free(chunk.data);
    pthread_exit(NULL);
}

int main(int argc, char **argv) {
    pthread_t threads[MAX_URL_NUM];
    int rc, i;
    ThreadArgs targs[MAX_URL_NUM];

    // 初始化库
    curl_global_init(CURL_GLOBAL_ALL);

    // 创建子线程并执行爬虫
    for (i = 0; i < MAX_URL_NUM; i++) {
        strncpy(targs[i].url, url_list[i], MAX_URL_LENGTH-1);
        targs[i].thread_id = i;
        rc = pthread_create(&threads[i], NULL, CrawlThreadFunc, (void *) &targs[i]);
        if (rc) {
            printf("Error: return code from pthread_create() is %d\n", rc);
            exit(EXIT_FAILURE);
        }
    }

    // 等待子线程完成
    for (i = 0; i < MAX_URL_NUM; i++) {
        rc = pthread_join(threads[i], NULL);
        if (rc) {
            printf("Error: return code from pthread_join() is %d\n", rc);
            exit(EXIT_FAILURE);
        }
    }

    // 清理库并退出程序
    curl_global_cleanup();
    printf("All downloads complete!\n");
    exit(EXIT_SUCCESS);
}

在以上示例中,我们使用了libcurl库来发起HTTP请求和接收响应。为了方便演示,我们直接预定义了10个URL,并启动10个线程并行执行爬虫任务。实际项目中,可以通过读取配置文件或从命令行参数中获取URL列表,并根据系统资源、网络质量等因素自适应地调整并发度。

在实现过程中,需要注意线程安全性和错误处理机制,避免出现死锁、内存泄漏等问题。

你可能感兴趣的:(c语言,爬虫,c++,服务器,开发语言)