使用C语言编写多线程爬虫可以同时处理多条数据,从而提高爬虫的并发度和效率。不过,在编写多线程爬虫时仍需要注意线程安全性和错误处理机制,并根据系统资源和目标网站的特点调整线程数、优化并发策略,以提高程序的效率和稳定性。
以下是一个使用C语言多线程编写的简单爬虫示例,实现了并发爬取多个页面的功能:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <curl/curl.h>
#define MAX_URL_NUM 10     // Maximum number of URLs (main starts one thread per entry)
#define MAX_URL_LENGTH 256 // Size in bytes of the per-thread URL buffer

// List of URLs to crawl. Every entry must be a complete absolute URL
// (scheme and host included) because it is handed to libcurl unchanged.
const char *url_list[MAX_URL_NUM] = {
    "https://www.example.com/page1.html",
    "https://www.example.com/page2.html",
    "https://www.example.com/page3.html",
    "https://www.example.com/page4.html",
    "https://www.example.com/page5.html", // was truncated to ".example.com/page5.html"
    "https://www.example.com/page6.html",
    "https://www.example.com/page7.html",
    "https://www.example.com/page8.html",
    "https://www.example.com/page9.html",
    "https://www.example.com/page10.html"
};
// Per-thread argument bundle. main fills one of these for each URL and passes
// its address to pthread_create as the argument of the thread function.
typedef struct {
char url[MAX_URL_LENGTH]; // URL this thread downloads, stored inline in the struct
int thread_id; // index of the thread, used to label its log output
} ThreadArgs;
// Growable in-memory buffer that accumulates a downloaded response body.
typedef struct {
    char *data;  // heap buffer with the bytes received so far; NUL-terminated after each append
    size_t size; // number of payload bytes stored in data (terminating NUL not counted)
} MemoryStruct;

/*
 * Write callback that appends one chunk of response data to a MemoryStruct.
 *
 * contents: the bytes just received (not NUL-terminated)
 * size, nmemb: their product is the number of bytes in contents
 * userp: pointer to the MemoryStruct being filled
 *
 * Returns the number of bytes stored (size * nmemb), or 0 if the buffer
 * could not be grown. On failure the existing buffer is left intact and
 * still owned by the caller, so the caller can free it as usual.
 */
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    size_t realsize = size * nmemb;
    MemoryStruct *mem = (MemoryStruct *) userp;

    /* Grow via a temporary: assigning realloc's result straight to
     * mem->data would lose (and leak) the old buffer on failure. */
    char *grown = realloc(mem->data, mem->size + realsize + 1);
    if (grown == NULL) {
        /* out of memory! */
        printf("not enough memory (realloc returned NULL)\n");
        return 0;
    }
    mem->data = grown;

    memcpy(&mem->data[mem->size], contents, realsize);
    mem->size += realsize;
    mem->data[mem->size] = 0; /* keep the buffer usable as a C string */
    return realsize;
}
// Thread function
static void *CrawlThreadFunc(void *args) {
ThreadArgs *targs = (ThreadArgs *) args;
char *url = targs->url;
int thread_id = targs->thread_id;
CURL *curl;
CURLcode res;
MemoryStruct chunk;
printf("Thread %d: Downloading %s\n", thread_id, url);
chunk.data = malloc(1); /* will be grown as needed by the realloc above */
chunk.size = 0; /* no data at this point */
curl = curl_easy_init();
if (curl) {
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
res = curl_easy_perform(curl);
if (res != CURLE_OK) {
printf("Thread %d: Download failed: %s\n", thread_id, curl_easy_strerror(res));
} else {
printf("Thread %d: Download succeeded, fetched %lu bytes of data\n", thread_id, (unsigned long)chunk.size);
}
curl_easy_cleanup(curl);
}
free(chunk.data);
pthread_exit(NULL);
}
int main(int argc, char **argv) {
pthread_t threads[MAX_URL_NUM];
int rc, i;
ThreadArgs targs[MAX_URL_NUM];
// 初始化库
curl_global_init(CURL_GLOBAL_ALL);
// 创建子线程并执行爬虫
for (i = 0; i < MAX_URL_NUM; i++) {
strncpy(targs[i].url, url_list[i], MAX_URL_LENGTH-1);
targs[i].thread_id = i;
rc = pthread_create(&threads[i], NULL, CrawlThreadFunc, (void *) &targs[i]);
if (rc) {
printf("Error: return code from pthread_create() is %d\n", rc);
exit(EXIT_FAILURE);
}
}
// 等待子线程完成
for (i = 0; i < MAX_URL_NUM; i++) {
rc = pthread_join(threads[i], NULL);
if (rc) {
printf("Error: return code from pthread_join() is %d\n", rc);
exit(EXIT_FAILURE);
}
}
// 清理库并退出程序
curl_global_cleanup();
printf("All downloads complete!\n");
exit(EXIT_SUCCESS);
}
在以上示例中,我们使用了libcurl库来发起HTTP请求和接收响应。为了方便演示,我们直接预定义了10个URL,并启动10个线程并行执行爬虫任务。实际项目中,可以通过读取配置文件或从命令行参数中获取URL列表,并根据系统资源、网络质量等因素自适应地调整并发度。
在实现过程中,需要注意线程安全性和错误处理机制,避免出现死锁、内存泄漏等问题。