[Python crawler] Multithreading, queues, saving to CSV, and cleaning poetry data

This post crawls all of the poems from a poetry site, including the author, dynasty, and poem text, with some data cleaning and exception handling along the way, and finally saves everything to a CSV file. The design follows a producer-consumer pattern: one thread produces detail-page URLs, four threads consume those URLs and parse the pages, and one thread writes the parsed rows to disk.
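Before the full spider, here is a minimal, self-contained sketch of the producer-consumer pattern it is built on (the item names, counts, and the sentinel are illustrative only, not part of the original code):

import queue
import threading

q = queue.Queue(maxsize=20)      # bounded queue: put() blocks when full

def producer():
    for i in range(5):
        q.put(f'item-{i}')       # blocks while the queue is full
    q.put(None)                  # sentinel: tells the consumer to stop

def consumer():
    while True:
        item = q.get()           # blocks until an item is available
        if item is None:
            break
        print('consumed', item)

p = threading.Thread(target=producer)
c = threading.Thread(target=consumer)
p.start()
c.start()
p.join()
c.join()

Without further ado, here is the full code.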

import requests
from my_fake_useragent import UserAgent
from lxml import etree
import queue
import threading
import csv

class PoetrySpider:
    def __init__(self):
        self.url = 'http://www.shicimingju.com/chaxun/zuozhe/{}_{}.html'
        self.headers = {"User-Agent": UserAgent().random()}
        self.url_queue = queue.Queue(maxsize=20)
        self.poetry_queue = queue.Queue(maxsize=20)

    # Fetch a page and return the decoded HTML string
    def get_html(self, url):
        return requests.get(url=url,
                            headers=self.headers).content.decode('utf-8')

    # Extract the detail-page (second-level) URLs from a listing page
    def get_second_url(self, html_str):
        html = etree.HTML(html_str)
        second_url_list = html.xpath('//div[@class="www-shadow-card www-main-container"]/h3/a/@href')
        return second_url_list

    # Parse a detail page and extract the fields we want
    def parse_second_html(self, html_str):
        html = etree.HTML(html_str)
        try:
            title = html.xpath('//div[@class="shici-container www-shadow-card"]/h1/text()')[0]
            old_dynasty = html.xpath('//div[@class="shici-container www-shadow-card"]/div[@class="shici-info"]/text()')
            dynasty = self.clean_dynasty(old_dynasty)
            author = html.xpath('//div[@class="shici-container www-shadow-card"]/div[@class="shici-info"]/a/text()')[0]
            old_poetry = html.xpath(
                '//div[@class="shici-container www-shadow-card"]/div[@class="shici-content"]/text()')
            poetry = self.clean_poetry(old_poetry)
        except IndexError:
            # an empty xpath result means the page layout differs; skip it
            print('Malformed page data, skipping...')
        else:
            return (title, dynasty, author, poetry)

    # Clean the dynasty string: the info div's text nodes join to something
    # like '[宋]', so index 1 picks out the dynasty character
    def clean_dynasty(self, dynasty_text):
        return "".join(dynasty_text).strip()[1]

    # Clean the poem body
    def clean_poetry(self, content_list):
        return "".join(content_list).strip()

    # Check the page <title> to rule out 404 and 500 error pages
    def check_net_title(self, html_str):
        html = etree.HTML(html_str)
        title = html.xpath('//title/text()')[0]
        if "404" in title or "500" in title:
            return False
        else:
            return True

    # URL producer: for each author id, walk the listing pages until an
    # error page marks the end
    def product_next_url(self):
        for i in range(38, 326):
            j = 1
            while True:
                url = self.url.format(i, j)
                print(url)
                html_str = self.get_html(url)
                if self.check_net_title(html_str):
                    for second_url in self.get_second_url(html_str):
                        next_url = 'http://www.shicimingju.com' + second_url
                        # enqueue the produced URL
                        self.url_queue.put(next_url)
                    j += 1
                else:
                    break

    # Consume the produced URLs, parse each page, and enqueue the parsed data
    def consumer_url(self):
        while True:
            next_url = self.url_queue.get()
            print(next_url)
            html_str = self.get_html(next_url)
            data = self.parse_second_html(html_str)
            if data:  # skip pages that failed to parse
                self.poetry_queue.put(data)

    # Pull parsed rows off the queue and save them to a CSV file
    def data_storage(self):
        with open('data.csv', 'w+', encoding='utf-8', newline='') as csv_f:
            writer = csv.writer(csv_f)
            while True:
                try:
                    poetry = self.poetry_queue.get()
                    writer.writerow(poetry)
                    print('Saved:', poetry)
                except Exception:
                    print('Error while saving')

    # Main function
    def main(self):
        # One thread produces the next URLs
        p_url = threading.Thread(target=self.product_next_url)
        # Four threads to parse the detail pages
        c_url01 = threading.Thread(target=self.consumer_url)
        c_url02 = threading.Thread(target=self.consumer_url)
        c_url03 = threading.Thread(target=self.consumer_url)
        c_url04 = threading.Thread(target=self.consumer_url)
        # One thread to store the data
        d_data = threading.Thread(target=self.data_storage)

        # Start all threads
        p_url.start()
        c_url01.start()
        c_url02.start()
        c_url03.start()
        c_url04.start()
        d_data.start()


if __name__ == "__main__":
    spider = PoetrySpider()
    spider.main()
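
As written, the consumer and storage threads loop forever, so the process keeps running even after the producer has exhausted all author pages. One possible refinement (a sketch under the assumption that sentinel values are acceptable, not part of the original design) is to enqueue one None per consumer when production ends and to join all the threads:

# Extends the PoetrySpider class defined above; uses the same imports.
class PoetrySpiderGraceful(PoetrySpider):
    NUM_CONSUMERS = 4

    def product_next_url(self):
        super().product_next_url()
        for _ in range(self.NUM_CONSUMERS):
            self.url_queue.put(None)          # one sentinel per consumer

    def consumer_url(self):
        while True:
            next_url = self.url_queue.get()
            if next_url is None:              # producer is finished
                self.poetry_queue.put(None)   # pass the shutdown signal on
                break
            html_str = self.get_html(next_url)
            data = self.parse_second_html(html_str)
            if data:
                self.poetry_queue.put(data)

    def data_storage(self):
        finished = 0
        with open('data.csv', 'w', encoding='utf-8', newline='') as csv_f:
            writer = csv.writer(csv_f)
            while finished < self.NUM_CONSUMERS:
                poetry = self.poetry_queue.get()
                if poetry is None:            # one consumer has finished
                    finished += 1
                    continue
                writer.writerow(poetry)

    def main(self):
        threads = [threading.Thread(target=self.product_next_url)]
        threads += [threading.Thread(target=self.consumer_url)
                    for _ in range(self.NUM_CONSUMERS)]
        threads.append(threading.Thread(target=self.data_storage))
        for t in threads:
            t.start()
        for t in threads:
            t.join()                          # returns once every queue drains

With this variant the process terminates cleanly once all pages have been crawled and written.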
