史上最简单的多线程爬小说

import queue
import threading
import time
from urllib.parse import urljoin

import requests
from lxml import etree
# Shared thread-safe FIFO of chapter URLs: A.run() puts them, B.run() gets them.
Q = queue.Queue()

class A(threading.Thread):
    """Producer thread: collects chapter URLs from a novel's index page.

    Args:
        url: Address of the chapter-list page to scan. Defaults to the
            index page this script was originally written for.
        timeout: Timeout in seconds for the index request, so a stalled
            connection cannot block the thread forever.
    """

    def __init__(self, url='http://www.17k.com/list/2926161.html', timeout=10):
        super().__init__()
        self.url = url
        self.timeout = timeout

    def run(self):
        """Fetch the index page and put every chapter URL on ``Q``."""
        try:
            resp = requests.get(self.url, timeout=self.timeout)
            # Do not parse an HTTP error page as if it were the index.
            resp.raise_for_status()
        except requests.RequestException as exc:
            print('获取目录失败 %s: %s' % (self.url, exc))
            return
        html = resp.content.decode('utf-8')
        text = etree.HTML(html)
        dds = text.xpath('//div[@class="Main List"]/dl[@class="Volume"]/dd/a/@href')
        for href in dds:
            # Resolve against the index URL so both root-relative and
            # absolute links yield a valid chapter address.
            Q.put(urljoin(self.url, href))

class B(threading.Thread):
    """Consumer thread: downloads each chapter URL taken from ``Q`` and saves it.

    Every chapter is written to ``./<chapter title>.txt`` in the current
    working directory, one extracted text node per line.

    Args:
        idle_timeout: Seconds to wait for a new URL before assuming no more
            work is coming and stopping the thread.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
    """

    # Characters that are unsafe in file names, each mapped to an underscore.
    _UNSAFE_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    def __init__(self, idle_timeout=30, timeout=10):
        super().__init__()
        self.idle_timeout = idle_timeout
        self.timeout = timeout

    def run(self):
        """Process queued chapter URLs until the queue stays empty."""
        while True:
            try:
                url = Q.get(timeout=self.idle_timeout)
            except queue.Empty:
                # Nothing arrived within the idle window: stop instead of
                # blocking forever, so join() on this thread can return.
                break
            try:
                self._save_chapter(url)
            except Exception as exc:
                # Worker boundary: report the failure and keep going so one
                # bad chapter does not kill the whole thread.
                print('保存失败 %s: %s' % (url, exc))
            finally:
                Q.task_done()

    @classmethod
    def _safe_filename(cls, name):
        """Return *name* with characters unsafe in file names replaced."""
        return name.translate(cls._UNSAFE_CHARS)

    def _save_chapter(self, url):
        """Fetch one chapter page and write its text to a UTF-8 file."""
        resp = requests.get(url, timeout=self.timeout)
        resp.raise_for_status()
        html = resp.content.decode('utf-8')
        text = etree.HTML(html)
        titles = text.xpath('//div[@class="readAreaBox content"]/h1/text()')
        if not titles:
            raise ValueError('chapter title not found')
        name = titles[0].strip()  # chapter title
        contents = text.xpath('//div[@class="readAreaBox content"]/div[@class="p"]/text()')
        print('正在保存%s' % name)
        # Open the file once for the whole chapter; the xpath result is a
        # list of separate text fragments, each written on its own line.
        with open('./%s.txt' % self._safe_filename(name), 'w', encoding='utf-8') as f:
            f.writelines(content + '\n' for content in contents)

if __name__ == '__main__':
    # Script entry point: run one A thread and one B thread, wait for both,
    # then print the elapsed wall-clock time in seconds.
    began = time.time()
    producer, consumer = A(), B()
    workers = (producer, consumer)
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    elapsed = time.time() - began
    print(elapsed)

你可能感兴趣的:(爬虫系列(个人学习过程))