# Python web-crawler practice - multithreading example: parse article titles and image links
import json
import os
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
g_crawl_list=[]
g_parser_list=[]
class CrawlThread(threading.Thread):
    """Downloader thread: fetches listing pages and queues their raw HTML.

    Pulls page numbers from ``page_queue``, downloads
    ``http://www.fanjian.net/jiantu-{page}``, and puts the response text
    onto ``data_queue`` for the parser threads to consume.
    """

    def __init__(self, name, page_queue, data_queue):
        """
        :param name: display name used in the start/stop log messages
        :param page_queue: Queue[int] of page numbers still to download
        :param data_queue: Queue[str] receiving downloaded page HTML
        """
        super(CrawlThread, self).__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jiantu-{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }

    def run(self):
        print('%s 启动....(%s)' % (self.name, os.getpid()))
        while True:
            # get_nowait()/Empty instead of empty()-then-get(): with several
            # crawl threads, empty() can be stale by the time get() runs and
            # the original pattern could block forever on a drained queue.
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                break
            url = self.url.format(page)
            try:
                # Timeout keeps a stalled server from hanging the thread;
                # raise_for_status turns HTTP errors into exceptions we skip.
                r = requests.get(url, headers=self.headers, timeout=10)
                r.raise_for_status()
            except requests.RequestException as e:
                print('%s 下载失败 %s: %s' % (self.name, url, e))
                continue
            self.data_queue.put(r.text)
        # Bug fix: original printed os.getppid() (parent pid) here while the
        # startup message used os.getpid(); use getpid() consistently.
        print('%s 结束....(%s)' % (self.name, os.getpid()))
class ParserThread(threading.Thread):
    """Parser thread: extracts titles/image URLs from HTML and writes JSON.

    Takes raw page HTML off ``data_queue``, pulls each list item's title and
    image URLs with XPath, and appends one JSON line per page to the shared
    file ``fp`` (writes serialized by ``lock``).
    """

    def __init__(self, name, data_queue, fp, lock):
        """
        :param name: display name used in the start/stop log messages
        :param data_queue: Queue[str] of page HTML produced by the crawlers
        :param fp: shared writable text file receiving JSON lines
        :param lock: threading.Lock serializing writes to ``fp``
        """
        super(ParserThread, self).__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        # Bug fix: original logged os.getppid() (parent pid) here; use
        # getpid() like the crawl threads do.
        print('%s 启动....(%s)' % (self.name, os.getpid()))
        while True:
            # Bug fix (deadlock): the original blocking get() never returned
            # once the crawlers finished, so main()'s join() on the parser
            # threads hung forever.  A timeout lets the thread exit after the
            # data queue has stayed empty long enough.
            try:
                data = self.data_queue.get(timeout=10)
            except Empty:
                break
            tree = etree.HTML(data)
            image_li_list = tree.xpath('//ul[@class="cont-list"]/li')
            image_items = []
            for oli in image_li_list:
                # Guard: a list item without a title would have raised
                # IndexError on [0] in the original; skip it instead.
                titles = oli.xpath('.//h2/a/text()')
                if not titles:
                    continue
                image_items.append({
                    '标题': titles[0],
                    '连接': oli.xpath('.//div[@class="cont-list-main"]/p/img/@data-src'),
                })
            # 'with' guarantees the lock is released even if write() raises.
            with self.lock:
                self.fp.write(json.dumps(image_items, ensure_ascii=False) + '\n')
        # Bug fix: original end-of-run message said 启动 (started); it should
        # say 结束 (ended).
        print('%s 结束....(%s)' % (self.name, os.getpid()))
def create_queue():
    """Build the two work queues.

    Returns a pair ``(page_queue, data_queue)``: the first preloaded with
    page numbers 1 through 10 for the crawl threads, the second empty and
    ready to carry downloaded HTML to the parser threads.
    """
    page_queue = Queue()
    data_queue = Queue()
    for page_number in range(1, 11):
        page_queue.put(page_number)
    return page_queue, data_queue
def creade_crawl_thread(page_queue, data_queue):
    """Create the three crawl threads and register them in g_crawl_list.

    Threads are only constructed here; main() starts and joins them.
    """
    for thread_name in ('采集线程1号', '采集线程2号', '采集线程3号'):
        g_crawl_list.append(CrawlThread(thread_name, page_queue, data_queue))
def creade_parser_thread(data_queue, fp, lock):
    """Create the three parser threads and register them in g_parser_list.

    Threads are only constructed here; main() starts and joins them.
    """
    for thread_name in ('解析线程1号', '解析线程2号', '解析线程3号'):
        g_parser_list.append(ParserThread(thread_name, data_queue, fp, lock))
def main():
    """Wire up queues, output file, and worker threads, then run to completion.

    Crawl threads drain the page queue into the data queue; parser threads
    drain the data queue into image_url.json (one JSON array per line).
    """
    page_queue, data_queue = create_queue()
    lock = threading.Lock()
    # Bug fix: manage the output file with a context manager so it is closed
    # even if starting or joining a thread raises (the original fp.close()
    # was skipped on any exception).
    with open('image_url.json', 'a', encoding='utf8') as fp:
        creade_crawl_thread(page_queue, data_queue)
        creade_parser_thread(data_queue, fp, lock)
        for tcrawl in g_crawl_list:
            tcrawl.start()
        for tparser in g_parser_list:
            tparser.start()
        # Wait for all downloads, then for all parsing, before the file closes.
        for tcrawl in g_crawl_list:
            tcrawl.join()
        for tparser in g_parser_list:
            tparser.join()
# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()