python爬虫实战-多线程实例-解析文章标题及内容

import json
import os
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree

# Holds the crawler (page-fetching) thread objects
g_crawl_list=[]
# Holds the parser thread objects
g_parser_list=[]

class CrawlThread(threading.Thread):
	"""Crawler worker: pulls page numbers from page_queue, downloads the
	corresponding list page, and puts the raw HTML text into data_queue."""

	def __init__(self, name, page_queue, data_queue):
		super(CrawlThread, self).__init__()
		self.name = name
		self.page_queue = page_queue  # queue of page numbers still to fetch
		self.data_queue = data_queue  # output queue of raw HTML strings
		self.url = 'http://www.fanjian.net/jiantu-{}'
		self.headers = {
				'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
			}

	def run(self):
		print('%s 启动....(%s)' % (self.name, os.getpid()))
		while True:
			# Non-blocking get instead of empty()-then-get(): with several
			# crawl threads another thread may drain the queue between the
			# two calls, leaving a blocking get() stuck forever.
			try:
				page = self.page_queue.get(block=False)
			except Empty:
				break
			url = self.url.format(page)
			# timeout so a dead server cannot hang the thread indefinitely
			r = requests.get(url, headers=self.headers, timeout=10)
			self.data_queue.put(r.text)
		# Bug fix: original printed os.getppid() (parent PID) here but
		# os.getpid() at startup — report the same PID in both messages.
		print('%s 结束....(%s)' % (self.name, os.getpid()))


class ParserThread(threading.Thread):
	"""Parser worker: takes raw HTML from data_queue, extracts each
	article's title and image URLs, and appends them as a JSON line to
	the shared output file (writes serialized by `lock`)."""

	def __init__(self, name, data_queue, fp, lock):
		super(ParserThread, self).__init__()
		self.name = name
		self.data_queue = data_queue  # input queue of raw HTML strings
		self.fp = fp                  # shared output file handle
		self.lock = lock              # serializes writes to self.fp

	def run(self):
		print('%s 启动....(%s)' % (self.name, os.getpid()))
		while True:
			# Bug fix: the original `while 1` had no exit at all (its
			# empty-check was commented out, and even that version called
			# `self.data_queue.empty` without parentheses), so main()'s
			# join() on parser threads blocked forever.  Exit once no new
			# data arrives within the timeout — crawlers are long done by
			# then.
			try:
				data = self.data_queue.get(timeout=5)
			except Empty:
				break
			tree = etree.HTML(data)
			image_li_list = tree.xpath('//ul[@class="cont-list"]/li')
			image_items = []
			for oli in image_li_list:
				title = oli.xpath('.//h2/a/text()')[0]
				image_url = oli.xpath('.//div[@class="cont-list-main"]/p/img/@data-src')
				item = {
				'标题': title,
				'连接': image_url,
				}
				image_items.append(item)
			# One JSON document per page, one line each (JSON-lines style);
			# lock prevents interleaved writes from concurrent parsers.
			with self.lock:
				self.fp.write(json.dumps(image_items, ensure_ascii=False) + '\n')
		# Bug fix: original printed '启动' (started) here; report '结束'
		# (finished) with getpid(), matching CrawlThread's messages.
		print('%s 结束....(%s)' % (self.name, os.getpid()))

#创建队列函数
def create_queue():
	"""Build the two work queues: a page-number queue preloaded with
	pages 1-10 and an (initially empty) queue for raw HTML data."""
	page_queue = Queue()
	for num in range(1, 11):
		page_queue.put(num)
	return page_queue, Queue()

#创建采集线程
def creade_crawl_thread(page_queue, data_queue):
	"""Instantiate the three crawler threads and register them in the
	module-level g_crawl_list (they are started later by main())."""
	for thread_name in ('采集线程1号', '采集线程2号', '采集线程3号'):
		g_crawl_list.append(CrawlThread(thread_name, page_queue, data_queue))


#创建解析线程
def creade_parser_thread(data_queue, fp, lock):
	"""Instantiate the three parser threads and register them in the
	module-level g_parser_list (they are started later by main())."""
	for thread_name in ('解析线程1号', '解析线程2号', '解析线程3号'):
		g_parser_list.append(ParserThread(thread_name, data_queue, fp, lock))

def main():
	"""Wire everything together: build the queues, spawn the crawler and
	parser threads, start them all, and wait for them to finish."""
	page_queue, data_queue = create_queue()
	# Lock serializing writes to the shared output file.
	lock = threading.Lock()
	# Bug fix: the original open()/close() pair leaked the file handle if
	# any start()/join() raised; the context manager closes it always.
	with open('image_url.json', 'a', encoding='utf8') as fp:
		creade_crawl_thread(page_queue, data_queue)
		creade_parser_thread(data_queue, fp, lock)
		for worker in g_crawl_list:
			worker.start()
		for worker in g_parser_list:
			worker.start()
		# Wait for crawlers first (they feed data_queue), then parsers.
		for worker in g_crawl_list:
			worker.join()
		for worker in g_parser_list:
			worker.join()

if __name__ == '__main__':
	main()

你可能感兴趣的: python实战