Date: 2019-07-03
Author: Sun
The earlier web crawler code, optimized as follows:
# -*- coding: utf-8 -*-
__author__ = 'sun'
__date__ = '2019/7/3 上午10:53'
from bs4 import BeautifulSoup as BSP4
import requests
g_set = set()  # URLs that have already been crawled, used for de-duplication
URL_LIST = [
    ('https://www.geyanw.com/lizhimingyan/list_33_1.html', '励志名言', 'lizhimingyan'),
    ('https://www.geyanw.com/renshenggeyan/list_32_1.html', '人生格言', 'renshenggeyan'),
    ('https://www.geyanw.com/mingyanjingju/list_37_1.html', '名言警句', 'mingyanjingju'),
    ('https://www.geyanw.com/html/mingrenmingyan/list_1_1.html', '名人名言', 'mingrenmingyan'),
    ('https://www.geyanw.com/html/dushumingyan/list_5_1.html', '读书名言', 'dushumingyan'),
    ('https://www.geyanw.com/html/jingdianmingyan/list_2_1.html', '经典名言', 'jingdianmingyan'),
]
def store_file(filename, response):
    # persist the raw HTML of a response to a local file
    html_doc = response.text
    with open("geyan_%s.html" % filename, "w", encoding="utf-8") as f:
        f.write(html_doc)
def download(url, filename="index", store_flag=True):
    '''
    :param url: URL to crawl
    :param filename: name of the HTML file to store
    :param store_flag: whether to persist the page locally
    :return: the requests Response object
    '''
    response = requests.get(url)
    if store_flag:
        store_file(filename, response)
    return response
def parse_page(page, ctype, url):
    response = download(url, store_flag=False)
    html_doc = response.content
    soup = BSP4(html_doc, "lxml")
    link_list = soup.select("#p_left .newlist ul h2 a")
    #print(link_list)
    index = 1
    for link in link_list:
        url_link = "https://www.geyanw.com" + link['href']
        print("ctype:" + ctype + ", page: " + str(page) + ", url_link: " + url_link)
        if url_link not in g_set:
            g_set.add(url_link)  # remember the link so it is not downloaded twice
            index += 1
            # store_file() already appends ".html", so no suffix is added to the filename here
            response = download(url_link, filename="%s_%s" % (ctype, index), store_flag=False)
def parse(response):
    url = response.url
    #print(url)
    base_urls = url.split("/list_")
    print(base_urls)
    domain = base_urls[0]
    init_html = base_urls[-1]
    print(domain)
    print(init_html)
    ctype = init_html.split("_")[0]
    cindex = init_html.split("_")[1].split(".")[0]
    g_set.add(url)
    html_doc = response.content
    soup = BSP4(html_doc, "lxml")
    #page_list = soup.select("#p_left .newlist .pagelist li a")  # pagination links
    #print(page_list)
    total_num = soup.select("#p_left .newlist .pagelist .pageinfo strong")[0]
    page_max = int(total_num.get_text())
    # crawl the remaining pages of this category (page 1 is the entry page itself)
    for page in range(2, page_max + 1):
        parse_page(page, ctype, "%s/list_%s_%s.html" % (domain, ctype, page))
def process(entry_url):
    try:
        response = download(entry_url, store_flag=False)
        parse(response)  # keep downloading and parsing separate
        return True
    except Exception as e:
        print("failed to process %s: %s" % (entry_url, e))
        return False
'''
Crawl using multiple processes
'''
def multprocess_run():
    from multiprocessing import Pool
    pool = Pool(processes=8)
    result = []
    for (entry_url, name, ctype) in URL_LIST:
        pc = pool.apply_async(process, args=(entry_url,))
        result.append(pc)
    pool.close()
    pool.join()
'''
Use coroutines to handle the concurrency
'''
import asyncio
@asyncio.coroutine
def async_io_loop(entry_url):
    # process() is a blocking function built on requests, so hand it to the default
    # executor; "yield from process(entry_url)" would fail because process() returns
    # a bool, which is not awaitable
    loop = asyncio.get_event_loop()
    yield from loop.run_in_executor(None, process, entry_url)
def async_run():
    loop = asyncio.get_event_loop()
    tasks = [async_io_loop(url) for (url, name, ctype) in URL_LIST]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
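# Note (an addition, not in the original post): process() does blocking I/O with the
# requests library, so the coroutine version above only offloads work to a thread pool.
# A minimal sketch of a genuinely non-blocking fetch, assuming the third-party aiohttp
# package is installed, could look like this; it only downloads the entry pages and
# skips parsing:
def aiohttp_run():
    import aiohttp
    async def fetch(session, url):
        # non-blocking GET that returns the page body as text
        async with session.get(url) as resp:
            return await resp.text()
    async def crawl_all(urls):
        # share one connection pool across all requests
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(fetch(session, u) for u in urls))
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(crawl_all([url for (url, name, ctype) in URL_LIST]))
    finally:
        loop.close()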
import threading
import queue
import time
class Worker(threading.Thread):
    def __init__(self, name, task_queue):
        threading.Thread.__init__(self, name=name)  # forward the name to the Thread base class
        self.queue = task_queue
        self.start()
    def run(self):
        while True:
            try:
                # non-blocking get so the thread exits once the queue is drained
                url = self.queue.get_nowait()
            except queue.Empty:
                break
            print(self.getName() + " process " + str(url))
            process(url)
            self.queue.task_done()
def multithread_run():
    squeue = queue.Queue()
    for (url, name, ctype) in URL_LIST:
        squeue.put(url)
    for i in range(10):
        threadName = 'Thread' + str(i)
        Worker(threadName, squeue)
    squeue.join()
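# Note (an addition, not in the original post): the standard library's concurrent.futures
# module gives the same thread-based concurrency without a hand-rolled Worker class;
# a short sketch:
def threadpool_run():
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map blocks until every entry URL has been processed
        results = list(executor.map(process, [url for (url, name, ctype) in URL_LIST]))
    print(results)  # the True/False flags returned by process()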
def main():
    #multprocess_run()
    #async_run()
    multithread_run()
    # for (url, name, type) in URL_LIST:
    #     process(url, name, type)
    #[process(url, name, type) for (url, name, type) in URL_LIST]
    # entry_url = "https://www.geyanw.com/lizhimingyan/list_33_1.html"
    # process(entry_url)
if __name__ == "__main__":
    main()
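As a sketch (not part of the original script), a small timing harness could be appended to compare the three strategies; it reuses the time module already imported above, and the helper name compare_strategies is made up here:

def compare_strategies():
    # rough wall-clock comparison; the numbers depend heavily on network conditions
    for label, runner in [("multiprocess", multprocess_run),
                          ("coroutine", async_run),
                          ("multithread", multithread_run)]:
        g_set.clear()  # reset the de-duplication set so every run does the same work
        start = time.perf_counter()
        runner()
        print("%s run took %.2f s" % (label, time.perf_counter() - start))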