The company recently needed to scrape business information from a website, so I started a small crawler project: first crawl every link on the site, then search those pages for the information we need. This post is part one, which only collects the full set of site links; once a URL is in hand, extracting information from the page is not the hard part. The idea is straightforward: starting from the homepage URL, fetch its HTML and collect all the links in it, push the new URLs onto a queue, and call the fetch function again for each queued URL, repeating until the deepest pages of the site are reached and the crawl ends.
First, visit the homepage and fetch its HTML.
def getHtml(url, ua_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko',
            num_retries=5):
    """Download a page and return its HTML text, retrying on 5xx server errors."""
    headers = {"User-Agent": ua_agent}
    req = request.Request(url, headers=headers)
    html = None
    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
    except (error.HTTPError, error.URLError) as e:
        # "except A or B" only catches A; a tuple is needed to catch both classes.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # Keep the result of the retry (the original discarded the return value).
            html = getHtml(url, ua_agent, num_retries - 1)
    return html
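getHtml can be tried on its own before wiring it into the crawler; a minimal check (the URL below is just the seed used later in this post):

# Quick standalone check of the downloader; prints the first 200 characters.
page = getHtml("https://www.newchinalife.com/ncl/cn/new/index/index.shtml")
if page:
    print(page[:200])
else:
    print("download failed after retries")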
Second, parse the HTML and extract every href.
def get_urls(html):
    """
    Return a list of the href values of every hyperlink on the current page.
    """
    links = []
    soup = BeautifulSoup(html, "html.parser")
    url_list = soup.find_all('a')
    for link in url_list:
        href = link.get('href')
        if href:  # skip <a> tags without an href, which would add None to the list
            links.append(href)
    return links
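To see what get_urls returns, it can be run on a small HTML fragment (the fragment below is made up purely for illustration):

# Illustration only: a hand-written snippet, not a real page from the site.
sample_html = '<a href="/ncl/cn/about.shtml">About</a><a href="https://example.com">Ext</a><a>no href</a>'
print(get_urls(sample_html))
# Expected output: ['/ncl/cn/about.shtml', 'https://example.com']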
Third, filter the hrefs, keeping only links that can actually be visited.
def CrawlInfo(url, q):
    # Collect the links found on the current page
    global crawl_queue
    crawl_queue = []  # queue of URLs still waiting to be crawled
    hlinks = []
    html = getHtml(url)
    links = get_urls(html)
    for murl in links:
        if re.findall("^http", murl):
            # Already an absolute URL
            murl = str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        elif re.findall("^java", murl):
            # javascript: pseudo-links cannot be visited; skip them instead of
            # removing from `links` while iterating, which would skip elements
            continue
        elif re.findall("gsp$", murl) or re.findall("shtml$", murl) or re.findall("[0-9]+$", murl):
            # Site-relative page links ("[0-9]*$" would match every string, so require at least one digit)
            murl = "https://www.newchinalife.com" + str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        elif re.findall("^/", murl):
            # Root-relative links: prepend the site domain
            murl = "https://www.newchinalife.com" + str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        else:
            pass
    for link in hlinks:
        crawl_queue.append(link)
        time.sleep(0.001)
    q.put(url)  # this URL is finished; notify the main process
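The regexes above are tailored to this one site's link patterns. As a more general alternative (not what the code above does), urllib.parse.urljoin can resolve any relative href against the page it was found on; a minimal sketch:

from urllib.parse import urljoin

def normalize_links(base_url, links):
    # Sketch of a site-independent alternative to the regex rules above:
    # resolve every href against its page and drop javascript: pseudo-links.
    result = []
    for href in links:
        if href.startswith("javascript:"):
            continue
        result.append(urljoin(base_url, href))
    return result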
Fourth, save the links to a text file.
def save_file(murl, fileName):
    # Append the URL to the file in binary mode so successive calls accumulate
    with open(fileName, 'ab') as f:
        f.write(murl.encode())
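Because save_file only appends, xh_url.txt accumulates duplicates across pages and across runs; a small post-processing pass (a sketch that assumes the one-URL-per-line layout written above) can clean it up:

def dedupe_file(fileName="xh_url.txt"):
    # Read the appended URL list back, drop duplicates, and rewrite the file.
    with open(fileName, 'rb') as f:
        urls = set(line.strip() for line in f if line.strip())
    with open(fileName, 'wb') as f:
        for u in sorted(urls):
            f.write(u + b"\r\n")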
Fifth, the main function: create the queue and use a process pool to speed up crawling.
if __name__ == "__main__":
    # Use a process pool
    pool = Pool()
    q = Manager().Queue()
    crawled_queue = []  # URLs that have already been crawled
    seedUrl = "https://www.newchinalife.com/ncl/cn/new/index/index.shtml"
    # Crawl the seed page first; this populates crawl_queue
    CrawlInfo(seedUrl, q)
    crawl_queue.append(seedUrl)
    # De-duplicate the crawl queue before starting
    crawl_queue = list(set(crawl_queue))
    # Exit the loop once the crawl queue is empty
    while crawl_queue:
        url = crawl_queue.pop(0)
        # Hand the URL to a worker process from the pool
        pool.apply_async(func=CrawlInfo, args=(url, q))
        # Once a URL has been processed, move it to the crawled queue
        url = q.get()
        crawled_queue.append(url)
    pool.close()
    pool.join()
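One limitation worth noting: the loop never checks a URL against crawled_queue, so pages that link back to each other can be submitted repeatedly. A minimal guard, written as a variant of the while loop above (a sketch using the same variables), would be:

    while crawl_queue:
        url = crawl_queue.pop(0)
        if url in crawled_queue:  # skip URLs that have already been processed
            continue
        pool.apply_async(func=CrawlInfo, args=(url, q))
        crawled_queue.append(q.get())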
Here is the complete code:
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 14:31:56 2018
@author: 数据猫JL
"""
from urllib import request
from urllib import error
from bs4 import BeautifulSoup
import re
from multiprocessing import Pool
from multiprocessing import Manager
import time
def getHtml(url, ua_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko',
            num_retries=5):
    """Download a page and return its HTML text, retrying on 5xx server errors."""
    headers = {"User-Agent": ua_agent}
    req = request.Request(url, headers=headers)
    html = None
    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
    except (error.HTTPError, error.URLError) as e:
        # "except A or B" only catches A; a tuple is needed to catch both classes.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # Keep the result of the retry (the original discarded the return value).
            html = getHtml(url, ua_agent, num_retries - 1)
    return html
def get_urls(html):
    """
    Return a list of the href values of every hyperlink on the current page.
    """
    links = []
    soup = BeautifulSoup(html, "html.parser")
    url_list = soup.find_all('a')
    for link in url_list:
        href = link.get('href')
        if href:  # skip <a> tags without an href, which would add None to the list
            links.append(href)
    return links
# Matching rules: links starting with ^http, or endings such as com$ / cn$
def save_file(murl, fileName):
    # Append the URL to the file in binary mode so successive calls accumulate
    with open(fileName, 'ab') as f:
        f.write(murl.encode())
def CrawlInfo(url, q):
    # Collect the links found on the current page
    global crawl_queue
    crawl_queue = []  # queue of URLs still waiting to be crawled
    hlinks = []
    html = getHtml(url)
    links = get_urls(html)
    for murl in links:
        if re.findall("^http", murl):
            # Already an absolute URL
            murl = str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        elif re.findall("^java", murl):
            # javascript: pseudo-links cannot be visited; skip them instead of
            # removing from `links` while iterating, which would skip elements
            continue
        elif re.findall("gsp$", murl) or re.findall("shtml$", murl) or re.findall("[0-9]+$", murl):
            # Site-relative page links ("[0-9]*$" would match every string, so require at least one digit)
            murl = "https://www.newchinalife.com" + str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        elif re.findall("^/", murl):
            # Root-relative links: prepend the site domain
            murl = "https://www.newchinalife.com" + str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        else:
            pass
    for link in hlinks:
        crawl_queue.append(link)
        time.sleep(0.001)
    q.put(url)  # this URL is finished; notify the main process
if __name__ == "__main__":
    # Use a process pool
    pool = Pool()
    q = Manager().Queue()
    crawled_queue = []  # URLs that have already been crawled
    seedUrl = "https://www.newchinalife.com/ncl/cn/new/index/index.shtml"
    # Crawl the seed page first; this populates crawl_queue
    CrawlInfo(seedUrl, q)
    crawl_queue.append(seedUrl)
    # De-duplicate the crawl queue before starting
    crawl_queue = list(set(crawl_queue))
    # Exit the loop once the crawl queue is empty
    while crawl_queue:
        url = crawl_queue.pop(0)
        # Hand the URL to a worker process from the pool
        pool.apply_async(func=CrawlInfo, args=(url, q))
        # Once a URL has been processed, move it to the crawled queue
        url = q.get()
        crawled_queue.append(url)
    pool.close()
    pool.join()
To scrape the actual page information, iterate over the URLs in the saved file and process each downloaded page; to adapt the crawler to a different site, only the regular-expression section needs to be rewritten after analysing that site's link structure.
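As a starting point for that second stage, one can iterate over xh_url.txt and reuse getHtml on each line; in the sketch below, extract_info is a hypothetical placeholder for whatever site-specific parsing is needed:

def crawl_saved_urls(fileName="xh_url.txt"):
    # Re-download every URL collected in part one and hand the HTML to a parser.
    with open(fileName, 'r', encoding='utf-8') as f:
        urls = set(line.strip() for line in f if line.strip())
    for url in urls:
        html = getHtml(url)
        if html:
            extract_info(html)  # hypothetical: site-specific information extraction goes here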