Python: crawling all of a site's hyperlinks and URLs breadth-first

Recently my company needed to crawl business information from a website, so I started a small crawler project: first crawl every link on the site, then search those pages for the information we need. This post covers the first part, crawling all of the site's links; extracting information once a URL is passed in is not the hard part. The basic idea: starting from the homepage URL, fetch the HTML and collect all the links in it, then push the new URLs onto a queue and call the fetch function again for each of them, repeating until the deepest pages of the site are reached and the crawl ends.
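Before going through the pieces, here is the overall loop as a minimal sketch (illustration only, separate from the multiprocessing version further down; getHtml and get_urls are the helpers defined in the steps below):

from collections import deque

def bfs_crawl(seed_url, max_pages=100):
    """Breadth-first sketch: fetch a page, enqueue its links, repeat."""
    queue = deque([seed_url])        # URLs waiting to be crawled
    visited = set()                  # URLs already crawled, so we never loop
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        html = getHtml(url)          # step 1 below
        if html is None:
            continue
        # get_urls (step 2 below) returns raw hrefs; step 3 handles
        # filtering and turning relative links into absolute ones
        for link in get_urls(html):
            if link not in visited:
                queue.append(link)
    return visited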

1. First, start from the homepage and download its HTML

def getHtml(url, ua_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko',
            num_retries=5):
    """Download a page and return its HTML as text, retrying on 5xx errors."""
    headers = {"User-Agent": ua_agent}
    req = request.Request(url, headers=headers)
    html = None
    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
    except (error.HTTPError, error.URLError) as e:
        # Retry only on server-side (5xx) errors, up to num_retries times
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            html = getHtml(url, ua_agent, num_retries - 1)
    return html
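A quick sanity check (a minimal usage sketch; any reachable page URL works here):

html = getHtml("https://www.newchinalife.com/ncl/cn/new/index/index.shtml")
if html:
    print(len(html))   # page downloaded; print its length as a rough check
else:
    print("download failed after all retries")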

2. Parse the HTML and extract every href

def get_urls(html):
    """
    Return a list of all hyperlink (href) values on the current page.
    """
    links = []
    soup = BeautifulSoup(html, "html.parser")
    url_list = soup.find_all('a')
    for link in url_list:
        href = link.get('href')
        if href:  # skip <a> tags that have no href attribute
            links.append(href)
    return links
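For example, on a small hand-written snippet (illustrative HTML, not taken from the real site), get_urls returns only the href values and skips anchors without one:

sample_html = """
<html><body>
  <a href="https://www.example.com/a">A</a>
  <a href="/relative/path">B</a>
  <a>no href</a>
</body></html>
"""
print(get_urls(sample_html))
# ['https://www.example.com/a', '/relative/path']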

3. Filter the hrefs, keeping only links that can actually be visited

def CrawlInfo(url, q):
    # Fetch the current page and collect its outgoing links
    global crawl_queue
    crawl_queue = []   # queue of URLs still to be crawled
    hlinks = []
    html = getHtml(url)
    if html is None:   # download failed even after retries
        q.put(url)
        return
    links = get_urls(html)
    for murl in links:
        if re.findall("^http", murl):
            # Absolute link: save it as-is
            murl = str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        elif re.findall("^java", murl):
            # javascript: pseudo-links cannot be crawled, skip them
            continue
        elif re.findall("gsp$", murl) or re.findall("shtml$", murl) or re.findall("[0-9]+$", murl):
            # Relative link ending in gsp/shtml/digits: prepend the site domain
            murl = "https://www.newchinalife.com" + str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        elif re.findall("^/", murl):
            # Other root-relative link: prepend the site domain
            murl = "https://www.newchinalife.com" + str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        else:
            pass
    for _ in hlinks:
        crawl_queue.append(_)
        time.sleep(0.001)
    q.put(url)  # this URL has been processed; notify the main process
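The domain prefix above is hard-coded for this one site. If you want something more general, the standard library's urllib.parse.urljoin can resolve a relative href against the page it was found on; a small sketch (not part of the original code, and the normalize helper is just illustrative):

from urllib.parse import urljoin

def normalize(base_url, href):
    """Resolve a possibly relative href against the page it came from."""
    if not href or href.startswith(("javascript:", "mailto:", "#")):
        return None
    return urljoin(base_url, href)

# normalize("https://www.newchinalife.com/ncl/cn/new/index/index.shtml", "/ncl/about.shtml")
# -> "https://www.newchinalife.com/ncl/about.shtml"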

4. Save the links to a text file

def save_file(murl,fileName):
    with open(fileName,'ab') as f:
        f.write(murl.encode())

5. The main function: crawl with a process pool to speed things up, and create the queue

if __name__ == "__main__":
    # Use a process pool
    pool = Pool()
    q = Manager().Queue()
    crawled_queue = []  # URLs that have already been crawled
    seedUrl = "https://www.newchinalife.com/ncl/cn/new/index/index.shtml"
    # Handle the seed page first (this fills crawl_queue)
    CrawlInfo(seedUrl, q)
    crawl_queue.append(seedUrl)
    # De-duplicate the pending queue
    crawl_queue = list(set(crawl_queue))
    # Exit the loop once the pending queue is empty
    while crawl_queue:
        url = crawl_queue.pop(0)
        # Hand the URL to a process from the pool
        pool.apply_async(func=CrawlInfo, args=(url, q))
        # Once a URL has been processed, move it to the crawled queue
        url = q.get()
        crawled_queue.append(url)
    pool.close()
    pool.join()
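One caveat worth knowing: because crawl_queue is an ordinary global list, each pool worker works on its own copy of it, so links discovered inside workers never reach the main process's pending queue; in practice only the seed page's links get scheduled. If you need a true breadth-first crawl of the whole site, one possible (purely hypothetical) adjustment is to have workers report their discovered links back through the Manager queue and let the main process own the single pending queue:

def crawl_worker(url, q):
    # Hypothetical variant of CrawlInfo: report discovered links via the queue
    # (links could be normalized with urljoin as sketched above)
    html = getHtml(url)
    found = get_urls(html) if html else []
    q.put((url, found))

# In the main loop, the parent would then do something like:
#   finished_url, new_links = q.get()
#   crawled_queue.append(finished_url)
#   for link in new_links:
#       if link not in crawled_queue and link not in crawl_queue:
#           crawl_queue.append(link)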

Here is the complete code:

# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 14:31:56 2018

@author: 数据猫JL
"""
from urllib import request
from urllib import error
from bs4 import BeautifulSoup
import re
from multiprocessing import Pool
from multiprocessing import Manager
import time
def getHtml(url, ua_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko',
            num_retries=5):
    """Download a page and return its HTML as text, retrying on 5xx errors."""
    headers = {"User-Agent": ua_agent}
    req = request.Request(url, headers=headers)
    html = None
    try:
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
    except (error.HTTPError, error.URLError) as e:
        # Retry only on server-side (5xx) errors, up to num_retries times
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            html = getHtml(url, ua_agent, num_retries - 1)
    return html

def get_urls(html):
    """
    Return a list of all hyperlink (href) values on the current page.
    """
    links = []
    soup = BeautifulSoup(html, "html.parser")
    url_list = soup.find_all('a')
    for link in url_list:
        href = link.get('href')
        if href:  # skip <a> tags that have no href attribute
            links.append(href)
    return links
#    Matching rules: ^http, or endings such as com$ / cn$


def save_file(murl,fileName):
    with open(fileName,'ab') as f:
        f.write(murl.encode()) 

def CrawlInfo(url, q):
    # Fetch the current page and collect its outgoing links
    global crawl_queue
    crawl_queue = []   # queue of URLs still to be crawled
    hlinks = []
    html = getHtml(url)
    if html is None:   # download failed even after retries
        q.put(url)
        return
    links = get_urls(html)
    for murl in links:
        if re.findall("^http", murl):
            # Absolute link: save it as-is
            murl = str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        elif re.findall("^java", murl):
            # javascript: pseudo-links cannot be crawled, skip them
            continue
        elif re.findall("gsp$", murl) or re.findall("shtml$", murl) or re.findall("[0-9]+$", murl):
            # Relative link ending in gsp/shtml/digits: prepend the site domain
            murl = "https://www.newchinalife.com" + str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        elif re.findall("^/", murl):
            # Other root-relative link: prepend the site domain
            murl = "https://www.newchinalife.com" + str(murl) + "\r\n"
            hlinks.append(murl)
            save_file(murl, "xh_url.txt")
        else:
            pass
    for _ in hlinks:
        crawl_queue.append(_)
        time.sleep(0.001)
    q.put(url)  # this URL has been processed; notify the main process

if __name__ == "__main__":
    # Use a process pool
    pool = Pool()
    q = Manager().Queue()
    crawled_queue = []  # URLs that have already been crawled
    seedUrl = "https://www.newchinalife.com/ncl/cn/new/index/index.shtml"
    # Handle the seed page first (this fills crawl_queue)
    CrawlInfo(seedUrl, q)
    crawl_queue.append(seedUrl)
    # De-duplicate the pending queue
    crawl_queue = list(set(crawl_queue))
    # Exit the loop once the pending queue is empty
    while crawl_queue:
        url = crawl_queue.pop(0)
        # Hand the URL to a process from the pool
        pool.apply_async(func=CrawlInfo, args=(url, q))
        # Once a URL has been processed, move it to the crawled queue
        url = q.get()
        crawled_queue.append(url)
    pool.close()
    pool.join()
    
    

If you then need to crawl the site's content, you can iterate over the URLs saved in the file and process each downloaded page. To adapt the crawler to a different site, only the regular-expression part needs to be rewritten after analyzing that site's link structure.
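For that second stage, a possible shape is to read the saved URLs back in and run a page-level parser over each one (a sketch only; extract_info is a hypothetical placeholder for whatever site-specific parsing you need):

def extract_info(html):
    # Hypothetical placeholder: put the site-specific parsing logic here
    soup = BeautifulSoup(html, "html.parser")
    return soup.title.string if soup.title else ""

with open("xh_url.txt", "r", encoding="utf-8") as f:
    urls = {line.strip() for line in f if line.strip()}  # de-duplicate

for url in urls:
    html = getHtml(url)
    if html:
        print(url, extract_info(html))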
