# Four ways in Python to parse a web page and extract its links

# 链接解释器
# 使用三种不同解释器 环境python3.8
#
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin
import time
# 非标准库
from bs4 import BeautifulSoup, SoupStrainer

# 非标准库
# from html5lib import html5parser, treebuilders,treewalkers

# 要解析的网站
URLS = ('http://python.org',)


# 定义输出函数
def output(x):
    """Print the unique items of *x*, one per line, in sorted order.

    :param x: any iterable of strings (duplicates are removed via a set)
    """
    # sorted() accepts any iterable directly; wrapping the set in list() was redundant
    print('\n'.join(sorted(set(x))))


def simpleBS(url, f):
    """Parse the whole document with BeautifulSoup and print all anchor URLs.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data (bytes or str)
    """
    # Name the parser explicitly: BeautifulSoup(f) alone emits a
    # GuessedAtParserWarning and picks whichever parser is installed,
    # making results environment-dependent.
    soup = BeautifulSoup(f, 'html.parser')
    # href=True skips <a> tags without an href attribute, which would
    # otherwise raise KeyError on tag['href'].
    output(urljoin(url, a['href']) for a in soup.find_all('a', href=True))


def fasterBS(url, f):
    """Parse only <a> tags (via SoupStrainer) and print all anchor URLs.

    Restricting the parse with a SoupStrainer is faster than building the
    full tree, which is the point of this variant.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data (bytes or str)
    """
    # Explicit parser avoids GuessedAtParserWarning / environment-dependent
    # behavior. find_all('a', href=True) replaces the fragile "[1:]" slice
    # the original used to skip the leading doctype node, and also skips
    # anchors without an href (which would KeyError on tag['href']).
    soup = BeautifulSoup(f, 'html.parser', parse_only=SoupStrainer('a'))
    output(urljoin(url, a['href']) for a in soup.find_all('a', href=True))


def htmlparser(url, f):
    """Extract anchor hrefs with a custom html.parser.HTMLParser subclass.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data as bytes (decoded before feeding the parser)
    """

    class AnchorParser(HTMLParser):
        def __init__(self):
            super().__init__()
            # Initialize eagerly: the original created self.data lazily
            # inside handle_starttag, so a page with no <a> tags raised
            # AttributeError when parser.data was read below.
            self.data = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            for name, value in attrs:
                # value can be None for a bare `href` attribute; skip those
                if name == 'href' and value is not None:
                    self.data.append(value)

    parser = AnchorParser()
    # feed() only accepts str, so decode the downloaded bytes first
    parser.feed(f.decode())
    output(urljoin(url, href) for href in parser.data)


def html5libparse(url, f):
    """Parse with BeautifulSoup using html5lib as the backend parser.

    html5lib parses the way browsers do (most lenient, slowest).

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data as bytes
    """
    soup = BeautifulSoup(f.decode(), 'html5lib')
    # href=True skips <a> tags without an href attribute, which would
    # otherwise raise KeyError on tag['href'].
    output(urljoin(url, a['href']) for a in soup.find_all('a', href=True))


def process(url, data):
    """Run every parser over the same page data and report wall-clock timings.

    :param url: base URL of the page (passed through to each parser)
    :param data: raw page bytes as downloaded
    """
    # Data-driven loop replaces four copy-pasted timing stanzas, which had
    # drifted into inconsistent report formats ("simple BS:%.3fs" vs
    # "faster BS %.3fs").
    parsers = (
        ('simple BS', simpleBS),
        ('faster BS', fasterBS),
        ('HTMLParse', htmlparser),
        ('HTML5lib', html5libparse),
    )
    for label, parse in parsers:
        print('\n*** %s' % label)
        start = time.time()
        parse(url, data)
        print('%s %.3fs used' % (label, time.time() - start))

# Entry point


def main():
    """Download each URL in URLS and run all four parsers over its bytes."""
    for url in URLS:
        # Context manager guarantees the connection is closed even if
        # read() raises (the original's f.close() was skipped on error).
        with urlopen(url) as resp:
            data = resp.read()
        process(url, data)


# Guard the entry call so importing this module doesn't trigger network I/O.
if __name__ == '__main__':
    main()

# (blog footer) You may also be interested in: python notes, python