from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin
import time
from bs4 import BeautifulSoup, SoupStrainer
# Pages each anchor-extraction strategy is benchmarked against.
URLS = ('http://python.org',)
def output(x):
    """Print the unique items of *x*, sorted, one per line.

    :param x: iterable of strings (duplicates are dropped)
    """
    # sorted() accepts any iterable, so the intermediate list() is unnecessary.
    print('\n'.join(sorted(set(x))))
def simpleBS(url, f):
    """
    Use BeautifulSoup to parse the whole document and extract anchors.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data (bytes or str)
    :return: None (prints the sorted, deduplicated absolute URLs)
    """
    # Name the parser explicitly: omitting it triggers GuessedAtParserWarning
    # and makes results depend on which parsers happen to be installed.
    anchors = BeautifulSoup(f, 'html.parser').find_all('a')
    # Skip anchors without an href attribute instead of raising KeyError.
    output(urljoin(url, a['href']) for a in anchors if a.has_attr('href'))
def fasterBS(url, f):
    """
    Use BeautifulSoup with a SoupStrainer so only <a> tags are parsed.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data (bytes or str)
    :return: None (prints the sorted, deduplicated absolute URLs)
    """
    soup = BeautifulSoup(f, 'html.parser', parse_only=SoupStrainer('a'))
    # find_all('a') yields only Tag objects; iterating the soup directly (as
    # the old code did, with a magic [1:] slice) can also yield NavigableString
    # nodes, which blow up on subscripting.
    anchors = soup.find_all('a')
    # Skip anchors without an href attribute instead of raising KeyError.
    output(urljoin(url, a['href']) for a in anchors if a.has_attr('href'))
def htmlparser(url, f):
    """
    Use a custom HTMLParser subclass to extract anchors.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data (bytes)
    :return: None (prints the sorted, deduplicated absolute URLs)
    """
    class AnchorParser(HTMLParser):
        def __init__(self):
            super().__init__()
            # Initialize eagerly: the old lazy creation inside
            # handle_starttag raised AttributeError on pages with no <a> tags.
            self.data = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            href = dict(attrs).get('href')
            if href is not None:
                self.data.append(href)

    parser = AnchorParser()
    parser.feed(f.decode())
    output(urljoin(url, href) for href in parser.data)
def html5libparse(url, f):
    """
    Use the html5lib library as BeautifulSoup's parser to extract anchors.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data (bytes)
    :return: None (prints the sorted, deduplicated absolute URLs)
    """
    anchors = BeautifulSoup(f.decode(), 'html5lib').find_all('a')
    # Skip anchors without an href attribute instead of raising KeyError.
    output(urljoin(url, a['href']) for a in anchors if a.has_attr('href'))
def process(url, data):
    """Run each anchor-parsing strategy against *data* and report its timing.

    :param url: base URL passed through to every parser
    :param data: raw page bytes fetched from *url*
    """
    # (header title, parser function, timing format string)
    benchmarks = (
        ('simple BS', simpleBS, 'simple BS:%.3fs used'),
        ('faster BS', fasterBS, 'faster BS %.3fs used'),
        ('HTMLParse', htmlparser, 'HTMLParse %.3fs used'),
        ('HTML5lib', html5libparse, 'HTML5lib %.3fs used'),
    )
    for title, parse, fmt in benchmarks:
        print('\n*** ' + title)
        started = time.time()
        parse(url, data)
        print(fmt % (time.time() - started))
def main():
    """Fetch each benchmark URL and run every parsing strategy on it."""
    for url in URLS:
        # Context manager guarantees the response is closed even if
        # read() raises, unlike the old manual open/read/close sequence.
        with urlopen(url) as f:
            data = f.read()
        process(url, data)
# Guard the entry point so importing this module does not trigger network I/O.
if __name__ == '__main__':
    main()