问题:是否能够通过一个 wiki 页面上的站内链接,经过最多六次跳转,到达另一个 wiki 页面?对于本书,我们的任务是从 https://en.wikipedia.org/wiki/Eric_Idle 跳转到 https://en.wikipedia.org/wiki/Kevin_Bacon 。
书上都写了,不讲了
反正疫情在家闲着也是闲着,让笔记本开着跑了三天,最后的结果是:
获取一个 wiki 页面并保存到本地(毕竟有墙,保存下来方便出错后重新跑):
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from http.client import HTTPResponse
import time
# Directory in which downloaded wiki pages are cached.
storage_directory = 'D:/MyResources/爬虫数据/Wiki Pages'

# Deletion table for the characters removed from file names: " ? * < > : / \ |
_FORBIDDEN_FILENAME_CHARS = str.maketrans('', '', '"?*<>:/\\|')


def process_filename(filename: str) -> str:
    """Turn a page name into a path inside ``storage_directory``.

    The characters ``" ? * < > : / \\ |`` are removed from ``filename``.
    If nothing usable is left (the result is empty or consists only of
    dots), the SHA-1 hex digest of the original ``filename`` is used as
    the name instead.

    The digest replaces the built-in ``hash()``, whose value for strings
    differs between interpreter runs, so that a page cached under a
    fallback name can still be located when the script is started again.

    :param filename: raw file name, e.g. ``'Kevin_Bacon.html'``.
    :return: ``storage_directory + '/' + <sanitised name>``.
    """
    import hashlib  # function-scope import keeps this block self-contained

    cleaned = filename.translate(_FORBIDDEN_FILENAME_CHARS)
    if not cleaned or len(cleaned) == cleaned.count('.'):
        cleaned = hashlib.sha1(filename.encode('utf-8')).hexdigest()
    return storage_directory + '/' + cleaned
def get_and_store_page(url: str, filename: str) -> bool:
    """Download ``url`` and store its HTML in the local page cache.

    The response body is decoded as UTF-8 and written to the path that
    ``process_filename(filename)`` returns. After a successful write the
    function sleeps for one second before returning.

    Every failure is reported with ``print`` and turned into a ``False``
    result rather than an exception. This covers reading and decoding the
    body as well as opening the connection, and any ``OSError`` while
    writing the file.

    :param url: absolute URL of the page to download.
    :param filename: cache file name; sanitised by ``process_filename``.
    :return: ``True`` if the page was downloaded and written, else ``False``.
    """
    from http.client import HTTPException  # e.g. IncompleteRead on a cut-off body

    try:
        # The context manager closes the connection even if read() fails.
        with urlopen(url) as response:  # type: HTTPResponse
            html = response.read().decode(encoding='utf-8')
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        print(f'HTTPError: {e}')
        return False
    except URLError as e:
        print(f'URLError: {e}')
        return False
    except (OSError, HTTPException, UnicodeDecodeError) as e:
        print(f'read error: {e}')
        return False

    try:
        with open(file=process_filename(filename), mode='w', encoding='utf-8') as f:
            f.write(html)
    except OSError as e:
        print(f'check your file name: {e}')
        return False

    time.sleep(1)
    return True
def load_stored_html(filename: str) -> 'tuple[str, bool]':
    """Read a previously cached page from disk.

    :param filename: cache file name; sanitised by ``process_filename``.
    :return: ``(html, True)`` when the file exists, otherwise ``('', False)``
        after printing the error.
    """
    try:
        # ``with`` closes the handle even when read() raises.
        with open(file=process_filename(filename), mode='r', encoding='utf-8') as f:
            return f.read(), True
    except FileNotFoundError as e:
        print(f'check your filename: {e}')
        return '', False
if __name__ == '__main__':
    # Download the two endpoint pages and report the outcome of each.
    for page in ('Kevin_Bacon', 'Eric_Idle'):
        page_url = 'https://en.wikipedia.org/wiki/' + page
        if get_and_store_page(url=page_url, filename=page + '.html'):
            print(f'success: {page_url}')
        else:
            print(f'fail: {page_url}')
验证六度分隔理论
from bs4 import BeautifulSoup
from bs4.element import Tag
from CH3_GetWikipedia import load_stored_html, get_and_store_page
import re
import time
import copy
# Scheme and host that are prepended to every relative /wiki/ path.
host = 'https://en.wikipedia.org'
# Maps a visited path to the smallest jump count at which it was reached.
visited_url = {}
# URL at each depth (0 through 6) of the path currently being explored.
jump_path = [''] * 7
# Paths to the target page that have been found so far.
results = []
# Hrefs that start with /wiki/ and contain no colon anywhere.
_ARTICLE_HREF = re.compile('^(/wiki/)((?!:).)*$')


def find_kevin_bacon(path: str, jumps: int) -> None:
    """Search depth-first from ``path`` for the ``Kevin_Bacon`` page.

    When the last segment of ``path`` is ``Kevin_Bacon``, the path from the
    start page to it is appended to ``results``, printed, and appended to
    ``./result.txt``. Otherwise the page is loaded from the local cache
    (downloading it first on a cache miss) and every matching link inside
    its ``bodyContent`` div is followed recursively.

    A page is expanded again only if it is reached with fewer jumps than on
    any earlier visit, and no page is expanded once ``jumps`` reaches 6.

    :param path: site-relative path such as ``'/wiki/Eric_Idle'``.
    :param jumps: number of jumps taken to reach ``path``; indexes the
        seven-slot ``jump_path`` list, so it must be between 0 and 6.
    """
    jump_path[jumps] = host + path
    page_name = path.split('/')[-1]

    if page_name == 'Kevin_Bacon':
        print("!!!! it's found!")
        # Only the first jumps + 1 slots belong to this path; later slots may
        # still hold URLs left over from a deeper branch explored earlier.
        found_path = jump_path[:jumps + 1]
        results.append(found_path)
        with open(file='./result.txt', mode='a', encoding='utf-8') as f:
            for u in found_path:
                print(u)
                f.write(u + '\n')
            f.write('--------------------\n')
        return

    # Skip pages already reached with the same or a smaller jump count.
    best_so_far = visited_url.get(path)
    if best_so_far is not None and best_so_far <= jumps:
        return
    visited_url[path] = jumps

    now = time.localtime()
    print(f'---> {now.tm_hour}:{now.tm_min}:{now.tm_sec} jump time: {jumps}, '
          f'visited: {len(visited_url)}, now visit: {path}.')

    if jumps >= 6:
        return

    filename = page_name + '.html'
    html, success = load_stored_html(filename=filename)
    if not success:
        # Cache miss: download the page, then read it back from the cache.
        if not get_and_store_page(url=host + path, filename=filename):
            return
        html, success = load_stored_html(filename=filename)

    bs = BeautifulSoup(markup=html, features='html.parser')
    content = bs.find(name='div', attrs={'id': 'bodyContent'})
    if content is None:
        # find() returns None when the div is missing; nothing to follow.
        return
    for link in content.find_all(name='a', attrs={'href': _ARTICLE_HREF}):  # type: Tag
        find_kevin_bacon(path=link['href'], jumps=jumps + 1)
if __name__ == '__main__':
    find_kevin_bacon(path='/wiki/Eric_Idle', jumps=0)
    print(f'一共找到{len(results)}种方案:')
    for res in results:
        # Each recorded path already includes the target URL, so the hops are
        # joined as they are instead of appending '/wiki/Kevin_Bacon' again.
        print(' -> '.join(res))
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/Time_zone
https://en.wikipedia.org/wiki/Nome,_Alaska
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/England
https://en.wikipedia.org/wiki/Michael_Caine
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/England
https://en.wikipedia.org/wiki/Gary_Oldman
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/England
https://en.wikipedia.org/wiki/Daniel_Day-Lewis
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/New_town
https://en.wikipedia.org/wiki/Edmund_Bacon_(architect)
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/Stoke-on-Trent
https://en.wikipedia.org/wiki/Hugh_Dancy
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/Coventry
https://en.wikipedia.org/wiki/Bon_Jovi
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/Blackpool
https://en.wikipedia.org/wiki/Pleasure_Beach_Blackpool
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/Blackpool
https://en.wikipedia.org/wiki/Blackpool_Pleasure_Beach
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/Blackpool
https://en.wikipedia.org/wiki/Frasier
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/Brighton_and_Hove
https://en.wikipedia.org/wiki/Lewes
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/Isle_of_Wight
https://en.wikipedia.org/wiki/Jeremy_Irons
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Telford_and_Wrekin
https://en.wikipedia.org/wiki/South_Gloucestershire
https://en.wikipedia.org/wiki/EE_Limited
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Metropolitan_county
https://en.wikipedia.org/wiki/Conservative_Party_(UK)
https://en.wikipedia.org/wiki/Early_1990s
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Metropolitan_county
https://en.wikipedia.org/wiki/Margaret_Thatcher
https://en.wikipedia.org/wiki/Meryl_Streep
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Metropolitan_county
https://en.wikipedia.org/wiki/History_of_local_government_in_England
https://en.wikipedia.org/wiki/Cleveland
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
https://en.wikipedia.org/wiki/Eric_Idle
https://en.wikipedia.org/wiki/South_Shields
https://en.wikipedia.org/wiki/Tyne_and_Wear
https://en.wikipedia.org/wiki/Metropolitan_county
https://en.wikipedia.org/wiki/Urban_area
https://en.wikipedia.org/wiki/Empire_State_Building
https://en.wikipedia.org/wiki/Kevin_Bacon
--------------------
2020见证历史!