import os
import re
import urllib.request
import requests
def get_one_page(url, timeout=10):
    """Fetch a page and return its HTML text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.RequestException: On connection errors or when the
            timeout expires.
    """
    # A desktop-browser User-Agent is sent with every request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/66.0.3359.181 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
def download(url, filename, save_dir=None, timeout=30):
    """Download ``url`` and store it as ``<filename>.jpg`` inside ``save_dir``.

    The image is fetched completely before the target file is created, so a
    failed request never leaves an empty file behind that later runs would
    mistake for a finished download.

    Args:
        url: Address of the image to download.
        filename: File name without extension; ``.jpg`` is appended.
        save_dir: Directory to store the image in. ``None`` selects the
            script's original hard-coded wallpaper folder.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. If the target file already exists nothing is downloaded.

    Raises:
        urllib.error.URLError: If the request fails (includes HTTP errors).
        OSError: If the directory or file cannot be created or written.
    """
    if save_dir is None:
        # Same folder the script always used; the backslashes are escaped
        # explicitly instead of relying on invalid escape sequences.
        save_dir = 'F:\\大三暑假\\爬虫课程\\猫眼电影\\必应壁纸'
    filepath = os.path.join(save_dir, filename + '.jpg')
    if os.path.exists(filepath):
        return

    # A browser-like User-Agent is required: without headers the server
    # answered HTTP 403 Forbidden.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/66.0.3359.181 Safari/537.36',
    }
    req = urllib.request.Request(url, headers=headers)
    # Fetch first, write second; the context manager closes the connection.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        data = response.read()

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with open(filepath, 'wb') as f:
        f.write(data)
# Captures (thumbnail URL, title) for each wallpaper entry of a listing page.
# NOTE(review): the tags around the title group were missing from the pattern
# as received (the literal was split across two lines and did not compile);
# <h3>...</h3> is assumed to be the title element -- confirm against the page.
_ITEM_PATTERN = re.compile(r'data-progressive="(.*?)".*?<h3>(.*?)</h3>')

# Characters removed from a title before it is used as a file name. This is
# the set the old class [a-zA-z1-9()-/] actually matched (its A-z and )-/
# ranges also covered [ \ ] ^ _ ` * + , .) written out explicitly, plus the
# digit 0 that the old 1-9 range left behind.
_NAME_NOISE = re.compile(r'[A-Za-z0-9()\[\]\\^_`*+,./-]')


def parse(html):
    """Extract every wallpaper from a listing page and download it.

    Args:
        html: Page source as returned by ``get_one_page``; ``None`` or an
            empty string is accepted and yields no downloads.

    Returns:
        None. Each image is saved through ``download``; progress and
        failures are printed.
    """
    if not html:
        return
    for thumb_url, title in _ITEM_PATTERN.findall(html):
        # Swap the 800/480 numbers in the thumbnail URL for 1920/1080,
        # presumably to request the full-size rendition.
        # NOTE(review): this also rewrites any other "800"/"480" that
        # happens to occur elsewhere in the URL.
        url = thumb_url.replace('800', '1920').replace('480', '1080')
        imagename = _NAME_NOISE.sub('', title.strip()).strip()
        try:
            download(url, imagename)
        except Exception as exc:
            # Best effort: one failed image must not stop the page, but the
            # failure is reported instead of being swallowed silently.
            print(imagename, "下载失败", exc)
            continue
        print(imagename, "正在下载")
def main(first_page=1, last_page=72):
    """Crawl the ranking pages and download every wallpaper they list.

    Args:
        first_page: Number of the first ranking page to fetch.
        last_page: Number of the last ranking page to fetch (inclusive).
    """
    for page in range(first_page, last_page + 1):
        url = 'https://bing.ioliu.cn/ranking?p=' + str(page)
        print("正在抓取第", page, "页", url)
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None for any non-200 answer; skip the
            # page instead of handing None to the regex search in parse().
            print("第", page, "页抓取失败,已跳过")
            continue
        parse(html)


if __name__ == '__main__':
    main()
# 遇到的问题及解决办法:
# 1. urllib.error.HTTPError: HTTP Error 403: Forbidden
#    解决办法:请求时添加 headers(User-Agent)即可。
# 2. FileNotFoundError: [Errno 2] No such file or directory
#    原因:文件名太长,缩短文件名即可。