[python 爬虫]必应壁纸爬取

import os
import re
import urllib.request

import requests

def get_one_page(url):
    """Fetch *url* with a desktop-browser User-Agent and return the HTML text.

    Returns None when the server answers with any status other than 200.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}
    response = requests.get(url, headers=headers)
    # Spoofed UA avoids the 403 the site returns to default clients.
    return response.text if response.status_code == 200 else None

def download(url, filename, save_dir='F:\\大三暑假\\爬虫课程\\猫眼电影\\必应壁纸\\'):
    """Download the image at *url* into ``save_dir`` as ``<filename>.jpg``.

    Skips the download when the target file already exists. ``save_dir``
    defaults to the original hard-coded Windows folder (now with properly
    escaped backslashes) but can be overridden by new callers.

    The bytes are fetched *before* the file is opened, so a failed request
    (e.g. HTTP 403) no longer leaves an empty placeholder file on disk.
    """
    filepath = save_dir + filename + '.jpg'
    if os.path.exists(filepath):
        return
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}
    # Custom headers are required here too: the image host 403s default UAs.
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    data = response.read()
    with open(filepath, 'wb') as f:
        f.write(data)

def parse(html):
    """Extract (thumbnail-url, title) pairs from a ranking page and download each image.

    Swaps the 800x480 thumbnail resolution in the URL for 1920x1080, strips
    ASCII characters from the title so only the Chinese name remains (keeps
    filenames short — long names caused FileNotFoundError), and delegates the
    actual fetch to download().
    """
    # NOTE(review): the original pattern literal was mangled by the blog's
    # HTML rendering; the second group presumably captured the title between
    # stripped-out tags (likely <h3>...</h3>) — confirm against the live page.
    pattern = re.compile('data-progressive="(.*?)".*?<h3>(.*?)</h3>', re.S)
    items = re.findall(pattern, html)
    # Fixed character class: the original r'[a-zA-z1-9()-/]' was buggy —
    # 'A-z' also matched [ \ ] ^ _ ` , '1-9' missed 0, and ')-/' was an
    # accidental range. Hoisted out of the loop (loop-invariant).
    rule = re.compile(r'[a-zA-Z0-9()\-/]')
    for item in items:
        try:
            # Upgrade the thumbnail URL to full-HD resolution.
            url = item[0].replace('800', '1920').replace('480', '1080')
            imagename = rule.sub('', item[1].strip())
            download(url, imagename.strip())
            print(imagename, "正在下载")
        except Exception:
            # Best-effort scraping: skip any item whose download fails.
            continue


if __name__ == '__main__':
    # Walk every page of the ranking list (pages 1..72).
    for page in range(1, 73):
        url = 'https://bing.ioliu.cn/ranking?p=' + str(page)
        print("正在抓取第", page, "页", url)
        html = get_one_page(url)
        parse(html)

[python 爬虫]必应壁纸爬取_第1张图片

遇到问题包括

1.urllib.error.HTTPError: HTTP Error 403: Forbidden

添加headers可以解决

2.FileNotFoundError: [Errno 2] No such file or directory

原因是文件名太长了,缩短一下就可以了

你可能感兴趣的:(Python,Python爬虫)