Scraping netbian.com (彼岸桌面) beauty wallpapers with a single thread

The crawler does not use multiple threads; if you want to explore that, a rough threaded-download sketch is shown after the script. In fact, scraping with Scrapy is more efficient and needs less code, and a minimal Scrapy spider sketch is also included at the end. The script depends on the third-party libraries requests and BeautifulSoup4.

The script is not optimized and does very little error handling.
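
Both dependencies can be installed with pip (the BeautifulSoup4 package is published on PyPI as beautifulsoup4):

pip install requests beautifulsoup4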

import requests
from bs4 import BeautifulSoup
import os

url = 'http://www.netbian.com/meinv/'   # listing page to start from
path = './美女壁纸/'                     # directory where wallpapers are saved

os.makedirs(path, exist_ok=True)         # make sure the save directory exists


def get_html(url):
    """Fetch a page and return its decoded HTML, or None on failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    try:
        res = requests.get(url, headers=headers, timeout=10)
        if res.ok:
            # the page uses a Chinese encoding; let requests detect it
            # instead of falling back to ISO-8859-1
            res.encoding = res.apparent_encoding
            return res.text
        return None
    except requests.RequestException:
        print('Request failed: ' + url)
        return None


def get_img_url(url=url):
    """Walk a listing page, download every wallpaper on it, then follow the next-page link."""
    html = get_html(url)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        # each thumbnail links to a detail page such as /desk/xxxxx.htm
        links = soup.select('div.list b a[href*=".htm"]')
        for link in links:
            # rewrite the link so it points at the 1920x1080 version of the detail page
            detail = link.get('href').replace('.htm', '-1920x1080.htm').replace('/desk', 'http://www.netbian.com/desk')
            get_wallpage(detail)
        # look for the "下一页" (next page) link and recurse into it if it exists
        next_page = None
        for a in soup.select('div.page > a.prev'):
            if "下一页" in a.text:
                next_page = 'http://www.netbian.com' + a.get('href')
                print('Next page: ' + next_page)
        if next_page:
            get_img_url(next_page)


def get_wallpage(url):
    """Open a wallpaper detail page and save the full-size image it shows."""
    html = get_html(url)
    if html:
        soup = BeautifulSoup(html, 'html.parser')
        imgs = soup.select("img[title]")
        if imgs:
            link = imgs[0]
            title = link.get('title')   # used as the file name
            src = link.get('src')       # direct URL of the image
            save_wallpage(title, src)


def save_wallpage(name, src):
    """Download the image at src and write it into the save directory."""
    image = requests.get(src)
    if image.ok:
        print('Saving... ' + name)
        with open(path + name + '.jpg', 'wb') as f:
            f.write(image.content)


get_img_url()   # start crawling from the first listing page
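

As mentioned above, the script is single-threaded. As a rough illustration of how just the image downloads could be parallelized, here is a minimal sketch using concurrent.futures; the helper name download_all, the (title, src) task list, and the worker count are assumptions, not part of the original script:

# Hedged sketch: parallelize only the final image downloads with a thread pool.
# download_all and max_workers are illustrative choices, not from the original script.
from concurrent.futures import ThreadPoolExecutor

def download_all(tasks, max_workers=8):
    """tasks is a list of (title, src) tuples collected by the crawler."""
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for title, src in tasks:
            # errors inside workers are not collected here; this is only a sketch
            pool.submit(save_wallpage, title, src)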


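For comparison with the Scrapy suggestion above, a minimal spider for the same listing page might look roughly like this; the spider name, selectors, and item fields are assumptions and have not been tested against the site:

import scrapy

class NetbianSpider(scrapy.Spider):
    # Illustrative only: spider name, selectors and item keys are assumptions.
    name = 'netbian_meinv'
    start_urls = ['http://www.netbian.com/meinv/']

    def parse(self, response):
        # follow each wallpaper detail page
        for href in response.css('div.list b a::attr(href)').getall():
            yield response.follow(href, callback=self.parse_detail)
        # follow the "下一页" (next page) link if present
        for a in response.css('div.page a'):
            if '下一页' in a.css('::text').get(''):
                yield response.follow(a, callback=self.parse)

    def parse_detail(self, response):
        img = response.css('img[title]')
        if img:
            yield {
                'title': img.attrib.get('title'),
                'src': img.attrib.get('src'),
            }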