# Libraries used: requests, lxml, beautifulsoup4, time (time is used for the elapsed-time message).
import os
import time

import requests
import lxml
from bs4 import BeautifulSoup
# 爬取 http://wallpaperswide.com/latest_wallpapers 网页的最新壁纸
# 图片宽度 高度 页数 这里传的是 960, 600
# Scrape the latest wallpapers from http://wallpaperswide.com/latest_wallpapers
def loadWallpaperswides(w, h, count=1, img_path=None):
    """Download the newest wallpapers at the requested resolution.

    w, h     -- image width and height in pixels (the caller passes 960, 600)
    count    -- number of listing pages to scan (default 1)
    img_path -- directory to save images into; when None, falls back to the
                original fn.IMG_PATH for backward compatibility.
                NOTE(review): `fn` is not defined or imported anywhere in this
                file -- confirm it exists at runtime, or pass img_path.
    """
    print('loadWallpaperswides')
    # Each wallpaper appears twice in the listing; remember names already
    # downloaded so we fetch each one only once (a set, not a dict, and kept
    # across pages so repeats on later pages are skipped too).
    seen = set()
    for page in range(count):  # renamed from `i`: the inner loop shadowed it
        # Listing page URL (pages are 1-based)
        url = 'http://wallpaperswide.com/latest_wallpapers/page/' + str(page + 1)
        # Each wallpaper link sits at ul.wallpapers li a
        for link in getHtmlAndSelect(url, 'ul.wallpapers li a'):
            href = link.get('href')
            # e.g. '/lighthouse_16-wallpapers.html' -> 'lighthouse_16'
            name = href.split('/')[-1].split('-')[0]
            if name not in seen:
                # Build the download URL, e.g.
                # http://wallpaperswide.com/download/lighthouse_16-960x600.html
                path = ('http://wallpaperswide.com/download/' +
                        name + '-' + str(w) + 'x' + str(h))
                loadImg(path, img_path if img_path is not None else fn.IMG_PATH, name)
                seen.add(name)
# Fetch a page and optionally run a CSS selector over it.
def getHtmlAndSelect(url, select=''):
    """Fetch *url* and parse it with BeautifulSoup (lxml backend).

    Returns the elements matched by the CSS selector *select*, or the
    whole parsed soup when *select* is the empty string.
    """
    with requests.get(url) as response:
        markup = response.text  # raw HTML of the page
    soup = BeautifulSoup(markup, 'lxml')
    return soup.select(select) if select != '' else soup
# image URL, save directory, save name
def loadImg(url, path, name):
    """Download the image at *url* into directory *path* as *name*.jpg.

    url  -- direct download URL of the image
    path -- directory the file is written into
    name -- file name without extension

    Prints progress messages and the elapsed time in whole seconds.
    Raises requests.HTTPError on a non-2xx response.
    """
    print('url = ' + url)
    print(name + '正在下载')  # "downloading"
    timer = int(time.time())
    with requests.get(url) as resq:
        # Fail loudly instead of silently saving an HTML error page as a .jpg
        resq.raise_for_status()
        # os.path.join keeps the path portable (the original hard-coded '\\',
        # which only works on Windows)
        filename = os.path.join(path, name + '.jpg')
        # 'wb': write in binary mode, creating the file if it does not exist
        with open(filename, 'wb') as f:
            f.write(resq.content)
    print(name + '下载完成.......耗时:' + str(int(time.time()) - timer))  # "done, elapsed:"