#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os, sys, time
import urllib.request, requests, bs4
#os._exit(0)
'''
下载文件
'''
def downfiles(imglist, folder="D:\\360Downloads"):
    """Download every image URL in *imglist* into *folder*.

    Args:
        imglist: iterable of direct image URLs.
        folder: destination directory (default kept for backward
            compatibility with the original hard-coded path).

    Progress is printed per file ("第 N 张") and a final "下载完毕".
    Raises requests.HTTPError on a failed download.
    """
    count = 0
    for imgurl in imglist:
        imgres = requests.get(imgurl, timeout=30)
        imgres.raise_for_status()  # fail loudly instead of saving an error page
        # The last path segment already carries its extension; the original
        # appended ".jpg" again, producing names like "x.jpg.jpg".
        fname = imgurl.split('/')[-1]
        with open(os.path.join(folder, fname), "wb") as f:
            f.write(imgres.content)
        count += 1
        print("第", count, "张")
    print("下载完毕")
'''
读取详情页
'''
def getdetails(url):
    """Fetch one work's detail page and download every image on it.

    Args:
        url: absolute URL of the detail page.

    Raises requests.HTTPError when the page request fails.
    """
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    html = bs4.BeautifulSoup(res.text, 'html5lib')
    downloadedList = []  # collected image URLs for this page
    for path in html.select('.reveal-work-wrap > img'):
        target = path.get('src')
        if not target:
            # Lazy-loaded <img> tags may have no src attribute at all;
            # the original crashed on None.split here.
            continue
        # Drop the '@...' suffix — presumably thumbnail/resize parameters
        # appended by the site; TODO confirm against live markup.
        target = target.split('@')[0]
        downloadedList.append(target)
        print(target)
    downfiles(downloadedList)
# --- Script entry (version 1): crawl the zcool front page ---------------
if __name__ == "__main__":
    url = 'https://www.zcool.com.cn/'
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    html = bs4.BeautifulSoup(res.text, 'html5lib')
    downloadedList = []  # detail-page URLs found on the list page
    for path in html.select('.card-img > a'):
        target = path.get('href')
        if not target:
            # <a> without href would crash None.split below.
            continue
        target = target.split('@')[0]
        downloadedList.append(target)
        print(target)
        getdetails(target)  # download this work's images
    print(type(downloadedList))
# ===== 优化版 (optimized version) — second, improved revision of the script follows =====
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os, sys, time
import urllib.request, requests, bs4
'''
windows系统文件命名,去除特殊字符
'''
def strreplace(name):
    """Sanitize *name* into a safe Windows folder name.

    Strips leading whitespace, then deletes characters that are illegal or
    troublesome in Windows file names. Besides the original set
    (? 、 / ╲ * < > | : ~ !) this also removes " and \\, which Windows
    forbids as well.

    Args:
        name: raw title string.  (Parameter renamed from ``str``, which
            shadowed the builtin; all in-file calls are positional.)

    Returns:
        The cleaned string.
    """
    # One C-level translate() pass instead of eleven chained .replace()
    # calls; mapping a codepoint to None deletes it.
    bad = '?、/╲*<>|:~!"\\'
    return name.lstrip().translate({ord(ch): None for ch in bad})
'''
下载文件
'''
def downfiles(imglist, title):
    """Download *imglist* into D:\\360Downloads\\<yyyymmdd>\\<title>\\.

    If the destination folder already exists the whole work is treated as
    previously downloaded and skipped (prints "忽略 => <title>").
    (NOTE(review): the stray ``else:`` in the indentation-mangled original
    is reconstructed as pairing with the os.path.exists check.)

    Args:
        imglist: iterable of direct image URLs.
        title: work title, sanitized via strreplace() for the folder name.

    Raises requests.HTTPError on a failed image download.
    """
    folder = os.path.join('D:\\360Downloads',
                          time.strftime("%Y%m%d", time.localtime()),
                          strreplace(title))
    if os.path.exists(folder):
        # Early return instead of the original `else: pass` branch.
        print('忽略 => ' + title)
        return
    os.makedirs(folder)
    print(folder + ' => 已创建')
    count = 0
    for imgurl in imglist:
        imgres = requests.get(imgurl, timeout=30)
        imgres.raise_for_status()  # don't silently save an HTML error page
        fname = imgurl.split('/')[-1]  # last path segment keeps its extension
        with open(os.path.join(folder, fname), "wb") as f:
            f.write(imgres.content)
        count += 1
        print("第", count, "张")
'''
读取html,返回相关源码
'''
def geturl(url, timeout=30):
    """GET *url* and return the response parsed as a BeautifulSoup tree.

    Args:
        url: page URL to fetch.
        timeout: request timeout in seconds (new, defaulted — the original
            call could hang forever without one).

    Returns:
        bs4.BeautifulSoup parsed with the 'html5lib' builder.

    Raises:
        requests.HTTPError: on a non-2xx status (via raise_for_status).
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    return bs4.BeautifulSoup(res.text, 'html5lib')
'''
读取详情页
'''
def getdetails(url, title):
    """Collect every image URL on a work's detail page and download them.

    Tries the primary page template first and falls back to the
    article-style template when the first selector matches nothing.

    Args:
        url: detail-page URL.
        title: work title, forwarded to downfiles() for the folder name.
    """
    downloadedList = []  # image URLs gathered from this page
    html = geturl(url)
    data = html.select('.reveal-work-wrap img')  # primary template
    # Boolean flag replaces the original templateid int: only the primary
    # template's src values carry an '@...' suffix that must be stripped.
    strip_suffix = True
    if not data:
        # Fallback template (selector string kept byte-identical, including
        # the site's 'wraper' spelling).
        data = html.select('.article-content-wraper img')
        strip_suffix = False
    for path in data:
        target = path.get('src')
        if not target:
            # <img> without src (e.g. lazy loading) crashed the original.
            continue
        if strip_suffix:
            target = target.split('@')[0]  # drop resize/format suffix
        downloadedList.append(target)
        print(target)
    downfiles(downloadedList, title)
'''
Start:抓取需要采集的链接
'''
def getlinks(url='https://www.zcool.com.cn/'):
    """Crawl a zcool list page and download every linked work.

    Args:
        url: list/discover page URL (defaults to the site front page).
    """
    downloadedList = []   # detail-page URLs found on the list page
    downloadedTitle = []  # matching titles
    html = geturl(url)
    for path in html.select('.card-img > a'):
        target = path.get('href')
        if not target:
            # Anchor without href would crash None.split below.
            continue
        target = target.split('@')[0]
        # A missing title attribute returned None in the original and made
        # the string concatenation below raise TypeError.
        title = path.get('title') or ''
        downloadedList.append(target)
        downloadedTitle.append(title)
        print(title + ' => ' + target)
        getdetails(target, title)  # download this work's images
if __name__ == "__main__":
    # Entry point: crawl one discover/category listing page.  Guarded so
    # importing this module no longer kicks off a full crawl.
    getlinks('https://www.zcool.com.cn/discover/607!0!0!0!0!!!!2!-1!1')