import urllib.parse
import threading
import requests
import os
# Bounded semaphore capping the number of concurrent download threads at 10;
# acquired in main() before each download and released in download_pics().
thread_lock = threading.BoundedSemaphore(value=10)
# Fetch a page over HTTP and return its body as text.
def get_page(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds before the request aborts. The original code
            passed no timeout, so a stalled server could hang the
            scraper forever.

    Returns:
        The response body as a str (the duitang API serves UTF-8 JSON).
    """
    response = requests.get(url, timeout=timeout)
    # Decode the raw bytes explicitly rather than trusting header sniffing.
    return response.content.decode('utf-8')
# Build the search-API URLs for a keyword and fetch every results page.
def pages_from_duitang(label, limit=1800, step=24):
    """Fetch paginated duitang search results for *label*.

    Args:
        label: Search keyword; percent-encoded before being placed in
            the URL so non-ASCII keywords work.
        limit: Exclusive upper bound on the pagination start offset
            (defaults preserve the original hard-coded 1800).
        step: Items per page; the API returns 24 results per request.

    Returns:
        A list of raw response bodies, one str per results page.
    """
    pages = []
    url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}'
    # Percent-encode the keyword so it forms a valid query string.
    label = urllib.parse.quote(label)
    for index in range(0, limit, step):
        u = url.format(label, index)
        print(u)
        pages.append(get_page(u))
    return pages
def findall_in_page(page, startpart, endpart):
    """Return every non-overlapping substring of *page* that sits between
    an occurrence of *startpart* and the next *endpart*.

    Scans left to right. A *startpart* with no following *endpart* is
    ignored; the original code would instead call ``page.find(endpart,
    start)`` → -1 and append the truncated slice ``page[start:-1]``.

    Args:
        page: Text to scan (here, a JSON response body).
        startpart: Marker that precedes each wanted substring.
        endpart: Marker that terminates each wanted substring.

    Returns:
        List of extracted substrings, in order of appearance.
    """
    all_strings = []
    end = 0
    while True:
        start = page.find(startpart, end)
        if start == -1:
            break
        start += len(startpart)
        end = page.find(endpart, start)
        if end == -1:
            # Unterminated match: stop instead of emitting a bogus slice.
            break
        all_strings.append(page[start:end])
    return all_strings
# Pull the image URLs out of the fetched result pages.
def pic_urls_from_pages(pages):
    """Collect every image URL ("path" value) found across *pages*.

    Args:
        pages: Iterable of raw JSON response bodies.

    Returns:
        Flat list of all extracted URLs, in page order.
    """
    return [
        url
        for page in pages
        for url in findall_in_page(page, 'path":"', '"')
    ]
# Download a single image, then free up a download slot.
def download_pics(url, n, dir_name):
    """Download *url* into *dir_name* and release the shared semaphore.

    Args:
        url: Direct image URL; the last path segment becomes the file name.
        n: Sequence number of this download (kept for interface
           compatibility; unused here).
        dir_name: Existing directory to write the image into.
    """
    try:
        img = requests.get(url, timeout=30)
        # Name the file after the final URL path segment.
        file_name = url.split('/')[-1]
        path = os.path.join(dir_name, file_name)
        with open(path, 'wb') as fp:
            fp.write(img.content)
    finally:
        # Always release the slot. The original released only on success,
        # so any failed request leaked a semaphore permit and would
        # eventually deadlock the acquire() loop in main().
        thread_lock.release()
def main(label):
    """Search duitang for *label* and download every matching image.

    Creates a directory named after the keyword, fetches the paginated
    search results, and downloads each image on its own thread, with
    concurrency bounded to 10 by the module-level semaphore.

    Args:
        label: Search keyword; also used as the output directory name.
    """
    # Create the target directory if needed; no error when it exists.
    os.makedirs(label, exist_ok=True)
    dir_name = label
    pages = pages_from_duitang(label)
    pic_urls = pic_urls_from_pages(pages)
    threads = []
    for n, url in enumerate(pic_urls, start=1):
        print('正在下载第{}张图片...'.format(n))
        # Acquire a slot; blocks while 10 downloads are already running.
        thread_lock.acquire()
        t = threading.Thread(target=download_pics, args=(url, n, dir_name))
        t.start()
        threads.append(t)
    # Wait for every worker so main() doesn't return mid-transfer.
    for t in threads:
        t.join()
# Change the keyword below to download the corresponding images.
# Guarded so importing this module does not trigger a full crawl.
if __name__ == '__main__':
    main('火影忍者')
# Change the keyword passed to main() to download the corresponding images.