# Python爬虫 爬取糖堆网指定图片 (Python crawler: download keyword-matched images from duitang.com)

import urllib.parse
import threading
import requests
import os


# 设置最大线程锁
thread_lock = threading.BoundedSemaphore(value=10)


# 解析页面
# Fetch one page of the duitang search API.
def get_page(url):
    """Download *url* and return the response body as a UTF-8 string.

    Raises requests.RequestException on network failure or timeout.
    """
    # Without a timeout a single hung connection stalls the whole crawl;
    # 30s is generous for a JSON API response.
    page = requests.get(url, timeout=30)
    # Decode the raw bytes explicitly as UTF-8 (the API serves UTF-8 JSON).
    return page.content.decode('utf-8')


# 获取想要爬取的页面
# Fetch the search-result pages to crawl.
def pages_from_duitang(label, max_results=1800, step=24):
    """Fetch search-result pages from duitang for keyword *label*.

    Parameters:
        label: search keyword (URL-quoted before use).
        max_results: upper bound for the API ``start`` offset
            (default 1800, the original hard-coded limit).
        step: results per page as defined by the duitang API (24).

    Returns:
        A list of raw response bodies (UTF-8 strings), one per page.
    """
    pages = []
    url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}'
    label = urllib.parse.quote(label)
    # Page through the results, stepping by the API's page size.
    for index in range(0, max_results, step):
        u = url.format(label, index)
        print(u)
        pages.append(get_page(u))
    return pages


def findall_in_page(page, startpart, endpart):
    """Return every substring of *page* delimited by *startpart* and *endpart*.

    Scans left to right: each extracted string begins right after an
    occurrence of *startpart* and ends just before the next occurrence of
    *endpart*. An occurrence of *startpart* with no closing *endpart* is
    ignored (the original code appended a truncated ``page[start:-1]``
    slice in that case — a bug).

    Returns:
        A list of the extracted substrings, in order of appearance.
    """
    all_strings = []
    search_from = 0
    while True:
        # Hoisted: the original called page.find(startpart, ...) twice per pass.
        start = page.find(startpart, search_from)
        if start == -1:
            break
        start += len(startpart)
        end = page.find(endpart, start)
        if end == -1:
            # Unterminated match: nothing sensible to extract, stop scanning.
            break
        all_strings.append(page[start:end])
        search_from = end
    return all_strings


# 解析网页内容
def pic_urls_from_pages(pages):
    pic_urls = []
    for page in pages:
        urls = findall_in_page(page, 'path":"', '"')
        pic_urls.extend(urls)
    return pic_urls


# 下载图片
def download_pics(url, n, dir_name):
    img = requests.get(url)
    # 设置文件名
    file_name = url.split('/')[-1]
    # 构建存储路径及文件名
    path = dir_name + '/' + file_name
    with open(path, 'wb') as fp:
        fp.write(img.content)
    # 下载完了,解锁
    thread_lock.release()


def main(label):
    """Crawl duitang for *label* and download every matching image.

    Creates a directory named after the keyword (if missing), fetches the
    search pages, and downloads each image in a worker thread with at most
    10 concurrent downloads (enforced by the module-level semaphore).
    Returns once every download thread has finished.
    """
    # Create the target directory if it does not already exist.
    if not os.path.exists(label):
        os.mkdir(label)
    dir_name = label

    pages = pages_from_duitang(label)
    pic_urls = pic_urls_from_pages(pages)
    workers = []
    for n, url in enumerate(pic_urls, start=1):
        print('正在下载第{}张图片...'.format(n))
        # Acquire a permit before spawning; the worker releases it when done,
        # capping concurrency at the semaphore's limit of 10.
        thread_lock.acquire()
        t = threading.Thread(target=download_pics, args=(url, n, dir_name))
        t.start()
        workers.append(t)
    # Join the workers so the caller knows all downloads have completed
    # (previously main() returned while downloads were still in flight).
    for t in workers:
        t.join()


# Change the keyword to download the corresponding images.
# Guarded so importing this module does not start a crawl.
if __name__ == '__main__':
    main('火影忍者')

# 更改main中的关键字,即可下载对应图片。
# (Change the keyword passed to main() to download images for that keyword.)

# 你可能感兴趣的:(python爬虫学习笔记)