Downloading Images from a Website to Local Storage with Python (with the complete code, from single-page to multi-page extraction)

Learning objectives:

Use automation to batch-save images from the web onto a local machine.

For example:

  • Batch-save the images offered on the Umei gallery site (umei.cc)

Learning content:

  1. Locate the part of the umei.cc index page that contains the links to the images to be saved
  2. Follow those links to each image's sub-page
  3. Extract the required image information from each sub-page
  4. Save the downloaded images (a bare skeleton of these four steps follows this list)
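
Mapped onto code, the four steps look roughly like the sketch below. It is only an outline: the CSS selector and folder name are placeholders of mine, and the full working script follows under "Learning output:".

import requests
from bs4 import BeautifulSoup

domain = "https://www.umei.cc"
index_html = requests.get("https://www.umei.cc/tags/xinggannvshen-9.htm").text

# 1. locate the sub-page links on the index page (the selector here is a placeholder)
hrefs = [a["href"] for a in BeautifulSoup(index_html, "html.parser").select("div.title a[href]")]

for href in hrefs:
    # 2. follow each link to the image's sub-page
    child_html = requests.get(domain + href).text
    # 3. extract the full-size image address from the sub-page
    img = BeautifulSoup(child_html, "html.parser").find("div", class_="big-pic").find("img")
    # 4. save the image to disk (the img/ folder must exist)
    with open("img/" + img["src"].split("/")[-1], "wb") as f:
        f.write(requests.get(img["src"]).content)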

Learning output:

# coding=UTF-8
# Fetch the index page and locate the href of each sub-page
# Follow each href to the sub-page and extract the image download address: img -> src
# Download the images
import requests
import io
import sys
import re
from bs4 import BeautifulSoup

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='UTF-8')
domain = "https://www.umei.cc"
url = "https://www.umei.cc/tags/xinggannvshen-9.htm"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 "
    "Safari/537.36 "
}
resp = requests.get(url, headers=headers)
# capture each sub-page href into the named group "page1" (the pattern is illustrative; adjust it to the site's list markup)
obj = re.compile(r'<a href="(?P<page1>.*?)"', re.S)
ulli_list = []
for it in obj.finditer(resp.text):
    ulli_list.append(domain + it.group("page1"))      # build the full sub-page URL
for liurl in ulli_list:
    # fetch the sub-page source and pull out the image address
    child_page_resp = requests.get(liurl)
    child_page = BeautifulSoup(child_page_resp.text, "html.parser")
    img = child_page.find("div", class_="big-pic").find("img")
    src = img.get("src")
    # download and save the image (create the img/ folder beforehand)
    img_resp = requests.get(src)
    img_name = src.split("/")[-1]                     # last part of the image URL
    with open("img/" + img_name, mode="wb") as f:
        f.write(img_resp.content)
    child_page_resp.close()
resp.close()

Due to the limits of my current skills, automatic page turning is not implemented yet. I will keep studying, and I sincerely invite experts from every field to offer their valuable suggestions so we can learn from each other and improve together.


Today's revised code builds on the previous version and adds: paging through all of the images, timing statistics for a single photo, a single page, and the whole run, a progress-bar display, and unified naming and storage of the files. The complete code is below:

# coding=UTF-8
# Fetch the index page and locate the href of each sub-page
# Follow each href to the sub-page and extract the image download address: img -> src
# Download the images
import requests
import io
import sys
import re
from bs4 import BeautifulSoup
import datetime
from tqdm import tqdm

all_start = datetime.datetime.now()
domain = "https://www.umei.cc"
for page in range(1, 16):    # check the site beforehand for the page count, then use n+1
    page_start = datetime.datetime.now()
    print('Collecting image links on page {}'.format(page))
    url = f"https://www.umei.cc/tags/xinggannvshen-{page}.htm"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 "
        "Safari/537.36 "
    }
    resp = requests.get(url, headers=headers)
    # illustrative pattern: capture each sub-page href into the named group "page1"
    obj = re.compile(r'<a href="(?P<page1>.*?)"', re.S)
    ulli_list = [domain + it.group("page1") for it in obj.finditer(resp.text)]
    pbar = tqdm(total=len(ulli_list), desc="page progress", unit="file")   # progress bar
    for liurl in ulli_list:
        start = datetime.datetime.now()
        child_page_resp = requests.get(liurl)
        child_page = BeautifulSoup(child_page_resp.text, "html.parser")
        img = child_page.find("div", class_="big-pic").find("img")
        src = img.get("src")
        img_resp = requests.get(src)
        img_name = src.split("/")[-1]                  # last part of the image URL
        with open("img/" + img_name, mode="wb") as f:  # create the img/ folder beforehand
            f.write(img_resp.content)
        pbar.update(1)
        delta = int((datetime.datetime.now() - start).total_seconds())
        print("\n" + img_name, f", download took {delta}s")
        child_page_resp.close()
    page_delta = int((datetime.datetime.now() - page_start).total_seconds())
    resp.close()
    print('\nFinished page {}'.format(page) + f", took {page_delta}s")
all_delta = int((datetime.datetime.now() - all_start).total_seconds())
print(f"All downloads finished, total time {all_delta}s")

When running the code above, I found a problem with the unified numbering: the source site does not show the same number of items on every page, so the unified-naming issue still remains and awaits a further fix.
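
One way to sidestep the varying number of items per page is to number files with a single counter created once, before the page loop, so the sequence never depends on how many pictures a page happens to show. A minimal sketch (the helper name and the zero-padded format are my own, not part of the original script):

import itertools

# one counter for the whole run, created before the page loop
img_counter = itertools.count(1)

def next_img_name(prefix="xingganmeinv"):
    # returns a globally sequential name such as xingganmeinv_00001.jpg
    return "{}_{:05d}.jpg".format(prefix, next(img_counter))

# inside the per-image loop the name no longer depends on the page size:
#   with open("img1/" + next_img_name(), "wb") as f:
#       f.write(img_resp.content)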

This version adds unified file naming:

# coding=UTF-8
# Fetch the index page and locate the href of each sub-page
# Follow each href to the sub-page and extract the image download address: img -> src
# Download the images
import requests
import io
import sys
import re
from bs4 import BeautifulSoup
import datetime
from tqdm import tqdm
import time

all_start = datetime.datetime.now()
domain = "https://www.umei.cc"
for page in range(1, 252):    # check the site beforehand for the page count, then use n+1; pages 28\29\62\65\72\111\112\115\125\127\152\162\197\202 have broken links
    page_start = datetime.datetime.now()
    print('Collecting image links on page {}'.format(page))
    url = f"https://www.umei.cc/meinvtupian/xingganmeinv/index_{page}.htm"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 "
        "Safari/537.36 "
    }
    resp = requests.get(url, headers=headers)
    # illustrative pattern: capture the link to each image sub-page into the named group "page1"
    obj = re.compile(r'<a href="(?P<page1>.*?)"', re.S)
    obj1 = obj.finditer(resp.text)
    ulli_list = []
    for it in obj1:
        # join into a complete URL
        ul = it.group("page1")
        ul_li = domain + ul
        print('Got a link on page {}: '.format(page) + ul_li)
        ulli_list.append(ul_li)
    x = 0
    pbar = tqdm(total=len(ulli_list), desc="page progress", unit="file")  # progress bar setup
    for liurl in ulli_list:
        start = datetime.datetime.now()
        # fetch the sub-page source
        child_page_resp = requests.get(liurl)
        child_page_text = child_page_resp.text
        # get the download path
        child_page = BeautifulSoup(child_page_text, "html.parser")
        # TODO: handle the case where a blank page is fetched
        div = child_page.find("div", class_="big-pic")
        # TODO: handle the case where the page has no image
        img = div.find("img")
        # name = img.get("alt") + ".jpg"
        src = img.get("src")
        # download the image
        x += 1
        img_resp = requests.get(src)
        # img_name = src.split("/")[-1]  # the part after the last / in the image URL
        img_name1 = "xingganmeinv_{}".format(x)           # every image uses the same naming format
        img_name = img_name1 + "_page{}.jpg".format(page)
        with open("img1/" + img_name, mode="wb+") as f:   # create the img1/ folder beforehand
            f.write(img_resp.content)
        pbar.update(1)  # update the progress bar
        time.sleep(1)
        delta = int((datetime.datetime.now() - start).total_seconds())
        print("\n" + img_name, f", download took {delta}s")
        child_page_resp.close()
    page_delta = int((datetime.datetime.now() - page_start).total_seconds())
    resp.close()
    print('\nFinished collecting page {}'.format(page) + f", took {page_delta}s")
all_delta = int((datetime.datetime.now() - all_start).total_seconds())
print(f"All downloads finished, total time {all_delta}s")

In real testing, various system problems occasionally interrupt the task, which then has to be restarted from a new breakpoint based on the most recent progress.
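
A cheap way to make such restarts painless is to skip files that are already on disk before downloading them again, so a crashed run can simply be relaunched and only the missing images are fetched. A minimal sketch (the helper name and timeout are my own assumptions):

import os
import requests

def download_if_missing(src, folder="img1"):
    # download src into folder, skipping it if an earlier run already saved it
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, src.split("/")[-1])
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return path                      # already downloaded by a previous run
    data = requests.get(src, timeout=60).content
    with open(path, "wb") as f:
        f.write(data)
    return path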

#coding=UTF-8
import asyncio
import aiohttp
import aiofiles
import requests
from lxml import etree
import time
from concurrent.futures import ThreadPoolExecutor


async def download_pic(url, sem):
    name = url.rsplit('/', 1)[1]    # use the last URL segment as the file name
    timeout = aiohttp.ClientTimeout(total=300)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 "
                "Safari/537.36 ",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    }
    conn = aiohttp.TCPConnector(limit=10)
    async with sem:
        async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:  # analogous to requests
            async with session.get(url, headers=headers) as resp:  # analogous to requests.get()
                async with aiofiles.open("pic/" + name, 'wb') as f:  # create the pic/ folder beforehand
                    await f.write(await resp.content.read())  # reading the body is asynchronous and must be awaited
                print('Finished downloading {}'.format(name))


# earlier single-threaded version of the link collector (kept for reference; main() uses the threaded version below)
def get_page_url():
    urls = []
    for page_num in range(1, 2):
        if page_num == 1:
            url = 'https://www.umei.cc/bizhitupian/diannaobizhi'
        else:
            url = 'https://www.umei.cc/bizhitupian/diannaobizhi/index_{}.htm'.format(page_num)
        resp = requests.get(url)
        html = etree.HTML(resp.text)
        table = html.xpath('//div[contains(@class,"item masonry_brick")]')
        if table:
            url_list = table[0].xpath('//div[contains(@class,"img")]//@href')
            urls.extend(url_list)
    return urls


page_urls = []


def get_page_urls1(page_num):
    # collect the image sub-page links from one index page into the shared page_urls list
    if page_num == 1:
        url = 'https://www.umei.cc/bizhitupian/diannaobizhi'
    else:
        url = 'https://www.umei.cc/bizhitupian/diannaobizhi/index_{}.htm'.format(page_num)
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    table = html.xpath('//div[contains(@class,"item masonry_brick")]')
    if table:
        url_list = table[0].xpath('//div[contains(@class,"img")]/a/@href')
        page_urls.extend(url_list)


def get_page_urls():
    with ThreadPoolExecutor(100) as t:
        for i in range(1, 1044):        # better not to fetch everything in one go; memory usage gets very high
            args = [i]                  # and if the job dies midway there is no record of how far it got
            t.submit(lambda p: get_page_urls1(*p), args)


clean_urls = list()


def get_pic_url1(url):
    # open one image sub-page and record the address of the full-size image
    url = 'https://www.umei.cc{}'.format(url)
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    pic_link = html.xpath('//div[contains(@class,"big-pic")]/a/img/@src')[0]
    clean_urls.append(pic_link)


def get_pic_url():
    with ThreadPoolExecutor(100) as t:
        for i in page_urls:
            args = [i]
            t.submit(lambda p: get_pic_url1(*p), args)


async def main():
    tasks = []
    get_page_urls()
    get_pic_url()
    print('Total number of files about to be downloaded: {}'.format(len(clean_urls)))
    sem = asyncio.Semaphore(15)
    for url in clean_urls:
        task = asyncio.create_task(download_pic(url, sem))
        tasks.append(task)
    await asyncio.wait(tasks)


if __name__ == '__main__':
    start = time.time()
    print("任务开始执行,后台计时")
    asyncio.run(main())
    end = time.time()
    all_time = int(end - start)
    print("任务执行完毕,计时结束")
    print('抓取耗时:{}s'.format(all_time))

With asynchronous coroutines added, files download much faster, but before the downloads start every image sub-page link is fetched and held in the background at once, which uses a lot of system resources, and after an error it is hard to tell exactly how far the job got.
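
One way to keep memory in check and make progress visible is to handle one index page at a time: collect that page's links, download them, report the result, then move on, instead of materialising every link before the first download. A rough sketch of that shape (handle_pages and collect_links are placeholder names; download_pic is the coroutine defined above, and return_exceptions keeps one broken link from failing the whole run):

import asyncio

async def handle_pages(total_pages, collect_links, download_pic, sem):
    # process index pages one by one instead of collecting every link upfront
    for page_num in range(1, total_pages + 1):
        # run the blocking link collector in a worker thread
        links = await asyncio.to_thread(collect_links, page_num)
        results = await asyncio.gather(
            *(download_pic(u, sem) for u in links),
            return_exceptions=True,       # a failed download only fails its own task
        )
        failed = [u for u, r in zip(links, results) if isinstance(r, Exception)]
        print(f"page {page_num}: {len(links) - len(failed)} ok, {len(failed)} failed")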
