Python网络爬虫实战:通过requests+bs4爬取并保存图片

Python爬虫实战基础篇——图片爬取

Python版本:3.5.4

涉及内容都是网络爬虫基础:requests和bs4库的应用、正则表达式等

import requests
import os
from bs4 import BeautifulSoup
import re

def GetHtml(url, timeout=10):
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: Address of the HTML page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend,
        or the empty string ``""`` when the request itself fails.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-layer failures are turned into the "" sentinel;
        # Ctrl-C and programming errors are allowed to propagate.
        return ""
    return BeautifulSoup(r.text, "html.parser")

def GetPic(url):
    """Download the image at *url* and save it under the ``爬取图片`` folder.

    The file name is built from the module-level variable ``page`` (this
    function does not take the page number as an argument), so ``page`` must
    be set by the caller before each call.

    Args:
        url: Address of the image to download.

    Returns:
        None. A success or failure line is printed for the current page.
    """
    folder = "爬取图片"
    try:
        pic = requests.get(url, timeout=10)
        # Treat 4xx/5xx replies as failures instead of saving an error page
        # with a .jpg extension.
        pic.raise_for_status()
        # open() does not create missing directories, so make sure the
        # target folder exists before writing.
        os.makedirs(folder, exist_ok=True)
        path = folder+"/图片page"+str(page)+".jpg"
        # Binary mode: the same pattern works for any non-text download.
        with open(path, 'wb') as f:
            f.write(pic.content)
    except (requests.RequestException, OSError):
        print("page"+str(page)+"爬取失败")
    else:
        # Report success only after the file has been written and closed.
        print("图片page"+str(page)+"爬取成功")
 
# Work out how many pages the gallery has from the first page's pager element.
soup = GetHtml("http://www.zbjuran.com/mei/xinggan/201708/85005.html")
# GetHtml returns "" when the request fails and find() returns None when
# nothing matches, so guard both before touching the pager.
pager = soup.find(attrs = 'page') if soup else None
# Only the first text fragment inside the pager is used.
pagestring = next(iter(pager.strings), "") if pager is not None else ""
# Drop every non-digit character; what remains is taken as the page count.
pageend = re.sub(r"\D","",pagestring)
if not pageend:
    # Without a page count there is nothing to iterate over.
    raise SystemExit("获取页数失败")

# Visit each page in turn and download the first image found on it.
# NOTE: `page` must stay a module-level name because GetPic reads it to build
# the output file name.
for page in range(1, int(pageend) + 1):
    if page==1:
        url="http://www.zbjuran.com/mei/xinggan/201708/85005.html"
    else:
        # Pages after the first carry a "_<page>" suffix in the file name.
        url = "http://www.zbjuran.com/mei/xinggan/201708/85005_"+str(page)+".html"
    soup = GetHtml(url)
    img = soup.find('img') if soup else None
    if img is None:
        # A failed fetch or a page without <img> should not abort the run.
        print("page"+str(page)+"爬取失败")
        continue
    img_src = img.get('src')
    GetPic(img_src)

 

 

 

 

你可能感兴趣的:(Python)