# Python批量爬取图片 (Python batch image scraper)


'''
爬取图片,并且下载图片
url = 'https://pic.netbian.com/4kmeinv/'
爬取网页:requests
解析网页:beautifulsoup

url = 'https://pic.netbian.com/4kmeinv/'
url = 'https://pic.netbian.com/4kmeinv/index_2.html'
"https://pic.netbian.com/uploads/allimg/220809/101035-16600110352f43.jpg"

'''


import os
import requests
from bs4 import BeautifulSoup

# Fetch the raw HTML source of a listing page.
def craw_html(url):
    """Download *url* and return its HTML decoded as GBK.

    Parameters
    ----------
    url : str
        Listing-page URL to fetch.

    Returns
    -------
    str
        The page source. The site serves GBK content while the response
        headers claim ISO-8859-1, so the encoding is forced explicitly to
        avoid mojibake.
    """
    # timeout keeps the crawler from hanging forever on a stalled connection
    resp = requests.get(url, timeout=10)
    resp.encoding = 'gbk'  # server mislabels the charset as ISO-8859-1
    print(resp.status_code)  # 200 — the page has no anti-scraping measures
    html = resp.text
    return html

# Parse image URLs out of the page and download each one.
def parse_and_download(html):
    """Find wallpaper ``<img>`` tags in *html* and download each image.

    Parameters
    ----------
    html : str
        HTML source of a listing page (as returned by ``craw_html``).

    Side effects: creates the ``./美女图片`` directory if missing and writes
    one file per image into it. Returns None.
    """
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.find_all('img')
    # Make sure the output directory exists before the first write.
    os.makedirs("./美女图片", exist_ok=True)
    for img in imgs:
        src = img.get('src')  # alternatively: img['src']
        # Skip tags with no src, and non-content images (logos, icons):
        # only thumbnails under /uploads/ are actual wallpapers.
        if not src or "/uploads/" not in src:
            continue
        # The src attribute is site-relative; prepend the domain.
        src = f"https://pic.netbian.com{src}"

        # Derive a unique local file name from the URL. (Bug fix: the
        # original wrote every image to one fixed path, so each download
        # overwrote the previous one.)
        filename = os.path.basename(src)

        resp_img = requests.get(src, timeout=10)
        print(src)
        # Binary mode ('wb') for image payloads downloaded from the web.
        with open(os.path.join("./美女图片", filename), 'wb') as f:
            f.write(resp_img.content)

if __name__ == '__main__':

    # Page 1 of the gallery has no index suffix; pages 2-10 follow the
    # index_N.html naming pattern.
    urls = ['https://pic.netbian.com/4kmeinv/']
    urls.extend(
        f'https://pic.netbian.com/4kmeinv/index_{page}.html'
        for page in range(2, 11)
    )

    # Crawl every listing page and download the images it references.
    for page_url in urls:
        print("#### 正在爬取:", page_url)
        parse_and_download(craw_html(page_url))


# 你可能感兴趣的:(M8-100,Python,python,数据分析,爬虫)