python3实现爬取网页图片实例

实例1:通过含有几千个图片网址的文件爬取图片(一般在运维中爬取数据库中的图片)

要求:根据网址的子目录下载到相应的目录中
注意:短时间内请求过多时,可能会触发网站的安全限制,导致部分图片下载失败,因此可能需要分几次下载
部分图片网址格式如下

https://www.zz.com/group1/M00/08/7c/rB8GAllbb_6AeRy4AAB4tpUqgws864.732X1136.JPG
https://www.zz.com/group1/M00/09/3B/rB8GAllbcACAMp3VAAB5ExWWpUk632.732X1136.JPG
https://www.zz.com/group1/M00/07/4e/rB8GAllbcAOAbB3-AADMbbJiM64570.732X1136.JPG
https://www.zz.com/group1/M00/07/7a/rB8GAllbcAOAYasKAAEK3uFRb1s333.732X1136.JPG

from urllib import request
import os
import traceback

# Download every image listed in ./url.txt into ./rym2/<sub-directory>/,
# mirroring the sub-directory found in each URL. URLs that cannot be fetched
# are appended to ./wenti.txt and their tracebacks to ./try.txt.
fname = './url.txt'                 # text file holding one image URL per line
with open(fname, "r") as url_file:
    for line in url_file:
        # Lines yielded by a file iterator keep their trailing newline, so a
        # blank line only becomes an empty string once it has been stripped.
        line = line.strip()
        if not line:
            continue                # nothing to download on a blank line

        # Mirror the URL's sub-directory locally, e.g.
        # ".../group1/M00/08/7c/x.JPG" -> "group1/M00/08/7c".
        p = line.find('group1')     # index where the sub-directory starts, -1 if absent
        p1 = line.rfind('/')        # index of the last "/", i.e. where the file name starts
        # Without a "group1" component before the file name, save straight
        # into the download root.
        filename = line[p:p1] if 0 <= p < p1 else ''

        dst = './rym2/' + filename  # local target directory
        os.makedirs(dst, exist_ok=True)
        fname_1 = os.path.join(dst, line.split('/')[-1])  # keep the image's own file name

        try:
            # Fetch the whole image before opening the target file so that a
            # failed request does not leave an empty file behind.
            with request.urlopen(line) as html:
                data = html.read()
            with open(fname_1, 'wb') as img_file:
                img_file.write(data)
            print('1')              # progress marker: one image saved
        except Exception:
            # Keep going on failure: log the traceback and remember the URL.
            with open('./try.txt', 'a') as err_log:
                traceback.print_exc(file=err_log)
            with open('./wenti.txt', 'a') as failed_urls:
                # One URL per line, so this file can be used as the next url.txt.
                failed_urls.write(line + '\n')

实例2:通过一个网址爬取网页图片

from urllib import request
import re
import os

def get_file(url, fname):
    """Download *url* and save the response body to the file *fname*.

    The body is copied in fixed-size chunks, so a large download is never
    held in memory all at once, and the response is closed when the copy
    finishes or fails.

    Args:
        url: Address to fetch; passed unchanged to ``request.urlopen``.
        fname: Path of the file to write; it is opened in ``'wb'`` mode.
    """
    chunk_size = 64 * 1024      # bytes requested per read() call
    with request.urlopen(url) as html, open(fname, 'wb') as fobj:
        while True:
            data = html.read(chunk_size)
            if not data:        # empty bytes object means end of the body
                break
            fobj.write(data)

def get_urls(patt, fname, charset='utf8'):
    """Return every substring of the file *fname* that matches *patt*.

    The file is scanned line by line and all non-overlapping matches on each
    line are collected, so several URLs on one line are all returned.

    Args:
        patt: Regular expression describing the URLs to collect.
        fname: Path of the text file to scan.
        charset: Encoding used to decode the file.

    Returns:
        A list of the matched strings, in order of appearance.
    """
    url_list = []               # matched URLs, in file order
    cpatt = re.compile(patt)    # compile once, reuse for every line
    with open(fname, encoding=charset) as fobj:
        for line in fobj:
            # finditer rather than findall: group() gives the whole match
            # even when the pattern contains capturing groups.
            for m in cpatt.finditer(line):
                url_list.append(m.group())
    return url_list

if __name__ == '__main__':
    # Save the page locally, then collect the image URLs found in it.
    url_163 = 'http://www.163.com'
    fname_163 = '/tmp/163.html'
    get_file(url_163, fname_163)
    # Raw string: "\w" and "\." are regex escapes, not Python string escapes.
    img_patt = r'(http|https)://[\w./]+\.(jpg|jpeg|gif|png)'
    img_list = get_urls(img_patt, fname_163, 'GBK')
    dst = '/tmp/163imgs/'
    os.makedirs(dst, exist_ok=True)     # no error if the directory already exists
    for url in img_list:
        fname = url.split('/')[-1]      # last path component is the file name
        fname = os.path.join(dst, fname)
        try:
            get_file(url, fname)
        except OSError as err:
            # urllib's URLError/HTTPError are OSError subclasses; a single
            # broken image should not abort the remaining downloads.
            print('skip %s: %s' % (url, err))

你可能感兴趣的:(python)