# Requirement: download each image into a local directory mirroring the
# sub-path embedded in its URL (e.g. group1/M00/08/7c).
# Note: the site may rate-limit heavy traffic, so the URL list may need
# to be processed over several runs.
# Sample image URL formats:
# https://www.zz.com/group1/M00/08/7c/rB8GAllbb_6AeRy4AAB4tpUqgws864.732X1136.JPG
# https://www.zz.com/group1/M00/09/3B/rB8GAllbcACAMp3VAAB5ExWWpUk632.732X1136.JPG
# https://www.zz.com/group1/M00/07/4e/rB8GAllbcAOAbB3-AADMbbJiM64570.732X1136.JPG
# https://www.zz.com/group1/M00/07/7a/rB8GAllbcAOAYasKAAEK3uFRb1s333.732X1136.JPG
from urllib import request
import os
import traceback

fname = './url.txt'  # text file with one image URL per line
with open(fname, "r") as url_file:
    for line in url_file:
        line = line.strip()  # drop surrounding whitespace / trailing newline
        if not line:
            # Blank line: treat as end of the usable URL list.
            break
        p = line.find('group1')   # start of the server-side sub-path
        p1 = line.rfind('/')      # last '/' — everything before it is the directory part
        subdir = line[p:p1]       # e.g. group1/M00/08/7c
        dst = './rym2/' + subdir  # local mirror of the remote sub-path
        if not os.path.exists(dst):
            os.makedirs(dst)
        # Use the URL's basename as the local file name.
        img_path = os.path.join(dst, line.split('/')[-1])
        try:
            # Close the HTTP response deterministically via the context manager.
            with request.urlopen(line) as resp:
                data = resp.read()
            with open(img_path, 'wb') as img_file:
                img_file.write(data)
            print('1')  # progress marker: one image saved
        except Exception:
            # Record the traceback and the failing URL so the run can be retried.
            with open('./try.txt', 'a') as err_file:
                traceback.print_exc(file=err_file)
            with open('./wenti.txt', 'a') as fail_file:
                fail_file.write(line + '\n')  # newline so each failed URL is on its own line
from urllib import request
import re
import os
def get_file(url, fname):
    """Download *url* and write the raw response bytes to file *fname*.

    The response is read in fixed-size chunks so large downloads do not
    have to fit in memory at once (a bare ``read()`` would slurp the whole
    body and make the loop pointless), and the connection is closed
    deterministically via the context manager.
    """
    with request.urlopen(url) as resp:
        with open(fname, 'wb') as fobj:
            while True:
                data = resp.read(65536)  # 64 KiB per chunk
                if not data:
                    break
                fobj.write(data)
def get_urls(patt, fname, charset='utf8'):
    """Return every substring in file *fname* that matches regex *patt*.

    Args:
        patt: regular-expression pattern (string) to match.
        fname: path of the text file to scan.
        charset: encoding used to decode the file (default ``'utf8'``).

    Returns:
        List of matched strings, in file order.
    """
    url_list = []            # every match found, in order of appearance
    cpatt = re.compile(patt)  # compile once; the pattern is reused per line
    with open(fname, encoding=charset) as fobj:
        for line in fobj:
            # finditer (not search) so a line holding several URLs —
            # typical in minified HTML — yields ALL of them.
            for m in cpatt.finditer(line):
                url_list.append(m.group())
    return url_list
if __name__ == '__main__':
    # Fetch the 163.com front page, extract image URLs from it, then
    # download every image into /tmp/163imgs/.
    url_163 = 'http://www.163.com'
    fname_163 = '/tmp/163.html'
    get_file(url_163, fname_163)
    # Raw string so \w and \. reach the regex engine literally instead of
    # being (invalid) string escapes — avoids a SyntaxWarning on Python 3.12+.
    img_patt = r'(http|https)://[\w./]+\.(jpg|jpeg|gif|png)'
    # The saved page is GBK-encoded, so decode it accordingly.
    img_list = get_urls(img_patt, fname_163, 'GBK')
    dst = '/tmp/163imgs/'
    if not os.path.exists(dst):
        os.mkdir(dst)
    for url in img_list:
        # Name each local file after the URL's basename.
        fname = os.path.join(dst, url.split('/')[-1])
        get_file(url, fname)