Approach:
1. Fetch the page content from the initial URL.
2. Parse the total page count out of that content and build the URL of every page.
3. Download each page to a local file by its URL.
4. Read the local files and parse out the URLs of all the jpg images.
5. Download each image by its URL and save it into the designated folder.
6. Batch-download the images, saving to the current directory by default.
7. Wrap it all up: download the images from a Baidu Tieba thread.
import re
import urllib.request
import os
def gethtml(url):
    """Fetch a page and return its HTML decoded as UTF-8."""
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')
    return html
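# Tieba sometimes rejects urllib's default User-Agent. If gethtml() comes back
# empty or raises HTTP errors, a header-carrying variant like this sketch may
# help (the header value is an illustrative assumption, not from the original):
def gethtmlwithheaders(url):
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')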
def getimgurl(pagecount):
    """Read the saved local pages and parse out every jpg image URL."""
    # os.chdir("htmlpage")  # not needed: getallpageshtml() already chdir'd here
    imgurls = []
    # Assumption: the original extraction pattern was lost, so this matches any
    # src="...jpg" attribute; tighten it to match Tieba's actual image markup.
    recom = re.compile(r'src="([^"]+?\.jpg)"')
    for i in range(1, pagecount + 1):
        filename = "pn=" + str(i) + ".html"
        with open(filename, 'r', encoding='utf-8') as f:
            html = f.read()
        imgurls += re.findall(recom, html)
    return imgurls

def getpagesurl(html, baseurl):
    """Parse the total page count from the first page and build every page's URL."""
    if html.strip() == "":
        print("This page is empty!")
        return []
    recom = re.compile(r'(\d+)')
    # take the last number that appears in the page as the total page count
    num = int(re.findall(recom, html).pop())
    allpagesurl = []
    for i in range(1, num + 1):
        allpagesurl.append(baseurl + "?pn=" + str(i))
    return allpagesurl
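# Example: if the last number found on the first page is 85, getpagesurl returns
# ["http://tieba.baidu.com/p/2256306796?pn=1", ..., "...?pn=85"]. Matching a
# bare r'(\d+)' is fragile; if Tieba marks the total page count with dedicated
# markup, anchoring the pattern on that markup would be safer.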
def getallpageshtml(allpagesurl):
    """Save every page's HTML into a local htmlpage/ directory."""
    os.mkdir("htmlpage")
    os.chdir("htmlpage")
    for pageurl in allpagesurl:
        pagehtml = gethtml(pageurl)
        filename = pageurl.split(sep='?')[-1] + ".html"  # e.g. "pn=1.html"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(pagehtml)
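# Note: a second run raises FileExistsError on os.mkdir. A tolerant sketch
# (os.makedirs with exist_ok needs Python 3.2+):
# os.makedirs("htmlpage", exist_ok=True)
# os.chdir("htmlpage")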
def downloadimg(imgurl, filename):
    """Download a single image, reporting errors without aborting the batch."""
    try:
        urllib.request.urlretrieve(imgurl, filename)
    except Exception as e:
        print(e)
    else:
        # runs only when the download actually succeeded
        print(imgurl + " --> downloaded successfully!")
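# urllib.request.urlretrieve is documented as a legacy interface; an equivalent
# sketch using urlopen, should urlretrieve ever go away:
def downloadimgurlopen(imgurl, filename):
    with urllib.request.urlopen(imgurl) as r, open(filename, 'wb') as f:
        f.write(r.read())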
def batchdown(imgurls):
    """Batch-download images into a baiduimg/ directory, created if needed."""
    try:
        os.mkdir("baiduimg")
    except Exception as e:
        print(e)  # most likely the directory already exists; keep going
    os.chdir("baiduimg")
    for imgurl in imgurls:
        filename = imgurl.split(sep="/")[-1]  # last URL segment as the file name
        downloadimg(imgurl, filename)
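# Example (illustrative URL): "http://imgsrc.baidu.com/forum/pic/item/abc.jpg"
# .split(sep="/")[-1] yields "abc.jpg", so images keep their server-side names.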
def download(url):
    """Glue the steps together: fetch, enumerate pages, save, parse, download."""
    html = gethtml(url)
    allpagesurl = getpagesurl(html, url)
    getallpageshtml(allpagesurl)
    imgurls = getimgurl(len(allpagesurl))
    print(imgurls)
    batchdown(imgurls)

def main():
    download("http://tieba.baidu.com/p/2256306796")

if __name__ == '__main__':
    main()
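Note that the working directory changes twice along the way (into htmlpage/, then into baiduimg/), so the downloaded images end up in htmlpage/baiduimg/ under the directory where the script was started.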