A simple crawler: downloading images from a Baidu Tieba thread

Approach:

1. Fetch the page content from the initial url.

2. From the page content, get the total page count and the url of every page.

3. Download each page to a local file using its url.

4. Read the local files and parse out all the jpg image urls.

5. Download each image by its url and save it into the designated folder.

6. Batch-download the images; by default they are saved under the current directory.

7. Wrap it all up: download the images from the Baidu Tieba thread.

import re
import urllib.request
import os

def gethtml(url):
    # Step 1: fetch a page and return its decoded HTML.
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')
    return html
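
# Optional variant (not part of the original script): some pages reject the
# default urllib user agent, so the same fetch can also be done with an explicit
# Request object carrying a browser-like User-Agent header. The header string
# below is only an example value.
def gethtml_with_headers(url):
    req = urllib.request.Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    )
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')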

def getpagesurl(html):
    # Step 2: read the total page count from the first page and build the url of every page.
    # The last number that appears in the page is taken as the page count (it sits in the pager).
    recom = re.compile(r'(\d+)')
    num = int(re.findall(recom, html).pop())
    allpagesurl = []
    for i in range(1, num + 1):
        url = "http://tieba.baidu.com/p/2256306796?pn=" + str(i)
        allpagesurl.append(url)
    return allpagesurl

def getallpageshtml(allpagesurl):
    # Step 3: save every page to a local file under htmlpage/.
    os.makedirs("htmlpage", exist_ok=True)
    for pageurl in allpagesurl:
        pagehtml = gethtml(pageurl)
        filename = pageurl.split(sep='?')[-1] + ".html"
        with open(os.path.join("htmlpage", filename), 'w', encoding='utf-8') as f:
            f.write(pagehtml)

def getimgurl():
    # Step 4: read the saved page files and parse out every jpg image url.
    # The pattern assumes Tieba post images are served from imgsrc.baidu.com;
    # adjust it if the page markup differs.
    recom = re.compile(r'src="(http://imgsrc\.baidu\.com/forum/[^"]+?\.jpg)"')
    imgurls = []
    for filename in sorted(os.listdir("htmlpage")):
        if not filename.endswith(".html"):
            continue
        with open(os.path.join("htmlpage", filename), 'r', encoding='utf-8') as f:
            html = f.read()
        if html.strip() == "":
            print("This page is empty!")
        else:
            imgurls.extend(re.findall(recom, html))
    return imgurls
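
# A quick, self-contained way to confirm that the jpg pattern used in getimgurl
# still matches the current Tieba markup. The sample snippet is only illustrative;
# the real attribute layout on the page may differ.
def checkimgpattern():
    sample = '<img class="BDE_Image" src="http://imgsrc.baidu.com/forum/pic/item/abc.jpg" width="560">'
    recom = re.compile(r'src="(http://imgsrc\.baidu\.com/forum/[^"]+?\.jpg)"')
    print(re.findall(recom, sample))  # expected: ['http://imgsrc.baidu.com/forum/pic/item/abc.jpg']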

def downloadimg(imgurl, filename):
    # Step 5: download a single image to the given filename.
    try:
        urllib.request.urlretrieve(imgurl, filename)
        print(imgurl + " --> downloaded successfully!")
    except Exception as e:
        print(e)

def batchdown(imgurls):
    # Step 6: batch-download the images into baiduimg/ under the current directory.
    os.makedirs("baiduimg", exist_ok=True)
    for imgurl in imgurls:
        filename = imgurl.split(sep="/")[-1]
        downloadimg(imgurl, os.path.join("baiduimg", filename))
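
# Optional refinement (not in the original script): skip images that already
# exist locally so the download can be re-run without fetching everything again.
# It reuses downloadimg and the baiduimg/ folder defined above.
def batchdown_resumable(imgurls):
    os.makedirs("baiduimg", exist_ok=True)
    for imgurl in imgurls:
        filename = os.path.join("baiduimg", imgurl.split("/")[-1])
        if os.path.exists(filename):
            continue  # already saved on an earlier run
        downloadimg(imgurl, filename)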

def download(url):
    # Step 7: the full pipeline - fetch the first page, build the page urls,
    # save every page locally, parse the image urls, then download the images.
    html = gethtml(url)
    allpagesurl = getpagesurl(html)
    getallpageshtml(allpagesurl)
    imgurls = getimgurl()
    print(imgurls)
    batchdown(imgurls)

def main():
    download("http://tieba.baidu.com/p/2256306796")

if __name__ == '__main__':
    main()


你可能感兴趣的:(python)