Python爬虫爬取高清壁纸

        电脑壁纸?手机壁纸?还在各个壁纸网站上一张张搜索吗?现在不需要了!只需要选择想要的壁纸类型,然后静静等待一会儿,大量壁纸就会保存到你的电脑上,一个爬虫就能解决你找壁纸的烦恼。

         该爬虫比较简单,很容易上手,通过接口的方式去获取图片链接地址,其中用到了正则表达式。不会正则的小伙伴可以去学习一下,因为它是爬虫领域很重要的工具,在数据清洗中占据着重要位置。好了,不多说,直接展示代码:

import requests,re,os

class Downloadpucture(object):
    """Interactive downloader for 1920x1080 wallpapers from www.netbian.com.

    Typical use is ``Downloadpucture().choosevarise()``, which asks for a
    save directory and a category and then walks every listing page of
    that category, saving each wallpaper it finds.

    The methods hand state to each other through module-level names, as
    the public method signatures require:

    * ``PATH``   - save directory, always ending in "/" (set by choosepath)
    * ``number`` - chosen category index (set by choosenum)
    * ``Url``    - listing page currently being processed (set by judge)
    * ``URl``    - wallpaper detail page currently being processed
      (set by picturenum)
    """

    # Seconds to wait on any single HTTP request before giving up.
    TIMEOUT = 15

    # Categories from this index onwards live under the "/s/" URL prefix.
    _FIRST_SEARCH_INDEX = 22

    # URL slug of every category, in menu order.
    _SLUGS = (
        "rili", "dongman", "fengjing", "meinv", "youxi", "yingshi",
        "dongtai", "weimei", "sheji", "keai", "qiche", "huahui",
        "dongwu", "jieri", "renwu", "meishi", "shuiguo", "jianzhu",
        "tiyu", "junshi", "feizhuliu", "qita", "wangzherongyao",
        "huyan", "lol",
    )

    # Menu label of every category; index i matches _SLUGS[i].
    _LABELS = (
        "0.日历", "1.动漫", "2.风景", "3.美女", "4.游戏", "5.影视",
        "6.动态", "7.唯美", "8.设计", "9.可爱", "10.汽车", "11.花卉",
        "12.动物", "13.节日", "14.人物", "15.美食", "16.水果", "17.建筑",
        "18.体育", "19.军事", "20.非主流", "21.其他", "22.王者荣耀",
        "23.护眼", "24.LOL",
    )

    def __init__(self):
        """Prepare the browser-like request headers sent to the site."""
        # Header names and values must not contain the stray spaces the
        # earlier revision had ("User - Agent", "Mozilla / 5.0 ..."):
        # those are not valid HTTP header names.
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;'
                      'q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Host': 'www.netbian.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/70.0.3538.25 Safari/537.36 '
                          'Core/1.70.3870.400 QQBrowser/10.8.4405.400',
        }

    # ------------------------------------------------------------------
    # Pure helpers (no I/O)
    # ------------------------------------------------------------------
    @staticmethod
    def _listing_url(kd, number, page):
        """Return the URL of listing page *page* (1-based) of category *kd*."""
        if 0 <= number < Downloadpucture._FIRST_SEARCH_INDEX:
            base = "http://www.netbian.com/%s/" % kd
        else:
            base = "http://www.netbian.com/s/%s/" % kd
        # The first page has no "index_N.htm" suffix.
        if page == 1:
            return base
        return "%sindex_%d.htm" % (base, page)

    @staticmethod
    def _extract_page_count(html):
        """Return the highest page number linked from a listing page.

        Pager links are expected to look like ``...index_N.htm">N</a>``;
        when none is found the category is treated as a single page.
        """
        pages = re.findall(r'\.htm">(\d+)</a>', html)
        return max((int(p) for p in pages), default=1)

    @staticmethod
    def _extract_detail_ids(html):
        """Return the wallpaper ids linked as ``/desk/<id>.htm``, in order.

        Ids of any length are accepted and duplicates are dropped so the
        same wallpaper is not downloaded twice from one page.
        """
        ids = re.findall(r'href="/desk/(\d+)\.htm"', html)
        return list(dict.fromkeys(ids))

    @staticmethod
    def _extract_jpg_sources(html):
        """Return every ``src="...jpg"`` attribute value found in *html*."""
        # The look-behind keeps attributes such as data-src from matching.
        return re.findall(r'(?<![\w-])src="([^"\s]+\.jpg)"', html)

    # ------------------------------------------------------------------
    # Interactive flow
    # ------------------------------------------------------------------
    def choosevarise(self):
        """Show the category menu, collect the user's choices and crawl."""
        slugs = list(self._SLUGS)
        labels = list(self._LABELS)
        # Two rows that together cover all 25 entries (index 12 included).
        print(labels[0:13])
        print(labels[13:25])
        self.choosepath()  # ask for the save directory -> PATH
        self.choosenum()   # ask for the category index -> number
        self.judge(number, labels, slugs)  # crawl the chosen category

    def choosepath(self):
        """Ask for the save directory until a usable one is given.

        The accepted path is stored in the module-level ``PATH`` with a
        trailing "/" so file names can be appended to it directly. A
        missing last path component is created; anything else that cannot
        be used as a directory triggers another prompt.
        """
        global PATH
        while True:
            path = input("请输入保存路径,具体到某个文件夹:").strip()
            # An empty answer would otherwise turn into "/" (the root).
            if not path:
                print("路径错误,请仔细检查路径后重试!!")
                continue
            if not path.endswith("/"):
                path += "/"
            if not os.path.isdir(path):
                try:
                    os.mkdir(path)
                except OSError:
                    print("路径错误,请仔细检查路径后重试!!")
                    continue
            PATH = path
            break
        print("图片保存路径:%s" % PATH)

    def choosenum(self):
        """Ask for a category index until a valid one is entered.

        The accepted value is stored in the module-level ``number``.
        """
        global number
        last_index = len(self._LABELS) - 1
        while True:
            try:
                choice = int(input("请输入要下载的类型图序号:"))
            except ValueError:
                print("请输入正确序号!!!")
                continue
            if 0 <= choice <= last_index:
                number = choice
                return
            print("请输入正确序号!!!")

    def judge(self, number, LIST, list):
        """Walk every listing page of the chosen category.

        Args:
            number: index of the chosen category.
            LIST: menu labels, indexed by *number*.
            list: URL slugs, indexed by *number*.

        For each page the module-level ``Url`` is set and picturenum() is
        called to download the wallpapers linked from it.
        """
        global Url
        kd = list[number]
        print("你已选择:%s" % LIST[number])
        for page in range(1, self.picturepages(kd, number) + 1):
            Url = self._listing_url(kd, number, page)
            self.picturenum()

    # ------------------------------------------------------------------
    # Network access
    # ------------------------------------------------------------------
    def indexdata(self):
        """Fetch the listing page named by the module-level ``Url``."""
        response = requests.get(url=Url, headers=self.headers,
                                timeout=self.TIMEOUT)
        return response.text

    def picturenum(self):
        """Download every wallpaper linked from the current listing page.

        For each wallpaper id found, the module-level ``URl`` is pointed
        at its 1920x1080 detail page and download() is called. A failure
        on one wallpaper is reported and skipped so the rest of the page
        is still processed.
        """
        global URl
        for desk_id in self._extract_detail_ids(self.indexdata()):
            URl = 'http://www.netbian.com/desk/%s-1920x1080.htm' % desk_id
            try:
                self.download(PATH)
            except (requests.RequestException, OSError, IndexError) as err:
                print("下载失败,已跳过:%s (%s)" % (URl, err))

    def picturepages(self, kd, number):
        """Return how many listing pages the category *kd* has.

        NOTE(review): the page-count extraction in the earlier revision
        was truncated and not runnable; this reads the largest numeric
        pager link from the first listing page instead - confirm against
        the live page markup.
        """
        first_page = self._listing_url(kd, number, 1)
        html = requests.get(url=first_page, headers=self.headers,
                            timeout=self.TIMEOUT).text
        return self._extract_page_count(html)

    def htmldata(self, URl):
        """Fetch the wallpaper detail page at *URl* and return its HTML."""
        response = requests.get(url=URl, headers=self.headers,
                                timeout=self.TIMEOUT)
        return response.text

    def picturelink(self):
        """Return the image URL from the current detail page (``URl``).

        The second ``src="...jpg"`` on the page is taken, as before.

        Raises:
            IndexError: the page has fewer than two jpg images.
        """
        sources = self._extract_jpg_sources(self.htmldata(URl))
        if len(sources) < 2:
            raise IndexError("no wallpaper image found on %s" % URl)
        return sources[1]

    def download(self, PATH):
        """Save the wallpaper of the current detail page into *PATH*.

        The file is named after the last path segment of the image URL.
        The detail page is fetched once per wallpaper, and a non-success
        HTTP status raises instead of being written to disk as a ".jpg".
        """
        link = self.picturelink()
        filename = link.rsplit("/", 1)[-1]
        # No site headers here: the image may be served from another
        # host, and the fixed "Host" header would be wrong for it.
        response = requests.get(link, timeout=self.TIMEOUT)
        response.raise_for_status()
        with open(os.path.join(PATH, filename), "wb") as f:
            f.write(response.content)
        print(filename + "下载完成!")


if __name__ == "__main__":
    # Script entry point: build the downloader and start the interactive
    # category selection, which drives the whole crawl.
    downloader = Downloadpucture()
    downloader.choosevarise()

成果真的很 nice,再也不用为没有好看的壁纸而烦恼啦!

你可能感兴趣的:(python,爬虫)