Python 爬取某网站原图作为壁纸

不得不说,Python 真是一个神奇的东西:学三天就能爬网站,真香!

完整代码

# -*- coding: utf-8 -*-
"""
Created on Wed May 26 17:53:13 2021

@author: 19088
"""
import urllib.request
import os
import pickle
import re
import random
import sys


#获取http代理
class getHttpAgents:
    """Maintain a pool of HTTP proxies and fetch URLs through them.

    The pool (``attArray``) is a list of ``"ip:port"`` strings.  It is loaded
    from ``AGENT_FILE`` in the current working directory when the object is
    created and written back whenever it is re-validated or extended.
    ``myagent`` pins one specific proxy; while it is empty a random proxy
    from the pool is used, and when the pool is empty too the request is
    sent directly.
    """

    # Pickle file (relative to the current working directory) caching the pool.
    AGENT_FILE = "agent.pkl"
    # Browser-like User-Agent header sent with every request.
    USER_AGENT = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
    # Number of attempts openUrl makes when retrying is enabled.
    MAX_ATTEMPTS = 99

    # One dotted-quad octet (0-255).
    _OCTET = r"(?:25[0-5]|2[0-4]\d|[01]?\d?\d)"
    # An IPv4 address followed directly by its port, written either as
    # "ip:port" or as two adjacent table cells (<td>ip</td><td>port</td>).
    _PROXY_RE = re.compile(
        r"(?<![\d.])((?:" + _OCTET + r"\.){3}" + _OCTET + r")(?!\d)"
        r"\s*(?::|</td>\s*<td[^>]*>)\s*(\d{1,5})(?!\d)",
        re.I)

    def __init__(self):
        """Load the previously stored proxy pool; no proxy is pinned."""
        self.attArray = self.__loadAgentList()
        self.myagent = ""

    def openUrl(self, url, istry=1, timeout=15):
        """Fetch *url* and return the raw (undecoded) response body.

        Args:
            url: Address to open.
            istry: Truthy to retry up to ``MAX_ATTEMPTS`` times (a new random
                proxy is drawn for every attempt unless one is pinned);
                falsy to make a single attempt.
            timeout: Socket timeout in seconds for each attempt.

        Returns:
            The body as ``bytes``, or ``None`` when every attempt failed.
        """
        attempts = self.MAX_ATTEMPTS if istry else 1
        for attempt in range(1, attempts + 1):
            # A pinned proxy always wins over a random pick from the pool.
            ip = self.myagent.strip()
            if not ip and self.attArray:
                ip = random.choice(self.attArray)
            try:
                if ip:
                    print("以{}访问 {}".format(ip, url))
                    # Map both schemes: with only "http" mapped, https://
                    # requests would silently bypass the proxy.
                    proxy_support = urllib.request.ProxyHandler(
                        {"http": ip, "https": ip})
                    opener = urllib.request.build_opener(proxy_support)
                else:
                    opener = urllib.request.build_opener()
                opener.addheaders = [("User-Agent", self.USER_AGENT)]
                # The context manager closes the connection once it is read.
                with opener.open(url, timeout=timeout) as response:
                    return response.read()
            except Exception:
                # "except Exception" (not a bare except) so that Ctrl-C can
                # still interrupt a long retry loop.
                if istry:
                    print("第{}次尝试连接!".format(attempt))
                else:
                    print("{} 无法使用".format(ip))
        return None

    def checkMyIpPool(self):
        """Re-test every pooled proxy and keep only those that still work."""
        agentsResult = []
        try:
            for ip in list(self.attArray):
                self.setMyIp(ip)
                if self.__getMyIp():
                    agentsResult.append(ip)
        finally:
            # Never leave a proxy pinned, even if a check blew up.
            self.setMyIp("")
        # Persist the proxies that passed the check.
        self.__writeAgentList(agentsResult)
        self.__setAgents(agentsResult)

    @classmethod
    def _parseAgents(cls, html):
        """Return the unique ``"ip:port"`` strings found in *html*, in order."""
        agents = []
        seen = set()
        for match in cls._PROXY_RE.finditer(html):
            if not 0 < int(match.group(2)) <= 65535:
                continue
            agent = "{}:{}".format(match.group(1), match.group(2))
            if agent not in seen:
                seen.add(agent)
                agents.append(agent)
        return agents

    def getAgents(self, html):
        """Extract proxies from a listing page, test them and extend the pool.

        Each address is paired with the port that directly follows it, so an
        unrelated number on the page cannot shift the pairing.

        Args:
            html: Decoded text of a proxy listing page.

        Returns:
            Every candidate ``"ip:port"`` parsed from the page (working or
            not), or ``None`` when the page yielded no candidate or the pool
            is still empty afterwards.
        """
        agentList = self._parseAgents(html)
        if not agentList:
            print("注意: ip和端口参数不匹配!")
            return None

        agentsResult = []
        try:
            for ip in agentList:
                self.setMyIp(ip)
                if self.__getMyIp():
                    agentsResult.append(ip)
                    print("{} 可以使用".format(ip))
        finally:
            # Un-pin on every exit path so later requests use the pool again.
            self.setMyIp("")

        # Merge with the existing pool; dict.fromkeys drops duplicates while
        # preserving order.
        merged = list(dict.fromkeys(agentsResult + list(self.attArray)))
        if not merged:
            return None
        # Persist the proxies known to work.
        self.__writeAgentList(merged)
        self.__setAgents(merged)
        return agentList

    def __setAgents(self, ipArray):
        """Replace the in-memory proxy pool."""
        self.attArray = ipArray

    def setMyIp(self, ip):
        """Pin *ip* (``"ip:port"``) for later requests; ``""`` un-pins."""
        self.myagent = ip

    def __writeAgentList(self, agentList):
        """Store *agentList* in ``AGENT_FILE``, replacing earlier content."""
        # Mode "wb" truncates, so the file always holds exactly one dump and
        # a single load reads it back.
        with open(self.AGENT_FILE, "wb") as f:
            pickle.dump(agentList, f)
        print("存储{}条代理".format(len(agentList)))

    def __loadAgentList(self):
        """Return the pool stored in ``AGENT_FILE`` or ``[]`` if unavailable."""
        if not os.path.exists(self.AGENT_FILE):
            return []
        # NOTE: pickle can execute arbitrary code while loading; only keep
        # this file in a directory that untrusted parties cannot write to.
        try:
            with open(self.AGENT_FILE, "rb") as f:
                agentlist = pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            # A truncated or corrupt cache is treated like a missing one.
            return []
        print("加载{}条代理".format(len(agentlist)))
        return agentlist

    def __getMyIp(self, ip=""):
        """Probe the currently pinned proxy; internal use only.

        Args:
            ip: Unused; kept for backward compatibility.

        Returns:
            The decoded probe page when the request succeeded, otherwise
            ``None``.
        """
        url = "https://www.baidu.com/"
        try:
            raw = self.openUrl(url, 0)
            if raw is None:
                return None
            return raw.decode("utf-8")
        except Exception:
            return None

    def __crawlPage(self, url, encoding):
        """Fetch one listing page and feed it to getAgents; never raises."""
        print(url)
        try:
            # No pinned proxy: fetch through a random one from the pool.
            self.setMyIp("")
            html = self.openUrl(url)
            if html is None:
                print("{} 爬取失败".format(url))
                return
            # Only ASCII digits and tags matter, so undecodable bytes are
            # replaced instead of aborting the page.
            self.getAgents(html.decode(encoding, errors="replace"))
        except Exception:
            print("{} 爬取失败".format(url))

    def crawlingAgents(self, index):
        """Collect proxies from several public listing sites.

        Args:
            index: Number of pages to crawl on each paginated site.
        """
        # Single-page source.
        self.__crawlPage("http://ip.yqie.com/ipproxy.htm", "utf-8")

        # Paginated sources: (URL template, page encoding).
        sources = (
            (r"https://www.89ip.cn/index_{}.html", "utf-8"),
            (r"http://www.66ip.cn/{}.html", "gb2312"),
            (r"http://www.ip3366.net/?stype=1&page={}", "gb2312"),
            (r"http://www.kxdaili.com/dailiip/1/{}.html", "utf-8"),
        )
        for template, encoding in sources:
            for indexCur in range(1, index + 1):
                self.__crawlPage(template.format(indexCur), encoding)


#下载图片封装类
class downLoadPictures:
    """Crawl ivsky.com wallpaper listings and save the full-size images.

    Images are written as ``<name>.jpg`` into the current working directory.
    """

    # Prefix put in front of every relative link taken from the site.
    SITE_ROOT = "https://www.ivsky.com/"

    # NOTE(review): the bodies of the patterns in this group were lost when
    # the article was published (everything that looked like an HTML tag was
    # stripped from the code).  They are kept exactly as far as they survived
    # and match only the empty string, so nothing is extracted until they are
    # restored from the site's current markup.
    _RE_LINK = re.compile(r'', re.I)                # one link inside a section
    _RE_PICTURE_LINK = re.compile(r'\s*?', re.I)    # one picture / album link
    _RE_MENU_SECTION = re.compile(r'', re.I)        # wallpaper category menu
    _RE_SCREEN_SECTION = re.compile(r'', re.I)      # screen-size menu
    _RE_SUBCLASS_SECTION = re.compile(r'', re.I)    # sub-category menu
    _RE_ALBUM_SECTION = re.compile(r'', re.I)       # album list of a page
    _RE_FOLDER_SECTION = re.compile(r'', re.I)      # picture list of an album
    _RE_IMAGE_SECTION = re.compile(r'', re.I)       # preview image of a picture

    # Link text, from the quote closing the last attribute to the end tag.
    # NOTE(review): the "</a>" tail was stripped as well; it is restored here
    # because the [2:-4] slice below removes exactly '">' and '</a>' — confirm.
    _RE_LINK_TEXT = re.compile(r'"\s*?>\s*?.*?</a>', re.I)
    _RE_TITLE = re.compile(r'title=".*?"', re.I)
    # The published text carried an injected ' rel="external nofollow" ' here,
    # which contradicts the [6:-1] slice that strips 'href="' and '"'.
    _RE_HREF = re.compile(r'href=".*?"', re.I)
    _RE_SRC = re.compile(r"src='.*?'", re.I)

    def __init__(self):
        """Create the URL loader and the empty category dictionaries."""
        self.sortKey = {}               # search keywords
        self.urlLoad = getHttpAgents()
        self.bzmenuDict = {}            # wallpaper categories (name -> link)
        self.sortscreenDict = {}        # screen-size categories
        self.littleSignDict = {}        # sub-categories of a category

    def getPictures(self, url):
        """Download every picture of every album listed on page *url*.

        Albums or pictures whose page cannot be fetched or parsed are
        skipped; a message is printed when the listing page itself fails.
        """
        # Step 1: fetch the listing page.
        pagerHtml = self.urlLoad.openUrl(url)
        # Step 2: collect the album links (this also fills the category dicts).
        folderDict = self.readPages(pagerHtml)
        if not folderDict:
            print("获取图片集失败!")
            return
        for folderHref in folderDict.values():
            folderHtml = self.urlLoad.openUrl(self.SITE_ROOT + folderHref)
            # Step 3: collect the picture pages of the album.
            pictursUrlDict = self.readFolders(folderHtml)
            if not pictursUrlDict:
                continue
            for pictureName, pictureHref in pictursUrlDict.items():
                # Step 4: resolve the download address on the picture page.
                pictureHtml = self.urlLoad.openUrl(self.SITE_ROOT + pictureHref)
                picturDownUrl = self.readPictures(pictureHtml)
                if not picturDownUrl:
                    continue
                pictureDownHtml = self.urlLoad.openUrl(picturDownUrl)
                if not pictureDownHtml:
                    continue
                # Step 5: save the image.
                with open(pictureName + ".jpg", "wb") as f:
                    f.write(pictureDownHtml)

    def getHrefMap(self, html, isPicture=0, isFolder=0):
        """Map link names to link targets for all links found in *html*.

        Args:
            html: Decoded markup of one page section.
            isPicture: Truthy to use the picture/album link pattern.
            isFolder: Truthy to append a running ``_<n>`` to every name.

        Returns:
            ``{name: href}``; links without an ``href`` are skipped.
        """
        hrefDict = {}
        linkPattern = self._RE_PICTURE_LINK if isPicture else self._RE_LINK
        index = 0
        for link in linkPattern.finditer(html):
            hrefText = link.group()

            urlGroup = self._RE_HREF.search(hrefText)
            if not urlGroup:
                continue
            # Strip 'href="' and the closing quote.
            url = urlGroup.group()[6:-1].replace(" ", "")

            name = ""
            nameGroup = self._RE_LINK_TEXT.search(hrefText)
            if nameGroup:
                # Strip the leading '">' and the trailing '</a>'.
                name = nameGroup.group()[2:-4].replace(" ", "")
            if not name:
                # NOTE(review): the original tested for a match of length 5
                # before falling back to the title attribute; presumably that
                # meant "the link has no visible text" — confirm.
                titleGroup = self._RE_TITLE.search(hrefText)
                if titleGroup:
                    # Strip 'title="' and the closing quote.
                    name = titleGroup.group()[7:-1].replace(" ", "")

            if isFolder:
                index += 1
                name += "_" + str(index)
            hrefDict[name] = url
        return hrefDict

    def readPages(self, html):
        """Parse a listing page.

        Fills ``bzmenuDict``, ``sortscreenDict`` and ``littleSignDict`` and
        returns the ``{name: href}`` dict of the albums on the page, or
        ``None`` when *html* is empty or any section cannot be found.
        """
        if not html:
            return None
        html = html.decode("utf-8")

        # Wallpaper categories.
        sortClassGroup = self._RE_MENU_SECTION.search(html)
        if not sortClassGroup:
            print("匹配壁纸分类出错!")
            return None
        self.bzmenuDict = self.getHrefMap(sortClassGroup.group())

        # Categories by screen size.
        sortClassGroup = self._RE_SCREEN_SECTION.search(html)
        if not sortClassGroup:
            print("匹配屏幕尺寸分类失败!")
            return None
        self.sortscreenDict = self.getHrefMap(sortClassGroup.group())

        # Sub-categories.
        sortClassGroup = self._RE_SUBCLASS_SECTION.search(html)
        if not sortClassGroup:
            print("匹配小分类失败")
            return None
        self.littleSignDict = self.getHrefMap(sortClassGroup.group())

        # Album links.
        sortClassGroup = self._RE_ALBUM_SECTION.search(html)
        if not sortClassGroup:
            print("匹配图片集地址失败!")
            return None
        return self.getHrefMap(sortClassGroup.group(), 1)

    def readFolders(self, html):
        """Return ``{name_<n>: href}`` for the pictures of one album page.

        Returns ``None`` when *html* is empty or the section is not found.
        """
        if not html:
            return None
        html = html.decode("utf-8")
        sortClassGroup = self._RE_FOLDER_SECTION.search(html)
        if not sortClassGroup:
            print("匹配小分类失败")
            return None
        return self.getHrefMap(sortClassGroup.group(), 1, 1)

    def readPictures(self, html):
        """Return the download URL of the image shown on one picture page.

        Returns ``None`` when *html* is empty, the image section is not
        found, or the section contains no ``src='...'`` attribute.
        """
        if not html:
            return None
        html = html.decode("utf-8")
        sortClassGroup = self._RE_IMAGE_SECTION.search(html)
        if not sortClassGroup:
            print("匹配小分类失败")
            return None
        urlGroup = self._RE_SRC.search(sortClassGroup.group())
        if not urlGroup:
            # Without a src there is nothing to download; returning the bare
            # scheme "https:" would only make the caller request garbage.
            print("匹配小分类失败")
            return None
        # Strip "src='" and the closing quote.
        url = urlGroup.group()[5:-1].replace(" ", "")
        # Turn the preview address into the full-size download address.
        url = url.replace('img-pre', 'img-picdown')
        url = url.replace('pre', 'pic')
        # The src is scheme-relative ("//host/path").
        return "https:" + url


class UrlUser:
    """Interactive front end: crawl proxies, check the pool, download pictures."""

    def __init__(self):
        self.agent = getHttpAgents()
        self.downPicture = downLoadPictures()

    def downPictures(self):
        """Ask for a target directory and a page count, then download."""
        dirPath = input("请输入保存路径:")
        if not os.path.exists(dirPath):
            os.makedirs(dirPath)
        if not os.path.isdir(dirPath):
            print("savePath is wrong!")
            sys.exit()
        # Pictures are saved relative to the working directory.
        os.chdir(dirPath)

        page = input("爬取前多少页的图片?\n")
        indexRe = re.search(r"\d+", page)
        if not indexRe:
            print("输入页数有误!")
            return
        pageCount = int(indexRe.group())
        for indexCur in range(1, pageCount + 1):
            # The category is fixed by this address; the download class has
            # already read the links of all other categories.
            url = r"https://www.ivsky.com/bizhi/nvxing_1920x1080/index_{}.html".format(indexCur)
            print(url)
            try:
                self.downPicture.getPictures(url)
            except Exception:
                # One failing page must not stop the remaining pages.
                print("打开出错!")

    def downAgents(self):
        """Ask for a page count and crawl that many pages of proxy lists."""
        page = input("爬取前多少页的代理?\n")
        indexRe = re.search(r"\d+", page)
        if not indexRe:
            print("输入页数有误!")
            return
        self.agent.crawlingAgents(int(indexRe.group()))

    def checkPool(self):
        """Re-validate the stored proxy pool."""
        self.agent.checkMyIpPool()


if __name__ == "__main__":
    print("*"*20)
    print("1.爬取代理\n")
    print("2.检查代理\n")
    print("3.爬取图片")
    print("*"*20)

    mode = input("请输入数字选择处理模式:\n")
    indexRe = re.search(r"\d+", mode)
    if not indexRe:
        print("输入页数有误!")
        sys.exit()
    indexRe = int(indexRe.group())

    uesrObj = UrlUser()
    if 1 == indexRe:
        uesrObj.downAgents()
    elif 2 == indexRe:
        uesrObj.checkPool()
    elif 3 == indexRe:
        uesrObj.downPictures()
    else:
        print("模式选择错误!")
        sys.exit()
    print("爬取完毕!")

效果图

python爬取某网站原图作为壁纸_第1张图片

你可能感兴趣的:(python爬取某网站原图作为壁纸)