uumnt美女图片爬虫

# _*_ coding:utf-8 _*_
import urllib
import urllib2
import re
from lxml import etree
# Walk all 471 listing pages and pull every gallery's HTML page out of each one.
def allurl(url,headers):
    """Crawl the numbered listing pages and process every gallery they link to.

    url     -- listing URL prefix; '<page number>.html' is appended to it.
    headers -- dict of HTTP headers sent with every request.

    Listing pages 1 through 471 are fetched in order. Each gallery link
    found on a page is made absolute and passed to allgirl().
    """
    site_root = "http://www.uumnt.com"
    for page_no in range(1, 472):
        list_url = url + str(page_no) + '.html'
        # Single-argument print(...) prints the same text as the Python 2
        # print statement.
        print(list_url)
        html = urllib2.urlopen(urllib2.Request(list_url, headers=headers)).read()
        tree = etree.HTML(html)
        # Gallery links are the <a href> directly under <p class="list_h">.
        gallery_urls = [site_root + href
                        for href in tree.xpath('//p[@class="list_h"]/a/@href')]
        for gallery_url in gallery_urls:
            allgirl(gallery_url, headers=headers)
def allgirl(url,headers):
    """Fetch a gallery's first page and crawl the gallery's numbered pages.

    url     -- absolute URL of the gallery page; expected to end in '.html'.
    headers -- dict of HTTP headers sent with the request.

    The page count is read from the href of the 7th link inside
    <div class="page">. Each such href is expected to look like
    '..._<count>.html'; hrefs that do not match are skipped.
    """
    request = urllib2.Request(url,headers = headers)
    response = urllib2.urlopen(request).read()
    pattern = etree.HTML(response)
    # Original author's note: this link yields the number of pictures of the
    # gallery. NOTE(review): presumably it is the pager's "last page" link --
    # confirm against the live markup.
    link_num = pattern.xpath('//div[@class="page"]/a[7]/@href')
    # Strip the trailing '.html' so '<n>.html' can be appended for each page.
    Newurl = url[:-5] + '_'
    for num in link_num:
        # Take all digits between the last '_' and '.html'. A fixed slice
        # (num[-7:-5]) only works for two-digit counts: one digit yields
        # '_9', which int() rejects, and three digits get truncated.
        match = re.search(r'_(\d+)\.html$', num)
        if match is None:
            continue
        # Pass the page count and URL prefix on for per-page processing.
        allimgurl(match.group(1), Newurl, headers)

def allimgurl(num,url,headers):
    """Visit the numbered pages of one gallery.

    num     -- page-number bound; converted with int().
    url     -- URL prefix; '<page>.html' is appended to it.
    headers -- dict of HTTP headers, forwarded unchanged to getimgurl().
    """
    # NOTE(review): range() excludes its upper bound, so the page numbered
    # `num` itself is never requested -- confirm that this is intended.
    page_urls = [url + str(page) + '.html' for page in range(1, int(num))]
    for page_url in page_urls:
        getimgurl(page_url, headers)
def getimgurl(url,headers):
    """Download every image shown on a single gallery page.

    url     -- URL of the gallery page to fetch.
    headers -- dict of HTTP headers sent with the request and forwarded
               to saveimg().

    The 'src' and 'alt' attributes of the matching <img> elements are paired
    positionally. Each pair is passed to saveimg() as (image URL, file name).
    """
    html = urllib2.urlopen(urllib2.Request(url, headers=headers)).read()
    tree = etree.HTML(html)
    sources = tree.xpath('//div[@class="bg-white p15 center imgac clearfix"]/a/img/@src')
    titles = tree.xpath('//div[@class="bg-white p15 center imgac clearfix"]/a/img/@alt')
    # zip() stops at the shorter list, so an image without a counterpart in
    # the other list is ignored.
    for pair in zip(sources, titles):
        img_url, img_name = pair
        saveimg(img_url, img_name, headers)
        # Printed after saveimg() has returned.
        print(img_url + 'is saving ')

def saveimg(url,name,headers):
    """Download one image and store it as '<name>.jpg' in the output directory.

    url     -- URL of the image to download.
    name    -- base name for the file; the caller passes the image's alt text.
    headers -- dict of HTTP headers sent with the request.

    The file is written in binary mode under '/home/cgs/python/uumnt/'.
    That directory must already exist, because open() does not create it.
    """
    request = urllib2.Request(url,headers = headers)
    response = urllib2.urlopen(request).read()
    # `name` comes from scraped page content. A '/' in it would be read as a
    # directory separator, so open() would fail or write outside the output
    # directory. Neutralise it before building the path.
    safe_name = name.replace('/', '_')
    with open('/home/cgs/python/uumnt/' + safe_name + '.jpg','wb') as f:
        f.write(response)
    print(name + "is save ok")



if __name__ == "__main__":
    # Prefix of the listing pages; allurl() appends '<page number>.html'.
    url = "http://www.uumnt.com/meinv/list_"
    # Headers sent with every request: a desktop Firefox User-Agent and a
    # Referer naming the newimg.uumnt.com host.
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0","Referer":"https://newimg.uumnt.com/"}

    # Keep the call inside the guard: `url` and `headers` exist only here, so
    # a module-level call would raise NameError whenever the file is imported.
    allurl(url,headers)

你可能感兴趣的:(爬虫)