我的第二个py脚本


import time

import urllib

import urllib.request

import requests

from lxml import etree

# HTTP headers passed to the requests.get() calls in this file; the
# User-Agent value presents the client as a desktop browser.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
    ),
}

def allimg(url):
    """Download the image shown on a single page.

    Fetches ``url``, reads the heading text and the image ``src`` found
    under ``<div class="show">``, prints both, waits five seconds and saves
    the image as ``D:\\test\\<title>.jpg``.

    Args:
        url: Address of the page to scrape.

    Returns:
        None. A page that lacks the expected heading or image is reported
        on stdout and skipped instead of raising ``IndexError``.
    """
    # Without a timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, headers=header, timeout=30)
    page = etree.HTML(response.content)

    titles = page.xpath('//div[@class="show"]/h1/text()')
    sources = page.xpath('//div[@class="show"]/a/img/@src')
    if not titles or not sources:
        print('skip %s: title or image not found' % url)
        return

    # The title becomes a file name, so every character Windows forbids in
    # file names is replaced with '1' (the substitute already used for '/').
    forbidden = '\\/:*?"<>|'
    title = titles[0].translate(str.maketrans(forbidden, '1' * len(forbidden)))
    print(title)

    # NOTE(review): the slice presumably drops a leading '//' and a
    # 14-character trailing suffix from the src attribute -- confirm against
    # the live markup.
    link = 'http://' + str(sources[0])[2:-14]
    print(link)

    # Pause before each download (presumably to throttle requests).
    time.sleep(5)
    urllib.request.urlretrieve(link, 'D:\\test\\%s.jpg' % title)


def allpages(url, parturl):
    """Visit the numbered sub-pages of one entry and download each image.

    Fetches ``url``, converts the text of the last link inside
    ``<div class="epages">`` to an integer page count, and calls
    ``allimg`` on ``<parturl>_<i>.html`` for ``i`` from 1 up to, but not
    including, that count.

    Args:
        url: Address of the page whose pagination links are read.
        parturl: Prefix to which ``_<i>.html`` is appended for each page.

    Returns:
        None. A page without pagination links is reported on stdout and
        skipped instead of raising ``IndexError``.

    Raises:
        ValueError: If the last pagination link text is not an integer.
    """
    # Without a timeout, requests may wait indefinitely on a stalled server.
    response = requests.get(url, headers=header, timeout=30)
    page = etree.HTML(response.content)

    labels = page.xpath('//div[@class="epages"]/a/text()')
    if not labels:
        print('skip %s: no pagination links found' % url)
        return
    maxpage = int(labels[-1])

    # NOTE(review): range() stops at maxpage - 1, so '<parturl>_<maxpage>.html'
    # is never requested -- confirm this matches the site's page numbering.
    for i in range(1, maxpage):
        # Built inline so no local name shadows this function's own name.
        allimg(parturl + '_' + str(i) + '.html')

# Script body: fetch the site's front page, collect every link found in
# <ul class="list"> under <div class="content">, and crawl each one.
url = 'https://www.mrtui.com/'
# Without a timeout, requests may wait indefinitely on a stalled server.
index = etree.HTML(requests.get(url, headers=header, timeout=30).content)

# A separate loop variable keeps the list of hrefs from being rebound while
# it is being iterated.
for href in index.xpath('//div[@class="content"]/ul[@class="list"]/li/a/@href'):
    # Each href is appended to the site root as-is to form the full address.
    pageurl = url + href

    # Drop the last five characters (presumably the '.html' extension --
    # confirm) so allpages() can append '_<n>.html' to the remainder.
    parturl = pageurl[:-5]

    allpages(pageurl, parturl)




你可能感兴趣的:(我的第二个py脚本)