Web crawler to download wallpapers
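The script below uses requests_html to walk the gallery list pages on win4000.com, opens each gallery it finds, and downloads every image in it to a local folder, spawning one download thread per image.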

import requests
from requests_html import HTMLSession
import os
import re
import random
from threading import Thread
o = 0  # running total of downloaded images
headers = {
    'user-agent': 'Mozilla/5.0'
}
def html(string):
    # absolute_links returns a set; its str() looks like "{'http://...'}",
    # so pull the URL out from between {' and '}.
    pattern = re.compile(r'\{\'(.*?)\'\}', re.S)
    gq = re.findall(pattern, string)
    return gq[0]
def reurl(url):
    # Strip the trailing '.html' so a page number can be appended later.
    pattern = re.compile(r'(.*?)\.html', re.S)
    a = re.findall(pattern, url)
    return a[0]
def withdown(imgtitle, imgurl):
    # Download one image; the URL is passed in explicitly so worker threads
    # do not race on the shared global imgsrc variable.
    r = requests.get(imgurl, headers=headers)
    with open(path + imgtitle + str(random.randint(1, 10000)) + '.jpg', 'wb') as f:
        f.write(r.content)

path = 'E:\\自然1\\'
os.makedirs(path, exist_ok=True)  # create the download folder if it does not already exist
s = HTMLSession()
for y in range(1, 6):
    print('Downloading list page %d -----------------------------------------------' % y)
    # http://www.win4000.com/zt/meinvxiezhen_2.html
    index = s.get('http://www.win4000.com/zt/meinvxiezhen_' + str(y) + '.html', headers=headers)
    # index=s.get('http://www.win4000.com/wallpaper_208_0_0_'+str(y)+'.html',headers=headers)
    root=index.html
    for i in range(1, 25):
        src = root.find('body > div.main > div > div.w1180.clearfix > div.Left_bar > '
                        'div.list_cont.Left_list_cont > div > div > div > ul > li:nth-child(' + str(i) + ') > a')
        if not src:  # fewer than 24 galleries on this list page
            break
        link = src[0].absolute_links
        strlink = str(link)
        url = html(strlink)  # the link of each gallery on this list page
        print('Starting download of gallery %d' % i)
        imghtml = s.get(url).html  # open the gallery's detail page
        number = imghtml.find('body > div.main > div > div.pic_main > div > div.Bigimg > div.ptitle > em')
        number = number[0].text  # total number of images in this gallery
        for g in range(1, int(number) + 1):  # image pages are numbered _1 .. _number
            print('Gallery %d, image %d' % (i, g))
            # print(url)
            url0 = reurl(url)
            tupianhtml = url0 + '_' + str(g) + '.html'  # URL of the g-th image page
            downhtml = s.get(tupianhtml).html
            imgsrc=downhtml.find('body > div.main > div > div.pic_main > div > div.col-main > div.main-wrap > div.pic-meinv > a > img')[0].attrs
            imgtitle=downhtml.find("body > div.main > div > div.pic_main > div > div.Bigimg > div.ptitle > h1")[0].text
            # print(imgtitle)
            # print(imgsrc['src'])  # this is the final image URL
            o = o + 1
            print('Downloading image %d' % o)
            th = Thread(target=withdown, args=(imgtitle, imgsrc['src']))
            th.start()
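
A possible refinement (a sketch, not part of the original script): instead of starting one bare Thread per image, the downloads can be handed to a bounded thread pool so that only a fixed number run at once. The names download_one, MAX_WORKERS and save_dir are illustrative, not from the original code.

from concurrent.futures import ThreadPoolExecutor
import random
import requests

MAX_WORKERS = 8  # assumed cap on concurrent downloads

def download_one(img_url, img_title, save_dir, headers):
    # Fetch one image and save it with a random suffix so images sharing a
    # title do not overwrite each other (same naming scheme as withdown above).
    r = requests.get(img_url, headers=headers)
    with open(save_dir + img_title + str(random.randint(1, 10000)) + '.jpg', 'wb') as f:
        f.write(r.content)

# Sketch of how the innermost loop would use the pool:
# pool = ThreadPoolExecutor(max_workers=MAX_WORKERS)   # create once, before the loops
# ...
# pool.submit(download_one, imgsrc['src'], imgtitle, path, headers)
# ...
# pool.shutdown(wait=True)                              # after everything is queued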





