咳咳,直接上代码:
import os
import re
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
}
# Root directory; every gallery gets its own sub-directory below it.
SAVE_ROOT = './1000图库'
# Listing-page URL template and the inclusive range of listing pages to crawl.
LIST_URL = 'https://www.1000tuku.com/guoneimeinv/list_5_{page}.html'
FIRST_LIST_PAGE = 1
LAST_LIST_PAGE = 1
# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 15
# Characters that are not allowed in a Windows file name (this set also
# covers the POSIX path separator).
INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def parse_page_count(label):
    """Return the page count embedded in a pagination label.

    The first run of digits in *label* is used, so one-, two- and
    three-digit counts are all handled. Returns 1 when *label* contains no
    digits, i.e. the gallery is treated as a single page.
    """
    match = re.search(r'\d+', label)
    return int(match.group()) if match else 1


def paged_url(gallery_url, page):
    """Return the URL of page *page* (1-based) of a gallery.

    Page 1 is the gallery URL itself; later pages insert ``_<page>`` before
    the last ``.html`` in the URL. A URL without ``.html`` is returned
    unchanged.
    """
    if page == 1:
        return gallery_url
    head, sep, tail = gallery_url.rpartition('.html')
    if not sep:
        return gallery_url
    return f'{head}_{page}.html{tail}'


def safe_dir_name(title):
    """Turn a page title into a name that is usable as a directory name."""
    cleaned = INVALID_NAME_CHARS.sub('_', title).strip()
    return cleaned or 'untitled'


def _fetch(url):
    """GET *url* and return the response, raising on timeout or HTTP error."""
    resp = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return resp


def _load_tree(url):
    """Download *url* and parse it into an lxml HTML tree.

    The raw bytes are decoded as GB18030 directly instead of round-tripping
    requests' guessed text through ISO-8859-1, which fails whenever requests
    guesses a different encoding.
    NOTE(review): assumes every page of the site is GB-encoded, as the
    gallery pages are -- confirm for the listing pages.
    """
    return etree.HTML(_fetch(url).content.decode('GB18030', errors='replace'))


def _download_gallery(gallery_url):
    """Download every image of one gallery into its own directory."""
    first_tree = _load_tree(gallery_url)

    titles = first_tree.xpath('/html/head/title/text()')
    if not titles:
        print(gallery_url, '下载失败', 'missing <title>')
        return
    labels = first_tree.xpath('//*[@id="content"]/div[1]/div[2]/ul/li[1]/a/text()')
    page_count = parse_page_count(labels[0]) if labels else 1

    gallery_dir = os.path.join(SAVE_ROOT, safe_dir_name(titles[0]))
    os.makedirs(gallery_dir, exist_ok=True)

    for page in range(1, page_count + 1):
        page_url = paged_url(gallery_url, page)
        # Page 1 is already parsed; do not request it a second time.
        page_tree = first_tree if page == 1 else _load_tree(page_url)
        for src in page_tree.xpath('//*[@id="content"]/div[1]/div[1]//@src'):
            # Resolve relative and protocol-relative image links.
            image_url = urljoin(page_url, src)
            # Name the file after the URL path only, so a query string
            # never ends up in the file name.
            image_name = urlsplit(image_url).path.rsplit('/', 1)[-1]
            if not image_name:
                continue
            try:
                image_data = _fetch(image_url).content
            except requests.RequestException as exc:
                # One broken image must not abort the rest of the gallery.
                print(image_name, '下载失败', exc)
                continue
            with open(os.path.join(gallery_dir, image_name), 'wb') as fp:
                fp.write(image_data)
            print(image_name, '下载完成')


def main():
    """Crawl the configured listing pages and download every linked gallery."""
    os.makedirs(SAVE_ROOT, exist_ok=True)
    for list_page in range(FIRST_LIST_PAGE, LAST_LIST_PAGE + 1):
        list_url = LIST_URL.format(page=list_page)
        hrefs = _load_tree(list_url).xpath('/html/body/div[4]/ul//@href')
        # dict.fromkeys drops duplicate links while keeping their order, so
        # a gallery linked more than once is downloaded only once.
        for href in dict.fromkeys(hrefs):
            gallery_url = urljoin(list_url, href)
            try:
                _download_gallery(gallery_url)
            except requests.RequestException as exc:
                # Skip a gallery that cannot be loaded; keep crawling.
                print(gallery_url, '下载失败', exc)


if __name__ == '__main__':
    main()
咳咳,看看效果:
放学啦,回家吃饭去吧!