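# Site: https://sc.chinaz.com/
# (1) Build a customised request object
# (2) Fetch the page source
# (3) Download the images
# Goal: download the images from the first ten listing pages.
# The URL of the first page differs from the URLs of the later pages:
# https://sc.chinaz.com/tupian/dahaitupian.html
# https://sc.chinaz.com/tupian/dahaitupian_2.html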
import urllib.request
from lxml import etree
def create_request(page):
    """Build the request object for one page of the "dahaitupian" listing.

    Args:
        page: 1-based page number (an ``int``).

    Returns:
        A ``urllib.request.Request`` for the listing page, carrying a
        desktop-Chrome ``User-Agent`` header.

    Raises:
        ValueError: If ``page`` is smaller than 1; such a page number would
            otherwise silently yield a URL like ``dahaitupian_0.html``.
    """
    if page < 1:
        raise ValueError(f"page must be >= 1, got {page!r}")

    # The first page has no numeric suffix; every later page uses "_<n>".
    suffix = "" if page == 1 else f"_{page}"
    url = f"https://sc.chinaz.com/tupian/dahaitupian{suffix}.html"

    # Browser-like User-Agent; presumably needed so the site does not reject
    # the default urllib client -- TODO confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    return urllib.request.Request(url=url, headers=headers)
def get_content(request):
    """Send ``request`` and return the response body as text.

    Args:
        request: A ``urllib.request.Request`` (or URL string) accepted by
            ``urllib.request.urlopen``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Use the response as a context manager so the underlying connection is
    # closed even when reading or decoding fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
def save_img(content):
    """Download every lazily-loaded image found in a listing page.

    The page is parsed with lxml (``etree`` is imported at file level). Each
    ``<img>`` inside ``<div class="container">`` that has a ``data-original``
    attribute is downloaded to ``./image/<alt text>.jpg``.

    Args:
        content: HTML source of one listing page, as text.

    Returns:
        None. The images are written to disk as a side effect.
    """
    import os
    import re
    from urllib.parse import urljoin

    image_dir = "./image"
    # Create the target directory up front instead of requiring the user to
    # make it by hand before running the script.
    os.makedirs(image_dir, exist_ok=True)

    tree = etree.HTML(content)
    # Select the <img> elements themselves (not two parallel attribute lists)
    # so every URL is paired with the alt text of the same element, even when
    # some images lack one of the two attributes.
    for img in tree.xpath("//div[@class='container']//img[@data-original]"):
        src = img.get("data-original")
        # Resolves protocol-relative ("//host/path") sources to http:// as
        # before, while leaving sources that already carry a scheme intact.
        url = urljoin("http://sc.chinaz.com/", src)

        # Fall back to the file stem of the URL when the image has no alt text.
        name = img.get("alt") or os.path.splitext(os.path.basename(src))[0]
        # The name comes from the remote page: replace path separators and
        # other characters that are invalid in file names so the download
        # cannot land outside image_dir.
        name = re.sub(r'[\\/:*?"<>|]', "_", name).strip() or "image"

        urllib.request.urlretrieve(url=url, filename=image_dir + "/" + name + ".jpg")
if __name__ == '__main__':
    # Ask for the inclusive range of listing pages to crawl.
    first_page = int(input("请输入起始页码"))
    last_page = int(input("请输入结束页码"))

    page_no = first_page
    while page_no <= last_page:
        # Build the request, fetch the page source, then download its images.
        save_img(get_content(create_request(page_no)))
        page_no += 1