文章目录
- 案例一:4k图片解析下载
- 案例二:全国城市名称爬取
案例一:4k图片解析下载
-
目的:爬取并下载相关网站中的图片
-
网站地址
-
示例代码
import requests
from lxml import etree
import os
import time

# Download every thumbnail image from the first listing page of
# http://pic.netbian.com/4kbeijing/ into the local ./4kpic/ folder.

# 1. Create the output folder, then set the URL and spoof the User-Agent.
if not os.path.exists('./4kpic'):
    os.mkdir('./4kpic')
url = 'http://pic.netbian.com/4kbeijing/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56',
    'Cookie': '__cfduid=da7f51b97bb9f9e2236b55baf8a0ef6bd1612596271; Hm_lvt_526caf4e20c21f06a4e9209712d6a20e=1612596274; zkhanecookieclassrecord=%2C54%2C56%2C60%2C66%2C65%2C; Hm_lpvt_526caf4e20c21f06a4e9209712d6a20e=1612598106'
}
# 2. Fetch the listing page (timeout so a dead server can't hang the script).
response = requests.get(url=url, headers=headers, timeout=10)
# 3. The site's pages are not UTF-8; re-decoding with the detected encoding
#    fixes mojibake in the scraped Chinese text.
response.encoding = response.apparent_encoding
page_text = response.text
response.close()  # release the connection once the HTML is in memory
# 4. Build the etree object for XPath queries.
tree = etree.HTML(page_text)
# 5. Walk every <li> in the image list and download its picture.
all_li = tree.xpath('//div[@class="slist"]/ul/li')
for li in all_li:  # relative XPath ('./...') extracts details from each <li>
    time.sleep(1)  # throttle: be polite to the server
    img_src = "http://pic.netbian.com" + li.xpath('./a/img/@src')[0]
    # NOTE: the file extension must be appended when naming the saved image.
    img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
    img_response = requests.get(url=img_src, headers=headers, timeout=10)
    img_content = img_response.content  # the image payload is binary
    img_response.close()
    time.sleep(5)
    img_path = '4kpic/' + img_name
    with open(img_path, 'wb') as fp:
        fp.write(img_content)
        print(img_name + '下载成功')
案例二:全国城市名称爬取
-
目的:爬取网站中的热门城市和全部城市的名称
-
网站地址
-
示例代码
import requests
from lxml import etree

# Scrape the "hot cities" and "all cities" name lists from aqistudy.cn
# and print them as one combined list.

# 1. Set the URL and spoof the User-Agent.
url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.56'
}
# 2. Fetch the page HTML (timeout so a dead server can't hang the script).
page_text = requests.get(url=url, headers=headers, timeout=10).text
# 3. Build the etree object for XPath queries.
tree = etree.HTML(page_text)
# 4. The XPath union operator "|" matches both branches at once, so the
#    hot-city links and the all-city links land in a single result list.
citys_list = tree.xpath('//div[@class="bottom"]/ul/li/a/text()|//div[@class="bottom"]/ul/div[2]/li/a/text()')
print(citys_list)