#利用xpath解析列表数据
import os
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
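# Goal: crawl images from a Baidu Tieba forum, following pagination, and save them locally.
# Workflow:
# 1. Build the url and headers.
# 2. Send the request and get the response.
# 3. Parse the list page with XPath; return detail_list and next_url.
#    Thread links:   //li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a
#    Next-page link: //a[contains(text(),'下一页')]/@href  (scheme-less href; needs an "https:" prefix)
# 4. Parse each detail page with XPath; return the list of image URLs.
#    Images: //*[contains(@id,"post_content")]/img/@src
# 5. Download the images: iterate the image list, request each one, take the file
#    name from the URL and write the file to disk.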
class Tieba(object):
    """Crawl a Baidu Tieba forum and download the images found in its threads.

    The crawler walks the forum's list pages (following the "下一页" link),
    opens every thread linked from a list page, collects the ``<img>`` sources
    inside the post bodies and stores the files in a local ``images`` folder.
    """

    # Base used to resolve relative / protocol-relative links found in pages.
    BASE_URL = 'https://tieba.baidu.com'
    # Folder (relative to the working directory) that receives the downloads.
    IMAGE_DIR = 'images'
    # Seconds to wait for a server before giving up; requests waits forever
    # when no timeout is supplied.
    TIMEOUT = 10

    def __init__(self):
        # First list page of the forum to crawl.
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%E7%BE%8E%E5%A5%B3%E5%90%A7&fr=search'
        self.headers = {
            # Original author's note: with a Mozilla/5.0 (Chrome) User-Agent the
            # JS-driven Tieba content could not be retrieved, so the UA of an
            # old browser without JS support (Mozilla/4.0) is sent instead.
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)'
        }

    @staticmethod
    def _parse_html(data):
        """Return the parsed root element of *data*, or None if *data* is empty."""
        if not data:
            return None
        # etree.HTML may itself yield None for documents it cannot build a
        # tree from; callers must handle a None result.
        return etree.HTML(data)

    @staticmethod
    def _image_file_name(url):
        """Return the last path segment of *url*, without query or fragment.

        An empty string is returned when the URL path ends with ``/``.
        """
        return urlsplit(url).path.rsplit('/', 1)[-1]

    def get_data(self, url):
        """Fetch *url* with the crawler's headers and return the raw body bytes.

        Raises:
            requests.RequestException: on connection problems, timeouts, or an
                HTTP error status (so error pages are never treated as data).
        """
        resp = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        resp.raise_for_status()
        return resp.content

    def parse_data(self, data):
        """Extract thread links and the next-page link from a list page.

        Args:
            data: raw HTML of a forum list page.

        Returns:
            A tuple ``(detail_list, next_url)``: ``detail_list`` is a list of
            ``{'url': <absolute thread url>}`` dicts, ``next_url`` is the list
            of href values of the "下一页" link (empty on the last page).
        """
        html = self._parse_html(data)
        if html is None:
            return [], []

        # Thread title anchors on the list page.
        node_list = html.xpath('//li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a')
        detail_list = []
        for node in node_list:
            href = node.get('href')
            if not href:
                # An anchor without href cannot be followed.
                continue
            detail_list.append({'url': urljoin(self.BASE_URL, href)})

        next_url = html.xpath('//a[contains(text(),"下一页")]/@href')
        return detail_list, next_url

    def parse_detail_data(self, detail_list):
        """Return the image URLs found in the post bodies of a thread page.

        Args:
            detail_list: despite its name, the raw HTML of ONE thread (detail)
                page; the parameter name is kept for backward compatibility.

        Returns:
            List of ``src`` attribute values of the images inside elements
            whose id contains ``post_content``.
        """
        html = self._parse_html(detail_list)
        if html is None:
            return []
        image_list = html.xpath('//*[contains(@id,"post_content")]/img/@src')
        print(image_list)
        return image_list

    def downloads(self, image_list):
        """Download every URL in *image_list* into the ``images`` folder.

        A URL that yields no usable file name or that fails to download is
        reported and skipped so the remaining images are still fetched.
        """
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        for url in image_list:
            # Resolve protocol-relative ("//host/x.jpg") or relative sources.
            full_url = urljoin(self.BASE_URL, url)
            name = self._image_file_name(full_url)
            if not name:
                print('skip image without file name: %s' % full_url)
                continue
            try:
                image = self.get_data(full_url)
            except requests.RequestException as exc:
                print('skip image %s: %s' % (full_url, exc))
                continue
            with open(os.path.join(self.IMAGE_DIR, name), 'wb') as f:
                f.write(image)

    def run(self):
        """Crawl list pages from ``self.url`` until there is no next page.

        A failing thread page is reported and skipped; a failing list page
        raises ``requests.RequestException`` because the crawl cannot continue
        without it.
        """
        url = self.url
        visited = set()
        while True:
            visited.add(url)
            # Fetch and parse the current list page.
            data = self.get_data(url)
            detail_list, next_url = self.parse_data(data)

            for detail in detail_list:
                try:
                    detail_data = self.get_data(detail['url'])
                except requests.RequestException as exc:
                    print('skip thread %s: %s' % (detail['url'], exc))
                    continue
                # Collect the thread's image URLs and store the files.
                image_list = self.parse_detail_data(detail_data)
                self.downloads(image_list)

            # Stop when the page has no "next page" link.
            if not next_url:
                break
            # The href is scheme-less ("//tieba.baidu.com/..."); urljoin adds
            # the scheme of the current page and also accepts absolute links.
            url = urljoin(url, next_url[0])
            if url in visited:
                # A next link pointing at an already crawled page would loop forever.
                break
if __name__ == '__main__':
    # Script entry point: build the crawler and start crawling immediately.
    Tieba().run()