Python: scraping Baidu Tieba images with a crawler

# Parse the list data with XPath

from lxml import etree
import requests
import os
# Goal: crawl Baidu Tieba image posts, follow pagination, and save the images locally
# Flow:
# 1. Build the start URL and headers
# 2. Send the request and get the response
# 3. Parse the list page with XPath, extract the thread links, and return detail_list and next_url
#    //li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a
#    //a[contains(text(),'下一页')]/@href  (scheme-relative; needs an "https:" prefix)
# 4. Parse each detail page with XPath, extract the image URLs, and return the image list
#    //*[contains(@id,"post_content")]/img/@src
# 5. Download the images: iterate over the list, request each URL, and save it under its file name
# (A minimal standalone demo of these XPath expressions follows below.)
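
Before the class itself, here is a minimal standalone sketch of how lxml evaluates the two trickier XPath expressions above. The sample HTML fragment is made up for illustration; only the XPath strings come from the crawler.

from lxml import etree

# Made-up fragment mimicking a Tieba detail page and a "next page" link
sample = '''
<div id="post_content_123">
  <img src="https://imgsa.baidu.com/forum/pic/item/abc.jpg"/>
</div>
<a href="//tieba.baidu.com/f?kw=x&amp;pn=50">下一页&gt;</a>
'''
doc = etree.HTML(sample)

# contains(@id, ...) matches post_content_123, post_content_456, and so on
print(doc.xpath('//*[contains(@id,"post_content")]/img/@src'))
# ['https://imgsa.baidu.com/forum/pic/item/abc.jpg']

# The next-page href is scheme-relative, which is why run() prefixes "https:"
print(doc.xpath('//a[contains(text(),"下一页")]/@href'))
# ['//tieba.baidu.com/f?kw=x&pn=50']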

class Tieba(object):
    def __init__(self):
        self.url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%E7%BE%8E%E5%A5%B3%E5%90%A7&fr=search'
        self.headers = {
            # With a Mozilla/5.0 UA, Tieba serves a JS-rendered page whose list
            # content cannot be extracted directly, so we fall back to an old UA
            # that gets the plain static HTML instead.
            # 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)'
        }

    def get_data(self, url):
        # Send a GET request and return the raw response body as bytes
        resp = requests.get(url, headers=self.headers)
        return resp.content

    def parse_data(self, data):
        # XPath for the thread links on the list page:
        # //li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a
        html = etree.HTML(data)
        node_list = html.xpath('//li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a')
        detail_list = []
        for node in node_list:
            temp = {}
            temp['url'] = 'https://tieba.baidu.com' + node.xpath('./@href')[0]
            detail_list.append(temp)
        # The next-page href is scheme-relative, so run() prefixes it with "https:"
        next_url = html.xpath('//a[contains(text(),"下一页")]/@href')

        return detail_list, next_url

    def parse_detail_data(self, data):
        # Extract every image URL from the post bodies on the detail page
        html = etree.HTML(data)
        image_list = html.xpath('//*[contains(@id,"post_content")]/img/@src')
        print(image_list)
        return image_list

    def downloads(self, image_list):
        # Create the target folder once, then save each image into it
        if not os.path.exists('images'):
            os.makedirs('images')
        for url in image_list:
            image = self.get_data(url)
            # Use the last path segment of the URL as the file name
            file_name = 'images' + '/' + url.split('/')[-1]
            with open(file_name, 'wb') as f:
                f.write(image)
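    # Note (my addition, not in the original post): url.split('/')[-1] can collide
    # when two images share a file name; a collision-free alternative is hashing
    # the URL, e.g. hashlib.md5(url.encode()).hexdigest() + '.jpg'.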

    def run(self):
        # 1. Build the start URL and headers
        url = self.url
        # Loop over list pages until there is no next-page link
        while True:
            # 2. Send the request and get the response
            data = self.get_data(url)
            # 3. Parse the list page, returning detail_list and next_url
            detail_list, next_url = self.parse_data(data)
            for detail in detail_list:
                detail_data = self.get_data(detail['url'])
                # 4. Parse the detail page and extract the image URLs
                image_list = self.parse_detail_data(detail_data)
                # 5. Download the images
                self.downloads(image_list)
            # Follow the next-page link only after every thread on this page
            # has been handled; stop when there is none
            if not next_url:
                break
            url = 'https:' + next_url[0]

if __name__ == '__main__':
    tieba = Tieba()
    tieba.run()
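
The crawler above does the job, but a real run benefits from timeouts and retries. Below is a minimal hardening sketch, assuming the same User-Agent; fetch, retries, and delay are my own hypothetical names, not part of the original code.

import time

import requests

HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)'}

def fetch(url, retries=3, delay=1.0):
    # Retry transient failures with a short pause between attempts;
    # the timeout keeps a dead connection from hanging the crawl
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            return resp.content
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

Swapping something like this in for get_data (and sleeping briefly between pages) keeps the crawl polite and resilient without changing the overall flow.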
