爬取百度贴吧所有数据

#!/usr/bin/python3
# -*- coding:utf-8 -*-
from urllib.parse import quote, urljoin

import requests
from lxml import etree

class Tiebasprite:
    """Crawl the thread-list pages of one Baidu Tieba forum.

    For every list page the thread titles and links are extracted with
    XPath, echoed to stdout, and appended to ``OUTPUT_PATH`` as
    tab-separated ``title<TAB>link`` lines.
    """

    # Root used to resolve the hrefs found on a list page.
    BASE_URL = 'https://tieba.baidu.com/'
    # Step between consecutive ``pn`` offsets (presumably the number of
    # threads Tieba shows per list page).
    PAGE_SIZE = 50
    # File that receives the scraped title/link pairs.
    OUTPUT_PATH = 'a.text'
    # Seconds to wait on the server before giving up; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, tieba_name):
        """Remember the forum name and build the paged URL template.

        Args:
            tieba_name: Name of the forum (the ``kw`` query parameter).
        """
        self.tieba_name = tieba_name
        # The name is percent-encoded so that non-ASCII names, '&', and
        # literal braces cannot corrupt the query string or the later
        # ``str.format`` call that fills in the ``pn`` offset.
        self.post_url = (
            'https://tieba.baidu.com/f?kw=' + quote(tieba_name) + '&ie=utf-8&pn={}'
        )
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Mobile Safari/537.36'
        }

    def get_url_list(self, page_count=1000):
        """Return the list-page URLs to crawl.

        Args:
            page_count: How many consecutive pages to generate, starting
                at offset 0.

        Returns:
            A list of URLs whose ``pn`` offsets are 0, 50, 100, ...
        """
        return [
            self.post_url.format(page * self.PAGE_SIZE)
            for page in range(page_count)
        ]

    def get_post(self, url):
        """Fetch *url* with the configured headers and return the response.

        Raises:
            requests.RequestException: On connection errors or timeout.
        """
        return requests.get(
            url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )

    def save_html(self, html_str):
        """Extract the threads from one fetched page and append them to disk.

        Args:
            html_str: The response object returned by ``get_post`` (despite
                the name, not a string); its ``content`` bytes are decoded
                as UTF-8.

        Returns:
            A list of ``{'title': ..., 'link': ...}`` dicts, one per thread
            anchor that has both a text node and an ``href``.
        """
        page_text = html_str.content.decode('utf-8', errors='replace')
        if not page_text.strip():
            # Nothing to parse; avoid handing an empty document to lxml.
            return []
        tree = etree.HTML(page_text)
        if tree is None:
            return []

        anchors = tree.xpath(
            '//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a'
        )
        data_list = []
        for anchor in anchors:
            titles = anchor.xpath('./text()')
            hrefs = anchor.xpath('./@href')
            if not titles or not hrefs:
                # Anchor without text or target: nothing useful to record.
                continue
            post = {
                'title': titles[0],
                # urljoin avoids the doubled slash that plain concatenation
                # produces for root-relative hrefs such as "/p/123".
                'link': urljoin(self.BASE_URL, hrefs[0]),
            }
            print(post['title'])
            data_list.append(post)

        if data_list:
            # Open once per page, in append mode, and write the scraped
            # values (not the dict keys) so earlier results are preserved.
            with open(self.OUTPUT_PATH, 'a', encoding='utf-8') as out:
                out.writelines(
                    '{}\t{}\n'.format(post['title'], post['link'])
                    for post in data_list
                )
        return data_list

    def run(self, page_count=1000):
        """Crawl *page_count* list pages and store every thread found.

        The output file is emptied first so each run starts from a clean
        file; ``save_html`` then appends page by page.
        """
        with open(self.OUTPUT_PATH, 'w', encoding='utf-8'):
            pass
        for url in self.get_url_list(page_count):
            self.save_html(self.get_post(url))


# Ask the user which forum to crawl, build a spider for it, and start crawling.
forum_name = input('请输入想查询的贴吧名字')
tieba_spider = Tiebasprite(forum_name)
tieba_spider.run()

你可能感兴趣的:(爬虫)