其实,爬取百度贴吧是一件非常容易的事情,首先,观察百度贴吧网址变化,如下图
很明显贴吧根据页面的变化是有规律的(网址不变,pn=页数*50),这就导致爬取贴吧数据的时候带来了极大的便利。
代码如下:
import requests
class Tiebasprite():
    """Crawl pages of a Baidu Tieba forum and save each page's HTML to a local file.

    Page URLs follow a fixed pattern: the ``pn`` query parameter equals
    (page index) * 50, so every page URL can be generated up front instead
    of discovered by following links.
    """

    def __init__(self, tieba_name, max_pages=1000):
        """
        :param tieba_name: name of the forum (tieba) to crawl
        :param max_pages: number of pages to generate URLs for
            (default 1000, matching the original hard-coded limit)
        """
        self.tieba_name = tieba_name
        self.max_pages = max_pages
        # NOTE(review): the original URL had a stray '%A6' appended right
        # after the forum name, which corrupts the kw= parameter value;
        # it has been removed so kw= carries the forum name alone.
        self.post_url = 'http://tieba.baidu.com/f?kw=' + tieba_name + '&ie=utf-8&pn={}'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Mobile Safari/537.36'}

    def get_url_list(self):
        """Return the list of page URLs to crawl (pn = page index * 50)."""
        # Comprehension instead of loop+append; also avoids shadowing the
        # builtin `list`, which the original code did.
        return [self.post_url.format(i * 50) for i in range(self.max_pages)]

    def get_post(self, url):
        """Fetch one page and return its decoded HTML text."""
        response = requests.get(url=url, headers=self.headers)
        return response.content.decode()

    def save_html(self, html_str, page_num):
        """Write one page's HTML to a file named after the forum and page number."""
        file_path = '{}--第 {}页'.format(self.tieba_name, page_num)
        with open(file_path, "w", encoding='utf-8') as f:
            f.write(html_str)

    def run(self):
        """Crawl every page: generate the URLs, fetch each one, save to disk."""
        # enumerate() replaces the original url_list.index(url) lookup,
        # which was an O(n) scan per iteration (O(n^2) overall).
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.get_post(url)
            self.save_html(html_str, page_num)
# Entry point: crawl the forum named below. Guarded with __main__ so that
# importing this module no longer triggers a full network crawl as a side
# effect (the original ran unconditionally at import time).
if __name__ == '__main__':
    tieba_spider = Tiebasprite('台山侨中')  # forum name chosen by the user
    tieba_spider.run()