import requests
import re
from bs4 import BeautifulSoup
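
# Minimal Baidu Tieba spider: it fetches the first two list pages of the given
# forum, extracts each thread's one-line abstract, and appends the results
# to "<tieba_name>.txt".
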
class TiebaSpider:
    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        # List-page URL template; pn is the paging offset (50 threads per page).
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36"
        }

    def get_url_list(self):
        # Build the URLs of the first two list pages (pn=0 and pn=50).
        return [self.url_temp.format(i * 50) for i in range(2)]

    def parse_url(self, url):
        # Fetch one list page and return the decoded HTML.
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def del_title(self, no_html):
        # Tieba wraps the thread list in an HTML comment, so stripping the first
        # two comment markers exposes it to BeautifulSoup. The original pattern
        # was lost; the regex below is an assumption.
        new_html = re.sub(r'<!--|-->', '', no_html, 2)
        return new_html

    def write(self, content, txt_name):
        # Append the extracted abstracts to "<txt_name>.txt", one per line.
        with open(txt_name + '.txt', 'a', encoding='UTF-8') as f:
            for line in content:
                print(line)
                f.write(line + "\n")

    def save_html(self, html_str, page_num):
        # The list page hides its thread list inside a commented-out <code>
        # block; the original regex was lost, so the pattern below (matching the
        # "pagelet_html_frs-list/pagelet/thread_list" pagelet) is an assumption.
        match = re.findall(
            r'(?s)<code[^>]*id="pagelet_html_frs-list/pagelet/thread_list"[^>]*>.*?</code>',
            html_str)
        html = match[0]
        new_html = self.del_title(html)
        soup = BeautifulSoup(new_html, 'lxml')
        all_result = soup.find_all(class_='threadlist_abs_onlyline')
        abstracts = []
        for tag in all_result:
            # Collapse all whitespace inside each abstract and skip empty ones.
            text = re.sub(r'\s+', '', tag.get_text()).strip()
            if text != '':
                abstracts.append(text)
                print(text)
        self.write(abstracts, self.tieba_name)

    def run(self):
        # Fetch, extract, and save every list page in order.
        url_list = self.get_url_list()
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider("河南")
    tieba_spider.run()