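# 1. Scrape any thread on Baidu Tieba.
# 2. Optionally restrict scraping to the posts made by the thread's original poster.
# 3. Parse the scraped content and save it to a file.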
import re
import bs4
from bs4 import BeautifulSoup
import requests
class TiebaSpider(object):
def __init__(self,see_lz):
self.see_lz=see_lz
def getHTMLText(self, url, pageNumber):
    """Download one page and return its HTML as text.

    The requested address is ``url`` with ``str(pageNumber)`` appended, so
    ``url`` acts as a prefix that the page number completes.

    Args:
        url: URL prefix to which the page number is appended.
        pageNumber: Page number; converted with ``str()`` before appending.

    Returns:
        The response body decoded as UTF-8, or the sentinel string
        ``'ERROR'`` when the request fails (connection problem, timeout,
        or an HTTP error status reported by ``raise_for_status``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'}
    try:
        r = requests.get(url + str(pageNumber), timeout=30, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Only request/HTTP failures map to the sentinel. A bare ``except``
        # would also swallow KeyboardInterrupt, SystemExit and programming
        # errors, hiding real bugs behind 'ERROR'.
        return 'ERROR'
    # Force UTF-8 instead of relying on the encoding guessed from headers.
    r.encoding = 'utf-8'
    return r.text
def getTitle(self,html):
try:
title=re.search(r'