学习python时做了一个爬虫爬取百度贴吧的内容,但是用BeautifulSoup得到的结果使用find_all函数却无法获取。
getCommentInfo.py:
1 from urllib import request 2 import requests 3 from bs4 import BeautifulSoup 4 from mylog import MyLog as mylog 5 import random 6 7 class Item(object): 8 title = None #帖子标题 9 firstAuthor = None #创建者 10 firstTime = None #创建时间 11 reNum = None #总回复数 12 content = None #最后回复内容 13 lastAuthor = None #最后回复者 14 lastTime = None #最后回复时间 15 16 class GetTiebaInfo(object): 17 def __init__(self,url): 18 self.url = url 19 self.log = mylog() 20 self.pageSum = 5 21 self.urls = self.getUrls(self.pageSum) 22 self.items = self.spider(self.urls) 23 self.pipelines(self.items) 24 25 def getUrls(self,pageSum): 26 urls = [] 27 pns = [str(i*50) for i in range(pageSum)] 28 ul = self.url.split('=') 29 for pn in pns: 30 ul[-1] = pn 31 url = '='.join(ul) 32 urls.append(url) 33 self.log.info(u"获取URLS成功 ") 34 return urls 35 36 def spider(self,urls): 37 items = [] 38 for url in urls: 39 htmlContent = self.getResponseContent(url) 40 with open("content.html","w",encoding='utf-8') as f: 41 f.write(htmlContent) 42 soup = BeautifulSoup(htmlContent,'lxml') 43 with open('soup.txt','w',encoding='utf-8') as fp: 44 fp.write(soup.text) 45 46 tagsli = soup.find_all('li',attrs={'class':'j_thread_list clearfix'}) 47 for tag in tagsli: 48 item = Item() 49 item.title = tag.find('a',attrs={'class':'j_th_tit '}).get_text().strip() 50 item.firstAuthor = tag.find('span',attrs={'class':'frs-author-name-wrap'}).a.get_text().strip() 51 item.firstTime = tag.find('span',attrs={'title':u'创建时间'.encode('utf-8')}).get_text().strip() 52 item.reNum = tag.find('span',attrs={'title':u'回复'.encode('utf-8')}).get_text().strip() 53 item.content = tag.find('div',attrs={'class':'threadlist_abs threadlist_abs_onlyline '}).get_text().strip() 54 item.lastAuthor = tag.find('span',attrs={'class':'tb_icon_author_rely j_replyer'}).a.get_text().strip() 55 item.lastTime = tag.find('span',attrs={'title':u'最后回复时间'.encode('utf-8')}).get_text().strip() 56 items.append(item) 57 self.log.info(u'获取标题为<<%s>>的项成功 ...' %item.title) 58 return items 59 60 def pipelines(self,items): 61 fileName = u'百度贴吧_权力的游戏.txt'.encode('utf-8') 62 with open(fileName,'w') as fp: 63 for item in items: 64 fp.write('title:%s \t author:%s \t firstTime:%s \n content:%s \n return:%s \n lastAuthor:%s \t lastTime:%s \n\n\n\n' 65 %(item.title.encode('utf-8'),item.firstAuthor.encode('utf-8'),item.firstTime.encode('utf-8'),item.content.encode('utf-8'),item.lastTime.encode('utf-8'))) 66 self.log.info(u'标题为<<%s>>的项输入到"%s"成功' %(item.title,fileName.decode('utf-8'))) 67 68 def getResponseContent(self,url): 69 header = { 70 'Accept': 'text/heml,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 71 'Accept-Encoding': 'gzip,deflate,sdch', 'Accept-Language': 'zh_CN,zh;q=0.8', 72 'Connect': 'keep-alive', 73 'User-Agent': 'Mozilla/5.0(Windows NT 6.3;WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/43.0.235' 74 } 75 timeout = random.choice(range(80, 180)) 76 try: 77 response = requests.get(url,headers = header,timeout = timeout) 78 response.encoding = 'utf-8' 79 except: 80 self.log.error(u'Python 返回 URL:%s 数据失败' %url) 81 else: 82 self.log.info(u'Python 返回URL:%s 数据成功' %url) 83 return response.content.decode('utf-8') 84 if __name__=='__main__': 85 url = u'http://tieba.baidu.com/f?kw=权力的游戏&ie=utf-8&pn=50' 86 GTI = GetTiebaInfo(url)
mylog.py
1 import logging 2 import getpass 3 import sys 4 5 #定义MyLog类 6 class MyLog(object): 7 def __init__(self): 8 self.user = getpass.getuser() 9 self.logger = logging.getLogger(self.user) 10 self.logger.setLevel(logging.DEBUG) 11 12 #日志文件名 13 self.logFile = sys.argv[0][0:-3] + '.log' 14 self.formatter = logging.Formatter('%(asctime)-12s %(levelname)-8s %(name)-10s %(message)-12s\r\n') 15 16 #文件显示到屏幕并输出到日志文件 17 self.logHand = logging.FileHandler(self.logFile,encoding='utf-8') 18 self.logHand.setFormatter(self.formatter) 19 self.logHand.setLevel(logging.DEBUG) 20 21 self.logHandSt = logging.StreamHandler() 22 self.logHand.setFormatter(self.formatter) 23 self.logHandSt.setLevel(logging.DEBUG) 24 25 self.logger.addHandler(self.logHand) 26 self.logger.addHandler(self.logHandSt) 27 28 def debug(self,msg): 29 self.logger.debug(msg) 30 31 def info(self,msg): 32 self.logger.info(msg) 33 34 def warn(self,msg): 35 self.logger.warning(msg) 36 37 def error(self,msg): 38 self.logger.error(msg) 39 40 def critical(self,msg): 41 self.logger.critical(msg) 42 43 # if __name__=='__main__': 44 # # mylog = MyLog() 45 # # mylog.debug(u"I'm debug 测试中文") 46 # # mylog.info("I'm info") 47 # # mylog.warn("I'm warn") 48 # # mylog.error(u"I'm error 测试中文") 49 # # mylog.critical("I'm critical")
错误:
在getCommentInfo.py中40行左右的htmlContent可得到原html的正确内容,但经BeautifulSoup后,返回的soup内容变化,导致无法爬取结果。可从两个调式文件content.html和soup.txt得知。