After recently finishing 《Python基础教程》 (Beginning Python), I wanted to move on to data analysis and web scraping, so I looked for a few simple crawlers to practice on.
I used the tutorial "Python爬虫实战一之爬取糗事百科段子" (Python Crawler in Action, Part 1: Scraping Qiushibaike Jokes) as a reference. Because Qiushibaike had since redesigned its markup, I first updated the regular expressions to get a working version, then rewrote it with bs4 as a second version.
Tools: Python 2.7.11; IDE: PyCharm
1. Matching with regular expressions (re):
# -*- coding: utf-8 -*-
"""
Qiushibaike joke scraper -- version 1.0 -- using regular expressions
"""
import re
import urllib2
import thread


class Baike:

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.stories = []
        self.enable = False

    def getPageCode(self, pageIndex):
        'Fetch the HTML source of the page with the given index.'
        URL = "http://m.qiushibaike.com/hot/page/" + str(pageIndex)
        try:
            req = urllib2.Request(URL, headers=self.headers)
            response = urllib2.urlopen(req)
            pageCode = response.read().decode('utf-8')  # decode the UTF-8 page
            return pageCode
        except urllib2.URLError, e:
            if hasattr(e, "code"):
                print e.code

    def getPageContent(self, pageIndex):
        'Fetch the page with the given index and return its list of jokes.'
        pageCode = self.getPageCode(pageIndex)
        if not pageCode:
            print "Failed to load the page..."
            return None
        # group 1: the author name inside <h2>; group 2: the joke text
        patt = re.compile(r'<div.*?class="author.*?>.*?<h2>(.*?)</h2>'
                          r'.*?<div.*?class="content".*?>(.*?)</div>', re.S)
        replaceBr = re.compile('<br/>')
        replaceQuote = re.compile(r'&quot;')  # unescape quote entities
        pageListItems = []
        for author, content in re.findall(patt, pageCode):
            text = re.sub(replaceBr, "\n", content)
            text = re.sub(replaceQuote, '"', text)
            pageListItems.append([author, text.strip()])
        return pageListItems  # the jokes on the current page

    def loadPageListItems(self):
        'If fewer than two unread pages remain, queue up one more page.'
        if self.enable and len(self.stories) < 2:
            pageItems = self.getPageContent(self.pageIndex)
            if pageItems:
                self.stories.append(pageItems)
                self.pageIndex += 1

    def getOneStory(self, pageListItems, page):
        'Print one joke per Enter key press; typing quit exits.'
        for one in pageListItems:
            userInput = raw_input()
            self.loadPageListItems()
            if userInput == 'quit':
                self.enable = False
                return
            print u"Page %d\tAuthor: %s\nContent: %s" % (page, one[0], one[1])

    def start(self):
        'Load the first page, then hand out jokes one at a time.'
        print u"Loading Qiushibaike jokes; type quit to exit"
        self.enable = True
        thread.start_new_thread(self.loadPageListItems, ())
        nowPage = 1
        while self.enable:
            if len(self.stories) > 0:
                pageStories = self.stories.pop(0)
                self.getOneStory(pageStories, nowPage)  # print this page's jokes
                nowPage += 1


spider = Baike()
spider.start()
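To make the pattern concrete, here is a minimal standalone sketch that runs the same regular expression against a hand-written fragment. The fragment is an assumption that merely mimics the old mobile Qiushibaike markup, not a real page:

# -*- coding: utf-8 -*-
# Standalone sketch: the scraper's pattern applied to a made-up fragment.
# The HTML below is an assumption mimicking the old mobile markup.
import re

sample = '''
<div class="author clearfix"><h2>SomeUser</h2></div>
<div class="content">First line<br/>Second line: &quot;quoted&quot;</div>
'''

patt = re.compile(r'<div.*?class="author.*?>.*?<h2>(.*?)</h2>'
                  r'.*?<div.*?class="content".*?>(.*?)</div>', re.S)
for author, content in re.findall(patt, sample):
    content = re.sub('<br/>', '\n', content)   # turn <br/> into newlines
    content = re.sub(r'&quot;', '"', content)  # unescape quote entities
    print author, '->', content.strip()

The two capture groups map directly onto the [author, text] pairs that getPageContent stores in pageListItems.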
2. Parsing with bs4 (BeautifulSoup):

# -*- coding: utf-8 -*-
"""
Qiushibaike joke scraper -- version 2.0 -- using the bs4 module
"""
import urllib2
import thread
from bs4 import BeautifulSoup


class Baike:

    def __init__(self):
        self.pageIndex = 1
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.stories = []
        self.enable = False

    def getPageCode(self, pageIndex):
        'Fetch the HTML source of the page with the given index.'
        URL = "http://m.qiushibaike.com/hot/page/" + str(pageIndex)
        try:
            req = urllib2.Request(URL, headers=self.headers)
            response = urllib2.urlopen(req)
            pageCode = response.read().decode('utf-8')  # decode the UTF-8 page
            return pageCode
        except urllib2.URLError, e:
            if hasattr(e, "code"):
                print e.code

    def getPageContent(self, pageIndex):
        'Fetch the page with the given index and return its list of jokes.'
        pageCode = self.getPageCode(pageIndex)
        if not pageCode:
            print "Failed to load the page..."
            return None
        soup = BeautifulSoup(pageCode, "html.parser")
        pageListItems = []
        # pair each <h2> (author) with the matching div.content (joke text)
        for h2, content in zip(soup('h2'), soup.select('div .content')):
            author = h2.string.strip()  # <h2> holds only the author name
            text = content.string  # None when the div contains child tags
            if text is not None:
                pageListItems.append([author + '\n', text.strip()])
            else:
                pageListItems.append([author, text])
        return pageListItems  # the jokes on the current page

    def loadPageListItems(self):
        'If fewer than two unread pages remain, queue up one more page.'
        if self.enable and len(self.stories) < 2:
            pageItems = self.getPageContent(self.pageIndex)
            if pageItems:
                self.stories.append(pageItems)
                self.pageIndex += 1

    def getOneStory(self, pageListItems, page):
        'Print one joke per Enter key press; typing quit exits.'
        for one in pageListItems:
            userInput = raw_input()
            self.loadPageListItems()
            if userInput == 'quit':
                self.enable = False
                return
            print u"Page %d\tAuthor: %sContent: %s" % (page, one[0], one[1])

    def start(self):
        'Load the first page, then hand out jokes one at a time.'
        print u"Loading Qiushibaike jokes; type quit to exit"
        self.enable = True
        thread.start_new_thread(self.loadPageListItems, ())
        nowPage = 1
        while self.enable:
            if len(self.stories) > 0:
                pageStories = self.stories.pop(0)
                self.getOneStory(pageStories, nowPage)  # print this page's jokes
                nowPage += 1


spider = Baike()
spider.start()
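The bs4 lookups in getPageContent can be tried in isolation the same way. A minimal sketch, again against an assumed fragment; note how .string returns None as soon as a tag contains child tags, which is exactly the case the scraper has to special-case:

# -*- coding: utf-8 -*-
# Standalone sketch: the scraper's bs4 lookups on a made-up fragment.
# The HTML below is an assumption mimicking the old mobile markup.
from bs4 import BeautifulSoup

sample = '''
<div class="author clearfix"><h2>SomeUser</h2></div>
<div class="article"><div class="content">A one-line joke</div></div>
<div class="author clearfix"><h2>OtherUser</h2></div>
<div class="article"><div class="content">Joke with an <img/> inside</div></div>
'''

soup = BeautifulSoup(sample, "html.parser")
for h2, content in zip(soup('h2'), soup.select('div .content')):
    # .string is the tag's only text child, or None if it has child tags
    print h2.string.strip(), '->', repr(content.string)
# prints: SomeUser -> u'A one-line joke'
#         OtherUser -> None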
How it runs: once started, each press of Enter prints one joke; typing quit exits the program.
Since I am not very familiar with Python's module system and syntax, the bs4 version took me a whole afternoon of tinkering to get working. Personally, I find the bs4 library much easier to use than regular expressions.
As 《Python基础教程》 (Beginning Python) puts it:
1. Regular expressions are not exactly readable. For complex HTML and complex queries, the expressions quickly become messy and unmaintainable.
2. Regular expressions are tied to the HTML source rather than to its more abstract structure, so even a small change in the page layout can break the program (see the sketch below).
3. The bs4 library is built specifically for screen scraping and can properly parse HTML.
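Point 2 is easy to demonstrate. In this sketch (both fragments are made up for illustration), a "redesign" that merely adds attributes defeats a literal regular expression, while a CSS-class lookup through bs4 still finds the element:

# -*- coding: utf-8 -*-
# Sketch of point 2: a small markup change breaks the regex but not bs4.
# Both fragments are assumptions for illustration.
import re
from bs4 import BeautifulSoup

old = '<div class="content">a joke</div>'
new = '<div id="c1" class="content fresh">a joke</div>'  # after a "redesign"

patt = re.compile(r'<div class="content">(.*?)</div>')
for html in (old, new):
    print 'regex:', re.findall(patt, html)
    soup = BeautifulSoup(html, "html.parser")
    print 'bs4:  ', [d.get_text() for d in soup.select('div.content')]
# the regex finds ['a joke'] then [] -- bs4 finds 'a joke' both times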