# -*- coding:utf-8 -*-
"""Fetch page 1 of qiushibaike.com's "hot" list and dump the raw HTML."""
import urllib.request
import urllib.error

page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# Old IE UA string: the site rejects requests without a User-Agent header.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

try:
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    print(response.read())
# Fix: URLError is defined in urllib.error; urllib.request.URLError only
# works as an undocumented re-export and may break.
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
打印出第一页的html代码。
------------------------------继续--------------------------------
用正则表达式取出用户名:
# -*- coding:utf-8 -*-
"""Extract the poster names from page 1 of qiushibaike's hot list."""
import re
import urllib.request
import urllib.error

page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# Old IE UA string: the site rejects requests without a User-Agent header.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

try:
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    # NOTE(review): the pattern's HTML tags were stripped when this article
    # was extracted; group 1 (the title attribute, i.e. the user name) is
    # the only group the loop uses — verify against the live page markup.
    pattern = re.compile('.*?title="(.*?)">.*?(.*?) ', re.S)
    for item in re.findall(pattern, content):
        print(item[0])
# Fix: URLError is defined in urllib.error, not urllib.request.
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
------------------------------继续--------------------------------
取出其他信息
# -*- coding:utf-8 -*-
"""Print name, text and vote count of the text-only jokes on hot page 1."""
import re
import urllib.request
import urllib.error

page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# Old IE UA string: the site rejects requests without a User-Agent header.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

try:
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')
    # NOTE(review): the original regex lost its HTML tags during extraction
    # and had only 3 capture groups while the loop below reads item[3]
    # (guaranteed IndexError). Reconstructed 4-group pattern from the
    # referenced tutorial — confirm against the real page markup.
    pattern = re.compile(
        '<div class="author.*?>.*?title="(.*?)">.*?'   # 1: author name
        '<div class="content">(.*?)</div>(.*?)'        # 2: joke text, 3: tail
        '<i class="number">(.*?)</i>',                 # 4: vote count
        re.S)
    for item in re.findall(pattern, content):
        # Picture jokes carry an <img> tag between the content and the
        # vote counter; skip those.
        if not re.search("img", item[2]):
            print(item[0], item[1], item[3])
# Fix: URLError is defined in urllib.error, not urllib.request.
except urllib.error.URLError as e:
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
------------------------------继续--------------------------------
完善交互界面
# -*- coding:utf-8 -*-
import _thread
import re
import time
import urllib
import urllib.error
import urllib.request
class QSBK:
    """Interactive console reader for qiushibaike.com "hot" jokes.

    Fetches pages on demand, keeps a small buffer of parsed stories,
    and prints one story per <Enter> key press ('Q' quits).
    """

    def __init__(self):
        self.pageIndex = 1          # next page number to fetch
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        self.headers = {'User-Agent': self.user_agent}
        self.stories = []           # buffer: list of per-page story lists
        self.enable = False         # main-loop switch

    def getPage(self, pageIndex):
        """Return the decoded HTML of hot page *pageIndex*, or None on error."""
        try:
            url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
            request = urllib.request.Request(url, headers=self.headers)
            response = urllib.request.urlopen(request)
            return response.read().decode('utf-8')
        # Fix: URLError is defined in urllib.error, not urllib.request.
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print("糗事百科连接失败原因:", e.reason)
            return None

    def _parsePage(self, pageCode):
        """Parse one page of HTML into [name, text, tail, votes] lists.

        NOTE(review): the original regex lost its HTML tags when the
        article was extracted (it had only 3 groups while the code read
        item[3], and the '<br/>' pattern became an unterminated string
        literal). The pattern below is reconstructed from the referenced
        tutorial — verify it against the real page markup.
        """
        pattern = re.compile(
            '<div class="author.*?>.*?title="(.*?)">.*?'   # 1: author name
            '<div class="content">(.*?)</div>(.*?)'        # 2: joke text, 3: tail (may hold <img>)
            '<i class="number">(.*?)</i>',                 # 4: vote count
            re.S)
        pageStories = []
        for item in re.findall(pattern, pageCode):
            # Picture jokes carry an <img> tag between the content and
            # the vote counter; skip those.
            if re.search("img", item[2]):
                continue
            # Turn HTML line breaks into real newlines.
            text = re.sub('<br/>', "\n", item[1])
            pageStories.append([item[0].strip(), text.strip(),
                                item[2].strip(), item[3].strip()])
        return pageStories

    def getPageItems(self, pageIndex):
        """Fetch and parse page *pageIndex*; None if the download failed."""
        pageCode = self.getPage(pageIndex)
        if not pageCode:
            print("页面加载失败")
            return None
        return self._parsePage(pageCode)

    def loadPage(self):
        """Top up the buffer: fetch the next page when fewer than 2 remain."""
        if self.enable and len(self.stories) < 2:
            pageStories = self.getPageItems(self.pageIndex)
            if pageStories:
                self.stories.append(pageStories)
                self.pageIndex += 1

    def getOneStory(self, pageStories, page):
        """Print one story per <Enter> press; 'Q' stops the reader."""
        for story in pageStories:
            input1 = input()
            self.loadPage()             # keep the buffer warm while reading
            if input1 == "Q":
                self.enable = False
                return
            print("第%d页\t发布人:%s\t发布时间:%s\t赞:%s\n%s"
                  % (page, story[0], story[2], story[3], story[1]))

    # Backward-compatible alias for the original misspelled method name.
    getOneStor4y = getOneStory

    def start(self):
        """Run the interactive loop until the user quits."""
        print("正在读取糗事百科,回车-继续,Q-退出 !")
        self.enable = True
        self.loadPage()
        nowPage = 0
        while self.enable:
            if len(self.stories) > 0:
                pageStories = self.stories.pop(0)
                nowPage += 1
                self.getOneStory(pageStories, nowPage)
# Run the interactive reader only when executed as a script, not on import.
if __name__ == "__main__":
    spider = QSBK()
    spider.start()
参考文档:崔庆才《Python 爬虫实战一:爬取糗事百科段子》(原文基于 Python 2.7,本文代码已改写为 Python 3)。