python版本为:
chao@chao-machine:~/python_study$ python
Python 3.4.3 (default, May 31 2015, 17:07:22)
[GCC 4.9.1] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>
# -*- coding: utf-8 -*-
# Fetch the qiushibaike.com text page and print every post that has no
# image or video attached: author, publish time, then the post text.
import urllib
import urllib.error
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup


def print_qiushi(item):
    """Print a single post unless it carries an image or a video.

    Args:
        item: A BeautifulSoup tag holding one post container.

    Returns:
        None. Nothing is printed when the post contains a "thumb" div
        (image), a "video_holder" div (video), or has no "content" div.
    """
    # Filter out posts that contain an image.
    if item.find('div', class_='thumb'):
        return
    # Filter out posts that contain a video.
    if item.find(name="div", class_='video_holder'):
        return

    # Name of the user who published the post; fall back to a placeholder
    # when the post has no author div.
    author_tag = item.find("div", class_='author')
    if author_tag is not None:
        author = author_tag.get_text().strip()
    else:
        author = 'anonymous'

    # Without a content div there is neither text nor a timestamp to show.
    content_tag = item.find("div", class_='content')
    if content_tag is None:
        return

    # The publish time is read from the second-to-last child node of the
    # content div; use an empty string when the div has too few children.
    children = content_tag.contents
    if len(children) >= 2:
        # str.strip() returns a new string, so the result must be kept.
        times = str(children[-2]).strip()
    else:
        times = ''

    # Text of the post itself.
    content = content_tag.get_text().strip()

    print('-_-:', author, " ", times, '\n')
    print(content)
    print("\n\n")


# "http://www.qiushibaike.com/" also works as the URL, because posts with
# images or videos are filtered out anyway.
url = "http://www.qiushibaike.com/text"
user_agent = 'Mozellb/4.0 (compatible;MSIE 5.5;Windows NT)'
heads = {'User-Agent': user_agent}


def main():
    """Download the page at ``url`` and print every text-only post on it.

    On a ``urllib.error.URLError`` the error's ``code`` and ``reason``
    attributes are printed (whichever are present) and nothing else is done.
    """
    try:
        request = urllib.request.Request(url, headers=heads)
        # The context manager closes the HTTP response once it is read.
        with urllib.request.urlopen(request) as response:
            html = response.read()
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all(name='div', class_='article block untagged mb15')
    # Handle each post in turn: user name, content and publish time.
    for item in items:
        print_qiushi(item)


if __name__ == "__main__":
    main()
这次发的有点匆忙,下次再分析过程