Scraping Qiushibaike jokes with urllib and BeautifulSoup

Python version:

chao@chao-machine:~/python_study$ python
Python 3.4.3 (default, May 31 2015, 17:07:22) 
[GCC 4.9.1] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> 

Code:

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib
import urllib.request
import urllib.error
import urllib.parse


def print_qiushi(item):
    # Skip jokes that contain a picture
    if item.find('div', class_='thumb'):
        return
    # Skip jokes that contain a video
    if item.find(name="div", class_='video_holder'):
        return
    # Get the username of whoever posted the joke
    author = item.find("div", class_='author')
    if author is not None:
        author = author.get_text().strip()
    else:
        author = 'anonymous'
    # Get the time the joke was posted
    times = item.find("div", class_='content').contents[-2]
    if times is None:
        times = ''
    else:
        # str.strip() returns a new string, so the result must be assigned
        times = str(times).strip()
    # Get the body of the joke
    content = item.find("div", class_='content').get_text().strip()

    print('-_-:', author, "  ", times, '\n')
    print(content)
    print("\n\n")


url = "http://www.qiushibaike.com/text"  # "http://www.qiushibaike.com/" works too; print_qiushi filters out non-text posts anyway
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
heads = {'User-Agent': user_agent}
try:
    request = urllib.request.Request(url, headers=heads)
    response = urllib.request.urlopen(request)
    # Name the parser explicitly to avoid BeautifulSoup's "no parser specified" warning
    soup = BeautifulSoup(response.read(), 'html.parser')
    items = soup.find_all(name='div', class_='article block untagged mb15')
    # Handle each post in turn: username, content, and post time
    for item in items:
        print_qiushi(item)
except urllib.error.URLError as e:
    if hasattr(e, 'code'):
        print(e.code)
    if hasattr(e, 'reason'):
        print(e.reason)

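As a quick sanity check, you can exercise the filtering logic against a small hand-written snippet instead of the live site. The markup below only mimics the class names print_qiushi expects (the real page structure may differ), so treat it as a minimal sketch; it assumes print_qiushi from the script above is already defined:

from bs4 import BeautifulSoup

# Hypothetical markup mirroring the class names print_qiushi looks for;
# the real Qiushibaike layout may differ.
html = '''
<div class="article block untagged mb15">
  <div class="author">someuser</div>
  <div class="content">
    A text-only joke goes here.
    <span>1 hour ago</span>
  </div>
</div>
<div class="article block untagged mb15">
  <div class="author">imguser</div>
  <div class="content">A joke with a picture.</div>
  <div class="thumb"><img src="x.jpg"/></div>
</div>
'''

test_soup = BeautifulSoup(html, 'html.parser')
for item in test_soup.find_all('div', class_='article block untagged mb15'):
    print_qiushi(item)  # only the first, text-only item is printed

Note that passing the full string 'article block untagged mb15' to class_ matches the exact value of the class attribute, which is why it picks up whole post blocks.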

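The script only fetches the first listing page. At the time of writing, Qiushibaike served later pages under a /text/page/N path; that URL pattern is an assumption here and may have changed, so the loop below is only a sketch (it reuses heads and print_qiushi from above):

# Sketch: fetch the first few listing pages.
# The /text/page/N URL pattern is an assumption about the site's routing.
for page in range(1, 4):
    page_url = 'http://www.qiushibaike.com/text/page/%d' % page
    request = urllib.request.Request(page_url, headers=heads)
    page_soup = BeautifulSoup(urllib.request.urlopen(request).read(), 'html.parser')
    for item in page_soup.find_all(name='div', class_='article block untagged mb15'):
        print_qiushi(item)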
Output:

This post was put together in a bit of a hurry; I'll walk through the process in more detail next time.

