Python spider (web crawler)

#!/usr/bin/env python
# coding=utf-8
###################################################
#   Web spider, version: 0.5
#   Developer: ixafei
#   Contact: [email protected]
#   Release date: 2007-11-25
###################################################
import time
import os
import urllib2
import urllib
from sgmllib import SGMLParser
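# Note: this script targets Python 2. In Python 3, urllib2 was merged into
# urllib.request and sgmllib was removed (html.parser is the closest
# stdlib replacement).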


class IXALINK(SGMLParser):
    """Collect all links from a chunk of markup. Links can be filtered by
    configurable "must contain" and "must not contain" conditions, and a
    base URL can be set to complete links that are relative addresses.
    This class is used to collect links from the target university BBS
    forums."""
    def reset(self):
        """Reset the parser state."""
        self.links = []
        self.links_includes = False
        self.links_decludes = False
        self.links_baseurls = False
        SGMLParser.reset(self)

    def setBaseUrls(self, baseurl):
        """Set the base URL (empty by default), used to complete links
        that are relative addresses."""
        self.links_baseurls = baseurl

    def setCondition(self, includes, decludes):
        """Set the filter conditions: substrings a link must contain and
        substrings it must not contain. Either can be set to False to
        disable that check."""
        self.links_includes = includes
        self.links_decludes = decludes

    def _hasIncludes(self, links):
        """Return True if the link contains every required substring."""
        for c in self.links_includes:
            if links.find(c) == -1:
                return False
        return True

    def _notHasDecludes(self, links):
        """Return True if the link contains none of the forbidden substrings."""
        for c in self.links_decludes:
            if links.find(c) > -1:
                return False
        return True

    def _validLinks(self, links):
        """Check whether a link satisfies both the include and exclude conditions."""
        if self.links_includes and self.links_decludes:
            return self._hasIncludes(links) and self._notHasDecludes(links)
        if self.links_includes:
            return self._hasIncludes(links)
        if self.links_decludes:
            return self._notHasDecludes(links)
        return True

    def do_a(self, attrs):
        """Collect link addresses that match the configured conditions."""
        for name, value in attrs:
            if name != 'href':          # only the href attribute is a link
                continue
            if self._validLinks(value):
                if self.links_baseurls:
                    link = self.links_baseurls % value
                else:
                    link = value
                self.links.append(link)

    def do_img(self, attrs):
        """Return the image address (src attribute)."""
        for name, value in attrs:
            if name == 'src':
                return value
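# sgmllib is deprecated and was removed in Python 3. Below is a minimal
# sketch of the same href-filtering idea built on the stdlib HTMLParser;
# the class name LinkFilter is illustrative and not part of the original
# spider.
from HTMLParser import HTMLParser

class LinkFilter(HTMLParser):
    """Collect <a href> values that contain every string in includes
    and none of the strings in excludes."""
    def __init__(self, includes=(), excludes=()):
        HTMLParser.__init__(self)
        self.links = []
        self.includes = includes
        self.excludes = excludes

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag
        if tag != 'a':
            return
        href = dict(attrs).get('href')
        if href and all(s in href for s in self.includes) \
                and not any(s in href for s in self.excludes):
            self.links.append(href)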



class IXABASE:
    """Top-level class that collects the shared methods: remote content
    fetching, common parsing helpers, logging, and file operations."""
    def __init__(self):
        """Initialize the class."""
        pass

    def _getPos(self, string, s):
        """Return the position of the given substring."""
        return string.find(s)

    def region(self, string, s, e):
        """Return the slice of string between the markers s and e,
        or False if the start marker is not found."""
        _startPos = self._getPos(string, s)
        if _startPos > -1:              # find() returns -1 when not found
            _tmp = string[_startPos + len(s):]
            _endPos = self._getPos(_tmp, e)
            return _tmp[:_endPos]
        else:
            return False

    def snatch(self, url):
        """Fetch the page at the given URL; return False on failure."""
        tmp = False
        try:
            f = urllib2.urlopen(url)
            try:
                tmp = f.read()
            finally:
                f.close()
        except IOError:                 # urllib2.URLError subclasses IOError
            tmp = False
        return tmp
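# Usage sketch for region(), with made-up input: it returns the text between
# the first occurrence of the start marker and the next occurrence of the
# end marker, e.g.
#   IXABASE().region('<td>hello</td>', '<td>', '</td>')   # -> 'hello'
# snatch() returns False on failure, so callers should test its result
# before parsing it, as the __main__ section below does.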

class TOPAREA(IXABASE):
    """Extract the code of a specified region, using the parent's methods."""
    def __init__(self):
        """Initialize the class."""
        pass




if __name__ == "__main__":
    nankai = 1
    njupt = 0
    sjt = 0
    beiy = 0
    sjtc = 0
    if nankai:
        # Set the filter conditions
        region_s = '''本日十大衷心祝福'''
        region_e = '''</td></tr></TABLE>'''
        links_includes = ["http://bbs.nankai.edu.cn/cgi-bin/bbs/newcon?board="]
        links_decludes = ["bless", "Bless"]
        links_baseurls = "%"
        # Initialize
        links = IXALINK()
        areas = TOPAREA()
        links.setCondition(links_includes, links_decludes)
        # Read in the data to analyze
        s = file("hottopic10.htm").read()
        # Extract the target region of the page
        r = areas.region(s, region_s, region_e)
        links.feed(r)
        links.close()
        print len(links.links)
        for k in links.links:
            print k
            tmp = areas.snatch(k)
            if tmp:
                # Set the content markers
                content_s = "<font color=navy>"
                content_e = '''<font class=c35>'''
                # Extract the target region of the fetched page
                content = areas.region(tmp, content_s, content_e)
                print content
            else:
                print "failed to fetch anything"
                tmp = False
            #break

    if njupt:
        # Set the filter conditions
        region_s = '''今日十大热门话题'''
        region_e = '''</table><center><hr>'''
        links_includes = ["bbstfind?board="]
        links_decludes = []
        links_baseurls = "http://bbs.njupt.edu.cn/cgi-bin/%s"
        # Initialize
        links = IXALINK()
        areas = TOPAREA()
        links.setBaseUrls(links_baseurls)
        links.setCondition(links_includes, links_decludes)
        # Read in the data to analyze
        s = file("njupt.htm").read()
        # Extract the target region of the page
        r = areas.region(s, region_s, region_e)
        links.feed(r)
        links.close()
        print len(links.links)
        for k in links.links:
            print k

    if sjt:
        # Set the filter conditions
        region_s = '''iconT'''
        region_e = '''</td></tr></table> '''
        links_includes = ["http://bbs.sjtu.edu.cn/bbstcon?board="]
        links_decludes = []
        links_baseurls = "%s"
        # Initialize
        links = IXALINK()
        areas = TOPAREA()
        links.setBaseUrls(links_baseurls)
        links.setCondition(links_includes, links_decludes)
        # Read in the data to analyze
        s = file("sjt.htm").read()
        # Extract the target region of the page
        r = areas.region(s, region_s, region_e)
        links.feed(r)
        links.close()

        print len(links.links)
        for k in links.links:
            print k
            tmp = areas.snatch(k)
            if tmp:
                # Set the content markers
                content_s = "), "
                content_e = '''<font class=c33>'''
                # Extract the target region of the fetched page
                content = areas.region(tmp, content_s, content_e)
                print content
            else:
                print "failed to fetch anything"

    if beiy:
        # Set the filter conditions
        region_s = '''hot_title'''
        region_e = '''SecTable'''
        links_includes = ["bbstcon.php?board=", "&"]
        links_decludes = []
        links_baseurls = "http://bbs.ustb.edu.cn/%s"
        # Initialize
        links = IXALINK()
        areas = TOPAREA()
        links.setBaseUrls(links_baseurls)
        links.setCondition(links_includes, links_decludes)
        # Read in the data to analyze
        s = file("beiy.htm").read()
        # Extract the target region of the page
        r = areas.region(s, region_s, region_e)
        links.feed(r)
        links.close()
        print len(links.links)
        for k in links.links:
            print k

    if sjtc:
        # Set the content markers
        content_s = '''),'''
        content_e = '''<font class=c37>'''
        # Initialize
        areas = TOPAREA()
        # Read in the data to analyze
        s = file("s2.htm").read()
        # Extract the target region of the page
        content = areas.region(s, content_s, content_e)
        print content
