python 抓取分析 SGMLParser 实例
数据:
希望 抓取
div > p id='da' > a text
和 div > p id='da' > html
<div>
< p id ="tt" >
< a href =/tag/php > no no </ a >
</ p >
< p id ='da' >
< a href =/tag/php > php </ a >
< a href =/tag/python > python </ a >
< a href =/tag/vim > vim </ a >
< a href =/tag/windows > windows </ a >
< a href =/tag/wingide > wingide </ a >
</ p >
</ div >
< p id ='da' >
< a href =/tag/wingide > hehe </ a >
</ p >
< p id ="tt" >
< a href =/tag/php > no no </ a >
</ p >
< p id ='da' >
< a href =/tag/php > php </ a >
< a href =/tag/python > python </ a >
< a href =/tag/vim > vim </ a >
< a href =/tag/windows > windows </ a >
< a href =/tag/wingide > wingide </ a >
</ p >
</ div >
< p id ='da' >
< a href =/tag/wingide > hehe </ a >
</ p >
希望结果为
$ python t.py
a_text: ["'php'", "'python'", "'vim'", "'windows'", "'wingide'"]
div_html[0]:
< p id ="da" >
< a href ="/tag/php" > php </ a >
< a href ="/tag/python" > python </ a >
< a href ="/tag/vim" > vim </ a >
< a href ="/tag/windows" > windows </ a >
< a href ="/tag/wingide" >刘凯毅 </ a >
</ p >
a_text: ["'php'", "'python'", "'vim'", "'windows'", "'wingide'"]
div_html[0]:
< p id ="da" >
< a href ="/tag/php" > php </ a >
< a href ="/tag/python" > python </ a >
< a href ="/tag/vim" > vim </ a >
< a href ="/tag/windows" > windows </ a >
< a href ="/tag/wingide" >刘凯毅 </ a >
</ p >
#说明
其实我感觉 SGMLParser 最关键的是下面这个方法:
# /usr/lib/python2.5/sgmllib.py
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Finish parsing of the SGML short form ``<tag/data/``.

    The short form is handled as if it had been written
    ``<tag>data</tag>``: a start tag with no attributes, the data, then
    the matching end tag.

    Args:
        tag: Name of the tag.
        data: Character data enclosed by the short tag.
    """
    # finish_starttag / finish_endtag are the hooks that end up invoking the
    # start_* / end_* handlers, so overriding them lets a subclass observe
    # every tag that is parsed.
    self.finish_starttag(tag, [])
    self.handle_data(data)
    self.finish_endtag(tag)
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
    """Finish parsing of the SGML short form ``<tag/data/``.

    The short form is handled as if it had been written
    ``<tag>data</tag>``: a start tag with no attributes, the data, then
    the matching end tag.

    Args:
        tag: Name of the tag.
        data: Character data enclosed by the short tag.
    """
    # finish_starttag / finish_endtag are the hooks that end up invoking the
    # start_* / end_* handlers, so overriding them lets a subclass observe
    # every tag that is parsed.
    self.finish_starttag(tag, [])
    self.handle_data(data)
    self.finish_endtag(tag)
代码:
#!python
# coding=UTF-8
from sgmllib import SGMLParser
class TestSGMLParser(SGMLParser):
def reset(self, verbose = 0):
SGMLParser.reset(self)
# 提取 a text ; div html
self.a_text = []
self.div_html = []
# 寄存变量
self.data_text = ""
self.data_html = ""
# 业务逻辑表示变量
# 抓取 div > p id="da" > a
# 由于需要得到div p 的 html > test_div_p = 0 , 1 , 2
self.test_div = False
self.test_div_p = 0
self.test_div_p_a = False
# 重写 handle_data
# 寄存变量 填充值
def handle_data(self, data):
self.data_text = self.data_text + data
if self.test_div_p :
self.data_html = self.data_html + data
# 重写 finish_starttag
# self.data_html 填充值
def finish_starttag(self, tag, attrs):
SGMLParser.finish_starttag(self, tag, attrs)
if self.test_div_p :
strattrs = "" .join([ ' %s="%s" ' % (key, value) for key, value in attrs])
self.data_html = self.data_html + " <%(tag)s%(strattrs)s> " % locals()
# 重写 finish_endtag
# self.data_html 填充值
def finish_endtag(self, tag):
SGMLParser.finish_endtag(self, tag)
if self.test_div_p == 2 :
self.data_html = self.data_html + " </%(tag)s> " % locals()
elif self.test_div_p == 1 :
self.data_html = self.data_html + " </%(tag)s> " % locals()
self.test_div_p = 0
# self.test_div 状态修改
def start_div(self, attrs):
self.test_div = True
# self.test_div 状态修改
# self.div_html 填充
def end_div(self):
if self.test_div :
self.div_html.append(self.data_html)
self.test_div = False
# self.test_div_p 状态修改 2 为可以填充
def start_p(self, attrs):
if self.test_div and attrs and ' id ' in [ key for key, value in attrs ] and len([ value for key, value in attrs if key == ' id ' and value == ' da ' ]) > 0 :
self.test_div_p = 2
# self.test_div_p 状态修改 1 为只能填充最后一次
def end_p(self):
if self.test_div_p == 2 :
self.test_div_p = 1
# self.test_div_p_a 状态修改
def start_a(self, attrs):
self.data_text = ""
if self.test_div_p :
self.test_div_p_a = True
# self.test_div_p_a 状态修改
# self.a_text 填充
def end_a(self):
if self.test_div_p and self.test_div and self.test_div_p_a :
self.a_text.append(repr(self.data_text))
self.test_div_p_a = False
def close(self):
SGMLParser.close(self)
if __name__ == ' __main__ ' :
try :
f = open( ' google.html ' , ' r ' )
data = f.read()
x = TestSGMLParser()
x.feed(data)
x.close()
# 我这 gvim utf8 ; cygwin gbk ,转码 unicode( str , 'utf8').encode('gbk')
print " a_text: %s \n div_html[0]: \n %s " % (x.a_text[:-1], unicode(x.div_html[0], ' utf8 ' ).encode( ' gbk ' ) )
except IOError, msg:
print file, " : " , msg
# coding=UTF-8
from sgmllib import SGMLParser
class TestSGMLParser(SGMLParser):
def reset(self, verbose = 0):
SGMLParser.reset(self)
# 提取 a text ; div html
self.a_text = []
self.div_html = []
# 寄存变量
self.data_text = ""
self.data_html = ""
# 业务逻辑表示变量
# 抓取 div > p id="da" > a
# 由于需要得到div p 的 html > test_div_p = 0 , 1 , 2
self.test_div = False
self.test_div_p = 0
self.test_div_p_a = False
# 重写 handle_data
# 寄存变量 填充值
def handle_data(self, data):
self.data_text = self.data_text + data
if self.test_div_p :
self.data_html = self.data_html + data
# 重写 finish_starttag
# self.data_html 填充值
def finish_starttag(self, tag, attrs):
SGMLParser.finish_starttag(self, tag, attrs)
if self.test_div_p :
strattrs = "" .join([ ' %s="%s" ' % (key, value) for key, value in attrs])
self.data_html = self.data_html + " <%(tag)s%(strattrs)s> " % locals()
# 重写 finish_endtag
# self.data_html 填充值
def finish_endtag(self, tag):
SGMLParser.finish_endtag(self, tag)
if self.test_div_p == 2 :
self.data_html = self.data_html + " </%(tag)s> " % locals()
elif self.test_div_p == 1 :
self.data_html = self.data_html + " </%(tag)s> " % locals()
self.test_div_p = 0
# self.test_div 状态修改
def start_div(self, attrs):
self.test_div = True
# self.test_div 状态修改
# self.div_html 填充
def end_div(self):
if self.test_div :
self.div_html.append(self.data_html)
self.test_div = False
# self.test_div_p 状态修改 2 为可以填充
def start_p(self, attrs):
if self.test_div and attrs and ' id ' in [ key for key, value in attrs ] and len([ value for key, value in attrs if key == ' id ' and value == ' da ' ]) > 0 :
self.test_div_p = 2
# self.test_div_p 状态修改 1 为只能填充最后一次
def end_p(self):
if self.test_div_p == 2 :
self.test_div_p = 1
# self.test_div_p_a 状态修改
def start_a(self, attrs):
self.data_text = ""
if self.test_div_p :
self.test_div_p_a = True
# self.test_div_p_a 状态修改
# self.a_text 填充
def end_a(self):
if self.test_div_p and self.test_div and self.test_div_p_a :
self.a_text.append(repr(self.data_text))
self.test_div_p_a = False
def close(self):
SGMLParser.close(self)
if __name__ == ' __main__ ' :
try :
f = open( ' google.html ' , ' r ' )
data = f.read()
x = TestSGMLParser()
x.feed(data)
x.close()
# 我这 gvim utf8 ; cygwin gbk ,转码 unicode( str , 'utf8').encode('gbk')
print " a_text: %s \n div_html[0]: \n %s " % (x.a_text[:-1], unicode(x.div_html[0], ' utf8 ' ).encode( ' gbk ' ) )
except IOError, msg:
print file, " : " , msg
页面抓取
抓取用 pycurl + 分析用 SGMLParser + 验证码识别用 pytesser
下面就差算法了,抓取的准备工作终于要完成了。
整理 www.blogjava.net/Good-Game