python 抓取分析 SGMLParser 实例

数据：
希望抓取：
div > p[id='da'] > a 的文本（text），
以及 div > p[id='da'] 的 html

<div>
    <p id="tt">
        <a href=/tag/php>no no</a>
    </p>
    <p id='da'>
        <a href=/tag/php>php</a>
        <a href=/tag/python>python</a>
        <a href=/tag/vim>vim</a>
        <a href=/tag/windows>windows</a>
        <a href=/tag/wingide>wingide</a>
    </p>
</div>
<p id='da'>
    <a href=/tag/wingide>hehe</a>
</p>

希望结果为

$ python t.py
a_text: ["'php'", "'python'", "'vim'", "'windows'", "'wingide'"]

div_html[0]:
<p id="da">
    <a href="/tag/php">php</a>
    <a href="/tag/python">python</a>
    <a href="/tag/vim">vim</a>
    <a href="/tag/windows">windows</a>
    <a href="/tag/wingide">刘凯毅</a>
</p>

#说明
其实我感觉 SGMLParser 最关键的是下面这个方法：

# /usr/lib/python2.5/sgmllib.py
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
     def finish_shorttag(self, tag, data):
         """Finish parsing ``<tag/data/`` by replaying it as ``<tag>data</tag>``.

         tag  -- name of the short tag
         data -- text enclosed by the short tag
         """
         # finish_starttag / finish_endtag are the hooks that go on to call
         # the start_* / end_* handlers (the article author's note).
         self.finish_starttag(tag, [])
         self.handle_data(data)
         self.finish_endtag(tag)

代码：

# !python
# coding=UTF-8

from sgmllib import SGMLParser
class TestSGMLParser(SGMLParser):
    """Extract link texts and rebuilt HTML for ``div > p id="da"``.

    Results available after ``feed()`` / ``close()``:

    a_text   -- ``repr()`` of the text of every ``<a>`` closed while inside
                a ``<p id="da">`` that itself is inside a ``<div>``.
    div_html -- one entry per ``</div>``: the HTML rebuilt so far from the
                matching paragraphs (``data_html`` is not cleared between
                divs, so later entries also contain earlier paragraphs).
    """

    def reset(self, verbose=0):
        """Reset the base parser and (re)initialise all extraction state.

        ``verbose`` is accepted but not used.  Per the sgmllib documentation
        ``reset()`` is called implicitly at instantiation time, which is why
        the attributes are created here rather than in ``__init__``.
        """
        SGMLParser.reset(self)

        # Extracted results: <a> texts and per-div HTML.
        self.a_text = []
        self.div_html = []

        # Scratch buffers filled while parsing.
        self.data_text = ""
        self.data_html = ""

        # State flags for the path div > p id="da" > a.
        # test_div_p is a small state machine because the <p> HTML itself
        # must be captured, closing tag included:
        #   0 -- not recording
        #   2 -- inside a matching <p>, record everything
        #   1 -- </p> just seen, record that one closing tag, then go to 0
        self.test_div = False
        self.test_div_p = 0
        self.test_div_p_a = False

    def handle_data(self, data):
        """Accumulate text into the scratch buffers."""
        # data_text grows on every chunk; start_a() clears it, so end_a()
        # sees only the text of the link being closed.
        self.data_text = self.data_text + data
        if self.test_div_p:
            self.data_html = self.data_html + data

    def finish_starttag(self, tag, attrs):
        """Dispatch the start tag, then record it while inside the paragraph."""
        # The base call runs start_p() first, so the opening <p id="da">
        # itself is already recorded below.
        SGMLParser.finish_starttag(self, tag, attrs)
        if self.test_div_p:
            strattrs = "".join([' %s="%s"' % (key, value)
                                for key, value in attrs])
            self.data_html = self.data_html + "<%s%s>" % (tag, strattrs)

    def finish_endtag(self, tag):
        """Dispatch the end tag, then record it while inside the paragraph."""
        # The base call runs end_p() first, which moves the state 2 -> 1.
        SGMLParser.finish_endtag(self, tag)
        if self.test_div_p:
            self.data_html = self.data_html + "</%s>" % tag
            if self.test_div_p == 1:
                # That was the paragraph's own closing tag: stop recording.
                self.test_div_p = 0

    def start_div(self, attrs):
        """Mark that the parser is inside a <div>."""
        self.test_div = True

    def end_div(self):
        """Store the HTML rebuilt so far and leave the <div>."""
        if self.test_div:
            self.div_html.append(self.data_html)
        self.test_div = False

    def start_p(self, attrs):
        """Start recording when a ``<p id="da">`` opens inside a <div>."""
        if self.test_div and any(key == 'id' and value == 'da'
                                 for key, value in attrs):
            self.test_div_p = 2

    def end_p(self):
        """Switch to state 1 so only the closing </p> is still recorded."""
        if self.test_div_p == 2:
            self.test_div_p = 1

    def start_a(self, attrs):
        """Clear the text buffer and flag links inside a recorded paragraph."""
        self.data_text = ""
        if self.test_div_p:
            self.test_div_p_a = True

    def end_a(self):
        """Store the link text if the link sat on the wanted path."""
        if self.test_div_p and self.test_div and self.test_div_p_a:
            self.a_text.append(repr(self.data_text))
        self.test_div_p_a = False

    def close(self):
        """Finish parsing; simply delegates to the base class."""
        SGMLParser.close(self)

if __name__ == '__main__':
    # Script entry: parse google.html and print the extracted link texts
    # plus the first rebuilt <div> HTML.
    import sys

    filename = 'google.html'
    try:
        f = open(filename, 'r')
        try:
            data = f.read()
        finally:
            # Always release the file handle, even if read() fails.
            f.close()
    except IOError:
        # sys.exc_info() avoids the version-specific
        # "except IOError, msg" / "except IOError as msg" spellings.
        print("%s : %s" % (filename, sys.exc_info()[1]))
    else:
        x = TestSGMLParser()
        x.feed(data)
        x.close()

        # Guard against input in which no <div> was ever closed.
        if x.div_html:
            first_div = x.div_html[0]
        else:
            first_div = ""

        # Author's setup: source edited as UTF-8 (gvim), console is GBK
        # (cygwin), hence the utf8 -> gbk transcoding before printing.
        # NOTE(review): a_text[:-1] drops the last collected link text,
        # while the article's expected output lists every link -- confirm
        # whether the slice is intentional.
        print("a_text: %s\ndiv_html[0]:\n%s" % (
            x.a_text[:-1],
            unicode(first_div, 'utf8').encode('gbk')))

页面抓取
抓取用 pycurl + 分析用 SGMLParser + 验证码识别用 pytesser
下面就差算法了，抓取的准备工作终于要完成了。

整理 www.blogjava.net/Good-Game

python 抓取分析 SGMLParser 实例

你可能感兴趣的:(python 抓取分析 SGMLParser 实例)