SGMLParser 无法正确解析 <br/>、<img src=""/> 等自闭合的单个标签:
测试代码如下:
class TestSGMLParser(SGMLParser): def __init__(self, verbose=0): self.testdata = "" SGMLParser.__init__(self, verbose) def handle_data(self, data): self.testdata = self.testdata + data if len(repr(self.testdata)) >= 70: self.flush() def flush(self): data = self.testdata if data: self.testdata = "" print 'data:', repr(data) def handle_comment(self, data): self.flush() r = repr(data) if len(r) > 68: r = r[:32] + '...' + r[-32:] print 'comment:', r def unknown_starttag(self, tag, attrs): self.flush() if not attrs: print 'start tag: <' + tag + '>' else: print 'start tag: <' + tag, for name, value in attrs: print name + '=' + '"' + value + '"', print '>' def unknown_endtag(self, tag): self.flush() print 'end tag: </' + tag + '>' def unknown_entityref(self, ref): self.flush() print '*** unknown entity ref: &' + ref + ';' def unknown_charref(self, ref): self.flush() print '*** unknown char ref: &#' + ref + ';' def unknown_decl(self, data): self.flush() print '*** unknown decl: [' + data + ']' def close(self): SGMLParser.close(self) self.flush() if __name__=="__main__": #htmldata=urllib.urlopen("http://www.sogou.com").read().decode("gbk") #pros=BaseHTMLProcessor() #pros.feed(htmldata) #print pros.gethtmltext() htmldata="""<html><head><title>Google Page</title></head><body> <table id="tab"> <tr id="tr1"><td id="tr1td1">tr1 td1</td><td>tr1 td2</td><td>tr1 td3</td></tr> <tr id="tr2"><td id="tr2td1">tr2 td1</td><td>tr2 td2</td><td>tr2 td3</td></tr> </table> <br/> <img src="http://www.baidu.com/img/baidu_logo.gif" id="baidulogo" /><br/> <a href="http://baidu.com">baidu</a><br/> <b>bold font</b><br/> <script language="javascript">alert("hello, world ");</script> <style>#tab{background-color:#fcdad5;}</style> </body></html> """ pros=TestSGMLParser() #BaseHTMLProcessor() pros.feed(htmldata)
输出如下:
start tag: <html> start tag: <head> start tag: <title> data: 'Google Page' end tag: </title> end tag: </head> start tag: <body> data: ' \n ' start tag: <table id="tab" > data: '\n ' start tag: <tr id="tr1" > start tag: <td id="tr1td1" > data: 'tr1 td1' end tag: </td> start tag: <td> data: 'tr1 td2' end tag: </td> start tag: <td> data: 'tr1 td3' end tag: </td> end tag: </tr> data: '\n ' start tag: <tr id="tr2" > start tag: <td id="tr2td1" > data: 'tr2 td1' end tag: </td> start tag: <td> data: 'tr2 td2' end tag: </td> start tag: <td> data: 'tr2 td3' end tag: </td> end tag: </tr> data: '\n ' end tag: </table> data: '\n ' start tag: <br> data: '>\n <img src="http:' end tag: </br> data: '/www.baidu.com/img/baidu_logo.gif" id="baidulogo" />' start tag: <br> data: '>\n <a href="http:' end tag: </br> data: '/baidu.com">baidu' end tag: </a> start tag: <br> data: '>\n <b>bold font<' end tag: </br> data: 'b>' start tag: <br> data: '>\n \n <script language="javascript">alert("hello, world ");<' end tag: </br> data: 'script>\n ' start tag: <style> data: '#tab{background-color:#fcdad5;}' end tag: </style> data: '\n ' end tag: </body> end tag: </html>
从输出可见:<br/> <img/>都没有被正确解析。
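错误根源可以从sgmllib.py找到:其中的 shorttagopen / shorttag 正则把 `<br/` 当成 SGML 短标签(`<tag/data/`)的开头,于是从这个斜杠到下一个斜杠之间的内容(如上面输出中的 '>\n <img src="http:')都被当作该标签的数据,随后还会补发一个 </br> 结束事件。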
另有人报:SGMLParser 解析时,会把标签的事件属性(如 onclick)里 JS 代码中的大于号(>)误当作标签的结束标记。
这两个bug,其实都是正则表达式惹的祸。
所以做网页内容提取时,尽量不要用正则,而是在把html转换为xhtml后,用dom解析,或者xpath。