http://www.cnzzad.com/outtut/35897.html
HTMLParser是python用来解析html的模块。它可以分析出html里面的标签、数据等等,是一种处理html的简便途径。HTMLParser采用的是一种事件驱动的模式,当HTMLParser找到一个特定的标记时,它会去调用一个用户定义的函数,以此来通知程序处理。它主要的用户回调函数的命名都是以handle_开头的,都是HTMLParser的成员函数。当我们使用时,就从HTMLParser派生出新的类,然后重新定义这几个以handle_开头的函数即可。这几个函数包括:
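handle_startendtag 处理开始标签和结束标签(即形如 <br/> 的自闭合标签)
handle_starttag 处理开始标签
handle_endtag 处理结束标签
handle_charref 处理字符引用(以 &# 开头)
handle_entityref 处理实体引用(以 & 开头)
handle_data 处理标签之间的数据
handle_comment 处理注释
handle_decl 处理声明
handle_pi 处理处理指令(processing instruction)
这里我以从网页中获取到url为例,介绍一下。要想获取到url,肯定是要分析<a>标签,然后取到它的href属性的值。下面是代码: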
先来大致看看HTMLParser的源代码:
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: The error message passed to the constructor.
        lineno: First element of *position*; reported as the line by
            ``__str__`` when it is not None.
        offset: Second element of *position*; reported as column
            ``offset + 1`` by ``__str__`` when it is not None.
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        # Hand the message to Exception as well so that ``args`` is
        # populated (needed for a meaningful repr and for pickling).
        Exception.__init__(self, msg)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the known parts of the position."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result
class HTMLParser(_markupbase.ParserBase):
"""Find tags and other markup and call handler functions.
Usage:
p = HTMLParser()
p.feed(data)
...
p.close()
Start tags are handled by calling self.handle_starttag() or
self.handle_startendtag(); end tags by self.handle_endtag(). The
data between tags is passed from the parser to the derived class
by calling self.handle_data() with the data as argument (the data
may be split up in arbitrary chunks). Entity references are
passed by calling self.handle_entityref() with the entity
reference as the argument. Numeric character references are
passed to self.handle_charref() with the string containing the
reference as the argument.
"""
# Tag names for which parse_starttag() calls set_cdata_mode() right after
# reporting the start tag.
CDATA_CONTENT_ELEMENTS = ("script", "style")
def __init__(self):
    """Initialize and reset this instance."""
    self.reset()

def reset(self):
    """Reset this instance. Loses all unprocessed data.

    Empties the input buffer, restores the placeholder last-tag value and
    the normal scanning pattern, then resets the base-class state.
    """
    self.rawdata = ''
    self.lasttag = '???'
    self.interesting = interesting_normal
    _markupbase.ParserBase.reset(self)
def feed(self, data):
    """Feed data to the parser.

    Call this as often as you want, with as little or as much text
    as you want (may include newline characters). The text is appended
    to the pending buffer and as much of it as possible is processed.
    """
    self.rawdata = self.rawdata + data
    # 0: more input may follow, so incomplete constructs stay buffered.
    self.goahead(0)
def close(self):
    """Handle any buffered data."""
    # 1: treat the buffer as complete input (see goahead's 'end' flag).
    self.goahead(1)
def error(self, message):
    """Raise HTMLParseError for *message* at the position given by getpos()."""
    raise HTMLParseError(message, self.getpos())
# Source text of the most recently parsed start tag; set by parse_starttag().
__starttag_text = None

def get_starttag_text(self):
    """Return full source of start tag: '<...>'."""
    return self.__starttag_text
def set_cdata_mode(self):
    """Switch the scanner to the CDATA pattern (interesting_cdata)."""
    self.interesting = interesting_cdata
def clear_cdata_mode(self):
    """Switch the scanner back to the normal pattern (interesting_normal)."""
    self.interesting = interesting_normal
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
#
# Scans self.rawdata from index 0: text up to the next match of
# self.interesting is passed to handle_data(), then markup that starts
# with '<' is dispatched (start tags go to parse_starttag()).
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n:
match = self.interesting.search(rawdata, i) # < or &
if match:
j = match.start()
else:
j = n
if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
if i == n: break
startswith = rawdata.startswith
if startswith('<', i):
if starttagopen.match(rawdata, i): # < + letter
k = self.parse_starttag(i)
elif startswith("
# NOTE(review): the listing is truncated on the line above (unterminated
# string literal). The rest of goahead() and the header of the following
# method are missing. The lines below test `match` without assigning it and
# call handle_pi(), so they appear to be the tail of a separate
# processing-instruction parser -- TODO restore from the original module.
if not match:
return -1
j = match.start()
self.handle_pi(rawdata[i+2: j])
j = match.end()
return j
# Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
    """Parse the start tag beginning at ``self.rawdata[i]`` and report it.

    Returns the index just past the tag, or the negative value from
    check_for_whole_start_tag() when the tag is not complete yet. Calls
    handle_startendtag() for tags ending in '/>' and handle_starttag()
    otherwise; attribute names and the tag name are lower-cased, and
    attributes without a value are reported with the value None.
    """
    self.__starttag_text = None
    endpos = self.check_for_whole_start_tag(i)
    if endpos < 0:
        return endpos
    rawdata = self.rawdata
    self.__starttag_text = rawdata[i:endpos]
    # Now parse the data between i+1 and endpos into a tag and attrs
    attrs = []
    match = tagfind.match(rawdata, i+1)
    assert match, 'unexpected call to parse_starttag()'
    k = match.end()
    self.lasttag = tag = rawdata[i+1:k].lower()
    while k < endpos:
        m = attrfind.match(rawdata, k)
        if not m:
            break
        attrname, rest, attrvalue = m.group(1, 2, 3)
        if not rest:
            attrvalue = None
        elif (attrvalue[:1] == "'" == attrvalue[-1:] or
              attrvalue[:1] == '"' == attrvalue[-1:]):
            # Strip the matching pair of surrounding quotes.
            attrvalue = attrvalue[1:-1]
        if attrvalue:
            # Unescape quoted and unquoted values alike; None and ''
            # have nothing to unescape.
            attrvalue = self.unescape(attrvalue)
        attrs.append((attrname.lower(), attrvalue))
        k = m.end()
    end = rawdata[k:endpos].strip()
    if end not in (">", "/>"):
        # NOTE(review): lineno/offset are computed here but never passed
        # on; error() reports the position from getpos() instead.
        lineno, offset = self.getpos()
        if "\n" in self.__starttag_text:
            lineno = lineno + self.__starttag_text.count("\n")
            offset = (len(self.__starttag_text)
                      - self.__starttag_text.rfind("\n"))
        else:
            offset = offset + len(self.__starttag_text)
        self.error("junk characters in start tag: %r"
                   % (rawdata[k:endpos][:20],))
    if end.endswith('/>'):
        # XHTML-style empty tag:
        self.handle_startendtag(tag, attrs)
    else:
        self.handle_starttag(tag, attrs)
        if tag in self.CDATA_CONTENT_ELEMENTS:
            self.set_cdata_mode()
    return endpos
# Internal -- check to see if we have a complete starttag; return end
# or -1 if incomplete.
def check_for_whole_start_tag(self, i):
    """Return the index just past the start tag at ``rawdata[i]``, or -1.

    -1 means the buffer ends before the tag is complete. Malformed tags
    are reported through self.error() after updatepos() has been called.
    """
    rawdata = self.rawdata
    m = locatestarttagend.match(rawdata, i)
    if m:
        j = m.end()
        next_char = rawdata[j:j+1]
        if next_char == ">":
            return j + 1
        if next_char == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            if rawdata.startswith("/", j):
                # buffer boundary
                return -1
            # else bogus input
            # NOTE(review): unreachable as written -- next_char == "/"
            # already implies the startswith("/", j) test above succeeds.
            self.updatepos(i, j + 1)
            self.error("malformed empty start tag")
        if next_char == "":
            # end of input
            return -1
        if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        self.updatepos(i, j)
        self.error("malformed start tag")
    raise AssertionError("we should not get here!")
# Internal -- parse endtag, return end or -1 if incomplete
# On success calls handle_endtag() with the lower-cased tag name, then
# clear_cdata_mode(), and returns j (the end of the first match).
def parse_endtag(self, i):
rawdata = self.rawdata
assert rawdata[i:i+2] == "
# NOTE(review): the line above is truncated (unterminated string literal) and
# the statement that assigns `match` before the first `if not match` is
# missing from this listing -- TODO restore from the original module.
if not match:
return -1
j = match.end()
match = endtagfind.match(rawdata, i) #
if not match:
self.error("bad end tag: %r" % (rawdata[i:j],))
tag = match.group(1)
self.handle_endtag(tag.lower())
self.clear_cdata_mode()
return j
# Overridable -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs):
    """Report a '/>'-terminated tag as a start tag followed by an end tag."""
    self.handle_starttag(tag, attrs)
    self.handle_endtag(tag)
# Overridable -- handle start tag
def handle_starttag(self, tag, attrs):
    """Hook for a start tag; *attrs* is a list of (name, value) pairs.

    The default implementation does nothing.
    """
# Overridable -- handle end tag
def handle_endtag(self, tag):
    """Hook for an end tag. The default implementation does nothing."""
# Overridable -- handle character reference
def handle_charref(self, name):
    """Hook for a numeric character reference.

    The default implementation does nothing.
    """
# Overridable -- handle entity reference
def handle_entityref(self, name):
    """Hook for an entity reference. The default implementation does nothing."""
# Overridable -- handle data
def handle_data(self, data):
    """Hook for text between tags; may arrive in several chunks.

    The default implementation does nothing.
    """
# Overridable -- handle comment
def handle_comment(self, data):
    """Hook for a comment. The default implementation does nothing."""
# Overridable -- handle declaration
def handle_decl(self, decl):
    """Hook for a declaration. The default implementation does nothing."""
# Overridable -- handle processing instruction
def handle_pi(self, data):
    """Hook for a processing instruction.

    The default implementation does nothing.
    """
def unknown_decl(self, data):
    """Report an unrecognised declaration through self.error()."""
    self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting
entitydefs = None

def unescape(self, s):
    """Return *s* with character and entity references replaced.

    Numeric references (decimal, or hexadecimal after an 'x'/'X') become
    the corresponding character. Named references are looked up in the
    lazily built HTMLParser.entitydefs table. References that cannot be
    resolved (unknown name, malformed or out-of-range number) are left
    in the text unchanged.
    """
    if '&' not in s:
        return s

    def replaceEntities(match):
        ref = match.group(1)
        if ref[0] == "#":
            digits = ref[1:]
            try:
                if digits[0] in ('x', 'X'):
                    codepoint = int(digits[1:], 16)
                else:
                    codepoint = int(digits)
                return chr(codepoint)
            except (ValueError, OverflowError):
                # Not a valid number or code point: keep the source text.
                return match.group(0)
        # Cannot use name2codepoint directly, because HTMLParser
        # supports apos, which is not part of HTML 4
        import html.entities
        if HTMLParser.entitydefs is None:
            entitydefs = HTMLParser.entitydefs = {'apos': "'"}
            for k, v in html.entities.name2codepoint.items():
                entitydefs[k] = chr(v)
        try:
            return self.entitydefs[ref]
        except KeyError:
            return '&' + ref + ';'

    # re.ASCII must be passed as ``flags``: the fourth positional
    # argument of re.sub() is ``count``, which would silently cap the
    # number of replacements instead of selecting ASCII matching.
    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                  replaceEntities, s, flags=re.ASCII)
# Source: http://hi.baidu.com/muinlive/blog/item/ce584ff43c569adaf2d385b8.html
#
# Example 1: print the href of every link.
#-*- encoding: gb2312 -*-
import HTMLParser


class MyParser(HTMLParser.HTMLParser):
    """Print the value of the href attribute of every <a> start tag."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Overrides the start-tag handler of the base class.
        if tag == 'a':
            # Look through the attributes of the <a> tag.
            for name, value in attrs:
                if name == 'href':
                    print(value)


if __name__ == '__main__':
    # NOTE(review): the markup inside this literal was stripped when the
    # page was rendered (only "test 链接到163" survived). The tags and
    # the href below are a reconstruction -- confirm against the original.
    a = '<html><head><title>test</title><body><a href="http://www.163.com">链接到163</a></body></html>'
    my = MyParser()
    # Feed the HTML text that is to be analysed.
    my.feed(a)


# Second example program: find image links
# -*- coding:utf-8 -*-
# file: GetImage.py
#
import Tkinter
import urllib
import HTMLParser


class MyHTMLParser(HTMLParser.HTMLParser):
    """Collect attribute names/values of <img> tags that mention gif or jpg."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.gifs = []  # entries containing 'gif'
        self.jpgs = []  # entries containing 'jpg'

    def handle_starttag(self, tags, attrs):
        """Record gif/jpg strings found in the attributes of an <img> tag."""
        if tags == 'img':
            for attr in attrs:
                for t in attr:
                    if t is None:
                        # Attribute without a value: nothing to search.
                        continue
                    if 'gif' in t:
                        self.gifs.append(t)
                    elif 'jpg' in t:
                        self.jpgs.append(t)

    def get_gifs(self):
        """Return the list of collected gif entries."""
        return self.gifs

    def get_jpgs(self):
        """Return the list of collected jpg entries."""
        return self.jpgs


class Window:
    """Small Tkinter front end: enter a URL, list the images found there."""

    def __init__(self, root):
        self.root = root
        # Create the widgets
        self.label = Tkinter.Label(root, text = '输入URL:')
        self.label.place(x = 5, y = 15)
        self.entryUrl = Tkinter.Entry(root,width = 30)
        self.entryUrl.place(x = 65, y = 15)
        self.get = Tkinter.Button(root, text = '获取图片', command = self.Get)
        self.get.place(x = 280, y = 15)
        self.edit = Tkinter.Text(root,width = 470,height = 600)
        self.edit.place(y = 50)

    def Get(self):
        """Fetch the entered URL, parse it and list the gif/jpg entries."""
        url = self.entryUrl.get()
        page = urllib.urlopen(url)
        try:
            data = page.read()
        finally:
            # Release the connection even when reading fails.
            page.close()
        parser = MyHTMLParser()
        parser.feed(data)
        self.edit.insert(Tkinter.END, '====GIF====\n')
        for gif in parser.get_gifs():
            self.edit.insert(Tkinter.END, gif + '\n')
        self.edit.insert(Tkinter.END, '===========\n')
        self.edit.insert(Tkinter.END, '====JPG====\n')
        for jpg in parser.get_jpgs():
            self.edit.insert(Tkinter.END, jpg + '\n')
        self.edit.insert(Tkinter.END, '===========\n')


root = Tkinter.Tk()
window = Window(root)
root.minsize(600,480)
root.mainloop()

# Example code 2:
#! /usr/bin/env python
# -*- coding:gb18030 -*-
from HTMLParser import HTMLParser
import re
class HtmlTag:
    """One node of a tag tree.

    Attributes:
        tagname: Name given at construction.
        attrs: Dict of attribute name -> value, filled by setattr().
        parent: Parent node given at construction (None for a root).
        childs: List of child nodes, filled by addchild().
        data: Text stored by setdata(); '' until then.
    """

    def __init__(self, parent, tagname):
        self.tagname = tagname
        self.attrs = {}
        self.parent = parent
        self.childs = []
        self.data = ''

    def __repr__(self):
        return "HtmlTag(%r, attrs=%r)" % (self.tagname, self.attrs)

    def setattr(self, name, value):
        """Store attribute *name* with *value*, replacing any earlier value."""
        self.attrs[name] = value

    def addchild(self, child):
        """Append *child* to this node's children."""
        self.childs.append(child)

    def setdata(self, data):
        """Replace this node's text with *data*."""
        self.data = data
class htmlsnif (HTMLParser):
    """Build a tree of HtmlTag nodes, plus lookup tables, from HTML input.

    State kept on the instance:
        _TagTree: Root node ('Root') of the tree.
        _ActiveTag: Innermost tag that is currently open.
        _ParentTag: Parent of _ActiveTag (the root while nothing is open).
        _TagTile: Every node created, in the order encountered.
        _TagCatalog: Dict of tag name -> list of nodes with that name;
            comments are stored under 'comment'.
        _TagTitle / _TagBody: Last <title> / <body> node seen, or None.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self._ActiveTag = HtmlTag(None, 'Root')
        self._TagTile = []
        self._TagTree = self._ActiveTag
        self._TagCatalog = {}
        self._ParentTag = self._ActiveTag
        self._TagTitle = None
        self._TagBody = None

    def _new_tag(self, parent, tagname, attrs=()):
        """Create a node, copy *attrs* onto it and register it in the tables."""
        node = HtmlTag(parent, tagname)
        for k, v in attrs:
            node.setattr(k, v)
        self._TagTile.append(node)
        self._TagCatalog.setdefault(tagname, []).append(node)
        return node

    def handle_starttag(self, tag, attrs):
        """Open a new node below the active tag and make it the active tag."""
        newtag = self._new_tag(self._ActiveTag, tag, attrs)
        self._ActiveTag.addchild(newtag)
        self._ParentTag = self._ActiveTag
        self._ActiveTag = newtag
        lowered = tag.lower()
        if lowered == 'title':
            self._TagTitle = newtag
        elif lowered == 'body':
            self._TagBody = newtag
        print(tag)

    def handle_endtag(self, tag):
        """Close the active tag and step back to its parent."""
        if self._ActiveTag.parent is None:
            # Stray end tag with nothing open: stay on the root instead of
            # walking past it, which would leave the active tag as None and
            # crash on the next event.
            return
        self._ActiveTag = self._ActiveTag.parent
        self._ParentTag = self._ActiveTag.parent

    def handle_data(self, data):
        """Store *data* on the active tag.

        NOTE(review): this replaces any text stored earlier on the same
        tag, so only the last chunk delivered for a tag is kept.
        """
        self._ActiveTag.setdata(data)

    def handle_startendtag(self, tag, attrs):
        """Add a childless node below the active tag; the active tag is kept."""
        newtag = self._new_tag(self._ActiveTag, tag, attrs)
        self._ActiveTag.addchild(newtag)

    def handle_comment(self, data):
        """Register a 'comment' node whose parent is the root.

        The node is not appended to the root's children; it is reachable
        through _TagTile and _TagCatalog['comment'] only.
        """
        newtag = self._new_tag(self._TagTree, 'comment')
        newtag.setdata(data)
#!/usr/bin/env python
import sys
import urllib
import HTMLParser
class CustomParser(HTMLParser.HTMLParser):
    """Print the text found at a few fixed tag paths.

    Only tags listed in ``selected`` are tracked. The currently open
    tracked tags form a path such as 'table/tr/td'; text is printed,
    together with the tag stack, when that path is one of the wanted ones.
    """

    selected = ('table', 'h1', 'font', 'ul', 'li', 'tr', 'td', 'a')

    # Tag paths whose text is printed by handle_data().
    _wanted_paths = (
        'table/tr/td',
        'table/tr/td/h1/font',
        'table/tr/td/ul/li',
    )

    def reset(self):
        """Reset the base parser and start with an empty tag stack."""
        HTMLParser.HTMLParser.reset(self)
        self._level_stack = []

    def handle_starttag(self, tag, attrs):
        """Push *tag* on the stack when it is one of the tracked tags."""
        if tag in CustomParser.selected:
            self._level_stack.append(tag)

    def handle_endtag(self, tag):
        """Pop *tag* only when it is tracked and is the innermost open tag."""
        if self._level_stack \
                and tag in CustomParser.selected \
                and tag == self._level_stack[-1]:
            self._level_stack.pop()

    def handle_data(self, data):
        """Print the tag stack and *data* when the current path is wanted."""
        if "/".join(self._level_stack) in self._wanted_paths:
            # A single formatted argument prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("%s %s" % (self._level_stack, data))
# Query ip138 (optionally for the IP given as first command-line argument)
# and print the selected parts of the returned page.
if len(sys.argv) > 1:
    params = urllib.urlencode({'ip': sys.argv[1], 'action': 2})
else:
    params = None
page = urllib.urlopen('http://www.ip138.com/ips8.asp', params)
try:
    # The response bytes are decoded as GB2312 before parsing.
    content = page.read().decode('GB2312')
finally:
    # Close the response even when reading or decoding fails.
    page.close()
parser = CustomParser()
parser.feed(content)
parser.close()