使用 HTMLParser 解析 HTML 的 title 和 body
from htmlentitydefs import entitydefs
import HTMLParser
class TitleParser(HTMLParser.HTMLParser):
    """Collect and print the text inside ``<title>`` and ``<body>`` elements.

    While one of the tags listed in ``handledtags`` is open, all character
    data (including decoded entity and character references) is accumulated
    in ``self.data``.  When the matching end tag arrives, a line of the form
    ``tag:text`` is printed.
    """

    def __init__(self):
        self.taglevels = []
        self.handledtags = ['title', 'body']
        # Name of the handled tag currently being collected, or None.
        self.processing = None
        # Start with an empty buffer so gettitle() is safe to call before
        # any handled tag has been seen.
        self.data = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when a handled tag opens."""
        if tag in self.handledtags:
            self.data = ''
            self.processing = tag

    def handle_data(self, data):
        """Append character data to the buffer while a handled tag is open."""
        if self.processing:
            self.data += data

    def handle_endtag(self, tag):
        """Print ``tag:text`` when the tag being collected closes."""
        if tag == self.processing:
            # Use this instance's own buffer; the previous code reached for a
            # module-level variable and broke for any other instance.
            # Single-argument print(...) works as a statement or a function.
            print(str(tag) + ':' + str(self.gettitle()))
            self.processing = None

    def handle_entityref(self, name):
        """Decode a named entity such as ``&amp;``; keep unknown ones verbatim."""
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Decode a numeric character reference (``&#65;`` or ``&#x41;``).

        References that are malformed or outside the range 1..255 are
        ignored.
        """
        try:
            # Hexadecimal references arrive with a leading 'x' / 'X'.
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        if charnum < 1 or charnum > 255:
            return
        self.handle_data(chr(charnum))

    def gettitle(self):
        """Return the text collected for the most recent handled tag."""
        return self.data
# Parse test1.html and print its title and body text.  The ``with`` block
# guarantees the file handle is closed even if parsing raises.
with open('test1.html') as fd:
    tp = TitleParser()
    tp.feed(fd.read())
XML解析
"""
解析XML文件
1.Element XML树的节点
2.Text代表文本,包括Element的换行符
3.scanNode为一递归函数,如果当前的节点有子节点,进行递归调用
4.Node的类型
ELEMENT_NODE = 1
ATTRIBUTE_NODE = 2
TEXT_NODE = 3
CDATA_SECTION_NODE = 4
ENTITY_REFERENCE_NODE = 5
ENTITY_NODE = 6
PROCESSING_INSTRUCTION_NODE = 7
COMMENT_NODE = 8
DOCUMENT_NODE = 9
DOCUMENT_TYPE_NODE = 10
DOCUMENT_FRAGMENT_NODE = 11
NOTATION_NODE = 12
"""
from xml.dom import minidom,Node
# No-op: evaluates the TEXT_NODE constant and discards the result.
Node.TEXT_NODE
def scanNode(node, level=0):
    """Recursively print the DOM subtree rooted at *node*.

    Each node is printed on its own line as its class name (followed by
    ``,tag<tagName>`` for element nodes), indented by four spaces per
    nesting level plus one separating space.

    Args:
        node: A DOM node, e.g. the document returned by ``minidom.parse``.
        level: Current nesting depth; callers normally leave this at 0.
    """
    msg = node.__class__.__name__
    if node.nodeType == Node.ELEMENT_NODE:
        msg += ",tag" + node.tagName
    # One pre-joined argument keeps the output identical whether print is a
    # statement or a function (indent, one space, then the message).
    print(" " * level * 4 + " " + msg)
    # hasChildNodes is a method; it must be called, otherwise the bound
    # method object itself is always truthy.
    if node.hasChildNodes():
        for child in node.childNodes:
            scanNode(child, level + 1)
# Parse JCSample.xml into a DOM document and print its node tree,
# starting from the document node.
doc = minidom.parse("JCSample.xml")
scanNode(doc)
使用DOM解析XML
from xml.dom import minidom, Node
import re, textwrap
########################################################################
class SampleScanner:
    """Walk a parsed ``<book>`` document and print its main fields.

    Constructing an instance immediately scans the document: for every
    top-level ``<book>`` element it prints the book title, the author's
    name and affiliation, and for each chapter its number, title and the
    company mentioned in each paragraph.
    """

    def __init__(self, doc):
        """Scan *doc*, which must be a ``minidom.Document``."""
        assert isinstance(doc, minidom.Document)
        for child in self._child_elements(doc):
            if child.tagName == "book":
                self.handle_book(child)

    @staticmethod
    def _child_elements(node):
        """Yield the direct children of *node* that are element nodes."""
        for child in node.childNodes:
            if child.nodeType == Node.ELEMENT_NODE:
                yield child

    def handle_book(self, node):
        """Print the title of a ``<book>`` and dispatch authors/chapters."""
        for child in self._child_elements(node):
            if child.tagName == "title":
                # NOTE: the "titile" spelling is kept so output is unchanged.
                print("Book titile is: %s" % self.gettext(child.childNodes))
            elif child.tagName == "author":
                self.handle_author(child)
            elif child.tagName == "chapter":
                self.handle_chapter(child)

    def handle_chapter(self, node):
        """Print a chapter's number and title, then handle its paragraphs."""
        print("number: %s" % node.getAttribute("number"))
        # getElementsByTagName searches all descendants, not just children.
        print("title: %s" % self.gettext(node.getElementsByTagName("title")))
        for child in self._child_elements(node):
            if child.tagName == "para":
                self.handle_chapter_para(child)

    def handle_chapter_para(self, node):
        """Print the text of every ``<company>`` found inside a paragraph."""
        company = self.gettext(node.getElementsByTagName("company"))
        print("chapter:para:company %s" % company)

    def handle_author(self, node):
        """Print the author's name and affiliation."""
        for child in self._child_elements(node):
            if child.tagName == "name":
                self.handle_author_name(child)
            elif child.tagName == "affiliation":
                print("affiliation: %s" % self.gettext(child.childNodes))

    def handle_author_name(self, node):
        """Print the ``<first>`` and ``<last>`` parts of an author name."""
        first = ""
        last = ""
        for child in self._child_elements(node):
            if child.tagName == "first":
                first = self.gettext(child.childNodes)
            elif child.tagName == "last":
                last = self.gettext(child.childNodes)
        print("firstname:%s,lastname:%s" % (first, last))

    def gettext(self, nodelist):
        """Return the concatenated text of *nodelist* with whitespace collapsed.

        Text and CDATA nodes contribute their own data; any other node
        contributes the text of its descendants.  Every run of whitespace
        is replaced by a single space (leading/trailing runs included).
        """
        parts = []
        for node in nodelist:
            if node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
                # Use the node's own data: wholeText already includes the
                # adjacent text/CDATA siblings, so appending it per node
                # would duplicate text.
                parts.append(node.data)
            elif node.hasChildNodes():
                parts.append(self.gettext(node.childNodes))
        return re.sub(r'\s+', " ", ''.join(parts))
if __name__ == "__main__":
    # Script entry point: parse simple.xml and scan it.  Constructing the
    # scanner is what triggers the printing.
    doc = minidom.parse("simple.xml")
    sample = SampleScanner(doc)
simple.xml 的内容如下:
<?xml version="1.0" ?>
<!--Simple xml document__chapter 8-->
<book>
<title>
sample xml thing
</title>
<author>
<name>
<first>
ma
</first>
<last>
xiaoju
</last>
</name>
<affiliation>
Springs Widgets, Inc.
</affiliation>
</author>
<chapter number="1">
<title>
First
</title>
<para>
I think widgets are great. You should buy lots of them from
<company>
Springy Widgets, Inc
</company>
</para>
</chapter>
</book>