Python解析已下载html文件

离线解析已下载到本地的百度百科“百度百科”词条页面，提取其中的各级标题：

#encoding:UTF-8
#_Author_:Ibsen

import urllib2
from sgmllib import SGMLParser
 
class ListName(SGMLParser):
	def __init__(self):
		SGMLParser.__init__(self)
		self.is_h1=False
		self.flag=False
		self.getdata=False
		self.name = []
	def start_h1(self,attrs):
		self.is_h1=True
	def end_h1(self):
		self.is_h1=False
	def start_span(self, attrs):
		for k,v in attrs:
			if k=='class' and v=='title-text':
				self.flag=True;
				return 
	def end_span(self):
		self.flag=False
	def handle_data(self, text):
		if self.is_h1:
			self.name.append(text)
		if self.flag:
			self.name.append(text)
 
# Load the previously downloaded page through a file:// URL and make sure
# the handle is released even if reading fails.
response = urllib2.urlopen('file:///C:/Users/John/Desktop/1.html')
try:
    content = response.read()
finally:
    response.close()

listname = ListName()
listname.feed(content)
# close() forces the parser to process any data it is still buffering, so
# text at the very end of the document is not lost.
listname.close()

for item in listname.name:
    print(item)




运行结果:

Python解析已下载html文件_第1张图片



提取主标题，以及第三个<h2>标题出现之前的正文段落（即第一个<h2>之前的段落和前两个<h2>小节下的内容）：

#encoding:UTF-8
#_Author_:Ibsen

import sys
import urllib2
from sgmllib import SGMLParser
 
class ListName(SGMLParser):
	def __init__(self):
		SGMLParser.__init__(self)
		self.is_h1=False #标记<h1>
		self.f_div=False #标记<div>
		self.div_cnt=0 #计数div出现的次数,防止<div>嵌套出现
		self.f_divp=False  #标记<div class='para'>标签中的内容输出
		self.id=0 #纪录para对应的</div>的编号
		self.f_divd=False #标记<div class='description'>标签:此标签中内容不输出
		self.idd=0 #纪录description对应的</div>的编号
		self.f_h2=False #标记<h2>
		self.cnt=0 #计数<h2>出现的次数,只解析前两个
		self.f_sup=False #标记<sup>,此标签中的内容不保留
		self.name = [] #提取内容放入链表中

 
	#提取<h1>标签中的内容
	def start_h1(self,attrs):
		self.is_h1=True
	def end_h1(self):
		self.is_h1=False


	#提取<div class="para">标签中的内容
	def start_div(self,attrs):	
		self.f_div=True
		self.div_cnt+=1 
		for k,v in attrs:
			if k=='class' and v=='para':
				self.f_divp=True
				self.id=self.div_cnt
		for k,v in attrs:
			if k=='class' and v=='description':
				self.f_divd=True
				self.idd=self.div_cnt
	def end_div(self):
		if self.div_cnt==self.id:
			self.f_divp=False
		if self.div_cnt==self.idd:
			self.f_divd=False
		if self.div_cnt==0:
			self.f_div=False
		else:
			self.div_cnt-=1

	#纪录<h2>标签出现的次数
	def start_h2(self,attrs):
		for k,v in attrs:
			if k=='class' and v=='para-title level-2':
				self.cnt+=1
				self.f_h2=True;
	def end_h2(self):
		self.f_h2=False

	
	#<sup>标签中的内容不保留
	def start_sup(self,attrs):
		self.f_sup=True
		if self.f_divp:
			self.f_divp=False
	def end_sup(self):
		if self.f_sup and self.f_divp==False:
			self.f_divp=True
		self.f_sup=False


	def handle_data(self, text):
		if self.is_h1:
			self.name.append(text)
		if self.cnt<=2:
			if self.f_divp and self.f_divd==False:
				self.name.append(text) 
 
# Load the previously downloaded page through a file:// URL and make sure
# the handle is released even if reading fails.
response = urllib2.urlopen('file:///C:/Users/John/Desktop/1.html')
try:
    content = response.read()
finally:
    response.close()

listname = ListName()
listname.feed(content)
# close() forces the parser to process any data it is still buffering, so
# text at the very end of the document is not lost.
listname.close()

# Write one extracted chunk per line straight to the output file.  Writing
# directly (instead of temporarily replacing sys.stdout) guarantees the file
# is closed and stdout is left untouched even if an error occurs.  The raw
# string keeps the backslashes in the Windows path literal.
with open(r'C:\Users\John\Desktop\oput.txt', 'w') as outputfile:
    for item in listname.name:
        outputfile.write(item + '\n')





你可能感兴趣的:(Python解析已下载html文件)