Python HTML 解析

下面以获取 IP 地址的物理位置为例作简要记录,以备后用:

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import urllib2, HTMLParser, re

class IPParser(HTMLParser.HTMLParser):
	"""Fetch the ip138 lookup page and print the text of its <center> element.

	Constructing an instance does the whole job: the page is downloaded, fed
	through the parser, and the text captured inside <center> is printed
	re-encoded as UTF-8. If anything goes wrong, "So sorry!" is printed
	instead of a traceback.

	Attributes:
		flag: name of the most recently opened tag, or None when no tag has
			been opened yet or a </center> has just been seen.
		code: charset used to decode the captured text; replaced by the
			value declared in the page's <meta> tag when one is found.
		info: text most recently captured right after a <center> start tag,
			or None if nothing has been captured.
	"""

	# Class-level defaults: the handlers can be called before any tag has
	# been seen (e.g. for leading whitespace), so these must always exist.
	flag = None
	code = "GB2312"  # fallback when the page declares no charset
	info = None

	def __init__(self):
		HTMLParser.HTMLParser.__init__(self)

		url = "http://iframe.ip138.com/ic.asp"
		try:
			fp = urllib2.urlopen(url, timeout=5)
			try:
				source = fp.read()
			finally:
				# Release the connection even when read() fails.
				fp.close()
			self.feed(source)
		except Exception:
			# Best-effort script: report any network/parse/decode failure
			# with one line, but let KeyboardInterrupt/SystemExit through.
			print("So sorry!")

	def handle_starttag(self, tag, attrs):
		"""Remember the opened tag and pick up the charset from <meta> tags.

		Both ``<meta charset="...">`` and
		``<meta content="text/html; charset=...">`` are recognised.
		"""
		self.flag = tag
		if tag != "meta":
			return
		for name, value in attrs:
			if not value:
				# Attributes without a value are reported as None.
				continue
			if name == "charset":
				self.code = value
				return
			# "+" (not "*") so an empty charset is never stored.
			found = re.search(r"charset=([\w-]+)", value)
			if found:
				self.code = found.group(1)
				return

	def handle_data(self, data):
		"""Keep *data* when it directly follows a <center> start tag."""
		if self.flag == "center":
			self.info = data

	def handle_endtag(self, tag):
		"""On </center>, print the captured text as UTF-8."""
		if tag != "center":
			return
		# Stop capturing so text after </center> cannot overwrite info.
		self.flag = None
		if self.info is None:
			return
		text = self.info
		if isinstance(text, bytes):
			# Raw page bytes: decode with the charset the page declared.
			text = text.decode(self.code)
		# Single parenthesised argument keeps this a plain print statement.
		print(text.encode("UTF-8"))
	

if __name__ == "__main__":
	# Instantiating IPParser is the whole program: its constructor downloads
	# the page, parses it and prints the result; the instance is discarded.
	IPParser()

更详细的用法请参考官方文档: http://docs.python.org/2/library/htmlparser.html

上面的获取方式还可以用正则表达式简化为:

# Same lookup without a parser class: download the page, decode it as
# GB2312 and pull the text between <center> and </center> with a regex.
response = urllib2.urlopen("http://iframe.ip138.com/ic.asp", timeout=5)
try:
	html = response.read().decode("GB2312")
finally:
	# Release the connection even when read()/decode() fails.
	response.close()
matches = re.findall(r"<center>(.*)</center>", html)
if matches:
	print(matches[0].encode("UTF-8"))
else:
	# The page no longer contains a single-line <center> element.
	print("So sorry!")


你可能感兴趣的:(Python编程)