最近想在一些网站上获取汉字的一些信息,于是简单地用 Python 做了一个小解析工具。
工具使用 SGMLParser 解析 HTML 网页,提取其中指定的一些内容。
代码中获取的是http://www.chazidian.com/r_zi_zd4e50/里面的内容。
现有另一个问题,www.chazidian.com下面可能有上万个类似的网页,我怎么能获取这上万个网页的地址呢?希望高手解答解答···
#-*- coding:utf-8 -*- import urllib2 import codecs class WordInfo: def __init__(self): self.id=''#字unicode self.word=''#字 self.hzjg=''#汉字结构 self.zzf=''#造字法 self.wb86=''#五笔86 self.wb98=''#五笔98 self.wx=''#五行 self.unicode=''#unicode self.sjhm=''#四角号码 self.cj=''#仓颉 self.gbk=''#GBK编码 self.gfhzbh=''#规范汉字编号 def print_info(self): print u'字unicode:', self.id print u'字:', self.word print u'汉字结构:', self.hzjg print u'造字法:', self.zzf print u'五笔86:', self.wb86 print u'五笔98:', self.wb98 print u'五行:', self.wx print u'unicode:', self.unicode print u'四角号码:', self.sjhm print u'仓颉:', self.cj print u'GBK编码:', self.gbk print u'规范汉字编号:', self.gfhzbh def write_file(self, fpWord): info = "%(word)s\t%(jt)s\t%(ft)s\t%(pinyin)s\t%(py)s\t%(zy)s\t%(yy)s\t%(bs)s\t%(bh)s\t%(bwbh)s\t%(stroke)s\t%(coding)s\t%(uni)s\t%(wb)s\t%(cj)s\t%(zm)s\t%(fc)s\r\n" \ % { 'word':self.word, 'jt':self.jt, 'ft':self.ft, 'pinyin':','.join(self.pinyin), 'py': ','.join(self.py), 'zy': ','.join(self.zy), 'yy': ','.join(self.yy), 'bs': self.bs, 'bh': str(self.bh), 'bwbh': str(self.bwbh), 'stroke': self.stroke, 'coding': self.coding, 'uni': self.uni, 'wb': self.wb, 'cj': self.cj, 'zm': self.zm, 'fc': self.fc } fpWord.write(info) from sgmllib import SGMLParser class ZdicParser(SGMLParser): def __init__(self): SGMLParser.__init__(self) self.word_info = WordInfo() #标记 self.word = 0#字 self.li_info = 0 self.binfo = 0 def start_div(self, attrs): if len(attrs) == 0: return if attrs[0] == ('class', 'zititle'): self.word = 1 elif attrs[0] == ('class', 'li_info'): self.li_info = 1 def end_div(self): self.word = 0 self.li_info = 0 def start_ul(self, attrs): if len(attrs) == 0: return if attrs[0] == ('class', 'binfo'): self.binfo = 1 def end_ul(self): self.binfo = 0 def handle_data(self, data): try: data = data.decode('utf8') except: data = data.decode('gbk') if data.find(u'对不起,资源或文件无法找到') >= 0: return data.strip() if self.word: self.word_info.word = data self.word_info.id = ord(data) elif self.li_info: index = data.find(u'汉字结构:') if 
index >= 0: self.word_info.hzjg = data[index+len(u'汉字结构:'):] index = data.find(u'造字法:') if index >= 0: self.word_info.zzf = data[index+len(u'造字法:'):] elif self.binfo: print data index = data.find(u'五笔86:') if index >= 0: self.word_info.wb86 = data[index+len(u'五笔86:'):] index = data.find(u'五笔98:') if index >= 0: self.word_info.wb98 = data[index+len(u'五笔98:'):] index = data.find(u'五行:') if index >= 0: self.word_info.wx = data[index+len(u'五行:'):] index = data.find(u'UniCode:') if index >= 0: self.word_info.unicode = data[index+len(u'UniCode:'):] index = data.find(u'四角号码:') if index >= 0: self.word_info.sjhm = data[index+len(u'四角号码:'):] index = data.find(u'仓颉:') if index >= 0: self.word_info.cj = data[index+len(u'仓颉:'):] index = data.find(u'GBK编码:') if index >= 0: self.word_info.gbk = data[index+len(u'GBK编码:'):] index = data.find(u'规范汉字编号:') if index >= 0: self.word_info.gfhzbh = data[index+len(u'规范汉字编号:'):] if __name__ == '__main__': url_fp = urllib2.urlopen(r'http://www.chazidian.com/r_zi_zd4e50/',timeout=10) chazidian = url_fp.read() url_fp.close() zdicParser = ZdicParser() zdicParser.feed(chazidian) zdicParser.word_info.print_info()