hd3

Scrape the detailed explanations, Kangxi Dictionary (康熙字典), Shuowen Jiezi (说文解字), and glyph-evolution information.

#---------------------------------------
#   Scrape the remaining dictionary information and save it
#---------------------------------------

import urllib.request
import re
from bs4 import BeautifulSoup

# Open the file of page URLs and scrape each page

def scrape(zurl):
    print(zurl)
    req = urllib.request.Request(zurl)
    # spoof the Referer and override urllib's default User-Agent,
    # otherwise zdic.net may refuse the request
    req.add_header('Referer', 'http://www.zdic.net/z/jbs/')
    req.add_header('User-Agent', 'None')
    responseb = urllib.request.urlopen(req)
    # decode the raw bytes so BeautifulSoup gets a str
    index_z = responseb.read().decode('utf8')
    # parse the page for the dictionary explanations
    soup = BeautifulSoup(index_z, "html.parser")

    # grab the dictionary-explanation blocks
    tab_page = soup.find_all(attrs={'class': 'tab-page'})
    # extract the hexadecimal code point from the URL
    keyq = re.split(r'[/.]', zurl)[-2]
    if len(keyq) > 4:
        keyq = keyq[1:]  # codes longer than 4 hex digits carry a one-character prefix; drop it
    # hex code point -> the character itself
    key = chr(int(keyq, 16))

    # add an index: tag each fragment with its character
    for tab_page_item in tab_page:
        tab_page_item['key'] = key
        # write() wants a str, so stringify the Tag first
        hdfile.write(str(tab_page_item) + '\n')
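
# A quick worked example of the code-point extraction above, using a
# hypothetical URL shaped the way the script expects (only the trailing
# hex token matters):
#   re.split(r'[/.]', 'http://www.zdic.net/z/1a/js/4E2D.htm')[-2]  -> '4E2D'
#   chr(int('4E2D', 16))                                           -> '中' (U+4E2D)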

# output file for the scraped data

hdfile = open("hdkangxizidian", "w", encoding="utf-8")

# read the page URLs from the address file

infile = open('zurlkangxi', 'r')
for zurl in infile.read().split('\n'):
    if len(zurl) != 0:
        scrape(zurl)
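
# The address file 'zurlkangxi' is expected to contain one page URL per
# line (blank lines are skipped); hypothetical contents:
#   http://www.zdic.net/z/1a/js/4E2D.htm
#   http://www.zdic.net/z/1a/js/56FD.htm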

# close the files before exiting
hdfile.close()
infile.close()
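
Because every saved fragment carries the key attribute added above, the output file can later be queried per character. A minimal read-back sketch, assuming the file name and attribute used by the script (the lookup character '中' is just an example):

from bs4 import BeautifulSoup

with open("hdkangxizidian", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

# all tab-page fragments saved for the character '中'
for entry in soup.find_all(attrs={'key': '中'}):
    print(entry.get_text()[:80])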


