下载词典列表地址

简化不必要的细节,现实版的实践?自觉地优化代码 o(∩_∩)o

# -*- coding: utf-8 -*-
"""Crawl zdic.net: for every radical, walk radical -> character -> word-list
pages and append each word page's absolute URL to an output file."""
import re
import urllib.request
from bs4 import BeautifulSoup  # NOTE(review): unused in this block; kept in case later code needs it

# Percent-encoded CJK character token: three %XX byte groups, i.e. one
# UTF-8-encoded character such as %E4%B8%80.
bspattern = re.compile(r"(?:%[^%']{2}){3}")
# Word-page path like /c/.../xxx.htm. Currently unused — the word pages are
# matched inline below with a /z/ pattern; TODO confirm which one is correct.
cipattern = re.compile(r"/c/[^']*?htm")

# 1. Fetch the radical index page and extract the radical tokens.
req1 = urllib.request.Request('http://www.zdic.net/c/cibs/')
with urllib.request.urlopen(req1) as response1:
    index_page1 = response1.read().decode('utf8')
bslist = re.findall(bspattern, index_page1)

# 2. For each radical, fetch its character list; for each character, fetch its
# word list and record the word-page URLs.
# BUG FIX: the original wrote to `outfile` without ever opening it — open it
# here (the `with` block also replaces the original bare outfile.close()).
with open('cilist.txt', 'w', encoding='utf8') as outfile:
    for bu in bslist:
        print(bu)
        # BUG FIX: the original concatenated the undefined name `b`;
        # the loop variable is `bu`.
        bu_url = "http://www.zdic.net/c/cibs/bs/?bs=" + bu
        reqb = urllib.request.Request(bu_url)
        reqb.add_header('Referer', 'http://www.zdic.net/c/cibs/')
        with urllib.request.urlopen(reqb) as responseb:
            index_z = responseb.read().decode('utf8')
        # Character tokens on the radical page use the same %XX%XX%XX form.
        zlist = re.findall(bspattern, index_z)
        for z in zlist:
            if len(z) != 0:
                z_url = "http://www.zdic.net/c/cibs/ci/?z=" + z
                print(z_url)
                reqz = urllib.request.Request(z_url)
                reqz.add_header('Referer', 'http://www.zdic.net/c/cibs/')
                with urllib.request.urlopen(reqz) as responsez:
                    index_c = responsez.read().decode('utf8')
                # Extract each word page's relative path and write the
                # absolute URL, one per line (write needs a plain string,
                # not a generator).
                clist = re.findall(r"/z/[^']*?\.htm", index_c)
                for uc in clist:
                    outfile.write("http://www.zdic.net/" + uc + '\n')


你可能感兴趣的:(下载词典列表地址)