import os
import requests
from bs4 import BeautifulSoup
# Build, for every code point in the CJK Unified Ideographs block
# (U+4E00 .. U+9FA5 inclusive), the character itself and the dictionary
# page URL the site keys by the lowercase hex code point (e.g. "4e00").
urls = []
chinese = []
for code_point in range(0x4E00, 0x9FA6):
    # chr() yields the character directly; the original round trip through
    # a unicode-escape byte string (b"\u" + hex digits) was fragile, and
    # str(hex(i)).lstrip("0x") strips *characters* '0'/'x', not the "0x"
    # prefix — only accidentally correct in this range.
    chinese.append(chr(code_point))
    urls.append("http://www.guoxuedashi.net/zidian/{:x}.html".format(code_point))
print("共有{}个所要请求的url 以及 对应的{}个汉字".format(len(urls), len(chinese)))
# HTTP headers sent with every request; a real browser User-Agent makes the
# crawler less likely to be rejected as an obvious bot.
request_header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}
# Root directory on Windows where one sub-folder per character is created.
# NOTE(review): "\汉" is not a recognized escape so the backslash survives
# literally here, but a raw string (r"E:\...") would be safer — confirm.
path = "E:\汉字数据集"
# For each character: fetch its dictionary page, follow the "详情" (details)
# link, and save every glyph image found there into a per-character folder.
for (i, url), character in zip(enumerate(urls), chinese):
    # BUG FIX: the original passed request_header positionally, which
    # requests.get() binds to `params` — the headers were never sent.
    response = requests.get(url, headers=request_header, verify=False)
    print("******************第 {} 个 url --- {}".format(i+1, url), response, "******************")
    # Re-guess the encoding from the body before decoding the HTML.
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "html.parser")

    # Locate the anchor whose child text is the "详情" link and build the
    # absolute URL of the character's detail page.
    href = "http://www.guoxuedashi.net"
    for a in soup.find_all("a"):
        if ' 详情\r\n ' in list(a.children):
            href += a["href"]
            # BUG FIX: the original kept concatenating every matching href,
            # which yields an invalid URL if more than one anchor matches.
            break

    r = requests.get(href, headers=request_header, verify=False)
    print("******************", href, r, "******************")
    r.encoding = r.apparent_encoding
    detail_soup = BeautifulSoup(r.text, "html.parser")

    # Glyph images on the detail page are the 80px-wide <img> tags.
    imgs = detail_soup.find_all("img", {"width": "80"})
    print("****当前汉字\"{}\"文件夹***共有{}张汉字图片***正在生成...****".format(character, len(imgs)))
    if len(imgs) == 0:
        continue

    # Create the character's folder once (hoisted out of the per-image
    # loop, where the original re-checked it for every download).
    chinese_path = os.path.join(path, character)
    if not os.path.exists(chinese_path):
        os.makedirs(chinese_path)

    # Download each image as <index>.png inside the character's folder.
    for j, img in enumerate(imgs):
        rp = requests.get(img["src"], headers=request_header, verify=False)
        img_path = os.path.join(chinese_path, str(j+1) + '.png')
        with open(img_path, 'wb') as pic:  # write-only; 'wb+' read access was unused
            pic.write(rp.content)
    print("****当前汉字\"{}\"文件夹***共有{}张汉字图片***生成完毕!!!****".format(character, len(imgs)))
# 执行结果 (execution result) — stray caption pasted from the source article;
# commented out so the file parses as Python.