"""
成语接龙助手

传入指定的字,即可查找所有以该字开头的成语。
有时候允许同音字:先将该字转换成带音调的拼音(拼音与音调拼接在一起),再用 like 对 pinyin_tone 字段做模糊查找。
有时候允许不同音调(谐音):只需将该字转换成不带音调的拼音,按 pinyin 字段查找即可。

项目缺陷:
字段设计问题:多音字有多个拼音,同音查询时使用 like 关键字,导致查找结果不够精确。
例如'唉声叹气'中'唉'的拼音包含'ai4',会与'爱'字的查询结果冲突。
"""
import requests
from bs4 import BeautifulSoup
import time
import string
import pymysql
from pypinyin import pinyin, Style, lazy_pinyin
# Module-level connection and cursor shared by the crawler functions:
# pull_chengyu_by_sequence runs its inserts through them and chengyudaquan
# closes the connection when the crawl is over.
# NOTE(review): the credentials are hard-coded here; consider loading them
# from configuration instead.
db = pymysql.connect(
    host='localhost',
    user='root',
    passwd='123456',
    db='forchengyu',
)
cursor = db.cursor()
#
def chengyudaquan():
    """Crawl the idiom index of every letter a-z and store each page's idioms.

    For each lowercase ASCII letter the first index page is downloaded to read
    the page count from the ``span.pageinfo > strong`` element; every page
    from 1 up to that count is then passed to ``pull_chengyu_by_sequence``.
    When the expected elements are missing (``AttributeError``), the letter is
    reported and skipped. The module-level ``db`` connection is closed when
    the crawl ends, even if an unexpected error aborts it.
    """
    try:
        for letter in string.ascii_lowercase:
            try:
                first_page = 'http://www.xxx.com/zimu-%s-p1.html' % letter
                resp = requests.get(first_page)
                # Decode the body with the detected charset. This replaces the
                # encode/decode round trip, which raised TypeError whenever
                # the server declared no charset (resp.encoding is None).
                resp.encoding = resp.apparent_encoding
                bs = BeautifulSoup(resp.text, 'html.parser')
                print('即将爬取的网页: ' + first_page)
                # Number of result pages available for this letter.
                pageCount = int(bs.find(name='span', class_='pageinfo').find('strong').string)
                for page in range(1, pageCount + 1):
                    url = 'http://www.xxx.com/zimu-%s-p%s.html' % (letter, page)
                    print(url)
                    pull_chengyu_by_sequence(url)  # store the idioms of this page
            except AttributeError:
                # The message used to be printed with the letter passed as a
                # second print() argument, leaving a literal "%s" in the output.
                print('解析出错,没有 %s 开头的成语' % letter)
    finally:
        db.close()
# 具体爬取成语的函数
def pull_chengyu_by_sequence(url):
    """Download one index page and insert every idiom it lists into ``chengyu``.

    Each ``<a>`` inside the first ``<dd>`` of the ``col-md-8 content-left``
    container provides one row: its text is the idiom, its ``title`` the
    description and its ``href`` the link to the detail page. Rows are written
    through the module-level ``cursor``/``db`` and committed one by one, so a
    failing row is rolled back and reported without stopping the page.

    Args:
        url: Address of the index page to download.
    """
    resp = requests.get(url)
    print(resp)
    # str() keeps these diagnostics from raising when no charset was
    # declared or detected (either attribute may be None).
    print('编码格式: ' + str(resp.encoding))
    print('解码格式: ' + str(resp.apparent_encoding))
    # Decode the body with the detected charset instead of re-encoding text
    # that was decoded with the declared one.
    resp.encoding = resp.apparent_encoding
    bs = BeautifulSoup(resp.text, 'html.parser')
    # Placeholders let the driver escape quotes and backslashes in the
    # scraped text; the previous string-built statement broke on them.
    sql = "insert into chengyu(name,detail,href) values(%s,%s,%s)"
    try:
        # Locate the anchors of the idiom list.
        current_dds = bs.find(name='div', class_='col-md-8 content-left').find('dd').find_all('a')
        print('当前页有: ' + str(len(current_dds)) + ' 个成语')
        for link in current_dds:
            name = link.string          # idiom text
            detail = link.get('title')  # short description
            href = link.get('href')     # link to the full explanation
            print(name)
            print(detail)
            try:
                cursor.execute(sql, (str(name), str(detail), str(href)))
                db.commit()
                print('插入成功!')
            except Exception as exc:
                # Still best-effort per row, but the failure is now reported
                # instead of being silently discarded.
                db.rollback()
                print('插入失败: %s' % exc)
    except AttributeError:
        print('解析出错,结束此类查询')
# 获取每个成语的拼音和音调
def updatePinyin():
    """Fill the ``letter``, ``pinyin`` and ``pinyin_tone`` columns of every idiom.

    The values are derived from the first character of each idiom name:
    ``pinyin`` is its toneless reading, ``letter`` the first letter of that
    reading, and ``pinyin_tone`` all tone-numbered readings (heteronyms
    included), each followed by a single space. Consecutive idioms starting
    with the same character reuse the previously computed values. Every row
    is committed on its own; a failing row is rolled back and reported.
    """
    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        # Only the name is needed, so do not depend on the table's column order.
        cursor.execute("select name from chengyu")
        rows = cursor.fetchall()
        print(len(rows))
        # Placeholders instead of string formatting, so names containing
        # quotes cannot break or alter the statement.
        sql = "update chengyu set letter=%s,pinyin=%s,pinyin_tone=%s where name=%s"
        lastWord = ''  # first character of the previous idiom
        letter = ''    # first letter of the toneless pinyin
        pyText = ''    # toneless pinyin
        pyTone = ''    # tone-numbered readings, each followed by a space
        for row in rows:
            name = row[0]
            if not name:
                # An empty or NULL name has no first character to convert.
                continue
            word = name[0]
            if lastWord == word:
                print('和上一个一样')
            else:
                print(word)
                pyText = lazy_pinyin(word)[0]
                letter = pyText[0]
                # pinyin() returns one list of readings per character.
                readings = pinyin(word, style=Style.TONE3, heteronym=True)
                pyTone = ''.join(r + ' ' for group in readings for r in group)
            print(letter)
            print(pyText)
            print(pyTone)
            params = (str(letter), str(pyText), str(pyTone), name)
            # Show the statement exactly as it will be sent to the server.
            print(cursor.mogrify(sql, params))
            try:
                cursor.execute(sql, params)
                db.commit()
                print('更新成功!')
            except Exception as exc:
                db.rollback()
                print('更新失败: %s' % exc)
            lastWord = word
        print('更新完成')
    finally:
        # Release the connection even when the run is interrupted by an error.
        db.close()
# 看看数据库收录了多少个成语
def selectCount():
    """Print how many rows the ``chengyu`` table contains."""
    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        # Let the server count instead of transferring every row to the client.
        cursor.execute("select count(*) from chengyu")
        print(cursor.fetchone()[0])
    finally:
        # The connection was previously left open.
        db.close()
# 指定查找
def search_by_word(word, limit):
    """Print four-character idioms whose name starts with ``word``.

    Args:
        word: Prefix the idiom name must start with. It is used as a LIKE
            prefix pattern, so ``%`` and ``_`` inside it act as wildcards.
        limit: Maximum number of rows to print.
    """
    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        # Values are passed as parameters so that quotes in ``word`` cannot
        # break or alter the statement.
        sql = "select * from chengyu where name like %s and char_length(name) = 4 limit %s"
        cursor.execute(sql, ('%s%%' % word, int(limit)))
        for row in cursor.fetchall():
            print(row)
    finally:
        db.close()
# 同音查找
def search_by_tone(word, limit):
    """Print idioms whose first character sounds like ``word``, tone included.

    ``word`` is converted to its tone-numbered readings (heteronyms included),
    each followed by a space, the same format ``updatePinyin`` writes to the
    ``pinyin_tone`` column. Rows match when their ``letter`` equals the first
    letter of the toneless reading and ``pinyin_tone`` contains that string.
    Because this is a substring match, characters with several readings can
    produce imprecise results.

    Args:
        word: Character to find homophones for.
        limit: Maximum number of rows to print.
    """
    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        pyText = lazy_pinyin(word)[0]
        letter = pyText[0]
        readings = pinyin(word, style=Style.TONE3, heteronym=True)
        # Keep the trailing space after every reading: stored values use it too.
        pyTone = ''.join(r + ' ' for group in readings for r in group)
        print(pyTone)
        print(letter)
        # Parameters instead of string formatting; the LIKE wildcards are part
        # of the bound value.
        sql = "select * from chengyu where letter=%s AND pinyin_tone like %s limit %s"
        cursor.execute(sql, (letter, '%%%s%%' % pyTone, int(limit)))
        for row in cursor.fetchall():
            print(row)
    finally:
        db.close()
# 谐音查找,即 1234 音调都可以
def search_by_pinyin(word,limit):
    """Print idioms whose first character shares ``word``'s pinyin, any tone.

    Args:
        word: Character whose toneless pinyin is matched against the
            ``pinyin`` column.
        limit: Maximum number of rows to print.
    """
    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        pyText = lazy_pinyin(word)[0]
        letter = pyText[0]
        print('首字母: ' + letter)
        print('拼音: ' + pyText)
        # Bound parameters replace the string-built statement.
        sql = "select * from chengyu where pinyin=%s limit %s"
        cursor.execute(sql, (pyText, int(limit)))
        for row in cursor.fetchall():
            print(row)
    finally:
        db.close()
"""
爬取成语数据,需要现在mysql数据库创建一下内容:
database: forchengyu
table: chengyu {
name varchar(50),
detail varchar(255),
href varchar(255),
letter varchar(1),
pinyin varchar(50),
pinyin_tone varchar(50)
}
"""
# Crawl the idiom data into the database.
# chengyudaquan()

# Compute the first letter, the plain pinyin and the tone-numbered pinyin of
# every idiom; this takes roughly 600 seconds.
# updatePinyin()

# Print how many idioms have been collected.
# selectCount()

# Look up idioms that start with the given character.
# search_by_word('离',50)

# Homophone lookup (same pinyin and tone).
search_by_tone('爱',50)

# Near-homophone lookup (same pinyin, any tone).
# search_by_pinyin('爱',50)