成语接龙助手 for python

成语接龙助手

传入指定文字,即可查找所有以此开头的成语
有时候允许同音字,那么就将该字转换成带音调的拼音来查找(将拼音和音调拼接,用 SQL 的 like 关键字匹配)
有时候允许不同音调(谐音),那么只需将该字转换成不带音调的拼音再查找即可(仅按拼音查找)

项目缺陷:

字段设计问题:多音字有多个读音,这些读音都拼接保存在同一个字段里,同音查询时使用 like 关键字匹配,导致查找结果不够精确
例如'唉声叹气'中'唉'的读音包含'ai4',查询'爱'的同音字时也会匹配到它

import requests
from bs4 import BeautifulSoup
import time
import string

import pymysql
from pypinyin import pinyin, Style, lazy_pinyin
# Shared module-level connection and cursor; chengyudaquan() and
# pull_chengyu_by_sequence() use these globals directly.
db = pymysql.connect(
    host='localhost',
    user='root',
    passwd='123456',
    db='forchengyu',
)
cursor = db.cursor()

# 
def chengyudaquan():
    """Crawl the idiom index pages for every letter a-z and store the idioms.

    For each lowercase ASCII letter the first index page is fetched and the
    page count is read from the ``span.pageinfo > strong`` element; every
    page of that letter is then passed to ``pull_chengyu_by_sequence``.
    Letters whose index page cannot be parsed are reported and skipped.

    The module-level ``db`` connection is closed when the crawl ends (also
    on error), so this function is meant to be run once per process.
    """
    try:
        for letter in string.ascii_lowercase:
            url = 'http://www.xxx.com/zimu-%s-p1.html' % (letter)
            resp = requests.get(url)
            # Decode the body with the charset detected from its bytes rather
            # than the one declared in the HTTP headers; unlike re-encoding
            # resp.text this also works when no charset was declared.
            resp.encoding = resp.apparent_encoding
            bs = BeautifulSoup(resp.text, 'html.parser')
            print('即将爬取的网页: ' + url)

            try:
                # Number of index pages for this letter.
                pageCount = int(bs.find(name='span', class_='pageinfo').find('strong').string)
            except (AttributeError, TypeError, ValueError):
                # Pager element missing or not numeric: nothing to crawl here.
                print('解析出错,没有 %s 开头的成语' % letter)
                continue

            # Crawl every page listed under this letter.
            for index in range(1, pageCount + 1):
                url = 'http://www.xxx.com/zimu-%s-p%s.html' % (letter, index)
                print(url)
                pull_chengyu_by_sequence(url)
    finally:
        db.close()

# Scrape the idioms listed on one index page and insert them into MySQL.
def pull_chengyu_by_sequence(url):
    """Fetch ``url``, extract every idiom link and insert it into ``chengyu``.

    Each ``<a>`` inside the first ``<dd>`` of the ``col-md-8 content-left``
    div yields one row: the link text (name), its ``title`` attribute
    (detail) and its ``href``.  Rows are inserted one by one through the
    module-level ``cursor``/``db``; a failing insert is rolled back and
    reported, and the remaining rows are still processed.

    Args:
        url: Address of the index page to scrape.
    """
    resp = requests.get(url)
    print(resp)
    # str() because either encoding may be None when nothing was declared
    # or detected.
    print('编码格式: ' + str(resp.encoding))
    print('解码格式: ' + str(resp.apparent_encoding))
    # Decode the body with the charset detected from its bytes.
    resp.encoding = resp.apparent_encoding
    bs = BeautifulSoup(resp.text, 'html.parser')

    try:
        # Locate the anchors in the idiom list area of the page.
        anchors = bs.find(name='div', class_='col-md-8 content-left').find('dd').find_all('a')
    except AttributeError:
        print('解析出错,结束此类查询')
        return
    print('当前页有: ' + str(len(anchors)) + ' 个成语')

    # Parameterized statement: scraped text may contain quotes, which broke
    # (and could inject into) the previous string-formatted SQL.
    sql = "insert into chengyu(name,detail,href) values(%s,%s,%s)"
    for anchor in anchors:
        name = anchor.string            # idiom text
        detail = anchor.get('title')    # short description
        href = anchor.get('href')       # link to the full explanation
        print(name)
        print(detail)
        try:
            cursor.execute(sql, (str(name), str(detail), str(href)))
            db.commit()
            print('插入成功!')
        except Exception as exc:
            # Best effort: undo this row, report it, continue with the next.
            db.rollback()
            print('插入失败: %s' % exc)

# Compute and store the first letter, pinyin and toned pinyin of every idiom.
def updatePinyin():
    """Fill the ``letter``, ``pinyin`` and ``pinyin_tone`` columns of ``chengyu``.

    Only the first character of each idiom is analysed:

    * ``pinyin`` is its toneless pinyin,
    * ``letter`` is the first letter of that pinyin,
    * ``pinyin_tone`` holds every TONE3 reading of the character (heteronyms
      included), each followed by a single space, e.g. ``'ai1 ai4 '``.

    Consecutive idioms starting with the same character reuse the values
    computed for the previous row.  A failing update is rolled back and
    reported; the remaining rows are still processed.
    """
    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        cursor.execute("select name from chengyu")
        rows = cursor.fetchall()
        print(len(rows))

        # Parameterized statement so idiom names containing quotes are safe.
        sql = "update chengyu set letter=%s,pinyin=%s,pinyin_tone=%s where name=%s"

        lastWord = ''   # first character of the previous idiom
        letter = ''     # first letter of the pinyin
        pyText = ''     # toneless pinyin
        pyTone = ''     # all toned readings, each followed by a space
        for row in rows:
            name = row[0]
            if not name:
                # NULL or empty name: there is no first character to analyse.
                continue

            word = name[0]
            if word == lastWord:
                print('和上一个一样')
            else:
                print(word)
                pyText = str(lazy_pinyin(word)[0])
                letter = pyText[0]
                # heteronym=True returns one list of readings per character.
                readings = pinyin(word, style=Style.TONE3, heteronym=True)
                pyTone = ''.join(r + ' ' for group in readings for r in group)
                print(letter)
                print(pyText)
                print(pyTone)

            params = (letter, pyText, pyTone, name)
            # Show the statement exactly as it will be sent to the server.
            print(cursor.mogrify(sql, params))
            try:
                cursor.execute(sql, params)
                db.commit()
                print('更新成功!')
            except Exception as exc:
                # Best effort: undo this row, report it, continue.
                db.rollback()
                print('更新失败: %s' % exc)

            lastWord = word

        print('更新完成')
    finally:
        db.close()

# Report how many idioms the database currently holds.
def selectCount():
    """Print the number of rows in the ``chengyu`` table.

    The count is computed by the server (``count(*)``) instead of fetching
    every row, and the connection is always closed afterwards.
    """
    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        cursor.execute("select count(*) from chengyu")
        print(cursor.fetchone()[0])
    finally:
        db.close()

# Exact-character lookup.
def search_by_word(word, limit):
    """Print up to ``limit`` four-character idioms whose name starts with ``word``.

    Args:
        word: Leading text the idiom must start with.  It is used as a
            ``LIKE`` prefix, so ``%`` and ``_`` inside it act as wildcards.
        limit: Maximum number of rows to print.
    """
    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        # Values are passed as parameters so the driver escapes them.
        sql = "select * from chengyu where name like %s and char_length(name) = 4 limit %s"
        cursor.execute(sql, (str(word) + '%', int(limit)))
        for row in cursor.fetchall():
            print(row)
    finally:
        db.close()

# Homophone lookup (same pinyin and same tone).
def search_by_tone(word, limit):
    """Print up to ``limit`` idioms whose first character sounds like ``word``.

    The TONE3 readings of ``word`` (heteronyms included) are concatenated,
    each followed by a space, and matched as a substring of the stored
    ``pinyin_tone`` column; rows are pre-filtered by the first letter of the
    toneless pinyin.

    Args:
        word: Character to find homophones for; must be non-empty.
        limit: Maximum number of rows to print.

    Raises:
        IndexError: If ``word`` is empty.
    """
    # Derive the search keys before connecting, so invalid input cannot
    # leave an open connection behind.
    pyText = str(lazy_pinyin(word)[0])
    letter = pyText[0]
    readings = pinyin(word, style=Style.TONE3, heteronym=True)
    pyTone = ''.join(r + ' ' for group in readings for r in group)
    print(pyTone)
    print(letter)

    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        # Values are passed as parameters so the driver escapes them.
        sql = "select * from chengyu where letter=%s AND pinyin_tone like %s limit %s"
        cursor.execute(sql, (letter, '%' + pyTone + '%', int(limit)))
        for row in cursor.fetchall():
            print(row)
    finally:
        db.close()

# Same-pinyin lookup: any of the tones is accepted.
def search_by_pinyin(word,limit):
    """Print up to ``limit`` idioms whose stored pinyin equals that of ``word``.

    Only the toneless pinyin of the first character of ``word`` is compared
    with the ``pinyin`` column, so every tone matches.

    Args:
        word: Character (or text) whose first pinyin is looked up; must be
            non-empty.
        limit: Maximum number of rows to print.

    Raises:
        IndexError: If ``word`` is empty.
    """
    # Derive the search key before connecting, so invalid input cannot
    # leave an open connection behind.
    pyText = lazy_pinyin(word)[0]
    letter = pyText[0]

    print('首字母:  ' + letter)
    print('拼音: ' + pyText)

    db = pymysql.connect(host='localhost',user='root',passwd='123456',db='forchengyu')
    try:
        cursor = db.cursor()
        # Values are passed as parameters so the driver escapes them.
        sql = "select * from chengyu where pinyin=%s limit %s"
        cursor.execute(sql, (pyText, int(limit)))
        for row in cursor.fetchall():
            print(row)
    finally:
        db.close()

"""
爬取成语数据,需要先在mysql数据库创建以下内容:
database: forchengyu
table: chengyu {
    name varchar(50),
    detail varchar(255),
    href varchar(255),
    letter varchar(1),
    pinyin varchar(50),
    pinyin_tone varchar(50)
}

"""
# Script driver. The triple-quoted strings in this section are bare string
# expressions (no-ops) that the author uses as notes; each one describes the
# call that follows it. Only search_by_tone() is currently enabled.

# Step 1 (disabled): crawl the site and fill the chengyu table.
# chengyudaquan()   

# Step 2 (disabled): the note below says this fills in each idiom's first
# letter, pinyin and toned pinyin, and takes roughly 600 seconds.
"""
获取每一个成语的首字母、拼音、带音调的拼音,需要大概600秒
"""
# updatePinyin()

# Disabled: the note below says "get the number of crawled idioms".
"""
获取爬取的成语数量
"""
# selectCount()

# Disabled: the note below says "look up by a given character".
"""
指定字查询
"""
# search_by_word('离',50)

# Enabled: the note below says "homophone lookup"; prints up to 50 matches
# for the character passed in.
"""
同音查找
"""
search_by_tone('爱',50)

# Disabled: the note below says "similar-sounding lookup" (pinyin only).
"""
谐音字查询
"""
# search_by_pinyin('爱',50)

你可能感兴趣的:(成语接龙助手 for python)