想做个成语接龙,就爬取了某网站的所有成语及它的拼音。中间还出错了两次,浪费了一天的时间,加上try后就好了。
from lxml import etree
import requests
from urllib.parse import urljoin
import re
import time
import random
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, compress',
'Accept-Language': 'zh-CN,zh,en-us;q=0.5,en;q=0.3',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
s = requests.Session()
s.headers.update(headers)
url = 'http://chengyu.t086.com/'
r = s.get(url=url)
r.encoding = 'GBK'
tree=etree.HTML(r.text)
nodes=tree.xpath("/html/body/div[3]/a")
urls = []
words = []
for href in nodes:
if 'cy' not in href.xpath('@href')[0]:
urls.append(urljoin(url,href.xpath('@href')[0]))
for line in urls:
try:
time.sleep(0.3+random.random())
r = s.get(url=line)
r.encoding = 'GBK'
tree=etree.HTML(r.text)
wordlist = tree.xpath('//div[@class="listw"]//a/@href')
for word in wordlist:
wordurl = urljoin(url,word)
time.sleep(0.3+random.random())
try:
r2 = s.get(url = wordurl)
r2.encoding = 'GBK'
tree2=etree.HTML(r2.text)
word = tree2.xpath('//*[@id="main"]//h1/text()')[0]
pingyin = re.split(r'\s',tree2.xpath('//*[@id="main"]//td[text() = "发音"]/following-sibling::td[1]/text()')[0])
print(word,pingyin)
words.append((word,pingyin))
except:
print(' Wrong url:'+wordurl)
nextUrl = tree.xpath("//a[text() = '下一页']")
if nextUrl:
x,y = line.split('_')
y1 = int(y.split('.')[0])
newline = x + '_' + str(y1+1) + '.html'
urls.append(newline)
except:
print('first wrong url',line)
with open('words.txt','w') as f:
for term in words:
print(term,file=f)
爬出来的成语是这样的
有3万多个成语,不想自己爬的话,有链接直接下载:点击打开链接(密码0v9i)
有了成语,接下来就是做个接龙程序了。
思路:
1、判断是否成语。
2、为了减少搜索量,先收集同音字。
3、在同音字中判断是否有相同汉字,如果有,返回相同汉字成语,否则返回同音字成语。
import random
words = [eval(i.strip()) for i in open('words.txt','r')]
# 显示太多了也看不过来,加个计数
count = 0
inputWord = ''
while True:
spell = []
theword = ''
if inputWord == '' or count == 10:
inputWord = input('请输入一个成语:\n')
count = 0
if inputWord == 'q':
break
# 判断成语库中是否含有
for word in words:
if word[0] == inputWord:
for w in words:
# 比较拼音,相同拼音收入spell中
if word[1][-1] == w[1][0]:
spell.append(w)
if spell:
same = []
count += 1
for a in spell:
# 在相同拼音的情况下,比较首尾汉字是否相同
if inputWord.endswith(a[0]):
same.append(a[0])
if same:
# 增加点乐趣,加个随机程序
theword = random.choice(same)
else:
theword = random.choice(spell)[0]
print(theword+'\n')
break
# 如果有的话,所匹配的成语默认为输入。
inputWord = theword
结果如下: