# Reposted (转载) from: https://www.cnblogs.com/combfish/p/8126857.html
# --- Setup: third-party imports and a wall-clock timer for the whole run ---
import jieba
from nltk import word_tokenize
from nltk.corpus import stopwords
from time import time

start_nb = time()  # timestamp taken at script start

import logging

print(20 * '*', 'loading data', 40 * '*')
corpus = []     # tokenized, punctuation-filtered lines (Word2Vec training corpus)
documents = []  # raw cleaned lines, index-aligned with `corpus` for display
# Punctuation tokens to drop after segmentation: ASCII plus the full-width
# (Chinese) variants of the same marks.
useless = [',', '.', '(', ')', '!', '?', '\'', '\"', ':', '<', '>',
           ',', '。', '(', ')', '!', '?', '’', '“', ':',
           '《', '》', '[', ']', '【', '】']
# Resource-leak fix: the original called open() and never closed the handle.
# A context manager closes the file deterministically once the lines are read.
with open('全唐诗.txt', encoding='utf-8') as f:
    lines = f.readlines()
# Clean each raw line, keep poem lines (skipping volume headers that start
# with '卷'), and build both the display list (`documents`) and the tokenized
# training corpus (`corpus`).
for each in lines:
    each = each.replace('\n', '')
    # BUG FIX: the original called each.replace('-', '') without binding the
    # result. str.replace returns a new string, so hyphens were never actually
    # removed; assign the result back.
    each = each.replace('-', '')
    each = each.strip()
    each = each.replace(' ', '')
    if len(each) > 3:
        if each[0] != '卷':  # '卷' marks a volume header, not poem text
            documents.append(each)
            each = list(jieba.cut(each))  # Chinese word segmentation
            text = [w for w in each if w not in useless]
            corpus.append(text)
print(len(corpus))
print(20 * '*', 'training models', 40 * '*')  # typo fix: 'trainning' -> 'training'

from gensim.models import Word2Vec

# Train word embeddings on the segmented poem corpus.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.x renamed it to
# `vector_size` -- confirm against the installed gensim version.
model = Word2Vec(corpus, workers=3, size=100)

# Initialize WmdSimilarity (Word Mover's Distance over the trained embeddings).
from gensim.similarities import WmdSimilarity

num_best = 10
# Consistency fix: pass the `num_best` variable instead of repeating the
# literal 10, so changing the setting in one place is enough.
instance = WmdSimilarity(corpus, model, num_best=num_best)
print(20 * '*', 'testing', 40 * '*')

# Interactive query loop: segment the query with jieba, drop punctuation, and
# retrieve the most similar documents by Word Mover's Distance.
while True:
    sent = input('输入查询语句: ')
    sent_w = list(jieba.cut(sent))
    query = [w for w in sent_w if w not in useless]
    # A query is simply a "look-up" in the similarity class.
    sims = instance[query]
    # Print the query and the retrieved documents, together with their similarities.
    print('Query:')
    print(sent)
    # Robustness fix: iterate the results actually returned (there may be
    # fewer than num_best), instead of indexing range(num_best) and risking
    # an IndexError. Also, the original had a bare `print` statement, which
    # is a no-op in Python 3; print() emits the intended blank line.
    for doc_id, score in sims:
        print()
        print('sim = %.4f' % score)
        print(documents[doc_id])