# A simple Chinese word segmenter written in pure Python, without any
# third-party libraries. A local dictionary file is required before
# segmentation can be performed.
import re
import os

# Raw string: '\w' is an invalid escape sequence in an ordinary string
# literal (SyntaxWarning on modern Python, slated to become an error),
# so take the Windows path backslash literally.
os.chdir(r'd:\workpath')
# # One-shot conversion: turn jieba's dictionary file into our own word list.
# f=open('dict.txt','r+',encoding="utf-8")
# g=open('dic.txt','w+',encoding='UTF-8')
# f=f.readlines()
# for i in range(0, len(f)):
# m=re.search(r'(.*?) ',f[i])
# a=m.group()
# g.writelines('%s\n'%str(a))# write out the result
# g.close()
# Load the local dictionary (one word per line) into a set for O(1)
# membership tests during segmentation.
with open('dic.txt', 'r', encoding='UTF-8') as h:
    # strip() removes the trailing " \n" written by the conversion step
    # and, unlike the original fixed [0:len-2] slice, does not corrupt
    # the last line when the file has no final newline.
    mydict = {line.strip() for line in h}
def CutWords(sentence, dictionary=None):
    """Segment *sentence* by forward maximum matching.

    Starting at the left edge, try the longest candidate (up to 7
    characters), shrink the window one character at a time until the
    candidate is found in the dictionary, and fall back to emitting a
    single character when nothing matches.

    Parameters:
        sentence: the text to segment.
        dictionary: container of known words supporting ``in``;
            defaults to the module-level ``mydict``.

    Returns the list of segments (also printed, as before).
    """
    if dictionary is None:
        dictionary = mydict
    MAX_WORD_LEN = 7  # longest dictionary entry considered
    result = []
    start = 0
    n = len(sentence)
    while start < n:
        # Widest window that fits in the remaining text.
        m = min(MAX_WORD_LEN, n - start)
        # Shrink until a dictionary word is found or one char remains.
        while m > 1 and sentence[start:start + m] not in dictionary:
            m -= 1
        # At m == 1 the single character is emitted even if unknown.
        result.append(sentence[start:start + m])
        start += m
    print(result)
    return result