log_console = logging.StreamHandler(sys.stderr) #日志相关
logger = logging.getLogger(__name__)

7、函数名:cut(sentence, cut_all=False, HMM=True)。源代码中该函数带有注释;它根据参数的不同,分别调用上面的函数 4、5、6。
测试代码:
#coding=utf-8
#author:zhangyang
#2015-5-27
#程序用于结巴分词根目录__init__.py测试
from __future__ import absolute_import, unicode_literals
import os
from math import log
import json
# Diagnostic output: show how the script's own location resolves.
# print(...) with a single argument behaves identically on Python 2 and 3.
dirname = os.path.dirname(__file__)
print(dirname)

cwd = os.getcwd()
print(cwd)

# Script directory joined onto the working directory; if ``dirname`` is
# already absolute, os.path.join returns it unchanged.
ww = os.path.join(os.getcwd(), os.path.dirname(__file__))
print(ww)

# Prefix dictionary and total frequency count. Both are filled in by main()
# via gen_pfdict() and read by get_DAG() and calc().
FREQ = {}
total = 0
def gen_pfdict(f_name):
    """Load a word-frequency dictionary file into a prefix dictionary.

    Each line of the UTF-8 encoded file must hold at least two
    space-separated fields: the word and its integer frequency. Any
    further fields on the line are ignored.

    Args:
        f_name: Path of the dictionary file.

    Returns:
        A ``(lfreq, ltotal)`` tuple. ``lfreq`` maps every word to its
        frequency and every prefix of every word that is not itself a
        listed word to ``0``. ``ltotal`` is the sum of all frequencies read.

    Raises:
        ValueError: If a line lacks a second field or its frequency is not
            an integer. The message names the file, line number and line.
    """
    # Function-scope import: the script has no usable module-level logger,
    # so the error path must not depend on one.
    import logging

    lfreq = {}
    ltotal = 0
    with open(f_name, 'rb') as f:
        lines = f.read().rstrip().decode('utf-8').splitlines()

    for lineno, line in enumerate(lines, 1):
        try:
            word, freq = line.split(' ')[:2]
            freq = int(freq)
        except ValueError:
            logging.getLogger(__name__).debug(
                '%s at line %s %s', f_name, lineno, line)
            raise ValueError(
                'invalid dictionary entry in %s at line %s: %s'
                % (f_name, lineno, line))
        lfreq[word] = freq
        ltotal += freq
        # Register every prefix so get_DAG can stop scanning as soon as a
        # fragment is not a prefix of any word. A prefix that is already
        # known keeps its existing value.
        for end in range(1, len(word) + 1):
            lfreq.setdefault(word[:end], 0)
    return lfreq, ltotal
def get_DAG(sentence):
    """Build the word graph of ``sentence`` from the global ``FREQ`` table.

    Args:
        sentence: The text to analyse, indexed character by character.

    Returns:
        A dict mapping each start index ``k`` to the list of end indices
        ``i`` (inclusive) such that ``sentence[k:i + 1]`` has a non-zero
        entry in ``FREQ``. When no such word starts at ``k``, the list is
        ``[k]``, i.e. the single character stands on its own. An empty
        sentence yields an empty dict.
    """
    DAG = {}
    N = len(sentence)
    for k in range(N):
        tmplist = []
        i = k
        frag = sentence[k]
        # Extend the fragment while it is still a key of FREQ; entries with
        # frequency 0 are prefixes only and do not end a word.
        while i < N and frag in FREQ:
            if FREQ[frag]:
                tmplist.append(i)
            i += 1
            frag = sentence[k:i + 1]
        if not tmplist:
            tmplist.append(k)
        DAG[k] = tmplist
    return DAG
def calc(sentence, DAG, route):
    """Fill ``route`` with the best-scoring segmentation of ``sentence``.

    Works backwards from the end of the sentence. For each start index it
    picks, among the end indices listed in ``DAG``, the one that maximises
    ``log(freq) - log(total)`` plus the score already stored for the
    position after that end index. A missing or zero frequency counts as 1.

    Args:
        sentence: The text being segmented.
        DAG: Mapping of start index to candidate inclusive end indices, as
            returned by get_DAG().
        route: Dict filled in place. ``route[idx]`` becomes
            ``(best_score, best_end_index)`` and ``route[len(sentence)]``
            is the sentinel ``(0, 0)``.

    Returns:
        None. The result is delivered through ``route``, which is also
        printed as JSON for inspection.

    Raises:
        ValueError: If the global ``total`` is 0, because ``log(0)`` is
            undefined.
    """
    N = len(sentence)
    route[N] = (0, 0)
    logtotal = log(total)
    for idx in range(N - 1, -1, -1):
        route[idx] = max(
            (log(FREQ.get(sentence[idx:x + 1]) or 1) - logtotal
             + route[x + 1][0], x)
            for x in DAG[idx])
    print(json.dumps(route))
def main():
    """Load ``./dict.txt``, segment a sample sentence and print each step.

    Populates the module-level ``FREQ`` and ``total`` from the dictionary
    file, then prints the dictionary statistics, the word graph as JSON and
    finally, for each chosen word, its exclusive end index followed by the
    word itself. calc() prints the route table along the way.
    """
    global FREQ, total
    dictfile = './dict.txt'
    FREQ, total = gen_pfdict(dictfile)
    print("total frequnce is " + str(total))
    print("dict length is " + str(len(FREQ)))

    sent = "英语单词很难记忆"
    dag = get_DAG(sent)
    print(json.dumps(dag))

    route = {}
    calc(sent, dag, route)

    # Follow the best end index stored for each position to cut the words.
    N = len(sent)
    x = 0
    while x < N:
        y = route[x][1] + 1
        print(y)
        lword = sent[x:y]
        print(lword)
        x = y


if __name__ == '__main__':
    main()