class
Viterbi:
#
#####################################################################################
def
__init__
(self):
import
cPickle as mypickle
self.singleWordDict
=
mypickle.load(file(
'
mySingleWordTrie.dat
'
))
self.doubleWordDict
=
mypickle.load(file(
'
myDoubleWordTrie2.dat
'
))
#
########################################################################################
def
calWordLength(self,string):
'''
计算一个词含有多少个汉字
'''
import
re
p
=
re.compile(r
'
'
)
tmp
=
p.findall(string)
return
len(tmp)
+
1
#
#######################################################################################
def
CreateWordGraph(self,sentence):
#
sentence为一个list,每一个元素为一个字
'''
对于一个给定的句子生成词图
'''
NodesCollection
=
{}
#
每一个节点信息为一个元组(start_p,max_p,hierachy) hierachy initialized 0
length
=
len(sentence)
for
i
in
range(0,length):
expandlen
=
self.GetMaxLengthFromTrie(sentence[i])
#
句子中每个字,粗切分窗的最大长度
if
expandlen
==
0:
#
说明以sentence[i]为首字的词在字典里不存在
NodesCollection[sentence[i]]
=
(
1
,
1
,None)
continue
else
:
for
k
in
range(
1
,expandlen
+
1
):
sent
=
self.FormatSent(sentence[i:i
+
k])
if
self.singleWordDict.has_key(sentence[i]):
tmpdict
=
self.singleWordDict[sentence[i]]
if
tmpdict.has_key(sent):
prob
=
tmpdict[sent]
NodesCollection[sent]
=
(prob,prob,None)
#
i
#
print prob
#
print sent
else
:
if
k
==
1
:
#
以sentence[i]为首字的词在字典中存在,但是字典中不存在此单字
NodesCollection[sent]
=
(
1
,
1
,None)
return
NodesCollection
#
############################################################################################
def
TopologicalSortWordGraph(self,sentence):
'''
对原词图生成一个拓扑序,以后缀字为Hierachy
'''
import
re
NodesCollection
=
self.CreateWordGraph(sentence)
length
=
len(sentence)
TopologicalOrderNodes
=
{}
for
i
in
range(0,length):
TopologicalOrderNodes[i]
=
{}
p
=
re.compile(sentence[i]
+
'
$
'
)
for
key ,val
in
NodesCollection.iteritems():
#
从点集合中取出以此字为后缀的词,加入该阶梯
if
p.search(key):
TopologicalOrderNodes[i][key]
=
val
return
TopologicalOrderNodes
#
##########################################################################################
def
GetMaxLengthFromTrie(self,key):
'''
给出首字,查看以此字为首字的词的最大长度
'''
maxlen
=
0
if
self.singleWordDict.has_key(key):
maxlen
=
max([self.calWordLength(subkey)
for
subkey
in
self.singleWordDict[key].iterkeys()])
return
maxlen
#
###########################################################################################
def
FormatSent(self,sentence):
'''
由字拼成短句
'''
delimiter
=
'
'
sent
=
delimiter.join(sentence)
return
sent
#
#################################################################################################
def
Segment(self,sentence):
'''
如果句子中只有一个字不分词
'''
#
print 'segment'
if
len(sentence)
==
1
:
return
sentence
else
:
result
=
self.MyViterbi(sentence)
return
result
#
############################################################################################
def
MyViterbi(self,sentence):
'''
在词图上利用动态规划算法,维特比求最优解
'''
import
math
import
re
#
N=10001600
N
=
1000150
smooth
=
1.0
/
N
#
如果是稀缺字则概率值平滑为一个小值
TopologicalOrderNodes
=
self.TopologicalSortWordGraph(sentence)
#
获得一个具有拓扑序的图
optimalCandidates
=
{}
#
存放每一个hierachy内最优的候选节点的键值,元素形式为(key,p,hierachy)
p
=
smooth
#
如果双字典中不存在以此字为首字的词,则概率平滑到一个小值
#
首先求词网格lattice第一级 hierachy 0的最优解
if
self.doubleWordDict.has_key(sentence[0]):
tmpp
=
self.doubleWordDict[sentence[0]].get(sentence[0]
+
'
|
'
+
'
S
'
)
if
tmpp
>
0:
p
=
tmpp
#
如果双字典中存在以此单字为句首的情况,则p值取这个概率值
optimalCandidates[0]
=
(sentence[0],math.log(p,
2
),0)
for
i
in
range(
1
,len(sentence)):
#
对于 hierachy 1 to len(sentence-1)
keysToRemove
=
[]
#
待去除的键值,以免它的存在影响最大概率的计算
for
key
in
TopologicalOrderNodes[i].iterkeys():
keylen
=
self.calWordLength(key)
#
对于每一个hierachy遍历所有键
flag
=
i
-
keylen
#
flag表示回溯到标号为flag 的hierachy
p
=
re.compile(
'
^\d+
'
)
if
flag
<-
1
:
keysToRemove.append(key)
else
:
if
flag
==-
1
:
#
表示词键值可以成为句首词
searchresult
=
p.findall(key)
if
searchresult[0]
==
sentence[flag
+
1
]:
prob
=
self.calConditionPFirst(key)
prob
=
math.log(prob,
2
)
TopologicalOrderNodes[i][key]
=
(TopologicalOrderNodes[i][key][0],prob,None)
else
:
keysToRemove.append(key)
else
:
firstword
=
optimalCandidates[flag][0]
#
取出标号为flag的hierachy对应的词
#
看看在句子中位置为flag+1的字,是不是等于key的首字,如果相等则利用标号为flag的hierachy的最优结果进行计算,否则将该key加入待删除集合
searchresult
=
p.findall(key)
if
searchresult[0]
==
sentence[flag
+
1
]:
jointseg
=
firstword
+
'
|
'
+
key
con_prob
=
self.calConditionP(jointseg,firstword)
prob
=
optimalCandidates[flag][
1
]
+
math.log(con_prob,
2
)
TopologicalOrderNodes[i][key]
=
(TopologicalOrderNodes[i][key][0],prob,flag)
else
:
keysToRemove.append(key)
for
toremove
in
keysToRemove:
TopologicalOrderNodes[i].pop(toremove)
maxP
=
max([value[
1
]
for
value
in
TopologicalOrderNodes[i].itervalues()])
for
cankey
in
TopologicalOrderNodes[i].iterkeys():
if
TopologicalOrderNodes[i][cankey][
1
]
==
maxP:
candidatekey
=
cankey
break
hierachyP
=
TopologicalOrderNodes[i][candidatekey][
2
]
optimalCandidates[i]
=
(candidatekey,maxP,hierachyP)
print
'
finishcalculating
'
finalresult
=
[]
pieceOfWord
=
optimalCandidates[len(sentence)
-
1
][0]
finalresult.append(pieceOfWord)
index
=
TopologicalOrderNodes[len(sentence)
-
1
][pieceOfWord][
2
]
while
index
!=
None:
pieceOfWord
=
optimalCandidates[index][0]
finalresult.append(pieceOfWord)
index
=
TopologicalOrderNodes[index][pieceOfWord][
2
]
finalresult.reverse()
return
finalresult
#
print items
#
############################################################################################
def
calConditionPFirst(self,seg):
'''
计算第能成为首词的候选节点的条件概率,输入参数seg为句子的一个片段
'''
import
re
p
=
re.compile(
'
\d+
'
)
#
N=10002000;
N
=
1000023
smooth
=
1.0
/
N
tmp
=
p.findall(seg)
probability
=
smooth
if
self.doubleWordDict.has_key(tmp[0]):
count
=
self.doubleWordDict[tmp[0]].get(seg,0)
if
count
>
0:
probability
=
count
/
N
#
print'call calConditionPFirst probability%s' %probability
return
probability
#
######################################################################################
def
calConditionP(self,jointseg,prevseg):
'''
计算其他节点的条件概率 jointseg两个词合一起的形式,prevseg为前一个词
'''
import
re
p
=
re.compile(
'
\d+
'
)
#
N=10001600
#
N=886000
V
=
52287.0
#
词类数number of word type
smooth
=
1.0
/
V
conditionP
=
smooth
tmp
=
p.findall(jointseg)
assist
=
jointseg.split(
'
|
'
)[0]
#
找到第一个词
p_fist
=
re.compile(assist)
if
self.doubleWordDict.has_key(tmp[0]):
count_joint
=
self.doubleWordDict[tmp[0]].get(jointseg,0)
count_sum_for_edge
=
sum([val
for
(key,val)
in
self.doubleWordDict[tmp[0]].iteritems()
if
p_fist.match(key)])
conditionP
=
(
1.0
+
count_joint)
/
(V
+
count_sum_for_edge)
#
prob_prev=self.singleWordDict[tmp[0]].get(prevseg)
#
if prob_joint>0 and prob_prev>0:
#
conditionP=prob_joint/prob_prev
#
print 'call calConditonP probability=%s'%conditionP
return
conditionP
#
##########################################################################################
def
getSingleWordDict(self):
return
self.singleWordDict
def
getDoubleWordDict(self):
return
self.doubleWordDict