

  1  ''' Implements SPM(Shortest Path Matching) Method
  2  '''
  4  import  string
  5  import  codecs
  6  import  re
# Word dictionary consulted by organize(): maps every known entry to its
# category, i.e. {entry1: category1, entry2: category2, ..., entryN: categoryM}.
# NOTE: the name shadows the builtin ``dict``; it is kept because the
# functions in this module look the dictionary up under this name.
dict = {}
# A string containing the delimiting punctuation characters used by
# segment() to cut the input into short sentences.
punc = ''
# Upper bound on the length of the candidate words tried by organize().
# Presumably the length of the longest entry in ``dict`` -- confirm against
# the code that loads the dictionary.
max_len = 0
def segment(str):
    ''' Segment the given string so that the number of resulting tokens is
    minimal (Shortest Path Matching).

    The algorithm:
    1. use the delimiting punctuations in ``punc`` to cut the string into
       short sentences;
    2. for each sentence, find all known words and organize them into a DAG
       (see organize());
    3. take the shortest path through that DAG (see find_path()) as the
       segmentation of the sentence;
    4. emit every delimiting punctuation character as a token of its own.

    str -- the text to segment (the parameter name shadows the builtin but
           is kept so existing keyword callers keep working).

    Returns the list of tokens in their original order. Text that is not
    followed by a delimiter, leading delimiters and runs of several
    delimiters are all preserved. An empty string yields an empty list.
    '''
    ret = []
    if not str:
        return ret

    if punc:
        # Escape the delimiters: characters such as ']', '^', '-' or '\\'
        # would otherwise change the meaning of the character class.
        # The capturing group makes re.split() return the delimiters too,
        # at the odd indexes of the result.
        pieces = re.split('([%s])' % re.escape(punc), str)
    else:
        # No delimiters configured: the whole input is a single sentence.
        pieces = [str]

    for index, piece in enumerate(pieces):
        if not piece:
            # re.split() yields '' between adjacent delimiters and at the
            # edges of the input; there is nothing to segment there.
            continue
        if index % 2:
            # A single delimiting punctuation character.
            ret.append(piece)
            continue
        for start, length in find_path(organize(piece)):
            ret.append(piece[start:start + length])
    return ret
def organize(sentence):
    ''' Find all known words in the given sentence and organize them into
    a DAG.

    The DAG is a list with one node per character position plus a terminal
    node. A node is a list of hops, e.g. [hop1, hop2]; a hop is the distance
    from this node to the next one, i.e. the length of a token that may
    start at this position. A node can hold several hops when the text at
    that position can be segmented in more than one way. Hops are listed
    from the longest dictionary word down to the single-character hop 1,
    which is always present. The terminal node is [0]; it makes the DAG
    easy to traverse. Example of the overall shape:
    [[2, 5, 1], ..., [1], [0]]

    sentence -- the text to analyse; it should not contain delimiters.

    Candidate words are looked up in the module-level ``dict`` and are at
    most ``max_len`` characters long. An empty sentence yields [[0]].
    '''
    total = len(sentence)
    # Never try candidates longer than the sentence or the configured cap.
    longest = min(total, max_len)

    dag = []
    for start in range(total):
        # Do not read past the end of the sentence.
        limit = min(longest, total - start)
        hops = [size
                for size in range(limit, 1, -1)
                if sentence[start:start + size] in dict]
        # A single character is always a valid token, known or not, so
        # every node is guaranteed to have a way forward.
        hops.append(1)
        dag.append(hops)

    dag.append([0])
    return dag
def find_path(dag):
    ''' Find the path with the fewest hops from the first node of the given
    DAG to its terminal (last) node.

    dag -- a list of nodes as built by organize(): node ``i`` is a list of
           hops and every positive hop ``h`` is an edge from node ``i`` to
           node ``i + h``. Non-positive hops (such as the terminal [0]) are
           ignored.

    Returns the path in such a format:
    [(0, 2), (2, 3), (5, 1), (6, 4)]
    Each tuple is (n, l), in which n is the index where a token starts and
    l is the length of that token. An empty DAG, or one that holds only the
    terminal node, yields []. A node that cannot be reached through any hop
    is entered from the node right before it, i.e. by a hop of 1.

    Raises IndexError if a hop points past the terminal node.
    '''
    size = len(dag)
    if size == 0:
        return []

    # Every edge costs 1 and points forward, so the weight of a node is
    # final once all lower-numbered nodes have been relaxed. A single pass
    # in index order is therefore enough; no priority queue is needed.
    unreached = size + 1
    wt = [unreached] * size
    wt[0] = 0
    # pre[i] is the predecessor of node i on the best known path; it
    # defaults to the previous node and is -1 for the start node.
    pre = list(range(-1, size - 1))

    for c in range(size):
        if wt[c] == unreached:
            continue
        d = wt[c] + 1
        for hop in dag[c]:
            if hop <= 0:
                continue
            t = c + hop
            if d < wt[t]:
                wt[t] = d
                pre[t] = c

    # Walk back from the terminal node to the start, then reverse.
    path = []
    c = size - 1
    while pre[c] != -1:
        path.append((pre[c], c - pre[c]))
        c = pre[c]
    path.reverse()
    return path
