pyscws4 是一个python的分词程序 | mei year-美叶 专注思想。
pyscws4 是一个python的分词程序
Posted on 2012 年 11 月 15 日 by dingyangfan
注意:pyscws4 是一个 Python 的分词程序,抄袭自马明练开发的 PHP 版 pscws4,地址是:http://www.ftphp.com/scws/ 。
翻译了两个文件:
1. pscws4.php
2. xdb_r.php
希望高人可以帮我优化一下代码
规则文件和词典下载:
分词.tar
pyscws4.py文件源码:
view source
001 #coding=gbk
002 from __future__ import division
003 from collections import OrderedDict
004 from xdb_r import XDB_R
005 import math ,struct,copy
006 import sys,time
007 reload(sys)
008 sys.setdefaultencoding('gbk')
# ---- defines for ruleset ----
PSCWS4_RULE_MAX = 31  # just 31, PHP does not support unsigned int
PSCWS4_RULE_SPECIAL = 0x80000000
PSCWS4_RULE_NOSTATS = 0x40000000
PSCWS4_ZRULE_NONE = 0x00
PSCWS4_ZRULE_PREFIX = 0x01
PSCWS4_ZRULE_SUFFIX = 0x02
PSCWS4_ZRULE_INCLUDE = 0x04  # with include
PSCWS4_ZRULE_EXCLUDE = 0x08  # with exclude
PSCWS4_ZRULE_RANGE = 0x10  # with znum range

# ---- defines for mode of scws (<= 0x800) ----
PSCWS4_IGN_SYMBOL = 0x01
PSCWS4_DEBUG = 0x02
PSCWS4_DUALITY = 0x04

# ---- multi segment policy (>= 0x1000) ----
PSCWS4_MULTI_NONE = 0x0000  # nothing
PSCWS4_MULTI_SHORT = 0x1000  # split long words to short words from left to right
PSCWS4_MULTI_DUALITY = 0x2000  # split every long word (3 chars?) to two chars
PSCWS4_MULTI_ZMAIN = 0x4000  # split to main single chinese char, attr = j|a|n?|v?
PSCWS4_MULTI_ZALL = 0x8000  # attr = **, all split to single chars
PSCWS4_MULTI_MASK = 0xf000  # mask check for multi set
PSCWS4_ZIS_USED = 0x8000000

# ---- single-byte segment flags (pure single-byte characters) ----
PSCWS4_PFLAG_WITH_MB = 0x01
PSCWS4_PFLAG_ALNUM = 0x02
PSCWS4_PFLAG_VALID = 0x04
PSCWS4_PFLAG_DIGIT = 0x08
PSCWS4_PFLAG_ADDSYM = 0x10

# ---- constant var define ----
PSCWS4_WORD_FULL = 0x01  # multi-char: a complete word
PSCWS4_WORD_PART = 0x02  # multi-char: leading part of a longer word
PSCWS4_WORD_USED = 0x04  # multi-char: already used
PSCWS4_WORD_RULE = 0x08  # multi-char: recognised automatically (by rule)

PSCWS4_ZFLAG_PUT = 0x02  # single char: already emitted
PSCWS4_ZFLAG_N2 = 0x04  # single char: head of a two-char noun
PSCWS4_ZFLAG_NR2 = 0x08  # single char: word head of a two-char person name
PSCWS4_ZFLAG_WHEAD = 0x10  # single char: word head
PSCWS4_ZFLAG_WPART = 0x20  # single char: word tail or middle
PSCWS4_ZFLAG_ENGLISH = 0x40  # single char: English embedded in the text
PSCWS4_ZFLAG_SYMBOL = 0x80  # single char: symbol series

PSCWS4_MAX_EWLEN = 16
PSCWS4_MAX_ZLEN = 128
class PSCWS4(object):
    """Dictionary- and rule-based word segmenter (class-level defaults)."""

    _xd = None   # xdb dict handler
    _rs = None   # ruleset resource
    _rd = None   # ruleset data
    _cs = ''     # charset
    _ztab = []   # char length table, indexed by lead byte
    _mode = 0    # scws mode bit set
    _txt = None  # text string being segmented
    _res = None
    _zis = None  # index of a pending single char (duality mode)
    _off = 0
    _len = 0
    _wend = 0
    _wmap = []
    _zmap = []
    i = 0
074
def __init__(self, charset='gbk'):
    """Create a segmenter with no dictionary and no rules loaded.

    charset -- name of the text encoding, forwarded to set_charset().
    """
    self._xd = False
    # Two distinct mappings: a chained assignment would make the rule-name
    # table and the rule-data table the very same object.
    self._rs = OrderedDict()
    self._rd = OrderedDict()
    self.set_charset(charset)
def __del__(self):
    """Finalizer: delegate to close() so the dictionary handle is released."""
    self.close()
def debug(self):
    """Print a one-line snapshot of the segmenter's internal counters."""
    # Built from adjacent literals so the message has single-space
    # separators instead of whatever a backslash continuation embeds.
    template = ("off:{0} len(_res):{1} len(_wmap):{2} "
                "len(_zmap):{3} _wend:{4} _zis:{5} "
                "len(_rs):{6} len(_rd):{7}")
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print(template.format(
        self._off, len(self._res), len(self._wmap), len(self._zmap),
        self._wend, self._zis, len(self._rs), len(self._rd)))
def set_charset(self, charset='gbk'):
    """Select the charset and rebuild the character-length table.

    self._ztab maps a lead byte value (0..255) to the number of bytes the
    character starting with that byte occupies. 'utf-8'/'utf8' select the
    UTF-8 layout; any other name selects the double-byte (GBK-style) layout.
    Nothing is rebuilt when the normalised name equals the current charset.
    """
    charset = charset.strip().lower()
    if charset == self._cs:
        return
    self._cs = charset
    if charset in ('utf-8', 'utf8'):
        table = ([1] * 0xc0      # 0x00-0xbf: single bytes / continuation bytes
                 + [2] * 0x20    # 0xc0-0xdf
                 + [3] * 0x10    # 0xe0-0xef
                 + [4] * 0x08    # 0xf0-0xf7
                 + [5] * 0x04    # 0xf8-0xfb
                 + [6] * 0x02    # 0xfc-0xfd
                 + [1] * 0x02)   # 0xfe-0xff: invalid lead bytes, length 1
    else:
        table = [1] * 0x81 + [2] * (0xff - 0x81) + [1]
    # Always 256 entries, so any byte value can be looked up safely.
    self._ztab = table
def set_dict(self, fpath, mem=False):
    """Open the XDB dictionary at *fpath* and make it the active one.

    mem is handed to XDB_R unchanged. Returns False when Open() does not
    report True; otherwise stores the handle and returns None.
    """
    handle = XDB_R(mem)
    opened = handle.Open(fpath)
    if opened is not True:
        return False
    self._xd = handle
def set_rule(self, fpath):
    """Load the rule set from the file at *fpath* (see _rule_load)."""
    self._rule_load(fpath)
def set_igonre(self, yes):
    """Switch the "ignore symbols and useless characters" mode on or off.

    Only the literal True turns the flag on; any other value clears it.
    """
    if yes is True:
        self._mode |= PSCWS4_IGN_SYMBOL
    else:
        self._mode &= ~PSCWS4_IGN_SYMBOL
def set_multi(self, level):
    """Set the compound-segmentation level (0..15).

    The level is shifted into the PSCWS4_MULTI_* bit range and replaces the
    previous multi bits of the mode word.
    """
    level = int(level) << 12
    self._mode &= ~PSCWS4_MULTI_MASK
    # Keep only the bits inside the mask so a level above 15 cannot set
    # unrelated mode bits.
    self._mode |= level & PSCWS4_MULTI_MASK
def set_debug(self, yes):
    """Switch printing of segmentation debug information on or off.

    Only the literal True turns the flag on; any other value clears it.
    """
    if yes is True:
        self._mode |= PSCWS4_DEBUG
    else:
        self._mode &= ~PSCWS4_DEBUG
def set_duality(self, yes):
    """Switch automatic pairing of loose single characters on or off.

    Only the literal True turns the flag on; any other value clears it.
    """
    if yes is True:
        self._mode |= PSCWS4_DUALITY
    else:
        self._mode &= ~PSCWS4_DUALITY
def send_text(self, text):
    """Store the text to segment and rewind the read offset to its start."""
    content = str(text)
    self._txt = content
    self._len = len(content)
    self._off = 0
def get_result(self):
    """Return the next batch of segmented words, or False when exhausted.

    Call repeatedly after send_text(). Each batch is the list built by
    _put_res(). A CR/LF or one of the _char_token() characters is returned
    as a batch of its own; otherwise the next run of text up to whitespace
    or a token character is classified and handed to _msegment() (contains
    multi-byte characters), _ssegment() (single-byte, not purely
    alphanumeric or too long) or emitted directly as one 'en' word.
    """
    off = self._off
    tlen = self._len
    txt = self._txt
    self._res = []

    # Skip leading blanks/control bytes; a line break is reported on its own.
    while off < tlen and ord(txt[off]) <= 0x20:
        if txt[off] == "\r" or txt[off] == "\n":
            self._off = off + 1
            self._put_res(off, 0, 1, 'un')
            return self._res
        off += 1
    if off >= tlen:
        return False

    self._off = off
    ch = txt[off]
    cx = ord(ch)
    if self._char_token(ch):
        self._off += 1
        self._put_res(off, 0, 1, 'un')
        return self._res
    clen = self._ztab[cx]

    zlen = 1
    if clen > 1:
        pflag = PSCWS4_PFLAG_WITH_MB
    elif self._is_alnum(cx):
        pflag = PSCWS4_PFLAG_ALNUM
    else:
        pflag = 0
    off += clen
    while off < tlen:
        ch = txt[off]
        cx = ord(ch)
        if cx <= 0x20 or self._char_token(ch):
            break
        clen = self._ztab[cx]
        if not (pflag & PSCWS4_PFLAG_WITH_MB):
            if clen == 1:
                if (pflag & PSCWS4_PFLAG_ALNUM) and not self._is_alnum(cx):
                    pflag ^= PSCWS4_PFLAG_ALNUM
            else:
                # single-byte run followed by a multi-byte char: only a
                # short alphanumeric prefix may be merged into it
                if not (pflag & PSCWS4_PFLAG_ALNUM) or zlen > 2:
                    break
                pflag |= PSCWS4_PFLAG_WITH_MB
        elif clen == 1:
            # multi-byte + single-byte: only alphanumerics are allowed, and
            # at most two of them before a blank, a multi-byte char or the
            # end of the text
            if not self._is_alnum(cx):
                break
            pflag &= ~PSCWS4_PFLAG_VALID
            i = off + 1
            while i < off + 3:
                # Test the bound BEFORE indexing: looking past the end of
                # the text counts as a valid terminator.
                if i >= tlen:
                    pflag |= PSCWS4_PFLAG_VALID
                    break
                nx = ord(txt[i])
                if nx <= 0x20 or self._ztab[nx] > 1:
                    pflag |= PSCWS4_PFLAG_VALID
                    break
                if not self._is_alnum(nx):
                    break
                i += 1
            if not (pflag & PSCWS4_PFLAG_VALID):
                break
            clen += i - off - 1
        # maximum number of characters per segment
        zlen += 1
        if zlen >= PSCWS4_MAX_ZLEN:
            break
        off += clen

    # A trailing half character can push the offset past the end of text.
    reached = off
    if reached > tlen:
        off -= clen

    # do the real segmentation
    if off <= self._off:
        return False
    elif pflag & PSCWS4_PFLAG_WITH_MB:
        self._msegment(off, zlen)
    elif (not (pflag & PSCWS4_PFLAG_ALNUM)
          or (off - self._off) >= PSCWS4_MAX_EWLEN):
        self._ssegment(off)
    else:
        zlen = off - self._off
        self._put_res(self._off, 2.5 * math.log(zlen), zlen, 'en')

    self._off = tlen if reached > tlen else off
    if len(self._res) == 0:
        # nothing emitted for this run (e.g. ignored symbols): try the next
        return self.get_result()
    return self._res
def get_tops(self, limit=10, xattr=''):
    """Return up to *limit* words of the current text ranked by weight.

    The whole text is re-segmented from offset 0 and the previous read
    offset is restored afterwards. Words with idf < 0.2, words whose attr
    starts with '#', and words flagged PSCWS4_RULE_NOSTATS are skipped.
    Occurrences are grouped by lower-cased word; 'weight' is the summed idf
    and 'times' the occurrence count.

    xattr -- comma separated attr names, optionally prefixed with '~'.
    Returns a list of dicts (word, times, weight, attr) sorted by
    descending weight, or False when no text has been sent.
    """
    if self._txt is None:
        return False

    xmode = False
    attrs = {}
    if xattr != '':
        if xattr[0:1] == '~':
            xattr = xattr[1:]
            xmode = True
        for name in xattr.split(','):
            name = name.strip().lower()
            if name != '':
                attrs[name] = True

    saved_off = self._off
    self._off = 0
    tlist = {}
    try:
        while True:
            batch = self.get_result()
            if not batch:
                break
            for tmp in batch:
                if tmp['idf'] < 0.2 or tmp['attr'][0:1] == '#':
                    continue
                if attrs:
                    listed = tmp['attr'] in attrs
                    # NOTE(review): with a '~' prefix this keeps only the
                    # listed attrs and without it drops them, which looks
                    # inverted for an "exclude" marker -- confirm against
                    # the reference implementation before changing.
                    if xmode and not listed:
                        continue
                    if not xmode and listed:
                        continue
                word = tmp['word'].lower()
                if self._rule_checkbit(word, PSCWS4_RULE_NOSTATS):
                    continue
                if word in tlist:
                    tlist[word]['weight'] += tmp['idf']
                    tlist[word]['times'] += 1
                else:
                    tlist[word] = {'word': tmp['word'], 'times': 1,
                                   'weight': tmp['idf'], 'attr': tmp['attr']}
    finally:
        # restore the caller's position even if segmentation raised
        self._off = saved_off

    # Descending by weight; replaces a cmp= comparator that is Python-2-only
    # and never reports equality.
    ranked = sorted(tlist.values(), key=lambda d: d['weight'], reverse=True)
    return ranked[0:limit]
def close(self):
    """Close the dictionary (if open) and drop all loaded rules."""
    if self._xd:
        self._xd.Close()
        self._xd = False
    # Reset to empty mappings, not lists: the rule helpers do key lookups on
    # these tables, which must keep working after close().
    self._rd = {}
    self._rs = {}
def version(self):
    """Return the version banner of this port."""
    banner = 'pySCWS/1.0 - by donghongyi'
    return banner
def _rule_load(self, fpath):
    """Parse the rule file at *fpath* into self._rs and self._rd.

    Pass 1 collects the ``[name]`` section headers (at most PSCWS4_RULE_MAX)
    into self._rs, giving each a default item and a bit: fixed bits for
    'special' and 'nostats', consecutive bits for the others.
    Pass 2 reads each section: ``:key = value`` lines tune the item, other
    non-empty lines are entries stored in self._rd -- the whole line when
    line mode is on, otherwise one entry per character.

    Returns False when the file cannot be opened, otherwise None.
    """
    try:
        fd = open(fpath, 'r')
    except IOError:
        return False
    try:
        lines = fd.readlines()
    finally:
        fd.close()  # do not leak the handle

    # ---- pass 1: section names ----
    count = 0
    next_bit = 0
    self._rs = OrderedDict()
    for buf in lines:
        if buf[0:1] != '[':
            continue
        pos = buf.find(']')
        if pos == -1 or pos == 1 or pos > 15:
            continue
        key = buf[1:pos].lower()
        if key in self._rs:
            continue
        item = {'tf': 5.0, 'idf': 3.5, 'attr': 'un', 'bit': 0, 'flag': 0,
                'zmin': 0, 'zmax': 0, 'inc': 0, 'exc': 0}
        if key == 'special':
            item['bit'] = PSCWS4_RULE_SPECIAL
        elif key == 'nostats':
            item['bit'] = PSCWS4_RULE_NOSTATS
        else:
            item['bit'] = 1 << next_bit
            next_bit += 1
        self._rs[key] = item
        count += 1
        if count >= PSCWS4_RULE_MAX:
            break

    # ---- pass 2: section bodies ----
    rbl = False  # "read by line": the whole line is one entry
    item = {}
    for buf in lines:
        ch = buf[0:1]
        if ch == ';':
            continue
        if ch == '[':
            item = {}
            pos = buf.find(']')
            if pos > 1:
                key = buf[1:pos].lower()
                if key in self._rs:
                    rbl = True
                    item = self._rs[key]
            continue
        if ch == ':':
            buf = buf[1:]
            if buf.find('=') == -1:
                continue
            # Split on the first '=' only, so the value may contain '='.
            pkey, pval = buf.split('=', 1)
            pkey = pkey.strip()
            pval = pval.strip()
            if pkey == 'line':
                rbl = False if pval[0:1].strip() == 'n' else True
            elif pkey == 'tf':
                item['tf'] = float(pval)
            elif pkey == 'idf':
                item['idf'] = float(pval)
            elif pkey == 'attr':
                item['attr'] = pval
            elif pkey == 'znum':
                pos = pval.find(',')
                if pos > -1:
                    item['zmax'] = int(pval[pos + 1:].strip())
                    item['flag'] |= PSCWS4_ZRULE_RANGE
                    pval = pval[0:pos]
                item['zmin'] = int(pval)
            elif pkey == 'type':
                if pval == 'prefix':
                    item['flag'] |= PSCWS4_ZRULE_PREFIX
                if pval == 'suffix':
                    item['flag'] |= PSCWS4_ZRULE_SUFFIX
            elif pkey == 'include' or pkey == 'exclude':
                clude = 0
                for name in pval.split(','):
                    name = name.strip().lower()
                    if name not in self._rs:
                        continue
                    clude |= self._rs[name]['bit']
                if pkey == 'include':
                    item['inc'] |= clude
                    item['flag'] |= PSCWS4_ZRULE_INCLUDE
                else:
                    item['exc'] |= clude
                    item['flag'] |= PSCWS4_ZRULE_EXCLUDE
            continue
        if item == {}:
            continue  # entry outside any known section
        buf = buf.strip()
        if buf == '':
            continue
        if rbl:
            self._rd[buf] = item
        else:
            tlen = len(buf)
            off = 0
            while off < tlen:
                zlen = self._ztab[ord(buf[off:off + 1])]
                # NOTE(review): '>=' also stops before a complete final
                # character, so the last character of every line is never
                # registered -- confirm whether '>' was intended.
                if off + zlen >= tlen:
                    break
                self._rd[buf[off:off + zlen]] = item
                off += zlen
def _rule_get(self, str):
    """Return the rule item registered for *str*, or False when none."""
    # dict.get replaces has_key() plus a second lookup
    return self._rd.get(str, False)
def _rule_checkbit(self, str, bit):
    """Return True when *str* has a rule item whose 'bit' overlaps *bit*."""
    if str not in self._rd:
        return False
    return bool(bit & self._rd[str]['bit'])
def _rule_check(self, rule, str):
    """Check *str* against the include / exclude constraints of *rule*.

    Returns False when the rule has an include list and *str* is in none of
    the included rule sets, or when it has an exclude list and *str* is in
    one of the excluded sets; True otherwise.
    """
    # Compare against the 'inc' / 'exc' masks collected by _rule_load from
    # the include= / exclude= lines, not against the rule's own bit.
    if ((rule['flag'] & PSCWS4_ZRULE_INCLUDE)
            and not self._rule_checkbit(str, rule['inc'])):
        return False
    if ((rule['flag'] & PSCWS4_ZRULE_EXCLUDE)
            and self._rule_checkbit(str, rule['exc'])):
        return False
    return True
def _put_res(self, o, i, l, a):
    """Append one result record to self._res.

    o -- offset into the text, i -- idf value, l -- length, a -- attr.
    """
    self._res.append({
        'word': self._txt[o:o + l],
        'off': o,
        'idf': i,
        'len': l,
        'attr': a,
    })
def _is_alnum(self, c):
    """Return True when ordinal *c* is an ASCII digit or letter."""
    return 48 <= c <= 57 or 65 <= c <= 90 or 97 <= c <= 122
def _is_alpha(self, c):
    """Return True when ordinal *c* is an ASCII letter."""
    return 65 <= c <= 90 or 97 <= c <= 122
def _is_ualpha(self, c):
    """Return True when ordinal *c* is an upper-case ASCII letter."""
    return 65 <= c <= 90
def _is_digit(self, c):
    """Return True when ordinal *c* is an ASCII digit."""
    return 48 <= c <= 57
def _no_rule1(self, f):
    """Return a true value when a char with flags *f* must not take part in rule matching.

    That is the case for symbols and embedded English, and for a word head
    that is not the head of a two-char person name.
    """
    blocked = f & (PSCWS4_ZFLAG_SYMBOL | PSCWS4_ZFLAG_ENGLISH)
    plain_head = (f & (PSCWS4_ZFLAG_WHEAD | PSCWS4_ZFLAG_NR2)) == PSCWS4_ZFLAG_WHEAD
    return blocked or plain_head
def _no_rule2(self, f):
    """Second-position variant of the rule exclusion test; same as _no_rule1."""
    verdict = self._no_rule1(f)
    return verdict
def _char_token(self, c):
    """Return True when *c* is a bracket, colon or double quote, which is emitted on its own."""
    return c in ('(', ')', '[', ']', '{', '}', ':', '"')
def _dict_query(self, word):
    """Look *word* up in the dictionary.

    Returns False when no dictionary is open or the word is unknown;
    otherwise a dict with 'tf', 'idf', 'flag' and 'attr' unpacked from the
    stored record (two floats, one unsigned byte, a 3-byte attr with its
    NUL padding stripped).
    """
    if not self._xd:
        return False
    value = self._xd.Get(word)
    if not value:
        return False
    tf, idf, flag, attr = struct.unpack('f f B 3s', value)
    return {'tf': tf, 'idf': idf, 'flag': flag, 'attr': attr.rstrip(b'\x00')}
def _ssegment(self, end):
    """Segment the single-byte run self._txt[self._off:end].

    In order:
    1. a run registered under the 'special' rule is emitted whole ('nz');
    2. a dotted abbreviation such as S.H.E is emitted whole ('nz');
    3. otherwise the run is cut into alphanumeric words ('en') and single
       symbols ('un', dropped when PSCWS4_IGN_SYMBOL is set). A number may
       contain one '.' followed by a digit, a word one "'" followed by a
       letter.
    """
    start = self._off
    wlen = end - start

    # check special words (matched in lower case)
    if wlen > 1:
        lowered = self._txt[start:start + wlen].lower()
        if self._rule_checkbit(lowered, PSCWS4_RULE_SPECIAL):
            self._put_res(start, 9.5, wlen, 'nz')
            return

    txt = self._txt
    tlen = len(txt)

    def peek(pos):
        # Ordinal at *pos*, or 0 past the end of the text, so the one-char
        # lookaheads below cannot raise IndexError.
        return ord(txt[pos]) if pos < tlen else 0

    # check brief words such as S.H.E M.R.
    if self._is_ualpha(ord(txt[start])) and peek(start + 1) == 0x2e:
        ch = start + 2
        while ch < end:
            if not self._is_alpha(ord(txt[ch])):
                break
            ch += 1
            if ch == end or txt[ch] != '.':
                break
            ch += 1
        if ch == end:
            self._put_res(start, 7.5, wlen, 'nz')
            return

    # cut out words and punctuation
    while start < end:
        cx = ord(txt[start])
        start += 1
        if self._is_alnum(cx):
            pflag = PSCWS4_PFLAG_DIGIT if self._is_digit(cx) else 0
            wlen = 1
            while start < end:
                cx = ord(txt[start])
                if pflag & PSCWS4_PFLAG_DIGIT:
                    if not self._is_digit(cx):
                        if ((pflag & PSCWS4_PFLAG_ADDSYM) or cx != 0x2e
                                or not self._is_digit(peek(start + 1))):
                            break
                        pflag |= PSCWS4_PFLAG_ADDSYM
                else:
                    if not self._is_alpha(cx):
                        if ((pflag & PSCWS4_PFLAG_ADDSYM) or cx != 0x27
                                or not self._is_alpha(peek(start + 1))):
                            break
                        pflag |= PSCWS4_PFLAG_ADDSYM
                start += 1
                wlen += 1
                if wlen >= PSCWS4_MAX_EWLEN:
                    break
            self._put_res(start - wlen, 2.5 * math.log(wlen), wlen, 'en')
        elif not (self._mode & PSCWS4_IGN_SYMBOL):
            self._put_res(start - 1, 0, 1, 'un')
def _get_zs(self, i, j=-1):
    """Return the text covered by characters *i*..*j* of the char map (j defaults to i)."""
    last = i if j == -1 else j
    begin = self._zmap[i]['start']
    finish = self._zmap[last]['end']
    return self._txt[begin:finish]
def _mget_word(self, i, j):
    """Return the end index of the longest full word starting at *i* and ending at or before *j*.

    Returns *i* itself when char *i* is not a word head or no such word exists.
    """
    row = self._wmap[i]
    if not (row[i]['flag'] & PSCWS4_ZFLAG_WHEAD):
        return i
    # Scan from the far end: the first hit is the longest word.
    for k in range(j, i, -1):
        cell = row[k]
        if cell and cell['flag'] & PSCWS4_WORD_FULL:
            return k
    return i
def _mset_word(self, i, j):
    """Emit the word spanning characters *i*..*j* and, per mode, its sub-splits.

    Nothing is emitted when the cell is empty, or when symbols are ignored
    and the cell is a non-English 'un' item. In duality mode loose single
    'un' characters are paired with their predecessor. Long words are then
    optionally split into shorter dictionary words (MULTI_SHORT), adjacent
    character pairs (MULTI_DUALITY) and single characters
    (MULTI_ZMAIN / MULTI_ZALL).
    """
    wmap = self._wmap
    zmap = self._zmap
    item = wmap[i][j]
    if (item is False) or ((self._mode & PSCWS4_IGN_SYMBOL)
                           and not (item['flag'] & PSCWS4_ZFLAG_ENGLISH)
                           and item['attr'] == 'un'):
        return

    # automatic pairing of loose single characters
    if self._mode & PSCWS4_DUALITY:
        k = self._zis
        if (i == j and not (item['flag'] & PSCWS4_ZFLAG_ENGLISH)
                and item['attr'] == 'un'):
            self._zis = i
            if k < 0:
                return
            i = k & ~PSCWS4_ZIS_USED
            if (i != (j - 1)) or (not (k & PSCWS4_ZIS_USED) and self._wend == i):
                self._put_res(zmap[i]['start'], wmap[i][i]['idf'],
                              zmap[i]['end'] - zmap[i]['start'], wmap[i][i]['attr'])
                if i != (j - 1):
                    return
            self._zis |= PSCWS4_ZIS_USED
        else:
            if (k >= 0) and (not (k & PSCWS4_ZIS_USED) or (j > i)):
                k &= ~PSCWS4_ZIS_USED
                self._put_res(zmap[k]['start'], wmap[k][k]['idf'],
                              zmap[k]['end'] - zmap[k]['start'], wmap[k][k]['attr'])
            if j > i:
                self._wend = j + 1
            self._zis = -1

    # save the result itself
    self._put_res(zmap[i]['start'], item['idf'],
                  zmap[j]['end'] - zmap[i]['start'], item['attr'])

    # step 1: split long words into shorter words / pairs
    if (j - i) > 1:
        m = i
        if self._mode & PSCWS4_MULTI_SHORT:
            while m < j:
                k = m
                n = m + 1
                while n <= j:
                    if n == j and m == i:
                        break  # that is the whole word again
                    item = wmap[m][n]
                    if item and item['flag'] & PSCWS4_WORD_FULL:
                        k = n
                        self._put_res(zmap[m]['start'], item['idf'],
                                      zmap[n]['end'] - zmap[m]['start'], item['attr'])
                        if not (item['flag'] & PSCWS4_WORD_PART):
                            break
                    n += 1
                if k == m:
                    if m == i:
                        break
                    item = wmap[m][m]
                    self._put_res(zmap[m]['start'], item['idf'],
                                  zmap[m]['end'] - zmap[m]['start'], item['attr'])
                m = k + 1
                if m == j:
                    m -= 1
                    break
        if self._mode & PSCWS4_MULTI_DUALITY:
            while m < j:
                self._put_res(zmap[m]['start'], wmap[m][m]['idf'],
                              zmap[m + 1]['end'] - zmap[m]['start'], wmap[m][m]['attr'])
                m += 1

    # step 2: split into single characters
    if (j > i) and (self._mode & (PSCWS4_MULTI_ZMAIN | PSCWS4_MULTI_ZALL)):
        if (j - i) == 1 and not wmap[i][j]:
            if wmap[i][i]['flag'] & PSCWS4_ZFLAG_PUT:
                i += 1
            else:
                wmap[i][i]['flag'] |= PSCWS4_ZFLAG_PUT
            wmap[j][j]['flag'] |= PSCWS4_ZFLAG_PUT
        # A for loop advances on 'continue'; a hand-incremented while loop
        # would spin forever on the first skipped character.
        for z in range(i, j + 1):
            single = wmap[z][z]
            if single['flag'] & PSCWS4_ZFLAG_PUT:
                continue
            # ZMAIN keeps only characters whose attr starts with j, n or v.
            if (not (self._mode & PSCWS4_MULTI_ZALL)
                    and single['attr'][0:1] not in ('j', 'n', 'v')):
                continue
            self._put_res(zmap[z]['start'], single['idf'],
                          zmap[z]['end'] - zmap[z]['start'], single['attr'])
def _mseg_zone(self, f, t):
    """Choose the best segmentation of the character zone *f*..*t* and emit it.

    Every candidate word (i, j) in the zone seeds a path: the words found
    before and after it by _mget_word() complete the path, whose weight is
    the product of tf * length of its words, boosted when the seed touches
    the zone start (x1.2) or end (x1.4) and divided by (word count - 1)**4.
    The heaviest path is emitted through _mset_word(). A path is a list of
    (end - start) deltas terminated by 0xff.
    """
    weight = 0.0
    nweight = 0.0
    wmap = self._wmap
    mpath = []  # best path so far
    npath = []  # scratch path (separate list object, not an alias)
    x = f
    for i in range(f, t + 1):
        j = self._mget_word(i, t)
        if j == i or j <= x or (wmap[i][j]['flag'] & PSCWS4_WORD_USED):
            continue
        # one word only
        if i == f and j == t:
            mpath = [j - i, 0xff]
            break
        if i != f and (wmap[i][j]['flag'] & PSCWS4_WORD_RULE):
            continue
        # create the new path
        wmap[i][j]['flag'] |= PSCWS4_WORD_USED
        nweight = wmap[i][j]['tf'] * (j - i + 1)
        if i == f:
            nweight *= 1.2
        elif j == t:
            nweight *= 1.4
        if not npath:
            npath = [0xff] * (t - f + 2)

        # look backward
        x = 0
        m = f
        while m < i:
            n = self._mget_word(m, i - 1)
            nweight *= wmap[m][n]['tf'] * (n - m + 1)
            npath[x] = n - m
            x += 1
            if n > m:
                wmap[m][n]['flag'] |= PSCWS4_WORD_USED
            m = n + 1
        # the seed word itself
        npath[x] = j - i
        x += 1
        # look forward
        m = j + 1
        while m <= t:
            n = self._mget_word(m, t)
            nweight *= wmap[m][n]['tf'] * (n - m + 1)
            npath[x] = n - m
            x += 1
            if n > m:
                wmap[m][n]['flag'] |= PSCWS4_WORD_USED
            m = n + 1
        npath[x] = 0xff
        # float() keeps this a true division without relying on a
        # module-level __future__ import
        nweight /= float(pow(x - 1, 4))

        # draw the path for debug
        if self._mode & PSCWS4_DEBUG:
            print("PATH by keyword = {0} (weight={1}):\n".format(self._get_zs(i, j), nweight))
            m = f
            x = 0
            n = npath[x]
            while n != 0xff:
                n += m
                print(self._get_zs(m, n) + '  ')
                m = n + 1
                x += 1
                n = npath[x]
            print("\n--\n")
        x = j

        # keep the better path; the old best becomes the scratch list
        if nweight > weight:
            weight = nweight
            mpath, npath = npath, mpath

    # emit the result (only when a path was found)
    if not mpath:
        return
    m = f
    x = 0
    n = mpath[x]
    while n != 0xff:
        n += m
        self._mset_word(m, n)
        m = n + 1
        x += 1
        n = mpath[x]
def _msegment(self, end, zlen):
    """Segment the multi-byte run self._txt[self._off:end] (core routine).

    zlen is an upper bound on the number of characters in the run. Steps:
    1. build self._zmap (byte range of each character; a run of single-byte
       characters counts as one) and the diagonal of self._wmap;
    2. fill self._wmap[i][j] with every dictionary word spanning i..j;
    3. apply the prefix / suffix rules in self._rd to one- and
       two-character keys to recognise extra words;
    4. walk the characters, emitting stand-alone ones through _mset_word()
       and each connected zone of word heads / parts through _mseg_zone().
    """
    self._wmap = [[False] * zlen for _ in range(zlen)]
    self._zmap = [False] * zlen
    wmap = self._wmap
    zmap = self._zmap
    txt = self._txt
    start = self._off
    self._zis = -1

    # ---- 1. load the zmap ----
    i = 0
    while start < end:
        clen = self._ztab[ord(txt[start])]
        if clen == 1:
            # Swallow the whole single-byte run, but never look at or past
            # `end`: that would over-count the run and can index beyond the
            # text.
            start += 1
            while start < end and self._ztab[ord(txt[start])] == 1:
                start += 1
                clen += 1
            wmap[i][i] = {'tf': 0.5, 'idf': 0, 'flag': PSCWS4_ZFLAG_ENGLISH, 'attr': 'un'}
        else:
            query = self._dict_query(txt[start:start + clen])
            if not query:
                wmap[i][i] = {'tf': 0.5, 'idf': 0, 'flag': 0, 'attr': 'un'}
            else:
                if query['attr'][0:1] == '#':
                    query['flag'] |= PSCWS4_ZFLAG_SYMBOL
                wmap[i][i] = query
            start += clen
        zmap[i] = {'start': start - clen, 'end': start}
        i += 1

    # the real number of characters
    zlen = i

    # ---- 2. create the word query table ----
    for i in range(zlen):
        k = 0
        j = i + 1
        while j < zlen:
            query = self._dict_query(self._get_zs(i, j))
            if not query:
                break
            ch = query['flag']
            if ch & PSCWS4_WORD_FULL:
                wmap[i][j] = query
                wmap[i][i]['flag'] |= PSCWS4_ZFLAG_WHEAD
                for k in range(i + 1, j + 1):
                    wmap[k][k]['flag'] |= PSCWS4_ZFLAG_WPART
                k = j + 1
            if not (ch & PSCWS4_WORD_PART):
                break
            j += 1
        if k:
            k -= 1  # end index of the longest full word found
            # mark the head of a short (two-char) person name
            if k == (i + 1):
                if wmap[i][k]['attr'] == 'nr':
                    wmap[i][i]['flag'] |= PSCWS4_ZFLAG_NR2
            # clean the PART flag of the last word
            if k < j:
                wmap[i][k]['flag'] ^= PSCWS4_WORD_PART

    def match_len(r1, base, step, clen, min_guard):
        """Count the chars next to *base* (direction *step*) accepted by rule *r1*.

        First the minimum *clen* characters must all pass, then the match is
        extended up to zmax. Returns the final count, or None when the
        minimum is not met.
        """
        ch = 1
        while ch <= clen:
            j = base + step * ch
            if j < 0 or j >= zlen or min_guard(wmap[j][j]['flag']):
                break
            if not self._rule_check(r1, self._get_zs(j)):
                break
            ch += 1
        if ch <= clen:
            return None
        # no limit on znum, or limited to a range
        j = base + step * ch
        while True:
            if (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and clen >= r1['zmax']):
                break
            if j < 0 or j >= zlen or self._no_rule2(wmap[j][j]['flag']):
                break
            if not self._rule_check(r1, self._get_zs(j)):
                break
            clen += 1
            j += step
        return clen

    # ---- 3. ruleset match: names, zones, chinese numerals ----
    if len(self._rd) > 0:
        # 3a. 'one word' keys. A while loop is used because a match moves
        # the index past the recognised word; assigning to a for-loop
        # variable would have no effect on the iteration.
        i = -1
        while i + 1 < zlen:
            i += 1
            if self._no_rule1(wmap[i][i]['flag']):
                continue
            r1 = self._rule_get(self._get_zs(i))
            if not r1:
                continue
            clen = r1['zmin'] if r1['zmin'] > 0 else 1
            if (r1['flag'] & PSCWS4_ZRULE_PREFIX) and (i < (zlen - clen)):
                clen = match_len(r1, i, 1, clen, self._no_rule2)
                if clen is None:
                    continue
                # a former two-char name that is still two chars long
                if wmap[i][i]['flag'] & PSCWS4_ZFLAG_NR2:
                    if clen == 1:
                        continue
                    wmap[i][i + 1]['flag'] |= PSCWS4_WORD_PART
                # ok, got: i & clen
                k = i + clen
                wmap[i][k] = {'tf': r1['tf'], 'idf': r1['idf'],
                              'flag': (PSCWS4_WORD_RULE | PSCWS4_WORD_FULL),
                              'attr': r1['attr']}
                wmap[i][i]['flag'] |= PSCWS4_ZFLAG_WHEAD
                for j in range(i + 1, k + 1):
                    wmap[j][j]['flag'] |= PSCWS4_ZFLAG_WPART
                if not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WPART):
                    i = k
                continue
            if (r1['flag'] & PSCWS4_ZRULE_SUFFIX) and (i >= clen):
                # suffix: check the characters before
                clen = match_len(r1, i, -1, clen, self._no_rule1)
                if clen is None:
                    continue
                # ok, got: i & clen (maybe clen=1 & [k][i] is set)
                k = i - clen
                if wmap[k][i] is not False:
                    continue
                wmap[k][i] = {'tf': r1['tf'], 'idf': r1['idf'],
                              'flag': PSCWS4_WORD_FULL, 'attr': r1['attr']}
                wmap[k][k]['flag'] |= PSCWS4_ZFLAG_WHEAD
                for j in range(k + 1, i + 1):
                    wmap[j][j]['flag'] |= PSCWS4_ZFLAG_WPART
                    # only existing shorter words can become a PART
                    if (j != i) and (wmap[k][j] is not False):
                        wmap[k][j]['flag'] |= PSCWS4_WORD_PART
                continue

        # 3b. 'two words' keys (two-char surnames, "... West Road"), from
        # the end towards the start
        i = zlen - 1
        while i > 0:
            i -= 1
            # a set cell always has WORD_FULL, no need to check it again
            if (wmap[i][i + 1] is False) or (wmap[i][i + 1]['flag'] & PSCWS4_WORD_PART):
                continue
            k = i + 1
            r1 = self._rule_get(self._get_zs(i, k))
            if not r1:
                continue
            clen = r1['zmin'] if r1['zmin'] else 1
            if (r1['flag'] & PSCWS4_ZRULE_PREFIX) and (k < (zlen - clen)):
                clen = match_len(r1, k, 1, clen, self._no_rule2)
                if clen is None:
                    continue
                # ok, got: i & clen
                k = k + clen
                wmap[i][k] = {'tf': r1['tf'], 'idf': r1['idf'],
                              'flag': PSCWS4_WORD_FULL, 'attr': r1['attr']}
                wmap[i][i + 1]['flag'] |= PSCWS4_WORD_PART
                for j in range(i + 2, k + 1):
                    wmap[j][j]['flag'] |= PSCWS4_ZFLAG_WPART
                i -= 1
                continue
            if (r1['flag'] & PSCWS4_ZRULE_SUFFIX) and (i >= clen):
                # suffix: check the characters before
                clen = match_len(r1, i, -1, clen, self._no_rule1)
                if clen is None:
                    continue
                # ok, got: i & clen (maybe clen=1 & [k][i] is set)
                k = i - clen
                i = i + 1
                wmap[k][i] = {'tf': r1['tf'], 'idf': r1['idf'],
                              'flag': PSCWS4_WORD_FULL, 'attr': r1['attr']}
                wmap[k][k]['flag'] |= PSCWS4_ZFLAG_WHEAD
                for j in range(k + 1, i + 1):
                    wmap[j][j]['flag'] |= PSCWS4_ZFLAG_WPART
                    if wmap[k][j] is not False:
                        wmap[k][j]['flag'] |= PSCWS4_WORD_PART
                i -= (clen + 1)
                continue

    # ---- 4. do the real segmentation: find the easy break points ----
    j = 0
    for i in range(zlen):
        if wmap[i][i]['flag'] & PSCWS4_ZFLAG_WPART:
            continue
        if i > j:
            self._mseg_zone(j, i - 1)
        j = i
        if not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WHEAD):
            self._mset_word(i, i)
            j += 1
    # the last zone always runs to the final character
    if zlen > j:
        self._mseg_zone(j, zlen - 1)

    # a pending single character left over by duality mode
    if ((self._mode & PSCWS4_DUALITY) and (self._zis >= 0)
            and not (self._zis & PSCWS4_ZIS_USED)):
        i = self._zis
        self._put_res(zmap[i]['start'], wmap[i][i]['idf'],
                      zmap[i]['end'] - zmap[i]['start'], wmap[i][i]['attr'])
831
def test(text):
    """Time 100 segmentation passes over *text* with the global ``cws``, then print its top words."""
    started = time.time()
    for _ in range(100):
        cws.send_text(text)
        while cws.get_result():
            pass

    ret = cws.get_tops(10, 'r,v,p')
    print("No.\tWord\t\t\tAttr\tTimes\tRank\n------------------------------------------------------\n")
    for rank, tmp in enumerate(ret, 1):
        print("%02d.\t%-8s\t%s\t%d\t%.2f" % (rank, tmp['word'].decode('gbk'), tmp['attr'], tmp['times'], tmp['weight']))

    # elapsed time in seconds
    print(u'所花时间: %s' % (time.time() - started))
if __name__ == '__main__':
    # Demo: segment a sample paragraph and print one line per result batch.
    cws = PSCWS4('gbk')
    cws.set_dict('dict.xdb', True)
    cws.set_rule('rules.ini')
    cws.send_text("""中国航天官员应邀到美国与太空总署官员开会 发展中国家 上海大学城书店 表面的东西 今天我买了一辆面的,于是我坐着面的去上班 化妆和服装 这个门把手坏了,请把手拿开 将军任命了一名中将,产量三年中将增长两倍 王军虎去广州了,王军虎头虎脑的 欧阳明练功很厉害可是马明练不厉害 北京华烟云 人中出吕布 马中出赤兔Q1,中我要买Q币充值""")
    cws.set_igonre(False)   # whether to ignore symbols and useless characters
    cws.set_multi(3)        # compound segmentation level (0..15)
    cws.set_duality(True)   # whether to pair loose single characters
    # cws.set_debug(True) turns on path tracing; test(text) runs a timing pass.

    while True:
        batch = cws.get_result()
        if not batch:
            break
        line = ''
        for w in batch:
            token = w['word']
            if token == "\r":
                continue
            if token == "\n":
                line = line.rstrip(' ') + "\n"
            else:
                line += token + " "
        print(line)
xdb_r.py文件源码:
view source
001 #coding=gbk
002 import os
003 import struct
004 import sys
005 reload(sys)
006 sys.setdefaultencoding('gbk')
XDB_VERSION = 34      # 0x01 ~ 0xff
XDB_TAGNAME = b'XDB'  # first bytes of a valid file (bytes literal == str on Python 2)
XDB_MAXKLEN = 0xf0    # maxklen: < 255


class XDB_R(object):
    """Read-only accessor for an XDB dictionary file.

    Layout as read by this class: a 32-byte header, a hash table of 8-byte
    (offset, length) slots starting at byte 32, and records forming a binary
    tree per slot. Each record is 4 unsigned ints (left offset/length, right
    offset/length), one key-length byte, the key and then the value.
    With mem=True the whole file is read into memory on Open().
    """

    fd = False        # open file object, or False when closed
    hash_base = 0
    hash_prime = 0
    memread = None    # file content (memory mode only)
    mem = False       # whether memory mode is enabled
    off = 0           # current read position (memory mode only)
    len = 0           # length of the in-memory content

    def __init__(self, mem=False):
        self.mem = mem

    def __del__(self):
        self.Close()

    def Open(self, fpath):
        """Open the database at *fpath*; return True or raise Exception."""
        self.Close()
        name = os.path.basename(fpath)
        try:
            fd = open(fpath, 'rb')
        except IOError:
            raise Exception('XDB::Open("' + name + '"),invalid xdb failed.')
        if self._check_header(fd) is False:
            # close before raising so a bad file does not leak the handle
            fd.close()
            raise Exception('XDB::Open("' + name + '"),invalid xdb format.')
        if self.mem:
            fd.seek(0, os.SEEK_SET)
            self.memread = fd.read()
            self.len = len(self.memread)
        self.fd = fd
        return True

    def _read(self, size):
        """Return *size* bytes from the current position."""
        if self.mem:
            return self.memread[self.off:self.off + size]
        return self.fd.read(size)

    def _seek(self, seek, flag=os.SEEK_SET):
        """Move the read position to *seek*."""
        if self.mem:
            # validate the requested target, not the previous position
            if seek < 0 or seek > self.len:
                raise Exception('Mem offset !')
            self.off = seek
        else:
            self.fd.seek(seek, flag)

    def _close(self):
        """Release the in-memory copy and the file handle (in both modes)."""
        self.memread = None
        self.fd.close()
        self.fd = False

    def Get(self, key):
        """Return the value stored for *key* (a byte string), or False when absent."""
        if self.fd is False:
            raise Exception('XDB:Get(), null db handler.')
        klen = len(key)
        if klen == 0 or klen > XDB_MAXKLEN:
            return False
        rec = self._get_record(key)
        if 'vlen' not in rec or rec['vlen'] == 0:
            return False
        return rec['value']

    def Close(self):
        """Close the database; a no-op when nothing is open."""
        if self.fd is False:
            return
        self._close()

    def _get_index(self, key):
        """Return the hash slot of *key*: h = h*33 ^ byte, last byte first, kept to 31 bits."""
        h = self.hash_base
        # bytearray yields integers on Python 2 and 3 alike
        for byte in reversed(bytearray(key)):
            h += (h << 5)
            h ^= byte
            h &= 0x7fffffff
        return h % self.hash_prime

    def _check_header(self, fd):
        """Validate the 32-byte header of *fd* and load the hash parameters.

        Returns False for a short header, a wrong tag or a file size that
        differs from the recorded one; True otherwise.
        """
        fd.seek(0, os.SEEK_SET)
        buf = fd.read(32)
        if len(buf) != 32:
            return False
        tag, ver, base, prime, fsize, _check, _reserved = struct.unpack('3s B I I I f 12s', buf)
        if tag != XDB_TAGNAME:
            return False
        if os.fstat(fd.fileno()).st_size != fsize:
            return False
        self.hash_base = base
        self.hash_prime = prime
        self.version = ver
        self.fsize = fsize
        return True

    def _get_record(self, key):
        """Locate the record of *key* through its hash slot."""
        self._io_times = 1
        index = self._get_index(key) if self.hash_prime > 1 else 0
        poff = index * 8 + 32
        self._seek(poff, os.SEEK_SET)
        buf = self._read(8)
        if len(buf) == 8:
            off, length = struct.unpack('I I', buf)
        else:
            off = length = 0
        return self._tree_get_record(off, length, poff, key)

    def _tree_get_record(self, off, len, poff=0, key=''):
        """Walk the record tree rooted at (*off*, *len*) looking for *key*.

        Returns the full record dict (including 'value') on a hit, or
        {'poff': ...} naming the pointer slot where the key would be linked.
        An empty *key* matches the first record visited.
        """
        while True:
            if len == 0:
                return {'poff': poff}
            self._io_times += 1
            self._seek(off, os.SEEK_SET)
            buf = self._read(min(XDB_MAXKLEN + 17, len))
            loff, llen, roff, rlen, klen = struct.unpack('I I I I B', buf[0:17])
            fkey = buf[17:17 + klen]
            if key and key != fkey:
                if key > fkey:
                    # right child; its pointer sits 8 bytes into this record
                    poff = off + 8
                    off, len = roff, rlen
                else:
                    poff = off
                    off, len = loff, llen
                continue
            rec = {'loff': loff, 'llen': llen, 'roff': roff, 'rlen': rlen,
                   'klen': klen, 'poff': poff, 'off': off, 'len': len,
                   'voff': off + 17 + klen, 'vlen': len - 17 - klen,
                   'key': fkey}
            self._seek(rec['voff'], os.SEEK_SET)
            rec['value'] = self._read(rec['vlen'])
            return rec
143 #
144
145 #aa = XDB_R(True)
146 #aa.Open('./dict.xdb')
147 #aab = aa.Get('上海')
148 #print aab