pyscws4 是一个python的分词程序

pyscws4 是一个python的分词程序 | mei year-美叶 专注思想。

     pyscws4 是一个python的分词程序
    Posted on 2012 年 11 月 15 日 by dingyangfan   

    注意:pyscws4 是一个 python 的分词程序,抄袭自马明练开发的 php 版 pscws4,地址是:http://www.ftphp.com/scws/ 。

    翻译了两个文件:

    1. pscws4.php

    2. xdb_r.php

    希望高人可以帮我优化一下代码

    规则文件和词典下载:

    分词.tar

    pyscws4.py文件源码:
    view source
    001    #coding=gbk
    002    from __future__ import division
    003    from collections import OrderedDict
    004    from xdb_r import XDB_R
    005    import math ,struct,copy
    006    import sys,time
    007    reload(sys)
    008    sys.setdefaultencoding('gbk')
    009    ''' defines for ruleset '''
    010    PSCWS4_RULE_MAX     = 31    # just 31, PHP do not support unsigined Int
    011    PSCWS4_RULE_SPECIAL=    0x80000000
    012    PSCWS4_RULE_NOSTATS=    0x40000000
    013    PSCWS4_ZRULE_NONE=  0x00
    014    PSCWS4_ZRULE_PREFIX=    0x01
    015    PSCWS4_ZRULE_SUFFIX=    0x02
    016    PSCWS4_ZRULE_INCLUDE=   0x04    # with include
    017    PSCWS4_ZRULE_EXCLUDE=   0x08    # with exclude
    018    PSCWS4_ZRULE_RANGE =    0x10    # with znum range
    019     
    020    ''' defines for mode of scws <= 0x800 '''
    021    PSCWS4_IGN_SYMBOL=  0x01
    022    PSCWS4_DEBUG=           0x02
    023    PSCWS4_DUALITY=     0x04
    024     
    025    ''' multi segment policy >= 0x1000 '''
    026    PSCWS4_MULTI_NONE=    0x0000        # nothing
    027    PSCWS4_MULTI_SHORT= 0x1000      # split long words to short words from left to right
    028    PSCWS4_MULTI_DUALITY=   0x2000      # split every long words(3 chars?) to two chars
    029    PSCWS4_MULTI_ZMAIN=   0x4000        # split to main single chinese char atr = j¦a¦n?¦v?
    030    PSCWS4_MULTI_ZALL=  0x8000      # attr = ** , all split to single chars
    031    PSCWS4_MULTI_MASK=  0xf000      # mask check for multi set
    032    PSCWS4_ZIS_USED=        0x8000000
    033     
    034    ''' single bytes segment flag (纯单字节字符) '''
    035    PSCWS4_PFLAG_WITH_MB=   0x01
    036    PSCWS4_PFLAG_ALNUM= 0x02
    037    PSCWS4_PFLAG_VALID= 0x04
    038    PSCWS4_PFLAG_DIGIT= 0x08
    039    PSCWS4_PFLAG_ADDSYM=    0x10
    040     
    041    ''' constant var define '''
    042    PSCWS4_WORD_FULL=       0x01    # 多字: 整词
    043    PSCWS4_WORD_PART=       0x02    # 多字: 前词段
    044    PSCWS4_WORD_USED=       0x04    # 多字: 已使用
    045    PSCWS4_WORD_RULE=       0x08    # 多字: 自动识别的
    046     
    047    PSCWS4_ZFLAG_PUT=       0x02    # 单字: 已使用
    048    PSCWS4_ZFLAG_N2=        0x04    # 单字: 双字名词头
    049    PSCWS4_ZFLAG_NR2=       0x08    # 单字: 词头且为双字人名
    050    PSCWS4_ZFLAG_WHEAD= 0x10    # 单字: 词头
    051    PSCWS4_ZFLAG_WPART= 0x20    # 单字: 词尾或词中
    052    PSCWS4_ZFLAG_ENGLISH=   0x40    # 单字: 夹在中间的英文
    053    PSCWS4_ZFLAG_SYMBOL=    0x80    # 单字: 符号系列
    054     
    055    PSCWS4_MAX_EWLEN=       16
    056    PSCWS4_MAX_ZLEN=        128
    057     
    058    class PSCWS4(object):
    059        _xd = None  # xdb dict handler
    060        _rs = None      # ruleset resource
    061        _rd = None      # ruleset data
    062        _cs = ''    # charset
    063        _ztab = []      # zi len table
    064        _mode = 0   # scws mode
    065        _txt = None     # text string
    066        _res = None
    067        _zis = None     # z if used?(duality)
    068        _off = 0
    069        _len = 0
    070        _wend = 0
    071        _wmap = []
    072        _zmap = []
    073        i = 0
    074     
    075        def __init__(self,charset='gbk'):
    076            self._xd = False
    077            self._rs = self._rd = OrderedDict()
    078            self.set_charset(charset)
    079        def __del__(self):
    080            self.close()
    081        def debug(self):
    082            print "off:{0} len(_res):{1} len(_wmap):{2}\
    083     len(_zmap):{3} _wend:{4} _zis:{5}\
    084     len(_rs):{6} len(_rd):{7}\
    085            ".format(\
    086            self._off,len(self._res),len(self._wmap),len(self._zmap),self._wend,self._zis,\
    087            len(self._rs),len(self._rd)
    088            )
    089        #设置字符集(ztab)
    090        def set_charset(self,charset='gbk'):
    091            charset = charset.strip().lower()
    092            if(charset != self._cs):
    093                self._cs = charset
    094                self._ztab = [1 for i in range(0,0x81)]
    095                if(charset == 'utf-8' or charset == 'utf8'):
    096                    self._ztab.extend([1 for i in range(0x81,0xc0)])
    097                    self._ztab.extend([2 for i in range(0xc0,0xe0)])
    098                    self._ztab.extend([3 for i in range(0xe0,0xf0)])
    099                    self._ztab.extend([4 for i in range(0xf0,0xf8)])
    100                    self._ztab.extend([5 for i in range(0xf8,0xfc)])
    101                    self._ztab.extend([6 for i in range(0xfc,0xfe)])
    102                    self._ztab.extend([1])
    103                else:
    104                    self._ztab.extend([2 for i in range(0x81,0xff)])
    105                self._ztab.extend([1])
    106                #print len(self._ztab)
    107            # 设置词典
    108        def set_dict(self,fpath,mem=False):
    109            xdb = XDB_R(mem)
    110            if(xdb.Open(fpath) is not True): return False
    111            self._xd = xdb
    112        #设置规则集
    113        def set_rule(self,fpath):
    114            self._rule_load(fpath)
    115        #设置忽略符号与无用字符
    116        def set_igonre(self,yes):
    117            if(yes is True):self._mode ¦= PSCWS4_IGN_SYMBOL
    118            else: self._mode &= ~PSCWS4_IGN_SYMBOL
    119        #设置复合分词等级 ($level = 0,15)
    120        def set_multi(self,level):
    121            level = (int(level) << 12)
    122            self._mode &= ~PSCWS4_MULTI_MASK
    123            if(level & PSCWS4_MULTI_MASK): self._mode ¦= level
    124        #设置是否显示分词调试信息
    125        def set_debug(self,yes):
    126            if(yes is True): self._mode ¦= PSCWS4_DEBUG
    127            else:self._mode &= ~PSCWS4_DEBUG
    128        #设置是否自动将散字二元化
    129        def set_duality(self,yes):
    130            if(yes is True): self._mode ¦= PSCWS4_DUALITY
    131            else:self._mode &= ~PSCWS4_DUALITY
    132        # 设置要分词的文本字符串
    133        def send_text(self,text):
    134            self._txt = str(text)
    135            self._len = len(self._txt)
    136            self._off =0
    137        # 取回一批分词结果(需要多次调用, 直到返回 false)
    138        def get_result(self):
    139            off = self._off
    140            tlen = self._len
    141            txt = self._txt
    142            self._res = []
    143     
    144            while  ((off < tlen) and (ord(txt[off])<=0x20)):
    145                if(txt[off] == "\r" or txt[off] == "\n"):
    146                    self._off = off +1
    147                    self._put_res(off,0,1,'un')
    148                    return self._res
    149                off +=1
    150            if(off >= tlen): return False
    151            self._off = off
    152            ch = txt[off]
    153            cx = ord(ch)
    154            if(self._char_token(ch)):
    155                self._off +=1
    156                self._put_res(off,0,1,'un')
    157                return self._res
    158            clen = self._ztab[cx]
    159     
    160            zlen = 1
    161            pflag = (PSCWS4_PFLAG_WITH_MB if clen >1 else (PSCWS4_PFLAG_ALNUM if self._is_alnum(cx) else 0))
    162            off = (off + clen)
    163            while off < tlen:
    164                ch = txt[off]
    165                cx = ord(ch)
    166                if (cx <= 0x20 or self._char_token(ch)):break
    167                clen = self._ztab[cx]
    168                if(not (pflag & PSCWS4_PFLAG_WITH_MB)):
    169                    if(clen ==1):
    170                        if((pflag & PSCWS4_PFLAG_ALNUM) and not self._is_alnum(cx)):
    171                            pflag ^= PSCWS4_PFLAG_ALNUM
    172                    else:
    173                        if(not ((pflag & PSCWS4_PFLAG_ALNUM) ) or zlen > 2): break
    174                        pflag  ¦= PSCWS4_PFLAG_WITH_MB
    175                elif ( ((pflag & PSCWS4_PFLAG_WITH_MB) ) and clen ==1):
    176                    #mb + single-byte. allowd: alpha+num + 中文
    177                    if(not self._is_alnum(cx)): break
    178                    pflag &= ~PSCWS4_PFLAG_VALID
    179                    i = off+1
    180                    while i<(off+3):
    181                        ch = txt[i]
    182                        cx = ord(ch)
    183                        if( (i >= tlen) or (cx <=0x20) or (self._ztab[cx] > 1)):
    184                            pflag ¦= PSCWS4_PFLAG_VALID
    185                            break
    186                        if(not self._is_alnum(cx)): break
    187                        i+=1
    188                    if( not(pflag & PSCWS4_PFLAG_VALID) ): break
    189                    clen += (i - off -1)
    190                #add max zlen limit
    191                zlen +=1
    192                if(zlen >=PSCWS4_MAX_ZLEN):break
    193                off = (off + clen)
    194     
    195            #处理半个字的问题
    196            ch =off
    197            if (ch > tlen):
    198                off -= clen
    199            #do the real segment
    200            if(off <= self._off):
    201                return False
    202            elif ( pflag & PSCWS4_PFLAG_WITH_MB ):
    203                self._msegment(off,zlen)
    204            elif ( not(pflag & PSCWS4_PFLAG_ALNUM)  or ((off - self._off) >=PSCWS4_MAX_EWLEN ) ):
    205                self._ssegment(off)
    206            else:
    207                zlen = off -self._off
    208                self._put_res(self._off,2.5*math.log(zlen),zlen,'en')
    209            self._off = (tlen if ch > tlen else off)
    210            if(len(self._res) == 0): return self.get_result()
    211            return self._res
    212        def get_tops(self,limit = 10,xattr = ''):
    213            ret = {}
    214            if(self._txt is None): return False
    215            xmode = False
    216            attrs = {}
    217            if(xattr != ''):
    218                if(xattr[0:1] == '~'):
    219                    xattr = xattr[1:]
    220                    xmode = 1
    221                for tmp in xattr.split(','):
    222                    tmp = tmp.strip().lower()
    223                    if( tmp != ''): attrs[tmp] = True
    224            off = self._off
    225            self._off = cnt = 0
    226            tlist = {}
    227            while 1:
    228                tmpa = self.get_result()
    229                if (not tmpa): break
    230                for tmp in tmpa:
    231                    #有改
    232                    if(tmp['idf'] < 0.2 or tmp['attr'][0:1] == '#'): continue
    233                    if(len(attrs) >0):
    234                        if(xmode == True and not attrs.has_key(tmp['attr'])): continue
    235                        if(xmode == False and attrs.has_key(tmp['attr'])): continue
    236                    word = tmp['word'].lower()
    237                    if(self._rule_checkbit(word,PSCWS4_RULE_NOSTATS)): continue
    238                    if(tlist.has_key(word)):
    239                        tlist[word]['weight'] += tmp['idf']
    240                        tlist[word]['times'] +=1
    241                    else:
    242                        tlist[word] = {'word':tmp['word'],'times':1,'weight':tmp['idf'],'attr':tmp['attr']}
    243            self._off = off
    244            t= sorted(tlist.values(),key=lambda d:d['weight'],cmp=lambda a,b: 1 if b > a else -1)
    245            return t[0:limit]
    246        def close(self):
    247            if(self._xd):
    248                self._xd.Close()
    249                self._xd = False
    250            self._rd = []
    251            self._rs = []
    252        def version(self):
    253            return 'pySCWS/1.0 - by donghongyi'
    254        def _rule_load(self,fpath):
    255            try:
    256                fd = file(fpath,'r')
    257            except IOError:
    258                return False
    259            i = j = 0
    260            self._rs = OrderedDict()
    261            while 1:
    262                buf = fd.readline()
    263                if not buf:
    264                    break
    265                if (buf[0:1] != '['): continue
    266                pos = buf.find(']')
    267                if(pos == -1 or pos ==1 or pos > 15):continue
    268                key = buf[1:pos].lower()
    269                if(self._rs.has_key(key)): continue
    270                item = {'tf':5.0, 'idf':3.5, 'attr':'un', 'bit':0, 'flag':0, 'zmin':0, 'zmax':0, 'inc':0, 'exc':0}
    271                if(key == 'special'):
    272                    item['bit'] = PSCWS4_RULE_SPECIAL
    273                elif (key == 'nostats'):
    274                    item['bit'] = PSCWS4_RULE_NOSTATS
    275                else:
    276                    item['bit'] = (1 << j)
    277                    j +=1
    278                self._rs[key] = item
    279                #这里可能是错误
    280                i +=1
    281                if(i >=PSCWS4_RULE_MAX): break
    282            #load the ruleset
    283            fd.seek(0)
    284            rbl = False
    285            item= {}
    286            while 1:
    287                buf = fd.readline()
    288                if not buf:
    289                    break
    290                ch = buf[0:1]
    291                if(ch == ';'): continue
    292                if(ch == '['):
    293                    item = {}
    294                    pos = buf.find(']')
    295                    if(pos > 1):
    296                        key = buf[1:pos].lower()
    297                        if(self._rs.has_key(key)):
    298                            rbl = True
    299                            item = self._rs[key]
    300                    continue
    301                if(ch == ':'):
    302                    buf = buf[1:]
    303                    pos = buf.find('=')
    304                    if(pos == -1):
    305                        continue
    306                    pkey,pval = buf.split('=',2)
    307                    pkey = pkey.strip()
    308                    pval = pval.strip()
    309                    if(pkey == 'line'):    rbl = False if pval[0:1].strip() == 'n' else True
    310                    elif (pkey =='tf'):    item['tf'] = float(pval)
    311                    elif (pkey =='idf'):    item['idf'] = float(pval)
    312                    elif (pkey =='attr'):    item['attr'] = pval
    313                    elif (pkey == 'znum'):
    314                        pos = pval.find(',')
    315                        if(pos > -1):
    316                            item['zmax'] = int(pval[pos+1:].strip())
    317                            item['flag'] ¦= PSCWS4_ZRULE_RANGE
    318                            pval = pval[0:pos]
    319                        item['zmin'] = int(pval)
    320                    elif (pkey == 'type'):
    321                        if(pval == 'prefix'):
    322                            item['flag'] ¦= PSCWS4_ZRULE_PREFIX
    323                        if(pval == 'suffix'):
    324                            item['flag'] ¦= PSCWS4_ZRULE_SUFFIX
    325                    elif (pkey == 'include' or pkey =='exclude'):
    326                        clude = 0
    327                        for tmp in pval.split(','):
    328                            tmp = tmp.strip().lower()
    329                            if(not self._rs.has_key(tmp)): continue
    330                            clude ¦= self._rs[tmp]['bit']
    331                        if(pkey == 'include'):
    332                            item['inc'] ¦= clude
    333                            item['flag'] ¦= PSCWS4_ZRULE_INCLUDE
    334                        else:
    335                            item['exc'] ¦= clude
    336                            item['flag'] ¦=PSCWS4_ZRULE_EXCLUDE
    337                    continue
    338                if(item == {}): continue
    339                buf = buf.strip()
    340                if (buf == ''): continue
    341                if(rbl):
    342                    self._rd[buf] = item
    343                else:
    344                    tlen = len(buf)
    345                    off =0
    346                    while off < tlen:
    347                        tord = ord(buf[off:off+1])
    348                        zlen = self._ztab[tord]
    349                        if( off + zlen >= tlen): break
    350                        zch = buf[off:off+zlen]
    351                        self._rd[zch] = item
    352                        off += zlen
    353        #get the ruleset
    354        def _rule_get(self,str):
    355            if(not self._rd.has_key(str)): return False
    356            return self._rd[str]
    357        #check the bit with str
    358        def _rule_checkbit(self,str,bit):
    359            if(not self._rd.has_key(str)): return False
    360            bit2 = self._rd[str]['bit']
    361            return (True if (bit & bit2) else False)
    362        #check the rule include ¦ exclude
    363        def _rule_check(self,rule,str):
    364            if( (rule['flag'] & PSCWS4_ZRULE_INCLUDE) and not self._rule_checkbit(str,rule['bit'])): return  False
    365            if( (rule['flag'] & PSCWS4_ZRULE_EXCLUDE) and self._rule_checkbit(str,rule['bit'])): return False
    366            return True
    367        #bulid res
    368        def _put_res(self,o,i,l,a):
    369            word = self._txt[o:o+l]
    370            item = {'word':word,'off':o,'idf':i,'len':l,'attr':a}
    371            self._res.append(item)
    372        #alpha, numeric check by ORD value
    373        def _is_alnum(self,c):
    374            return ((c>=48 and c<=57) or (c>=65 and c<=90) or (c>=97 and c<=122))
    375        def _is_alpha(self,c):
    376            return ((c>=65 and c<=90) or ( c>=97 and c<=122))
    377        def _is_ualpha(self,c):
    378            return (c>=65 and c<=90)
    379        def _is_digit(self,c):
    380            return (c>=48 and c<=57)
    381        def _no_rule1(self,f):
    382            return ((f & (PSCWS4_ZFLAG_SYMBOL¦PSCWS4_ZFLAG_ENGLISH)) or ((f & (PSCWS4_ZFLAG_WHEAD¦PSCWS4_ZFLAG_NR2)) == PSCWS4_ZFLAG_WHEAD))
    383        def _no_rule2(self,f):
    384            return self._no_rule1(f)
    385        def _char_token(self,c):
    386            return (c=='('or c==')'or c=='['or c==']'or c=='{'or c=='}'or c==':'or c=='"')
    387        # query the dict
    388        def _dict_query(self,word):
    389            if(not self._xd): return False
    390            value = self._xd.Get(word)
    391            if(not value): return False
    392            tmp = struct.unpack('f f B 3s',value)
    393            return {'tf':tmp[0],'idf':tmp[1],'flag':tmp[2],'attr':tmp[3].rstrip(b'\x00')}
    394        #ssegment, 单字节用语切割
    395        def _ssegment(self,end):
    396            start = self._off
    397            wlen = end - start
    398            #check special words (need strtoupper)
    399            if(wlen > 1):
    400                #可能出错
    401                txt = self._txt[start:start+wlen].lower()
    402                if(self._rule_checkbit(txt,PSCWS4_RULE_SPECIAL)):
    403                    self._put_res(start,9.5,wlen,'nz')
    404                    return
    405            txt = self._txt
    406            #check brief words such as S.H.E M.R.
    407            if( self._is_ualpha(ord(txt[start])) and txt[start+1] == '.'):
    408                #修改
    409                ch = start +2
    410                while ch< end:
    411                    if(not self._is_alpha(ord(txt[ch]))): break
    412                    ch +=1
    413                    if(ch == end or txt[ch] != '.'): break
    414                    ch +=1
    415                if(ch == end):
    416                    self._put_res(start,7.5,wlen,'nz')
    417                    return
    418            #取出单词及标点. 数字允许一个点且下一个为数字,不连续的. 字母允许一个不连续的'
    419            #print 1111
    420            while start < end:
    421                #修改过的
    422                ch = txt[start]
    423                start +=1
    424                cx = ord(ch)
    425                if(self._is_alnum(cx)):
    426                    pflag =PSCWS4_PFLAG_DIGIT if self._is_digit(cx) else 0
    427                    wlen = 1
    428                    while start < end:
    429                        ch = txt[start]
    430                        cx = ord(ch)
    431                        if(pflag & PSCWS4_PFLAG_DIGIT):
    432                            if(not self._is_digit(cx)):
    433                                if( (pflag & PSCWS4_PFLAG_ADDSYM) or cx !=0x2e or not self._is_digit(ord(txt[start+1]))):
    434                                    break
    435                                pflag ¦= PSCWS4_PFLAG_ADDSYM
    436                        else:
    437                            if(not self._is_alpha(cx)):
    438                                if( (pflag & PSCWS4_PFLAG_ADDSYM) or cx !=0x27 or not self._is_alpha(ord(txt[start+1]))):
    439                                    break
    440                                pflag ¦= PSCWS4_PFLAG_ADDSYM
    441                        start +=1
    442                        #可能出错
    443                        wlen +=1
    444                        if(wlen >=PSCWS4_MAX_EWLEN): break
    445                    self._put_res(start-wlen,2.5*math.log(wlen),wlen,'en')
    446                elif (not(self._mode & PSCWS4_IGN_SYMBOL)):
    447                    self._put_res(start-1,0,1,'un')
    448        #get one z by ZMAP
    449        def _get_zs(self,i,j = -1):
    450            if(j == -1): j = i
    451            return self._txt[self._zmap[i]['start']:self._zmap[i]['start']+( self._zmap[j]['end'] - self._zmap[i]['start'])]
    452        #mget_word
    453        def _mget_word(self,i,j):
    454            wmap = self._wmap
    455            if(not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WHEAD)): return i
    456            r = i
    457            #观察
    458            #k=i+1
    459            for k in range(i+1,j+1):
    460                #while k<=j:
    461                if(wmap[i][k] and wmap[i][k]['flag'] & PSCWS4_WORD_FULL): r =k
    462                #k+=1
    463            return r
    464        #mset_word
    465        def _mset_word(self,i,j):
    466                wmap = self._wmap
    467                zmap = self._zmap
    468                item = wmap[i][j]
    469                if( (item is False) or (( self._mode & PSCWS4_IGN_SYMBOL)\
    470                    and not (item['flag'] & PSCWS4_ZFLAG_ENGLISH) and item['attr'] == 'un' )\
    471                    ):
    472                    return
    473                #散字自动二元聚合
    474                if(self._mode & PSCWS4_DUALITY):
    475                    k = self._zis
    476                    if(i == j and not(item['flag'] & PSCWS4_ZFLAG_ENGLISH) and item['attr'] == 'un'):
    477                        self._zis = i
    478                        if(k < 0): return
    479                        i = (k & ~PSCWS4_ZIS_USED)
    480                        if( (i != (j-1)) or (not (k & PSCWS4_ZIS_USED) and self._wend == i)):
    481                            self._put_res(zmap[i]['start'],wmap[i][i]['idf'],zmap[i]['end'] - zmap[i]['start'],wmap[i][i]['attr'])
    482                            if( i != (j -1)): return
    483                        self._zis ¦= PSCWS4_ZIS_USED
    484                    else:
    485                        if( (k >=0) and (not (k & PSCWS4_ZIS_USED) or ( j > i))):
    486                            k &= ~PSCWS4_ZIS_USED
    487                            self._put_res(zmap[k]['start'], wmap[k][k]['idf'], zmap[k]['end'] - zmap[k]['start'], wmap[k][k]['attr'])
    488                        if( j > i): self._wend = j + 1
    489                        self._zis = -1
    490                #save the res
    491                self._put_res(zmap[i]['start'], item['idf'], zmap[j]['end'] - zmap[i]['start'], item['attr'])
    492                if( (j -i) > 1):
    493                    m = i
    494                    if ( self._mode & PSCWS4_MULTI_SHORT):
    495                        while (m < j):
    496                            k = m
    497                            n = m+1
    498                            while n<=j:
    499                                if(n ==j and m ==i): break
    500                                item = wmap[m][n]
    501                                if(item and item['flag'] & PSCWS4_WORD_FULL):
    502                                    k = n
    503                                    self._put_res(zmap[m]['start'], item['idf'], zmap[n]['end'] - zmap[m]['start'], item['attr'])
    504                                    if (not (item['flag'] & PSCWS4_WORD_PART)): break
    505                                n +=1
    506                            if (k == m):
    507                                if (m == i): break
    508                                item = wmap[m][m]
    509     
    510                                self._put_res(zmap[m]['start'], item['idf'], zmap[m]['end'] - zmap[m]['start'], item['attr'])
    511                            m = k+1
    512                            if(m == j):
    513                                m -=1
    514                                break
    515                    if( self._mode & PSCWS4_MULTI_DUALITY):
    516                        while m < j:
    517                            self._put_res(zmap[m]['start'], wmap[m][m]['idf'], zmap[m+1]['end'] - zmap[m]['start'], wmap[m][m]['attr'])
    518                            m +=1
    519                if( (j > i) and (self._mode & (PSCWS4_MULTI_ZMAIN¦PSCWS4_MULTI_ZALL))):
    520                    if( (j -i) == 1 and not wmap[i][j]):
    521                        if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_PUT): i +=1
    522                        else: wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_PUT
    523                        wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_PUT
    524                    #这里可能错误
    525                    while i <=j:
    526                        if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_PUT): continue
    527                        ssss = wmap[i][i]['attr'][0:1]
    528                        #print ssss
    529                        if( not (self._mode & PSCWS4_MULTI_ZALL) and not ( ssss[ssss.find('jnv'):])): continue
    530                        self._put_res(zmap[i]['start'], wmap[i][i]['idf'], zmap[i]['end'] - zmap[i]['start'], wmap[i][i]['attr'])
    531                        i +=1
    532        #mseg_zone
    533        def _mseg_zone(self,f,t):
    534            weight = nweight = 0.0
    535            wmap = self._wmap
    536            zmap = self._zmap
    537            mpath = npath = []
    538            x = f
    539            for i in range(f,t+1):
    540                j = self._mget_word(i,t)
    541                if ( j == i or j <=x or ( wmap[i][j]['flag'] & PSCWS4_WORD_USED)): continue
    542                #one word only
    543                if (i ==f and j ==t):
    544                    mpath = [(j-i),0xff]
    545                    break
    546                if( i !=f and (wmap[i][j]['flag'] & PSCWS4_WORD_RULE)): continue
    547                #create the new path
    548                wmap[i][j]['flag'] ¦= PSCWS4_WORD_USED
    549                nweight = wmap[i][j]['tf'] * (j-i+1)
    550     
    551                if (i ==f): nweight *=1.2
    552                elif (j ==t): nweight *=1.4
    553                if(npath == []):
    554                    npath = [0xff for uuu in range(t-f+2)]
    555     
    556                #lookfor backward
    557                x = 0
    558                m = f
    559                while m< i:
    560                    n = self._mget_word(m,i-1)
    561                    nweight *= wmap[m][n]['tf'] * (n-m+1)
    562                    npath[x] = n-m
    563                    x +=1
    564                    if(n>m): wmap[m][n]['flag'] ¦= PSCWS4_WORD_USED
    565                    m = n+1
    566                #my self
    567                npath[x] = j-i
    568                x+=1
    569                #lookfor forward
    570                m = j+1
    571                while m <=t:
    572                    n = self._mget_word(m,t)
    573                    nweight *= wmap[m][n]['tf'] * (n-m+1)
    574                    npath[x] = n-m
    575                    x +=1
    576                    if(n >m): wmap[m][n]['flag'] ¦= PSCWS4_WORD_USED
    577                    m = n+1
    578                npath[x] = 0xff
    579                nweight /= pow(x-1,4)
    580                #draw the path for debug
    581                if(self._mode & PSCWS4_DEBUG):
    582                    print "PATH by keyword = {0} (weight={1}):\n".format(self._get_zs(i,j),nweight)
    583                    m=f
    584                    x=0
    585                    n = npath[x]
    586                    while n !=0xff:
    587                        n +=m
    588                        print self._get_zs(m,n),' '
    589                        m = n+1
    590                        x+=1
    591                        n = npath[x]
    592                    print  "\n--\n"
    593                x = j
    594                #check better path
    595                if(nweight > weight):
    596                    weight = copy.deepcopy(nweight)
    597                    swap = copy.deepcopy(mpath)
    598                    mpath = copy.deepcopy(npath)
    599                    npath = copy.deepcopy(swap)
    600                    del swap
    601            #set the result, mpath != NULL
    602            if(mpath == []): return
    603            m = f
    604            x=0
    605            n = mpath[x]
    606            #print mpath
    607            while n !=0xff:
    608                n +=m
    609                #print m,n
    610                self._mset_word(m,n)
    611                m = n +1
    612                x+=1
    613                n = mpath[x]
    614        #msegment(重点函数)
    615        def _msegment(self,end,zlen):
    616            self._wmap = [[False for ooooo in range(zlen)] for i in range(zlen)]
    617            self._zmap = [False for ooooo in range(zlen)]
    618            wmap = self._wmap
    619            zmap = self._zmap
    620            txt = self._txt
    621            start = self._off
    622            self._zis = -1
    623            #load the zmap
    624            i =0
    625            #load the zmap
    626            while start < end:
    627                ch = txt[start]
    628                cx = ord(ch)
    629                clen = self._ztab[cx]
    630                if(clen == 1):
    631                    while start < end:
    632                        start +=1 #修改
    633                        cx = ord(txt[start])
    634                        if(self._ztab[cx] > 1): break
    635                        clen +=1
    636                    wmap[i][i] = {'tf':0.5, 'idf':0, 'flag':PSCWS4_ZFLAG_ENGLISH, 'attr':'un'}
    637                else:
    638                    query = self._dict_query(txt[start:start+clen])
    639                    if(not query):
    640                        wmap[i][i] = {'tf':0.5, 'idf':0, 'flag':0, 'attr':'un'}
    641                    else:
    642                        if(query['attr'][0:1] == '#'): query['flag'] ¦= PSCWS4_ZFLAG_SYMBOL
    643                        wmap[i][i] = query
    644                    start += clen
    645                zmap[i] = {'start':start-clen, 'end':start}
    646                i+=1
    647     
    648            #fixed real zlength
    649            zlen = i
    650            #create word query table
    651            for i in range(zlen):
    652                k=0
    653                j = i +1
    654                while j<zlen:
    655                    query = self._dict_query(self._get_zs(i,j))
    656                    if (not query):break
    657                    ch = query['flag']
    658                    if(ch & PSCWS4_WORD_FULL):
    659                        wmap[i][j] = query
    660                        wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
    661                        k = i+1
    662                        while k<=j:
    663                            wmap[k][k]['flag'] ¦= PSCWS4_ZFLAG_WPART
    664                            k+=1
    665                    if (not (ch & PSCWS4_WORD_PART)): break
    666                    j +=1
    667                k-=1
    668                if(k and k>=0):
    669                    #set nr2 to some short name
    670                    if(k == (i+1)):
    671                        if(wmap[i][k]['attr'] == 'nr'):
    672                            wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_NR2
    673                    #clean the PART flag for the last word
    674                    if(k < j):
    675                        wmap[i][k]['flag'] ^= PSCWS4_WORD_PART
    676            # try to do the ruleset match
    677            # for name & zone & chinese numeric
    678            if(len(self._rd) > 0):
    679                #check for 'one word'
    680                for i in range(zlen):
    681                    if(self._no_rule1(wmap[i][i]['flag'])): continue
    682                    r1 = self._rule_get(self._get_zs(i))
    683                    if(not r1): continue
    684                    clen = r1['zmin'] if r1['zmin'] >0 else 1
    685                    if(( r1['flag'] & PSCWS4_ZRULE_PREFIX) and (i < (zlen-clen))):
    686                        #先检查 zmin 字内是否全部符合要求, 再在 zmax 范围内取得符合要求的字
    687                        ch =1
    688                        while ch <=clen:
    689                            j = i + ch
    690                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])):break
    691                            if(not self._rule_check(r1,self._get_zs(j))): break
    692                            ch+=1
    693                        if(ch <= clen): continue
    694                        #no limit znum or limit to a range
    695                        j = i +ch
    696                        while 1:
    697                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax']))): break
    698                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
    699                            if( not self._rule_check(r1,self._get_zs(j))): break
    700                            clen +=1
    701                            j +=1
    702                        # 注意原来2字人名,识别后仍为2字的情况
    703                        if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_NR2):
    704                            if(clen ==1): continue
    705                            wmap[i][i+1]['flag'] ¦= PSCWS4_WORD_PART
    706                        #ok, got: i & clen
    707                        k = i + clen
    708                        wmap[i][k] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':(PSCWS4_WORD_RULE¦PSCWS4_WORD_FULL), 'attr':r1['attr']}
    709                        wmap[i][i]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
    710     
    711                        j = i+1
    712                        while j<=k:
    713                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
    714                            j+=1
    715                        if(not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WPART)): i =k
    716                        continue
    717                    if( (r1['flag'] & PSCWS4_ZRULE_SUFFIX) and (i >= clen)):
    718                        #suffix, check before
    719                        ch = 1
    720                        while ch<=clen:
    721                            j = i -ch
    722                            if(j < 0 or self._no_rule1(wmap[j][j]['flag'])): break
    723                            if(not self._rule_check(r1, self._get_zs(j))):break
    724                            ch+=1
    725                        if (ch <= clen): continue
    726                        #no limit znum or limit to a range
    727                        j = i - ch
    728                        while 1:
    729                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax']))): break
    730                            if( j < 0 or self._no_rule2(wmap[j][j]['flag'])): break
    731                            if( not self._rule_check(r1,self._get_zs(j))): break
    732                            clen +=1
    733                            j -=1
    734                        #ok, got: i & clen (maybe clen=1 & [k][i] isset)
    735                        k = i -clen
    736                        if(wmap[k][i] is not False): continue
    737                        wmap[k][i] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
    738                        wmap[k][k]['flag']  ¦= PSCWS4_ZFLAG_WHEAD
    739                        j = k+1
    740                        while j <=i:
    741                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
    742                            if( (j != i) and (wmap[k][i] is not False) ): wmap[k][j]['flag'] ¦= PSCWS4_WORD_PART
    743                            j+=1
    744                        continue
    745                #check for 'two words' (such as: 欧阳** , **西路)
    746                #print wmap[6]
    747                for i in range(zlen-2,-1,-1):
    748                    #with value ==> must be have SCWS_WORD_FULL, so needn't check it ag.
    749                    if( (wmap[i][i+1] is False) or wmap[i][i+1]['flag'] & PSCWS4_WORD_PART): continue
    750                    k = i +1
    751                    #print k
    752                    r1= self._rule_get(self._get_zs(i,k))
    753                    if(not r1): continue
    754                    clen =r1['zmin'] if r1['zmin'] else 1
    755                    if( (r1['flag'] & PSCWS4_ZRULE_PREFIX) and (k < (zlen-clen))):
    756                        ch = 1
    757                        while ch<=clen:
    758                            j = k +ch
    759                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
    760                            if(not self._rule_check(r1,self._get_zs(j))): break
    761                            ch +=1
    762                        if(ch <= clen):continue
    763                        #no limit znum or limit to a range
    764                        j = k+ch
    765                        while 1:
    766                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >=r1['zmax']))): break
    767                            if(j >= zlen or self._no_rule2(wmap[j][j]['flag'])): break
    768                            if(not self._rule_check(r1,self._get_zs(j))): break
    769                            clen +=1
    770                            j +=1
    771                        #ok, got: i & clen
    772                        k = k + clen
    773                        wmap[i][k] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
    774                        wmap[i][i+1]['flag'] ¦= PSCWS4_WORD_PART
    775                        j=i+2
    776                        while j<=k:
    777                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
    778                            j+=1
    779                        i -=1
    780                        continue
    781                    if ( (r1['flag'] & PSCWS4_ZRULE_SUFFIX) and (i >= clen)):
    782                        # suffix, check before
    783                        ch = 1
    784                        while ch<=clen:
    785                            j = i -ch
    786                            if(j < 0 or self._no_rule1(wmap[j][j]['flag'])): break
    787                            if ( not self._rule_check(r1,self._get_zs(j))): break
    788                            ch +=1
    789                        if (ch <= clen): continue
    790                        #no limit znum or limit to a range
    791                        j = i - ch
    792                        while 1:
    793                            if( (not r1['zmax'] and r1['zmin']) or (r1['zmax'] and (clen >= r1['zmax'])) ): break
    794                            if(j < 0 or self._no_rule2(wmap[j][j]['flag'])): break
    795                            if( not self._rule_check(r1,self._get_zs(j))): break
    796                            clen +=1
    797                            j -=1
    798                        #ok, got: i & clen (maybe clen=1 & [k][i] isset)
    799                        k = i - clen
    800                        i = i +1
    801                        wmap[k][i] = {'tf':r1['tf'], 'idf':r1['idf'], 'flag':PSCWS4_WORD_FULL, 'attr':r1['attr']}
    802                        wmap[k][k]['flag'] ¦= PSCWS4_ZFLAG_WHEAD
    803                        j = k+1
    804                        while j<=i:
    805                            wmap[j][j]['flag'] ¦= PSCWS4_ZFLAG_WPART
    806                            if(wmap[k][j] is not False): wmap[k][j]['flag'] ¦= PSCWS4_WORD_PART
    807                            j+=1
    808                        i -= (clen +1)
    809                        continue
    810            # do the segment really
    811            # find the easy break point
    812     
    813            j=0
    814            i=0
    815            for i in range(zlen):
    816                if(wmap[i][i]['flag'] & PSCWS4_ZFLAG_WPART): continue
    817                if(i > j):
    818                    self._mseg_zone(j,i-1)
    819                j = i
    820                if (not (wmap[i][i]['flag'] & PSCWS4_ZFLAG_WHEAD)):
    821                    self._mset_word(i,i)
    822                    j+=1
    823            i+=1
    824            #错在这里
    825            #the lastest zone
    826            if(i > j):
    827                self._mseg_zone(j,i-1)
    828            if( (self._mode & PSCWS4_DUALITY) and (self._zis >=0) and not (self._zis & PSCWS4_ZIS_USED) ):
    829                i = self._zis
    830                self._put_res(zmap[i]['start'],wmap[i][i]['idf'],zmap[i]['end'] - zmap[i]['start'],wmap[i][i]['attr'])
    831     
    832    def test(text):
    833            st = time.time()
    834            text = text
    835            for i in range(100):
    836                cws.send_text(text)
    837                while cws.get_result():
    838                    pass
    839     
    840            ret = cws.get_tops(10,'r,v,p')
    841            print "No.\tWord\t\t\tAttr\tTimes\tRank\n------------------------------------------------------\n"
    842            i = 0
    843            for tmp in ret:
    844                i+=1
    845                print "%02d.\t%-8s\t%s\t%d\t%.2f" %( i, tmp['word'].decode('gbk'),tmp['attr'], tmp['times'], tmp['weight'])
    846     
    847            print u'所花时间:',time.time()-st
    848    if __name__=='__main__':
    849        cws =  PSCWS4('gbk')
    850        cws.set_dict('dict.xdb',True)
    851        cws.set_rule('rules.ini')
    852        cws.send_text("""中国航天官员应邀到美国与太空总署官员开会 发展中国家 上海大学城书店 表面的东西 今天我买了一辆面的,于是我坐着面的去上班 化妆和服装 这个门把手坏了,请把手拿开 将军任命了一名中将,产量三年中将增长两倍 王军虎去广州了,王军虎头虎脑的 欧阳明练功很厉害可是马明练不厉害 北京华烟云 人中出吕布 马中出赤兔Q1,中我要买Q币充值""")
    853        cws.set_igonre(False) #设置忽略符号与无用字符
    854        #cws.set_debug(True) #设置是否显示分词调试信息
    855        cws.set_multi(3) #设置复合分词等级 ($level = 0,15)
    856        cws.set_duality(True) #设置是否自动将散字二元化
    857        #test("中国航天官员应邀到美国与太空总署官员开会 发展中国家 上海大学城书店 表面的东西 今天我买了一辆面的,于是我坐着面的去上班 化妆和服装 这个门把手坏了,请把手拿开 将军任命了一名中将,产量三年中将增长两倍 王军虎去广州了,王军虎头虎脑的 欧阳明练功很厉害可是马明练不厉害 京华烟云 人中出吕布 马中出赤兔Q1,中我要买Q币充值")
    858     
    859    while 1:
    860        tmp = cws.get_result()
    861        if(not tmp):break
    862        line = ''
    863        for w in tmp:
    864            if (w['word'] == "\r"): continue
    865            if (w['word'] == "\n"):
    866                line =  line.rstrip(' ') + "\n"
    867            #else: line .= w['word'] . "/{w['attr']} "
    868            else: line += w['word'] + " "
    869        print line
    870    #t = ','
    871    #print len(t[0:2])
    872    #print ord(t[1])
    873     
    874    #    ret = cws.get_tops(10,'r,v,p')
    875     
    876    #    print "No.\tWord\t\t\tAttr\tTimes\tRank\n------------------------------------------------------\n"
    877    #    i = 0
    878    #    for tmp in ret:
    879    #        i+=1
    880    #       print "%02d.\t%-8s\t%s\t%d\t%.2f" %( i, tmp['word'].decode('gbk'),tmp['attr'], tmp['times'], tmp['weight'])

    xdb_r.py文件源码:
    view source
    001    #coding=gbk
    002    import os
    003    import struct
    004    import sys
    005    reload(sys)
    006    sys.setdefaultencoding('gbk')
    007    XDB_VERSION = 34 # 0x01 ~ 0xff
    008    XDB_TAGNAME = 'XDB' # First bytes
    009    XDB_MAXKLEN = 0xf0 # maxklen: < 255
    010     
    011    class XDB_R(object):
    012        fd = False
    013        hash_base = 0
    014        hash_prime = 0
    015        memread = None #内存
    016        mem = False #是否启用内存
    017        off = 0 #位置
    018        len = 0 #内存长度
    019        def __init__(self,mem=False):
    020            self.mem = mem
    021            pass
    022        def __del__(self):
    023            self.Close()
    024            pass
    025        def Open(self,fpath):
    026            self.Close()
    027            try:
    028                fd = file(fpath,'rb')
    029            except IOError:
    030                raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb failed.')
    031            else:
    032                if(self.mem):
    033                    self.memread = fd.read()
    034                    self.len = len(self.memread)
    035                self.fd = fd
    036            if( self._check_header(fd) is False):
    037                raise Exception('XDB::Open("' + os.path.basename(fpath) + '"),invalid xdb format.')
    038                fd.close()
    039            return True
    040        def _read(self,size):
    041            if(self.mem):
    042                return self.memread[self.off:self.off+size]
    043            else:
    044                return self.fd.read(size)
    045        def _seek(self,seek,flag=False):
    046            if(self.mem):
    047                if self.off > self.len: raise Exception('Mem offset !')
    048                self.off = seek
    049            else:
    050                self.fd.seek(seek,flag)
    051        def _close(self):
    052            if(self.mem):
    053                self.memread = None
    054            else:
    055                self.fd.close()
    056            self.fd = False
    057        def Get(self,key):
    058            if(self.fd is False):
    059                raise Exception('XDB:Get(), null db handler.')
    060            klen = len(key)
    061            #print klen
    062            if(klen ==0 or klen > XDB_MAXKLEN):
    063                return False
    064            rec = self._get_record(key)
    065            if(not rec.has_key('vlen')  or rec['vlen'] ==0):
    066                return False
    067     
    068            return rec['value']
    069        def Close(self):
    070            if(self.fd is False):
    071                return
    072            self._close()
    073        def _get_index(self,key):
    074            l = len(key)
    075            h = self.hash_base
    076            while l:
    077                l-=1
    078                h += (h << 5)
    079                h ^= ord(key[l])
    080                h &= 0x7fffffff
    081            return (h % self.hash_prime)
    082        def _check_header(self,fd):
    083            fd.seek(0,os.SEEK_SET)
    084            buf = fd.read(32)
    085            if(len(buf) != 32): return False
    086            unpack = struct.unpack('3s B I I I f 12s',buf)
    087            if(len(unpack) <=6):
    088                unpack = list(unpack)
    089                unpack.extend(' ')
    090            hdr = {}
    091            hdr['tag'],hdr['ver'],hdr['base'],hdr['prime'],hdr['fsize'],hdr['check'],hdr['reversed'] =unpack[0],unpack[1],unpack[2],unpack[3],unpack[4],unpack[5],unpack[6]
    092            if(hdr['tag'] != XDB_TAGNAME): return False
    093            fstat = os.fstat(fd.fileno())
    094            if(fstat.st_size != hdr['fsize']): return False
    095            self.hash_base = hdr['base']
    096            self.hash_prime = hdr['prime']
    097            self.version = hdr['ver']
    098            self.fsize = hdr['fsize']
    099        def _get_record(self,key):
    100            self._io_times = 1
    101            index = self._get_index(key) if self.hash_prime > 1 else 0
    102            poff = index * 8 + 32
    103            self._seek(poff,os.SEEK_SET)
    104            buf = self._read(8)
    105     
    106            if(len(buf) ==8):
    107                tmp = struct.unpack('I I',buf)
    108                tmp = {'off':tmp[0],'len':tmp[1]}
    109            else:tmp = {'off':0,'len':0}
    110            return self._tree_get_record(tmp['off'],tmp['len'],poff,key)
    111     
    112        def _tree_get_record(self,off,len,poff =0,key =''):
    113            if(len == 0): return {'poff':poff}
    114            self._io_times+=1
    115            self._seek(off,os.SEEK_SET)
    116            rlen = XDB_MAXKLEN + 17
    117     
    118            if(rlen > len): rlen = len
    119            buf = self._read(rlen)
    120            unpack = struct.unpack('I I I I B',buf[0:17])
    121            rec = {}
    122            rec['loff'],rec['llen'],rec['roff'],rec['rlen'],rec['klen'] = unpack[0],unpack[1],unpack[2],unpack[3],unpack[4]
    123     
    124            fkey = buf[17:17+rec['klen']]
    125            cmpl = cmp(key,fkey) if(key) else 0
    126            #print key.decode('gbk'),fkey.decode('gbk')
    127            if(cmpl > 0):
    128                buf =''
    129                return self._tree_get_record(rec['roff'],rec['rlen'],off+8,key)
    130            elif (cmpl < 0):
    131                buf=''
    132                return self._tree_get_record(rec['loff'],rec['llen'],off,key)
    133            else:
    134                rec['poff'] = poff
    135                rec['off'] = off
    136                rec['len'] = len
    137                rec['voff'] = off + 17 + rec['klen']
    138                rec['vlen'] = len - 17 - rec['klen']
    139                rec['key'] = fkey
    140                self._seek(rec['voff'],os.SEEK_SET)
    141                rec['value'] = self._read(rec['vlen'])
    142                return rec
    143    #
    144     
    145    #aa = XDB_R(True)
    146    #aa.Open('./dict.xdb')
    147    #aab = aa.Get('上海')
    148    #print aab


你可能感兴趣的:(python)