Python 词法分析(分词 + 词性标注)

 # -*- coding: cp936 -*- ###librarys: import sys ###global variables: freqdic={} dic={} transferdic={} inputfilename='' outputfilename='' trainingfilename='' marklist=[] ###classes: class nodeinfotable: def __init__(self,No,length,previousdict): self.No=No self.length=length self.previousdict=previousdict class node: def __init__(self,outpointerdict,tablelist): self.outpointerdict=outpointerdict self.tablelist=tablelist def addOutPointer(self,targetnum,wordname): self.outpointerdict[targetnum]=wordname class wordnode: def __init__(self,wordval,propertynodelist): self.wordval=wordval self.propertynodelist=propertynodelist class propertynode: def __init__(self,propertyval,probability,beststack): self.propertyval=propertyval self.probability=probability self.beststack=beststack ###functions: #find cmpstr in valuestr,which is splited by ',' def findinstr(valuestr,cmpstr): wordlist=valuestr.split(',') for w in wordlist: if cmpstr==w: return True return False # generate the dictionary def chnsegtager_training(): global dic global freqdic global transferdic transfercounter=0 wordcounter=0 orgname = '' f=file(trainingfilename) while True: line = f.readline() if len(line)==0: break line=line.strip() linewordindex=0 prewordproperty='' currwordproperty='' if line.find('/') != -1: lineset = line.split(' ') for w in lineset:#对于一行中每一个字母 wordcounter+=1 singlewordset = w.split('/') formerword = singlewordset[0] laterword = singlewordset[1] if formerword.find('[') != -1: #has '[' token formerword=formerword[1:] orgname=formerword else: if orgname != '': orgname += formerword rbracketpos = laterword.find(']') if rbracketpos != -1: #has ']' token orgproperty=laterword[rbracketpos+1:] laterword=laterword[:rbracketpos] if dic.has_key(orgname): if freqdic.has_key(orgname+','+orgproperty): freqdic[orgname+','+orgproperty]+=1 else: freqdic[orgname+','+orgproperty]=1 orgvalueStr = dic[orgname] if findinstr(orgvalueStr,orgproperty)==False: dic[orgname] += ','+orgproperty orgname = '' else: 
dic[orgname] = orgproperty freqdic[orgname+','+orgproperty]=1 orgname = '' if dic.has_key(formerword):#字典里存在此单词 if freqdic.has_key(formerword+','+laterword): freqdic[formerword+','+laterword]+=1 else: freqdic[formerword+','+laterword]=1 valueStr = dic[formerword] if findinstr(valueStr,laterword)==False: dic[formerword] += ','+laterword else:#字典不存在,创建 dic[formerword] = laterword freqdic[formerword+','+laterword]=1 linewordindex+=1 if linewordindex==1: currwordproperty=laterword else: prewordproperty=currwordproperty currwordproperty=laterword transstr=prewordproperty+':'+currwordproperty if transferdic.has_key(transstr): transferdic[transstr]+=1 transfercounter+=1 else: transferdic[transstr]=1 transfercounter+=1 linewordindex+=1 #统计转移概率 for trstr,trnum in transferdic.items(): pcl=float(trnum) probability=pcl/transfercounter transferdic[trstr]=probability #统计词频 for fx,lx in freqdic.items(): nlx=float(lx) prox=nlx/wordcounter freqdic[fx]=prox f.close() #get the file name from command line def getArguments(): global inputfilename,outputfilename,trainingfilename #sys.argv += ['input.txt','output.txt'] if len(sys.argv) < 3: print 'Error:No file specified.(arg1:name of input file; arg2:name of output file; arg3:training file name' sys.exit() inputfilename = sys.argv[1] outputfilename = sys.argv[2] trainingfilename=sys.argv[3] #get the punctuation marks in the dict def getMarksList(): global dic,freqdic,marklist for f,l in dic.items(): if l=='w': marklist.append(f) #check if the character is a mark def checkMark(unicha): for w in marklist: uniw=w.decode('gbk') if uniw==unicha: return True return False #process single sentance def processSentance(unisent,unisymbol): global inputfilename,outputfilename if len(unisent)==0: gbksy = unisymbol.encode('gbk') if gbksy!='' and gbksy!='/n': savefile=file(outputfilename,'a') savefile.write(gbksy+'//w') savefile.write(' ') savefile.close() return global dic,freqdic nodelist=[] #store the word in the the single phrase index=0 
sentlen=len(unisent) for w in unisent: index+=1 #index is the character behind w #---------add the single word edge---------- gbksingleword=w.encode('gbk') if dic.has_key(gbksingleword): newnode=node({gbksingleword:dic[gbksingleword]},[]) else: newnode=node({gbksingleword:'unknown'},[]) if index==sentlen: #reach the last element nodelist.append(newnode) break #newnode= node({},[])#make a node unistrafterw=unisent[index:] fw=w lw=u'' for bw in unistrafterw: lw+=bw uniword=fw+lw gbkword=uniword.encode('gbk') if dic.has_key(gbkword):#create a chain newnode.outpointerdict[gbkword]=dic[gbkword] nodelist.append(newnode) lastnode=node({},[]) nodelist.append(lastnode) #generate the information table for each node nodeindex=0 #the index of the current node for nodeitem in nodelist: #nodeitem:node if nodeindex==0: nodeindex+=1 continue if nodeindex==1: newtableitem=nodeinfotable(1,1,{0:0}) nodeitem.tablelist.append(newtableitem) nodeindex+=1 continue No=0 #the sequence number of the table item for each node prelist = nodelist[:nodeindex] #nodes before the nodeitem preindex=0 for prenode in prelist:#node in before nodeitem for wordname,wordproperty in prenode.outpointerdict.items(): uniwordname=wordname.decode('gbk') wordlength=len(uniwordname) if nodeindex==(preindex+wordlength):#exist a pointer to current node if preindex==0: No+=1 newtableitem=nodeinfotable(No,1,{0:0}) nodeitem.tablelist.append(newtableitem) else: preNo=0 #sequence of tableitem number in the prenode for pretableitem in prenode.tablelist:#pretableitem:nodeinfotable in the prenode preNo+=1 No+=1 newtableitem=nodeinfotable(No,pretableitem.length+1,{preindex:preNo})#add the item nodeitem.tablelist.append(newtableitem) preindex+=1 nodeindex+=1 n=0 #find all the partition results in nodelist[], and store them into a hashtable pathhashtable={} tailnode=nodelist[-1] tailnodenum=len(nodelist)-1 for tableitem in tailnode.tablelist: #tableitem:single table item of the last node pathlength=tableitem.length #the length 
of the path for ff,fl in tableitem.previousdict.items(): prenodenumber=ff pathstack=[tailnodenum] iteritem=tableitem while prenodenumber!=0: #go until reach the node 0 for f,l in iteritem.previousdict.items(): #exactly, there is noly one item in the dict prenodenumber=f #previous node number prenodeitemnumber=l #the number of sequence item of the previous node pathstack.insert(0,prenodenumber) iteritem=nodelist[prenodenumber].tablelist[prenodeitemnumber-1] for f,l in iteritem.previousdict.items(): #update data prenodenumber=f #previous node number prenodeitemnumber=l #the number of sequence item of the previous node pathstack.insert(0,0) pathhashtable[pathlength]=pathstack xulie=0 for length,path in pathhashtable.items(): if xulie==0: minlen=length shortestpath=path else: if lengthmaxprobability: maxprobability=nowprobability #maxprenode=pnode maxprenode.propertyval=pnode.propertyval maxprenode.probability=pnode.probability maxprenode.beststack=pnode.beststack nowbeststack=[] for se in maxprenode.beststack: nowbeststack.append(se) nowbeststack.append(p) pronode=propertynode(p,maxprobability,nowbeststack) nowwordnode.propertynodelist.append(pronode) else: #字典不存在此词,则将转移概率置为1 #取得前结点的所有词性集,然后取最大概率词性,生成当前结点的最优路径 prewordnode=hmmwordlist[index-1] prepronodelist=prewordnode.propertynodelist maxprobability=0 for pnode in prepronodelist: prepbt=pnode.probability if prepbt>maxprobability: maxprobability=prepbt maxprenode=pnode nowbeststack=[] for se in maxprenode.beststack: nowbeststack.append(se) nowbeststack.append('nz') pronode=propertynode('nz',maxprobability,nowbeststack) nowwordnode.propertynodelist.append(pronode) hmmwordlist.append(nowwordnode) hmmlastword=hmmwordlist[index] maxp=0 for l in hmmlastword.propertynodelist: if maxp

该分析器较为简单,未对复杂的语言现象(如叠词)进行处理。

你可能感兴趣的:(python词法分析(分词+词性标注))