pylucene的使用

前一段时间做东西用到了pylucene,包括建立索引,检索,高亮显示等等。贴两段代码,希望对大家有用。

pylucene的安装就不多说了,我用的版本是PyLucene-1.9.1。

建立索引:

#!/usr/bin/env python

import  os
import  PyLucene


class  IndexFiles:
    
"""
    create index by PyLucene, just need your dir path,
    the result files saved in the directory index in
    current path
    
"""

    
def   __init__ (self, root, storeDir, analyzer):
        
if   not  os.path.exists(storeDir):
            os.mkdir(storeDir)
        analyzer 
=  PyLucene.StandardAnalyzer()       
        store 
=  PyLucene.FSDirectory.getDirectory(storeDir , True)
        writer 
=  PyLucene.IndexWriter(store, analyzer, True)
        self.indexDocs(root, writer)
        
print   ' optimizing index ' ,
        writer.optimize()
        writer.close()
        
print   ' done '

    
def  indexDocs(self, root, writer):
        
for  root, dirnames, filenames  in  os.walk(root):
            
for  filename  in  filenames:
                
if   not  filename.endswith( ' .txt ' ):
                    
continue
                
print   " adding " , filename
                
try :
                    path 
=  os.path.join(root, filename)
                    file 
=  open(path)
                    contents 
=  unicode(file.read(),  ' gbk ' )
                    file.close()
                    doc 
=  PyLucene.Document()
                    doc.add(PyLucene.Field.Keyword(u
" name " , filename.decode( ' gbk ' )))
                    doc.add(PyLucene.Field.Text(u
" path " , path.decode( ' gbk ' )))
                    
if  len(contents)  >  0:
                        
pass
                        doc.add(PyLucene.Field.Text(u
" contents " , contents))
                    
else :
                        
print   " warning: no content in %s "   %  filename
                    writer.addDocument(doc)
                
except  Exception, e:
                    
print   " Failed in indexDocs: " , e

def  indexmain(path):
    
try :
        IndexFiles(path, 
" index " , PyLucene.StandardAnalyzer())
        
return   ''
    
except  Exception, e:
        
return  e
    
if   __name__   ==   ' __main__ ' :
    indexpath 
=  raw_input( " path:  " )
    indexmain(indexpath)

检索,高亮显示:

 

#!/usr/bin/env python

import  time
from  StringIO  import  StringIO
from  PyLucene  import   *


class TestFormatter(Formatter):
    """Formatter handed to the Highlighter to mark up matching terms."""

    def __init__(self):
        """Nothing to set up; the base initializer is not invoked."""

    def highlightTerm(self, originalText, group):
        """Return ``originalText``, wrapped when the group has a score.

        Text whose token group has a total score of zero or less is
        returned unchanged; any other text is returned with one string
        prepended and one appended.
        """
        # NOTE(review): both wrapper strings are a single space; it looks
        # like highlight markup may have been lost when the code was
        # pasted - confirm the intended prefix/suffix.
        if group.getTotalScore() <= 0:
            marked = originalText
        else:
            marked = " " + originalText + " "
        return marked

class Search:
    """Query the on-disk index and build highlighted result fragments."""

    def __init__(self):
        """Open the index directory and set the highlighting options."""
        STORE_DIR = " index "
        # False: open the existing index rather than creating a new one.
        self.directory = FSDirectory.getDirectory(STORE_DIR, False)
        # NOTE(review): the indexing script in this file builds the index
        # with StandardAnalyzer - confirm both analyzers tokenize the
        # text compatibly.
        self.analyzer = ChineseAnalyzer()
        self.maxNumFragmentsRequired = 2
        self.fragmentSeparator = u" ... "

    def search(self, query, start, count=10):
        """Run ``query`` and return one page of highlighted hits.

        query -- query text; a byte string is decoded as GBK, a unicode
                 string is used as it is.
        start -- index of the first hit to return.
        count -- maximum number of hits to return (default 10).

        Returns a tuple ``(resultdic, totalnum, usetime)``:
        resultdic maps a hit score to ``[fragments, path]``, totalnum is
        the total number of hits, and usetime is the elapsed time in
        seconds measured from just before the search is executed.
        """
        # Decode only byte strings: decoding an already-decoded unicode
        # query again fails for non-ASCII text in Python 2.
        if isinstance(query, str):
            query = query.decode(' gbk ')
        searcher = IndexSearcher(self.directory)
        try:
            parsed = QueryParser.parse(query, " contents ", self.analyzer)
            starttime = time.time()
            hits = searcher.search(parsed)
            highlighter = Highlighter(TestFormatter(), QueryScorer(parsed))
            highlighter.setTextFragmenter(SimpleFragmenter(60))
            resultdic = {}
            totalnum = hits.length()
            for index in range(start, min(start + count, totalnum)):
                try:
                    doc = hits.doc(index)
                except Exception:
                    # Skip a hit whose document cannot be loaded.
                    continue
                text = doc.get(" contents ")
                tokenStream = self.analyzer.tokenStream(
                    " contents ", StringIO(text))
                result = highlighter.getBestFragments(
                    tokenStream,
                    text,
                    self.maxNumFragmentsRequired,
                    self.fragmentSeparator)
                score = hits.score(index)
                # Scores are the dictionary keys; nudge equal scores until
                # the key is free so that no hit overwrites another.
                while score in resultdic:
                    score += 0.0001
                resultdic[score] = [result, doc.get(" path ")]
            usetime = time.time() - starttime
        finally:
            # Release the searcher even when parsing or searching fails.
            searcher.close()
        return resultdic, totalnum, usetime
        

# Script entry point: read a query from the console and run it.
# The guard must compare against '__main__' without padding, otherwise the
# script body never runs.
if __name__ == '__main__':
    tt = Search()
    # Pass the raw console bytes: Search.search decodes the query itself,
    # and decoding it here as well would decode the text twice.
    command = raw_input(" Query: ")
    tt.search(command, 0)

处理中文时要注意编码:上面的示例把文件内容、文件名、路径和查询串都按 GBK 解码成 unicode 之后再交给 Lucene;如果你的文本是其他编码(例如 UTF-8),需要把代码中的 gbk 换成对应的编码。

 

你可能感兴趣的:(pylucene的使用)