Lucene索引的建立和查找索引

本篇文章使用的是Lucene4.2版本

首先创建索引分析器,建立IndexWriter对象

// Source to index (a single file or a directory tree).
File docDir = new File(filePath);
Directory dir = null;
try {
    dir = FSDirectory.open(new File(indexPath));
} catch (Exception e) {
    // Nothing can be indexed without the index directory; say why before giving up
    // instead of returning silently.
    System.err.println("Cannot open index directory " + indexPath + ": " + e);
    return;
}
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_42, analyzer);
// CREATE_OR_APPEND: reuse an existing index at indexPath, or create one if none exists.
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
IndexWriter writer = new IndexWriter(dir, iwc);
try {
    indexDocs(writer, docDir);
} finally {
    // Closing the writer commits the buffered documents and releases the index
    // write lock, even when indexing fails part-way through.
    writer.close();
}

对象docDir为你要建立索引的目录或文件,参数indexPath为你要保存索引文件的目录

	/**
	 * Recursively indexes {@code file}.
	 *
	 * <p>For a directory, every entry is indexed in turn. For a regular file, a
	 * document with the fields {@code path}, {@code filename}, {@code size},
	 * {@code type}, {@code modify} and {@code contents} is built; the
	 * {@code contents} text is extracted according to the file extension
	 * (doc, docx, wps, xlsx, xls, ppt, pdf, txt, html, xml, java). Files with any
	 * other extension are skipped, as are unreadable files.
	 *
	 * @param writer index writer that receives the documents
	 * @param file   file or directory to index
	 * @throws Exception if text extraction or writing to the index fails
	 */
	private void indexDocs(IndexWriter writer, File file) throws Exception {
		if (file.canRead()) {
			if (file.isDirectory()) {
				String[] files = file.list();
				// File.list() returns null when the directory cannot be listed, so the
				// null check has to guard the listing output as well as the recursion.
				if (files != null) {
					for (int j = 0; j < files.length; j++) {
						System.out.println(j+"文件名为:"+files[j]);
					}
					for (int i = 0; i < files.length; i++) {
						indexDocs(writer, new File(file, files[i]));
					}
				}
			} else {
				FileInputStream fis;
				try {
					fis = new FileInputStream(file);
				} catch (FileNotFoundException fnfe) {
					// Best effort: a file that cannot be opened is skipped.
					return;
				}

				try {
					Document doc = new Document();
					doc.add(new StringField("path", file.getPath(), Field.Store.YES));
					doc.add(new TextField("filename", file.getName(), Field.Store.YES));
					doc.add(new TextField("size", file.length()+"", Field.Store.YES));
					doc.add(new TextField("type", "file", Field.Store.YES));
					doc.add(new LongField("modify", file.lastModified(), Field.Store.YES));

					// The extension (text after the last '.') selects the extractor.
					String fileName = file.getName();
					String extention = "";
					int index = fileName.lastIndexOf('.');
					if (index > -1) {
						extention = fileName.substring(index + 1);
					}

					if (extention.equalsIgnoreCase("doc")) {
						WordExtractor wordExtractor = new WordExtractor(fis);
						String result = wordExtractor.getText();
						doc.add(new TextField("contents", result, Field.Store.YES));
					} else if (extention.equalsIgnoreCase("docx")) {
						XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(new XWPFDocument(fis));
						String result = xwpfWordExtractor.getText();
						doc.add(new TextField("contents", result, Field.Store.YES));
					} else if (extention.equalsIgnoreCase("wps")) {
						HWPFDocument hwpfDocument = new HWPFDocument(fis);
						Range range = hwpfDocument.getRange();
						String result = range.text();
						doc.add(new TextField("contents", result, Field.Store.YES));
					} else if (extention.equalsIgnoreCase("xlsx")) {
						XSSFWorkbook wb = new XSSFWorkbook(fis);
						StringBuilder sb = new StringBuilder();
						for (int sheetNum = 0; sheetNum < wb.getNumberOfSheets(); sheetNum++) {
							XSSFSheet sheet = wb.getSheetAt(sheetNum);
							if (sheet == null) {
								continue;
							}
							// getLastRowNum() is the zero-based index of the last row, so the
							// bound must be inclusive or the final row is never indexed.
							for (int sheetRow = 0; sheetRow <= sheet.getLastRowNum(); sheetRow++) {
								XSSFRow row = sheet.getRow(sheetRow);
								if (row == null) {
									continue;
								}
								for (int sheetCol = 0; sheetCol < row.getLastCellNum(); sheetCol++) {
									XSSFCell aCell = row.getCell(sheetCol);
									if (aCell != null) {
										// Cells are tab-separated; other cell types are left out.
										if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
											sb.append(aCell.getNumericCellValue() + "\t");
										} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
											sb.append(aCell.getBooleanCellValue() + "\t");
										} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
											sb.append(aCell.getStringCellValue() + "\t");
										} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_FORMULA) {
											sb.append(aCell.getCellFormula() + "\t");
										}
									}
									// End each spreadsheet row with a newline.
									if (sheetCol == row.getLastCellNum() - 1) {
										sb.append("\n");
									}
								}
							}
						}
						doc.add(new TextField("contents", sb.toString(), Field.Store.YES));
					} else if (extention.equalsIgnoreCase("xls")) {
						POIFSFileSystem poifsFileSystem = new POIFSFileSystem(fis);
						StringBuilder sb = new StringBuilder();
						HSSFWorkbook wb = new HSSFWorkbook(poifsFileSystem);
						for (int sheetNum = 0; sheetNum < wb.getNumberOfSheets(); sheetNum++) {
							HSSFSheet sheet = wb.getSheetAt(sheetNum);
							if (sheet == null) {
								continue;
							}
							// Inclusive bound: getLastRowNum() is the index of the last row.
							for (int sheetRow = 0; sheetRow <= sheet.getLastRowNum(); sheetRow++) {
								HSSFRow row = sheet.getRow(sheetRow);
								if (row == null) {
									continue;
								}
								for (int sheetCol = 0; sheetCol < row.getLastCellNum(); sheetCol++) {
									HSSFCell aCell = row.getCell(sheetCol);
									if (aCell != null) {
										if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
											sb.append(aCell.getNumericCellValue() + "\t");
										} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) {
											sb.append(aCell.getBooleanCellValue() + "\t");
										} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) {
											sb.append(aCell.getStringCellValue() + "\t");
										} else if (aCell.getCellType() == HSSFCell.CELL_TYPE_FORMULA) {
											sb.append(aCell.getCellFormula() + "\t");
										}
									}
									if (sheetCol == row.getLastCellNum() - 1) {
										sb.append("\n");
									}
								}
							}
						}
						doc.add(new TextField("contents", sb.toString(), Field.Store.YES));
					} else if (extention.equalsIgnoreCase("ppt")) {
						StringBuilder sb = new StringBuilder();
						SlideShow ss = new SlideShow(new HSLFSlideShow(fis));
						Slide[] s = ss.getSlides();
						for (int i = 0; i < s.length; i++) {
							// Appending a null title would put the literal text "null" into
							// the index, so a slide without a title contributes no title text.
							String title = s[i].getTitle();
							if (title != null) {
								sb.append(title);
							}
							TextRun[] t = s[i].getTextRuns();
							for (int j = 0; j < t.length; j++) {
								sb.append(t[j].getText() + "\t");
							}
							sb.append("\n");
						}
						doc.add(new TextField("contents", sb.toString(), Field.Store.YES));
					} else if (extention.equalsIgnoreCase("pdf")) {
						PDFParser parser = new PDFParser(fis);
						parser.parse();
						PDDocument pdDocument = parser.getPDDocument();
						String result;
						try {
							PDFTextStripper stripper = new PDFTextStripper();
							result = stripper.getText(pdDocument);
						} finally {
							// The parsed document holds its own resources and must be
							// closed separately from the input stream.
							pdDocument.close();
						}
						doc.add(new TextField("contents", result, Field.Store.YES));
					} else if (extention.equalsIgnoreCase("txt") || extention.equalsIgnoreCase("html")
							|| extention.equalsIgnoreCase("xml") || extention.equalsIgnoreCase("java")) {
						// Plain-text formats are decoded as GBK and read line by line.
						StringBuilder text = new StringBuilder();
						BufferedReader br = new BufferedReader(new InputStreamReader(fis, "GBK"));
						String data = null;
						while ((data = br.readLine()) != null) {
							text.append(data + "\n");
						}
						doc.add(new TextField("contents", text.toString(), Field.Store.YES));
					} else {
						// Unsupported extension: nothing is indexed for this file.
						return;
					}

					if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
						System.out.println("adding " + file);
						writer.addDocument(doc);
					} else {
						// Replace any document previously indexed under the same path.
						System.out.println("updating " + file);
						writer.updateDocument(new Term("path", file.getPath()), doc);
					}

				} finally {
					fis.close();
				}
			}
		}
	}

方法indexDocs(IndexWriter writer, File file)递归地为目录下的文件建立索引。从代码中可以看出,我们首先要创建Document对象,并通过Field对象将文件的属性(文件名、文件内容、文件路径等)添加到文档对象Document中,最后调用IndexWriter对象的方法,将文档添加到索引中去。


搜索文档,我们就需要使用IndexReader对象将索引文件读取出来,并使用Query对象来进行索引的检索。


// Fields the parser searches when the query does not name a field explicitly.
String[] fields = {"filename", "contents"};
// Open the index stored under indexPath and wrap it in a searcher.
FSDirectory indexDirectory = FSDirectory.open(new File(indexPath));
IndexReader reader = DirectoryReader.open(indexDirectory);
IndexSearcher searcher = new IndexSearcher(reader);
// A StandardAnalyzer is used to analyze the query text.
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
MultiFieldQueryParser multiFieldQueryParser =
        new MultiFieldQueryParser(Version.LUCENE_42, fields, analyzer);

其中fields表示你要查找的域,即你要在哪些字段中进行关键字查找。这里使用MultiFieldQueryParser对象来进行多个字段的解析。


// Parse the search expression into a query over the configured fields.
Query query = multiFieldQueryParser.parse(condition);
// Uncomment to inspect the parsed query while debugging:
//System.out.println("解析后的查询条件:"+query.toString());
//System.out.println();
// Request at most ten hits from the searcher.
final int maxHits = 10;
TopDocs results = searcher.search(query, maxHits);
ScoreDoc[] hits = results.scoreDocs;

创建Query查询对象,并调用IndexSearcher对象来进行查找。TopDocs表示相关度最高的文档集

// Load the document behind each hit.
for (int i = 0; i < hits.length; i++) {
    Document doc = searcher.doc(hits[i].doc);
    // The indexed data can be read back with doc.get("<field name>").
}
对于检索出的结果,Lucene的API完美地实现了关键字高亮显示等功能。



你可能感兴趣的:(Lucene,全文检索)