本篇文章使用的是Lucene4.2版本
首先创建索引分析器,建立IndexWriter对象
// Directory (or single file) whose contents will be indexed.
File docDir = new File(filePath);

// Open the directory that holds the index files; nothing can be indexed without it.
Directory dir = null;
try {
    dir = FSDirectory.open(new File(indexPath));
} catch (Exception e) {
    // Report the failure instead of returning silently, then give up as before.
    System.err.println("Cannot open index directory " + indexPath + ": " + e);
    return;
}

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_42, analyzer);
// Set the index open mode: append to an existing index, or create one if none exists.
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
IndexWriter writer = new IndexWriter(dir, iwc);
try {
    indexDocs(writer, docDir);
} finally {
    // Closing the writer commits the buffered documents and releases the index write lock.
    writer.close();
}
对象docDir为你要建立索引的目录或文件,参数indexPath为你要保存索引文件的目录
private void indexDocs(IndexWriter writer, File file) throws Exception { if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); for(int j=0;j<files.length;j++){ System.out.println(j+"文件名为:"+files[j]); } if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { return; } try { Document doc = new Document(); doc.add(new StringField("path", file.getPath(), Field.Store.YES)); doc.add(new TextField("filename", file.getName(), Field.Store.YES)); doc.add(new TextField("size", file.length()+"", Field.Store.YES)); doc.add(new TextField("type", "file", Field.Store.YES)); doc.add(new LongField("modify", file.lastModified(), Field.Store.YES)); String fileName = file.getName(); String extention = ""; int index = fileName.lastIndexOf('.'); if(index > -1 && index <fileName.length()){ extention = fileName.substring(index+1); } if(extention.equalsIgnoreCase("doc")){ WordExtractor wordExtractor = new WordExtractor(fis); String result = wordExtractor.getText(); //BufferedReader br = new BufferedReader(new StringReader(result)); doc.add(new TextField("contents", result,Field.Store.YES)); }else if(extention.equalsIgnoreCase("docx")){ XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(new XWPFDocument(fis)); String result = xwpfWordExtractor.getText(); //BufferedReader br = new BufferedReader(new StringReader(result)); doc.add(new TextField("contents",result,Field.Store.YES)); }else if(extention.equalsIgnoreCase("wps")){ HWPFDocument hwpfDocument = new HWPFDocument(fis); Range range = hwpfDocument.getRange(); String result = range.text(); doc.add(new TextField("contents",result,Field.Store.YES)); }else if(extention.equalsIgnoreCase("xlsx")){ XSSFWorkbook wb = new XSSFWorkbook(fis); StringBuffer sb = new StringBuffer(); for(int sheetNum = 0;sheetNum < wb.getNumberOfSheets() ;sheetNum++){ 
if(wb.getSheetAt(sheetNum)!=null){ XSSFSheet sheet = wb.getSheetAt(sheetNum); for(int sheetRow =0;sheetRow<sheet.getLastRowNum();sheetRow++){ if(sheet.getRow(sheetRow)!=null){ XSSFRow row = sheet.getRow(sheetRow); for(int sheetCol =0;sheetCol<row.getLastCellNum();sheetCol++){ if(row.getCell(sheetCol)!=null){ XSSFCell aCell = row.getCell(sheetCol); if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) { sb.append(aCell.getNumericCellValue() + "\t"); } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) { sb.append(aCell.getBooleanCellValue() + "\t"); } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) { sb.append(aCell.getStringCellValue() + "\t"); } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){ sb.append(aCell.getCellFormula() + "\t"); } } if(sheetCol==row.getLastCellNum()-1){ sb.append("\n"); } } } } } } //BufferedReader br = new BufferedReader(new StringReader(sb.toString())); doc.add(new TextField("contents", sb.toString(),Field.Store.YES)); }else if(extention.equalsIgnoreCase("xls")){ POIFSFileSystem poifsFileSystem = new POIFSFileSystem(fis); StringBuffer sb = new StringBuffer(); HSSFWorkbook wb = new HSSFWorkbook(poifsFileSystem); for(int sheetNum = 0;sheetNum < wb.getNumberOfSheets() ;sheetNum++){ if(wb.getSheetAt(sheetNum)!=null){ HSSFSheet sheet = wb.getSheetAt(sheetNum); for(int sheetRow =0;sheetRow<sheet.getLastRowNum();sheetRow++){ if(sheet.getRow(sheetRow)!=null){ HSSFRow row = sheet.getRow(sheetRow); for(int sheetCol =0;sheetCol<row.getLastCellNum();sheetCol++){ if(row.getCell(sheetCol)!=null){ HSSFCell aCell = row.getCell(sheetCol); if (aCell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) { sb.append(aCell.getNumericCellValue() + "\t"); } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_BOOLEAN) { sb.append(aCell.getBooleanCellValue() + "\t"); } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_STRING) { sb.append(aCell.getStringCellValue() + "\t"); } else if (aCell.getCellType() == HSSFCell.CELL_TYPE_FORMULA){ 
sb.append(aCell.getCellFormula() + "\t"); } } if(sheetCol==row.getLastCellNum()-1){ sb.append("\n"); } } } } } } //BufferedReader br = new BufferedReader(new StringReader(sb.toString())); doc.add(new TextField("contents", sb.toString(),Field.Store.YES)); }else if(extention.equalsIgnoreCase("ppt")){ StringBuffer sb = new StringBuffer(); SlideShow ss = new SlideShow(new HSLFSlideShow(fis)); Slide[] s = ss.getSlides(); for(int i=0;i<s.length;i++){ sb.append(s[i].getTitle()); TextRun[] t = s[i].getTextRuns(); for(int j=0;j<t.length;j++){ sb.append(t[j].getText()+"\t"); } sb.append("\n"); } //BufferedReader br = new BufferedReader(new StringReader(sb.toString())); doc.add(new TextField("contents", sb.toString(),Field.Store.YES)); }else if(extention.equalsIgnoreCase("pdf")){ PDFParser parser = new PDFParser(fis); parser.parse(); PDDocument pdDocument = parser.getPDDocument(); PDFTextStripper stripper = new PDFTextStripper(); String result = stripper.getText(pdDocument); //BufferedReader br = new BufferedReader(new StringReader(result)); doc.add(new TextField("contents", result,Field.Store.YES)); }else if(extention.equalsIgnoreCase("txt")||extention.equalsIgnoreCase("html")||extention.equalsIgnoreCase("xml")||extention.equalsIgnoreCase("java")){ StringBuffer stringBuffer = new StringBuffer(); BufferedReader br = new BufferedReader(new InputStreamReader(fis, "GBK")); String data = null; while((data=br.readLine())!=null){ stringBuffer.append(data+"\n"); } doc.add(new TextField("contents", stringBuffer.toString(),Field.Store.YES)); }else{ return; } if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding " + file); writer.addDocument(doc); } else { System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
方法indexDocs(IndexWriter writer, File file)为递归对目录下的文件进行索引的建立。从代码中可以看出,我们首先要创建Document对象,并通过Field对象将文件的属性(文件名、文件内容、文件路径等)添加到文档对象Document中。最后调用IndexWriter对象中的方法,将文档添加到索引中去。
搜索文档时,我们需要使用IndexReader对象将索引文件读取出来,并使用Query对象来进行索引的检索。
// Fields the user's query is matched against.
String[] fields = {"filename", "contents"};

// Open the index stored under indexPath and wrap the reader in a searcher.
File indexLocation = new File(indexPath);
IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation));
IndexSearcher searcher = new IndexSearcher(reader);

// Query text is analyzed with a StandardAnalyzer and parsed against every listed field.
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
MultiFieldQueryParser multiFieldQueryParser =
        new MultiFieldQueryParser(Version.LUCENE_42, fields, analyzer);
其中fields表示你要查找的域,即你要在哪个字段中进行关键字查找。这里使用MultiFieldQueryParser对象来进行多个字段的解析。
// Parse the user-supplied condition into a Lucene query.
Query query = multiFieldQueryParser.parse(condition);
// Ask the searcher for at most the 10 best-scoring matches.
TopDocs results = searcher.search(query, 10);
ScoreDoc[] hits = results.scoreDocs;
创建Query查询对象,并调用IndexSearcher对象来进行查找。TopDocs表示相关度最高的文档集
for (ScoreDoc hit : hits) {
    // Load the matching document by its internal id.
    Document doc = searcher.doc(hit.doc);
    // Indexed data can be read back with doc.get("fieldName").
}
对于检索出的结果,Lucene的API完美的实现了关键字高亮显示等功能。