最近一直在研究 Lucene,把入门的程序和大家分享:
对索引的操作类:
public class IndexDao { public IndexDao() { try { indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH, Constants.analyzer, MaxFieldLength.LIMITED); } catch (Exception e) { e.printStackTrace(); } } public IndexDao(Directory dir) { try { indexWriter = new IndexWriter(dir,Constants.analyzer,MaxFieldLength.LIMITED); } catch (Exception e) { e.printStackTrace(); } } public IndexDao(boolean isCreate) { try { indexWriter = new IndexWriter(Constants.INDEX_STORE_PATH,Constants.analyzer, isCreate,MaxFieldLength.LIMITED); } catch (Exception e) { e.printStackTrace(); } } // 索引器 private IndexWriter indexWriter = null; /** * 添加/创建索引 * * @param folder * @throws IOException * @throws CorruptIndexException */ public void saveIndex(File folder, String[] unIndeies) throws CorruptIndexException, IOException { if (folder.isDirectory()) { String[] files = folder.list(); for (int i = 0; i < files.length; i++) { File f = new File(folder, files[i]); if (!f.isHidden()) { if (f.isDirectory()) { saveIndex(f, unIndeies);// ② 递归 } String fileTyep = ReadFile.validateFile(f); for (int j = 0; j < unIndeies.length; j++) { if (fileTyep.equalsIgnoreCase(unIndeies[j])) { System.out.println("正在建立索引 : " + f.getName() + ""); Document doc = ReadFile.indexFile(f); indexWriter.addDocument(doc); } } } } } } /** * Term是搜索的最小单位,代表某个 Field 中的一个关键词,如:<title, lucene> new Term( "title", * "lucene" ); new Term( "id", "5" ); new Term( "id", UUID ); * * @param term */ public void deleteIndex(Term term) { try { indexWriter.deleteDocuments(term); } catch (Exception e) { throw new RuntimeException(e); } finally { try { indexWriter.close(); } catch (Exception e) { e.printStackTrace(); } } } /** * 更新索引 indexWriter.deleteDocuments(term); indexWriter.addDocument(doc); * * @param term * @param doc */ public void updateIndex(Term term, Document doc) { try { indexWriter.updateDocument(term, doc); } catch (Exception e) { throw new RuntimeException(e); } finally { try { indexWriter.close(); } catch (Exception e) { 
e.printStackTrace(); } } } /** * 查询 totalPage = recordCount / pageSize; if (recordCount % pageSize > 0) * totalPage++; * * @param queryString * @param firstResult * @param maxResults * @return */ public QueryResult search(String queryString, int firstResult, int maxResults) { try { // 1,把要搜索的文本解析为 Query String[] fields = { "name", "content" }; Map<String, Float> boosts = new HashMap<String, Float>(); boosts.put("name", 2f); boosts.put("content", 3f); //默认为1.0f QueryParser queryParser = new MultiFieldQueryParser(fields, Constants.analyzer, boosts); Query query = queryParser.parse(queryString); // Query query = IKQueryParser.parse("content", queryString); Date start = new Date(); QueryResult result = search(query, firstResult, maxResults); Date end = new Date(); System.out.println("检索完成,用时" + (end.getTime() - start.getTime()) + "毫秒"); return result; } catch (Exception e) { throw new RuntimeException(e); } } public QueryResult search(Query query, int firstResult, int maxResults) { IndexSearcher indexSearcher = null; try { // 2,进行查询 indexSearcher = new IndexSearcher(Constants.INDEX_STORE_PATH); Filter filter = new RangeFilter("size", NumberTools.longToString(0), NumberTools .longToString(1000000), true, true); // 排序 Sort sort = new Sort(); sort.setSort(new SortField("size")); // 默认为升序 // sort.setSort(new SortField("size", true)); TopDocs topDocs = indexSearcher.search(query, filter, 10000, sort); int recordCount = topDocs.totalHits; List<Document> recordList = new ArrayList<Document>(); // 准备高亮器 Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>"); Scorer scorer = new QueryScorer(query); Highlighter highlighter = new Highlighter(formatter, scorer); Fragmenter fragmenter = new SimpleFragmenter(50); highlighter.setTextFragmenter(fragmenter); // 3,取出当前页的数据 int end = Math.min(firstResult + maxResults, topDocs.totalHits); for (int i = firstResult; i < end; i++) { ScoreDoc scoreDoc = topDocs.scoreDocs[i]; int docSn = scoreDoc.doc; // 文档内部编号 Document 
doc = indexSearcher.doc(docSn); // 根据编号取出相应的文档 // 高亮 返回高亮后的结果,如果当前属性值中没有出现关键字,会返回 null String hc = highlighter.getBestFragment(Constants.analyzer, "content", doc.get("content")); if (hc == null) { String content = doc.get("content"); int endIndex = Math.min(50, content.length()); hc = content.substring(0, endIndex);// 最多前50个字符 } doc.getField("content").setValue(hc); recordList.add(doc); } // 返回结果 return new QueryResult(recordCount, recordList); } catch (Exception e) { throw new RuntimeException(e); } finally { try { indexSearcher.close(); } catch (IOException e) { e.printStackTrace(); } } } public void close() { // 对索引进行优化 try { indexWriter.optimize(); indexWriter.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public void readIndex(String key, String value) { IndexReader reader; try { // Directory fsDir = FSDirectory.getDirectory( // Constants.INDEX_STORE_PATH, false); // if (IndexReader.isLocked(fsDir)) { // System.out.println("------unlock-----"); // IndexReader.unlock(fsDir); // } reader = IndexReader.open(Constants.INDEX_STORE_PATH); for (int i = 0; i < reader.numDocs(); i++) // System.out.println(reader.document(i)); System.out.println("版本:" + reader.getVersion()); System.out.println("索引内的文档数量:" + reader.numDocs()); Term term = new Term(key, value); TermDocs docs = reader.termDocs(term); IndexSearcher indexSearcher = null; indexSearcher = new IndexSearcher(Constants.INDEX_STORE_PATH); while (docs.next()) { int docSn = docs.doc(); // 文档内部编号 Document doc = indexSearcher.doc(docSn); // 根据编号取出相应的文档 System.out.println("文档路径 " + doc.get("path")); System.out.println("含有所查找的 " + term + "的Document的编号为: "+ docs.doc()); System.out.println("Term在文档中的出现 " + docs.freq()+" 次"); } } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }
读取文件工具类:
public class ReadFile { public static String readWord(File f) { StringBuffer content = new StringBuffer("");// 文档内容 try { HWPFDocument doc = new HWPFDocument(new FileInputStream(f)); Range range = doc.getRange(); int paragraphCount = range.numParagraphs();// 段落 for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据 Paragraph pp = range.getParagraph(i); content.append(pp.text()); } // System.out.println("-------word--------"+content.toString()); } catch (Exception e) { System.out.println("建立索引出错 : " + f.getAbsolutePath() + ""); e.printStackTrace(); } return content.toString().trim(); } public static String readPdf(File f){ StringBuffer content = new StringBuffer("");// 文档内容 PDDocument pdfDocument = null; try { if(f.length()>10048576){ DecimalFormat df = new DecimalFormat("#.00"); System.out.println("---------------------文件大小------"+df.format((double) f.length() / 1048576) + "M"); return f.getName(); } FileInputStream fis = new FileInputStream(f); PDFTextStripper stripper = new PDFTextStripper(); pdfDocument = PDDocument.load(fis); if(pdfDocument.isEncrypted()){ return f.getName(); } StringWriter writer = new StringWriter(); stripper.writeText(pdfDocument, writer); content.append(writer.getBuffer().toString()); fis.close(); } catch (IOException e) { System.out.println("建立索引出错 : " + f.getAbsolutePath() + ""); System.err.println("IOException=" + e); //System.exit(1); } finally { if (pdfDocument != null) { // System.err.println("Closing document " + f + "..."); org.pdfbox.cos.COSDocument cos = pdfDocument.getDocument(); try { cos.close(); // System.err.println("Closed " + cos); pdfDocument.close(); } catch (IOException e) { System.out.println("建立索引出错 : " + f.getAbsolutePath() + ""); e.printStackTrace(); } } } // System.out.println("-------pdf--------"+content.toString()); return content.toString().trim(); } public static String readHtml(File f) { StringBuffer content = new StringBuffer(""); FileInputStream fis = null; try { fis = new FileInputStream(f); // 读取页面 
这里的字符编码要注意,要对上html头文件的一致,否则会出乱码 BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "gb2312")); String line = null; while ((line = reader.readLine()) != null) { content.append(line + "\n"); } reader.close(); } catch (Exception e) { System.out.println("建立索引出错 : " + f.getAbsolutePath() + ""); e.printStackTrace(); } String contentString = content.toString(); // System.out.println("---------htm索引----"+contentString); return contentString; } public static String readTxt(File f) { StringBuffer content = new StringBuffer(""); try { BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(f))); for (String line = null; (line = reader.readLine()) != null;) { content.append(line).append("\n"); } } catch (IOException e) { System.out.println("建立索引出错 : " + f.getAbsolutePath() + ""); e.printStackTrace(); } return content.toString().trim(); } public static String readExcel(File f,String fileType){ StringBuffer content = new StringBuffer(""); try{ ExcelReader er=new ExcelReader(f,fileType); String line=er.readLine(); content.append(line).append("\n"); while(line!=null){ line=er.readLine(); content.append(line).append("\n"); } er.close(); }catch(Exception e){ System.out.println("建立索引出错 : " + f.getAbsolutePath() + ""); e.printStackTrace(); } return content.toString(); } public static String validateFile(File f) { String fileType = "otherType"; String fileName = f.getName(); if (fileName.lastIndexOf('.') == -1) { fileType = "dir"; return fileType; } fileName = fileName.substring(fileName.lastIndexOf('.') + 1, fileName .length()); int i = 0; String [] extension=Constants.EXTENSION; for (i = 0; i < extension.length; i++) { if (fileName.equalsIgnoreCase(extension[i])) { fileType = extension[i]; break; } } return fileType; } public static Document indexFile(File f) { Document doc = new Document(); try { doc.add(new Field("name", f.getName(), Store.YES, Index.ANALYZED)); doc.add(new Field("size", NumberTools.longToString(f.length()), 
Store.YES, Index.NOT_ANALYZED)); doc.add(new Field("path", f.getAbsolutePath(), Store.YES, Index.NOT_ANALYZED)); String fileType = validateFile(f); if (fileType.equals("txt")) { doc.add(new Field("content", ReadFile.readTxt(f), Store.YES, Index.ANALYZED)); } else if (fileType.equals("pdf")) { doc.add(new Field("content", ReadFile.readPdf(f), Store.YES, Index.ANALYZED)); } else if (fileType.equals("doc")) { doc.add(new Field("content", ReadFile.readWord(f), Store.YES, Index.ANALYZED)); } else if (fileType.equals("htm")) { doc.add(new Field("content", ReadFile.readHtml(f), Store.YES, Index.ANALYZED)); } else if(fileType.equals("xls")){ doc.add(new Field("content", ReadFile.readExcel(f, fileType), Store.YES, Index.ANALYZED)); }else { doc.add(new Field("content", f.getName(), Store.YES, Index.ANALYZED)); } } catch (Exception e) { System.out.println("建立索引出错 : " + f.getAbsolutePath() + ""); e.printStackTrace(); } return doc; } }
评论