Lucene5.3高级应用

1.Lucene5.3.1高级应用

 来自 caicongyang  http://blog.csdn.net/caicongyang

2.代码

直接上代码,注释清晰!

package com.ccy.lucene.app;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortField.Type;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;

/**
 * 
 * 

* Title: IndexDao.java * Package com.ccy.lucene.app *

*

* Description: Lucene应用 *

* @author Tom.Cai * @created 2015-11-9 下午9:57:00 * @version V1.0 * */ public class IndexDao { // 源文件 String filePath1 = "D:\\newWork\\lucene5.3\\luceneSourceFile\\小笑话_总统的房间 Room .txt"; String filePath2 = "D:\\newWork\\lucene5.3\\luceneSourceFile\\A股.txt"; String filePath3 = "D:\\newWork\\lucene5.3\\luceneSourceFile\\IndexWriter addDocument's a javadoc .txt"; // 索引文件 String indexPath = "D:\\newWork\\lucene5.3\\luceneIndex"; //分词器 Analyzer analyzer = new SmartChineseAnalyzer();// 词库分词 /** * 创建索引 * @throws Exception */ @Test public void createIndex() throws Exception { FSDirectory fsDir = FSDirectory.open(Paths.get(indexPath)); // 1.启动时读取原有磁盘索引文件 Directory ramDir = new RAMDirectory(fsDir, new IOContext()); IndexWriterConfig config = new IndexWriterConfig(analyzer); IndexWriter ramIndexWiter = new IndexWriter(ramDir, config); // 2. 添加 Document Document doc1 = file2Document(filePath1); Document doc2 = file2Document(filePath2); Document doc3 = file2Document(filePath3); //内存索引添加Document ramIndexWiter.addDocument(doc1); ramIndexWiter.addDocument(doc2); ramIndexWiter.addDocument(doc3); ramIndexWiter.close(); // 2.退出时将内存索引保存到磁盘索引中 IndexWriterConfig indexConfig = new IndexWriterConfig(analyzer); IndexWriter fsIndexWriter = new IndexWriter(fsDir, indexConfig); fsIndexWriter.addIndexes(ramDir); fsIndexWriter.close(); } /** * 根据关键字删除索引 * @param term */ public void delete(Term term) { IndexWriterConfig indexConfig = new IndexWriterConfig(analyzer); FSDirectory fsDir = null; IndexWriter indexWriter = null; try { fsDir = FSDirectory.open(Paths.get(indexPath)); indexWriter = new IndexWriter(fsDir, indexConfig); indexWriter.deleteDocuments(term); } catch (Exception e) { throw new RuntimeException(e); } finally { try { indexWriter.close(); } catch (Exception e) { e.printStackTrace(); } } } /** * 根据关键字更新索引 * *

	 * indexWriter.deleteDocuments(term);
	 * indexWriter.addDocument(doc);
	 * 
* * @param term * @param doc */ public void update(Term term, Document doc) { IndexWriterConfig indexConfig = new IndexWriterConfig(analyzer); FSDirectory fsDir = null; IndexWriter indexWriter = null; try { fsDir = FSDirectory.open(Paths.get(indexPath)); indexWriter = new IndexWriter(fsDir, indexConfig); indexWriter.updateDocument(term, doc); } catch (Exception e) { throw new RuntimeException(e); } finally { try { indexWriter.close(); } catch (Exception e) { e.printStackTrace(); } } } /** * 搜索 */ @Test public void search() { IndexReader reader = null; try { reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath))); IndexSearcher searcher = new IndexSearcher(reader); // 排序 Sort sort = new Sort(); sort.setSort(new SortField("name", Type.SCORE)); // 默认为升序 //lucene 5.x不建议使用filter,而使用BooleanQuery来代替 //Filter filter = new TermFilter(new Term("title", "A")); // 2、搜索解析器 QueryParser parser = new QueryParser("content", analyzer); Query query = parser.parse("房间"); //lucene 4.x 此方法已经过时,不建议使用filter,而使用BooleanQuery来代替 //TopDocs topDocs = searcher.search(query, filter, Integer.MAX_VALUE, sort); TopDocs topDocs = searcher.search(query,Integer.MAX_VALUE,sort); int recordCount = topDocs.totalHits; System.out.println("总共有【" + recordCount + "】条匹配结果"); List recordList = new ArrayList(); //准备高亮器 Formatter formatter = new SimpleHTMLFormatter("", ""); Scorer scorer = new QueryScorer(query); Highlighter highlighter = new Highlighter(formatter, scorer); // 摘要 Fragmenter fragmenter = new SimpleFragmenter(50); highlighter.setTextFragmenter(fragmenter); // 3取出当前页的数据 int end = Math.min(10, topDocs.totalHits); //循环读出前10条 for (int i = 0; i < end; i++) { ScoreDoc scoreDoc = topDocs.scoreDocs[i]; int docSn = scoreDoc.doc; // 文档内部编号 Document doc = searcher.doc(docSn); // 根据编号取出相应的文档 // 返回高亮后的结果,如果当前属性值中没有出现关键字,会返回 null String hc = highlighter.getBestFragment(analyzer, "content", doc.get("content")); if (hc == null) { String content = doc.get("content"); int endIndex = Math.min(50, 
content.length()); hc = content.substring(0, endIndex);// 最多前50个字符 } doc.removeField("content"); doc.add(new Field("content", hc, TextField.TYPE_STORED)); recordList.add(doc); } for (Document document : recordList) { System.out.println("------------------------------"); System.out.println("name = " + document.get("name")); System.out.println("content = " + document.get("content")); System.out.println("size = " + document.get("size")); System.out.println("path = " + document.get("path")); } } catch (Exception e) { e.printStackTrace(); } } /** * //1.将文本转化为doc对象 * * @param path * @return */ public Document file2Document(String path) { File file = new File(path); Document doc = new Document(); doc.add(new Field("name", file.getName(), StringField.TYPE_STORED)); doc.add(new Field("content", readFileContent(file), TextField.TYPE_STORED)); doc.add(new LongField("size", file.length(), LongField.TYPE_STORED)); doc.add(new Field("path", file.getAbsolutePath(), StringField.TYPE_STORED)); return doc; } /** * 读取文件内容 */ public static String readFileContent(File file) { try { BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file))); StringBuffer content = new StringBuffer(); for (String line = null; (line = reader.readLine()) != null;) { content.append(line).append("\n"); } return content.toString(); } catch (Exception e) { throw new RuntimeException(e); } } }

3.欢迎指正

我的博客: http://blog.csdn.net/caicongyang




你可能感兴趣的:(搜索/爬虫)