1.Maven Dependency
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates -->
    <groupId>Lucene3.6</groupId>
    <artifactId>Lucene3.6</artifactId>
    <version>1</version>

    <!-- Dependency versions are declared once here and referenced below -->
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <lucene-version>3.6.2</lucene-version>
        <junit-version>4.11</junit-version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene-version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit-version}</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>
    </dependencies>

    <build>
        <!-- Sources live directly under "src" rather than the Maven default src/main/java -->
        <sourceDirectory>src</sourceDirectory>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
2.全文搜索
分为三部分:
索引
1、创建Directory
2、创建IndexWriter
3、创建Document对象
4、为Document对象添加Field
5、通过IndexWriter添加文档到索引中
分词
搜索
1、创建Directory
2、创建IndexReader
3、根据IndexReader创建IndexSearcher
4、创建搜索的Query
5、根据searcher搜索并且返回TopDocs
6、根据TopDocs获取ScoreDocs对象
7、根据searcher和ScoreDocs对象获取具体的Document对象
8、根据Document对象获取需要的值
3.src
HelloLucene.java
package org.fool.lucene; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; public class HelloLucene { /** * 建立索引 */ public void index() { Directory directory = null; IndexWriter writer = null; try { // 1、创建Directory directory = FSDirectory.open(new File("C:/index")); // 创建在硬盘上 // 2、创建IndexWriter IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36)); writer = new IndexWriter(directory, iwc); // 3、创建Document对象 Document doc = null; // 4、为Document对象添加Field File f = new File("C:/lucene"); for (File file : f.listFiles()) { doc = new Document(); // String content = FileUtils.readFileToString(file); // System.out.println(content); // doc.add(new Field("content", content, Field.Store.YES, // Field.Index.ANALYZED_NO_NORMS)); doc.add(new Field("content", new FileReader(file))); doc.add(new Field("fileName", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("filePath", file.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // 5、通过IndexWriter添加文档到索引中 writer.addDocument(doc); } } catch (CorruptIndexException | LockObtainFailedException | 
FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (writer != null) { try { writer.close(); } catch (CorruptIndexException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } } } /** * 搜索 */ public void search() { Directory directory = null; IndexReader reader = null; IndexSearcher searcher = null; try { // 1、创建Directory directory = FSDirectory.open(new File("C:/index")); // 创建在硬盘上 // 2、创建IndexReader reader = IndexReader.open(directory); // 3、根据IndexReader创建IndexSearcher searcher = new IndexSearcher(reader); // 4、创建搜索的Query QueryParser parser = new QueryParser(Version.LUCENE_36, "content", new StandardAnalyzer(Version.LUCENE_36)); Query query = parser.parse("World"); // 5、根据searcher搜索并且返回TopDocs TopDocs tds = searcher.search(query, 10); // 6、根据TopDocs获取ScoreDocs对象 ScoreDoc[] sds = tds.scoreDocs; for (ScoreDoc sd : sds) { // 7、根据searcher和ScoreDocs对象获取具体的Document对象 Document document = searcher.doc(sd.doc); // 8、根据Document对象获取需要的值 System.out.println(document.get("fileName") + "(" + document.get("filePath") + ")"); } } catch (CorruptIndexException | ParseException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { // 9、关闭reader try { searcher.close(); reader.close(); } catch (IOException e) { e.printStackTrace(); } } } }
4.test
TestHelloLucene.java
package org.fool.lucene;

import org.junit.Before;
import org.junit.Test;

/**
 * JUnit 4 driver for {@link HelloLucene}: each test simply invokes one of its
 * operations and makes no assertions on the outcome.
 */
public class TestHelloLucene {

    /** Instance under test; re-created before every test method. */
    private HelloLucene lucene;

    /** Creates a fresh {@link HelloLucene} for the upcoming test. */
    @Before
    public void setUp() throws Exception {
        lucene = new HelloLucene();
    }

    /** Runs the indexing step. */
    @Test
    public void testIndex() {
        lucene.index();
    }

    /** Runs the search step. */
    @Test
    public void testSearch() {
        lucene.search();
    }
}
5.Details
Field.Store.YES/NO(存储域选项)
设置为YES表示会把这个域中的内容完全存储到文件中,方便进行文本的还原
设置为NO表示把这个域的内容不存储到文件中,但是可以被索引,此时内容无法完全还原(doc.get)
Field.Index.*(索引域选项)
Index.ANALYZED:进行分词和索引,适用于标题、内容等
Index.NOT_ANALYZED:进行索引,但是不进行分词,如身份证号、姓名、ID等,适用于精确搜索
Index.ANALYZED_NO_NORMS:进行分词但是不存储norms信息,这个norms中包括了索引时(index-time)的权值(boost)和长度归一化等信息
Index.NOT_ANALYZED_NO_NORMS:既不进行分词也不存储norms信息
Index.NO:不进行索引
最佳实践
Index.NOT_ANALYZED_NO_NORMS YES 标识符(主键、文件名),电话号码,身份证号,姓名,日期
Index.ANALYZED YES 文档标题和摘要
Index.ANALYZED NO 文档正文
Index.NO YES 文档类型,数据库主键(不进行索引)
Index.NOT_ANALYZED NO 隐藏关键字
更多细节可以参考Lucene实战(第二版)