Hello Lucene

1.Maven Dependency

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<!-- Maven build descriptor for the Hello Lucene sample project. -->
	<modelVersion>4.0.0</modelVersion>
	<groupId>Lucene3.6</groupId>
	<artifactId>Lucene3.6</artifactId>
	<version>1</version>

	<!-- Shared settings: source encoding and the dependency versions referenced below. -->
	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<lucene-version>3.6.2</lucene-version>
		<junit-version>4.11</junit-version>
	</properties>

	<dependencies>
		<!-- Lucene core library; version comes from the lucene-version property. -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>${lucene-version}</version>
		</dependency>

		<!-- JUnit; version comes from the junit-version property. -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>${junit-version}</version>
		</dependency>

		<!-- Apache Commons IO, pinned to 2.4. -->
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.4</version>
		</dependency>
	</dependencies>

	<build>
		<!-- Sources are read from ./src rather than the Maven default src/main/java. -->
		<sourceDirectory>src</sourceDirectory>
		<plugins>
			<!-- Compile with Java 7 source and target levels. -->
			<plugin>
				<artifactId>maven-compiler-plugin</artifactId>
				<version>3.0</version>
				<configuration>
					<source>1.7</source>
					<target>1.7</target>
				</configuration>
			</plugin>
		</plugins>
	</build>
</project>

 

 

2.全文搜索

分为三部分:

索引

1、创建Directory

2、创建IndexWriter

3、创建Document对象

4、为Document对象添加Field

5、通过IndexWriter添加文档到索引中

 

分词

 

搜索

1、创建Directory

2、创建IndexReader

3、根据IndexReader创建IndexSearcher

4、创建搜索的Query

5、根据searcher搜索并且返回TopDocs

6、根据TopDocs获取ScoreDocs对象

7、根据searcher和ScoreDocs对象获取具体的Document对象

8、根据Document对象获取需要的值

 

 

3.src

HelloLucene.java

package org.fool.lucene;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

/**
 * Minimal Lucene 3.6 example: builds a file-system index from the files found
 * in a source directory and runs a fixed query against that index.
 */
public class HelloLucene
{
	/**
	 * Builds the index.
	 * <p>
	 * Every regular file directly inside {@code C:/lucene} becomes one
	 * document in the index stored at {@code C:/index}. Each document has a
	 * tokenized, non-stored {@code content} field and stored, non-analyzed
	 * {@code fileName} and {@code filePath} fields.
	 * <p>
	 * I/O failures are reported on standard error and do not propagate to the
	 * caller.
	 */
	public void index()
	{
		// listFiles() returns null when the path does not exist or is not a
		// directory; check before opening the index so nothing is created.
		File[] files = new File("C:/lucene").listFiles();
		if (files == null)
		{
			System.err.println("Source directory is missing or unreadable: C:/lucene");
			return;
		}

		// 1. Open the Directory (on disk) and 2. create the IndexWriter.
		// try-with-resources closes the writer first and then the directory,
		// even when indexing fails part-way through.
		try (Directory directory = FSDirectory.open(new File("C:/index"));
				IndexWriter writer = new IndexWriter(directory,
						new IndexWriterConfig(Version.LUCENE_36,
								new StandardAnalyzer(Version.LUCENE_36))))
		{
			for (File file : files)
			{
				// Sub-directories cannot be opened by FileReader and would
				// abort the whole run, so only regular files are indexed.
				if (!file.isFile())
				{
					continue;
				}

				// 3. Create the Document and 4. add its Fields.
				Document doc = new Document();

				// The content is streamed from a Reader, so it is tokenized
				// and indexed but not stored. To retrieve the text from search
				// results, read the file into a String and add it with
				// Field.Store.YES instead.
				// NOTE(review): per the Lucene Field(String, Reader) docs the
				// reader is consumed during addDocument, so it must not be
				// closed before that call - confirm before restructuring.
				doc.add(new Field("content", new FileReader(file)));
				doc.add(new Field("fileName", file.getName(), Field.Store.YES,
						Field.Index.NOT_ANALYZED));
				doc.add(new Field("filePath", file.getAbsolutePath(),
						Field.Store.YES, Field.Index.NOT_ANALYZED));

				// 5. Add the document to the index through the IndexWriter.
				writer.addDocument(doc);
			}
		}
		catch (IOException e)
		{
			// Also covers CorruptIndexException, LockObtainFailedException
			// and FileNotFoundException, which are IOException subclasses.
			e.printStackTrace();
		}
	}

	/**
	 * Searches the index.
	 * <p>
	 * Parses the query {@code World} against the {@code content} field of the
	 * index at {@code C:/index} and prints {@code fileName(filePath)} for each
	 * of the top 10 hits.
	 * <p>
	 * I/O and query-parsing failures are reported on standard error and do not
	 * propagate to the caller.
	 */
	public void search()
	{
		// 1. Open the Directory, 2. the IndexReader and 3. the IndexSearcher.
		// try-with-resources closes only what was actually opened, in reverse
		// order, so a missing index no longer ends in a NullPointerException.
		try (Directory directory = FSDirectory.open(new File("C:/index"));
				IndexReader reader = IndexReader.open(directory);
				IndexSearcher searcher = new IndexSearcher(reader))
		{
			// 4. Build the Query.
			QueryParser parser = new QueryParser(Version.LUCENE_36, "content",
					new StandardAnalyzer(Version.LUCENE_36));
			Query query = parser.parse("World");

			// 5. Run the search and get the TopDocs.
			TopDocs tds = searcher.search(query, 10);

			// 6. Walk the ScoreDoc entries of the TopDocs.
			for (ScoreDoc sd : tds.scoreDocs)
			{
				// 7. Load the Document for this hit through the searcher.
				Document document = searcher.doc(sd.doc);

				// 8. Read the stored values from the Document.
				System.out.println(document.get("fileName") + "("
						+ document.get("filePath") + ")");
			}
		}
		catch (IOException | ParseException e)
		{
			e.printStackTrace();
		}
	}
}

 

 

4.test

TestHelloLucene.java

package org.fool.lucene;

import org.junit.Before;
import org.junit.Test;

/**
 * JUnit 4 smoke tests that invoke the {@link HelloLucene} indexing and search
 * entry points. The tests make no assertions; they pass as long as no
 * exception escapes the call.
 */
public class TestHelloLucene
{
	/** Instance under test, recreated before every test method. */
	private HelloLucene helloLucene;

	/**
	 * Creates a fresh {@link HelloLucene} for the next test.
	 */
	@Before
	public void setUp() throws Exception
	{
		this.helloLucene = new HelloLucene();
	}

	/**
	 * Runs the indexing step.
	 */
	@Test
	public void testIndex()
	{
		this.helloLucene.index();
	}

	/**
	 * Runs the search step.
	 */
	@Test
	public void testSearch()
	{
		this.helloLucene.search();
	}
}

 

5.Details

Field.Store.YES/NO(存储域选项)

设置为YES表示会把这个域中的内容完全存储到文件中,方便进行文本的还原

设置为NO表示不把这个域的内容存储到文件中,但是仍然可以被索引,此时内容无法通过doc.get完全还原

 

Field.Index.*(索引域选项)

Index.ANALYZED:进行分词和索引,适用于标题、内容等

Index.NOT_ANALYZED:进行索引,但是不进行分词,如身份证号、姓名、ID等,适用于精确搜索

Index.ANALYZED_NO_NORMS:进行分词但是不存储norms信息,norms中保存了索引时的权值(boost)和域长度归一化等用于评分的信息

Index.NOT_ANALYZED_NO_NORMS:既不进行分词也不存储norms信息

Index.NO:不进行索引

 

最佳实践

Index.NOT_ANALYZED_NO_NORMS        YES        标识符(主键、文件名),电话号码,身份证号,姓名,日期

Index.ANALYZED                 YES        文档标题和摘要

Index.ANALYZED                 NO          文档正文

Index.NO                              YES        文档类型,数据库主键(不进行索引)

Index.NOT_ANALYZED        NO          隐藏关键字

 

更多细节可以参考Lucene实战(第二版)

 

 

你可能感兴趣的:(Hello Lucene)