本章代码及资源收录在 jiajia154569836/luncene · GitHub
一.Lucene是apache软件基金子项目,但它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,为开发人员提供一个简单易用的工具包。
二.核心jar:lucene-core,lucene-queryparser,lucene-analyzers-common
三.Lucene核心组件: IndexWriter、Document、Analyzer、IndexSearcher、Query
Demo步骤 (注意:demo代码中实例版本为高版本5.3 与下面ik版本不同)
1 新建indexwriter
2 新建document
3 将document 写入 indexwriter 并创建索引到指定的目录
4 新建indexSearch
5 选择分词器
6 填写查询条件QueryParser
7 通过indexSearch到指定 的目录查询
代码如下:
public class LunceneDemo {
    /** Directory on disk where the index files are written. */
    public static String PATH = "E:\\workspace\\git\\luncene\\index\\hello001";
    String doc1 = "hello abc";
    String doc2 = "hello java";
    String doc3 = "hello 源码";
    String doc4 = "hello File练习";

    /**
     * Index three small documents:
     * 1. open an IndexWriter on PATH
     * 2. build the Documents
     * 3. add them to the writer and commit.
     *
     * try-with-resources guarantees the writer (and the index write lock it
     * holds) is released even when addDocument/commit throws — the original
     * leaked the lock on any exception before close().
     */
    @Test
    public void testCreate() throws IOException {
        Directory d = FSDirectory.open(Paths.get(PATH));
        IndexWriterConfig conf = new IndexWriterConfig(new SimpleAnalyzer());
        try (IndexWriter indexWriter = new IndexWriter(d, conf)) {
            indexWriter.addDocument(buildDoc("1", "doc1", doc1));
            indexWriter.addDocument(buildDoc("2", "doc2", doc2));
            indexWriter.addDocument(buildDoc("3", "doc3", doc3));
            indexWriter.commit();
        }
    }

    /** Builds a document with stored, analyzed id/title/content text fields. */
    private static Document buildDoc(String id, String title, String content) {
        Document doc = new Document();
        doc.add(new TextField("id", id, Field.Store.YES));
        doc.add(new TextField("title", title, Field.Store.YES));
        doc.add(new TextField("content", content, Field.Store.YES));
        return doc;
    }

    /**
     * Parse the query string "content:hello" with the same analyzer that was
     * used at index time, search the index at PATH, and print id/title/content
     * of the top-10 hits.
     *
     * The reader is closed via try-with-resources — the original never closed
     * it, leaking file handles on every run.
     */
    @Test
    public void testSearch() throws IOException, ParseException {
        Directory d = FSDirectory.open(Paths.get(PATH));
        try (IndexReader reader = DirectoryReader.open(d)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Analyzer analyzer = new SimpleAnalyzer();
            QueryParser pa = new QueryParser("content", analyzer);
            Query query = pa.parse("content:hello");
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc s : topDocs.scoreDocs) {
                Document document = searcher.doc(s.doc);
                System.out.println(document.get("id"));
                System.out.println(document.get("title"));
                System.out.println(document.get("content"));
            }
        }
    }
}
四.Lucene需要掌握的核心是分词器及查询
例:SimpleAnalyzer() IKAnalyzer()
BooleanQuery FuzzyQuery
五.IKAnalyzer分词器的操作(注意:代码中实例为3.6 因为高版本不兼容)
1 导入jar IKAnalyzer2012_u6.jar
2 根目录创建resources目录并将stopword.dic,IKAnalyzer.cfg.xml,ext.dic文件放入其中
代码如下:
public class LunceneIKAnalyzer {
public static String PATH = "E:\\workspace\\git\\luncene\\index\\hello003";
String doc1 = "源代码的开源 dadad hahaada 哈哈";
String doc2 ="哈哈 dadads";
String doc3 ="wrwrwr 源码";
/**
 * Tokenizes {@code str} with the given analyzer and prints each token's
 * attribute state (AttributeSource's default toString).
 *
 * The TokenStream consume contract is reset() -> incrementToken()* -> end()
 * -> close(); the original skipped end() and close(), which leaks the stream
 * and leaves the analyzer unable to be reused.
 *
 * @param analyzer the analyzer under test
 * @param str      raw text to segment
 */
public void testAnalyzer(Analyzer analyzer, String str) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(str))) {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.println(tokenStream);
        }
        tokenStream.end(); // required after the final incrementToken()
    }
}
@Test
public void testIk () throws IOException {
// Smoke-test the IK analyzer on a mixed Chinese/English sentence (doc1).
// true          -> smart mode: coarsest-grained segmentation ("最大化原则")
// false/omitted -> fine-grained mode: emit every sub-word split ("最小化原则")
testAnalyzer(new IKAnalyzer(true),doc1);
}
resources目录中导入IK分词器的配置文件 如图:
说明:
stopword.dic 停滞词(分词之后不想出现的词)
IKAnalyzer.cfg.xml IK配置文件
ext.dic 新创建的单词
3 new IKAnalyzer(true); 粗细粒度的划分
//true------最大化原则 例子:源代码的开源
//false-----最小化原则(细粒度划分)例子:源代码的开源 源代码 代码 开源
六. indexableField说明
1 IndexableField相当于数据库中的列(Document相当于行)
2 其中有indexableField 一般使用StringField LongField
3 其中包含FieldType有三个属性
type.setStored(true);//是否存储数据库
type.setIndexOptions();//该字段是否创建索引
type.setTokenized(true);//是否分词
代码如下:
/**
 * Indexes one document whose "content" field uses a hand-built FieldType,
 * demonstrating the three FieldType knobs: stored / indexed / tokenized.
 *
 * try-with-resources releases the writer and its index lock even if
 * addDocument or commit throws — the original leaked both on exception.
 */
@Test
public void testCreate() throws IOException {
    Directory d = FSDirectory.open(Paths.get(PATH));
    IndexWriterConfig conf = new IndexWriterConfig(new SimpleAnalyzer());
    try (IndexWriter indexWriter = new IndexWriter(d, conf)) {
        Document document4 = new Document();
        document4.add(new TextField("id", "4", Field.Store.YES));
        document4.add(new TextField("title", "doc4", Field.Store.YES));
        FieldType type = new FieldType();
        type.setStored(true); // keep the raw value so it can be retrieved from hits
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); // full postings
        type.setTokenized(true); // run the analyzer over the value
        // NOTE(review): 'doc4' is not declared in this class in the excerpt shown
        // (only doc1..doc3 are) — presumably a field carried over from LunceneDemo;
        // confirm against the full source.
        document4.add(new Field("content", doc4, type));
        indexWriter.addDocument(document4);
        indexWriter.commit();
    }
}
}
七. Query查询
Query子类常用的几个
1 PhraseQuery 查询短语
2 WildcardQuery 通配符查询
3 FuzzyQuery 模糊查询 容错查询
4 BooleanQuery 组合查询
代码如下:
public class LunceneQuery {
    /** Directory on disk where the index files are written. */
    public static String PATH = "E:\\workspace\\git\\luncene\\index\\hello006";
    String doc1 = "hello haha abc";
    String doc2 = "hello haha java";
    String doc3 = "hello haha 源码";
    String doc4 = "hello haha File练习";

    /**
     * Index four small documents:
     * 1. open an IndexWriter on PATH
     * 2. build the Documents
     * 3. add them and commit.
     *
     * try-with-resources releases the writer (and the index write lock) even
     * when an add or commit throws — the original leaked the lock on failure.
     */
    @Test
    public void testCreate() throws IOException {
        Directory d = FSDirectory.open(Paths.get(PATH));
        IndexWriterConfig conf = new IndexWriterConfig(new SimpleAnalyzer());
        try (IndexWriter indexWriter = new IndexWriter(d, conf)) {
            indexWriter.addDocument(buildDoc("1", "doc1", doc1));
            indexWriter.addDocument(buildDoc("2", "doc2", doc2));
            indexWriter.addDocument(buildDoc("3", "doc3", doc3));
            indexWriter.addDocument(buildDoc("4", "doc4", doc4));
            indexWriter.commit();
        }
    }

    /** Builds a document with stored, analyzed id/title/content text fields. */
    private static Document buildDoc(String id, String title, String content) {
        Document doc = new Document();
        doc.add(new TextField("id", id, Field.Store.YES));
        doc.add(new TextField("title", title, Field.Store.YES));
        doc.add(new TextField("content", content, Field.Store.YES));
        return doc;
    }

    /**
     * Runs the query against the index at PATH and prints id/title/content of
     * the top-10 hits. Shared by every query demo below — the original
     * copy-pasted this open/search/print loop five times and never closed the
     * reader (file-handle leak).
     */
    private static void searchAndPrint(Query query) throws IOException {
        Directory d = FSDirectory.open(Paths.get(PATH));
        try (IndexReader reader = DirectoryReader.open(d)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc s : topDocs.scoreDocs) {
                Document document = searcher.doc(s.doc);
                System.out.println(document.get("id"));
                System.out.println(document.get("title"));
                System.out.println(document.get("content"));
            }
        }
    }

    // PhraseQuery: "hello" and "abc" must occur within 'slop' positions of
    // each other in the "content" field.
    @Test
    public void testPhraseQuery() throws IOException {
        PhraseQuery query = new PhraseQuery();
        query.setSlop(10); // up to 10 positions allowed between the two terms
        query.add(new Term("content", "hello"));
        query.add(new Term("content", "abc"));
        searchAndPrint(query);
    }

    // WildcardQuery: '?' matches exactly one character, '*' matches any run,
    // so "he???" matches the indexed term "hello".
    @Test
    public void testWildcardQuery() throws IOException {
        searchAndPrint(new WildcardQuery(new Term("content", "he???")));
    }

    // FuzzyQuery: edit-distance matching — the typo "hekko" still finds
    // documents containing "hello".
    @Test
    public void testFuzzyQuery() throws IOException {
        searchAndPrint(new FuzzyQuery(new Term("content", "hekko")));
    }

    // BooleanQuery: two MUST clauses act as a logical AND — only documents
    // containing both "hello" and "java" match.
    @Test
    public void testBooleanQuery() throws IOException {
        BooleanQuery boolQuery = new BooleanQuery();
        boolQuery.add(new TermQuery(new Term("content", "hello")), BooleanClause.Occur.MUST);
        boolQuery.add(new TermQuery(new Term("content", "java")), BooleanClause.Occur.MUST);
        searchAndPrint(boolQuery);
    }

    // QueryParser: parse the textual query "content:hello" with the same
    // analyzer that was used at index time, then search.
    @Test
    public void testSearch() throws IOException, ParseException {
        QueryParser pa = new QueryParser("content", new SimpleAnalyzer());
        searchAndPrint(pa.parse("content:hello"));
    }
}
详细学习可以参考这篇文章
https://www.yiibai.com/lucene/