在全文索引工具中,都是由这样的三部分组成
1.索引部分
2.分词部分
3.搜索部分
----------------------------------
索引创建域选项
----------------------------------
Field.Store.YES或者NO(存储域选项)
YES:表示会把这个域中的内容完全存储到文件中,方便进行还原[对于主键,标题可以是这种方式存储]
NO:表示把这个域的内容不存储到文件中,但是可以被索引,此时内容无法完全还原(doc.get())[对于内容而言,没有必要进行存储,可以设置为No]
Field.index(索引选项)
Index.ANALYZED:进行分词和索引,适用于标题,内容等
Index.NOT_ANALYZED:进行索引,但不进行分词,比如身份证号,姓名,ID等,适用于精确搜索
Index.ANALYZED_NO_NORMS:进行分词但是不存储norms信息,这个norms中包含了创建索引的时间和权值(排序)等信息
Index.NOT_ANALYZED_NO_NORMS:既不进行分词也不存储norms信息
Index.NO:不进行索引
最佳实践
NOT_ANALYZED_NO_NORMS Store.YES 标识符(主键,文件名),电话号码,身份证号,姓名,日期
ANALYZED Store.YES 文档标题和摘要
ANALYZED Store.NO 文档正文
NO Store.YES 文档类型,数据库主键(不进行索引)
NOT_ANALYZED Store.NO 隐藏关键字
索引文件结构剖析
.fnm保存着域字段的信息
.fdt和.fdx保存着store=yes的数据
.frq保存着哪些相同的单词出现多少次(可用作排序和评级)
.nrm专门用来保存一些评级信息
.tii和.tis保存着索引里面的所有信息
文档和域的概念
文档相当于表中的每一条记录,域相当于表中的每一个字段
----------------------------------
索引的删除与更新
----------------------------------
1.删除
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
// 参数是一个选项,可以是一个query,也可以是一个term,term是一个精确查找的值
// 此时删除的文档并不会被完全删除,而是存储在一个回收站中,可以恢复
writer.deleteDocuments(new Term("id", "1"));
2.恢复删除
// 使用indexreader恢复
// 将readeronly=false
IndexReader reader = IndexReader.open(directory, false);
reader.undeleteAll();
reader.close();
3.强制删除(清空回收站)
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
// 强制删除数据,清空回收站
// Lucene3.5之前是optimize()方法进行处理,但此方法消耗大量内存已经被弃用
writer.forceMergeDeletes();
4.优化和合并
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
// 添加了多次索引,可以设置允许的最大段索引,会将索引合并为两段,这两段中的被删除的数据会被清空
// 特别注意:此次不建议使用,会消耗大量的开销,Lucene会根据情况自动优化
writer.forceMerge(2);
5.更新索引
writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
/*
* Lucene并没有提供更新的操作,这里的更新是两个操作的合并 先删除之后再添加
*/
Document doc = new Document();
// 先将文档id=1的索引删除,再添加一个新的文档索引
// 先删除再代替的工作
writer.updateDocument(new Term("id", "1"), doc);
--------------------------------------------------
lucene索引_加权操作
--------------------------------------------------
通过Map
假设对特定邮箱进行评级
/*
* document.setBoost(float) 设置评级
*/
String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
//System.out.println(et);
if (scores.containsKey(et)) {
document.setBoost(scores.get(et));
} else {
document.setBoost(0.5f);
}
--------------------------------------------------
对日期和数字进行索引
--------------------------------------------------
private int[] attachs = { 2, 3, 1, 4, 5, 5 };
private Date[] dates = null;
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
dates = new Date[ids.length];
dates[0] = sdf.parse("2015-1-1");
dates[1] = sdf.parse("2015-2-1");
dates[2] = sdf.parse("2015-3-1");
dates[3] = sdf.parse("2015-4-1");
dates[4] = sdf.parse("2015-5-1");
dates[5] = sdf.parse("2015-6-1");
// 为数字添加索引
document.add(new NumericField("attach", Field.Store.YES, true).setIntValue(attachs[i]));
// 给日期添加索引
document.add(new NumericField("date", Field.Store.YES, true).setLongValue(dates[i].getTime()));
1.创建索引代码
/*
 * 一.建立索引
 */
public void index() {
    IndexWriter writer = null;
    try {
        // 1. Create the Directory (where the index is stored).
        // In-memory alternative:
        // Directory directory = new RAMDirectory();
        // File-system based index location:
        Directory directory = FSDirectory
                .open(new File(
                        "F:/BaiduYunDownload/Cache/lucune/LuceneExamples/indexdata"));
        // 2. Create the IndexWriter that adds documents to the index.
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35,
                new StandardAnalyzer(Version.LUCENE_35));// 2nd argument: the analyzer used for tokenizing
        writer = new IndexWriter(directory, conf);
        // 3. Document holder, re-created for each file below.
        Document document = null;
        // 4. Each file in this directory becomes one Document.
        File fs = new File(
                "F:/BaiduYunDownload/Cache/lucune/LuceneExamples/testdata");
        // listFiles() returns null when the path does not exist or is not
        // a directory -- guard against the NPE the original code had here.
        File[] files = fs.listFiles();
        if (files == null) {
            return; // nothing to index; writer is closed in finally
        }
        for (File f : files) {
            document = new Document();
            // Index the file content (tokenized via the Reader, not stored).
            // NOTE(review): FileReader uses the platform default charset.
            document.add(new Field("content", new FileReader(f)));
            // Store the file name un-tokenized so it can be retrieved exactly.
            document.add(new Field("fileName", f.getName(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            // Store the absolute path un-tokenized.
            document.add(new Field("path", f.getAbsolutePath(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            // 5. Add the document to the index.
            writer.addDocument(document);
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (writer != null) {
            try {
                writer.close();
                writer = null;
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
2.对索引进行的增删改更新操作
/*
 * Utility class demonstrating the life cycle of a Lucene 3.5 index:
 * creation, deletion, undelete, forced merging, updating and querying.
 */
public class IndexUtil {
    /*
     * Sample data set: six parallel arrays, one slot per document.
     */
    private String[] ids = { "1", "2", "3", "4", "5", "6" };
    private String[] emails = { "[email protected]", "[email protected]", "[email protected]",
            "[email protected]", "[email protected]", "[email protected]" };
    private String[] contents = { "hello boy,i like pingpang", "like boy",
            "xx bye i like swim", "hehe, i like basketball",
            "dd fsfs, i like movie", "hello xxx,i like game" };
    private int[] attachs = { 2, 3, 1, 4, 5, 5 };
    private Date[] dates = null;
    private String[] names = { "lili", "wangwu", "lisi", "jack", "tom", "mark" };
    // Boost factors keyed by the part of the email after '@'.
    // Typed Map<String, Float>: the original raw Map returned Object from
    // get(), which cannot be passed to Document.setBoost(float).
    private Map<String, Float> scores = new HashMap<String, Float>();
    /*
     * Index storage location, initialized in the constructor.
     */
    private Directory directory = null;

    public IndexUtil() throws Exception {
        // Populate the date array used by the numeric "date" field.
        createDate();
        // Register the per-domain boosts.
        scores.put("sina", 2.0f);
        scores.put("google", 1.5f);
        directory = FSDirectory.open(new File(
                "F:/BaiduYunDownload/Cache/lucune/Code/code01/indexdata"));
    }

    /*
     * Initialize one date per document.
     */
    private void createDate() throws Exception {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        dates = new Date[ids.length];
        dates[0] = sdf.parse("2015-1-1");
        dates[1] = sdf.parse("2015-2-1");
        dates[2] = sdf.parse("2015-3-1");
        dates[3] = sdf.parse("2015-4-1");
        dates[4] = sdf.parse("2015-5-1");
        dates[5] = sdf.parse("2015-6-1");
    }

    // Best-effort close of an IndexWriter, shared by all writer-using
    // methods below (matches the original per-method finally blocks).
    private static void closeWriter(IndexWriter writer) {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /*
     * Rebuild the index from scratch: delete every existing document,
     * then add one document per slot of the sample arrays.
     */
    public void index() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            // Wipe whatever is currently indexed.
            writer.deleteAll();
            Document document = null;
            for (int i = 0; i < ids.length; i++) {
                document = new Document();
                // id: exact-match key, stored, no norms needed.
                document.add(new Field("id", ids[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
                // email: stored and indexed without tokenizing.
                document.add(new Field("email", emails[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                // content: tokenized for full-text search, not stored.
                document.add(new Field("content", contents[i], Field.Store.NO,
                        Field.Index.ANALYZED));
                document.add(new Field("name", names[i], Field.Store.YES,
                        Field.Index.NOT_ANALYZED));
                // Numeric field for the attachment count.
                document.add(new NumericField("attach", Field.Store.YES, true)
                        .setIntValue(attachs[i]));
                // Dates indexed as their epoch-millisecond value.
                document.add(new NumericField("date", Field.Store.YES, true)
                        .setLongValue(dates[i].getTime()));
                /*
                 * document.setBoost(float): boost by the text after '@';
                 * unknown domains are demoted below the 1.0 default.
                 */
                String et = emails[i].substring(emails[i].lastIndexOf("@") + 1);
                if (scores.containsKey(et)) {
                    document.setBoost(scores.get(et));
                } else {
                    document.setBoost(0.5f);
                }
                writer.addDocument(document);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /*
     * Expunge deleted documents (empties the "recycle bin").
     * Before Lucene 3.5 this was optimize(), now deprecated because of
     * its heavy resource cost.
     */
    public void forceMerge() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            writer.forceMergeDeletes();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /*
     * Manually merge the index down to at most two segments; deleted
     * documents in the merged segments are purged.
     * NOTE: not recommended in normal use -- expensive; Lucene merges
     * automatically as needed.
     */
    public void merge() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            writer.forceMerge(2);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /*
     * Restore all soft-deleted documents. Requires a read-write reader
     * (readOnly = false).
     */
    public void undelete() throws Exception {
        IndexReader reader = IndexReader.open(directory, false);
        reader.undeleteAll();
        reader.close();
    }

    /*
     * Soft-delete the document whose id is "1". deleteDocuments accepts
     * either a Query or a Term (a Term is an exact match). The document
     * goes to the "recycle bin" and can still be restored via undelete().
     */
    public void deleteIndex() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            writer.deleteDocuments(new Term("id", "1"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /*
     * Update the document with id "1". Lucene has no in-place update:
     * updateDocument is delete-then-add.
     * NOTE(review): the replacement Document has no fields, so the old
     * document is effectively replaced by an empty one -- confirm this
     * is intended.
     */
    public void update() {
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, new IndexWriterConfig(
                    Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            Document doc = new Document();
            writer.updateDocument(new Term("id", "1"), doc);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeWriter(writer);
        }
    }

    /*
     * Print document counts. numDocs() excludes soft-deleted documents;
     * maxDoc() includes them. The reader is now closed (the original
     * leaked it).
     */
    public void Query() throws Exception {
        IndexReader reader = IndexReader.open(directory);
        try {
            System.out.println("本索引存储的文档数:" + reader.numDocs());
            System.out.println("总文档数(包括回收站):" + reader.maxDoc());
        } finally {
            reader.close();
        }
    }

    /*
     * Exact-term search on the "content" field; prints the top 10 hits.
     * Searcher and reader are closed in finally (the original leaked both).
     */
    public void Search() {
        IndexReader reader = null;
        try {
            reader = IndexReader.open(directory);
            IndexSearcher search = new IndexSearcher(reader);
            try {
                TermQuery query = new TermQuery(new Term("content", "like"));
                TopDocs tds = search.search(query, 10);
                for (ScoreDoc sd : tds.scoreDocs) {
                    Document doc = search.doc(sd.doc);
                    System.out.println(sd.doc + doc.get("name") + "["
                            + doc.get("email") + "," + doc.get("id") + ","
                            + doc.get("attach") + "," + doc.get("date") + "]");
                }
            } finally {
                search.close();
            }
        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}