JARs used:
lucene-core-2.4.0.jar
// Lucene core.
lucene-analyzers-2.4.0.jar
// Analyzers.
paoding-analysis-2.0.4-alpha2
// The Paoding analyzer.
// To search with Lucene we first have to build an index.
public void createLuceneIndex(List<Bean> beanList) throws Exception {
    System.out.println("--------- creating index ------------");
    // "analyzer" is assumed to be a class-level PaodingAnalyzer field shared by these methods.
    Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
    // IndexWriter arguments: directory, analyzer, create flag (true rebuilds the index), maximum field length.
    IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, true, MaxFieldLength.LIMITED);
    // Create an in-memory directory as a staging area.
    Directory ramDir = new RAMDirectory(fsDir);
    IndexWriter ramIndexWriter = new IndexWriter(ramDir, analyzer, MaxFieldLength.LIMITED);
    for (Bean bean : beanList) {
        // Convert the bean into a Document.
        Document doc = DocumentUtil.getDocument(bean);
        System.out.println("doc number ------> : " + ramIndexWriter.numRamDocs());
        System.out.println("article title ----> : " + bean.getTitle());
        ramIndexWriter.addDocument(doc);
    }
    // Close the RAM index writer.
    ramIndexWriter.optimize();
    ramIndexWriter.close();
    // Merge the in-memory index into the on-disk index.
    fsindexWriter.addIndexesNoOptimize(new Directory[]{ ramDir });
    System.out.println("--------- index created ------------");
    fsindexWriter.optimize();
    fsindexWriter.close();
}
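A minimal driver for the method above; the class name IndexService, the Bean setters, and the sample strings are assumptions for illustration (Bean itself is sketched after getDocument below):

List<Bean> beanList = new ArrayList<Bean>();
Bean bean = new Bean();
bean.setTitle("中国新闻"); // Sample title containing the query term used later ("中国").
bean.setText("这是一篇测试文章。");
beanList.add(bean);
new IndexService().createLuceneIndex(beanList); // IndexService: hypothetical class holding createLuceneIndex.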
// The data we insert when indexing is of type Document, so we need a method that converts a Bean into a Document.
private static int index = 0; // Class-level counter used as each document's "gid", so a single document can be addressed later.
private static synchronized int getNextIndex() {
    return index++;
}
public synchronized static Document getDocument(Bean bean) {
    Document doc = new Document();
    // Field arguments: name, value, store policy (Store.COMPRESS stores the value compressed), index policy
    // (Index.NOT_ANALYZED indexes without tokenizing; Index.ANALYZED tokenizes then indexes; Index.NO does not index).
    // gid is indexed NOT_ANALYZED so it can later be matched exactly by a Term (e.g. when deleting).
    doc.add(new Field("gid", getNextIndex() + "", Store.COMPRESS, Index.NOT_ANALYZED));
    doc.add(new Field("title", bean.getTitle(), Store.COMPRESS, Index.ANALYZED));
    doc.add(new Field("text", bean.getText(), Store.COMPRESS, Index.ANALYZED));
    return doc;
}
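The Bean class is not shown in the original; a minimal POJO that satisfies the getters used above (the setters are my addition) could look like this:

public class Bean {
    private String title;
    private String text;
    public String getTitle() { return title; }
    public void setTitle(String title) { this.title = title; }
    public String getText() { return text; }
    public void setText(String text) { this.text = text; }
}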
// Insert a document into the existing index.
public synchronized boolean insertIndex(Bean bean) throws Exception {
    System.out.println("inserting document........");
    // Open a reader just to check the document count before the insert; this check is optional.
    IndexReader indexReader = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
    System.out.println("count before insert: --> " + indexReader.numDocs());
    indexReader.close();
    // Open the index directory.
    Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
    // Open a writer on the existing index (create flag = false).
    IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, false, MaxFieldLength.LIMITED);
    Document doc = DocumentUtil.getDocument(bean); // Convert the bean into a Document.
    fsindexWriter.addDocument(doc); // Add the document.
    fsindexWriter.optimize();
    fsindexWriter.close();
    IndexReader indexReaderLast = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
    System.out.println("count after insert: --> " + indexReaderLast.numDocs());
    indexReaderLast.close();
    return true;
}
// Delete the matching document from the index.
public synchronized boolean deleteIndex(int gid) throws Exception {
    Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
    IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, false, MaxFieldLength.LIMITED);
    // Delete by the gid we assigned ourselves: a Term match on "gid" finds exactly the document we want removed.
    fsindexWriter.deleteDocuments(new Term("gid", gid + ""));
    fsindexWriter.commit();
    fsindexWriter.close();
    IndexReader indexReader2 = IndexReader.open("E:\\Workspaces\\Lucene\\FileIndex");
    System.out.println("count after delete ----------> " + indexReader2.numDocs());
    indexReader2.close();
    return true;
}
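With this setup an update is a delete by gid followed by a fresh insert. Lucene's IndexWriter also offers updateDocument(Term, Document), which performs the delete-then-add in one call; below is a sketch under the same assumptions (shared analyzer field, same index path). Note that getDocument assigns a fresh gid, so the updated document becomes addressable by its new gid.

public synchronized boolean updateIndex(int gid, Bean bean) throws Exception {
    Directory fsDir = FSDirectory.getDirectory("E:\\Workspaces\\Lucene\\FileIndex");
    IndexWriter fsindexWriter = new IndexWriter(fsDir, analyzer, false, MaxFieldLength.LIMITED);
    // Deletes every document whose "gid" term matches, then adds the new document.
    fsindexWriter.updateDocument(new Term("gid", gid + ""), DocumentUtil.getDocument(bean));
    fsindexWriter.optimize();
    fsindexWriter.close();
    return true;
}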
public class LuceneTest {
    public static void main(String[] args) throws Exception {
        String queryStr = "中国";
        // The analyzer used for searching should be the same one used for indexing.
        Analyzer analyzer = new PaodingAnalyzer();
        // Parse the query text into a Query object.
        String[] fields = {"title"};
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer);
        Query query = queryParser.parse(queryStr);
        // Filter; null means no filtering.
        Filter filter = null;
        IndexSearcher indexSearcher = new IndexSearcher("E:\\Workspaces\\Lucene\\FileIndex");
        // search() arguments: query, filter, maximum number of documents to return at once.
        TopDocs topDocs = indexSearcher.search(query, filter, 10000);
        System.out.println("hits: " + topDocs.totalHits);
        for (ScoreDoc topDoc : topDocs.scoreDocs) {
            int docid = topDoc.doc; // Lucene's internal document number.
            System.out.println(docid);
            Document doc = indexSearcher.doc(docid);
            System.out.println(doc.get("text"));
        }
    }
}
// Example of running the analyzer on its own to see how text is tokenized.
package com.testLucene.analyzer;

import java.io.StringReader;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

public class AnalyzerTest {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new PaodingAnalyzer();
        String str = "中华人民共和国";
        new AnalyzerTest().analyzer(analyzer, str);
    }

    public void analyzer(Analyzer analyzer, String str) throws Exception {
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(str));
        // Reuse a single Token instance; next(Token) returns null once the stream is exhausted.
        for (Token token = new Token(); (token = tokenStream.next(token)) != null; ) {
            System.out.println(token);
        }
    }
}
How to configure the Paoding analyzer.
Version: paoding-analysis-2.0.4-alpha2
Paoding needs an environment variable pointing at its dictionary directory (I have not tested whether it works without it):
1. Set PAODING_DIC_HOME = E:\MyDocument\paoding-analysis-2.0.4-alpha2\dic
2. Copy E:\MyDocument\paoding-analysis-2.0.4-alpha2\src\paoding-dic-home.properties into the project's src directory.
3. Edit paoding-dic-home.properties so it reads:
paoding.dic.home=/MyDocument/paoding-analysis-2.0.4-alpha2/dic
If you get the error: Caused by: java.lang.ClassNotFoundException: org.apache.commons.logging.LogFactory
add one more JAR: commons-logging-1.0.4.jar.
With that, the Paoding analyzer is set up.