一:下载lucene 以下版本, lucene-3.0.1 及lucene-2.9.2
3.0.1 版本的与以前的版本有很大的差异,而像paoding-analysis-2.0.4-beta (庖丁 中文分词器,还只能使用2.*版本的) ,故两种版本都 下载,进行对照学习
lucene-analyzers-2.9.2.jar
lucene-core-2.9.2.jar
lucene-highlighter-2.9.2.jar
将以上三个文件放到classpath 下就可以了
这个文件是 庖丁 中文分词器 里的 需要单独下载 paoding-analysis-2.0.4-beta
paoding-analysis.jar
(关于中文分词器)
如一句话 "人之所以痛苦,在于追求错误的东西"
庖丁 会分成 : 之所以 痛苦 在于 追求 错误 东西
lucene 自带的分词器会分成(每个汉字) : 人 之 所 以 痛 苦 在 于 追 求 错 误 的 东 西
一般中文都用庖丁
=====================================================================
将 庖丁加入到项目中来
1 paoding-analysis.jar 加到路径
2 copy /paoding-analysis-2.0.4-beta/dic/ 词典到项目根路径,dic与src 平级
3 copy paoding-analysis-2.0.4-beta/src/paoding-dic-home.properties 到src 下并修改为
#values are "system-env" or "this"; 可以是 this 或 system-env 两个值
#if value is "this" , using the paoding.dic.home as dicHome if configed! 如果配置成this 就用下面配的paoding.dic.home 路径作为词典路径
#注意: 属性名中的 "fisrt" 是 paoding-analysis 库本身定义的拼写, 不能改成 "first"
paoding.dic.home.config-fisrt=this
#dictionary home (directory)
#"classpath:xxx" means dictionary home is in classpath.
#e.g "classpath:dic" means dictionaries are in "classes/dic" directory or any other classpath directory
paoding.dic.home=dic
#seconds for dic modification detection
paoding.dic.detector.interval=60
======================================================================
import java.io.File;
import java.io.IOException;
import java.util.List;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.jixiuf.compass.pojo.Book;
import org.junit.Test;
public class Test1 {
//一个目录 ,lucene 存储索引的地方,(可以是文件系统上的目录,也可以是内存中,)
Directory dir;
@org.junit.Before
public void setUp() {
    // Open (or create) the index directory on the local filesystem.
    // A Directory can also live purely in memory (RAMDirectory).
    File target = new File("target");
    if (!target.exists()) {
        target.mkdirs();
    }
    try {
        dir = FSDirectory.open(target);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
//将一本书的 id name author 添加到索引 中
// 注意这里的几个类:
//   IndexWriter — 向索引目录中写入数据
//   Document   — 相当于数据库中的一条记录
//   Field      — 相当于数据库的一个字段: new Field(name, value)
// 此方法做的事相当于:
//   insert into book (id, name, author) values ("00001", "围城", "钱钟书");
// 如果想把多个 Book 存进去, 多建几个 Document
@Test
public void testSave() {
    // Index one Book as a Document; roughly equivalent to:
    //   insert into book (id, name, author) values ("00001", "english", "jack")
    Book b = new Book();
    b.setId("00001");
    b.setName("english ");
    b.setAuthor("jack");
    IndexWriter out = null;
    try {
        // create=true: start a fresh index, overwriting any existing one.
        out = new IndexWriter(dir, new PaodingAnalyzer(), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        // id is a key, so index it verbatim (NOT_ANALYZED); name/author are tokenized.
        Field idF = new Field("id", b.getId(), Field.Store.YES,
                Field.Index.NOT_ANALYZED);
        Field nameF = new Field("name", b.getName(), Field.Store.YES,
                Field.Index.ANALYZED);
        Field authorF = new Field("author", b.getAuthor(), Field.Store.YES,
                Field.Index.ANALYZED);
        Document d = new Document();
        d.add(idF);
        d.add(nameF);
        d.add(authorF);
        out.addDocument(d);
        out.commit();
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: always close the writer so the index write lock is released;
        // otherwise later IndexWriters fail with LockObtainFailedException.
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
===========================
@Test
public void testRead() {
    // Dump every stored Document (≈ every row) and its stored fields.
    IndexReader reader = null;
    try {
        reader = IndexReader.open(dir);
        for (int i = 0; i < reader.numDocs(); i++) {
            Document d = reader.document(i); // one Document ≈ one database record
            List<Fieldable> fields = d.getFields(); // its stored fields ≈ columns
            for (Fieldable f : fields) {
                if (f instanceof Field) {
                    System.out.println(f.name() + ":===" + f.stringValue());
                }
            }
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: release the reader's file handles.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
//search
@Test
public void testSearch() {
    // Exact-term query: find documents whose "name" field contains the term "english".
    try {
        IndexSearcher sc = new IndexSearcher(dir);
        Query query = new TermQuery(new Term("name", "english"));
        TopDocs docs = sc.search(query, 10);
        System.out.println(docs.totalHits);
        // FIX: iterate the returned hits and use their doc ids.
        // The original looped i over totalHits and called sc.doc(i) — but i is a
        // hit rank, not a document id, and totalHits can exceed the 10 docs
        // actually retrieved, causing wrong rows or an out-of-range doc access.
        for (ScoreDoc sd : docs.scoreDocs) {
            Document d = sc.doc(sd.doc);
            System.out.println(d.get("name"));
            System.out.println(d.get("id"));
        }
        sc.close(); // release the searcher's underlying reader
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
@Test
public void testSearch2() {
    // Parse a query string with QueryParser and collect the top hits.
    IndexSearcher sc = null;
    try {
        sc = new IndexSearcher(dir);
        QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "id",
                new StandardAnalyzer(Version.LUCENE_CURRENT));
        Query q = parser.parse("id:00002"); // documents whose id term is 00002
        TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
        sc.search(q, c);
        System.out.println("共有数据document的条数:" + c.getTotalHits());
        ScoreDoc[] docs = c.topDocs().scoreDocs;
        for (ScoreDoc doc : docs) {
            System.out.println("第" + doc.doc + "条");
            Document d = sc.doc(doc.doc);
            List<Fieldable> fs = d.getFields();
            for (Fieldable f : fs) {
                System.out.println("============");
                System.out.println(f.stringValue());
            }
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        // FIX: close the searcher to release its underlying index reader.
        if (sc != null) {
            try {
                sc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
@Test
public void testDel() {
    // Delete every document whose "id" term equals "00002", then commit.
    IndexWriter out = null;
    try {
        out = new IndexWriter(dir, new PaodingAnalyzer(),
                MaxFieldLength.LIMITED);
        out.deleteDocuments(new Term("id", "00002"));
        out.commit();
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: close in finally — the original skipped close() (and leaked the
        // index write lock) whenever deleteDocuments/commit threw.
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
// 词条搜索
@Test
public void testTermSearch() {
    // Term query: exact match on the "name" field for the token "围城".
    IndexSearcher sc = null;
    try {
        sc = new IndexSearcher(dir, true); // true = read-only searcher
        Term t = new Term("name", "围城");
        TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
        sc.search(new TermQuery(t), c);
        int count = c.getTotalHits();
        System.out.println(count);
        ScoreDoc[] docs = c.topDocs().scoreDocs;
        for (ScoreDoc doc : docs) {
            System.out.println("第" + doc.doc + "条");
            Document d = sc.doc(doc.doc);
            List<Fieldable> fs = d.getFields();
            for (Fieldable f : fs) {
                System.out.println("============");
                System.out.println(f.stringValue());
            }
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: close the searcher to release its underlying index reader.
        if (sc != null) {
            try {
                sc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
@Test
public void testBoolean() {
    // Boolean combination: name MUST contain "围城", id SHOULD be "00001".
    IndexSearcher sc = null;
    try {
        sc = new IndexSearcher(dir);
        TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
        BooleanQuery b = new BooleanQuery();
        b.add(new TermQuery(new Term("name", "围城")), Occur.MUST);
        b.add(new TermQuery(new Term("id", "00001")), Occur.SHOULD);
        sc.search(b, c);
        int count = c.getTotalHits();
        System.out.println(count);
        // FIX: fetch documents via the collected hits' doc ids. The original
        // called sc.doc(i) with i = 0..count-1, but a hit's rank is not its
        // document id, so it printed the wrong documents (or read past the index).
        for (ScoreDoc sd : c.topDocs().scoreDocs) {
            Document d = sc.doc(sd.doc);
            System.out.println(d);
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release the searcher's underlying reader
        if (sc != null) {
            try {
                sc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
@Test
// 字符范围 (term range query over the "char" field)
public void testTermRange1() {
    // Index two single-letter docs, then query the inclusive range ["a", "z"].
    IndexWriter out = null;
    IndexSearcher sc = null;
    try {
        // create=false: append to the existing index.
        out = new IndexWriter(dir, new PaodingAnalyzer(),
                false, IndexWriter.MaxFieldLength.UNLIMITED);
        Document d = new Document();
        d.add(new Field("char", "b", Store.YES, Index.NOT_ANALYZED));
        out.addDocument(d);
        d = new Document();
        d.add(new Field("char", "a", Store.YES, Index.NOT_ANALYZED));
        out.addDocument(d);
        out.commit();
        // FIX: close the writer before searching — the original never closed
        // it, leaking the index write lock for all subsequent tests.
        out.close();
        out = null;
        sc = new IndexSearcher(dir);
        TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
        // includeLower=true, includeUpper=true: a <= term <= z
        TermRangeQuery tq = new TermRangeQuery("char", "a", "z", true, true);
        System.out.println(tq.toString());
        sc.search(tq, c);
        System.out.println(c.getTotalHits());
        for (ScoreDoc dc : c.topDocs().scoreDocs) {
            System.out.println(sc.doc(dc.doc));
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release writer lock / reader handles even on failure paths
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (sc != null) {
            try {
                sc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
@Test
// 前缀 (prefix query)
public void testPrefix() {
    // Index "ab", "abc", "aabc", then search for terms starting with "ab".
    IndexWriter out = null;
    IndexSearcher sc = null;
    try {
        out = new IndexWriter(dir, new PaodingAnalyzer(),
                false, IndexWriter.MaxFieldLength.UNLIMITED);
        Document d = new Document();
        d.add(new Field("prefix", "ab", Store.YES, Index.NOT_ANALYZED));
        out.addDocument(d);
        d = new Document();
        d.add(new Field("prefix", "abc", Store.YES, Index.NOT_ANALYZED));
        out.addDocument(d);
        d = new Document();
        d.add(new Field("prefix", "aabc", Store.YES, Index.NOT_ANALYZED));
        out.addDocument(d);
        out.commit();
        out.close();
        out = null;
        sc = new IndexSearcher(dir, true);
        TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
        // Matches "ab" and "abc" but not "aabc" (prefix, not substring).
        PrefixQuery q = new PrefixQuery(new Term("prefix", "ab"));
        System.out.println(q.toString());
        sc.search(q, c);
        System.out.println(c.getTotalHits());
        for (ScoreDoc doc : c.topDocs().scoreDocs) {
            System.out.println(sc.doc(doc.doc).getField("prefix")
                    .stringValue());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: the original closed the writer only on the success path and
        // never closed the searcher; release both here.
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (sc != null) {
            try {
                sc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
@Test
// 短语 (phrase query)
public void testPhrase() {
    // Phrase search on "name": the terms 我们 and 孩子 within a slop of 2.
    IndexSearcher sc = null;
    try {
        sc = new IndexSearcher(dir, true);
        TopScoreDocCollector c = TopScoreDocCollector.create(10, true);
        PhraseQuery q = new PhraseQuery();
        q.add(new Term("name", "我们"));
        q.add(new Term("name", "孩子"));
        q.setSlop(2); // 表示 我们 和 孩子 两词间可以有2 个不相干的词
        System.out.println(q.toString());
        sc.search(q, c);
        System.out.println(c.getTotalHits());
        for (ScoreDoc doc : c.topDocs().scoreDocs) {
            System.out.println(sc.doc(doc.doc).getField("name")
                    .stringValue());
            // explain() shows how the hit's score was computed.
            System.out.println(sc.explain(q, doc.doc).toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // FIX: close the searcher to release its underlying index reader.
        if (sc != null) {
            try {
                sc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
}