Lucene is an open-source full-text indexing and search library supported and distributed by the Apache Software Foundation. It offers a simple yet powerful API for building and querying full-text indexes, and in the Java world it is a mature, free, open-source tool; in its own right, Lucene has been, and still is, one of the most popular free Java information-retrieval libraries of recent years.
1. pom.xml
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>4.6.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>4.6.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>4.6.1</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-codecs</artifactId>
    <version>4.6.1</version>
</dependency>
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
</dependency>
2. The testLucene class
// 1. Create a Directory to hold the index; it can live in memory or on disk.
// For an in-memory index use: Directory directory = new RAMDirectory();
File indexDir = new File(this.getClass().getClassLoader().getResource("").getFile()); // e.g. new File("F:\\lucene\\index")

/**
 * Create the index.
 */
@Test
public void createIndex() throws IOException {
    // Directory index = new RAMDirectory();
    Directory index = FSDirectory.open(indexDir);
    // 0. Specify the analyzer for tokenizing text.
    //    The same analyzer should be used for indexing and searching.
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
    // 2.1 Create the IndexWriterConfig, specifying the analyzer and the matching Lucene version
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    // 1. Create the index
    IndexWriter w = new IndexWriter(index, config);
    addDoc(w, "Lucene in Action", "193398817");
    addDoc(w, "Lucene for Dummies", "55320055Z");
    addDoc(w, "Managing Gigabytes", "55063554A");
    addDoc(w, "The Art of Computer Science", "9900333X");
    w.close();
}
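One thing worth noting: the default open mode of IndexWriterConfig is CREATE_OR_APPEND, so every run of createIndex() above adds another four documents to the same directory. If a fresh index is wanted on each run, the open mode can be set explicitly; a minimal sketch using the same classes as above:

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    // CREATE overwrites any existing index; APPEND reuses one; CREATE_OR_APPEND (the default) does either.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter w = new IndexWriter(index, config);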
private void addDoc(IndexWriter w, String title, String isbn) throws IOException {
    // File docDirectory = new File("F:\\lucene\\example");
    Document doc = new Document();
    doc.add(new TextField("title", title, Field.Store.YES));
    // Use a StringField for isbn because we don't want it tokenized
    doc.add(new StringField("isbn", isbn, Field.Store.YES));
    w.addDocument(doc);
    /*for (File file : docDirectory.listFiles()) {
        doc = new Document();
        // Create the search fields and indicate whether they should be tokenized
        doc.add(new TextField("content", new FileReader(file)));
        doc.add(new StringField("filename", file.getName(), Store.YES));
        doc.add(new StringField("path", file.getAbsolutePath(), Store.YES));
        // Write the document to the index
        w.addDocument(doc);
    }*/
}
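The commented-out block above hints at a second use case: indexing every file in a folder instead of hard-coded titles. A minimal sketch of that variant (the folder path F:\lucene\example and the method name createFileIndex are illustrative assumptions, not part of the original code):

@Test
public void createFileIndex() throws IOException {
    // Hypothetical folder containing plain-text files to index
    File docDirectory = new File("F:\\lucene\\example");
    Directory index = FSDirectory.open(indexDir);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46));
    IndexWriter w = new IndexWriter(index, config);
    for (File file : docDirectory.listFiles()) {
        Document doc = new Document();
        // A TextField built from a Reader is tokenized but its content is not stored
        doc.add(new TextField("content", new FileReader(file)));
        doc.add(new StringField("filename", file.getName(), Field.Store.YES));
        doc.add(new StringField("path", file.getAbsolutePath(), Field.Store.YES));
        w.addDocument(doc);
    }
    w.close();
}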
/**
 * Search the index.
 * @throws IOException
 */
@Test
public void search() throws IOException {
    // 2. Build the query
    String querystr = "lucene"; // the term to search for
    // The "title" arg specifies the default field to use
    // when no field is explicitly specified in the query.
    Query q = null;
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        // 4. Create the query, specifying the default search field
        q = new QueryParser(Version.LUCENE_46, "title", analyzer).parse(querystr);
    } catch (Exception e) {
        e.printStackTrace();
    }
    // 3. Search
    int hitsPerPage = 10;
    // 1. Open the folder where the index is stored
    // Directory indexDirectory = FSDirectory.open(new File("F:\\lucene\\index"));
    Directory index = FSDirectory.open(indexDir);
    // 2. Create the IndexReader
    IndexReader reader = DirectoryReader.open(index);
    // Create the IndexSearcher from the IndexReader
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
    searcher.search(q, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;
    // 4. Display results
    System.out.println("Found " + hits.length + " hits.");
    for (int i = 0; i < hits.length; ++i) {
        int docId = hits[i].doc;
        Document d = searcher.doc(docId);
        System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
    }
    // The reader can only be closed when there
    // is no need to access the documents any more.
    reader.close();
    // Expected output:
    // 1. 193398817    Lucene in Action
    // 2. 55320055Z    Lucene for Dummies
}
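Because isbn was indexed as an untokenized StringField, it can also be looked up with an exact TermQuery instead of going through the QueryParser. A small sketch (TermQuery, Term and TopDocs all come from lucene-core; the lookup value is one of the ISBNs indexed above):

    Directory index = FSDirectory.open(indexDir);
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    // Exact match against the untokenized "isbn" field
    Query q = new TermQuery(new Term("isbn", "193398817"));
    TopDocs docs = searcher.search(q, 10);
    for (ScoreDoc hit : docs.scoreDocs) {
        System.out.println(searcher.doc(hit.doc).get("title")); // Lucene in Action
    }
    reader.close();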
/**
 * Tokenization.
 */
@Test
public void cutWords() throws IOException {
    // StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
    // CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_46);
    SimpleAnalyzer analyzer = new SimpleAnalyzer(Version.LUCENE_46);
    String text = "Spark是当前最流行的开源大数据内存计算框架,采用Scala语言实现,由UC伯克利大学AMPLab实验室开发并于2010年开源。";
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            System.out.println(charTermAttribute.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
        analyzer.close();
    }
}
// Output (SimpleAnalyzer lower-cases the text and splits on non-letter characters such as punctuation and digits):
spark是当前最流行的开源大数据内存计算框架
采用scala语言实现
由uc伯克利大学amplab实验室开发并于
年开源
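For comparison, the two analyzers commented out in cutWords() behave quite differently on the same text: StandardAnalyzer emits each CJK character as a separate token, while CJKAnalyzer (also in lucene-analyzers-common) groups CJK characters into overlapping bigrams. A minimal sketch; the helper method name printTokens and the test method compareAnalyzers are my own additions:

private void printTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.print(term.toString() + " | ");
        }
        ts.end();
    } finally {
        ts.close();
    }
    System.out.println();
}

@Test
public void compareAnalyzers() throws IOException {
    String text = "Spark是当前最流行的开源大数据内存计算框架";
    printTokens(new StandardAnalyzer(Version.LUCENE_46), text); // single CJK characters plus "spark"
    printTokens(new CJKAnalyzer(Version.LUCENE_46), text);      // overlapping CJK bigrams plus "spark"
}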