1. Why do we use Lucene?
1) If we want to execute the query like this:
(content like '%DataStructure%') or (content like '%XMU%') in a DB, the database has to scan the whole content from start to end, which is very inefficient.
Lucene instead builds an index over the whole content. To execute a query like the one above, we only have to search the index files rather than the real content, which is much more efficient.
2) If we want to search the content of attachments, it would be impossible using DB technology.
2. Which versions of Lucene are there?
1) 2.9-Core
2) 3.0-Core --> There is a big difference from 2.9
3) 3.5-Core --> There are some big differences from 3.0
3. All full-text indexing tools consist of three parts:
1) Index part ---> What kind of information should be stored in index files?
---> Eg. (I am a boy.) Should 'a' be stored in index files?
2) Tokenization (word segmentation) part ---> How should a sentence be broken into terms?
3) Search part ---> How should the sentence be searched for in the index files?
4. An example of creating an index using Lucene
1. Core function
package edu.xmu.lucene.Lucene_ModuleOne; import java.io.File; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; /** * Hello world! * */ public class App { /** * Create Index * * @throws IOException * @throws LockObtainFailedException * @throws CorruptIndexException */ public void buildIndex() throws CorruptIndexException, LockObtainFailedException, IOException { // 1. Create Directory // --> Where the directory be stored? Memory or HardDisk? // Directory dir = new RAMDirectory(); --> Index File Stored in MEM Directory dir = FSDirectory.open(new File("E:/LuceneIndex")); // 2. Create IndexWriter // --> It is used to write data into index files IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)); IndexWriter writer = new IndexWriter(dir, config); // Before 3.5 the way to create index is like below(depreciated): // new IndexWriter(Direcotry d, Analyzer a, boolean c, MaxFieldLength // mfl); // d: Directory, a: Analyzer, c: Shoule we create new one each time // mlf: The max length of the field to be indexed. // 3. Create Document // --> The target we want to search may be a doc file or a table in DB. // --> The path, name, size and modified date of the file. // --> All the information of the file should be stored in the Document. Document doc = null; // 4. Each Item of The Document is Called a Field. // --> The relationship of document and field is like table and cell. // Eg. We want to build index for all the txt file in the c:/lucene dir. 
// So each txt file in this dir is called a document. // And the name, size, modified date, content is called a field. File files = new File("E:/LuceneData"); for (File file : files.listFiles()) { doc = new Document(); doc.add(new Field("content", new FileReader(file))); doc.add(new Field("name", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // Field.Store.YES --> The field should be stored in index file // Field.Index.ANALYZED --> The filed should be participled doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // 5. Create Index File for Target Document by IndexWriter. writer.addDocument(doc); } // 6. Close Index Writer if (null != writer) { writer.close(); } } }
2. Test Case
package edu.xmu.lucene.Lucene_ModuleOne; import java.io.IOException; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.store.LockObtainFailedException; import org.junit.Test; /** * Unit test for simple App. */ public class AppTest { @Test public void buildIndex() { App app = new App(); try { app.buildIndex(); } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }
5. An Example of Query Using Index Files
1. Core Function of Query
package edu.xmu.lucene.Lucene_ModuleOne; import java.io.File; import java.io.FileReader; import java.io.IOException; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; /** * Hello world! * */ public class App { /** * Create Index * * @throws IOException * @throws LockObtainFailedException * @throws CorruptIndexException */ public void buildIndex() throws CorruptIndexException, LockObtainFailedException, IOException { // 1. Create Directory // --> Where the directory be stored? Memory or HardDisk? // Directory dir = new RAMDirectory(); --> Index File Stored in MEM Directory dir = FSDirectory.open(new File("E:/LuceneIndex")); // 2. Create IndexWriter // --> It is used to write data into index files IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)); IndexWriter writer = new IndexWriter(dir, config); // Before 3.5 the way to create index is like below(depreciated): // new IndexWriter(Direcotry d, Analyzer a, boolean c, MaxFieldLength // mfl); // d: Directory, a: Analyzer, c: Shoule we create new one each time // mlf: The max length of the field to be indexed. // 3. Create Document // --> The target we want to search may be a doc file or a table in DB. 
// --> The path, name, size and modified date of the file. // --> All the information of the file should be stored in the Document. Document doc = null; // 4. Each Item of The Document is Called a Field. // --> The relationship of document and field is like table and cell. // Eg. We want to build index for all the txt file in the c:/lucene dir. // So each txt file in this dir is called a document. // And the name, size, modified date, content is called a field. File files = new File("E:/LuceneData"); for (File file : files.listFiles()) { doc = new Document(); doc.add(new Field("content", new FileReader(file))); doc.add(new Field("name", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // Field.Store.YES --> The field should be stored in index file // Field.Index.ANALYZED --> The filed should be participled doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // 5. Create Index File for Target Document by IndexWriter. writer.addDocument(doc); } // 6. Close Index Writer if (null != writer) { writer.close(); } } /** * Search * @throws IOException * @throws ParseException */ public void search() throws IOException, ParseException { // 1. Create Directory Directory dir = FSDirectory.open(new File("E:/LuceneIndex")); // 2. Create IndexReader IndexReader reader = IndexReader.open(dir); // 3. Create IndexSearcher using IndexReader IndexSearcher searcher = new IndexSearcher(reader); // 4. Create query for search // Search the documents whose content have 'java' key word QueryParser parser = new QueryParser(Version.LUCENE_35, "content", new StandardAnalyzer(Version.LUCENE_35)); Query query = parser.parse("java"); // 5. Execute query and return TopDocs // param1: The query to be executed // param2: The number of result items TopDocs topDocs = searcher.search(query, 10); // 6. 
Get ScoreDoc according to TopDocs ScoreDoc[] docs = topDocs.scoreDocs; System.out.println("Hits: " + docs.length); for(ScoreDoc scoreDoc : docs) { // 7. Get Document using searcher and ScoreDoc Document d = searcher.doc(scoreDoc.doc); // 8. Get information using Document System.out.println("File Name : " + d.get("path")); } // 9. Close Reader reader.close(); } }
2. Test Case
package edu.xmu.lucene.Lucene_ModuleOne; import java.io.IOException; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.store.LockObtainFailedException; import org.junit.Test; /** * Unit test for simple App. */ public class AppTest { @Test public void buildIndex() { App app = new App(); try { app.buildIndex(); } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } @Test public void search() { App app = new App(); try { app.search(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (ParseException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }