package lighter.iteye.com;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Small demo that runs a fixed sample string through a {@link StandardAnalyzer}
 * and prints every token the analyzer emits, one per line, prefixed with a
 * running counter.
 */
public class StandardAnalyzerTest {

    /** Default constructor; the demo keeps no state. */
    public StandardAnalyzerTest() {
    }

    /**
     * Tokenizes the sample text and prints each resulting term to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Analyzer under test.
        Analyzer aAnalyzer = new StandardAnalyzer();
        // Sample input. NOTE(review): the trailing "is the are on" is presumably
        // there to show how the analyzer treats common English words - confirm.
        StringReader sr = new StringReader("lighter javaeye com is the are on");
        // Token stream for the (arbitrary) field name "name".
        TokenStream ts = aAnalyzer.tokenStream("name", sr);
        try {
            // Counter used only to number the output lines.
            int i = 0;
            for (Token t = ts.next(); t != null; t = ts.next()) {
                i++;
                // Print the analyzed term text.
                System.out.println("第" + i + "行:" + t.termText());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the stream (and the underlying reader) even if tokenizing failed.
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
package lighter.iteye.com; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.store.FSDirectory; public class FSDirectoryTest { //建立索引的路径 public static final String path = "c:\\index2"; public static void main(String[] args) throws Exception { Document doc1 = new Document(); doc1.add( new Field("name", "lighter javaeye com",Field.Store.YES,Field.Index.TOKENIZED)); Document doc2 = new Document(); doc2.add(new Field("name", "lighter blog",Field.Store.YES,Field.Index.TOKENIZED)); IndexWriter writer = new IndexWriter(FSDirectory.getDirectory(path, true), new StandardAnalyzer(), true); writer.setMaxFieldLength(3); writer.addDocument(doc1); writer.setMaxFieldLength(3); writer.addDocument(doc2); writer.close(); IndexSearcher searcher = new IndexSearcher(path); Hits hits = null; Query query = null; QueryParser qp = new QueryParser("name",new StandardAnalyzer()); query = qp.parse("lighter"); hits = searcher.search(query); System.out.println("查找\"lighter\" 共" + hits.length() + "个结果"); query = qp.parse("javaeye"); hits = searcher.search(query); System.out.println("查找\"javaeye\" 共" + hits.length() + "个结果"); } }
1、在 Windows 系统的 C 盘下建一个名叫 s 的文件夹,在该文件夹里随便建三个 txt 文件,名字随意,这里就叫"1.txt"、"2.txt"和"3.txt"。
其中1.txt的内容如下:
中华人民共和国 全国人民 2006年
而"2.txt"和"3.txt"的内容也可以随便写一写;这里图省事,直接复制 1.txt 的内容,三个文件内容相同。
2、下载lucene包,放在classpath路径中
建立索引:
package lighter.iteye.com;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

/**
 * Indexes every {@code .txt} file found directly inside a fixed source folder
 * into a Lucene index stored in a fixed index folder.
 *
 * author lighter date 2006-8-7
 */
public class TextFileIndexer {

    /**
     * Builds a fresh index from the text files in {@code c:\s}, writing it to
     * {@code c:\index}, and prints progress plus the elapsed time.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the source folder cannot be listed, a file cannot be
     *                   read, or the index cannot be written
     */
    public static void main(String[] args) throws Exception {
        /* Folder whose files are indexed: the "s" folder on drive C. */
        File fileDir = new File("c:\\s");
        /* Folder in which the index files are stored. */
        File indexDir = new File("c:\\index");

        // listFiles() returns null when the path is not a directory or cannot be
        // read; fail with a clear message instead of a NullPointerException.
        File[] textFiles = fileDir.listFiles();
        if (textFiles == null) {
            throw new IOException("Cannot list files in directory: " + fileDir.getPath());
        }

        Analyzer luceneAnalyzer = new StandardAnalyzer();
        IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true);
        long startTime = new Date().getTime();
        try {
            // Add one document per .txt file to the index.
            for (int i = 0; i < textFiles.length; i++) {
                File textFile = textFiles[i];
                if (textFile.isFile() && textFile.getName().endsWith(".txt")) {
                    System.out.println("File " + textFile.getCanonicalPath() + "正在被索引....");
                    String temp = FileReaderAll(textFile.getCanonicalPath(), "GBK");
                    System.out.println(temp);

                    Document document = new Document();
                    // The path is stored for retrieval only; it is not searchable.
                    Field pathField = new Field("path", textFile.getPath(),
                            Field.Store.YES, Field.Index.NO);
                    // The body is stored and tokenized, with term vectors that
                    // record positions and offsets.
                    Field bodyField = new Field("body", temp,
                            Field.Store.YES, Field.Index.TOKENIZED,
                            Field.TermVector.WITH_POSITIONS_OFFSETS);
                    document.add(pathField);
                    document.add(bodyField);
                    indexWriter.addDocument(document);
                }
            }
            // optimize() optimizes the index once all documents have been added.
            indexWriter.optimize();
        } finally {
            // Always close the writer so the index lock is released, even on failure.
            indexWriter.close();
        }

        // Report how long indexing took.
        long endTime = new Date().getTime();
        System.out.println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!"
                + fileDir.getPath());
    }

    /**
     * Reads a whole text file into a single string.
     *
     * <p>The file is read line by line and the lines are concatenated without
     * any separator, so line terminators are not preserved in the result.
     *
     * @param FileName path of the file to read
     * @param charset  name of the character set used to decode the file
     * @return the concatenated lines of the file; empty if the file has no lines
     * @throws IOException if the file cannot be opened or read, or the charset
     *                     name is not supported
     */
    public static String FileReaderAll(String FileName, String charset) throws IOException {
        FileInputStream in = new FileInputStream(FileName);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
            StringBuilder content = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line);
            }
            return content.toString();
        } finally {
            // Closing the underlying stream releases the file handle even when the
            // charset is invalid or a read fails part-way through.
            in.close();
        }
    }
}
索引的结果:
File C:\s\1.txt正在被索引.... 中华人民共和国全国人民2006年 File C:\s\2.txt正在被索引.... 中华人民共和国全国人民2006年 File C:\s\3.txt正在被索引.... 中华人民共和国全国人民2006年 这花费了297 毫秒来把文档增加到索引里面去!c:\s
3、建立了索引之后,查询啦....
package lighter.iteye.com;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

/**
 * Demo that searches the "body" field of the index in {@code c:\index} for a
 * fixed query string and prints the number of hits when there is at least one.
 */
public class TestQuery {

    /**
     * Runs the fixed query against the index and reports the hit count.
     *
     * @param args command-line arguments (unused)
     * @throws IOException    if the index cannot be opened or searched
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        String queryString = "中华";
        Analyzer analyzer = new StandardAnalyzer();

        IndexSearcher searcher = new IndexSearcher("c:\\index");
        try {
            // Let a ParseException propagate: swallowing it would leave the query
            // null and only fail later, with a less helpful error, inside search().
            QueryParser qp = new QueryParser("body", analyzer);
            Query query = qp.parse(queryString);

            Hits hits = searcher.search(query);
            if (hits.length() > 0) {
                System.out.println("找到:" + hits.length() + " 个结果!");
            }
        } finally {
            // Release the index files held by the searcher.
            searcher.close();
        }
    }
}
其运行结果: