引用地址:http://clucene.org/lucene/63
================================================================
到官网下载lucene 3.0.2 官网地址:http://lucene.apache.org/
官网下载地址:http://repo1.maven.org/maven2/org/apache/lucene/
下载以下包:
lucene-core-3.0.2.jar
lucene-demos-3.0.2.jar
lucene-analyzers-3.0.2.jar
lucene-fast-vector-highlighter-3.0.2.jar
lucene-highlighter-3.0.2.jar
lucene-memory-3.0.2.jar
中文分词使用google IKAnalyzer 官网地址:http://code.google.com/p/ik-analyzer/
IKAnalyzer3.2.5Stable.jar
创建索引,添加txt内容文件代码:
import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.Date; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.FSDirectory; import org.wltea.analyzer.lucene.IKAnalyzer; public class IndexerOK { private static String INDEX_DIR = “c:\\Lucene\\index”;// 索引存放目录 private static String DATA_DIR = “c:\\Lucene\\file1″;// 小文件存放的目录 public static void main(String[] args) throws Exception { long start = new Date().getTime(); int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));// 调用index方法 long end = new Date().getTime(); System.out.println(“Indexing ” + numIndexed + ” files took ” + (end – start) + ” milliseconds”); } /** * 索引dataDir下的.txt文件,并储存在indexDir下,返回索引的文件数量 * * @param indexDir * @param dataDir * @return int * @throws IOException */ public static int index(File indexDir, File dataDir) throws IOException { if (!dataDir.exists() || !dataDir.isDirectory()) { throw new IOException(dataDir + ” does not exist or is not a directory”); } Analyzer analyzer = new IKAnalyzer();// 采用的分词器 //第三个参数 为true表示新建,false表示添加到原有索引中 IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), analyzer, false, IndexWriter.MaxFieldLength.LIMITED); indexDirectory(writer, dataDir);// 调用indexDirectory方法 int numIndexed = writer.numDocs(); writer.optimize(); writer.close(); return numIndexed; } /** * 循环遍历目录下的所有.txt文件并进行索引 * * @param writer * @param dir * @throws IOException */ private static void indexDirectory(IndexWriter writer, File dir) throws IOException { File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { indexDirectory(writer, f); // recurse } else if (f.getName().endsWith(“.txt”)) { indexFile(writer, f); } } } /** 
* 对单个txt文件进行索引 * * @param writer * @param f * @throws IOException */ private static void indexFile(IndexWriter writer, File f) throws IOException { if (f.isHidden() || !f.exists() || !f.canRead()) { return; } System.out.println(“Indexing ” + f.getCanonicalPath()); Document doc = new Document(); // doc.add(new Field(“contents”, new FileReader(f))); doc.add(new Field(“filename”, f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED)); String temp = FileReaderAll(f.getCanonicalPath(), “GBK”); System.out.println(temp); doc.add(new Field(“TTT”, temp, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field(“path”, f.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field(“modified”, DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), Field.Store.YES, Field.Index.ANALYZED)); FileInputStream fis = new FileInputStream(f); // 按照 UTF-8 编码方式将字节流转化为字符流 InputStreamReader isr = new InputStreamReader(fis, “utf-8″); // 从字符流中获取文本并进行缓冲 BufferedReader br = new BufferedReader(isr); doc.add(new Field(“contents”, br)); writer.setUseCompoundFile(false); writer.addDocument(doc); } public static String FileReaderAll(String FileName, String charset) throws IOException { BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(FileName), charset)); String line = new String(); String temp = new String(); while ((line = reader.readLine()) != null) { temp += line; } reader.close(); return temp; } }
管理lucene代码:
/**
 * Plain data holder describing one indexed document: its identifier, file
 * system location, stored text, timestamp and display name. Mutable JavaBean
 * with the conventional getter/setter pairs.
 */
public class SearchDocBean {

    // Document identity and location.
    private String id;
    private String path;
    private String fileName;

    // Stored text and timestamp.
    private String contents;
    private String dateTime;

    /** @return the document identifier */
    public String getId() {
        return id;
    }

    /** @param id the document identifier to set */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the file system path of the document */
    public String getPath() {
        return path;
    }

    /** @param path the file system path to set */
    public void setPath(String path) {
        this.path = path;
    }

    /** @return the stored body text */
    public String getContents() {
        return contents;
    }

    /** @param contents the body text to set */
    public void setContents(String contents) {
        this.contents = contents;
    }

    /** @return the document timestamp */
    public String getDateTime() {
        return dateTime;
    }

    /** @param dateTime the timestamp to set */
    public void setDateTime(String dateTime) {
        this.dateTime = dateTime;
    }

    /** @return the display file name */
    public String getFileName() {
        return fileName;
    }

    /** @param fileName the display file name to set */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }
}
import java.io.File; import java.io.IOException; import java.sql.Connection; import java.sql.SQLException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; public class ManageIndexFile { private static String INDEX_DIR = “c:\\Lucene\\index”;// 索引存放目录 public static void DeleteIndex(SearchDocBean bean) throws IOException { Directory dir = FSDirectory.open(new File(INDEX_DIR)); IndexReader reader = IndexReader.open(dir, false); Term term = new Term(“modified”, bean.getId()); int count = reader.deleteDocuments(term); reader.close(); System.out.println(“Successful Delete ” + count + ” path==” + bean.getId()); } public static void DeleteIndex(int[] posIDS) throws IOException { Directory dir = FSDirectory.open(new File(INDEX_DIR)); IndexReader reader = IndexReader.open(dir, false); for (int i = 0; i < posIDS.length; i++) { Term term = new Term(“posID”, Integer.toString(posIDS[i])); reader.deleteDocuments(term); } reader.close(); } public static void UpdateIndex(SearchDocBean bean) throws IOException { Directory dir = FSDirectory.open(new File(INDEX_DIR)); IndexReader reader = IndexReader.open(dir, false); Term term = new Term(“modified”, bean.getId()); reader.deleteDocuments(term); reader.close(); IndexWriter writer = new IndexWriter(FSDirectory.open(new File( INDEX_DIR)), new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.add(new Field(“modified”, bean.getId(), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); writer.optimize(); writer.close(); } 
public static void AddIndex(SearchDocBean bean, Connection conn) throws IOException, SQLException { Analyzer analyzer = new IKAnalyzer();// 采用的分词器 IndexWriter writer = new IndexWriter(FSDirectory.open(new File( INDEX_DIR)), analyzer, false, IndexWriter.MaxFieldLength.LIMITED); Document doc = new Document(); doc.add(new Field(“filename”, bean.getFileName(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field(“path”, bean.getPath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field(“dateTime”, bean.getId(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field(“TTT”, bean.getContents(), Field.Store.YES, Field.Index.ANALYZED)); writer.setUseCompoundFile(false); writer.addDocument(doc); writer.optimize(); writer.close(); } }
分词查询加高亮显示:
import java.io.File; import java.io.StringReader; import java.util.Date; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.FSDirectory; import org.wltea.analyzer.lucene.IKAnalyzer; import org.wltea.analyzer.lucene.IKQueryParser; import org.wltea.analyzer.lucene.IKSimilarity; public class SearchQueryOK { private static String INDEX_DIR = “c:\\Lucene\\index”;// 索引所在的路径 private static String KEYWORD = “人民”;// 关键词 private static int TOP_NUM = 100;// 显示前100条结果 public static void main(String[] args) throws Exception { File indexDir = new File(INDEX_DIR); if (!indexDir.exists() || !indexDir.isDirectory()) { throw new Exception(indexDir + ” does not exist or is not a directory.”); } search(indexDir, KEYWORD);// 调用search方法进行查询 } /** * 查询 * * @param indexDir * @param q * @throws Exception */ public static void search(File indexDir, String q) throws Exception { IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true);// read-only String[] field = {“TTT”,”modified”,”filename”}; long start = new Date().getTime();// start time // 高亮设置 Analyzer analyzer = new IKAnalyzer();// 设定分词器 Query query2 = IKQueryParser.parseMultiField(field, KEYWORD); // 实例化搜索器 IndexSearcher isearcher1 = new IndexSearcher(FSDirectory.open(indexDir)); // 在索引器中使用IKSimilarity相似度评估器 isearcher1.setSimilarity(new IKSimilarity()); Sort sort = 
new Sort(new SortField(“path”, SortField.DOC,false)); //TermQuery q1 = new TermQuery(new Term(“filename”, “1″)); // 搜索相似度最高的记录 TopDocs topDocs1 = isearcher1.search(query2,null, TOP_NUM,sort); ScoreDoc[] hits3 = topDocs1.scoreDocs; SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter( “<span style=’color:#ff0000′>”, “</span>”);// 设定高亮显示的格式,也就是对高亮显示的词组加上前缀后缀 Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query2)); for (int i = 0; i < hits3.length; i++) { Document doc = is.doc(hits3[i].doc); String docTTT = doc.get(“TTT”); highlighter.setTextFragmenter(new SimpleFragmenter(docTTT.length()));// 设置每次返回的字符数.想必大家在使用搜索引擎的时候也没有一并把全部数据展示出来吧,当然这里也是设定只展示部分数据 TokenStream tokenStream = analyzer.tokenStream(“”, new StringReader(docTTT)); String str = highlighter.getBestFragment(tokenStream, docTTT); System.out.println(” 高亮设置: ” + str ); String docModified = doc.get(“filename”); highlighter.setTextFragmenter(new SimpleFragmenter(docModified.length())); TokenStream tokenStream2 = analyzer.tokenStream(“”, new StringReader(docModified)); String str2 = highlighter.getBestFragment(tokenStream2, docModified); System.out.println(” 高亮设置: ” + str2 ); List<Fieldable> list = doc.getFields(); for (int j = 0; j < list.size(); j++) { Fieldable fieldable = list.get(j); System.out.println(fieldable.name() + ” : ” + fieldable.stringValue() + “<br>”); } } long end = new Date().getTime();// end time System.out.println(“Found ” + hits3.length + ” document(s) (in ” + (end – start) + ” milliseconds) that matched query ‘” + q + “‘:”); } }