1.创建索引
001 package org.apache.lucene.demo; 002 003 /* 004 * Licensed to the Apache Software Foundation (ASF) under one or more 005 * contributor license agreements. See the NOTICE file distributed with 006 * this work for additional information regarding copyright ownership. 007 * The ASF licenses this file to You under the Apache License, Version 2.0 008 * (the "License"); you may not use this file except in compliance with 009 * the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019 020 import org.apache.lucene.analysis.Analyzer; 021 import org.apache.lucene.analysis.standard.StandardAnalyzer; 022 import org.apache.lucene.document.Document; 023 import org.apache.lucene.document.Field; 024 import org.apache.lucene.document.LongField; 025 import org.apache.lucene.document.StringField; 026 import org.apache.lucene.document.TextField; 027 import org.apache.lucene.index.IndexWriter; 028 import org.apache.lucene.index.IndexWriterConfig.OpenMode; 029 import org.apache.lucene.index.IndexWriterConfig; 030 import org.apache.lucene.index.Term; 031 import org.apache.lucene.store.Directory; 032 import org.apache.lucene.store.FSDirectory; 033 import org.apache.lucene.util.Version; 034 035 import java.io.BufferedReader; 036 import java.io.File; 037 import java.io.FileInputStream; 038 import java.io.FileNotFoundException; 039 import java.io.IOException; 040 import java.io.InputStreamReader; 041 import java.util.Date; 042 043 /** Index all text files under a directory. 044 * <p> 045 * This is a command-line application demonstrating simple Lucene indexing. 
046 * Run it with no command-line arguments for usage information. 047 */ 048 public class IndexFiles { 049 050 private IndexFiles() {} 051 052 /** Index all text files under a directory. */ 053 public static void main(String[] args) { 054 String usage = "java org.apache.lucene.demo.IndexFiles" 055 + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" 056 + "This indexes the documents in DOCS_PATH, creating a Lucene index" 057 + "in INDEX_PATH that can be searched with SearchFiles"; 058 String indexPath = "index"; 059 String docsPath = null; 060 boolean create = true; 061 for(int i=0;i<args.length;i++) { 062 if ("-index".equals(args[i])) { 063 indexPath = args[i+1]; 064 i++; 065 } else if ("-docs".equals(args[i])) { 066 docsPath = args[i+1]; 067 i++; 068 } else if ("-update".equals(args[i])) { 069 create = false; 070 } 071 } 072 073 if (docsPath == null) { 074 System.err.println("Usage: " + usage); 075 System.exit(1); 076 } 077 078 final File docDir = new File(docsPath); 079 if (!docDir.exists() || !docDir.canRead()) { 080 System.out.println("Document directory '" +docDir.getAbsolutePath()+ "' does not exist or is not readable, please check the path"); 081 System.exit(1); 082 } 083 084 Date start = new Date(); 085 try { 086 System.out.println("Indexing to directory '" + indexPath + "'..."); 087 088 Directory dir = FSDirectory.open(new File(indexPath)); 089 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); 090 IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); 091 092 if (create) { 093 // Create a new index in the directory, removing any 094 // previously indexed documents: 095 iwc.setOpenMode(OpenMode.CREATE); 096 } else { 097 // Add new documents to an existing index: 098 iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); 099 } 100 101 // Optional: for better indexing performance, if you 102 // are indexing many documents, increase the RAM 103 // buffer. 
But if you do this, increase the max heap 104 // size to the JVM (eg add -Xmx512m or -Xmx1g): 105 // 106 // iwc.setRAMBufferSizeMB(256.0); 107 108 IndexWriter writer = new IndexWriter(dir, iwc); 109 indexDocs(writer, docDir); 110 111 // NOTE: if you want to maximize search performance, 112 // you can optionally call forceMerge here. This can be 113 // a terribly costly operation, so generally it's only 114 // worth it when your index is relatively static (ie 115 // you're done adding documents to it): 116 // 117 // writer.forceMerge(1); 118 119 writer.close(); 120 121 Date end = new Date(); 122 System.out.println(end.getTime() - start.getTime() + " total milliseconds"); 123 124 } catch (IOException e) { 125 System.out.println(" caught a " + e.getClass() + 126 "\n with message: " + e.getMessage()); 127 } 128 } 129 130 /** 131 * Indexes the given file using the given writer, or if a directory is given, 132 * recurses over files and directories found under the given directory. 133 * 134 * NOTE: This method indexes one document per input file. This is slow. For good 135 * throughput, put multiple documents into your input file(s). An example of this is 136 * in the benchmark module, which can create "line doc" files, one document per line, 137 * using the 138 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" 139 * >WriteLineDocTask</a>. 
140 * 141 * @param writer Writer to the index where the given file/dir info will be stored 142 * @param file The file to index, or the directory to recurse into to find files to index 143 * @throws IOException If there is a low-level I/O error 144 */ 145 static void indexDocs(IndexWriter writer, File file) 146 throws IOException { 147 // do not try to index files that cannot be read 148 if (file.canRead()) { 149 if (file.isDirectory()) { 150 String[] files = file.list(); 151 // an IO error could occur 152 if (files != null) { 153 for (int i = 0; i < files.length; i++) { 154 indexDocs(writer, new File(file, files[i])); 155 } 156 } 157 } else { 158 159 FileInputStream fis; 160 try { 161 fis = new FileInputStream(file); 162 } catch (FileNotFoundException fnfe) { 163 // at least on windows, some temporary files raise this exception with an "access denied" message 164 // checking if the file can be read doesn't help 165 return; 166 } 167 168 try { 169 170 // make a new, empty document 171 Document doc = new Document(); 172 173 // Add the path of the file as a field named "path". Use a 174 // field that is indexed (i.e. searchable), but don't tokenize 175 // the field into separate words and don't index term frequency 176 // or positional information: 177 Field pathField = new StringField("path", file.getPath(), Field.Store.YES); 178 doc.add(pathField); 179 180 // Add the last modified date of the file a field named "modified". 181 // Use a LongField that is indexed (i.e. efficiently filterable with 182 // NumericRangeFilter). This indexes to milli-second resolution, which 183 // is often too fine. You could instead create a number based on 184 // year/month/day/hour/minutes/seconds, down the resolution you require. 185 // For example the long value 2011021714 would mean 186 // February 17, 2011, 2-3 PM. 187 doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); 188 189 // Add the contents of the file to a field named "contents". 
Specify a Reader, 190 // so that the text of the file is tokenized and indexed, but not stored. 191 // Note that FileReader expects the file to be in UTF-8 encoding. 192 // If that's not the case searching for special characters will fail. 193 doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); 194 195 if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { 196 // New index, so we just add the document (no old document can be there): 197 System.out.println("adding " + file); 198 writer.addDocument(doc); 199 } else { 200 // Existing index (an old copy of this document may have been indexed) so 201 // we use updateDocument instead to replace the old one matching the exact 202 // path, if present: 203 System.out.println("updating " + file); 204 writer.updateDocument(new Term("path", file.getPath()), doc); 205 } 206 207 } finally { 208 fis.close(); 209 } 210 } 211 } 212 } 213 }
2.通过索引查询指定的字段
001 package org.apache.lucene.demo; 002 003 /* 004 * Licensed to the Apache Software Foundation (ASF) under one or more 005 * contributor license agreements. See the NOTICE file distributed with 006 * this work for additional information regarding copyright ownership. 007 * The ASF licenses this file to You under the Apache License, Version 2.0 008 * (the "License"); you may not use this file except in compliance with 009 * the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019 020 import java.io.BufferedReader; 021 import java.io.File; 022 import java.io.FileInputStream; 023 import java.io.IOException; 024 import java.io.InputStreamReader; 025 import java.util.Date; 026 027 import org.apache.lucene.analysis.Analyzer; 028 import org.apache.lucene.analysis.standard.StandardAnalyzer; 029 import org.apache.lucene.document.Document; 030 import org.apache.lucene.index.DirectoryReader; 031 import org.apache.lucene.index.IndexReader; 032 import org.apache.lucene.queryparser.classic.QueryParser; 033 import org.apache.lucene.search.IndexSearcher; 034 import org.apache.lucene.search.Query; 035 import org.apache.lucene.search.ScoreDoc; 036 import org.apache.lucene.search.TopDocs; 037 import org.apache.lucene.store.FSDirectory; 038 import org.apache.lucene.util.Version; 039 040 /** Simple command-line based search demo. */ 041 public class SearchFiles { 042 043 private SearchFiles() {} 044 045 /** Simple command-line based search demo. 
*/ 046 public static void main(String[] args) throws Exception { 047 String usage = 048 "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details."; 049 if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) { 050 System.out.println(usage); 051 System.exit(0); 052 } 053 054 String index = "index"; 055 String field = "contents"; 056 String queries = null; 057 int repeat = 0; 058 boolean raw = false; 059 String queryString = null; 060 int hitsPerPage = 10; 061 062 for(int i = 0;i < args.length;i++) { 063 if ("-index".equals(args[i])) { 064 index = args[i+1]; 065 i++; 066 } else if ("-field".equals(args[i])) { 067 field = args[i+1]; 068 i++; 069 } else if ("-queries".equals(args[i])) { 070 queries = args[i+1]; 071 i++; 072 } else if ("-query".equals(args[i])) { 073 queryString = args[i+1]; 074 i++; 075 } else if ("-repeat".equals(args[i])) { 076 repeat = Integer.parseInt(args[i+1]); 077 i++; 078 } else if ("-raw".equals(args[i])) { 079 raw = true; 080 } else if ("-paging".equals(args[i])) { 081 hitsPerPage = Integer.parseInt(args[i+1]); 082 if (hitsPerPage <= 0) { 083 System.err.println("There must be at least 1 hit per page."); 084 System.exit(1); 085 } 086 i++; 087 } 088 } 089 090 IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index))); 091 IndexSearcher searcher = new IndexSearcher(reader); 092 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); 093 094 BufferedReader in = null; 095 if (queries != null) { 096 in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8")); 097 } else { 098 in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); 099 } 100 QueryParser parser = new QueryParser(Version.LUCENE_40, field, analyzer); 101 while (true) { 102 if (queries == null && queryString == null) { // prompt the user 103 
System.out.println("Enter query: "); 104 } 105 106 String line = queryString != null ? queryString : in.readLine(); 107 108 if (line == null || line.length() == -1) { 109 break; 110 } 111 112 line = line.trim(); 113 if (line.length() == 0) { 114 break; 115 } 116 117 Query query = parser.parse(line); 118 System.out.println("Searching for: " + query.toString(field)); 119 120 if (repeat > 0) { // repeat & time as benchmark 121 Date start = new Date(); 122 for (int i = 0; i < repeat; i++) { 123 searcher.search(query, null, 100); 124 } 125 Date end = new Date(); 126 System.out.println("Time: "+(end.getTime()-start.getTime())+"ms"); 127 } 128 129 doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null); 130 131 if (queryString != null) { 132 break; 133 } 134 } 135 reader.close(); 136 } 137 138 /** 139 * This demonstrates a typical paging search scenario, where the search engine presents 140 * pages of size n to the user. The user can then go to the next page if interested in 141 * the next hits. 142 * 143 * When the query is executed for the first time, then only enough results are collected 144 * to fill 5 result pages. If the user wants to page beyond this limit, then the query 145 * is executed another time and all hits are collected. 
146 * 147 */ 148 public static void doPagingSearch(BufferedReader in, IndexSearcher searcher, Query query, 149 int hitsPerPage, boolean raw, boolean interactive) throws IOException { 150 151 // Collect enough docs to show 5 pages 152 TopDocs results = searcher.search(query, 5 * hitsPerPage); 153 ScoreDoc[] hits = results.scoreDocs; 154 155 int numTotalHits = results.totalHits; 156 System.out.println(numTotalHits + " total matching documents"); 157 158 int start = 0; 159 int end = Math.min(numTotalHits, hitsPerPage); 160 161 while (true) { 162 if (end > hits.length) { 163 System.out.println("Only results 1 - " + hits.length +" of " + numTotalHits + " total matching documents collected."); 164 System.out.println("Collect more (y/n) ?"); 165 String line = in.readLine(); 166 if (line.length() == 0 || line.charAt(0) == 'n') { 167 break; 168 } 169 170 hits = searcher.search(query, numTotalHits).scoreDocs; 171 } 172 173 end = Math.min(hits.length, start + hitsPerPage); 174 175 for (int i = start; i < end; i++) { 176 if (raw) { // output raw format 177 System.out.println("doc="+hits[i].doc+" score="+hits[i].score); 178 continue; 179 } 180 181 Document doc = searcher.doc(hits[i].doc); 182 String path = doc.get("path"); 183 if (path != null) { 184 System.out.println((i+1) + ". " + path); 185 String title = doc.get("title"); 186 if (title != null) { 187 System.out.println(" Title: " + doc.get("title")); 188 } 189 } else { 190 System.out.println((i+1) + ". 
" + "No path for this document"); 191 } 192 193 } 194 195 if (!interactive || end == 0) { 196 break; 197 } 198 199 if (numTotalHits >= end) { 200 boolean quit = false; 201 while (true) { 202 System.out.print("Press "); 203 if (start - hitsPerPage >= 0) { 204 System.out.print("(p)revious page, "); 205 } 206 if (start + hitsPerPage < numTotalHits) { 207 System.out.print("(n)ext page, "); 208 } 209 System.out.println("(q)uit or enter number to jump to a page."); 210 211 String line = in.readLine(); 212 if (line.length() == 0 || line.charAt(0)=='q') { 213 quit = true; 214 break; 215 } 216 if (line.charAt(0) == 'p') { 217 start = Math.max(0, start - hitsPerPage); 218 break; 219 } else if (line.charAt(0) == 'n') { 220 if (start + hitsPerPage < numTotalHits) { 221 start+=hitsPerPage; 222 } 223 break; 224 } else { 225 int page = Integer.parseInt(line); 226 if ((page - 1) * hitsPerPage < numTotalHits) { 227 start = (page - 1) * hitsPerPage; 228 break; 229 } else { 230 System.out.println("No such page"); 231 } 232 } 233 } 234 if (quit) break; 235 end = Math.min(numTotalHits, start + hitsPerPage); 236 } 237 } 238 } 239 }