import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Preprocessing step: splits one large text file into a series of small text
 * files so that each small file can later be handled as its own document.
 *
 * @author ht
 */
public class FilePreprocess {
    /** Threshold, in encoded bytes, at which the buffered lines are flushed to a new small file. */
    private static final int MAX_SIZE = 10240;

    /** Base name of every generated file; the chunk index and ".txt" are appended to it. */
    private static final String OUTPUT_NAME = "output";

    /** Line terminator written after every line of the small files. */
    private static final String LINE_END = "\r\n";

    /**
     * Splits the hard-coded source file into the hard-coded output directory,
     * creating that directory first when it does not exist.
     *
     * @param arg command-line arguments (unused)
     */
    public static void main(String[] arg) {
        String outputpath = "E:\\lucenetest\\small\\";// directory that receives the small files
        String filename = "E:\\lucenetest\\三国演义.txt";// path of the original large file
        File outputDir = new File(outputpath);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        splitToSmallFiles(new File(filename), outputpath);
    }

    /**
     * Cuts a large text file into small ones.
     *
     * <p>Lines are read one at a time and re-terminated with CR LF. As soon as
     * the buffered lines reach {@code MAX_SIZE} bytes (measured in the platform
     * default charset, which is also what the reader and writers use) they are
     * written to {@code outputpath + "output" + index + ".txt"} and a new chunk
     * is started. Lines are never split, so a chunk may exceed the threshold by
     * up to one line.
     *
     * <p>I/O failures are reported on standard output / standard error and are
     * not propagated to the caller.
     *
     * @param file       the large file to split
     * @param outputpath prefix prepended to every generated file name; it is
     *                   concatenated as-is, so it must end with a path separator
     *                   to denote a directory
     */
    public static void splitToSmallFiles(File file, String outputpath) {
        int filePointer = 0;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            StringBuilder buffer = new StringBuilder();
            // Running byte count of the buffer, so the whole buffer does not
            // have to be re-encoded after every appended line.
            int bufferedBytes = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                String terminated = line + LINE_END;
                buffer.append(terminated);
                bufferedBytes += terminated.getBytes().length;
                if (bufferedBytes >= MAX_SIZE) {
                    writeChunk(outputpath, filePointer, buffer.toString());
                    filePointer++;
                    buffer.setLength(0);
                    bufferedBytes = 0;
                }
            }
            // Flush whatever is left. An empty remainder is only written when
            // nothing has been written yet (empty input still yields one file);
            // otherwise it would just add a useless empty trailing file.
            if (buffer.length() > 0 || filePointer == 0) {
                writeChunk(outputpath, filePointer, buffer.toString());
            }
            System.out.println("The file hava splited to small files !");
        } catch (FileNotFoundException e) {
            System.out.println("file not found !");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Writes one chunk of text to its numbered output file, closing the file
     * even when the write fails.
     */
    private static void writeChunk(String outputpath, int index, String text)
            throws IOException {
        BufferedWriter writer = new BufferedWriter(
                new FileWriter(outputpath + OUTPUT_NAME + index + ".txt"));
        try {
            writer.write(text);
        } finally {
            writer.close();
        }
    }
}
package com.uphenan.lucene.test;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Index generation: builds a Lucene index from the .txt files found under a
 * data directory.
 *
 * @author ht
 */
public class Indexer {
    private static final String INDEX_DIR = "E:\\lucenetest\\index";// directory where the index is stored
    private static final String DATA_DIR = "E:\\lucenetest\\small\\";// directory holding the small files

    /**
     * Indexes {@code DATA_DIR} into {@code INDEX_DIR} and prints how many
     * documents were indexed and how long it took.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        long start = System.currentTimeMillis();
        int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }

    /**
     * Indexes the .txt files under {@code dataDir}, stores the index under
     * {@code indexDir}, and returns the number of indexed documents.
     *
     * @param indexDir directory that receives the index
     * @param dataDir  directory that is scanned (recursively) for .txt files
     * @return the document count reported by the writer after indexing
     * @throws IOException if {@code dataDir} does not exist or is not a
     *                     directory, or if reading or writing fails
     */
    public static int index(File indexDir, File dataDir) throws IOException {
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory");
        }
        IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),
                new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                IndexWriter.MaxFieldLength.LIMITED);
        try {
            indexDirectory(writer, dataDir);
            int numIndexed = writer.numDocs();
            writer.optimize();
            return numIndexed;
        } finally {
            // Always close the writer, even when indexing fails, so its
            // resources are not left open.
            writer.close();
        }
    }

    /**
     * Walks a directory tree and indexes every .txt file in it.
     *
     * @param writer the index writer documents are added to
     * @param dir    the directory to scan
     * @throws IOException if indexing a file fails
     */
    private static void indexDirectory(IndexWriter writer, File dir)
            throws IOException {
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be listed;
            // skip it, just as unreadable files are skipped in indexFile.
            return;
        }
        for (File f : files) {
            if (f.isDirectory()) {
                indexDirectory(writer, f); // recurse
            } else if (f.getName().endsWith(".txt")) {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Indexes a single txt file as one document with two fields: "contents"
     * (fed from the file's text) and "filename" (the canonical path, stored
     * and analyzed). Hidden, missing or unreadable files are skipped silently.
     *
     * @param writer the index writer the document is added to
     * @param f      the file to index
     * @throws IOException if the file cannot be read or the document cannot be added
     */
    private static void indexFile(IndexWriter writer, File f)
            throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        System.out.println("Indexing " + f.getCanonicalPath());
        // FileReader decodes with the platform default charset.
        FileReader contents = new FileReader(f);
        try {
            Document doc = new Document();
            doc.add(new Field("contents", contents));
            doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            // The reader is only needed until addDocument returns; closing it
            // here also covers the case where addDocument fails part-way.
            contents.close();
        }
    }
}
package com.uphenan.lucene.test;
import java.io.File;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Query: searches the "contents" field of an existing Lucene index for a
 * keyword and prints the matching documents.
 *
 * @author ht
 */
public class Searcher {
    private static final String INDEX_DIR = "E:\\lucenetest\\index\\";// path of the index
    private static final String KEYWORD = "诸葛亮";// keyword to search for
    private static final int TOP_NUM = 100;// collect at most the top 100 results

    /**
     * Searches {@code INDEX_DIR} for {@code KEYWORD}.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the index directory is missing or the search fails
     */
    public static void main(String[] args) throws Exception {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir +
                    " does not exist or is not a directory.");
        }
        search(indexDir, KEYWORD);
    }

    /**
     * Runs a query against the "contents" field and prints, in order: the
     * number of collected hits, one line per hit, and a summary line with the
     * total hit count and the elapsed time.
     *
     * @param indexDir directory containing the index
     * @param q        query string, parsed with {@code QueryParser}
     * @throws Exception if the index cannot be opened, the query cannot be
     *                   parsed, or the search fails
     */
    public static void search(File indexDir, String q) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true);// read-only
        try {
            String field = "contents";
            QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field,
                    new StandardAnalyzer(Version.LUCENE_CURRENT));
            Query query = parser.parse(q);
            TopScoreDocCollector collector = TopScoreDocCollector.create(TOP_NUM, false);
            long start = new Date().getTime();// start time
            is.search(query, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            System.out.println(hits.length);
            for (int i = 0; i < hits.length; i++) {
                Document doc = is.doc(hits[i].doc);
                // String concatenation prints the Field object's toString()
                // form, not just the stored file name.
                System.out.println(doc.getField("filename") + " " + hits[i].toString() + " ");
            }
            // The end time is taken after the loop, so the reported duration
            // includes loading and printing the hits.
            long end = new Date().getTime();// end time
            System.out.println("Found " + collector.getTotalHits() +
                    " document(s) (in " + (end - start) +
                    " milliseconds) that matched query '" +
                    q + "':");
        } finally {
            // Release the searcher on every path, including parse and search failures.
            is.close();
        }
    }
}
Lucene Results:
43
stored,indexed,tokenized doc=95 score=0.20325354
stored,indexed,tokenized doc=93 score=0.18696608
stored,indexed,tokenized doc=83 score=0.17826515
stored,indexed,tokenized doc=91 score=0.1527987
stored,indexed,tokenized doc=85 score=0.14914733
stored,indexed,tokenized doc=90 score=0.13808359
stored,indexed,tokenized doc=41 score=0.1260525
stored,indexed,tokenized doc=48 score=0.1260525
stored,indexed,tokenized doc=50 score=0.1260525
stored,indexed,tokenized doc=82 score=0.1260525
stored,indexed,tokenized doc=86 score=0.1260525
stored,indexed,tokenized doc=88 score=0.1260525
stored,indexed,tokenized doc=89 score=0.1260525
stored,indexed,tokenized doc=92 score=0.1260525
stored,indexed,tokenized doc=37 score=0.11274478
stored,indexed,tokenized doc=38 score=0.11274478
stored,indexed,tokenized doc=49 score=0.11274478
stored,indexed,tokenized doc=96 score=0.11274478
stored,indexed,tokenized doc=42 score=0.097639844
stored,indexed,tokenized doc=55 score=0.097639844
stored,indexed,tokenized doc=64 score=0.097639844
stored,indexed,tokenized doc=94 score=0.09663838
stored,indexed,tokenized doc=35 score=0.0797226
stored,indexed,tokenized doc=46 score=0.0797226
stored,indexed,tokenized doc=52 score=0.0797226
stored,indexed,tokenized doc=53 score=0.0797226
stored,indexed,tokenized doc=61 score=0.0797226
stored,indexed,tokenized doc=71 score=0.0797226
stored,indexed,tokenized doc=79 score=0.0797226
stored,indexed,tokenized doc=84 score=0.0797226
stored,indexed,tokenized doc=99 score=0.0797226
stored,indexed,tokenized doc=102 score=0.0797226
stored,indexed,tokenized doc=36 score=0.05637239
stored,indexed,tokenized doc=45 score=0.05637239
stored,indexed,tokenized doc=47 score=0.05637239
stored,indexed,tokenized doc=56 score=0.05637239
stored,indexed,tokenized doc=59 score=0.05637239
stored,indexed,tokenized doc=62 score=0.05637239
stored,indexed,tokenized doc=67 score=0.05637239
stored,indexed,tokenized doc=75 score=0.05637239
stored,indexed,tokenized doc=100 score=0.05637239
stored,indexed,tokenized doc=101 score=0.05637239
stored,indexed,tokenized doc=103 score=0.05637239
Found 43 document(s) (in 47 milliseconds) that matched query '诸葛亮':