Lucene学习入门之简单示例

Lucene主要就是一个用来进行信息检索的工具。

 

信息检索主要分为以下步骤:

1:构建文本库

2:建立索引

3:进行搜索

4:对结果进行过滤

 

初次接触Lucene,主要流程如下:

1:切割文档,将一份文档分解为多个小文档

2:创建索引文件

3:执行查询

具体代码如下:

public class FilePreprocess {
	
	/**
	 * Runs the two preprocessing steps on {@code file}: first
	 * {@code charactorProcess} writes a processed copy to
	 * {@code outputDir + "output.all"}, then {@code splitToSmallFiles} breaks
	 * that copy into smaller files under {@code outputDir}.
	 *
	 * Any exception raised by either step is printed to standard error and
	 * not rethrown.
	 *
	 * @param file      the source document to preprocess
	 * @param outputDir prefix used to build every output path; it is
	 *                  concatenated directly with the file names
	 */
	public static void preprocess(File file, String outputDir){
		String processedPath = outputDir + "output.all";
		try{
			File processed = charactorProcess(file, processedPath);
			splitToSmallFiles(processed, outputDir);
		}catch(Exception failure){
			failure.printStackTrace();
		}
	}
        // 对文件进行处理
	/**
	 * Copies {@code file} line by line into {@code destFile}, passing each
	 * non-empty line through the {@code replace} helper of this class and
	 * terminating it with the platform line separator.
	 *
	 * Empty lines are dropped. The previous check compared each line with
	 * "\r\n", which can never match because {@code readLine()} strips line
	 * terminators, so blank lines were copied instead of skipped.
	 *
	 * Both streams are closed even when reading or writing fails. The source
	 * is opened before the destination, so a missing source file no longer
	 * creates or truncates {@code destFile}.
	 *
	 * @param file     the source document
	 * @param destFile path of the file to write the processed text to
	 * @return a {@code File} pointing at {@code destFile}
	 * @throws Exception if the source cannot be read or the destination
	 *                   cannot be written
	 */
	public static File charactorProcess(File file, String destFile) throws Exception{
		BufferedReader reader = null;
		BufferedWriter writer = null;
		try{
			reader = new BufferedReader(new FileReader(file));
			writer = new BufferedWriter(new FileWriter(destFile));
			String line = reader.readLine();
			while(line != null){
				// readLine() never includes the terminator, so a blank line is "".
				if(line.length() > 0){
					writer.write(replace(line));
					writer.newLine();
				}
				line = reader.readLine();
			}
		}finally{
			// Close the writer first so a flush failure surfaces, but always
			// release the reader as well.
			try{
				if(writer != null){
					writer.close();
				}
			}finally{
				if(reader != null){
					reader.close();
				}
			}
		}
		return new File(destFile);
	}
        // 拆分文件
	/**
	 * Splits {@code file} into numbered pieces named
	 * {@code outputpath + "output" + N + ".txt"}, with N starting at 0.
	 *
	 * Lines are accumulated (each followed by "\r\n") until the accumulated
	 * text reaches at least 10240 bytes in the platform default charset; the
	 * text is then written out as one piece and accumulation restarts.
	 * Whatever remains after the last line is always written as a final
	 * piece, even when it is empty.
	 *
	 * The reader and every per-piece writer are closed even if an I/O error
	 * occurs; previously the reader was never closed.
	 *
	 * @param file       the file to split
	 * @param outputpath prefix used to build each piece's path; it is
	 *                   concatenated directly with the file name
	 * @throws IOException if the input cannot be read or a piece cannot be
	 *                     written
	 */
	public static void splitToSmallFiles(File file, String outputpath) throws IOException{
		final int MAX_SIZE = 10240;
		int filePointer = 0;
		StringBuffer buffer = new StringBuffer();
		BufferedReader reader = new BufferedReader(new FileReader(file));
		try{
			String line = reader.readLine();
			while(line != null){
				buffer.append(line).append("\r\n");
				if(buffer.toString().getBytes().length >= MAX_SIZE){
					writePiece(outputpath + "output" + filePointer + ".txt", buffer.toString());
					filePointer ++;
					// Start accumulating the next piece from scratch.
					buffer.setLength(0);
				}
				line = reader.readLine();
			}
			// Flush the remainder that did not reach the size threshold.
			writePiece(outputpath + "output" + filePointer + ".txt", buffer.toString());
		}finally{
			reader.close();
		}
	}

	/** Writes {@code content} to {@code path}, always closing the writer. */
	private static void writePiece(String path, String content) throws IOException{
		BufferedWriter writer = new BufferedWriter(new FileWriter(path));
		try{
			writer.write(content);
		}finally{
			writer.close();
		}
	}
        // 转换文档中的特殊字符
	private static String replace(String line){
		HashMap map = new HashMap();
		map.put(",", ",");
		map.put("。", ",");
		map.put("《", "<");
		map.put("》", ">");
		map.put("【", "[");
		map.put("】", "]");
		map.put("{", "{");
		map.put("}", "}");
		map.put(":", ":");
		map.put("!", "!");
		int length = line.length();
		for(int i =0;i

 建立索引:

public class IndexProcesser {
	
	// 成员变量,存储创建的索引文件存放的位置
	private String INDEX_STORE_PATH = "d:\\index";
	
	// 创建索引
	public void createIndex(String inputDir){
		try{
			// 以MMAnalyzer作为分词工具创建一个IndexWriter
			IndexWriter writer = new IndexWriter(INDEX_STORE_PATH, new MMAnalyzer(), true);
			File filesDir = new File(inputDir);
			// 取得所有需要建立索引的文件数组
			File[] files = filesDir.listFiles();
			// 遍历数组
			for(int i =0;i

 执行查询:

/**
 * Looks up a single term in the Lucene index stored at
 * {@code INDEX_STORE_PATH} and prints, for every document containing the
 * term, the term frequency followed by the document itself.
 */
public class Search {
	
	// Location of the index that is opened for searching.
	private String INDEX_STORE_PATH = "d:\\index";
	
	/**
	 * Enumerates the documents that contain the given term and prints them
	 * to standard output, followed by the elapsed time in milliseconds.
	 *
	 * Exceptions are printed to standard error and not rethrown. The term
	 * enumeration and the searcher are closed before returning.
	 *
	 * @param searchType first argument of the {@code Term} to look up
	 *                   (presumably the field name; confirm against the
	 *                   fields written by the indexer)
	 * @param searchKey  second argument of the {@code Term} (the text to
	 *                   look for)
	 */
	public void indexSearch(String searchType, String searchKey){
		IndexSearcher searcher = null;
		TermDocs termDocs = null;
		try{
			searcher = new IndexSearcher(INDEX_STORE_PATH);
			Term t = new Term(searchType, searchKey);
			// Start of the timed section. The original declaration was lost
			// from the listing, so its exact position is a reconstruction.
			Date beginTime = new Date();
			// Enumerate the documents containing the term directly through
			// the index reader.
			termDocs = searcher.getIndexReader().termDocs(t);
			while(termDocs.next()){
				// Number of times the term occurs in the current document.
				System.out.println(termDocs.freq());
				// The document in which the term was found.
				System.out.println(searcher.getIndexReader().document(termDocs.doc()));
			}
			Date endTime = new Date();
			// Elapsed time of the lookup and printing, in milliseconds.
			long timeOfSearch = endTime.getTime() - beginTime.getTime();
			System.out.println("The time For indexsearch is " + timeOfSearch + " ms");
		}catch(Exception e){
			e.printStackTrace();
		}finally{
			// Release index resources regardless of how the lookup ended.
			try{
				if(termDocs != null){
					termDocs.close();
				}
			}catch(Exception e){
				e.printStackTrace();
			}
			try{
				if(searcher != null){
					searcher.close();
				}
			}catch(Exception e){
				e.printStackTrace();
			}
		}
	}
}

 所用jar包请参考附件

 

 

你可能感兴趣的:(Lucene)