Lucene笔记

参考http://blog.csdn.net/zzpchina/archive/2006/01/15/579875.aspx
通常用IR(Information Retrieval,信息检索)来描述像Lucene这样的搜索工具。
lucene in action第二版,亚马逊没抢购到
直接看源码http://www.manning.com/hatcher3/LIAsourcecode.zip
使用的是lucene-core-3.0.2.jar
-------------------------------------------------------
Evaluating search quality:
评价搜索质量:
D.5.1 Precision and recall
Precision and recall are standard metrics in the information retrieval community for objectively measuring
relevance of search results. Precision measures what subset of the documents returned for each query were
relevant. For example, if a query has 20 hits and only 1 is relevant, precision is 0.05. If only 1 hit was returned
and it was relevant, precision is 1.0. Recall measures what percentage of the relevant documents for that query
was actually returned. So if the query listed 8 documents as being relevant, but 6 were in the result set, that’s a
recall of 0.75.
In a properly configured search application, these two measures are naturally at odds with one another. Let’s
say, on one extreme, you only show the user the very best (top 1) document matching their query. With such an
approach, your precision will typically be high, because the first result has a good chance of being relevant, while
your recall would be very low, because if there are many relevant documents for a given query you have only
returned one of them. If we increase top 1 to top 10, then suddenly we will be returning many documents for each
query. The precision will necessarily drop because most likely you are now allowing some non-relevant documents
into the result set. But recall should increase because each query should return a larger subset of its relevant
documents.
Still, you’d like the relevant documents to be higher up in the ranking. To measure this, average precision is
computed. This measure computes precision at each of the N cutoffs, where N ranges from 1 to a maximum value,
and then takes the average. So this measure is higher if your search application generally returns relevant
documents earlier in the result set. Mean average precision, or MAP, then measures the mean of average precision
across a set of queries. A related measure, mean reciprocal rank or MRR, measures 1/M where M is the first rank
that had a relevant document. You want both of these numbers to be as high as possible!

import java.io.File;
import java.io.PrintWriter;
import java.io.BufferedReader;
import java.io.FileReader;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
import org.apache.lucene.benchmark.quality.*;
import org.apache.lucene.benchmark.quality.utils.*;
import org.apache.lucene.benchmark.quality.trec.*;
/**
 * Runs the queries from a TREC-style topics file against an existing index,
 * scores the results against a qrels (judgments) file, and logs the averaged
 * quality statistics to standard output.
 */
public class PrecisionRecall {
  /**
   * Entry point. The topics file, qrels file and index location are fixed
   * paths; command-line arguments are not used.
   *
   * @param args ignored
   * @throws Throwable if the index or either input file cannot be read, or
   *         the benchmark run fails
   */
  public static void main(String[] args) throws Throwable {
    File topicsFile = new File("D:/Workspaces/suanfa/sohu3/src/lia/benchmark/topics.txt");
    File qrelsFile = new File("D:/Workspaces/suanfa/sohu3/src/lia/benchmark/qrels.txt");
    String docNameField = "filename";
    // Auto-flushing writer so benchmark output appears as it is produced.
    PrintWriter logger = new PrintWriter(System.out, true);

    Directory dir = FSDirectory.open(new File("indexes/MeetLucene"));
    try {
      // Fully qualified: a class named Searcher also exists in these notes.
      org.apache.lucene.search.Searcher searcher = new IndexSearcher(dir, true);
      try {
        // #1 Read the benchmark queries from the topics file.
        QualityQuery qqs[];
        BufferedReader topicsReader = new BufferedReader(new FileReader(topicsFile));
        try {
          qqs = new TrecTopicsReader().readQueries(topicsReader);
        } finally {
          // Closing twice is harmless if the reader was already closed by the callee.
          topicsReader.close();
        }

        // #2 Build the judge from the qrels file.
        Judge judge;
        BufferedReader qrelsReader = new BufferedReader(new FileReader(qrelsFile));
        try {
          judge = new TrecJudge(qrelsReader);
        } finally {
          qrelsReader.close();
        }

        // #3 Check that queries and judgments agree; problems go to the logger.
        judge.validateData(qqs, logger);

        // #4 Per the SimpleQQParser API: use each query's "title" part and
        // search it against the "contents" index field.
        QualityQueryParser qqParser = new SimpleQQParser("title", "contents");

        // #5 Run every query; no submission report is requested (null).
        QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);
        SubmissionReport submitLog = null;
        QualityStats stats[] = qrun.execute(judge, submitLog, logger);

        // #6 Average the per-query statistics and print the summary.
        QualityStats avg = QualityStats.average(stats);
        avg.log("SUMMARY", 2, logger, "  ");
      } finally {
        searcher.close();
      }
    } finally {
      dir.close();
    }
  }
}




-----------------------------------------------------
Hello World示例在
LIAsourcecode\lia2e\src\lia\meetlucene\Indexer.java
简化一下:
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Minimal indexing example: adds every readable, non-hidden {@code .txt} file
 * found directly inside a data directory to a newly created Lucene index.
 */
public class Indexer {
	/**
	 * Builds the index and prints how many documents it holds and how long it took.
	 *
	 * @param args optional: {@code args[0]} is the index directory and
	 *        {@code args[1]} the data directory; each falls back to a
	 *        built-in default path when absent
	 * @throws IOException if the data directory cannot be listed or the
	 *         index cannot be written
	 */
	public static void main(String[] args) throws IOException {
		String indexDir = argOrDefault(args, 0, "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index");
		String dataDir = argOrDefault(args, 1, "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\data");
		long start = System.currentTimeMillis();

		// List the input before opening the writer: the writer is opened with
		// create=true, so failing afterwards would already have replaced any
		// existing index. listFiles returns null for a missing/unreadable directory.
		File[] files = new File(dataDir).listFiles(new TextFilesFilter());
		if (files == null) {
			throw new IOException("Cannot list data directory: " + dataDir);
		}

		int numIndexed = 0;
		Directory dir = FSDirectory.open(new File(indexDir));
		try {
			IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true, IndexWriter.MaxFieldLength.UNLIMITED); // 3
			try {
				for (File f : files) {
					if (f.isDirectory() || f.isHidden() || !f.exists() || !f.canRead()) {
						continue;
					}
					indexFile(writer, f);
				}
				// Must be read before the writer is closed.
				numIndexed = writer.numDocs();
			} finally {
				writer.close();
			}
		} finally {
			dir.close();
		}
		long end = System.currentTimeMillis();
		System.out.println("Indexing " + numIndexed + " files took "+ (end - start) + " milliseconds");
	}

	/** Adds one file to the index as a document with contents, filename and fullpath fields. */
	private static void indexFile(IndexWriter writer, File f) throws IOException {
		System.out.println("Indexing " + f.getCanonicalPath());
		FileReader contents = new FileReader(f);
		try {
			Document doc = new Document();
			doc.add(new Field("contents", contents)); // 7
			doc.add(new Field("filename", f.getName(),Field.Store.YES, Field.Index.NOT_ANALYZED));// 8
			doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));// 9
			writer.addDocument(doc);
		} finally {
			// Release the file handle even if addDocument fails; a second
			// close is harmless if the reader was already closed.
			contents.close();
		}
	}

	/** Returns {@code args[index]} when present, otherwise {@code fallback}. */
	private static String argOrDefault(String[] args, int index, String fallback) {
		return (args != null && args.length > index) ? args[index] : fallback;
	}

	/** Accepts files whose name ends with {@code .txt}, ignoring case. */
	private static class TextFilesFilter implements FileFilter {
		public boolean accept(File path) {
			return path.getName().toLowerCase().endsWith(".txt"); // 6
		}
	}
}

import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import java.io.File;
import java.io.IOException;
/**
 * Minimal search example: parses a query against the "contents" field,
 * fetches the top 10 hits and prints the stored "fullpath" of each.
 */
public class Searcher {
  /**
   * Runs one query and prints the matching documents.
   *
   * @param args optional: {@code args[0]} is the index directory and
   *        {@code args[1]} the query string; each falls back to a built-in
   *        default when absent
   * @throws IOException if the index cannot be opened or read
   * @throws ParseException if the query string cannot be parsed
   */
  public static void main(String[] args) throws IllegalArgumentException,IOException, ParseException {
    String indexDir = argOrDefault(args, 0, "D:\\Workspaces\\suanfa\\sohu3\\src\\lia\\meetlucene\\index"); //1
    String q = argOrDefault(args, 1, "Redistri*");                                                         //2

    Directory dir = FSDirectory.open(new File(indexDir)); //3
    try {
      IndexSearcher is = new IndexSearcher(dir);          //3
      try {
        QueryParser parser = new QueryParser(Version.LUCENE_30,"contents",new StandardAnalyzer(Version.LUCENE_30));  //4
        Query query = parser.parse(q);                    //4

        // Time only the search call itself, not index opening or parsing.
        long start = System.currentTimeMillis();
        TopDocs hits = is.search(query, 10);              //5
        long end = System.currentTimeMillis();

        // Summary goes to stderr so stdout carries only the result paths.
        System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start) +" milliseconds) that matched query '" +q + "':"); //6
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
          Document doc = is.doc(scoreDoc.doc);            //7
          System.out.println(doc.get("fullpath"));        //8
        }
      } finally {
        // Closed even when parsing or searching throws.
        is.close();
      }
    } finally {
      dir.close();
    }
  }

  /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
  private static String argOrDefault(String[] args, int index, String fallback) {
    return (args != null && args.length > index) ? args[index] : fallback;
  }
}



----------------------------------

不用lucene,直接用流统计一个文件中各单词出现的次数
package com.hao;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Counts how often each word occurs in a text file, without Lucene.
 * A word is a maximal run of ASCII letters; results are kept in a sorted map.
 */
public class UserTreeMap {
	/** Input file used by {@link #main} when no path is given on the command line. */
	private static final String DEFAULT_INPUT = "D:\\Workspaces\\suanfa\\sohu3\\src\\english.txt";

	/** Matches one word. Compiled once instead of on every call. */
	private static final Pattern WORD = Pattern.compile("[a-zA-Z]+");

	/**
	 * Prints every word of the input file with its count, one {@code word--count} per line.
	 *
	 * @param args optional: {@code args[0]} is the file to analyse; a built-in
	 *        default path is used when absent
	 * @throws Exception if the file cannot be read
	 */
	public static void main(String[] args) throws Exception {
		String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
		Map<String, Integer> counts = getMapFromFile(path);
		for (Map.Entry<String, Integer> entry : counts.entrySet()) {
			System.out.println(entry.getKey() + "--" + entry.getValue());
		}
	}

	/**
	 * Counts word occurrences in a file, case-sensitively.
	 *
	 * @param filepath path of the text file to read
	 * @return a map from word to occurrence count, sorted by word
	 * @throws Exception if the file cannot be opened or read
	 */
	public static Map<String, Integer> getMapFromFile(String filepath) throws Exception {
		return countWords(filepath, false);
	}

	/**
	 * Counts the words of a fixed input file case-insensitively, prints the
	 * total to standard output and writes the per-word counts plus two summary
	 * lines to {@code result.txt} in the current directory.
	 *
	 * @throws Exception if the input cannot be read or the result cannot be written
	 */
	public static void test() throws Exception {
		Map<String, Integer> counts = countWords("D:\\sohu3\\english.txt", true);
		System.out.println("Read under this dir English.txt");

		// Total number of words is the sum of all per-word counts.
		int total = 0;
		for (Integer count : counts.values()) {
			total += count.intValue();
		}

		System.out.println("统计分析如下:");
		System.out.println(" 文章中单词总数" + total + "个");
		System.out.println("具体的信息在当前目录的result.txt文件中");

		BufferedWriter out = new BufferedWriter(new FileWriter("result.txt"));
		try {
			for (Map.Entry<String, Integer> entry : counts.entrySet()) {
				out.write(entry.getKey() + ":" + entry.getValue());
				out.newLine();
			}
			out.write("english.txt中的单词总数" + total + "个");
			out.newLine();
			out.write("english.txt中不同单词" + counts.size() + "个");
		} finally {
			out.close();
		}
	}

	/**
	 * Reads the file line by line and tallies every word.
	 * Matching per line keeps the last word of one line from fusing with the
	 * first word of the next, and avoids holding the whole file in memory.
	 */
	private static Map<String, Integer> countWords(String filepath, boolean lowerCase) throws Exception {
		Map<String, Integer> counts = new TreeMap<String, Integer>();
		BufferedReader reader = new BufferedReader(new FileReader(filepath));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				Matcher matcher = WORD.matcher(lowerCase ? line.toLowerCase() : line);
				while (matcher.find()) {
					String word = matcher.group();
					Integer seen = counts.get(word);
					counts.put(word, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
				}
			}
		} finally {
			reader.close();
		}
		return counts;
	}
}

你可能感兴趣的:(java,.net,正则表达式,Blog,Lucene)