BM25

皇天不负苦心人,终于实现了。用trac(最早那个版本,700M数据)的数据测了一下,排序结果和Lucene原有算法基本一致。下面共享一下代码,目前还很粗陋,需要改进的地方还很多:


package org.apache.lucene.BM25;

import java.io.IOException;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;

import org.apache.lucene.BM25.bm25.BM25BooleanQuery.BooleanTermQuery;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;

/**
 * Minimal BM25 scorer that works directly on the postings of an
 * {@link IndexReader}.
 *
 * <p>
 * Usage: construct with an open reader and call {@link #search(Query)}. The
 * query is flattened into a set of terms, every document containing the first
 * term becomes a candidate, and the BM25 contribution of each further term is
 * added to the candidates that also contain it. Only candidates containing
 * every query term are printed.
 *
 * <p>
 * Document lengths are still a placeholder (every document has length 1.0),
 * so length normalisation currently has no effect on the ranking.
 *
 * <p>
 * Instances keep per-search state in fields and are not safe for concurrent
 * use.
 */
public class BM25OfMine {

	/** Number of postings requested from {@link TermDocs} per bulk read. */
	private static final int READ_CHUNK = 200;

	/** BM25 term-frequency saturation parameter. */
	private static final double K1 = 2.0;

	/** BM25 length-normalisation parameter. */
	private static final double B = 0.75;

	/** Terms extracted from the current query. */
	private final Set<Term> terms = new HashSet<Term>();

	/** Query terms as an array, in the iteration order of {@link #terms}. */
	private Term t[] = null;

	private final IndexReader reader;

	/** Candidate document numbers (the postings of the first query term). */
	private int DocNo[] = null;

	/** Length of each candidate document, parallel to {@link #DocNo}. */
	private float length[] = null;

	/** Accumulated score of each candidate, parallel to {@link #DocNo}. */
	private double Score[] = null;

	/** Total number of documents, as reported by the reader. */
	private int NumOfDoc = 0;

	// Scratch buffers handed to TermDocs.read; only the first n entries are
	// valid after a read that returned n.
	private final int Doc[] = new int[READ_CHUNK];
	private final int frq[] = new int[READ_CHUNK];

	// Complete postings of the most recently loaded term. Only the first
	// postingCount entries are valid.
	private int postingDocs[] = new int[READ_CHUNK];
	private int postingFreqs[] = new int[READ_CHUNK];
	private int postingCount = 0;

	/** Average document length used by {@link #GetTF(int, double)}. */
	private double AverLength = 100;

	/** Number of query terms found in each candidate, parallel to DocNo. */
	private int TimeofDoc[] = new int[1];

	/**
	 * @param r
	 *            open index reader to score against; it is not closed by this
	 *            class
	 */
	public BM25OfMine(IndexReader r) {
		this.reader = r;
	}

	/**
	 * Extracts the terms to score from {@code q} and prints them.
	 *
	 * <p>
	 * Only {@link BooleanQuery} (terms of every clause) and {@link TermQuery}
	 * are understood; any other query type yields an empty term list. Terms
	 * left over from a previous call are discarded first, so the scorer can be
	 * reused for several queries.
	 *
	 * @param q
	 *            the query to flatten into terms
	 */
	public void ConQuery(Query q) {
		// Without this, a second search would score old and new terms together.
		terms.clear();

		if (q instanceof BooleanQuery) {
			List<BooleanClause> clauses = ((BooleanQuery) q).clauses();
			for (BooleanClause clause : clauses) {
				clause.getQuery().extractTerms(terms);
			}
		} else if (q instanceof TermQuery) {
			q.extractTerms(terms);
		}

		t = terms.toArray(new Term[terms.size()]);

		System.out.println(t.length);
		for (int j = 0; j < t.length; j++) {
			System.out.println("查的词: " + t[j].toString());
		}
	}

	// ==================================================================
	/**
	 * @return the total number of documents, taken from
	 *         {@code reader.maxDoc()}
	 */
	public int GetDocNum() {
		return reader.maxDoc();
	}

	/**
	 * Returns the number of documents containing {@code t}.
	 *
	 * @param t
	 *            the term to look up
	 * @return the document frequency, or 0 if the index could not be read (the
	 *         I/O error is printed, not rethrown)
	 */
	public int GetTermDocNum(Term t) {
		try {
			return reader.docFreq(t);
		} catch (IOException e) {
			// Best effort: a term whose frequency cannot be read counts as
			// absent from the index.
			e.printStackTrace();
			return 0;
		}
	}

	/**
	 * Computes the IDF weight {@code ln((N - n + 0.5) / (n + 0.5) + 1)}, where
	 * {@code N} is the document count recorded by {@link #search(Query)} and
	 * {@code n} is the document frequency of {@code t}.
	 *
	 * @param t
	 *            the term to weigh
	 * @return the IDF weight of the term
	 */
	public double GetIDF(Term t) {
		int nq = GetTermDocNum(t);
		return Math.log((NumOfDoc - nq + 0.5) / (nq + 0.5) + 1.0);
	}

	// =================================
	/**
	 * Loads the postings (document number and in-document frequency) of
	 * {@code t} into this scorer's internal buffers.
	 *
	 * <p>
	 * An I/O error is printed and leaves the term with no postings.
	 *
	 * @param t
	 *            the term whose postings to load
	 * @return always 0; the method is used for its side effect only
	 */
	public double GetTermFreq(Term t) {
		try {
			loadPostings(t);
		} catch (IOException e) {
			e.printStackTrace();
			// Never expose a partially filled posting list.
			postingCount = 0;
		}
		return 0;
	}

	/**
	 * Computes the BM25 term-frequency factor
	 * {@code f * (k + 1) / (f + k * ((1 - b) + b * length / AverLength))} with
	 * {@code k = 2.0} and {@code b = 0.75}.
	 *
	 * @param f
	 *            number of occurrences of the term in the document
	 * @param length
	 *            length of the document
	 * @return the term-frequency factor
	 */
	public double GetTF(int f, double length) {
		double norm = (1 - B) + B * length / AverLength;
		return (f * (K1 + 1)) / (f + K1 * norm);
	}

	// =======================================================
	/**
	 * Records a length for every document in {@code DocNo}.
	 *
	 * <p>
	 * Placeholder: every document is assigned length 1.0. No field is read
	 * from the index yet.
	 *
	 * @param DocNo
	 *            document numbers to record lengths for
	 * @return always 0
	 * @throws Exception
	 *             never thrown by the current placeholder implementation
	 */
	public double GetLength(int DocNo[]) throws Exception {
		length = new float[DocNo.length];
		for (int i = 0; i < length.length; i++) {
			length[i] = 1.0f;
		}
		return 0;
	}

	/**
	 * Returns the mean of the recorded document lengths.
	 *
	 * @return the average length, or 1.0 when no lengths are recorded or the
	 *         mean is not positive, so that the division in
	 *         {@link #GetTF(int, double)} stays well defined
	 */
	public double GetAverLength() {
		if (length == null || length.length == 0) {
			return 1.0;
		}
		double sum = 0.0;
		for (int i = 0; i < length.length; i++) {
			sum += length[i];
		}
		double average = sum / length.length;
		return average > 0 ? average : 1.0;
	}

	// =============================================================
	/**
	 * Scores the candidate documents term by term.
	 *
	 * <p>
	 * The postings of the first query term define the candidates. Each
	 * remaining term adds its contribution to the candidates that contain it
	 * and bumps their matched-term counter. A query without terms produces an
	 * empty candidate set.
	 *
	 * @throws IllegalStateException
	 *             if {@link #ConQuery(Query)} has not been called yet
	 * @throws Exception
	 *             if the postings cannot be read from the index
	 */
	public void Score() throws Exception {
		if (t == null) {
			throw new IllegalStateException(
					"ConQuery must be called before Score");
		}

		if (t.length == 0) {
			// Nothing to score: publish empty results rather than leaving
			// null arrays or the results of an earlier search behind.
			DocNo = new int[0];
			Score = new double[0];
			TimeofDoc = new int[0];
			length = new float[0];
			return;
		}

		// The first term defines the candidate set.
		int candidates = loadPostings(t[0]);
		DocNo = new int[candidates];
		Score = new double[candidates];
		TimeofDoc = new int[candidates];
		System.arraycopy(postingDocs, 0, DocNo, 0, candidates);

		GetLength(DocNo);
		AverLength = GetAverLength();

		// The IDF depends on the term only, so compute it once per term.
		double idf = GetIDF(t[0]);
		for (int k = 0; k < candidates; k++) {
			Score[k] = GetTF(postingFreqs[k], length[k]) * idf;
			TimeofDoc[k] = 1;
		}

		for (int i = 1; i < t.length; i++) {
			loadPostings(t[i]);
			accumulateLoadedTerm(GetIDF(t[i]));
		}
	}

	/**
	 * Runs {@code q} against the index and prints every document that contains
	 * all query terms, together with its score and stored "path" field.
	 *
	 * @param q
	 *            the query to run
	 * @throws Exception
	 *             if the index cannot be read
	 */
	public void search(Query q) throws Exception {
		NumOfDoc = GetDocNum();
		ConQuery(q);
		Score();
		for (int ii = 0; ii < DocNo.length; ii++) {
			// Print only documents in which every query term occurred.
			if (TimeofDoc[ii] == t.length) {
				System.out.println("文档编号:" + DocNo[ii] + " 分数:" + Score[ii]
						+ " 路径:" + reader.document(DocNo[ii]).get("path"));
			}
		}
	}

	/**
	 * Reads all postings of {@code term} into postingDocs/postingFreqs.
	 *
	 * <p>
	 * The number of valid entries is taken from the return value of
	 * {@code TermDocs.read}, so buffer contents left over from an earlier term
	 * are never mistaken for postings, and terms with more postings than one
	 * chunk are read completely.
	 *
	 * @return the number of postings loaded
	 */
	private int loadPostings(Term term) throws IOException {
		postingCount = 0;
		TermDocs postings = reader.termDocs(term);
		try {
			int read;
			while ((read = postings.read(Doc, frq)) > 0) {
				ensurePostingCapacity(postingCount + read);
				System.arraycopy(Doc, 0, postingDocs, postingCount, read);
				System.arraycopy(frq, 0, postingFreqs, postingCount, read);
				postingCount += read;
			}
		} finally {
			postings.close();
		}
		return postingCount;
	}

	/** Grows the posting arrays so that they can hold {@code needed} entries. */
	private void ensurePostingCapacity(int needed) {
		if (needed <= postingDocs.length) {
			return;
		}
		int newSize = Math.max(needed, postingDocs.length * 2);
		int grownDocs[] = new int[newSize];
		int grownFreqs[] = new int[newSize];
		System.arraycopy(postingDocs, 0, grownDocs, 0, postingCount);
		System.arraycopy(postingFreqs, 0, grownFreqs, 0, postingCount);
		postingDocs = grownDocs;
		postingFreqs = grownFreqs;
	}

	/**
	 * Adds the contribution of the currently loaded term to every candidate
	 * that contains it.
	 *
	 * <p>
	 * Both the candidates and the loaded postings come from TermDocs, which
	 * enumerates documents in increasing document-number order, so a single
	 * forward merge finds all common documents.
	 */
	private void accumulateLoadedTerm(double idf) {
		int k = 0;
		for (int j = 0; j < postingCount && k < DocNo.length; j++) {
			int doc = postingDocs[j];
			while (k < DocNo.length && DocNo[k] < doc) {
				k++;
			}
			if (k < DocNo.length && DocNo[k] == doc) {
				Score[k] += GetTF(postingFreqs[j], length[k]) * idf;
				TimeofDoc[k]++;
			}
		}
	}

}


 

你可能感兴趣的:(BM25)