MongoDB full-text search solution (Lucene + IKAnalyzer)

Full-text search is not a trivial problem to solve with MongoDB.

You can use regular-expression matching, but it is very inefficient; once the data volume gets large, searches often end in query timeouts.

There is also the officially suggested approach: add an extra field to each document, store the tokenized terms in it, and match against that field. I tried it, but it actually seemed even slower than plain regex matching.

http://www.oschina.net/question/200745_61968 
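For reference, here is a rough sketch of that pre-tokenized-field approach. Everything in it is my own assumption rather than code from the thread above: the tokens go into a field I call keywords, and they are produced with IKAnalyzer through Lucene's TokenStream API.

package sample3;

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.mongodb.BasicDBList;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;

/**
 * Sketch of the "pre-tokenized field" approach: store the analyzer's tokens
 * in a keywords array and match tokens exactly. The field name "keywords"
 * is a placeholder of my own choosing.
 */
public class KeywordFieldSketch {
	public static void main(String[] args) throws Exception {
		Mongo mongo = new Mongo();
		DB db = mongo.getDB("zhang");
		DBCollection msg = db.getCollection("test3");

		IKAnalyzer analyzer = new IKAnalyzer();
		DBCursor cursor = msg.find();
		while (cursor.hasNext()) {
			DBObject doc = cursor.next();
			//Tokenize the text field with IK and collect the terms
			BasicDBList keywords = new BasicDBList();
			TokenStream ts = analyzer.tokenStream("text",
					new StringReader(doc.get("text").toString()));
			TermAttribute term = ts.addAttribute(TermAttribute.class);
			while (ts.incrementToken()) {
				keywords.add(term.term());
			}
			ts.close();
			//Store the tokens next to the original text
			msg.update(new BasicDBObject("_id", doc.get("_id")),
					new BasicDBObject("$set", new BasicDBObject("keywords", keywords)));
		}
		//Index the keywords array so exact-token lookups can use it
		msg.ensureIndex(new BasicDBObject("keywords", 1));

		//Query: match any element of the keywords array exactly
		DBCursor hits = msg.find(new BasicDBObject("keywords", "上海人"));
		System.out.println("matched documents: " + hits.count());
	}
}

Even with the keywords index, this only helps exact token matches, and as noted above it did not beat the plain regex query in my tests.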

I then tried Lucene + IKAnalyzer and saw a noticeable improvement.

How it works: Lucene runs the large text data through the analyzer and builds an index in a separate index directory on disk; searches are then answered from that index rather than from MongoDB.

First, read the data out of MongoDB and build the index:

package sample3;

import java.io.File;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.Mongo;

/**
 * Builds a Lucene index from the MongoDB data
 * @author  zhanghaijun
 *
 */
public class Demo1 {
	public static void main(String[] args) throws Exception {
		//First fetch from MongoDB the data that we want to index
		Mongo mongo = new Mongo();
		DB db = mongo.getDB("zhang");
		DBCollection msg = db.getCollection("test3");
		DBCursor cursor = msg.find();
		//Whether to recreate the index files from scratch; false appends to the existing index
		boolean create = true;
		//Build the index
		Directory directory = FSDirectory.open(new File("E:\\lucene\\index"));
		Analyzer analyzer = new IKAnalyzer(); //IK Chinese analyzer
		IndexWriter indexWriter = new IndexWriter(directory, analyzer, create, MaxFieldLength.LIMITED);
		while (cursor.hasNext()) {
			Document doc = new Document();
			Field fieldText = new Field("text", cursor.next().get("text").toString(),
					Field.Store.YES, Field.Index.ANALYZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS);
			doc.add(fieldText);
			indexWriter.addDocument(doc);
		}
		//optimize() merges the index segments so later searches are faster
		indexWriter.optimize();
		//Finally close the index writer
		indexWriter.close();
	}
}
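One limitation of Demo1 as written: only the text itself goes into the Lucene index, so a hit cannot be traced back to its source document in MongoDB. Below is a small variant of the indexing loop (my own sketch; the field name "id" is an assumption, and it needs an extra import of com.mongodb.DBObject) that also stores each document's _id.

		//Variant of the indexing loop: also store the MongoDB _id as a
		//stored, non-analyzed field so a hit can be mapped back to MongoDB
		while (cursor.hasNext()) {
			DBObject mongoDoc = cursor.next();
			Document doc = new Document();
			doc.add(new Field("id", mongoDoc.get("_id").toString(),
					Field.Store.YES, Field.Index.NOT_ANALYZED));
			doc.add(new Field("text", mongoDoc.get("text").toString(),
					Field.Store.YES, Field.Index.ANALYZED,
					Field.TermVector.WITH_POSITIONS_OFFSETS));
			indexWriter.addDocument(doc);
		}

At search time, searcher.doc(scoreDoc.doc).get("id") then returns the stored _id, which can be used to load the full document from MongoDB.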

Searching the data (reading directly from the index files):

package sample3;

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;

/**
 * Searches the Lucene index
 */
public class Demo2 {
	public static void main(String[] args) throws Exception {
		// only searching, so open the reader read-only
		long starttime = System.currentTimeMillis();
		IndexReader reader = IndexReader.open(FSDirectory.open(new File("E:\\lucene\\index")), true);
		IndexSearcher searcher = new IndexSearcher(reader);
		searcher.setSimilarity(new IKSimilarity()); //use the IKSimilarity relevance scorer with the searcher
		//String[] keys = {"4","testtest"};      //keywords
		//String[] fields = {"id","title"};      //fields to search
		//BooleanClause.Occur[] flags = {BooleanClause.Occur.MUST,BooleanClause.Occur.MUST};    //how the clauses are combined
		//Build a multi-field, multi-clause query with IKQueryParser.parseMultiField
		//Query query = IKQueryParser.parseMultiField(fields, keys, flags);
		Query query = IKQueryParser.parse("text", "上海人");  //IK query against a single field
		IKAnalyzer analyzer = new IKAnalyzer(); //only needed for the MultiFieldQueryParser alternative below
		//Query query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, keys, fields, flags, analyzer);
	    System.out.println("query:" + query.toString()); //the parsed query
		/*TopScoreDocCollector topCollector = TopScoreDocCollector.create(searcher.maxDoc(), false);
		searcher.search(query,topCollector);
		
		ScoreDoc[] docs = topCollector.topDocs(3).scoreDocs;
		System.out.println(docs.length);*/
	    
	    /**
	     * Once we have the TopDocs we can read its public fields totalHits and scoreDocs directly.
	     */
	    TopDocs topDocs = searcher.search(query, 1000001);
	    int count = topDocs.totalHits;
	    ScoreDoc[] scoreDocs = topDocs.scoreDocs;
	    //Iterate over the returned hits (scoreDocs.length, not totalHits, to stay in bounds)
	    for (int i = 0; i < scoreDocs.length; i++) {
	        Document document = searcher.doc(scoreDocs[i].doc);
	        document.get("text"); //read the stored text; the value is discarded, this is only a timing test
	    }
	    System.out.println("matched documents: " + count);
	    long endtime = System.currentTimeMillis();
	    System.out.println("search time: " + (endtime - starttime) + " ms");
		reader.close(); //close the index reader
	}
}

The code for the direct lookup (plain MongoDB regex query):

package sample3;

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;

public class Demo3 {
	public static void main(String[] args) throws Exception {
		Mongo mongo = new Mongo();
		DB db = mongo.getDB("zhang");
		DBCollection dbc = db.getCollection("test3");
		//Unanchored regex match on the text field
		DBObject basicdb = new BasicDBObject();
		basicdb.put("$regex", "上海人");
		basicdb.put("$options", "");
		long startTime = System.currentTimeMillis();
		DBCursor cursor = dbc.find(new BasicDBObject("text", basicdb));
		int j = 0;
		while (cursor.hasNext()) {
			cursor.next().get("text"); //read the text; the value is discarded, this is only a timing test
			j++;
		}
		System.out.println("matched documents: " + j);
		long endTime = System.currentTimeMillis();
		System.out.println("query time: " + (endTime - startTime) + " ms");
	}
}

Test results

Searching from the Lucene index (optimized):

query:text:上海人 text:上海
matched documents: 1000001
search time: 2917 ms

Direct regex match in MongoDB (unoptimized):

matched documents: 1000000
query time: 6573 ms
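The expanded query in the output above (text:上海人 text:上海) comes from IK segmenting the keyword into several terms before the query is built. IKQueryParser does its own segmentation internally, but a quick standalone sketch like the following (mine, not part of the benchmark) shows the terms the analyzer produces for a given string:

package sample3;

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Sketch: print the terms IK produces for a keyword, to see how a query
 * such as "上海人" gets expanded.
 */
public class TokenDump {
	public static void main(String[] args) throws Exception {
		TokenStream ts = new IKAnalyzer().tokenStream("text", new StringReader("上海人"));
		TermAttribute term = ts.addAttribute(TermAttribute.class);
		while (ts.incrementToken()) {
			System.out.println(term.term()); //one segmented term per line
		}
		ts.close();
	}
}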

So the speed improvement is considerable.

Is there an even better solution? Any pointers are welcome.
