MongoDB 的全文搜索是一个不小的问题。
可以用正则匹配,但是效率很低,数据量大时往往会出现查询超时等现象。
当然也可以用官方的做法(在mongodb的文档类型中加字段,存分词结果,
然后从该字段中匹配) 但是我尝试了 效率比原先的好像还要低
http://www.oschina.net/question/200745_61968
后来我尝试了 lucene+IKAnalyzer 发现效率有所提升啊
原理:lucene 把大文本的数据 利用分词器 在新建的索引文件中建立索引
取数据的时候从索引文件中取
取出mongodb 中的数据进行 索引的创建
package sample3; import java.io.File; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.wltea.analyzer.lucene.IKAnalyzer; import com.mongodb.DB; import com.mongodb.DBCollection; import com.mongodb.DBCursor; import com.mongodb.Mongo; /** * 创建索引 * @author zhanghaijun * */ public class Demo1 { public static void main(String[] args) throws Exception { //先在数据库中拿到要创建索引的数据 Mongo mongo = new Mongo(); DB db = mongo.getDB("zhang"); DBCollection msg = db.getCollection("test3"); DBCursor cursor = msg.find(); //是否重新创建索引文件,false:在原有的基础上追加 boolean create = true; //创建索引 Directory directory = FSDirectory.open(new File("E:\\lucene\\index")); Analyzer analyzer = new IKAnalyzer();//IK中文分词器 IndexWriter indexWriter = new IndexWriter(directory,analyzer,MaxFieldLength.LIMITED); boolean exist = cursor.hasNext(); while(exist){ //System.out.println(cursor.next().get("text").toString()); Document doc = new Document(); Field fieldText = new Field("text",cursor.next().get("text").toString(),Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(fieldText); indexWriter.addDocument(doc); exist = cursor.hasNext(); } cursor = null; //optimize()方法是对索引进行优化 indexWriter.optimize(); //最后关闭索引 indexWriter.close(); } }
数据的查找(直接从索引文件中查找)
package sample3;

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;

/**
 * Searches the Lucene index built by Demo1 (read directly from the index
 * files, bypassing MongoDB) and reports the hit count and elapsed time.
 */
public class Demo2 {
    public static void main(String[] args) throws Exception {
        long starttime = System.currentTimeMillis();

        // Only searching, so open the index read-only (second argument true).
        IndexReader reader =
                IndexReader.open(FSDirectory.open(new File("E:\\lucene\\index")), true);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Use IK's similarity scorer, matched to the IK analyzer used at index time.
            searcher.setSimilarity(new IKSimilarity());

            // Single-field query built by the IK query parser. For multi-field,
            // multi-condition queries use IKQueryParser.parseMultiField(fields, keys, flags)
            // (or Lucene's MultiFieldQueryParser with an IKAnalyzer).
            Query query = IKQueryParser.parse("text", "上海人");
            System.out.println("query:" + query.toString());

            // totalHits counts every matching document, but scoreDocs holds at
            // most the requested top N. BUG FIX: the original looped i < totalHits
            // over scoreDocs, which throws ArrayIndexOutOfBoundsException whenever
            // the hit count exceeds N — iterate the scoreDocs array itself.
            TopDocs topDocs = searcher.search(query, 1000001);
            int count = topDocs.totalHits;
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            for (ScoreDoc scoreDoc : scoreDocs) {
                Document document = searcher.doc(scoreDoc.doc);
                document.get("text");
            }

            System.out.println("查找数据量:" + count);
            long endtime = System.currentTimeMillis();
            System.out.println(endtime - starttime);
        } finally {
            // Close the index even if the search throws (original leaked it on error).
            reader.close();
        }
    }
}
//直接查找的代码:
package sample3;

import java.net.UnknownHostException;

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.MongoException;

/**
 * Baseline for the timing comparison: runs the same "上海人" search as a plain
 * regex query directly against MongoDB (no Lucene index) and prints the hit
 * count and elapsed time.
 */
public class Demo3 {
    public static void main(String[] args) throws Exception {
        Mongo mongo = new Mongo();
        try {
            DB db = mongo.getDB("zhang");
            DBCollection dbc = db.getCollection("test3");

            // Builds the filter { text: { $regex: "上海人", $options: "" } }.
            DBObject basicdb = new BasicDBObject();
            basicdb.put("$regex", "上海人");
            basicdb.put("$options", "");

            long startTime = System.currentTimeMillis();
            DBCursor cursor = dbc.find(new BasicDBObject("text", basicdb));
            int j = 0;
            while (cursor.hasNext()) {
                // Touch the field so the document is actually fetched/decoded,
                // mirroring the work done in the Lucene version.
                cursor.next().get("text");
                j++;
            }
            System.out.println("查找数据量" + j);
            long endTime = System.currentTimeMillis();
            System.out.println("未优化前:" + (endTime - startTime));
        } finally {
            // BUG FIX: the original never closed the MongoDB connection.
            mongo.close();
        }
    }
}
测试结果
从索引中找数据
优化后的: query:text:上海人 text:上海 查找数据量:1000001 查找用时:2917
直接从mongodb 正则匹配的
查找数据量1000000 未优化前:6573
嗯,速度还是有不少提高的。
还有更好的解决方案吗 求指导哦