lucene插入document建立索引代码
import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.util.Version; public class DocInsert { private static IndexWriter indexwrite = null; static{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); // Store the index in memory: // Directory directory = new RAMDirectory(); // To store an index on disk, use this instead: try { Directory directory = FSDirectory.open(new File("E:\\output\\lucence\\index")); indexwrite = new IndexWriter(directory, analyzer, true, new IndexWriter.MaxFieldLength(25000)); } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void createDoc() throws CorruptIndexException, IOException{ List<String> datalist = org.apache.commons.io.IOUtils.readLines(new InputStreamReader(new FileInputStream(new File("E:\\output\\lucence\\data\\data.txt")),"GBK")); for(String str:datalist){ Document doc = new Document(); String[] text = str.split("\t"); if(text.length < 2){ continue; } doc.add(new Field("context", text[1], Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("id", text[0], Field.Store.YES, Field.Index.ANALYZED)); indexwrite.addDocument(doc); } } public static void main(String[] args) throws CorruptIndexException, IOException { createDoc(); indexwrite.commit(); 
indexwrite.close(); } }
数据格式为(每行一条记录,id 与内容之间用制表符分隔,文件编码为 GBK):
4915779	球泡灯套件
4915777	15018506651求购三星i559 i569 i579
4915775	采购雪纺格子印花面料
4915773	汽泡信封袋
4915771	电泳加工
4915769	6405 2RS
4915767	蓝色丁腈手套
4915765	采购求购KO3-15T八角
4915763	胸杯
4915761	封箱胶带
4915759	6404 2RS
4915757	Ipad 车载支架
4915755	礼品,文具,墙贴,基督教礼品
4915753	品牌内衣
4915751	聚丙烯酸
4915749	餐饮消毒毛巾、湿巾
4915747	提花
4915745	6403 2RS
4915743	采购如:葛根粉丝、蕨根粉丝、南瓜粉丝、野菜粉丝、香菇粉丝等
4915741	二手摩托车
4915739	急需采购PVC特殊袋子
4915737	女士T恤
4915735	烤弯镀膜玻璃
4915731	批发野生羊肚菌
4915733	ABS管道粘结剂 ABS胶
检索代码示例如下:
import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; public class DocSearch { private static IndexSearcher isearcher = null; public static void search(String key) throws IOException, ParseException{ Directory directory = FSDirectory.open(new File("E:\\output\\lucence\\index")); // Now search the index: IndexReader ireader = IndexReader.open(directory); // read-only=true isearcher = new IndexSearcher(ireader); // Parse a simple query that searches for "text": Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,"context", analyzer); Query query = parser.parse(key); ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; // Iterate through the results: for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); System.out.println(hitDoc.getValues("id")[0] + "\t" + hitDoc.getValues("context")[0] + "\t" + hits[i].score); } } public static void main(String[] args) throws IOException, ParseException { search("旧水泥袋"); isearcher.close(); } }
执行结果:
4801857	采购旧编织袋、旧水泥袋	4.0172114
4829927	水泥	1.7585585
4903199	采购水泥电阻	1.0551351
4815595	求购水泥输送链条和提升机	0.70342344
4861233	1万5 潜水料啤酒手提包 手提袋	0.47982088
4815637	大量采购包装用的编织袋(新的旧的,有无商标皆可)	0.47913262
4915391	铁泥 铁灰	0.46250635
4889169	废旧砂轮	0.39993972
4903163	软陶泥,超轻粘土	0.34687978
4801611	水泵	0.30114633
4801911	手袋	0.29862976
4889443	水锈石 上水石 吸水石	0.2608004
4861275	足浴袋 泡脚袋 异形袋	0.25862095
4801871	手提袋制袋机	0.25339574
4915383	回收库存废旧油墨油漆	0.24996233
4903189	回收库存旧油漆13463048572	0.24996233
4903187	求购废旧油漆油墨13463048572	0.24996233
4903175	求购库存旧化工树脂	0.24996233
4903245	污水泵	0.24091707
4801705	出水霜	0.24091707
4874727	服裝紙袋	0.2389038
4829965	工作证袋	0.2389038
4815531	棉布袋	0.2389038
4815479	冷敷冰袋	0.2389038
可以看到这个检索结果:
1.默认的分词器(StandardAnalyzer)对中文是逐字切分的,即每个汉字单独成为一个词;
2.匹配出来的分数还是比较靠谱。
如果要用lucene设计一个搜索引擎,需要考虑的问题还有很多:
1.如何设计一个分布式查询;
2.数据增量更新,全量更新如何处理,不影响当前的查询引擎;
3.性能如何保证,更好地利用缓存,分布式?
4.如果设计得更通用,需要添加字段,添加排序字段,统计字段的时候能够做到快速满足需求?
5.分词模块的选择和处理
。。。
后续慢慢研究