1、solr导入到eclipse
下载solr-5.4.1-src.tgz,官网地址http://www.apache.org/dyn/closer.lua/lucene/solr/5.4.1
解压solr-5.4.1-src.tgz到D:\project\java\solr-5.4.1目录,在目录的命令行下输入ant eclipse,然后进入漫长的等待过程,中间需要从网上下载很多依赖包。
编译时,可能会报Ivy could not be found in your ant classpath,此时去ivy官网(http://ant.apache.org/ivy/download.cgi)下载ivy.jar并放到ant的lib目录(例如~/.ant/lib)下即可,也可以直接执行ant ivy-bootstrap让构建脚本自动下载。
直到出现BUILD SUCCESSFUL,使用eclipse导入。
打开org.apache.solr.client.solrj.StartSolrJetty,设置solr.solr.home
public class StartSolrJetty { public static void main( String[] args ) { System.setProperty("solr.solr.home", "solr/example/solr"); Server server = new Server(); ServerConnector connector = new ServerConnector(server, new HttpConnectionFactory()); // Set some timeout options to make debugging easier. connector.setIdleTimeout(1000 * 60 * 60); connector.setSoLingerTime(-1); connector.setPort(8983); server.setConnectors(new Connector[] { connector }); WebAppContext bb = new WebAppContext(); bb.setServer(server); bb.setContextPath("/solr"); bb.setWar("solr/webapp/web"); server.setHandler(bb); try { System.out.println(">>> STARTING EMBEDDED JETTY SERVER, PRESS ANY KEY TO STOP"); server.start(); while (System.in.available() == 0) { Thread.sleep(5000); } server.stop(); server.join(); } catch (Exception e) { e.printStackTrace(); System.exit(100); } } }
2、实时更新词库
本文使用Jcseg这个中文分词库。查看org.lionsoul.jcseg.analyzer.v5x.JcsegTokenizerFactory的源码可知,词库数据保存在ADictionary dic这个变量中:
public class JcsegTokenizerFactory extends TokenizerFactory { private int mode; private JcsegTaskConfig config = null; private ADictionary dic = null; // 词库变量 /** * set the mode arguments in the schema.xml * configuration file to change the segment mode for jcseg * */ public JcsegTokenizerFactory(Map<String, String> args) { super(args); String _mode = args.get("mode"); if ( _mode == null ) mode = JcsegTaskConfig.COMPLEX_MODE; else { _mode = _mode.toLowerCase(); if ( "simple".equals(_mode) ) mode = JcsegTaskConfig.SIMPLE_MODE; else if ( "detect".equals(_mode) ) mode = JcsegTaskConfig.DETECT_MODE; else mode = JcsegTaskConfig.COMPLEX_MODE; } //initialize the task config and the dictionary config = new JcsegTaskConfig(); dic = DictionaryFactory.createDefaultDictionary(config); } public void setConfig( JcsegTaskConfig config ) { this.config = config; } public void setDict( ADictionary dic ) { this.dic = dic; } public JcsegTaskConfig getTaskConfig() { return config; } public ADictionary getDict() { return dic; } @Override public Tokenizer create( AttributeFactory factory ) { try { return new JcsegTokenizer(mode, config, dic); } catch (JcsegException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return null; } }
继续查看org.apache.solr.handler.FieldAnalysisRequestHandler代码,知道可以通过SolrQueryRequest获取到TokenizerFactory
只需要取得JcsegTokenizerFactory对应实例,就能取得dic,通过add和remove方法实时更新词库。
// Add `word` to the CJK word lexicon of the dictionary.
dic.add(ILexicon.CJK_WORD, word, IWord.T_CJK_WORD);
// Remove `word` from the CJK word lexicon of the dictionary.
dic.remove(ILexicon.CJK_WORD, word);
自定义实现request handler,在request handler里面通过SolrQueryRequest取得IndexSchema->IndexAnalyzer->TokenizerFactory,最终取得dic实例操作词库。内存中的词库更新后,还需要保存到本地词库文件中,避免重启后丢失词库。实现代码如下:
package com.penngo.solr; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.solr.analysis.TokenizerChain; import org.apache.solr.common.StringUtils; import org.apache.solr.common.params.SolrParams; import org.apache.solr.handler.RequestHandlerBase; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; import org.lionsoul.jcseg.analyzer.v5x.JcsegTokenizerFactory; import org.lionsoul.jcseg.tokenizer.core.ADictionary; import org.lionsoul.jcseg.tokenizer.core.ILexicon; import org.lionsoul.jcseg.tokenizer.core.IWord; import org.lionsoul.jcseg.tokenizer.core.JcsegTaskConfig; import com.fasterxml.jackson.databind.ObjectMapper; public class TestHandler extends RequestHandlerBase{ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception { SolrParams params = req.getParams(); System.out.println("params=======" + params); JcsegTaskConfig config = new JcsegTaskConfig(); String addDatas = params.get("add"); Map<String,Object> dataResult = new HashMap<String,Object>(); ObjectMapper mapper = new ObjectMapper(); String lexiconPath = config.getLexiconPath()[0]; String fileLex = lexiconPath + "/lex-penngo.lex"; IndexSchema indexSchema = req.getSchema(); FieldType filetype = indexSchema.getFieldTypeByName("textComplex"); Analyzer analyzer = filetype.getIndexAnalyzer(); TokenizerChain tokenizerChain = (TokenizerChain) analyzer; TokenizerFactory tfac = tokenizerChain.getTokenizerFactory(); if (tfac instanceof JcsegTokenizerFactory) { JcsegTokenizerFactory jtf = (JcsegTokenizerFactory) tfac; ADictionary dic = jtf.getDict(); if (dic != null) { 
if (StringUtils.isEmpty(addDatas) == false) { FileOutputStream fos = new FileOutputStream(new File(fileLex), true); OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8"); BufferedWriter bw = new BufferedWriter(osw); ArrayList<List<String>> wordList = mapper.readValue(addDatas, ArrayList.class); for(List<String> word: wordList){ String name = word.get(0); String type = word.get(1); String pinyin = word.get(2); String syn = word.get(3); IWord iword = dic.get(ILexicon.CJK_WORD, name); // 如果不存在,则添加到词库 if (iword == null) { dic.add(ILexicon.CJK_WORD, name, IWord.T_CJK_WORD); iword = dic.get(ILexicon.CJK_WORD, name); iword.addPartSpeech(type); iword.setPinyin(pinyin); String[] syns = syn.split(","); for (String s : syns) { iword.addSyn(s); } StringBuffer sff = new StringBuffer(); sff.append(name).append("/").append(type).append("/").append(pinyin).append("/").append(syn); // 把分词添加到词库文件lex-penngo.lex中 bw.write(sff.toString()); bw.newLine(); } } bw.close(); osw.close(); fos.close(); } dataResult.put("status", "ok"); } } rsp.add("response", dataResult); } public String getDescription() { return null; } }
solrconfig.xml添加配置
<!-- Registers com.penngo.solr.TestHandler under the /test path; by default responses are written as indented JSON. -->
<requestHandler name="/test" class="com.penngo.solr.TestHandler">
  <lst name="defaults">
    <str name="wt">json</str>
    <str name="indent">true</str>
  </lst>
</requestHandler>
以"公众号"这个词来测试
jcseg自带词库分词结果
客户端通过接口添加该词之后的分词结果,客户端php代码如下:
<?php
// Client script: sends one word entry to the Solr /test handler through the
// "add" query parameter and prints the raw response.

$url = "http://localhost:8983/solr/news/test";

// Each entry is [word, part of speech, pinyin, comma-separated synonyms].
$data = array(
    array("公众号", "n", "gong zhong hao", "大众号,社会号,penngo")
);

$url = $url . "?add=" . urlencode(json_encode($data));
echo $url . "\n";

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 0);

$result = curl_exec($ch);
if ($result === false) {
    // curl_exec returns false on transport errors (e.g. Solr not running);
    // report the reason instead of silently printing nothing.
    $error = curl_error($ch);
    curl_close($ch);
    echo "Request failed: " . $error . "\n";
    exit(1);
}
curl_close($ch);

print_r($result);
?>