插件的下载地址: https://github.com/medcl/elasticsearch-analysis-ik/releases/tag/v6.8.1
主要原理: 创建热更新的http服务,配置IK远端更新地址;步骤如下:
vi plugins/ik/config/IKAnalyzer.cfg.xml
修改 remote_ext_dict和remote_ext_stopwords这两项
<entry key="remote_ext_dict">http://localhost:9527/extdic</entry>
<entry key="remote_ext_stopwords">http://localhost:9527/stopwords</entry>
使用python的tornado模块构建服务器,代码如下:
remotedic.py
import tornado.ioloop
import tornado.web
import os,time
# Server configuration: listening port and the two dictionary file paths.
conf = dict(
    port=9527,
    ext_dic="ext.dic",
    stopwords="stop.dic",
)
# Request handler: each route serves one dictionary file.
class MainHandler(tornado.web.RequestHandler):
    """Serve a single dictionary file over HTTP.

    GET returns the file content as UTF-8 plain text. HEAD returns only
    the ``Last-Modified`` and ``ETag`` headers with an empty body.
    """

    def initialize(self, file):
        """Store the dictionary path and create the file if it is missing.

        Args:
            file: Path of the dictionary file served by this route.
        """
        self.file = file
        # Exclusive-create mode makes an empty file only when none exists,
        # so an existing dictionary can never be truncated by a race
        # between an existence check and the open call.
        try:
            with open(self.file, 'x'):
                pass
        except FileExistsError:
            pass

    def get(self):
        """Write the full dictionary content to the response."""
        # The context manager closes the file even if read() raises.
        with open(self.file, 'r', encoding='utf-8') as dict_file:
            data = dict_file.read()
        self.set_header("Content-Type", "text/plain; charset=UTF-8")
        self.set_header("ETag", "2")
        self.write(data)

    def head(self):
        """Report the file's modification time without sending a body."""
        # The file's mtime, formatted in local time, is the change marker.
        # NOTE(review): the ETag is constant, so clients presumably detect
        # updates through Last-Modified only; confirm against the IK plugin.
        mTime = time.strftime(
            '%Y-%m-%d %H:%M:%S',
            time.localtime(os.stat(self.file).st_mtime),
        )
        self.set_header("Last-Modified", mTime)
        self.set_header("ETag", "2")
        self.set_header("Content-Length", "0")
        self.finish()
# Register the URL routes.
def make_app():
    """Build the Tornado application with one route per dictionary file."""
    routes = [
        (r"/extdic", MainHandler, {"file": conf["ext_dic"]}),
        (r"/stopwords", MainHandler, {"file": conf["stopwords"]}),
    ]
    return tornado.web.Application(routes)
if __name__ == "__main__":
    # Bind the application to the configured port, then run the event loop.
    application = make_app()
    application.listen(conf["port"])
    tornado.ioloop.IOLoop.current().start()
先启动服务器 `python remotedic.py`;然后重启 elasticsearch;
然后就可以进行热更新使用了;默认会有2个文件;
ext.dic :新词汇
stop.dic :停用词词典
到此已经可以使用;后面为测试。
GET /_analyze
{
"text": ["虽然身处自媒体时代,但是网红主播雨女无瓜呀,蓝瘦香菇"],
"analyzer": "ik_max_word"
}
{
"tokens": [
{
"token": "虽然",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "身处",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 1
},
{
"token": "自",
"start_offset": 4,
"end_offset": 5,
"type": "CN_CHAR",
"position": 2
},
{
"token": "媒体",
"start_offset": 5,
"end_offset": 7,
"type": "CN_WORD",
"position": 3
},
{
"token": "媒",
"start_offset": 5,
"end_offset": 6,
"type": "CN_WORD",
"position": 4
},
{
"token": "体",
"start_offset": 6,
"end_offset": 7,
"type": "CN_WORD",
"position": 5
},
{
"token": "时代",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 6
},
{
"token": "但是",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 7
},
{
"token": "网",
"start_offset": 12,
"end_offset": 13,
"type": "CN_CHAR",
"position": 8
},
{
"token": "红",
"start_offset": 13,
"end_offset": 14,
"type": "CN_CHAR",
"position": 9
},
{
"token": "主",
"start_offset": 14,
"end_offset": 15,
"type": "CN_CHAR",
"position": 10
},
{
"token": "播",
"start_offset": 15,
"end_offset": 16,
"type": "CN_WORD",
"position": 11
},
{
"token": "雨",
"start_offset": 16,
"end_offset": 17,
"type": "CN_CHAR",
"position": 12
},
{
"token": "女",
"start_offset": 17,
"end_offset": 18,
"type": "CN_CHAR",
"position": 13
},
{
"token": "无",
"start_offset": 18,
"end_offset": 19,
"type": "CN_CHAR",
"position": 14
},
{
"token": "瓜",
"start_offset": 19,
"end_offset": 20,
"type": "CN_WORD",
"position": 15
},
{
"token": "呀",
"start_offset": 20,
"end_offset": 21,
"type": "CN_CHAR",
"position": 16
},
{
"token": "蓝",
"start_offset": 22,
"end_offset": 23,
"type": "CN_WORD",
"position": 17
},
{
"token": "瘦",
"start_offset": 23,
"end_offset": 24,
"type": "CN_WORD",
"position": 18
},
{
"token": "香菇",
"start_offset": 24,
"end_offset": 26,
"type": "CN_WORD",
"position": 19
},
{
"token": "香",
"start_offset": 24,
"end_offset": 25,
"type": "CN_WORD",
"position": 20
},
{
"token": "菇",
"start_offset": 25,
"end_offset": 26,
"type": "CN_WORD",
"position": 21
}
]
}
向ext.dic 添加新词汇,一个词占一行,utf-8编码
网红
主播
自媒体
雨女无瓜
蓝瘦香菇
向stop.dic 添加停用词
呀
{
"tokens": [
{
"token": "虽然",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "身处",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 1
},
{
"token": "自媒体",
"start_offset": 4,
"end_offset": 7,
"type": "CN_WORD",
"position": 2
},
{
"token": "媒体",
"start_offset": 5,
"end_offset": 7,
"type": "CN_WORD",
"position": 3
},
{
"token": "媒",
"start_offset": 5,
"end_offset": 6,
"type": "CN_WORD",
"position": 4
},
{
"token": "体",
"start_offset": 6,
"end_offset": 7,
"type": "CN_WORD",
"position": 5
},
{
"token": "时代",
"start_offset": 7,
"end_offset": 9,
"type": "CN_WORD",
"position": 6
},
{
"token": "但是",
"start_offset": 10,
"end_offset": 12,
"type": "CN_WORD",
"position": 7
},
{
"token": "网红",
"start_offset": 12,
"end_offset": 14,
"type": "CN_WORD",
"position": 8
},
{
"token": "主播",
"start_offset": 14,
"end_offset": 16,
"type": "CN_WORD",
"position": 9
},
{
"token": "播",
"start_offset": 15,
"end_offset": 16,
"type": "CN_WORD",
"position": 10
},
{
"token": "雨女无瓜",
"start_offset": 16,
"end_offset": 20,
"type": "CN_WORD",
"position": 11
},
{
"token": "瓜",
"start_offset": 19,
"end_offset": 20,
"type": "CN_WORD",
"position": 12
},
{
"token": "蓝瘦香菇",
"start_offset": 22,
"end_offset": 26,
"type": "CN_WORD",
"position": 13
},
{
"token": "蓝",
"start_offset": 22,
"end_offset": 23,
"type": "CN_WORD",
"position": 14
},
{
"token": "瘦",
"start_offset": 23,
"end_offset": 24,
"type": "CN_WORD",
"position": 15
},
{
"token": "香菇",
"start_offset": 24,
"end_offset": 26,
"type": "CN_WORD",
"position": 16
},
{
"token": "香",
"start_offset": 24,
"end_offset": 25,
"type": "CN_WORD",
"position": 17
},
{
"token": "菇",
"start_offset": 25,
"end_offset": 26,
"type": "CN_WORD",
"position": 18
}
]
}
前后对比可以看到:“网红 主播 自媒体 雨女无瓜 蓝瘦香菇”这些新词添加后,重新分词时都能被切分为独立的词条;向停用词中添加“呀”并重新分词后,该词条不再出现。符合预期结果。