Note: this article uses Jupyter for the demonstration.
(1) Install Jupyter Notebook
pip install jupyter notebook
Then run the jupyter notebook command in your working directory to start it.
(2) Install whoosh
pip install whoosh
(3) Install the jieba tokenizer, currently the most popular Chinese word segmenter
pip install jieba
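Before wiring jieba into whoosh, it helps to see what the segmenter produces on its own. A minimal sketch (the sample sentence is just an illustration, not from the indexed data):

import jieba

# precise mode (cut_all=False); join the tokens with '/' to inspect them
print('/'.join(jieba.cut('南国足球和北欧足球', cut_all=False)))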
import pandas as pd

data = [{'T': '1000', 'd1': '国足', 'd2': 1},
        {'T': '1100', 'd1': '女足', 'd2': 1},
        {'T': '1200', 'd1': '欧洲', 'd2': 2},
        {'T': '1400', 'd1': '亚欧', 'd2': 1},
        {'T': '1500', 'd1': '欧盟', 'd2': 1},
        {'T': '1600', 'd1': '国瑞', 'd2': 1},
        {'T': '1700', 'd1': '南国', 'd2': 1},
        {'T': '1800', 'd1': '北欧', 'd2': 1}
        ]
frame = pd.DataFrame(data)
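Note that pandas infers d2 as an integer column, which matters later because whoosh text fields expect strings; you can confirm the dtypes with:

# T and d1 come out as object (string) columns, d2 as int64
print(frame.dtypes)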
from whoosh.fields import Schema, TEXT, ID
from jieba.analyse import ChineseAnalyzer

# jieba's Chinese analyzer for whoosh
analyzer = ChineseAnalyzer()
schema = Schema(T=ID(stored=True, unique=True),
                d1=TEXT(stored=True, analyzer=analyzer),
                d2=TEXT(stored=True)
                )
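Whoosh analyzers are callable, so you can preview how a d1 value will be tokenized before anything is indexed. A quick sketch (the input string is illustrative):

# each Token object carries the extracted term in its .text attribute
print([token.text for token in analyzer('北欧国家的足球')])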
import os.path
from whoosh.index import create_in

# directory where the index files will be stored
index_path = '/home/sjp/gitproject/test_data'
if not os.path.exists(index_path):
    os.mkdir(index_path)
ix = create_in(index_path, schema)
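One caveat: create_in always builds a fresh, empty index, discarding anything already in that directory. If the cell may be re-run, whoosh's exists_in check avoids clobbering an existing index; a sketch:

from whoosh import index

# only create the index on the first run; otherwise reuse it
if index.exists_in(index_path):
    ix = index.open_dir(index_path)
else:
    ix = create_in(index_path, schema)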
from whoosh import index

# reopen the index created above (the path must match the one used in create_in)
ix = index.open_dir(index_path)
with ix.writer() as writer:
    for _, row in frame.iterrows():
        # whoosh fields expect text, so stringify every value (unicode() in Python 2)
        elements = {key: str(value) for key, value in row.items()}
        # update_document (rather than add_document) lets this cell be re-run:
        # rows with an existing unique T are replaced instead of duplicated
        writer.update_document(**elements)
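A quick sanity check that the writes landed (the writer commits automatically when its context exits); doc_count() should match the number of DataFrame rows, eight here:

print(ix.doc_count())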
from whoosh.qparser import MultifieldParser

ix = index.open_dir(index_path)
# MultifieldParser searches across several fields at once
query_parser = MultifieldParser(['T', 'd1', 'd2'], schema=ix.schema)
# query_parser.add_plugin(FuzzyTermPlugin())  # enables fuzzy queries
search_pattern = query_parser.parse(u'国')
with ix.searcher() as searcher:
    # limit=None would return every match; here the hits are capped at 20
    result = searcher.search(search_pattern, limit=20)
    # paginated alternative:
    # result = searcher.search_page(search_pattern, 1, pagelen=20)
    for hit in result:
        print(hit.fields())
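For completeness, here is the fuzzy-query variant from the comment spelled out: FuzzyTermPlugin enables the trailing ~ syntax, and search_page swaps limit for page-based retrieval. The query string and edit distance below are illustrative, not from the original run:

from whoosh.qparser import FuzzyTermPlugin

query_parser.add_plugin(FuzzyTermPlugin())
# '国足~1' matches terms within edit distance 1 of 国足
fuzzy_pattern = query_parser.parse(u'国足~1')
with ix.searcher() as searcher:
    # page 1, up to 20 hits per page
    page = searcher.search_page(fuzzy_pattern, 1, pagelen=20)
    for hit in page:
        print(hit.fields())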
Postscript: as the saying goes, get the program running first, then read the docs and optimize; in my view that is the fastest way to learn.
So here is the Chinese documentation for reference: https://www.osgeo.cn/whoosh/schema.html#modifying-the-schema-after-indexing