ElasticSearch is a search server built on Lucene. It provides a distributed, multi-tenant full-text search engine behind a RESTful web interface. Elasticsearch is written in Java and released as open source under the Apache License. It is one of the most popular enterprise search engines today: designed for cloud deployments, it delivers near-real-time search and aims to be stable, reliable, fast, and easy to install and use.
To build a search engine, you first need a database full of data; without any data, it can hardly be called a search engine. So we start by crawling a large dataset. Here I wrote a crawler for a novel site, and we will use searching for novels as the running example.
Write the model.py file. Once it is written, call the init() function to create the mapping for the ES index.
# coding: utf-8
from elasticsearch_dsl import DocType, Completion, Text, Integer
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl.analysis import CustomAnalyzer

# 1. Create an ES connection
connections.create_connection(hosts=['127.0.0.1'])

# 2. Custom analyzer: returning an empty analysis definition lets us
# reference the ik_max_word analyzer (installed as an ES plugin) without
# elasticsearch_dsl trying to redefine it in the index settings
class MyAnalyzer(CustomAnalyzer):
    def get_analysis_definition(self):
        return {}

# Analyzer object; the lowercase filter makes matching case-insensitive
ik_analyzer = MyAnalyzer('ik_max_word', filter=['lowercase'])

# 3. Create the data Model
class NovelModel(DocType):
    # 3.1 Plain fields
    title = Text(analyzer='ik_max_word')
    author = Text(analyzer='ik_max_word')
    classify = Text()
    rate = Text()
    collect = Integer()
    number = Text()
    time = Text()
    click_week = Integer()
    click_month = Integer()
    click_all = Integer()
    collect_week = Integer()
    collect_month = Integer()
    collect_all = Integer()
    abstract = Text()
    picture = Text()
    download_url = Text()
    # 3.2 Search-suggestion field
    suggest = Completion(analyzer=ik_analyzer)

    # 3.3 Meta
    class Meta:
        # index: the index name (think of it as the database)
        index = 'alldata'
        # doc_type: the type name (think of it as the table)
        doc_type = 'novel'

if __name__ == '__main__':
    NovelModel.init()
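After running NovelModel.init() you can check that the mapping actually landed in ES. A minimal sketch using the same connection and the alldata index name from above:

from elasticsearch_dsl.connections import connections

es = connections.create_connection(hosts=['127.0.0.1'])
# Print the mapping that init() created for the alldata index
print(es.indices.get_mapping(index='alldata'))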
Write a Pipeline to store the data
A crawler project may contain more than one spider, and each spider has its own Item class, so the save logic lives on each Item class itself. Whenever an Item reaches the Pipeline, the Pipeline simply delegates to the Item, so different Item types get their own storage handling. The pipeline then has to be enabled in settings.py, as shown after the code below.
# pipelines.py
class ToEsPipeline(object):
    def process_item(self, item, spider):
        # Each Item class implements its own save_to_es()
        item.save_to_es()
        return item
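For the pipeline to run at all, it must be registered in the project's settings.py. A minimal sketch; the module path myproject.pipelines is an assumption, adjust it to your actual project name:

# settings.py ('myproject' is a hypothetical project name)
ITEM_PIPELINES = {
    'myproject.pipelines.ToEsPipeline': 300,
}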
Write the Item
import scrapy
from elasticsearch_dsl.connections import connections
from .es_model import NovelModel

# 1. Create a connection and keep the connection object
es = connections.create_connection(hosts=['http://39.107.255.196'])

# 2. Tokenize text for search suggestions
def process_suggest(index, *args):
    '''
    :param index: the ES index (think of it as the database)
    :param args: (text, weight) tuples whose text should be tokenized
    :return: a list of suggestion entries with duplicates removed
    '''
    # Words already emitted, used for deduplication
    used_words = set()
    # The suggestion list to return
    suggest = []
    for text, weight in args:
        # text: the text to tokenize
        # weight: the weight of this suggestion
        # Call the ES analyze API to tokenize the text
        words = es.indices.analyze(
            # the ES index (think of it as the database)
            index=index,
            analyzer='ik_max_word',
            # extra parameters: the lowercase token filter
            params={
                'filter': ['lowercase'],
            },
            body={
                'text': text
            }
        )
        # Collect the tokens into a set to deduplicate them
        analyzed_words = set([dic['token'] for dic in words['tokens']])
        new_words = analyzed_words - used_words
        # Append only the words we have not emitted yet
        suggest.append({'input': list(new_words), 'weight': weight})
        # Remember every word emitted so far
        used_words |= analyzed_words
    return suggest
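For intuition, this is the shape process_suggest returns: a list of dicts in the format the ES completion suggester expects. The call and the tokens below are made up for illustration; the actual tokens depend on the ik_max_word analyzer:

# Hypothetical call:
# process_suggest('alldata', ('斗破苍穹', 10), ('天蚕土豆', 8))
# returns something like:
# [{'input': ['斗破', '苍穹', '斗破苍穹'], 'weight': 10},
#  {'input': ['天蚕', '土豆', '天蚕土豆'], 'weight': 8}]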
# 3. The Item
class MyItem(scrapy.Item):
    novel_classify = scrapy.Field()
    novel_title = scrapy.Field()
    novel_author = scrapy.Field()
    novel_rate = scrapy.Field()
    novel_collect = scrapy.Field()
    novel_number = scrapy.Field()
    novel_time = scrapy.Field()
    click_all = scrapy.Field()
    click_month = scrapy.Field()
    click_week = scrapy.Field()
    collect_all = scrapy.Field()
    collect_month = scrapy.Field()
    collect_week = scrapy.Field()
    novel_abstract = scrapy.Field()
    novel_picture = scrapy.Field()
    novel_download = scrapy.Field()

    # 3.1 The save method called by the pipeline
    def save_to_es(self):
        # 3.1.1 Create a Novel Model object
        novel = NovelModel()
        # 3.1.2 Assign the plain fields
        novel.title = self['novel_title']
        novel.author = self['novel_author']
        novel.classify = self['novel_classify']
        novel.rate = self['novel_rate']
        novel.collect = self['novel_collect']
        novel.number = self['novel_number']
        novel.time = self['novel_time']
        novel.click_week = self['click_week']
        novel.click_month = self['click_month']
        novel.click_all = self['click_all']
        novel.collect_week = self['collect_week']
        novel.collect_month = self['collect_month']
        novel.collect_all = self['collect_all']
        novel.abstract = self['novel_abstract']
        novel.picture = self['novel_picture']
        novel.download_url = self['novel_download']
        # 3.1.3 Search suggestions: the title weighs more than the author
        novel.suggest = process_suggest(NovelModel._doc_type.index, (novel.title, 10), (novel.author, 8))
        # 3.1.4 Save the document to ES
        novel.save()
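To show how such an Item gets filled, here is a minimal hypothetical spider sketch. The start URL and the CSS selectors are made-up placeholders, not the actual novel site; adjust the import to your project layout:

import scrapy
from ..items import MyItem

class NovelSpider(scrapy.Spider):
    name = 'novel'
    # Placeholder URL; substitute the real novel site
    start_urls = ['http://example.com/novels']

    def parse(self, response):
        # Placeholder selectors; adapt them to the real page structure
        for node in response.css('div.novel-item'):
            item = MyItem()
            item['novel_title'] = node.css('h3 a::text').get()
            item['novel_author'] = node.css('span.author::text').get()
            # ...fill the remaining fields the same way...
            yield item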
The Django project also needs the model.py we wrote for the Scrapy project, so copy it into the Django project. The views below (views.py) implement the index page, the result page, and the search-suggestion endpoint.
import math
from redis import Redis
from urllib import parse
from datetime import datetime
from django.shortcuts import render, redirect
from django.http import JsonResponse
from elasticsearch_dsl.connections import connections
from .es_models.es_types import NovelModel
rds = Redis(host='127.0.0.1', port=6379)
es = connections.create_connection(hosts=['127.0.0.1'])
def index(request):
    # The types of data that can be searched
    navs = [
        {'type': 'novel', 'title': 'Novels'},
        {'type': 'movie', 'title': 'Movies'},
        {'type': 'job', 'title': 'Jobs'},
        {'type': 'news', 'title': 'News'},
    ]
    content = {
        'navs': navs,
        'search_type': 'novel'
    }
    if request.method == 'GET':
        return render(request, 'index.html', content)
def result(request):
    if request.method == 'GET':
        # Get the keyword and the search type
        keyword = request.GET.get('kw')
        s_type = request.GET.get('s_type')
        # Default to page 1 when no page number is given
        page_num = request.GET.get('pn', 1)
        # Without a keyword, redirect to the index page
        if not keyword:
            return redirect('index')
        # Count the keyword in the hot-search ranking
        # (redis-py 3.x signature: zincrby(name, amount, value))
        rds.zincrby('hotkey', 1, keyword)
        # Top five hot keywords (indices 0-4)
        hot_top5 = rds.zrevrange('hotkey', 0, 4)
        # Keep the last five search terms in a cookie
        history = request.COOKIES.get('history', None)
        cookie_str = ''
        if history:
            cookies = history.split(',')
            if parse.quote(keyword) in cookies:
                cookies.remove(parse.quote(keyword))
            cookies.insert(0, parse.quote(keyword))
            if len(cookies) > 5:
                cookies.pop()
            cookie_str = ','.join(cookies)
        else:
            cookie_str = parse.quote(keyword)
        # Dispatch on the search type
        if s_type == 'novel':
            # 1. The index to search
            index = 'alldata'
            # 2. The type name
            doc_type = 'novel'
            # 3. The fields to query
            fields = ['title', 'abstract']
            start_time = datetime.now()
            rs = es.search(
                index=index,
                doc_type=doc_type,
                body={
                    "query": {
                        "multi_match": {
                            "query": keyword,
                            "fields": fields
                        }
                    },
                    "from": (int(page_num) - 1) * 10,
                    "size": 10,
                    "highlight": {
                        # Wrap matches in <em> tags (assumed tag; style it in the template)
                        "pre_tags": ["<em>"],
                        "post_tags": ["</em>"],
                        "fields": {
                            "title": {},
                            "abstract": {}
                        }
                    }
                }
            )
            use_time = (datetime.now() - start_time).total_seconds()
            hits_list = []
            for hit in rs['hits']['hits']:
                h_dic = {}
                # Prefer the highlighted fragment, fall back to the raw source
                highlight = hit.get('highlight', {})
                if 'title' in highlight:
                    h_dic['title'] = highlight['title'][0]
                else:
                    h_dic['title'] = hit['_source']['title']
                if 'abstract' in highlight:
                    h_dic['abstract'] = highlight['abstract'][0]
                else:
                    h_dic['abstract'] = hit['_source']['abstract']
                h_dic['detail_url'] = hit['_source']['download_url'][0]
                hits_list.append(h_dic)
            navs = [
                {'type': 'novel', 'title': 'Novels'},
                {'type': 'job', 'title': 'Jobs'},
                {'type': 'movie', 'title': 'Movies'},
                {'type': 'news', 'title': 'News'},
            ]
            # Total number of hits
            total = rs['hits']['total']
            # Number of pages, rounded up
            page_nums = math.ceil(total / 10)
            page_num = int(page_num)
            # Show a window of at most ten page links around the current page
            if page_nums <= 10:
                pages = range(1, page_nums + 1)
            elif page_num - 4 <= 0:
                pages = range(1, 11)
            elif page_num + 5 >= page_nums:
                pages = range(page_nums - 9, page_nums + 1)
            else:
                pages = range(page_num - 4, page_num + 6)
            content = {
                'hits': hits_list,
                'kw': keyword,
                'use_time': use_time,
                'total': total,
                'page_nums': page_nums,
                'navs': navs,
                'search_type': s_type,
                'pages': pages,
                'history': parse.unquote(cookie_str).split(','),
                'hot_top5': hot_top5
            }
            response = render(request, 'result.html', content)
            response.set_cookie('history', cookie_str)
            return response
def suggest(request):
    if request.method == 'GET':
        # Get the typed text and the search type
        s = request.GET.get('s', None)
        s_type = request.GET.get('s_type')
        content = {}
        if s:
            # Query ES with the typed text and the search type
            datas = get_suggest(s, s_type)
            content['status'] = 0
            content['datas'] = datas
            content['s_type'] = s_type
            if len(datas) == 0:
                content['status'] = -1
        else:
            content['status'] = -1
        return JsonResponse(content)
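The endpoint is meant to be called from the search box via AJAX as the user types. A sketch of what a successful response looks like; the route and the suggestion strings are made up for illustration:

# GET /suggest/?s=斗破&s_type=novel   (hypothetical route)
# -> {"status": 0, "datas": ["斗破苍穹", "斗破之巅峰"], "s_type": "novel"}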
# Query suggestion data from ES
def get_suggest(keyword, s_type):
    '''
    :param keyword: the search keyword
    :param s_type: the search type
    :return: the list of suggestions
    '''
    # Create a Search object for the matching model
    if s_type == 'novel':
        search = NovelModel.search()
    elif s_type == 'job':
        # Not implemented yet
        pass
    # suggest() is the search-suggestion interface:
    # 1. a self-chosen key for the result
    # 2. the search keyword
    result = search.suggest(
        'r_suggest',
        keyword,
        completion={
            'field': 'suggest',
            'fuzzy': {
                'fuzziness': 2
            },
            'size': 5
        }
    )
    # execute_suggest() returns a dict-like response
    s = result.execute_suggest()
    fields = {'novel': 'title'}
    # The result list
    datas = []
    for dic in s['r_suggest'][0]['options']:
        sug = dic._source[fields[s_type]]
        datas.append(sug)
    # Return the suggestions
    return datas
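Finally, the three views need URL routes. A minimal urls.py sketch assuming Django 2+ (on 1.x use url() instead); the app module name search is an assumption:

# urls.py (the app name 'search' is hypothetical)
from django.urls import path
from search import views

urlpatterns = [
    path('', views.index, name='index'),
    path('result/', views.result, name='result'),
    path('suggest/', views.suggest, name='suggest'),
]

The name='index' route is what redirect('index') in the result view resolves to.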