from datetime import datetime
from elasticsearch_dsl import DocType, Date, Nested, Boolean,analyzer, InnerObjectWrapper, Completion, Keyword, Text, Integer
from elasticsearch_dsl.connections import connections
# Register the default Elasticsearch connection. This step is essential: the
# document class below and any later save()/init() call rely on it.
connections.create_connection(
    hosts=["localhost"],
)
# Document class: subclass DocType and declare one attribute per field, using
# the field types imported from elasticsearch_dsl (Text, Keyword, Date, ...).
class LagouType(DocType):
    """Mapping for one job posting stored in the ``lagou`` index.

    ``Text`` fields are declared with the ``ik_max_word`` analyzer, ``Keyword``
    fields carry no analyzer argument, and ``crawl_time`` is a ``Date``.
    NOTE(review): ``ik_max_word`` presumably comes from the IK analysis plugin,
    which must then be installed on the cluster -- confirm.
    """

    # Full-text fields.
    job_name = Text(analyzer="ik_max_word")
    company = Text(analyzer="ik_max_word")

    # Identifier fields.
    url = Keyword()
    job_id = Keyword()

    salary = Text(analyzer="ik_max_word")
    city = Keyword()
    experience = Text(analyzer="ik_max_word")
    education = Text(analyzer="ik_max_word")
    job_type = Keyword()
    label = Text(analyzer="ik_max_word")
    job_benefit = Text(analyzer="ik_max_word")
    job_description = Text(analyzer="ik_max_word")
    addr = Text(analyzer="ik_max_word")

    # Stored as analyzed text, not as a Date.
    publish_time = Text(analyzer="ik_max_word")
    crawl_time = Date()

    # The nested class must be named Meta; it supplies the index name and the
    # doc type (the "table") that this document class is bound to.
    class Meta:
        index = "lagou"
        doc_type = "job"
if __name__ == "__main__":
    # Running this module as a script calls init() to create the mappings
    # declared on LagouType.
    LagouType.init()
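# Option 1: write the indexing logic directly in the pipeline. Drawback: not
# every crawled item is necessarily stored in Elasticsearch (or in any single
# database), and items carry different fields, so this approach easily gets
# messy and is hard to configure.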
# Pipeline that indexes the item itself.
class Elasticsearch_pipeline(object):
    """Scrapy item pipeline that copies each item into a ``LagouType`` document."""

    def __init__(self):
        pass

    def process_item(self, item, spider):
        """Index ``item`` and hand it on unchanged.

        Every listed field is read with ``item[field]``, so a missing key
        raises ``KeyError`` before anything is saved. ``spider`` is unused.
        """
        field_names = (
            'job_name',
            'company',
            'url',
            'job_id',
            'salary',
            'city',
            'experience',
            'education',
            'job_type',
            'label',
            'job_benefit',
            'job_description',
            'addr',
            'publish_time',
            'crawl_time',
        )
        document = LagouType()
        for field_name in field_names:
            setattr(document, field_name, item[field_name])
        document.save()
        return item
# Option 2: define a save_to_elasticsearch() method on the item and call that
# method from the pipeline, which makes each item type configurable on its own.
# Item method.
# NOTE(review): presumably defined on the Scrapy Item subclass for Lagou jobs;
# the enclosing class is not shown here -- confirm.
def save_to_elasticsearch(self):
    """Copy this item's fields into a new ``LagouType`` document and save it.

    Every listed field is read with ``self[field]``, so a missing key raises
    ``KeyError`` before anything is saved.
    """
    field_names = (
        'job_name',
        'company',
        'url',
        'job_id',
        'salary',
        'city',
        'experience',
        'education',
        'job_type',
        'label',
        'job_benefit',
        'job_description',
        'addr',
        'publish_time',
        'crawl_time',
    )
    # Build a fresh document instance (instantiation, not inheritance).
    document = LagouType()
    for field_name in field_names:
        setattr(document, field_name, self[field_name])
    document.save()
# Pipeline that delegates to the item.
class Elasticsearch_pipeline(object):
    """Scrapy item pipeline that lets each item store itself in Elasticsearch."""

    def __init__(self):
        """Nothing to set up."""

    def process_item(self, item, spider):
        """Call ``item.save_to_elasticsearch()`` and return the item.

        ``spider`` is unused.
        """
        # The item decides how it is indexed; the pipeline only triggers it.
        item.save_to_elasticsearch()
        return item
# Enable the item pipeline in the Scrapy settings. The key is the dotted path
# of the pipeline class; the integer is the value Scrapy uses to order
# pipelines when several are enabled.
ITEM_PIPELINES = {
    'lagou_spider.pipelines.Elasticsearch_pipeline': 1,
}
class Elasticsearch_Option:
    """Example query-DSL searches against the ``lagou`` index.

    Each ``*_option`` method sends one kind of query and returns the raw
    search response. The methods from ``match_phrase_option`` onwards also
    print the ``_source`` of every hit.

    Note 1: mind the letter case. During analysis the Elasticsearch analyzer
    lowercases all terms, so queries whose input is not analyzed must be
    written in lowercase.
    """

    # Index that every example query runs against.
    index_name = "lagou"

    def __init__(self, client=None):
        """Create the helper.

        Args:
            client: Optional object exposing ``search(index=..., body=...)``.
                When omitted, the default ``elasticsearch_dsl`` connection is
                looked up on first use.
        """
        self._client = client

    def _search(self, body):
        """Run ``body`` against the index and return the raw response."""
        if self._client is None:
            # The name ``Elasticsearch`` was never imported here; reuse the
            # connection registered through elasticsearch_dsl instead of
            # building a new client on every call.
            from elasticsearch_dsl.connections import connections

            self._client = connections.get_connection()
        return self._client.search(index=self.index_name, body=body)

    @staticmethod
    def _print_sources(response):
        """Print the ``_source`` of every hit; note the response structure."""
        for hit in response['hits']['hits']:
            print(hit['_source'])

    def match_option(self):
        """``match``: the query text is analyzed; documents matching the
        resulting terms are returned."""
        return self._search({
            "query": {
                "match": {
                    'title': 'C++后端工程师'
                }
            }
        })

    def term_option(self):
        """``term``: the given value is NOT analyzed."""
        return self._search({
            "query": {
                "term": {
                    'salary_min': '2000000'
                }
            }
        })

    def terms_option(self):
        """``terms``: accepts a list; documents matching any listed value
        are returned."""
        return self._search({
            "query": {
                "terms": {
                    # Be very careful with letter case here.
                    'title': ['python', 'java', 'c++']
                }
            }
        })

    def from_size_option(self):
        """``from`` / ``size``: offset and page size of the returned hits."""
        return self._search({
            "query": {
                "match": {
                    'title': '工程师'
                }
            },
            "from": 0,
            "size": 4
        })

    def match_all_option(self):
        """``match_all``: match every document."""
        return self._search({
            "query": {
                "match_all": {
                }
            }
        })

    def match_phrase_option(self):
        """``match_phrase``: phrase query; prints each hit's source."""
        response = self._search({
            "query": {
                "match_phrase": {
                    "title": 'python研发工程师'
                }
            }
        })
        self._print_sources(response)
        return response

    def multi_match_option(self):
        """``multi_match``: one query text searched across several fields;
        prints each hit's source."""
        response = self._search({
            "query": {
                "multi_match": {
                    "query": "深圳",
                    # A document is returned when any of these fields
                    # contains the query keywords.
                    "fields": ['title', 'city']
                }
            }
        })
        self._print_sources(response)
        return response

    def sort_option(self):
        """Sorting: order all documents by one field; prints each hit's
        source."""
        response = self._search({
            "query": {
                "match_all": {}
            },
            "sort": {
                # Under "sort", name the field to sort by first.
                "comment": {
                    "order": "asc"
                }
            }
        })
        self._print_sources(response)
        return response

    def range_option(self):
        """``range``: gte = greater or equal, gt = greater than, lte = less
        or equal, lt = less than, boost = weight; prints each hit's source."""
        response = self._search({
            "query": {
                "range": {
                    # Under "range", name the field being bounded.
                    "comment": {
                        "gt": 15,
                        "lt": 20
                    }
                }
            }
        })
        self._print_sources(response)
        return response

    def wildcard_option(self):
        """``wildcard``: fuzzy matching with wildcards; prints each hit's
        source."""
        response = self._search({
            "query": {
                "wildcard": {
                    # Under "wildcard", name the field being matched.
                    "title": {
                        "value": "pyth*n"  # "*" is the wildcard
                    }
                }
            }
        })
        self._print_sources(response)
        return response

    def bool_option(self):
        """``bool``: combine clauses; prints each hit's source.

        * ``filter``: filters on a field without taking part in scoring.
        * ``must``: every condition in the list must hold ("and").
        * ``should``: one or more conditions in the list hold ("or").
        * ``must_not``: none of the conditions in the list may hold ("not").
        """
        response = self._search({
            "query": {
                "bool": {
                    "must": [{
                        "match_all": {}
                    }],
                    "filter": {
                        "term": {
                            "title": '工程师'
                        }
                    },
                    "must_not": [{
                        "match": {
                            "comment": 16
                        }
                    }],
                    "should": [{
                        "match": {
                            "title": 'c'
                        }
                    }]
                }
            }
        })
        self._print_sources(response)
        return response