elasticsearch使用python构建本地代码仓库索引

本文介绍如何快速上手 Elasticsearch 的 Python 客户端 pyes,并用它为自己的本地代码仓库建立索引。

elasticsearch使用python构建本地代码仓库索引_第1张图片
最终效果

集群搭建可直接使用笔者整理的开箱即用集成版本:
https://github.com/full-stack-engineer/elasticsearch-integrated

# -*- coding: utf-8 -*-
import os
import sys
from pyes import *

# Name of the Elasticsearch index that receives the source files.
INDEX_NAME = "javafiles"
# Alias registered for INDEX_NAME once indexing has finished.
INDEX_ALIAS = "javafiles_alias"
# Document type under which every indexed file is stored.
TYPE_NAME = 'code'


class IndexFiles(object):
    """Rebuild the ``INDEX_NAME`` index from the ``.java`` files under a directory.

    Instantiating the class performs the whole job: it connects to the
    Elasticsearch node at ``127.0.0.1:9200``, recreates the index, installs
    the field mapping, indexes every matching file and finally registers
    ``INDEX_ALIAS`` for the index.
    """

    def __init__(self, root):
        """Recreate the index and fill it with the files found under *root*.

        :param root: directory that is walked recursively for ``.java`` files.
        """
        conn = ES('127.0.0.1:9200', timeout=3.5)  # connect to ES
        try:
            conn.indices.delete_index(INDEX_NAME)
        except Exception:
            # Best effort, as in the original: a failed delete (presumably
            # because the index does not exist yet) must not abort the run.
            # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit
            # propagate.
            pass
        conn.indices.create_index(INDEX_NAME)  # create a fresh index

        # Storage layout shared by all three fields: analyzed and stored
        # strings, using the "ik" analyzer for both indexing and searching,
        # with term vectors that carry positions and offsets.
        # NOTE(review): "ik" is presumably an analysis plugin that has to be
        # installed on the cluster -- confirm.
        field_spec = {
            'boost': 1.0,
            'index': 'analyzed',
            'store': 'yes',
            'type': u'string',
            "indexAnalyzer": "ik",
            "searchAnalyzer": "ik",
            "term_vector": "with_positions_offsets",
        }
        # Each field gets its own copy so the per-field dicts stay independent.
        mapping = dict(
            (field, dict(field_spec))
            for field in (u'content', u'name', u'dirpath')
        )

        # Register the mapping for TYPE_NAME on the new index.
        conn.indices.put_mapping(TYPE_NAME, {'properties': mapping}, [INDEX_NAME])

        self.addIndex(conn, root)
        conn.indices.add_alias(INDEX_ALIAS, INDEX_NAME)
        conn.default_indices = [INDEX_NAME]  # set the default index
        conn.indices.refresh()  # refresh to pick up the newly inserted documents

    def addIndex(self, conn, root):
        """Index every non-empty ``.java`` file found below *root*.

        Each file becomes one document with the keys ``name`` (file name),
        ``dirpath`` (directory containing the file) and ``content`` (file
        text decoded as UTF-8). Empty files are reported and skipped. Any
        error raised while reading or indexing a file is printed and the
        walk continues with the next file.

        :param conn: connection object whose ``index(doc, index, type)``
            method stores a document.
        :param root: directory to walk recursively.
        """
        print(root)
        for dirpath, _dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.java'):
                    continue
                # Same output as the original two-argument print statement.
                print("%s %s" % ("Indexing file ", filename))
                try:
                    path = os.path.join(dirpath, filename)
                    # Read raw bytes and decode explicitly: behaves the same
                    # on Python 2 and 3, and ``with`` closes the handle even
                    # when reading or decoding fails.
                    with open(path, 'rb') as handle:
                        contents = handle.read().decode('utf-8')
                    if contents:
                        conn.index(
                            {'name': filename, 'dirpath': dirpath, 'content': contents},
                            INDEX_NAME,
                            TYPE_NAME,
                        )
                    else:
                        # The original passed ``path`` as a second print
                        # argument, so the ``%s`` was never substituted.
                        print('no contents in file %s' % path)
                except Exception as e:
                    # Deliberate best effort: report the problem and go on.
                    print(e)


if __name__ == '__main__':
    # Index the directory given as the first command-line argument; without
    # one, fall back to the original hard-coded location.
    IndexFiles(sys.argv[1] if len(sys.argv) > 1 else '/Users/xxx/Projects')

你可能感兴趣的:(elasticsearch使用python构建本地代码仓库索引)