pandas 读取 Elasticsearch|Neo4j |Kafka |Flume |Logstash

pandas 读取 Elasticsearch|Neo4j |Kafka |Flume |Logstash_第1张图片
image.png

发现一个 牛逼的 hub
https://github.com/peter-woyzbun/querygraph

Sqlite
MySQL
Postgres
Mongo Db
Elastic Search
Apache Cassandra (untested)
Maria Db (untested)
InfluxDB (untested)
MS Sql (untested)
Neo4j (untested)

pandas读取 ElasticSearch 有现成的 包,还是比较简单
首先 ElasticSearch 官方也是支持 python的

  • elasticsearch-py (Official)
  • Elasticsearch-SQL
  • pyelasticsearch
  • pyes

我就简单 搬运 github的 readme 过来

DataFrame API
A DataFrame object accesses Elasticsearch data with high level operations. It is type-safe, easy-to-use and Pandas-flavored.

# Create a DataFrame object
from pandasticsearch import DataFrame
df = DataFrame.from_es(url='http://localhost:9200', index='people')

# Print the schema(mapping) of the index
df.print_schema()
# company
# |-- employee
#   |-- name: {'index': 'not_analyzed', 'type': 'string'}
#   |-- age: {'type': 'integer'}
#   |-- gender: {'index': 'not_analyzed', 'type': 'string'}

# Inspect the columns
df.columns
#['name', 'age', 'gender']

# Denote a column
df.name
# Column('name')
df['age']
# Column('age')

# Projection
df.filter(df.age < 25).select('name', 'age').collect()
# [Row(age=12,name='Alice'), Row(age=11,name='Bob'), Row(age=13,name='Leo')]

# Print the rows into console
df.filter(df.age < 25).select('name').show(3)
# +------+
# | name |
# +------+
# | Alice|
# | Bob  |
# | Leo  |
# +------+

# Convert to Pandas object for subsequent analysis
df[df.gender == 'male'].agg(df.age.avg).to_pandas()
#    avg(age)
# 0        12

# Translate the DataFrame to an ES query (dictionary)
df[df.gender == 'male'].agg(df.age.avg).to_dict()
# {'query': {'filtered': {'filter': {'term': {'gender': 'male'}}}}, 'aggregations': {'avg(birthYear)':
# {'avg': {'field': 'birthYear'}}}, 'size': 0}

Filter

# Filter by a boolean condition
df.filter(df.age < 13).collect()
# [Row(age=12,gender='female',name='Alice'), Row(age=11,gender='male',name='Bob')]

# Filter by a set of boolean conditions
df.filter(df.age < 13 & df.gender == 'male').collect()
# Row(age=11,gender='male',name='Bob')]

# Filter by a wildcard (sql `like`)
df.filter(df.name.like('A*')).collect()
# [Row(age=12,gender='female',name='Alice')]

# Filter by a regular expression (sql `rlike`)
df.filter(df.name.rlike('A.l.e')).collect()
# [Row(age=12,gender='female',name='Alice')]

# Filter by a prefixed string pattern
df.filter(df.name.startswith('Al')).collect()
# [Row(age=12,gender='female',name='Alice')]

# Filter by a script
from pandasticsearch.operators import ScriptFilter
df.filter(ScriptFilter('2016 - doc["age"].value > 1995')).collect()
# [Row(age=12,name='Alice'), Row(age=13,name='Leo')]

5.0 compatibility: By default, pandasticsearch use filtered query (deprecated since 5.0). To use pandasticsearch against the latest ES version, a compat arg can be passed to from_es:

df = DataFrame.from_es(url='http://localhost:9200', index='people', compat=5)

Aggregation

# Aggregation
df[df.gender == 'male'].agg(df.age.avg).collect()
# [Row(avg(age)=12)]

# Metric alias
df[df.gender == 'male'].agg(df.age.avg.alias('avg_age')).collect()
# [Row(avg_age=12)]

# Groupby only (will give the `doc_count`)
df.groupby('gender').collect()
# [Row(doc_count=1), Row(doc_count=2)]

# Groupby and then aggregate
df.groupby('gender').agg(df.age.max).collect()
# [Row(doc_count=1, max(age)=12), Row(doc_count=2, max(age)=13)]

# Group by a set of ranges
df.groupby(df.age.ranges([10,12,14])).to_pandas()
#                   doc_count
# range(10,12,14)
# 10.0-12.0                 2
# 12.0-14.0                 1

# Advanced ES aggregation
df.groupby(df.gender).agg(df.age.stats).to_pandas()
df.agg(df.age.extended_stats).to_pandas()
df.agg(df.age.percentiles).to_pandas()
df.groupby(df.date.date_interval('1d')).to_pandas()

# Customized aggregation terms
df.groupby(df.age.terms(size=5, include=[1, 2, 3]))

Sort

# Sort
df.sort(df.age.asc).select('name', 'age').collect()
# [Row(age=11,name='Bob'), Row(age=12,name='Alice'), Row(age=13,name='Leo')]

# Sort by a script
from pandasticsearch.operators import ScriptSorter
df.sort(ScriptSorter('doc["age"].value * 2')).collect()
# [Row(age=11,name='Bob'), Row(age=12,name='Alice'), Row(age=13,name='Leo')]

build Query

from pandasticsearch import DataFrame
body = df[df['gender'] == 'male'].agg(df['age'].avg).to_dict()
 
from elasticsearch import Elasticsearch
result_dict = es.search(index="recruit", body=body)

Parse Result

from elasticsearch import Elasticsearch
es = Elasticsearch('http://localhost:9200')
result_dict = es.search(index="recruit", body={"query": {"match_all": {}}})

from pandasticsearch import Select
pandas_df = Select.from_dict(result_dict).to_pandas()

pandas 读取 Neo4j

import time
from py2neo import Graph
from pandas import DataFrame

start = time.time()

graph = Graph("http://neo4j:mypassword@:7474/db/data/")

df = DataFrame(graph.data("""
            MATCH (start: Person{name: 'A'}), (end: Person{name: 'D'}),
            path = (start)-[*]->(end)
            WITH EXTRACT(person IN NODES (path) | person.name) AS allpath
            RETURN allpath;
"""))

print df

end = time.time() - start

print ("\ntime:{0}".format(end) + "[sec]")

import pandas as pd
from py2neo import Graph,Node
import pkg_resources as pr
import sys
import csv2graphdb as cdb

print "[main] - connecting to db..."
g = cdb.connectdb()

print "[main] - now getting df"
df = cdb.get_entities_graphdb(g,"Party","Party","AAP")
print df
print "[main] - COMPLETED"
df.to_csv("neo4j_result.csv")

import pandas as pd
import py2neo

from querygraph.db.interface import DatabaseInterface
from querygraph.db.type_converter import TypeConverter


class Neo4j(DatabaseInterface):

    TYPE_CONVERTER = TypeConverter()

    def __init__(self, name, host, user, password):
        self.host = host
        self.user = user
        self.password = password

        DatabaseInterface.__init__(self,
                                   name=name,
                                   db_type='Neo4j',
                                   conn_exception=py2neo.database.status.Unauthorized,
                                   execution_exception=py2neo.database.status.DatabaseError,
                                   type_converter=self.TYPE_CONVERTER)

    def _conn(self):
        return py2neo.Graph(bolt=True, host=self.host, user=self.user, password=self.password)

    def _execute_query(self, query):
        graph = self.conn()
        df = pd.DataFrame(graph.data(query))
        return df

    def execute_insert_query(self, query):
        graph = self.conn()
        graph.run(query)
from py2neo import Graph, authenticate
import pandas as pa
import matplotlib as plt


VERBOSE = True
SAVE = False

def connect_to_db():
    authenticate("localhost:7474", "neo4j", "social")
    graph = Graph()
    return graph


def get_histogram_of_relation():
    query = "MATCH (n) RETURN n.name AS name, SIZE((n)-[:FACEBOOK]->()) AS FACEBOOK,SIZE((n)-[:GOOGLE]->()) AS GOOGLE,SIZE((n)-[:LINKEDIN]->()) AS LINKEDIN"
    cursor = graph.run(query)
    # http://py2neo.org/v3/database.html#py2neo.database.Cursor
    # chose between DataFrame(graph.run("MATCH (a:Person) RETURN a.name, a.born LIMIT 4").data()) (dataframe (dict)) or navigator)
    # and for record in cursor or while cursor.next()

    df= pa.DataFrame(cursor)
    # add columns with sum of relations per individual
    df['TOTAL']= df['FACEBOOK']+df['GOOGLE']+df['LINKEDIN']


    # build histogram
    plt.figure()


    cursor.close()

pandas 操作 Kafka

import json
import pandas as pd
from kafka import KafkaProducer
from kafka.errors import KafkaError

df = pd.read_csv('realAWSCloudwatch/ec2_cpu_utilization_5f5533.csv')

print(len(df['timestamp']))
print(len(df['value']))

producer = KafkaProducer(bootstrap_servers=['152.46.19.55:9092'], value_serializer=lambda m: json.dumps(m).encode('utf-8'))
for index, row in df.iterrows():
   producer.send('cpu-util', { 'timestamp' : row['timestamp'], 'value' : row['value'] })
   print(row['timestamp'])

pandas 读取写入 flume

https://github.com/lsjostro/flumelogger

pandas 读取 logstash
使用现成的logstash 包 ,结合用就可以了

其中最受欢迎的是
https://github.com/vklochan/python-logstash

logstash-easy

logstash

https://github.com/israel-fl/python3-logstash

你可能感兴趣的:(pandas 读取 Elasticsearch|Neo4j |Kafka |Flume |Logstash)