Download PipelineDB
rpm -ivh --prefix=~/pipe/pipelinedb pipelinedb-<version>.rpm
# (The install directory here is shorthand; customize it to your own needs. Below, ~/pipe/pipelinedb is referred to as the install directory.)
-D  the database instance (data) directory
-l  the log file path
You can create data and log directories under the install directory and pass them to the commands below.
pipeline-init -D <data directory>
pipeline-ctl -D <data directory> -l pipelinedb.log start
(The first initialization of the database requires -D to specify the data directory.)
pg_ctl start/stop
pipeline-ctl -D <data directory> start/stop
(Here too, -D specifies the data directory.)
Both forms work; the latter behaves the same as PostgreSQL's pg_ctl.
psql -p 5432 -h localhost pipeline  (pipeline is the default database)
alter user pipe with password 'pipeline';  (sets a password for the user pipe)
Two configuration files need the same edits as in PostgreSQL: postgresql.conf and pg_hba.conf.
Restart the server after editing for the changes to take effect.
In postgresql.conf, set: listen_addresses = '*'
TYPE    DATABASE    USER    ADDRESS      METHOD
In pg_hba.conf, add:  host    all    all    0.0.0.0/0    trust
Streams are what allow clients to push data into continuous views. A stream row, or simply an event, looks just like a regular table row,
and the interface for writing data to a stream is identical to that for writing to a table.
However, events only exist in a stream until they have been consumed by all of the continuous views that read from that stream.
Even then, users cannot SELECT from a stream; streams serve exclusively as inputs to continuous views.
CREATE STREAM stream_name ( [
{ column_name data_type [ COLLATE collation ] | LIKE parent_stream } [, ... ] ] )
CREATE STREAM stream (x integer, y integer);
ALTER STREAM stream ADD COLUMN z integer;
INSERT INTO stream (x, y, z) VALUES (0, 1, 2), (3, 4, 5), (6, 7, 8);
# Prepared INSERT statements can perform the same multi-row inserts as above
PREPARE write_to_stream AS INSERT INTO stream (x, y, z) VALUES ($1, $2, $3);
EXECUTE write_to_stream(0, 1, 2);
EXECUTE write_to_stream(3, 4, 5);
EXECUTE write_to_stream(6, 7, 8);
A continuous view is much like a regular view, except that it selects from a combination of streams and tables as its inputs,
and it is updated incrementally in real time as new data is written to those inputs.
Once a stream row has been read by all the continuous views that need to read it, it is discarded; it is not stored anywhere.
The only data persisted for a continuous view is whatever a SELECT from that view would return.
A continuous view can therefore be thought of as a very high-throughput, real-time materialized view.
CREATE CONTINUOUS VIEW name AS query
# Create a continuous view
DROP CONTINUOUS VIEW name
# Drop a continuous view
SELECT truncate_continuous_view('name');
# Remove a continuous view's accumulated data without dropping the view itself
SELECT * FROM pipeline_views();
# List all continuous views
SELECT activate('continuous_view_or_transform');
SELECT deactivate('continuous_view_or_transform');
# Enable or disable a continuous view or transform; each function takes a single continuous view or transform name
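Putting these commands together, a minimal sketch of the lifecycle (the view name clicks_cv is made up for illustration; stream is the stream created above):
CREATE CONTINUOUS VIEW clicks_cv AS SELECT count(*) FROM stream;
SELECT deactivate('clicks_cv');                -- stop consuming new events
SELECT activate('clicks_cv');                  -- resume consuming
SELECT truncate_continuous_view('clicks_cv');  -- clear accumulated results, keep the view
DROP CONTINUOUS VIEW clicks_cv;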
In this example we compute some basic statistics over a day of Wikipedia page-view data.
Each record in the dataset contains hourly page-view statistics for one Wikipedia page.
The dataset can be downloaded if you want to inspect it.
psql -h localhost -p 5432 -d pipeline -c "
CREATE STREAM wiki_stream(hour timestamp,project text,title text,view_count bigint,size bigint);
CREATE CONTINUOUS VIEW wiki_stats AS
SELECT hour, project,
count(*) AS total_pages,
sum(view_count) AS total_views,
min(view_count) AS min_views,
max(view_count) AS max_views,
avg(view_count) AS avg_views,
percentile_cont(0.99) WITHIN GROUP (ORDER BY view_count) AS p99_views,
sum(size) AS total_bytes_served
FROM wiki_stream
GROUP BY hour, project;"
curl -sL http://pipelinedb.com/data/wiki-pagecounts | gunzip | \
psql -h localhost -p 5432 -d pipeline -c "
COPY wiki_stream (hour, project, title, view_count, size) FROM STDIN"
psql -h localhost -p 5432 -d pipeline -c "
SELECT * FROM wiki_stats ORDER BY total_views DESC";
(From the PipelineDB docs)
The difference between a view and a transform is that a continuous view's results are stored in the database,
while a transform stores nothing; its results can only be shipped elsewhere by a trigger (output function).
A continuous view can therefore be queried, e.g. from a stored procedure, but a transform cannot be, since its results have already been redirected.
There is not much detail here; a more suitable worked example will be filled in later. In the meantime, see the sketch below.
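A minimal sketch of a continuous transform, following the CREATE CONTINUOUS TRANSFORM ... THEN EXECUTE PROCEDURE syntax from the PipelineDB docs; the target stream other_stream and the filter here are assumptions for illustration:
-- Forward events with x > 10 from stream into other_stream, using the
-- built-in pipeline_stream_insert output function (assumed available here)
CREATE STREAM other_stream (x integer);
CREATE CONTINUOUS TRANSFORM big_x AS
  SELECT x::integer FROM stream WHERE x > 10
  THEN EXECUTE PROCEDURE pipeline_stream_insert('other_stream');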
The use of sliding windows is the key point.
Because continuous views are updated continuously, the database can take the current time into account when updating their results. A query whose WHERE clause contains a time component relative to the current time is called a sliding-window query; the set of events that such a sliding WHERE clause accepts or filters out changes as time passes.
clock_timestamp()
A built-in function that always returns the current timestamp.
arrival_timestamp
A special attribute of all incoming events, containing the time at which the database received them (see arrival ordering).
There is no need to add a WHERE clause referencing these values explicitly; the database does this internally. You only need to specify the sw storage parameter in the continuous view definition.
CREATE CONTINUOUS VIEW recent_users WITH (sw = '1 minute') AS
SELECT user_id::integer FROM stream;
This is equivalent to:
CREATE CONTINUOUS VIEW recent_users AS
SELECT user_id::integer FROM stream
WHERE (arrival_timestamp > clock_timestamp() - interval '1 minute');
-- Average temperature per sensor over the last 5 minutes
CREATE CONTINUOUS VIEW sensor_temps WITH (sw = '5 minutes') AS
SELECT sensor::integer, AVG(temp::numeric) FROM sensor_stream
GROUP BY sensor;
-- Unique users seen over the last 30 days
CREATE CONTINUOUS VIEW uniques WITH (sw = '30 days') AS
SELECT COUNT(DISTINCT user::integer) FROM user_stream;
-- 99th-percentile latency per server over the last 5 minutes
CREATE CONTINUOUS VIEW latency WITH (sw = '5 minutes') AS
SELECT server_id::integer, percentile_cont(0.99)
WITHIN GROUP (ORDER BY latency::numeric) FROM server_stream
GROUP BY server_id;
Connecting works the same way as for PostgreSQL.
Java needs the PostgreSQL JDBC driver jar.
For Python, psycopg2 is the convenient choice.
import random
import psycopg2

conn = psycopg2.connect("dbname='pipeline' user='pipe' password='pipeline' host='ip' port=5432")
pipeline = conn.cursor()
# The page_views stream must already exist; alternatively, run a CREATE STREAM
# statement before creating the continuous view, as sketched below.
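# A minimal sketch of creating it inline (the column types are assumptions,
# inferred from the casts in the continuous view definition below):
pipeline.execute("CREATE STREAM page_views (url text, cookie text, latency integer)")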
query = """
CREATE CONTINUOUS VIEW v AS
SELECT
url::text,
count(*) AS total_count,
count(DISTINCT cookie::text) AS uniques,
percentile_cont(0.99) WITHIN GROUP (ORDER BY latency::integer) AS p99_latency
FROM page_views GROUP BY url
"""
pipeline.execute(query)
conn.commit()
# Now let's simulate some page views
for n in range(10000):
    # 10 unique urls
    url = '/some/url/%d' % (n % 10)
    # 1000 unique cookies
    cookie = '%032d' % (n % 1000)
    # latency uniformly distributed between 1 and 100
    latency = random.randint(1, 100)
    pipeline.execute(
        "INSERT INTO page_views (url, cookie, latency) VALUES (%s, %s, %s)",
        (url, cookie, latency))
# The output of a continuous view can be queried like any other table or view
pipeline.execute('SELECT * FROM v ORDER BY url')
rows = pipeline.fetchall()
for row in rows:
    print(row)
pipeline.execute('DROP CONTINUOUS VIEW v')
conn.commit()
The purpose of this example is to practice executemany(); creating the stream and the continuous view is not described again.
The first for loop builds namedict, whose values are inner dicts keyed by aa, bb and cc, shaped like:
{1:{'aa':10,'bb':50,'cc':100}, 2:{'aa':10,'bb':50,'cc':100}, 3:{'aa':10,'bb':50,'cc':100}}
tup is shaped like: ({'aa':10,'bb':50,'cc':100}, {'aa':10,'bb':50,'cc':100}, {'aa':10,'bb':50,'cc':100})
import psycopg2

conn = psycopg2.connect("dbname='pipeline' user='pipe' password='pipeline' host='ip' port=5432")
pipeline = conn.cursor()

namedict = {}
aa = 'aa'
bb = 'bb'
cc = 'cc'
# The if/else branches below are arbitrary; they exist only to produce varying values
for n in range(100000):
    rows1 = {}
    x = n % 10
    rows1[aa] = 100000 - n
    rows1[bb] = x
    if 0 < n < 500:
        rows1[cc] = n*n - 200
    elif n < 10000:
        rows1[cc] = n*20 - 500
    else:
        rows1[cc] = n*3 - 200
    namedict[n] = rows1
for i in range(10):
    print(namedict[i])
    print(namedict[i]['aa'], namedict[i]['bb'], namedict[i]['cc'])
tup = tuple(namedict.values())
pipeline.executemany("""INSERT INTO test_stream (key, value, x)
                        VALUES (%(aa)s, %(bb)s, %(cc)s)""", tup)
pipeline.execute('SELECT * FROM view')
rows = pipeline.fetchall()
for row in rows:
    print(row)
pipeline.execute('DROP CONTINUOUS VIEW view')
conn.commit()
Add the PostgreSQL JDBC driver jar to the classpath first.
import java.util.Properties;
import java.sql.*;
public class example {
    static final String HOST = "ip**";
    static final String DATABASE = "pipeline";
    static final String USER = "pipe";
    static final String PASSWORD = "pipeline";

    public static void main(String[] args) throws SQLException {
        String url = "jdbc:postgresql://" + HOST + ":5432/" + DATABASE;
        ResultSet rs;
        // Properties props = new Properties();
        // props.setProperty("user", USER);
        Connection conn = DriverManager.getConnection(url, USER, PASSWORD);
        Statement stmt = conn.createStatement();

        // Create a continuous view counting events per value of x
        stmt.executeUpdate("CREATE CONTINUOUS VIEW v AS SELECT x::integer, "
            + "COUNT(*) FROM stream GROUP BY x");

        // Batch-insert 100,000 events into the stream
        for (int i = 0; i < 100000; i++) {
            int x = i % 10;
            stmt.addBatch("INSERT INTO stream (x) VALUES (" + x + ")");
        }
        stmt.executeBatch();

        // The continuous view's results can be read like any other table
        rs = stmt.executeQuery("SELECT * FROM v");
        while (rs.next()) {
            int id = rs.getInt("x");
            int count = rs.getInt("count");
            System.out.println(id + " = " + count);
        }
        // stmt.executeUpdate("DROP CONTINUOUS VIEW v");
        conn.close();
    }
}
(From the PipelineDB docs)