PyFlink version:
1.10.1
Installation:
pip install apache-flink==1.10.1
Python version requirement:
As of June 7, 2020, the official requirement is at least 3.5 and at most 3.7.
Find the pyflink package in your current environment; it normally lives under site-packages in your Python installation directory:
─➤ tree
.
├── README.txt
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-36.pyc
│ ├── find_flink_home.cpython-36.pyc
│ ├── gen_protos.cpython-36.pyc
│ ├── java_gateway.cpython-36.pyc
│ ├── serializers.cpython-36.pyc
│ ├── shell.cpython-36.pyc
│ └── version.cpython-36.pyc
├── bin
│ ├── bash-java-utils.jar
│ ├── config.sh
│ ├── find-flink-home.sh
│ ├── flink
│ ├── flink-console.sh
│ ├── flink-daemon.sh
│ ├── historyserver.sh
│ ├── jobmanager.sh
│ ├── kubernetes-entry.sh
│ ├── kubernetes-session.sh
│ ├── mesos-appmaster-job.sh
│ ├── mesos-appmaster.sh
│ ├── mesos-taskmanager.sh
│ ├── pyflink-gateway-server.sh
│ ├── pyflink-shell.sh
│ ├── sql-client.sh
│ ├── standalone-job.sh
│ ├── start-cluster.sh
│ ├── start-scala-shell.sh
│ ├── start-zookeeper-quorum.sh
│ ├── stop-cluster.sh
│ ├── stop-zookeeper-quorum.sh
│ ├── taskmanager.sh
│ ├── yarn-session.sh
│ └── zookeeper.sh
├── common
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── configuration.cpython-36.pyc
│ │ ├── dependency_manager.cpython-36.pyc
│ │ ├── execution_config.cpython-36.pyc
│ │ ├── execution_mode.cpython-36.pyc
│ │ ├── input_dependency_constraint.cpython-36.pyc
│ │ └── restart_strategy.cpython-36.pyc
│ ├── configuration.py
│ ├── dependency_manager.py
│ ├── execution_config.py
│ ├── execution_mode.py
│ ├── input_dependency_constraint.py
│ └── restart_strategy.py
├── conf
│ ├── flink-conf.yaml
│ ├── log4j-cli.properties
│ ├── log4j-console.properties
│ ├── log4j-yarn-session.properties
│ ├── log4j.properties
│ ├── logback-console.xml
│ ├── logback-yarn.xml
│ ├── logback.xml
│ ├── masters
│ ├── slaves
│ ├── sql-client-defaults.yaml
│ └── zoo.cfg
├── dataset
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── execution_environment.cpython-36.pyc
│ └── execution_environment.py
├── datastream
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── checkpoint_config.cpython-36.pyc
│ │ ├── checkpointing_mode.cpython-36.pyc
│ │ ├── state_backend.cpython-36.pyc
│ │ ├── stream_execution_environment.cpython-36.pyc
│ │ └── time_characteristic.cpython-36.pyc
│ ├── checkpoint_config.py
│ ├── checkpointing_mode.py
│ ├── state_backend.py
│ ├── stream_execution_environment.py
│ └── time_characteristic.py
├── examples
│ └── python
│ └── table
│ └── batch
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── word_count.cpython-36.pyc
│ └── word_count.py
├── find_flink_home.py
├── fn_execution
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── boot.cpython-36.pyc
│ │ ├── coder_impl.cpython-36.pyc
│ │ ├── coders.cpython-36.pyc
│ │ ├── flink_fn_execution_pb2.cpython-36.pyc
│ │ ├── operations.cpython-36.pyc
│ │ └── sdk_worker_main.cpython-36.pyc
│ ├── boot.py
│ ├── coder_impl.py
│ ├── coders.py
│ ├── flink_fn_execution_pb2.py
│ ├── operations.py
│ └── sdk_worker_main.py
├── gen_protos.py
├── java_gateway.py
├── lib
│ ├── flink-csv-1.10.0-sql-jar.jar
│ ├── flink-dist_2.11-1.10.1.jar
│ ├── flink-jdbc_2.11-1.10.0.jar
│ ├── flink-sql-connector-kafka_2.11-1.10.0.jar
│ ├── flink-table-blink_2.11-1.10.1.jar
│ ├── flink-table_2.11-1.10.1.jar
│ ├── log4j-1.2.17.jar
│ ├── mysql-connector-java-8.0.19.jar
│ └── slf4j-log4j12-1.7.15.jar
├── licenses
│ ├── LICENSE-hdrhistogram
│ ├── LICENSE-protobuf
│ ├── LICENSE-re2j
│ ├── LICENSE-stax2api
│ ├── LICENSE-xmlenc
│ ├── LICENSE.asm
│ ├── LICENSE.automaton
│ ├── LICENSE.base64
│ ├── LICENSE.cloudpickle
│ ├── LICENSE.google-auth-library-credentials
│ ├── LICENSE.grizzled-slf4j
│ ├── LICENSE.influx
│ ├── LICENSE.janino
│ ├── LICENSE.javax.activation
│ ├── LICENSE.jaxb
│ ├── LICENSE.jline
│ ├── LICENSE.jsr166y
│ ├── LICENSE.jzlib
│ ├── LICENSE.kryo
│ ├── LICENSE.minlog
│ ├── LICENSE.protobuf
│ ├── LICENSE.py4j
│ ├── LICENSE.pyrolite
│ ├── LICENSE.scala
│ ├── LICENSE.scopt
│ ├── LICENSE.slf4j
│ ├── LICENSE.slf4j-api
│ └── LICENSE.webbit
├── log
│ ├── empty.txt
│ ├── flink-zhoujingwei-client-zhoujingweideMacBook-Pro.local.log
│ ├── flink-zhoujingwei-client-zhoujinweidembp.log
│ ├── flink-zhoujingwei-python-zhoujingweideMacBook-Pro.local.log
│ ├── flink-zhoujingwei-python-zhoujinweidembp.log
│ ├── flink-zhoujingwei-standalonesession-1-zhoujinweidembp.log
│ ├── flink-zhoujingwei-standalonesession-1-zhoujinweidembp.out
│ ├── flink-zhoujingwei-standalonesession-2-zhoujinweidembp.log
│ ├── flink-zhoujingwei-standalonesession-2-zhoujinweidembp.out
│ ├── flink-zhoujingwei-taskexecutor-1-zhoujinweidembp.log
│ ├── flink-zhoujingwei-taskexecutor-1-zhoujinweidembp.out
│ ├── flink-zhoujingwei-taskexecutor-2-zhoujinweidembp.log
│ └── flink-zhoujingwei-taskexecutor-2-zhoujinweidembp.out
├── opt
│ ├── flink-azure-fs-hadoop-1.10.1.jar
│ ├── flink-cep-scala_2.11-1.10.1.jar
│ ├── flink-cep_2.11-1.10.1.jar
│ ├── flink-gelly-scala_2.11-1.10.1.jar
│ ├── flink-gelly_2.11-1.10.1.jar
│ ├── flink-metrics-datadog-1.10.1.jar
│ ├── flink-metrics-graphite-1.10.1.jar
│ ├── flink-metrics-influxdb-1.10.1.jar
│ ├── flink-metrics-prometheus-1.10.1.jar
│ ├── flink-metrics-slf4j-1.10.1.jar
│ ├── flink-metrics-statsd-1.10.1.jar
│ ├── flink-oss-fs-hadoop-1.10.1.jar
│ ├── flink-python_2.11-1.10.1.jar
│ ├── flink-queryable-state-runtime_2.11-1.10.1.jar
│ ├── flink-s3-fs-hadoop-1.10.1.jar
│ ├── flink-s3-fs-presto-1.10.1.jar
│ ├── flink-shaded-netty-tcnative-dynamic-2.0.25.Final-9.0.jar
│ ├── flink-sql-client_2.11-1.10.1.jar
│ ├── flink-state-processor-api_2.11-1.10.1.jar
│ └── flink-swift-fs-hadoop-1.10.1.jar
├── plugins
│ └── README.txt
├── serializers.py
├── shell.py
├── table
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── catalog.cpython-36.pyc
│ │ ├── descriptors.cpython-36.pyc
│ │ ├── environment_settings.cpython-36.pyc
│ │ ├── sinks.cpython-36.pyc
│ │ ├── sources.cpython-36.pyc
│ │ ├── sql_dialect.cpython-36.pyc
│ │ ├── table.cpython-36.pyc
│ │ ├── table_config.cpython-36.pyc
│ │ ├── table_environment.cpython-36.pyc
│ │ ├── table_schema.cpython-36.pyc
│ │ ├── types.cpython-36.pyc
│ │ ├── udf.cpython-36.pyc
│ │ └── window.cpython-36.pyc
│ ├── catalog.py
│ ├── descriptors.py
│ ├── environment_settings.py
│ ├── sinks.py
│ ├── sources.py
│ ├── sql_dialect.py
│ ├── table.py
│ ├── table_config.py
│ ├── table_environment.py
│ ├── table_schema.py
│ ├── types.py
│ ├── udf.py
│ └── window.py
├── util
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── exceptions.cpython-36.pyc
│ │ └── utils.cpython-36.pyc
│ ├── exceptions.py
│ └── utils.py
└── version.py
25 directories, 205 files
- bin:
Executable commands.
Shell scripts related to starting Flink.
For example, zookeeper.sh ships a bundled ZooKeeper (many big-data distributions such as Kafka and Hadoop bundle one as well);
start-cluster.sh starts a Flink cluster, with the related configuration and environment variables handled in config.sh.
- common:
Common module.
Relatively low-level wrappers around the Java object interfaces, providing basic shared services (object conversion, reading and updating configuration, and so on) to the higher-level dataset and datastream modules.
Looking at the files in this module, you will see that they typically import the gateway (a small sketch of the wrapping pattern follows the import):
from pyflink.java_gateway import get_gateway
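To illustrate that pattern, here is a simplified, hypothetical sketch (not the actual pyflink.common source): each wrapper holds a reference to the corresponding Java object obtained through the gateway and forwards every call to it.
from pyflink.java_gateway import get_gateway


class ConfigurationWrapper(object):
    """Hypothetical example of the wrapping pattern used in pyflink.common."""

    def __init__(self):
        gateway = get_gateway()
        # Hold a reference to the backing Java object created through py4j.
        self._j_configuration = gateway.jvm.org.apache.flink.configuration.Configuration()

    def set_string(self, key, value):
        # Every Python call is forwarded to the Java object.
        self._j_configuration.setString(key, value)
        return self

    def get_string(self, key, default_value):
        return self._j_configuration.getString(key, default_value)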
- conf:
Configuration files.
These are mainly Flink's own configuration files, not PyFlink configuration. Because PyFlink bundles a Flink distribution, a conf directory is needed; if you want to use the bundled Flink, you can modify the configuration files here. The individual options are not explained in detail in this article; for conf/flink-conf.yaml, interested readers can refer to my Chinese translation of the 1.9 configuration here,
or the official 1.10.1 configuration documentation.
- dataset:
Batch processing.
Contains a single file, the main execution-environment class, which wraps functions for getting and setting configuration, restart strategies, parallelism, job execution, and so on; a short usage sketch follows.
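A minimal sketch of what that wrapper exposes (assuming the 1.10.1 API):
from pyflink.dataset import ExecutionEnvironment

# Obtain the batch execution environment and use a couple of the wrapped setters/getters.
exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
print(exec_env.get_parallelism())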
- datastream:
Stream processing.
Mainly covers three core pieces: the stream execution environment, checkpointing, and state backends (a short sketch follows); each of these modules is described further below.
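For example, enabling checkpointing and choosing a state backend through these wrappers looks roughly like this (a sketch, assuming the 1.10.1 API):
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.state_backend import MemoryStateBackend

env = StreamExecutionEnvironment.get_execution_environment()
# Checkpoint every 30 seconds (the interval is given in milliseconds).
env.enable_checkpointing(30000)
# Keep checkpoint state in JobManager memory; fine for local experiments.
env.set_state_backend(MemoryStateBackend())
print(env.get_checkpoint_config().get_checkpoint_interval())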
- examples:
Ships a built-in batch word_count example that counts word occurrences; worth a look if you are interested.
- fn_execution:
Runtime support for executing Python user-defined functions: the worker boot script, coders, and operations used by the Beam-based fn execution framework.
- lib:
JARs that Flink depends on. PyFlink itself does not use them directly, but if you use connectors such as Kafka or MySQL, the corresponding JARs must be placed here because Flink needs them; the snippet below shows how to locate this directory.
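A quick way to find where that directory lives for the interpreter you are using (a small helper snippet, not part of PyFlink itself):
import os
import pyflink

# Connector and driver JARs (Kafka, JDBC, the MySQL driver, ...) go into this
# directory so the bundled Flink distribution can load them.
lib_dir = os.path.join(os.path.dirname(os.path.abspath(pyflink.__file__)), "lib")
print(lib_dir)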
- licenses:
License files for the bundled third-party dependencies.
- log:
When Flink runs, its logs are written here.
- opt:
More JARs used by Flink, mostly optional components (metrics reporters, filesystem connectors, the SQL client, and so on).
- table:
The high-level Table API.
The most central module in PyFlink, containing the core Table API components: table configuration, table environments, sinks, sources, stream/batch settings, data types, windows, schemas, and UDFs. Its top-level imports give a good overview (a small UDF sketch follows them):
from pyflink.table.environment_settings import EnvironmentSettings
from pyflink.table.sql_dialect import SqlDialect
from pyflink.table.table import Table, GroupedTable, GroupWindowedTable, OverWindowedTable, \
WindowGroupedTable
from pyflink.table.table_config import TableConfig
from pyflink.table.table_environment import (TableEnvironment, StreamTableEnvironment,
BatchTableEnvironment)
from pyflink.table.sinks import TableSink, CsvTableSink, WriteMode
from pyflink.table.sources import TableSource, CsvTableSource
from pyflink.table.types import DataTypes, UserDefinedType, Row
from pyflink.table.table_schema import TableSchema
from pyflink.table.udf import FunctionContext, ScalarFunction
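As a small example of the udf module listed above, a scalar Python UDF can be declared like this (a sketch, assuming the 1.10.1 udf API):
from pyflink.table import DataTypes
from pyflink.table.udf import udf

# A scalar Python UDF; input and result types are declared as Flink SQL types.
add_one = udf(lambda i: i + 1, DataTypes.BIGINT(), DataTypes.BIGINT())

# After t_env.register_function("add_one", add_one) it can be called from the
# Table API or from SQL.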
- util:
Common exceptions and utilities used when interacting with Java.
- other:
The rest of this article walks through the code that initializes a PyFlink job.
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, EnvironmentSettings

# Create the TableEnvironment and choose the planner to use
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(
    env,
    environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
This breaks down into two main steps. First, to develop a streaming job you import the StreamExecutionEnvironment class from datastream and call get_execution_environment() to obtain the execution context the program needs. During this initialization it first looks for a JVM by calling get_gateway() in java_gateway.py under the pyflink root directory.
If a JVM is already running, that is, the environment variable PYFLINK_GATEWAY_PORT is present in the current environment, PyFlink connects to that existing gateway.
If no JVM is running, launch_gateway() is called to start one; note that this is not supported on Windows, only macOS and Linux. The launch runs bin/pyflink-gateway-server.sh with the arguments ['-c', 'org.apache.flink.client.python.PythonGatewayServer']. Once the gateway has started successfully, PyFlink connects to the gateway port and imports the Java classes, as follows:
def import_flink_view(gateway):
    """
    import the classes used by PyFlink.
    :param gateway: gateway connected to JavaGateWayServer
    """
    # Import the classes used by PyFlink
    java_import(gateway.jvm, "org.apache.flink.table.api.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.java.*")
    java_import(gateway.jvm, "org.apache.flink.table.api.dataview.*")
    java_import(gateway.jvm, "org.apache.flink.table.catalog.*")
    java_import(gateway.jvm, "org.apache.flink.table.descriptors.*")
    java_import(gateway.jvm, "org.apache.flink.table.descriptors.python.*")
    java_import(gateway.jvm, "org.apache.flink.table.sources.*")
    java_import(gateway.jvm, "org.apache.flink.table.sinks.*")
    java_import(gateway.jvm, "org.apache.flink.table.sources.*")
    java_import(gateway.jvm, "org.apache.flink.table.types.*")
    java_import(gateway.jvm, "org.apache.flink.table.types.logical.*")
    java_import(gateway.jvm, "org.apache.flink.table.util.python.*")
    java_import(gateway.jvm, "org.apache.flink.api.common.python.*")
    java_import(gateway.jvm, "org.apache.flink.api.common.typeinfo.TypeInformation")
    java_import(gateway.jvm, "org.apache.flink.api.common.typeinfo.Types")
    java_import(gateway.jvm, "org.apache.flink.api.java.ExecutionEnvironment")
    java_import(gateway.jvm,
                "org.apache.flink.streaming.api.environment.StreamExecutionEnvironment")
    java_import(gateway.jvm, "org.apache.flink.api.common.restartstrategy.RestartStrategies")
Once these classes are imported, the Java APIs can be used. Since we are initializing a streaming API here, what PyFlink wraps is the Java streaming API,
as follows:
def get_execution_environment():
    """
    Creates an execution environment that represents the context in which the
    program is currently executed. If the program is invoked standalone, this
    method returns a local execution environment.
    :return: The execution environment of the context in which the program is executed.
    """
    gateway = get_gateway()
    j_stream_exection_environment = gateway.jvm.org.apache.flink.streaming.api.environment\
        .StreamExecutionEnvironment.getExecutionEnvironment()
    return StreamExecutionEnvironment(j_stream_exection_environment)
According to Jincheng (金竹) from Alibaba, the Flink community will not develop a separate set of Python-only APIs; the current PyFlink interfaces are Python wrappers around the existing Java APIs, and the interoperability between Python and Java is achieved through Py4J. The initialization in this first step already shows that design.
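You can poke at this interop directly: once the gateway is up and import_flink_view has run, the imported Java classes are reachable by their simple names on the JVM view (a small exploratory sketch, assuming a local 1.10.1 setup):
from pyflink.java_gateway import get_gateway

# get_gateway() launches (or reuses) the JVM gateway and runs the imports above.
gateway = get_gateway()

# The same Java class that the Python StreamExecutionEnvironment wraps:
j_env = gateway.jvm.StreamExecutionEnvironment.getExecutionEnvironment()
print(j_env.getParallelism())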
Now let's look at the next step.
t_env = StreamTableEnvironment.create(
    env,
    environment_settings=EnvironmentSettings.new_instance().use_blink_planner().build())
Here StreamTableEnvironment creates a table execution environment; the arguments are the Java streaming environment obtained in the previous step and a newly created environment settings instance. Let's look at them one by one.
First, StreamTableEnvironment: its base class is TableEnvironment. As mentioned earlier, the table module is a further layer on top of stream and dataset; the official documentation describes SQL as the highest-level API, followed by the Table API, and then the DataStream/DataSet API. So what we create here is a stream table class built on the table class.
In fact, create can also be called without the optional parameters, or with a TableConfig:
>>> env = StreamExecutionEnvironment.get_execution_environment()
# create without optional parameters.
>>> table_env = StreamTableEnvironment.create(env)
# create with TableConfig
>>> table_config = TableConfig()
>>> table_config.set_null_check(False)
>>> table_env = StreamTableEnvironment.create(env, table_config)
# create with EnvironmentSettings
>>> environment_settings = EnvironmentSettings.new_instance().use_blink_planner().build()
>>> table_env = StreamTableEnvironment.create(env, environment_settings=environment_settings)
Whichever form you use, the create function ultimately delegates to gateway.jvm.StreamTableEnvironment.create on the Java side, and its arguments are converted into Java objects and passed to the Java create function. Note that table_config and environment_settings cannot be used at the same time. The environment_settings here is produced by EnvironmentSettings.new_instance().use_blink_planner().build(), which builds the Java object through gateway.jvm.EnvironmentSettings.Builder().
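For reference, the builder mirrors the Java EnvironmentSettings.Builder and also lets you pick streaming or batch mode and the old planner (a sketch, assuming the 1.10.1 API):
from pyflink.table import EnvironmentSettings

# Blink planner in streaming mode (the combination used in this article).
blink_streaming = EnvironmentSettings.new_instance() \
    .in_streaming_mode() \
    .use_blink_planner() \
    .build()

# The legacy planner in batch mode is built the same way.
old_batch = EnvironmentSettings.new_instance() \
    .in_batch_mode() \
    .use_old_planner() \
    .build()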
As for the Blink mentioned above: it is Alibaba's implementation of unified streaming and batch interfaces, open-sourced and contributed back to the Flink community. Since 1.9 the Table API has unified DataSet and DataStream, and Flink will eventually remove DataSet; see the referenced article for details.
At this point we have walked through essentially all of the PyFlink initialization code; everything else is handed off to the Java side.
# Create the Kafka source table
t_env.sql_update(kafka_source_ddl)
# Create the MySQL sink table
t_env.sql_update(mysql_sink_ddl)
The key method here is sql_update, which is defined on TableEnvironment. It executes DDL statements; at the moment only CREATE and INSERT statements are supported. Note that every table you want to query must be registered with the TableEnvironment: source tables, sink tables, and intermediate temporary tables alike, anything that is used needs to be created first. For example:
create table tbl1(
a int,
b bigint,
c varchar
) with (
'connector.type' = 'filesystem',
'format.type' = 'csv',
'connector.path' = 'xxx'
)
You can also write it like this:
>>> source_ddl = \
... '''
... create table sourceTable(
...     a int,
...     b varchar
... ) with (
...     'connector.type' = 'kafka',
...     'update-mode' = 'append',
...     'connector.topic' = 'xxx',
...     'connector.properties.zookeeper.connect' = 'localhost:2181',
...     'connector.properties.bootstrap.servers' = 'localhost:9092'
... )
... '''
>>> sink_ddl = \
... '''
... create table sinkTable(
...     a int,
...     b varchar
... ) with (
...     'connector.type' = 'filesystem',
...     'format.type' = 'csv',
...     'connector.path' = 'xxx'
... )
... '''
>>> query = "INSERT INTO sinkTable SELECT a, b FROM sourceTable"
>>> table_env.sql_update(source_ddl)
>>> table_env.sql_update(sink_ddl)
>>> table_env.sql_update(query)
>>> table_env.execute("MyJob")
In any case, the SQL statement is ultimately submitted to the Java side for execution.
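sql_update covers DDL and INSERT statements; when you want the result of a SELECT back as a Table object, TableEnvironment also provides sql_query. A short sketch, assuming the sourceTable and sinkTable registered above and the 1.10.1 API:
# Run a SELECT and get a Table back, then write it to the registered sink.
result_table = t_env.sql_query("SELECT a, b FROM sourceTable")
result_table.insert_into("sinkTable")

# Nothing runs until the job is submitted to the Java side.
t_env.execute("my_pyflink_job")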
What is worth understanding here are the responsibilities of a TableEnvironment:
1. Connecting to external systems.
2. Registering and retrieving :class:`~pyflink.table.Table` and other metadata objects from a catalog.
3. Executing SQL statements.
4. Offering further configuration options.
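To tie these responsibilities together, here is a minimal self-contained sketch in the spirit of the bundled word_count example (assuming the 1.10.1 API and a writable /tmp path):
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import BatchTableEnvironment, DataTypes, TableConfig
from pyflink.table.sinks import CsvTableSink

# Batch execution environment plus table environment.
env = ExecutionEnvironment.get_execution_environment()
t_env = BatchTableEnvironment.create(env, TableConfig())

# Connect to an external system by registering a sink table in the catalog.
t_env.register_table_sink(
    "result_sink",
    CsvTableSink(["word", "cnt"],
                 [DataTypes.STRING(), DataTypes.BIGINT()],
                 "/tmp/word_count_result.csv"))

# Build a Table from local elements and route it to the sink.
t = t_env.from_elements([("flink", 1), ("pyflink", 2)], ["word", "cnt"])
t.insert_into("result_sink")

# Nothing runs until execute() hands the job to the Java side.
t_env.execute("word_count_demo")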