Following the official Flink documentation:
https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/deployment/resource-providers/standalone/kubernetes/
Create the service account and role binding that Flink's Kubernetes HA services require (ClusterRoleBindings are cluster-scoped, so no namespace flag is needed):
kubectl create serviceaccount flink-service-account -n $namespace
kubectl create clusterrolebinding flink-role-binding-flink --clusterrole=edit --serviceaccount=$namespace:flink-service-account
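A quick way to confirm the binding took effect is kubectl's impersonation check — a sketch, using the same $namespace variable; Flink's Kubernetes HA services need to create and watch ConfigMaps:

kubectl auth can-i create configmaps --as=system:serviceaccount:$namespace:flink-service-account -n $namespace
kubectl auth can-i watch configmaps --as=system:serviceaccount:$namespace:flink-service-account -n $namespace

Both commands should print "yes".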
kind: ConfigMap
apiVersion: v1
metadata:
  name: {{configMapName}}
  namespace: {{nameSpace}}
data:
  flink-conf.yaml: |-
    #jobmanager.rpc.address: flink-jobmanager-configmap
    jobmanager.rpc.address: {{serviceName}}
    taskmanager.numberOfTaskSlots: 4
    blob.server.port: 6124
    jobmanager.rpc.port: 6123
    taskmanager.rpc.port: 6122
    queryable-state.proxy.ports: 6125
    jobmanager.memory.process.size: 1600m
    taskmanager.memory.process.size: 1600m
    parallelism.default: 2
    classloader.resolve-order: parent-first
    #jobmanager.archive.fs.dir: /data/completed-jobs/
    #historyserver.archive.fs.dir: /data/completed-jobs/
    #historyserver.web.address: 0.0.0.0
    #historyserver.web.port: 8082
    #historyserver.archive.fs.refresh-interval: 10000
    web.upload.dir: /data/jarPackages
    web.tmpdir: /data/jarPackages
    env.java.home: /usr/local/openjdk-8
    high-availability: org.apache.flink.kubernetes.highavailability.KubernetesHaServicesFactory
    high-availability.storageDir: /data/completed-jobs/
    kubernetes.cluster-id: k8s-dev
  log4j-console.properties: |-
    # This affects logging for both user code and Flink
    rootLogger.level = INFO
    rootLogger.appenderRef.console.ref = ConsoleAppender
    rootLogger.appenderRef.rolling.ref = RollingFileAppender

    # Uncomment this if you want to _only_ change Flink's logging
    #logger.flink.name = org.apache.flink
    #logger.flink.level = INFO

    # The following lines keep the log level of common libraries/connectors on
    # log level INFO. The root logger does not override this. You have to manually
    # change the log levels here.
    logger.akka.name = akka
    logger.akka.level = INFO
    logger.kafka.name = org.apache.kafka
    logger.kafka.level = INFO
    logger.hadoop.name = org.apache.hadoop
    logger.hadoop.level = INFO
    logger.zookeeper.name = org.apache.zookeeper
    logger.zookeeper.level = INFO

    # Log all infos to the console
    appender.console.name = ConsoleAppender
    appender.console.type = CONSOLE
    appender.console.layout.type = PatternLayout
    appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n

    # Log all infos in the given rolling file
    appender.rolling.name = RollingFileAppender
    appender.rolling.type = RollingFile
    appender.rolling.append = false
    appender.rolling.fileName = ${sys:log.file}
    appender.rolling.filePattern = ${sys:log.file}.%i
    appender.rolling.layout.type = PatternLayout
    appender.rolling.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n
    appender.rolling.policies.type = Policies
    appender.rolling.policies.size.type = SizeBasedTriggeringPolicy
    appender.rolling.policies.size.size = 100MB
    appender.rolling.strategy.type = DefaultRolloverStrategy
    appender.rolling.strategy.max = 10

    # Suppress the irrelevant (wrong) warnings from the Netty channel handler
    logger.netty.name = org.jboss.netty.channel.DefaultChannelPipeline
    logger.netty.level = OFF
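After filling in the template placeholders and applying the manifest, it is worth eyeballing the rendered result — a sketch, where flink-configmap.yaml is a hypothetical file name for the manifest above:

kubectl apply -f flink-configmap.yaml
kubectl get configmap {{configMapName}} -n {{nameSpace}} -o yaml | head -n 40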
apiVersion: v1
kind: Service
metadata:
  name: {{serviceName}}
  namespace: {{nameSpace}}
  labels:
    app: {{serviceName}}
    version: v1
  annotations: {}
spec:
  ports:
    - name: cce-service-0
      protocol: TCP
      port: 8081
      targetPort: 8081
    - name: cce-service-1
      protocol: TCP
      port: 6122
      targetPort: 6122
    - name: cce-service-2
      protocol: TCP
      port: 6123
      targetPort: 6123
    - name: cce-service-3
      protocol: TCP
      port: 6124
      targetPort: 6124
    - name: cce-service-4
      protocol: TCP
      port: 6125
      targetPort: 6125
  selector:
    app: {{serviceName}}
    version: v1
  type: ClusterIP
  sessionAffinity: None
status:
  loadBalancer: {}
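Once the Service and pods are up, the Flink web UI on port 8081 can be reached from a workstation with a port-forward — a sketch, substituting the rendered names:

kubectl port-forward service/{{serviceName}} 8081:8081 -n {{nameSpace}}
# then open http://localhost:8081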
kind: Deployment
apiVersion: apps/v1
metadata:
  name: {{serviceName}}
  namespace: {{nameSpace}}
spec:
  replicas: 2
  selector:
    matchLabels:
      app: {{serviceName}}
      version: v1
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: {{serviceName}}
        version: v1
    spec:
      volumes:
        - name: flink-cdc-volume
          persistentVolumeClaim:
            claimName: {{volume}}
        - name: vol-166002733388512148
          configMap:
            name: {{configMapName}}
            defaultMode: 420
      containers:
        - name: jobmanager
          image: swr.cn-south-1.myhuaweicloud.com/harsonscloud/flink:{{version}}
          args:
            - jobmanager
            - $(POD_IP)
          ports:
            - name: rpc
              containerPort: 6123
              protocol: TCP
            - name: blob-server
              containerPort: 6124
              protocol: TCP
            - name: webui
              containerPort: 8081
              protocol: TCP
          env:
            - name: PROFILE
              value: {{profile}}
            # - name: FLINK_PROPERTIES
            #   value: 'jobmanager.rpc.address: flink-jobmanager-configmap'
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
          resources:
            limits:
              cpu: '2'
              memory: 4Gi
            requests:
              cpu: '1'
              memory: 2Gi
          volumeMounts:
            - name: flink-cdc-volume
              mountPath: /data
            # - name: vol-166002733388512148
            #   readOnly: true
            #   mountPath: /opt/flink/conf/flink-conf.yaml
            #   subPath: flink-conf.yaml
            - name: vol-166002733388512148
              readOnly: true
              mountPath: /opt/flink/conf/log4j-console.properties
              subPath: log4j-console.properties
          livenessProbe:
            tcpSocket:
              port: 6123
            initialDelaySeconds: 30
            timeoutSeconds: 1
            periodSeconds: 60
            successThreshold: 1
            failureThreshold: 3
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
          securityContext:
            runAsUser: 0
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      serviceAccountName: flink-service-account
      serviceAccount: flink-service-account
      securityContext: {}
      imagePullSecrets:
        - name: default-secret
      schedulerName: default-scheduler
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      dnsConfig:
        options:
          - name: timeout
            value: ''
          - name: ndots
            value: '5'
          - name: single-request-reopen
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
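To verify the JobManager rollout — a sketch; note that app={{serviceName}} also matches the TaskManager pods below, since both Deployments share the same labels:

kubectl rollout status deployment/{{serviceName}} -n {{nameSpace}}
kubectl get pods -n {{nameSpace}} -l app={{serviceName}} -o wide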
kind: Deployment
apiVersion: apps/v1
metadata:
  name: {{serviceName}}
  namespace: {{nameSpace}}
spec:
  replicas: 2
  selector:
    matchLabels:
      app: {{serviceName}}
      version: v1
  template:
    metadata:
      creationTimestamp: null
      labels:
        app: {{serviceName}}
        version: v1
      annotations:
        redeploy-timestamp: '1660102260690'
    spec:
      volumes:
        - name: vol-166002733388512148
          configMap:
            name: {{configMapName}}
            defaultMode: 420
        - name: flink-cdc-storage
          persistentVolumeClaim:
            claimName: {{volume}}
      containers:
        - name: taskmanager
          image: swr.cn-south-1.myhuaweicloud.com/harsonscloud/flink:{{version}}
          args:
            - taskmanager
          ports:
            - name: rpc
              containerPort: 6122
              protocol: TCP
            - name: query-state
              containerPort: 6125
              protocol: TCP
          env:
            - name: PROFILE
              value: {{profile}}
            # - name: FLINK_PROPERTIES
            #   value: 'jobmanager.rpc.address: flink-jobmanager-configmap'
          resources:
            limits:
              cpu: '2'
              memory: 4Gi
            requests:
              cpu: {{cpuNum}}
              memory: {{memSize}}
          volumeMounts:
            # - name: vol-166002733388512148
            #   readOnly: true
            #   mountPath: /opt/flink/conf/flink-conf.yaml
            #   subPath: flink-conf.yaml
            - name: vol-166002733388512148
              readOnly: true
              mountPath: /opt/flink/conf/log4j-console.properties
              subPath: log4j-console.properties
            - name: flink-cdc-storage
              mountPath: /data
          livenessProbe:
            tcpSocket:
              port: 6122
            initialDelaySeconds: 30
            timeoutSeconds: 1
            periodSeconds: 60
            successThreshold: 1
            failureThreshold: 3
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          imagePullPolicy: IfNotPresent
          securityContext:
            runAsUser: 9999
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      serviceAccountName: flink-service-account
      serviceAccount: flink-service-account
      securityContext: {}
      imagePullSecrets:
        - name: default-secret
      schedulerName: default-scheduler
      tolerations:
        - key: node.kubernetes.io/not-ready
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
        - key: node.kubernetes.io/unreachable
          operator: Exists
          effect: NoExecute
          tolerationSeconds: 300
      dnsConfig:
        options:
          - name: timeout
            value: ''
          - name: ndots
            value: '5'
          - name: single-request-reopen
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 25%
      maxSurge: 25%
  revisionHistoryLimit: 10
  progressDeadlineSeconds: 600
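With both Deployments applied, the web UI (port 8081) should list the registered task managers; with replicas: 2 and taskmanager.numberOfTaskSlots: 4 from the ConfigMap, that is 2 × 4 = 8 available slots. A sketch for checking from the command line:

kubectl get pods -n {{nameSpace}} -l app={{serviceName}}
kubectl logs deployment/{{serviceName}} -n {{nameSpace}} --tail=100 | grep -i taskmanager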
Flink-on-Kubernetes requires Kubernetes 1.19, while our environment runs 1.15. Not knowing what problems we would hit, we deployed anyway, and it cost us dearly.
One cause was the following field in the official YAML:

securityContext:
  runAsUser: 9999

Started as this ordinary user, the container could not add background system processes, so we run the container as root instead; the Flink process itself is still started as the flink user. The change:

securityContext:
  runAsUser: 0
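A way to confirm what the container actually runs as after this change — a sketch with an illustrative pod name; per the note above, the entrypoint started as root is expected to hand the Flink process itself over to the flink user:

kubectl exec <jobmanager-pod> -n $namespace -- id                      # container user: uid=0(root)
kubectl exec <jobmanager-pod> -n $namespace -- grep Uid /proc/1/status # user of PID 1 (the Flink process)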
Another error you may hit:

Caused by: java.io.FileNotFoundException: /data/completed-jobs/default/submittedJobGraphdcfc0f99c0a8 (No such file or directory)

This appears when Flink HA mode is enabled but the HA metadata has since been cleaned up and can no longer be found. One fix is to switch to a fresh cluster-id:

kubernetes.cluster-id: k8s-dev
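Changing the cluster-id sidesteps the stale metadata rather than removing it. An alternative sketch, assuming the HA ConfigMaps carry the labels that show up in the label selector of the error message further below, is to delete the broken HA state for the old cluster-id directly:

kubectl delete configmaps -n $namespace -l app=k8s-dev,type=flink-native-kubernetes,configmap-type=high-availability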
Other errors encountered along the way:

sed: cannot rename /opt/flink/conf/sedMNFSQ8: Device or resource busy

This one most likely comes from the image's docker-entrypoint.sh rewriting flink-conf.yaml in place with sed, which fails when the file is a read-only ConfigMap mount; that may be why the flink-conf.yaml subPath mount is commented out in the Deployments above.

Caused by: java.io.FileNotFoundException: /data/completed-jobs/default/submittedJobGraphdcfc0f99c0a8 (No such file or directory)
Caused by: org.apache.flink.util.FlinkException: Could not retrieve submitted JobGraph from state handle under jobGraph-04b7e9f8727d5573cb84a9d7fcd046ca. This indicates that the retrieved state handle is broken. Try cleaning the state handle store.

The JobGraph errors are tied to the high-availability settings:

high-availability: org.apache.flink.kubernetes.highavailability.KubernetesHaServicesFactory
high-availability.storageDir: /data/completed-jobs/
kubernetes.cluster-id: k8s-dev
The kubernetes.cluster-id: k8s-dev parameter cost me a long time. Because it sat on the last line of flink-conf.yaml, possibly without a trailing newline, deployment kept failing with this error:
Caused by: io.fabric8.kubernetes.client.KubernetesClientException: Failure executing: GET at: https://10.247.0.1/api/v1/namespaces/default/configmaps?watch=false&labelSelector=app%3Dk8s-3386query.server.port%3A%206125%2Ctype%3Dflink-native-kubernetes%2Cconfigmap-type%3Dhigh-availability. Message: unable to parse requirement: invalid label value: "k8s-3386query.server.port:": at key: "app": a valid label must be an empty string or consist of alphanumeric characters, '-', '_' or '.', and must start and end with an alphanumeric character (e.g. 'MyValue', or 'my_value', or '12345', regex used for validation is '(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?'). Received status: Status(apiVersion=v1, code=400, details=null, kind=Status, message=unable to parse requirement: invalid label value: "k8s-3386query.server.port:": at key: "app": a valid label must be an empty string or consist of alphanumeric characters, '-', '_' or '.', and must start and end with an alphanumeric character (e.g. 'MyValue', or 'my_value', or '12345', regex used for validation is '(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?'), metadata=ListMeta(_continue=null, remainingItemCount=null, resourceVersion=null, selfLink=null, additionalProperties={}), reason=BadRequest, status=Failure, additionalProperties={}).
The key part of the message:

invalid label value: "k8s-3386query.server.port:": at key: "app": a valid label must be an empty string or consist of alphanumeric characters, '-', '_' or '.', and must start and end with an alphanumeric character (e.g. 'MyValue', or 'my_value', or '12345')
k8s-3386 is the Flink Kubernetes cluster-id, and the message says its format is invalid. Following the hint, I even changed it to 12345, which still failed.
Looking more closely: where did the extra query.server.port: after k8s-3386 come from? That was the key. flink-conf.yaml contained no such line — I searched it several times — so it must be something Flink appends automatically at startup. Because kubernetes.cluster-id sat on the last line without a trailing newline, the appended property was concatenated straight onto its value. Moving the parameter up into the middle of the file fixed it. Lucky.
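A one-line guard against this class of problem is to check that flink-conf.yaml ends with a newline before shipping it — a sketch (command substitution strips a trailing newline, so a non-empty result means the last byte is not \n):

[ -z "$(tail -c 1 flink-conf.yaml)" ] || echo "flink-conf.yaml: missing trailing newline"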
There were plenty of other small detours as well; running shell scripts inside the image proved troublesome, so inside Docker it is better to write Python scripts.