/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.jobmanager
import java.io.{File, IOException}
import java.lang.reflect.{Constructor, InvocationTargetException}
import java.net.{UnknownHostException, InetAddress, InetSocketAddress}
import java.util.UUID
import akka.actor.Status.Failure
import akka.actor._
import akka.pattern.ask
import grizzled.slf4j.Logger
import org.apache.flink.api.common.{ExecutionConfig, JobID}
import org.apache.flink.configuration.{ConfigConstants, Configuration, GlobalConfiguration}
import org.apache.flink.core.io.InputSplitAssigner
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot
import org.apache.flink.runtime.akka.{AkkaUtils, ListeningBehaviour}
import org.apache.flink.runtime.blob.BlobServer
import org.apache.flink.runtime.checkpoint.{CheckpointRecoveryFactory, StandaloneCheckpointRecoveryFactory, ZooKeeperCheckpointRecoveryFactory}
import org.apache.flink.runtime.client._
import org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager
import org.apache.flink.runtime.executiongraph.{ExecutionGraph, ExecutionJobVertex}
import org.apache.flink.runtime.instance.{AkkaActorGateway, InstanceManager}
import org.apache.flink.runtime.jobgraph.jsonplan.JsonPlanGenerator
import org.apache.flink.runtime.jobgraph.{JobGraph, JobStatus, JobVertexID}
import org.apache.flink.runtime.jobmanager.SubmittedJobGraphStore.SubmittedJobGraphListener
import org.apache.flink.runtime.jobmanager.scheduler.{Scheduler => FlinkScheduler}
import org.apache.flink.runtime.leaderelection.{LeaderContender, LeaderElectionService, StandaloneLeaderElectionService}
import org.apache.flink.runtime.leaderretrieval.{StandaloneLeaderRetrievalService, LeaderRetrievalService}
import org.apache.flink.runtime.messages.ArchiveMessages.ArchiveExecutionGraph
import org.apache.flink.runtime.messages.ExecutionGraphMessages.JobStatusChanged
import org.apache.flink.runtime.messages.JobManagerMessages._
import org.apache.flink.runtime.messages.Messages.{Acknowledge, Disconnect}
import org.apache.flink.runtime.messages.RegistrationMessages._
import org.apache.flink.runtime.messages.TaskManagerMessages.{Heartbeat, SendStackTrace}
import org.apache.flink.runtime.messages.TaskMessages.{PartitionState, UpdateTaskExecutionState}
import org.apache.flink.runtime.messages.accumulators.{AccumulatorMessage, AccumulatorResultStringsFound, AccumulatorResultsErroneous, AccumulatorResultsFound, RequestAccumulatorResults, RequestAccumulatorResultsStringified}
import org.apache.flink.runtime.messages.checkpoint.{AbstractCheckpointMessage, AcknowledgeCheckpoint}
import org.apache.flink.runtime.messages.webmonitor._
import org.apache.flink.runtime.process.ProcessReaper
import org.apache.flink.runtime.security.SecurityUtils
import org.apache.flink.runtime.security.SecurityUtils.FlinkSecuredRunner
import org.apache.flink.runtime.taskmanager.TaskManager
import org.apache.flink.runtime.util._
import org.apache.flink.runtime.webmonitor.{WebMonitor, WebMonitorUtils}
import org.apache.flink.runtime.{FlinkActor, LeaderSessionMessageFilter, LogMessages, StreamingMode}
import org.apache.flink.util.{ExceptionUtils, InstantiationUtil, NetUtils}
import scala.collection.JavaConverters._
import scala.concurrent._
import scala.concurrent.duration._
import scala.concurrent.forkjoin.ForkJoinPool
import scala.language.postfixOps
/**
* The job manager is responsible for receiving Flink jobs, scheduling the tasks, gathering the
* job status and managing the task managers. It is realized as an actor that receives, amongst
* others, the following messages:
*
* - [[RegisterTaskManager]] is sent by a TaskManager which wants to register at the job manager.
* A successful registration at the instance manager is acknowledged by [[AcknowledgeRegistration]]
*
* - [[SubmitJob]] is sent by a client which wants to submit a job to the system. The submit
* message contains the job description in the form of the JobGraph. The JobGraph is appended to
* the ExecutionGraph and the corresponding ExecutionJobVertices are scheduled for execution on
* the TaskManagers.
*
* - [[CancelJob]] requests to cancel the job with the specified jobID. A successful cancellation
* is indicated by [[CancellationSuccess]] and a failure by [[CancellationFailure]]
*
* - [[UpdateTaskExecutionState]] is sent by a TaskManager to update the state of an
* ExecutionVertex contained in the [[ExecutionGraph]].
* A successful update is acknowledged by true and otherwise false.
*
* - [[RequestNextInputSplit]] requests the next input split for a running task on a
* [[TaskManager]]. The assigned input split or null is sent to the sender in the form of the
* message [[NextInputSplit]].
*
* - [[JobStatusChanged]] indicates that the status of a job (RUNNING, CANCELING, FINISHED, etc.) has
* changed. This message is sent by the ExecutionGraph.
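*
* A minimal client-side interaction sketch (the actor reference and timeout below are
* assumptions and not defined by this class):
* {{{
*   // ask the JobManager to submit a job; the answer is a JobSubmitSuccess or a JobResultFailure
*   val response = (jobManagerActorRef ? SubmitJob(jobGraph, ListeningBehaviour.DETACHED))(timeout)
* }}}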
*/
class JobManager(
protected val flinkConfiguration: Configuration,
protected val executionContext: ExecutionContext,
protected val instanceManager: InstanceManager,
protected val scheduler: FlinkScheduler,
protected val libraryCacheManager: BlobLibraryCacheManager,
protected val archive: ActorRef,
protected val defaultExecutionRetries: Int,
protected val delayBetweenRetries: Long,
protected val timeout: FiniteDuration,
protected val mode: StreamingMode,
protected val leaderElectionService: LeaderElectionService,
protected val submittedJobGraphs : SubmittedJobGraphStore,
protected val checkpointRecoveryFactory : CheckpointRecoveryFactory)
extends FlinkActor
with LeaderSessionMessageFilter // mixin order is important, we want filtering after logging
with LogMessages // mixin order is important, we want logging first
with LeaderContender
with SubmittedJobGraphListener {
override val log = Logger(getClass)
/** Jobs that are currently running or not yet archived (their session has not ended). */
protected val currentJobs = scala.collection.mutable.HashMap[JobID, (ExecutionGraph, JobInfo)]()
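/** The recovery mode (standalone or ZooKeeper based) derived from the Flink configuration. */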
protected val recoveryMode = RecoveryMode.fromConfig(flinkConfiguration)
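/** The current leader session ID, or None while this JobManager does not hold leadership. */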
var leaderSessionID: Option[UUID] = None
/** Futures which have to be completed before terminating the job manager */
var futuresToComplete: Option[Seq[Future[Unit]]] = None
/**
* The port of the web monitor as configured. Make sure that it is actually configured before
* starting the JobManager. This tightly couples the web monitor with the job manager. It is a
* temporary workaround until all execution graph components are properly serializable and all
* web monitors can transparently interact with each job manager. Currently each web server has
* to run in the actor system of the associated job manager.
*/
val webMonitorPort : Int = flinkConfiguration.getInteger(
ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, -1)
/**
* Run when the JobManager is started. Logs an informational message and starts the leader
* election service, the submitted job graph store, and the checkpoint recovery service.
*/
override def preStart(): Unit = {
log.info(s"Starting JobManager at ${getAddress}.")
try {
leaderElectionService.start(this)
} catch {
case e: Exception =>
log.error("Could not start the JobManager because the leader election service did not " +
"start.", e)
throw new RuntimeException("Could not start the leader election service.", e)
}
try {
submittedJobGraphs.start(this)
} catch {
case e: Exception =>
log.error("Could not start the submitted job graphs service.", e)
throw new RuntimeException("Could not start the submitted job graphs service.", e)
}
try {
checkpointRecoveryFactory.start()
} catch {
case e: Exception =>
log.error("Could not start the checkpoint recovery service.", e)
throw new RuntimeException("Could not start the checkpoint recovery service.", e)
}
}
override def postStop(): Unit = {
log.info(s"Stopping JobManager ${getAddress}.")
val newFuturesToComplete = cancelAndClearEverything(
new Exception("The JobManager is shutting down."),
true)
implicit val executionContext = context.dispatcher
val futureToComplete = Future.sequence(
futuresToComplete.getOrElse(Seq()) ++ newFuturesToComplete)
Await.ready(futureToComplete, timeout)
// disconnect the registered task managers
instanceManager.getAllRegisteredInstances.asScala.foreach {
_.getActorGateway().tell(
Disconnect("JobManager is shutting down"),
new AkkaActorGateway(self, leaderSessionID.orNull))
}
try {
// revoke leadership and stop leader election service
leaderElectionService.stop()
} catch {
case e: Exception => log.error("Could not properly shutdown the leader election service.")
}
try {
submittedJobGraphs.stop()
} catch {
case e: Exception => log.error("Could not properly stop the submitted job graphs service.")
}
try {
checkpointRecoveryFactory.stop()
} catch {
case e: Exception => log.error("Could not properly stop the checkpoint recovery service.")
}
if (archive != ActorRef.noSender) {
archive ! decorateMessage(PoisonPill)
}
instanceManager.shutdown()
scheduler.shutdown()
try {
libraryCacheManager.shutdown()
} catch {
case e: IOException => log.error("Could not properly shutdown the library cache manager.", e)
}
log.debug(s"Job manager ${self.path} is completely stopped.")
}
/**
* Central work method of the JobManager actor. Receives messages and reacts to them.
*
* @return The partial function that handles the JobManager messages
*/
override def handleMessage: Receive = {
case GrantLeadership(newLeaderSessionID) =>
log.info(s"JobManager ${getAddress} was granted leadership with leader session ID " +
s"${newLeaderSessionID}.")
leaderSessionID = newLeaderSessionID
// confirming the leader session ID might be blocking, thus do it in a future
future{
leaderElectionService.confirmLeaderSessionID(newLeaderSessionID.orNull)
// TODO (critical next step) This needs to be more flexible and robust (e.g. wait for task
// managers etc.)
if (recoveryMode != RecoveryMode.STANDALONE) {
context.system.scheduler.scheduleOnce(new FiniteDuration(delayBetweenRetries,
MILLISECONDS), self, decorateMessage(RecoverAllJobs))(context.dispatcher)
}
}(context.dispatcher)
case RevokeLeadership =>
log.info(s"JobManager ${self.path.toSerializationFormat} was revoked leadership.")
val newFuturesToComplete = cancelAndClearEverything(
new Exception("JobManager is no longer the leader."),
false)
futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) ++ newFuturesToComplete)
// disconnect the registered task managers
instanceManager.getAllRegisteredInstances.asScala.foreach {
_.getActorGateway().tell(
Disconnect("JobManager is no longer the leader"),
new AkkaActorGateway(self, leaderSessionID.orNull))
}
instanceManager.unregisterAllTaskManagers()
leaderSessionID = None
case RegisterTaskManager(
connectionInfo,
hardwareInformation,
numberOfSlots) =>
val taskManager = sender()
if (instanceManager.isRegistered(taskManager)) {
val instanceID = instanceManager.getRegisteredInstance(taskManager).getId
// IMPORTANT: Send the response to the "sender", which is not the
// TaskManager actor, but the ask future!
sender() ! decorateMessage(
AlreadyRegistered(
instanceID,
libraryCacheManager.getBlobServerPort)
)
}
else {
try {
val instanceID = instanceManager.registerTaskManager(
taskManager,
connectionInfo,
hardwareInformation,
numberOfSlots,
leaderSessionID.orNull)
// IMPORTANT: Send the response to the "sender", which is not the
// TaskManager actor, but the ask future!
sender() ! decorateMessage(
AcknowledgeRegistration(
instanceID,
libraryCacheManager.getBlobServerPort)
)
// to be notified when the taskManager is no longer reachable
context.watch(taskManager)
}
catch {
// registerTaskManager throws an IllegalStateException if it is already shut down
// let the actor crash and restart itself in this case
case e: Exception =>
log.error("Failed to register TaskManager at instance manager", e)
// IMPORTANT: Send the response to the "sender", which is not the
// TaskManager actor, but the ask future!
sender() ! decorateMessage(
RefuseRegistration(
ExceptionUtils.stringifyException(e))
)
}
}
case RequestNumberRegisteredTaskManager =>
sender ! decorateMessage(instanceManager.getNumberOfRegisteredTaskManagers)
case RequestTotalNumberOfSlots =>
sender ! decorateMessage(instanceManager.getTotalNumberOfSlots)
case SubmitJob(jobGraph, listeningBehaviour) =>
val client = sender()
val jobInfo = new JobInfo(client, listeningBehaviour, System.currentTimeMillis(),
jobGraph.getSessionTimeout)
submitJob(jobGraph, jobInfo)
case RecoverSubmittedJob(submittedJobGraph) =>
if (!currentJobs.contains(submittedJobGraph.getJobId)) {
submitJob(
submittedJobGraph.getJobGraph(),
submittedJobGraph.getJobInfo(),
isRecovery = true)
}
case RecoverJob(jobId) =>
future {
// The ActorRef, which is part of the submitted job graph, can only be deserialized in the
// scope of an actor system.
akka.serialization.JavaSerializer.currentSystem.withValue(
context.system.asInstanceOf[ExtendedActorSystem]) {
log.info(s"Attempting to recover job $jobId.")
val submittedJobGraphOption = submittedJobGraphs.recoverJobGraph(jobId)
submittedJobGraphOption match {
case Some(submittedJobGraph) =>
if (!leaderElectionService.hasLeadership()) {
// we've lost leadership. mission: abort.
log.warn(s"Lost leadership during recovery. Aborting recovery of $jobId.")
}
else {
self ! decorateMessage(RecoverSubmittedJob(submittedJobGraph))
}
case None => log.warn(s"Failed to recover job graph $jobId.")
}
}
}(context.dispatcher)
case RecoverAllJobs =>
future {
// The ActorRef, which is part of the submitted job graph, can only be deserialized in the
// scope of an actor system.
akka.serialization.JavaSerializer.currentSystem.withValue(
context.system.asInstanceOf[ExtendedActorSystem]) {
log.info(s"Recovering all jobs.")
val jobGraphs = submittedJobGraphs.recoverJobGraphs().asScala
if (!leaderElectionService.hasLeadership()) {
// we've lost leadership. mission: abort.
log.warn(s"Lost leadership during recovery. Aborting recovery of ${jobGraphs.size} " +
s"jobs.")
}
else {
log.debug(s"Attempting to recover ${jobGraphs.size} job graphs.")
jobGraphs.foreach{
submittedJobGraph =>
self ! decorateMessage(RecoverSubmittedJob(submittedJobGraph))
}
}
}
}(context.dispatcher)
case CancelJob(jobID) =>
log.info(s"Trying to cancel job with ID $jobID.")
currentJobs.get(jobID) match {
case Some((executionGraph, _)) =>
// execute the cancellation asynchronously
Future {
executionGraph.cancel()
}(context.dispatcher)
sender ! decorateMessage(CancellationSuccess(jobID))
case None =>
log.info(s"No job found with ID $jobID.")
sender ! decorateMessage(
CancellationFailure(
jobID,
new IllegalArgumentException(s"No job found with ID $jobID."))
)
}
case UpdateTaskExecutionState(taskExecutionState) =>
if (taskExecutionState == null) {
sender ! decorateMessage(false)
} else {
currentJobs.get(taskExecutionState.getJobID) match {
case Some((executionGraph, _)) =>
val originalSender = sender()
Future {
val result = executionGraph.updateState(taskExecutionState)
originalSender ! decorateMessage(result)
}(context.dispatcher)
case None => log.error("Cannot find execution graph for ID " +
s"${taskExecutionState.getJobID} to change state to " +
s"${taskExecutionState.getExecutionState}.")
sender ! decorateMessage(false)
}
}
case RequestNextInputSplit(jobID, vertexID, executionAttempt) =>
val serializedInputSplit = currentJobs.get(jobID) match {
case Some((executionGraph,_)) =>
val execution = executionGraph.getRegisteredExecutions.get(executionAttempt)
if (execution == null) {
log.error(s"Can not find Execution for attempt $executionAttempt.")
null
} else {
val slot = execution.getAssignedResource
val taskId = execution.getVertex.getParallelSubtaskIndex
val host = if (slot != null) {
slot.getInstance().getInstanceConnectionInfo.getHostname
} else {
null
}
executionGraph.getJobVertex(vertexID) match {
case vertex: ExecutionJobVertex => vertex.getSplitAssigner match {
case splitAssigner: InputSplitAssigner =>
val nextInputSplit = splitAssigner.getNextInputSplit(host, taskId)
log.debug(s"Send next input split $nextInputSplit.")
try {
InstantiationUtil.serializeObject(nextInputSplit)
} catch {
case ex: Exception =>
log.error(s"Could not serialize the next input split of " +
s"class ${nextInputSplit.getClass}.", ex)
vertex.fail(new RuntimeException("Could not serialize the next input split " +
"of class " + nextInputSplit.getClass + ".", ex))
null
}
case _ =>
log.error(s"No InputSplitAssigner for vertex ID $vertexID.")
null
}
case _ =>
log.error(s"Cannot find execution vertex for vertex ID $vertexID.")
null
}
}
case None =>
log.error(s"Cannot find execution graph for job ID $jobID.")
null
}
sender ! decorateMessage(NextInputSplit(serializedInputSplit))
case checkpointMessage : AbstractCheckpointMessage =>
handleCheckpointMessage(checkpointMessage)
case JobStatusChanged(jobID, newJobStatus, timeStamp, error) =>
currentJobs.get(jobID) match {
case Some((executionGraph, jobInfo)) =>
log.info(
s"Status of job $jobID (${executionGraph.getJobName}) changed to $newJobStatus.",
error)
if (newJobStatus.isTerminalState()) {
jobInfo.end = timeStamp
future{
// TODO If removing the JobGraph from the SubmittedJobGraphsStore fails, the job will
// linger around and potentially be recovered at a later time. There is nothing we
// can do about that, but it should be communicated with the Client.
if (jobInfo.sessionAlive) {
jobInfo.setLastActive()
val lastActivity = jobInfo.lastActive
context.system.scheduler.scheduleOnce(jobInfo.sessionTimeout seconds) {
// remove only if no activity occurred in the meantime
if (lastActivity == jobInfo.lastActive) {
self ! decorateMessage(RemoveJob(jobID, true))
}
}(context.dispatcher)
} else {
self ! decorateMessage(RemoveJob(jobID, true))
}
// is the client waiting for the job result?
if (jobInfo.listeningBehaviour != ListeningBehaviour.DETACHED) {
newJobStatus match {
case JobStatus.FINISHED =>
try {
val accumulatorResults = executionGraph.getAccumulatorsSerialized()
val result = new SerializedJobExecutionResult(
jobID,
jobInfo.duration,
accumulatorResults)
jobInfo.client ! decorateMessage(JobResultSuccess(result))
} catch {
case e: Exception =>
log.error(s"Cannot fetch final accumulators for job $jobID", e)
val exception = new JobExecutionException(jobID,
"Failed to retrieve accumulator results.", e)
jobInfo.client ! decorateMessage(JobResultFailure(
new SerializedThrowable(exception)))
}
case JobStatus.CANCELED =>
// the error may be packed as a serialized throwable
val unpackedError = SerializedThrowable.get(
error, executionGraph.getUserClassLoader())
jobInfo.client ! decorateMessage(JobResultFailure(
new SerializedThrowable(
new JobCancellationException(jobID, "Job was cancelled.", unpackedError))))
case JobStatus.FAILED =>
val unpackedError = SerializedThrowable.get(
error, executionGraph.getUserClassLoader())
jobInfo.client ! decorateMessage(JobResultFailure(
new SerializedThrowable(
new JobExecutionException(jobID, "Job execution failed.", unpackedError))))
case x =>
val exception = new JobExecutionException(jobID, s"$x is not a terminal state.")
jobInfo.client ! decorateMessage(JobResultFailure(
new SerializedThrowable(exception)))
throw exception
}
}
}(context.dispatcher)
}
case None =>
self ! decorateMessage(RemoveJob(jobID, true))
}
case ScheduleOrUpdateConsumers(jobId, partitionId) =>
currentJobs.get(jobId) match {
case Some((executionGraph, _)) =>
sender ! decorateMessage(Acknowledge)
executionGraph.scheduleOrUpdateConsumers(partitionId)
case None =>
log.error(s"Cannot find execution graph for job ID $jobId to schedule or update " +
s"consumers.")
sender ! decorateMessage(
Failure(
new IllegalStateException("Cannot find execution graph for job ID " +
s"$jobId to schedule or update consumers.")
)
)
}
case RequestPartitionState(jobId, partitionId, taskExecutionId, taskResultId) =>
val state = currentJobs.get(jobId) match {
case Some((executionGraph, _)) =>
val execution = executionGraph.getRegisteredExecutions.get(partitionId.getProducerId)
if (execution != null) execution.getState else null
case None =>
// Nothing to do. This is not an error, because the request is received when a sending
// task fails during a remote partition request.
log.debug(s"Cannot find execution graph for job $jobId.")
null
}
sender ! decorateMessage(
PartitionState(
taskExecutionId,
taskResultId,
partitionId.getPartitionId,
state)
)
case RequestJobStatus(jobID) =>
currentJobs.get(jobID) match {
case Some((executionGraph,_)) =>
sender ! decorateMessage(CurrentJobStatus(jobID, executionGraph.getState))
case None =>
// check the archive
archive forward decorateMessage(RequestJobStatus(jobID))
}
case RequestRunningJobs =>
val executionGraphs = currentJobs map {
case (_, (eg, jobInfo)) => eg
}
sender ! decorateMessage(RunningJobs(executionGraphs))
case RequestRunningJobsStatus =>
try {
val jobs = currentJobs map {
case (_, (eg, _)) =>
new JobStatusMessage(
eg.getJobID,
eg.getJobName,
eg.getState,
eg.getStatusTimestamp(JobStatus.CREATED)
)
}
sender ! decorateMessage(RunningJobsStatus(jobs))
}
catch {
case t: Throwable => log.error("Exception while responding to RequestRunningJobsStatus", t)
}
case RequestJob(jobID) =>
currentJobs.get(jobID) match {
case Some((eg, _)) => sender ! decorateMessage(JobFound(jobID, eg))
case None =>
// check the archive
archive forward decorateMessage(RequestJob(jobID))
}
case RequestBlobManagerPort =>
sender ! decorateMessage(libraryCacheManager.getBlobServerPort)
case RequestArchive =>
sender ! decorateMessage(ResponseArchive(archive))
case RequestRegisteredTaskManagers =>
import scala.collection.JavaConverters._
sender ! decorateMessage(
RegisteredTaskManagers(
instanceManager.getAllRegisteredInstances.asScala
)
)
case RequestTaskManagerInstance(instanceID) =>
sender ! decorateMessage(
TaskManagerInstance(Option(instanceManager.getRegisteredInstanceById(instanceID)))
)
case Heartbeat(instanceID, metricsReport, accumulators) =>
log.debug(s"Received hearbeat message from $instanceID.")
updateAccumulators(accumulators)
instanceManager.reportHeartBeat(instanceID, metricsReport)
case message: AccumulatorMessage => handleAccumulatorMessage(message)
case message: InfoMessage => handleInfoRequestMessage(message, sender())
case RequestStackTrace(instanceID) =>
val gateway = instanceManager.getRegisteredInstanceById(instanceID).getActorGateway
gateway.forward(SendStackTrace, new AkkaActorGateway(sender, leaderSessionID.orNull))
case Terminated(taskManager) =>
if (instanceManager.isRegistered(taskManager)) {
log.info(s"Task manager ${taskManager.path} terminated.")
instanceManager.unregisterTaskManager(taskManager, true)
context.unwatch(taskManager)
}
case RequestJobManagerStatus =>
sender() ! decorateMessage(JobManagerStatusAlive)
case RemoveJob(jobID, clearPersistedJob) =>
currentJobs.get(jobID) match {
case Some((graph, info)) =>
removeJob(graph.getJobID, clearPersistedJob) match {
case Some(futureToComplete) =>
futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) :+ futureToComplete)
case None =>
}
case None =>
}
case RemoveCachedJob(jobID) =>
currentJobs.get(jobID) match {
case Some((graph, info)) =>
if (graph.getState.isTerminalState) {
removeJob(graph.getJobID, true) match {
case Some(futureToComplete) =>
futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) :+ futureToComplete)
case None =>
}
} else {
// triggers removal upon completion of job
info.sessionAlive = false
}
case None =>
}
case Disconnect(msg) =>
val taskManager = sender()
if (instanceManager.isRegistered(taskManager)) {
log.info(s"Task manager ${taskManager.path} wants to disconnect, because $msg.")
instanceManager.unregisterTaskManager(taskManager, false)
context.unwatch(taskManager)
}
case RequestLeaderSessionID =>
sender() ! ResponseLeaderSessionID(leaderSessionID.orNull)
case RequestWebMonitorPort =>
sender() ! ResponseWebMonitorPort(webMonitorPort)
}
/**
* Submits a job to the job manager. The job is registered at the libraryCacheManager which
* creates the job's class loader. The job graph is appended to the corresponding execution
* graph and the execution vertices are queued for scheduling.
*
* @param jobGraph representing the Flink job
* @param jobInfo the job info
* @param isRecovery Flag indicating whether this is a recovery or initial submission
*/
private def submitJob(jobGraph: JobGraph, jobInfo: JobInfo, isRecovery: Boolean = false): Unit = {
if (jobGraph == null) {
jobInfo.client ! decorateMessage(JobResultFailure(
new SerializedThrowable(
new JobSubmissionException(null, "JobGraph must not be null.")
)
))
}
else {
val jobId = jobGraph.getJobID
val jobName = jobGraph.getName
var executionGraph: ExecutionGraph = null
log.info(s"Submitting job $jobId ($jobName)" + (if (isRecovery) " (Recovery)" else "") + ".")
try {
// Important: We need to make sure that the library registration is the first action,
// because this makes sure that the uploaded jar files are removed in case of
// an unsuccessful submission.
try {
libraryCacheManager.registerJob(jobGraph.getJobID, jobGraph.getUserJarBlobKeys,
jobGraph.getClasspaths)
}
catch {
case t: Throwable =>
throw new JobSubmissionException(jobId,
"Cannot set up the user code libraries: " + t.getMessage, t)
}
val userCodeLoader = libraryCacheManager.getClassLoader(jobGraph.getJobID)
if (userCodeLoader == null) {
throw new JobSubmissionException(jobId,
"The user code class loader could not be initialized.")
}
if (jobGraph.getNumberOfVertices == 0) {
throw new JobSubmissionException(jobId, "The given job is empty")
}
// see if there already exists an ExecutionGraph for the corresponding job ID
executionGraph = currentJobs.get(jobGraph.getJobID) match {
case Some((graph, currentJobInfo)) =>
currentJobInfo.setLastActive()
graph
case None =>
val graph = new ExecutionGraph(
executionContext,
jobGraph.getJobID,
jobGraph.getName,
jobGraph.getJobConfiguration,
timeout,
jobGraph.getUserJarBlobKeys,
jobGraph.getClasspaths,
userCodeLoader)
currentJobs.put(jobGraph.getJobID, (graph, jobInfo))
graph
}
// configure the execution graph
val jobNumberRetries = if (jobGraph.getNumberOfExecutionRetries() >= 0) {
jobGraph.getNumberOfExecutionRetries()
} else {
defaultExecutionRetries
}
val executionRetryDelay = if (jobGraph.getExecutionRetryDelay() >= 0) {
jobGraph.getExecutionRetryDelay()
}
else {
delayBetweenRetries
}
executionGraph.setNumberOfRetriesLeft(jobNumberRetries)
executionGraph.setDelayBeforeRetrying(executionRetryDelay)
executionGraph.setScheduleMode(jobGraph.getScheduleMode())
executionGraph.setQueuedSchedulingAllowed(jobGraph.getAllowQueuedScheduling())
try {
executionGraph.setJsonPlan(JsonPlanGenerator.generatePlan(jobGraph))
}
catch {
case t: Throwable =>
log.warn("Cannot create JSON plan for job", t)
executionGraph.setJsonPlan("{}")
}
// initialize the vertices that have a master initialization hook
// file output formats create directories here, input formats create splits
if (log.isDebugEnabled) {
log.debug(s"Running initialization on master for job ${jobId} (${jobName}).")
}
val numSlots = scheduler.getTotalNumberOfSlots()
for (vertex <- jobGraph.getVertices.asScala) {
val executableClass = vertex.getInvokableClassName
if (executableClass == null || executableClass.length == 0) {
throw new JobSubmissionException(jobId,
s"The vertex ${vertex.getID} (${vertex.getName}) has no invokable class.")
}
if (vertex.getParallelism() == ExecutionConfig.PARALLELISM_AUTO_MAX) {
vertex.setParallelism(numSlots)
}
try {
vertex.initializeOnMaster(userCodeLoader)
}
catch {
case t: Throwable =>
throw new JobExecutionException(jobId,
"Cannot initialize task '" + vertex.getName() + "': " + t.getMessage, t)
}
}
// topologically sort the job vertices and attach the graph to the existing one
val sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources()
if (log.isDebugEnabled) {
log.debug(s"Adding ${sortedTopology.size()} vertices from " +
s"job graph ${jobId} (${jobName}).")
}
executionGraph.attachJobGraph(sortedTopology)
if (log.isDebugEnabled) {
log.debug("Successfully created execution graph from job " +
s"graph ${jobId} (${jobName}).")
}
// configure the state checkpointing
val snapshotSettings = jobGraph.getSnapshotSettings
if (snapshotSettings != null) {
val jobId = jobGraph.getJobID()
val idToVertex: JobVertexID => ExecutionJobVertex = id => {
val vertex = executionGraph.getJobVertex(id)
if (vertex == null) {
throw new JobSubmissionException(jobId,
"The snapshot checkpointing settings refer to non-existent vertex " + id)
}
vertex
}
val triggerVertices: java.util.List[ExecutionJobVertex] =
snapshotSettings.getVerticesToTrigger().asScala.map(idToVertex).asJava
val ackVertices: java.util.List[ExecutionJobVertex] =
snapshotSettings.getVerticesToAcknowledge().asScala.map(idToVertex).asJava
val confirmVertices: java.util.List[ExecutionJobVertex] =
snapshotSettings.getVerticesToConfirm().asScala.map(idToVertex).asJava
val completedCheckpoints = checkpointRecoveryFactory
.createCompletedCheckpoints(jobId, userCodeLoader)
val checkpointIdCounter = checkpointRecoveryFactory.createCheckpointIDCounter(jobId)
executionGraph.enableSnapshotCheckpointing(
snapshotSettings.getCheckpointInterval,
snapshotSettings.getCheckpointTimeout,
triggerVertices,
ackVertices,
confirmVertices,
context.system,
leaderSessionID.orNull,
checkpointIdCounter,
completedCheckpoints,
recoveryMode)
}
// get notified about job status changes
executionGraph.registerJobStatusListener(
new AkkaActorGateway(self, leaderSessionID.orNull))
if (jobInfo.listeningBehaviour == ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES) {
// the sender wants to be notified about state changes
val gateway = new AkkaActorGateway(jobInfo.client, leaderSessionID.orNull)
executionGraph.registerExecutionListener(gateway)
executionGraph.registerJobStatusListener(gateway)
}
}
catch {
case t: Throwable =>
log.error(s"Failed to submit job ${jobId} (${jobName})", t)
libraryCacheManager.unregisterJob(jobId)
currentJobs.remove(jobId)
if (executionGraph != null) {
executionGraph.fail(t)
}
val rt: Throwable = if (t.isInstanceOf[JobExecutionException]) {
t
} else {
new JobExecutionException(jobId, s"Failed to submit job ${jobId} (${jobName})", t)
}
jobInfo.client ! decorateMessage(JobResultFailure(new SerializedThrowable(rt)))
return
}
// execute the recovery/writing the jobGraph into the SubmittedJobGraphStore asynchronously
// because it is a blocking operation
future {
try {
if (isRecovery) {
executionGraph.restoreLatestCheckpointedState()
}
else {
submittedJobGraphs.putJobGraph(new SubmittedJobGraph(jobGraph, jobInfo))
}
jobInfo.client ! decorateMessage(JobSubmitSuccess(jobGraph.getJobID))
if (leaderElectionService.hasLeadership) {
// There is a small chance that multiple job managers schedule the same job if they
// try to recover at the same time. This will eventually be noticed, but cannot be
// ruled out from the beginning.
// NOTE: Scheduling the job for execution is a separate action from the job submission.
// The success of submitting the job must be independent from the success of scheduling
// the job.
log.info(s"Scheduling job $jobId ($jobName).")
executionGraph.scheduleForExecution(scheduler)
} else {
// Remove the job graph. Otherwise it will be lingering around and possibly removed from
// ZooKeeper by this JM.
self ! decorateMessage(RemoveJob(jobId, false))
log.warn(s"Submitted job $jobId, but not leader. The other leader needs to recover " +
"this. I am not scheduling the job for execution.")
}
} catch {
case t: Throwable => try {
executionGraph.fail(t)
}
catch {
case tt: Throwable => {
log.error("Error while marking ExecutionGraph as failed.", tt)
}
}
}
}(context.dispatcher)
}
}
/**
* Dedicated handler for checkpoint messages.
*
* @param actorMessage The checkpoint actor message.
*/
private def handleCheckpointMessage(actorMessage: AbstractCheckpointMessage): Unit = {
actorMessage match {
case ackMessage: AcknowledgeCheckpoint =>
val jid = ackMessage.getJob()
currentJobs.get(jid) match {
case Some((graph, _)) =>
val coordinator = graph.getCheckpointCoordinator()
if (coordinator != null) {
future {
try {
coordinator.receiveAcknowledgeMessage(ackMessage)
}
catch {
case t: Throwable =>
log.error(s"Error in CheckpointCoordinator while processing $ackMessage", t)
}
}(context.dispatcher)
}
else {
log.error(
s"Received ConfirmCheckpoint message for job $jid with no CheckpointCoordinator")
}
case None => log.error(s"Received ConfirmCheckpoint for unavailable job $jid")
}
// unknown checkpoint message
case _ => unhandled(actorMessage)
}
}
/**
* Handle unmatched messages with an exception.
*/
override def unhandled(message: Any): Unit = {
// let the actor crash
throw new RuntimeException("Received unknown message " + message)
}
/**
* Handle messages that request or report accumulators.
*
* @param message The accumulator message.
*/
private def handleAccumulatorMessage(message: AccumulatorMessage): Unit = {
message match {
case RequestAccumulatorResults(jobID) =>
try {
currentJobs.get(jobID) match {
case Some((graph, jobInfo)) =>
val accumulatorValues = graph.getAccumulatorsSerialized()
sender() ! decorateMessage(AccumulatorResultsFound(jobID, accumulatorValues))
case None =>
archive.forward(message)
}
} catch {
case e: Exception =>
log.error("Cannot serialize accumulator result.", e)
sender() ! decorateMessage(AccumulatorResultsErroneous(jobID, e))
}
case RequestAccumulatorResultsStringified(jobId) =>
currentJobs.get(jobId) match {
case Some((graph, jobInfo)) =>
val stringifiedAccumulators = graph.getAccumulatorResultsStringified()
sender() ! decorateMessage(
AccumulatorResultStringsFound(jobId, stringifiedAccumulators)
)
case None =>
archive.forward(message)
}
case unknown =>
log.warn(s"Received unknown AccumulatorMessage: $unknown")
}
}
/**
* Dedicated handler for monitor info request messages.
*
* Note that this handler does not fail. Errors while responding to info messages are logged,
* but will not cause the actor to crash.
*
* @param actorMessage The info request message.
*/
private def handleInfoRequestMessage(actorMessage: InfoMessage, theSender: ActorRef): Unit = {
try {
actorMessage match {
case _ : RequestJobsOverview =>
// get our own overview
val ourJobs = createJobStatusOverview()
// get the overview from the archive
val future = (archive ? RequestJobsOverview.getInstance())(timeout)
future.onSuccess {
case archiveOverview: JobsOverview =>
theSender ! new JobsOverview(ourJobs, archiveOverview)
}(context.dispatcher)
case _ : RequestJobsWithIDsOverview =>
// get our own overview
val ourJobs = createJobStatusWithIDsOverview()
// get the overview from the archive
val future = (archive ? RequestJobsWithIDsOverview.getInstance())(timeout)
future.onSuccess {
case archiveOverview: JobsWithIDsOverview =>
theSender ! new JobsWithIDsOverview(ourJobs, archiveOverview)
}(context.dispatcher)
case _ : RequestStatusOverview =>
val ourJobs = createJobStatusOverview()
val numTMs = instanceManager.getNumberOfRegisteredTaskManagers()
val numSlotsTotal = instanceManager.getTotalNumberOfSlots()
val numSlotsAvailable = instanceManager.getNumberOfAvailableSlots()
// add to that the jobs from the archive
val future = (archive ? RequestJobsOverview.getInstance())(timeout)
future.onSuccess {
case archiveOverview: JobsOverview =>
theSender ! new StatusOverview(numTMs, numSlotsTotal, numSlotsAvailable,
ourJobs, archiveOverview)
}(context.dispatcher)
case msg : RequestJobDetails =>
val ourDetails: Array[JobDetails] = if (msg.shouldIncludeRunning()) {
currentJobs.values.map {
v => WebMonitorUtils.createDetailsForJob(v._1)
}.toArray[JobDetails]
} else {
null
}
if (msg.shouldIncludeFinished()) {
val future = (archive ? msg)(timeout)
future.onSuccess {
case archiveDetails: MultipleJobsDetails =>
theSender ! new MultipleJobsDetails(ourDetails, archiveDetails.getFinishedJobs())
}(context.dispatcher)
} else {
theSender ! new MultipleJobsDetails(ourDetails, null)
}
case _ => log.error("Unrecognized info message " + actorMessage)
}
}
catch {
case e: Throwable => log.error(s"Error responding to message $actorMessage", e)
}
}
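/** Creates an overview of how many of the currently held jobs are running or pending,
* finished, canceled, or failed. */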
private def createJobStatusOverview() : JobsOverview = {
var runningOrPending = 0
var finished = 0
var canceled = 0
var failed = 0
currentJobs.values.foreach {
_._1.getState() match {
case JobStatus.FINISHED => finished += 1
case JobStatus.CANCELED => canceled += 1
case JobStatus.FAILED => failed += 1
case _ => runningOrPending += 1
}
}
new JobsOverview(runningOrPending, finished, canceled, failed)
}
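/** Creates an overview that lists the IDs of the currently held jobs grouped by job status. */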
private def createJobStatusWithIDsOverview() : JobsWithIDsOverview = {
val runningOrPending = new java.util.ArrayList[JobID]()
val finished = new java.util.ArrayList[JobID]()
val canceled = new java.util.ArrayList[JobID]()
val failed = new java.util.ArrayList[JobID]()
currentJobs.values.foreach { case (graph, _) =>
graph.getState() match {
case JobStatus.FINISHED => finished.add(graph.getJobID)
case JobStatus.CANCELED => canceled.add(graph.getJobID)
case JobStatus.FAILED => failed.add(graph.getJobID)
case _ => runningOrPending.add(graph.getJobID)
}
}
new JobsWithIDsOverview(runningOrPending, finished, canceled, failed)
}
/**
* Removes the job and sends it to the MemoryArchivist.
*
* This should be called asynchronously. Removing the job from the [[SubmittedJobGraphStore]]
* might block. Therefore be careful not to block the actor thread.
*
* @param jobID ID of the job to remove and archive
* @param removeJobFromStateBackend true if the job shall be archived and removed from the state
* backend
*/
private def removeJob(jobID: JobID, removeJobFromStateBackend: Boolean): Option[Future[Unit]] = {
// Don't remove the job yet...
val futureOption = currentJobs.get(jobID) match {
case Some((eg, _)) =>
val result = if (removeJobFromStateBackend) {
val futureOption = Some(future {
try {
// ...otherwise, we can have lingering resources when there is a concurrent shutdown
// and the ZooKeeper client is closed. Not removing the job immediately allows the
// shutdown to release all resources.
submittedJobGraphs.removeJobGraph(jobID)
} catch {
case t: Throwable => log.error(s"Could not remove submitted job graph $jobID.", t)
}
}(context.dispatcher))
try {
eg.prepareForArchiving()
archive ! decorateMessage(ArchiveExecutionGraph(jobID, eg))
} catch {
case t: Throwable => log.error(s"Could not prepare the execution graph $eg for " +
"archiving.", t)
}
futureOption
} else {
None
}
currentJobs.remove(jobID)
result
case None => None
}
try {
libraryCacheManager.unregisterJob(jobID)
} catch {
case t: Throwable =>
log.error(s"Could not properly unregister job $jobID form the library cache.", t)
}
futureOption
}
/** Fails all currently running jobs and empties the list of currently running jobs. If the
* [[JobClientActor]] waits for a result, then a [[JobExecutionException]] is sent.
*
* @param cause Cause for cancelling the jobs
* @param removeJobFromStateBackend true if the jobs shall also be removed from the submitted
* job graph store
*/
private def cancelAndClearEverything(
cause: Throwable,
removeJobFromStateBackend: Boolean)
: Seq[Future[Unit]] = {
val futures = for ((jobID, (eg, jobInfo)) <- currentJobs) yield {
future {
if (removeJobFromStateBackend) {
try {
submittedJobGraphs.removeJobGraph(jobID)
}
catch {
case t: Throwable => {
log.error("Error during submitted job graph clean up.", t)
}
}
}
eg.fail(cause)
if (jobInfo.listeningBehaviour != ListeningBehaviour.DETACHED) {
jobInfo.client ! decorateMessage(
Failure(new JobExecutionException(jobID, "All jobs are cancelled and cleared.", cause)))
}
}(context.dispatcher)
}
currentJobs.clear()
futures.toSeq
}
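/**
* Callback from the [[LeaderElectionService]] when this JobManager is granted leadership.
* The work is handed to the actor thread by sending a [[GrantLeadership]] message to self.
*
* @param newLeaderSessionID The leader session ID associated with the granted leadership
*/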
override def grantLeadership(newLeaderSessionID: UUID): Unit = {
self ! decorateMessage(GrantLeadership(Option(newLeaderSessionID)))
}
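/**
* Callback from the [[LeaderElectionService]] when leadership is revoked. Clears the leader
* session ID and notifies the actor via a [[RevokeLeadership]] message.
*/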
override def revokeLeadership(): Unit = {
leaderSessionID = None
self ! decorateMessage(RevokeLeadership)
}
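/**
* Callback from the [[SubmittedJobGraphStore]] when a job graph was added to the store.
* Triggers recovery of the job if this JobManager is the leader and does not yet know the job.
*
* @param jobId ID of the added job
*/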
override def onAddedJobGraph(jobId: JobID): Unit = {
if (leaderSessionID.isDefined && !currentJobs.contains(jobId)) {
self ! decorateMessage(RecoverJob(jobId))
}
}
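/**
* Callback from the [[SubmittedJobGraphStore]] when a job graph was removed from the store.
* If this JobManager is the leader and still holds the job, the execution graph is failed.
*
* @param jobId ID of the removed job
*/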
override def onRemovedJobGraph(jobId: JobID): Unit = {
if (leaderSessionID.isDefined) {
currentJobs.get(jobId).foreach(
job =>
future {
// Fail the execution graph
job._1.fail(new IllegalStateException("Another JobManager removed the job from " +
"ZooKeeper."))
}(context.dispatcher)
)
}
}
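/** Returns the Akka URL under which this JobManager actor is reachable. */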
override def getAddress: String = {
AkkaUtils.getAkkaURL(context.system, self)
}
/** Handles errors occurring in the leader election service.
*
* @param exception Exception reported by the leader election service
*/
override def handleError(exception: Exception): Unit = {
log.error("Received an error from the LeaderElectionService.", exception)
// terminate JobManager in case of an error
self ! decorateMessage(PoisonPill)
}
/**
* Updates the accumulators reported from a task manager via the Heartbeat message.
* @param accumulators list of accumulator snapshots
*/
private def updateAccumulators(accumulators : Seq[AccumulatorSnapshot]) = {
accumulators foreach {
case accumulatorEvent =>
currentJobs.get(accumulatorEvent.getJobID) match {
case Some((jobGraph, jobInfo)) =>
future {
jobGraph.updateAccumulators(accumulatorEvent)
}(context.dispatcher)
case None =>
// ignore accumulator values for old job
}
}
}
}
/**
* Job Manager companion object. Contains the entry point (main method) to run the JobManager in a
* standalone fashion. Also contains various utility methods to start the JobManager and to
* look up the JobManager actor reference.
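*
* A hedged usage sketch, assuming `args` holds the command line arguments (this mirrors what
* the main method does and is not the only way to start a JobManager):
* {{{
*   // parse the command line and run the JobManager with the resulting settings
*   val (config, executionMode, streamingMode, host, port) = JobManager.parseArgs(args)
*   JobManager.runJobManager(config, executionMode, streamingMode, host, port)
* }}}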
*/
object JobManager {
val LOG = Logger(classOf[JobManager])
val STARTUP_FAILURE_RETURN_CODE = 1
val RUNTIME_FAILURE_RETURN_CODE = 2
/** Name of the JobManager actor */
val JOB_MANAGER_NAME = "jobmanager"
/** Name of the archive actor */
val ARCHIVE_NAME = "archive"
/**
* Entry point (main method) to run the JobManager in a standalone fashion.
*
* @param args The command line arguments.
*/
def main(args: Array[String]): Unit = {
// startup checks and logging
EnvironmentInformation.logEnvironmentInfo(LOG.logger, "JobManager", args)
EnvironmentInformation.checkJavaVersion()
// parsing the command line arguments
val (configuration: Configuration,
executionMode: JobManagerMode,
streamingMode: StreamingMode,
listeningHost: String, listeningPort: Int) =
try {
parseArgs(args)
}
catch {
case t: Throwable => {
LOG.error(t.getMessage(), t)
System.exit(STARTUP_FAILURE_RETURN_CODE)
null
}
}
// we want to check that the JobManager hostname is in the config
// if it is not in there, the actor system will bind to the loopback interface's
// address and will not be reachable from anyone remote
if (listeningHost == null) {
val message = "Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY +
"' is missing (hostname/address to bind JobManager to)."
LOG.error(message)
System.exit(STARTUP_FAILURE_RETURN_CODE)
}
if (ZooKeeperUtils.isZooKeeperRecoveryMode(configuration)) {
// in ZooKeeper recovery mode, the JobManager port must be configured to 0 (a free port is picked)
if (listeningPort != 0) {
val message = "Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY +
"' is invalid, it must be equal to 0."
LOG.error(message)
System.exit(STARTUP_FAILURE_RETURN_CODE)
}
} else {
// otherwise, the configured port must lie in the valid port range (1 - 65535)
if (listeningPort <= 0 || listeningPort >= 65536) {
val message = "Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY +
"' is invalid, it must be greater than 0 and less than 65536."
LOG.error(message)
System.exit(STARTUP_FAILURE_RETURN_CODE)
}
}
// run the job manager
try {
if (SecurityUtils.isSecurityEnabled) {
LOG.info("Security is enabled. Starting secure JobManager.")
SecurityUtils.runSecured(new FlinkSecuredRunner[Unit] {
override def run(): Unit = {
runJobManager(
configuration,
executionMode,
streamingMode,
listeningHost,
listeningPort)
}
})
}
else {
LOG.info("Security is not enabled. Starting non-authenticated JobManager.")
runJobManager(
configuration,
executionMode,
streamingMode,
listeningHost,
listeningPort)
}
}
catch {
case t: Throwable => {
LOG.error("Failed to run JobManager.", t)
System.exit(STARTUP_FAILURE_RETURN_CODE)
}
}
}
/**
* Starts and runs the JobManager with all its components. First, this method starts a
* dedicated actor system for the JobManager. Second, it starts all components of the
* JobManager (including library cache, instance manager, scheduler). Finally, it starts
* the JobManager actor itself.
*
* This method blocks indefinitely (or until the JobManager's actor system is shut down).
*
* @param configuration The configuration object for the JobManager.
* @param executionMode The execution mode in which to run. Execution mode LOCAL will spawn an
* additional TaskManager in the same process.
* @param streamingMode The streaming mode to run the system in (streaming vs. batch-only)
* @param listeningAddress The hostname where the JobManager should listen for messages.
* @param listeningPort The port where the JobManager should listen for messages.
*/
def runJobManager(
configuration: Configuration,
executionMode: JobManagerMode,
streamingMode: StreamingMode,
listeningAddress: String,
listeningPort: Int)
: Unit = {
val (jobManagerSystem, _, _, _) = startActorSystemAndJobManagerActors(
configuration,
executionMode,
streamingMode,
listeningAddress,
listeningPort,
classOf[JobManager],
classOf[MemoryArchivist]
)
// block until everything is shut down
jobManagerSystem.awaitTermination()
}
/** Starts an ActorSystem, the JobManager and all its components including the WebMonitor.
*
* @param configuration The configuration object for the JobManager
* @param executionMode The execution mode in which to run. Execution mode LOCAL will spawn an
* additional TaskManager in the same process.
* @param streamingMode The streaming mode to run the system in (streaming vs. batch-only)
* @param listeningAddress The hostname where the JobManager should listen for messages.
* @param listeningPort The port where the JobManager should listen for messages
* @param jobManagerClass The class of the JobManager to be started
* @param archiveClass The class of the Archivist to be started
* @return A tuple containing the started ActorSystem, ActorRefs to the JobManager and the
* Archivist and an Option containing a possibly started WebMonitor
*/
def startActorSystemAndJobManagerActors(
configuration: Configuration,
executionMode: JobManagerMode,
streamingMode: StreamingMode,
listeningAddress: String,
listeningPort: Int,
jobManagerClass: Class[_ <: JobManager],
archiveClass: Class[_ <: MemoryArchivist])
: (ActorSystem, ActorRef, ActorRef, Option[WebMonitor]) = {
LOG.info("Starting JobManager")
// Bring up the job manager actor system first, bind it to the given address.
val hostPortUrl = NetUtils.hostAndPortToUrlString(listeningAddress, listeningPort)
LOG.info(s"Starting JobManager actor system at $hostPortUrl")
val jobManagerSystem = try {
val akkaConfig = AkkaUtils.getAkkaConfig(
configuration,
Some((listeningAddress, listeningPort))
)
if (LOG.isDebugEnabled) {
LOG.debug("Using akka configuration\n " + akkaConfig)
}
AkkaUtils.createActorSystem(akkaConfig)
}
catch {
case t: Throwable => {
if (t.isInstanceOf[org.jboss.netty.channel.ChannelException]) {
val cause = t.getCause()
if (cause != null && t.getCause().isInstanceOf[java.net.BindException]) {
val address = listeningAddress + ":" + listeningPort
throw new Exception("Unable to create JobManager at address " + address +
" - " + cause.getMessage(), t)
}
}
throw new Exception("Could not create JobManager actor system", t)
}
}
val address = AkkaUtils.getAddress(jobManagerSystem)
configuration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, address.host.get)
configuration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, address.port.get)
val webMonitor: Option[WebMonitor] =
if (configuration.getInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, 0) >= 0) {
LOG.info("Starting JobManger web frontend")
val leaderRetrievalService = LeaderRetrievalUtils
.createLeaderRetrievalService(configuration)
// start the web frontend. we need to load this dynamically
// because it is not in the same project/dependencies
val webServer = WebMonitorUtils.startWebRuntimeMonitor(
configuration,
leaderRetrievalService,
jobManagerSystem)
Option(webServer)
}
else {
None
}
// Reset the port (necessary in case of automatic port selection)
webMonitor.foreach{ monitor => configuration.setInteger(
ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, monitor.getServerPort) }
try {
// bring up the job manager actor
LOG.info("Starting JobManager actor")
val (jobManager, archive) = startJobManagerActors(
configuration,
jobManagerSystem,
streamingMode,
jobManagerClass,
archiveClass)
// start a process reaper that watches the JobManager. If the JobManager actor dies,
// the process reaper will kill the JVM process (to ensure easy failure detection)
LOG.debug("Starting JobManager process reaper")
jobManagerSystem.actorOf(
Props(
classOf[ProcessReaper],
jobManager,
LOG.logger,
RUNTIME_FAILURE_RETURN_CODE),
"JobManager_Process_Reaper")
// bring up a local task manager, if needed
if (executionMode == JobManagerMode.LOCAL) {
LOG.info("Starting embedded TaskManager for JobManager's LOCAL execution mode")
val taskManagerActor = TaskManager.startTaskManagerComponentsAndActor(
configuration,
jobManagerSystem,
listeningAddress,
Some(TaskManager.TASK_MANAGER_NAME),
None,
true,
streamingMode,
classOf[TaskManager])
LOG.debug("Starting TaskManager process reaper")
jobManagerSystem.actorOf(
Props(
classOf[ProcessReaper],
taskManagerActor,
LOG.logger,
RUNTIME_FAILURE_RETURN_CODE),
"TaskManager_Process_Reaper")
}
webMonitor.foreach {
monitor =>
val jobManagerAkkaUrl = JobManager.getRemoteJobManagerAkkaURL(configuration)
monitor.start(jobManagerAkkaUrl)
}
(jobManagerSystem, jobManager, archive, webMonitor)
}
catch {
case t: Throwable => {
LOG.error("Error while starting up JobManager", t)
try {
jobManagerSystem.shutdown()
} catch {
case tt: Throwable => LOG.warn("Could not cleanly shut down actor system", tt)
}
throw t
}
}
}
/**
* Loads the configuration, execution mode and the listening address from the provided command
* line arguments.
*
* @param args command line arguments
* @return Tuple of configuration, execution mode, streaming mode, listening host and listening port
*/
def parseArgs(args: Array[String]):
(Configuration, JobManagerMode, StreamingMode, String, Int) = {
val parser = new scopt.OptionParser[JobManagerCliOptions]("JobManager") {
head("Flink JobManager")
opt[String]("configDir") action { (arg, conf) =>
conf.setConfigDir(arg)
conf
} text {
"The configuration directory."
}
opt[String]("executionMode") action { (arg, conf) =>
conf.setJobManagerMode(arg)
conf
} text {
"The execution mode of the JobManager (CLUSTER / LOCAL)"
}
opt[String]("streamingMode").optional().action { (arg, conf) =>
conf.setStreamingMode(arg)
conf
} text {
"The streaming mode of the JobManager (STREAMING / BATCH)"
}
opt[String]("host").optional().action { (arg, conf) =>
conf.setHost(arg)
conf
} text {
"Network address for communication with the job manager"
}
opt[Int]("webui-port").optional().action { (arg, conf) =>
conf.setWebUIPort(arg)
conf
} text {
"Port for the UI web server"
}
}
val config = parser.parse(args, new JobManagerCliOptions()).getOrElse {
throw new Exception(
s"Invalid command line agruments: ${args.mkString(" ")}. Usage: ${parser.usage}")
}
val configDir = config.getConfigDir()
if (configDir == null) {
throw new Exception("Missing parameter '--configDir'")
}
if (config.getJobManagerMode() == null) {
throw new Exception("Missing parameter '--executionMode'")
}
LOG.info("Loading configuration from " + configDir)
GlobalConfiguration.loadConfiguration(configDir)
val configuration = GlobalConfiguration.getConfiguration()
if (new File(configDir).isDirectory) {
configuration.setString(ConfigConstants.FLINK_BASE_DIR_PATH_KEY, configDir + "/..")
}
if (config.getWebUIPort() >= 0) {
configuration.setInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, config.getWebUIPort())
}
if (config.getHost() != null) {
configuration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, config.getHost())
}
val host = configuration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null)
// high availability mode
val port: Int =
if (ZooKeeperUtils.isZooKeeperRecoveryMode(configuration)) {
LOG.info("Starting JobManager in High-Availability Mode")
configuration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, 0)
0
}
else {
LOG.info("Staring JobManager without high-availability")
configuration.getInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT)
}
val executionMode = config.getJobManagerMode
val streamingMode = config.getStreamingMode
val hostPortUrl = NetUtils.hostAndPortToUrlString(host, port)
LOG.info(s"Starting JobManager on $hostPortUrl with execution mode $executionMode and " +
s"streaming mode $streamingMode")
(configuration, executionMode, streamingMode, host, port)
}
/**
* Creates the components the JobManager is built of: execution context, instance manager,
* scheduler, library cache manager, execution retry settings, timeout, archive count,
* leader election service, submitted job graph store, and checkpoint recovery factory.
*
* @param configuration The configuration from which to parse the config values.
* @param leaderElectionServiceOption LeaderElectionService which shall be returned if the option
* is defined
* @return The members for a default JobManager.
*/
def createJobManagerComponents(
configuration: Configuration,
leaderElectionServiceOption: Option[LeaderElectionService]) :
(ExecutionContext,
InstanceManager,
FlinkScheduler,
BlobLibraryCacheManager,
Int, // execution retries
Long, // delay between retries
FiniteDuration, // timeout
Int, // number of archived jobs
LeaderElectionService,
SubmittedJobGraphStore,
CheckpointRecoveryFactory) = {
val timeout: FiniteDuration = AkkaUtils.getTimeout(configuration)
val cleanupInterval = configuration.getLong(
ConfigConstants.LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL,
ConfigConstants.DEFAULT_LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL) * 1000
val executionRetries = configuration.getInteger(
ConfigConstants.DEFAULT_EXECUTION_RETRIES_KEY,
ConfigConstants.DEFAULT_EXECUTION_RETRIES)
val archiveCount = configuration.getInteger(ConfigConstants.JOB_MANAGER_WEB_ARCHIVE_COUNT,
ConfigConstants.DEFAULT_JOB_MANAGER_WEB_ARCHIVE_COUNT)
// configure the delay between execution retries.
// unless explicitly specified, this is dependent on the heartbeat timeout
val pauseString = configuration.getString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE,
ConfigConstants.DEFAULT_AKKA_ASK_TIMEOUT)
val delayString = configuration.getString(ConfigConstants.DEFAULT_EXECUTION_RETRY_DELAY_KEY,
pauseString)
val delayBetweenRetries: Long = try {
Duration(delayString).toMillis
}
catch {
case n: NumberFormatException => throw new Exception(
s"Invalid config value for ${ConfigConstants.DEFAULT_EXECUTION_RETRY_DELAY_KEY}: " +
s"${pauseString}. Value must be a valid duration (such as 100 milli or 1 min)");
}
val executionContext = ExecutionContext.fromExecutor(new ForkJoinPool())
var blobServer: BlobServer = null
var instanceManager: InstanceManager = null
var scheduler: FlinkScheduler = null
var libraryCacheManager: BlobLibraryCacheManager = null
try {
blobServer = new BlobServer(configuration)
instanceManager = new InstanceManager()
scheduler = new FlinkScheduler(executionContext)
libraryCacheManager = new BlobLibraryCacheManager(blobServer, cleanupInterval)
instanceManager.addInstanceListener(scheduler)
}
catch {
case t: Throwable => {
if (libraryCacheManager != null) {
libraryCacheManager.shutdown()
}
if (scheduler != null) {
scheduler.shutdown()
}
if (instanceManager != null) {
instanceManager.shutdown()
}
if (blobServer != null) {
blobServer.shutdown()
}
throw t
}
}
// Create recovery related components
val (leaderElectionService, submittedJobGraphs, checkpointRecoveryFactory) =
RecoveryMode.fromConfig(configuration) match {
case RecoveryMode.STANDALONE =>
val leaderElectionService = leaderElectionServiceOption match {
case Some(les) => les
case None => new StandaloneLeaderElectionService()
}
(leaderElectionService,
new StandaloneSubmittedJobGraphStore(),
new StandaloneCheckpointRecoveryFactory())
case RecoveryMode.ZOOKEEPER =>
val client = ZooKeeperUtils.startCuratorFramework(configuration)
val leaderElectionService = leaderElectionServiceOption match {
case Some(les) => les
case None => ZooKeeperUtils.createLeaderElectionService(client, configuration)
}
(leaderElectionService,
ZooKeeperUtils.createSubmittedJobGraphs(client, configuration),
new ZooKeeperCheckpointRecoveryFactory(client, configuration))
}
(executionContext,
instanceManager,
scheduler,
libraryCacheManager,
executionRetries,
delayBetweenRetries,
timeout,
archiveCount,
leaderElectionService,
submittedJobGraphs,
checkpointRecoveryFactory)
}
/**
* Starts the JobManager and job archiver based on the given configuration, in the
* given actor system.
*
* @param configuration The configuration for the JobManager
* @param actorSystem The actor system running the JobManager
* @param streamingMode The execution mode
* @param jobManagerClass The class of the JobManager to be started
* @param archiveClass The class of the MemoryArchivist to be started
*
* @return A tuple of references (JobManager Ref, Archiver Ref)
*/
def startJobManagerActors(
configuration: Configuration,
actorSystem: ActorSystem,
streamingMode: StreamingMode,
jobManagerClass: Class[_ <: JobManager],
archiveClass: Class[_ <: MemoryArchivist])
: (ActorRef, ActorRef) = {
startJobManagerActors(
configuration,
actorSystem,
Some(JOB_MANAGER_NAME),
Some(ARCHIVE_NAME),
streamingMode,
jobManagerClass,
archiveClass)
}
/**
* Starts the JobManager and job archiver based on the given configuration, in the
* given actor system.
*
* @param configuration The configuration for the JobManager
* @param actorSystem The actor system running the JobManager
* @param jobManagerActorName Optionally the name of the JobManager actor. If none is given,
* the actor will have the name generated by the actor system.
* @param archiveActorName Optionally the name of the archive actor. If none is given,
* the actor will have the name generated by the actor system.
* @param streamingMode The mode to run the system in (streaming vs. batch-only)
* @param jobManagerClass The class of the JobManager to be started
* @param archiveClass The class of the MemoryArchivist to be started
*
* @return A tuple of references (JobManager Ref, Archiver Ref)
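*
* A usage sketch (illustrative only; assumes `flinkConfig` and `actorSystem` have already
* been created):
* {{{
*   val (jobManager, archive) = JobManager.startJobManagerActors(
*     flinkConfig,
*     actorSystem,
*     Some("jobmanager"),        // explicit JobManager actor name
*     None,                      // let the actor system pick a name for the archiver
*     StreamingMode.BATCH_ONLY,
*     classOf[JobManager],
*     classOf[MemoryArchivist])
* }}}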
*/
def startJobManagerActors(
configuration: Configuration,
actorSystem: ActorSystem,
jobManagerActorName: Option[String],
archiveActorName: Option[String],
streamingMode: StreamingMode,
jobManagerClass: Class[_ <: JobManager],
archiveClass: Class[_ <: MemoryArchivist])
: (ActorRef, ActorRef) = {
val (executionContext,
instanceManager,
scheduler,
libraryCacheManager,
executionRetries,
delayBetweenRetries,
timeout,
archiveCount,
leaderElectionService,
submittedJobGraphs,
checkpointRecoveryFactory) = createJobManagerComponents(
configuration,
None)
val archiveProps = Props(archiveClass, archiveCount)
// start the archiver with the given name, or without (avoid name conflicts)
val archive: ActorRef = archiveActorName match {
case Some(actorName) => actorSystem.actorOf(archiveProps, actorName)
case None => actorSystem.actorOf(archiveProps)
}
val jobManagerProps = Props(
jobManagerClass,
configuration,
executionContext,
instanceManager,
scheduler,
libraryCacheManager,
archive,
executionRetries,
delayBetweenRetries,
timeout,
streamingMode,
leaderElectionService,
submittedJobGraphs,
checkpointRecoveryFactory)
val jobManager: ActorRef = jobManagerActorName match {
case Some(actorName) => actorSystem.actorOf(jobManagerProps, actorName)
case None => actorSystem.actorOf(jobManagerProps)
}
(jobManager, archive)
}
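/**
* Starts an actor with the given props under the standard JobManager actor name.
*
* @param props The Props (actor class and constructor arguments) of the actor to start.
* @param actorSystem The actor system in which to start the actor.
* @return The ActorRef of the started actor.
*/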
def startActor(props: Props, actorSystem: ActorSystem): ActorRef = {
actorSystem.actorOf(props, JOB_MANAGER_NAME)
}
// --------------------------------------------------------------------------
// Resolving the JobManager endpoint
// --------------------------------------------------------------------------
/**
* Builds the akka actor path for the JobManager actor, given the socket address
* where the JobManager's actor system runs.
*
* @param address The address of the JobManager's actor system.
* @param name Optional name of the JobManager actor. If not given, the default JobManager
*             actor name is used.
* @return The akka URL of the JobManager actor.
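*
* For example (illustrative only; the address is a placeholder and the default actor name is
* assumed):
* {{{
*   val url = JobManager.getRemoteJobManagerAkkaURL(new InetSocketAddress("10.0.0.1", 6123))
*   // url is of the form "akka.tcp://flink@10.0.0.1:6123/user/jobmanager"
* }}}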
*/
def getRemoteJobManagerAkkaURL(
address: InetSocketAddress,
name: Option[String] = None)
: String = {
val hostPort = NetUtils.socketAddressToUrlString(address)
getJobManagerAkkaURLHelper(s"akka.tcp://flink@$hostPort", name)
}
/**
* Returns the JobManager actor's remote Akka URL, given the configured hostname and port.
*
* @param config The configuration to parse
* @return JobManager actor remote Akka URL
*/
def getRemoteJobManagerAkkaURL(config: Configuration) : String = {
val (hostname, port) = TaskManager.getAndCheckJobManagerAddress(config)
var hostPort: InetSocketAddress = null
try {
val inetAddress: InetAddress = InetAddress.getByName(hostname)
hostPort = new InetSocketAddress(inetAddress, port)
}
catch {
case e: UnknownHostException => {
throw new UnknownHostException(s"Cannot resolve the JobManager hostname '$hostname' " +
s"specified in the configuration")
}
}
JobManager.getRemoteJobManagerAkkaURL(hostPort, Option.empty)
}
/**
* Builds the akka actor path for the JobManager actor to address the actor within
* its own actor system.
*
* @param name Optional name of the JobManager actor. If not given, the default JobManager
*             actor name is used.
* @return The local akka URL of the JobManager actor.
*/
def getLocalJobManagerAkkaURL(name: Option[String] = None): String = {
getJobManagerAkkaURLHelper("akka://flink", name)
}
def getJobManagerAkkaURL(system: ActorSystem, name: Option[String] = None): String = {
getJobManagerAkkaURLHelper(AkkaUtils.getAddress(system).toString, name)
}
private def getJobManagerAkkaURLHelper(address: String, name: Option[String]): String = {
address + "/user/" + name.getOrElse(JOB_MANAGER_NAME)
}
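/**
* Resolves the JobManager actor reference asynchronously.
*
* @param address The socket address of the JobManager's actor system.
* @param system The local actor system that should perform the lookup.
* @param timeout The maximum time to wait until the lookup fails.
* @return Future containing the ActorRef of the JobManager
*/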
def getJobManagerActorRefFuture(
address: InetSocketAddress,
system: ActorSystem,
timeout: FiniteDuration)
: Future[ActorRef] = {
AkkaUtils.getActorRefFuture(getRemoteJobManagerAkkaURL(address), system, timeout)
}
/**
* Resolves the JobManager actor reference in a blocking fashion.
*
* @param jobManagerUrl The akka URL of the JobManager.
* @param system The local actor system that should perform the lookup.
* @param timeout The maximum time to wait until the lookup fails.
* @throws java.io.IOException Thrown, if the lookup fails.
* @return The ActorRef to the JobManager
*/
@throws(classOf[IOException])
def getJobManagerActorRef(
jobManagerUrl: String,
system: ActorSystem,
timeout: FiniteDuration)
: ActorRef = {
AkkaUtils.getActorRef(jobManagerUrl, system, timeout)
}
/**
* Resolves the JobManager actor reference in a blocking fashion.
*
* @param address The socket address of the JobManager's actor system.
* @param system The local actor system that should perform the lookup.
* @param timeout The maximum time to wait until the lookup fails.
* @throws java.io.IOException Thrown, if the lookup fails.
* @return The ActorRef to the JobManager
*/
@throws(classOf[IOException])
def getJobManagerActorRef(
address: InetSocketAddress,
system: ActorSystem,
timeout: FiniteDuration)
: ActorRef = {
val jmAddress = getRemoteJobManagerAkkaURL(address)
getJobManagerActorRef(jmAddress, system, timeout)
}
/**
* Resolves the JobManager actor reference in a blocking fashion.
*
* @param address The socket address of the JobManager's actor system.
* @param system The local actor system that should perform the lookup.
* @param config The config describing the maximum time to wait until the lookup fails.
* @throws java.io.IOException Thrown, if the lookup fails.
* @return The ActorRef to the JobManager
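*
* A usage sketch (illustrative only; `flinkConfig` and `actorSystem` are assumed to exist,
* and the call blocks until the lookup succeeds or the configured lookup timeout expires):
* {{{
*   val jobManagerRef = JobManager.getJobManagerActorRef(
*     new InetSocketAddress("10.0.0.1", 6123),   // placeholder JobManager address
*     actorSystem,
*     flinkConfig)
* }}}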
*/
@throws(classOf[IOException])
def getJobManagerActorRef(
address: InetSocketAddress,
system: ActorSystem,
config: Configuration)
: ActorRef = {
val timeout = AkkaUtils.getLookupTimeout(config)
getJobManagerActorRef(address, system, timeout)
}
}