2019-07-05T10:52:13,050 ERROR [flink-akka.remote.default-remote-dispatcher-5] org.apache.flink.runtime.rest.handler.legacy.TaskManagerLogHandler - Fetching TaskManager log failed.
java.util.concurrent.CompletionException: java.io.IOException: TaskManager log files are unavailable. Log file could not be found at /mnt/disk1/log/hadoop-yarn/containers/application_1556227576661_152377/container_1556227576661_152377_01_000012/taskmanager.log.
at java.util.concurrent.CompletableFuture.encodeRelay(CompletableFuture.java:326) ~[?:1.8.0_151]
at java.util.concurrent.CompletableFuture.completeRelay(CompletableFuture.java:338) ~[?:1.8.0_151]
at java.util.concurrent.CompletableFuture.uniRelay(CompletableFuture.java:911) ~[?:1.8.0_151]
at java.util.concurrent.CompletableFuture$UniRelay.tryFire(CompletableFuture.java:899) ~[?:1.8.0_151]
at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:474) [?:1.8.0_151]
at java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:1977) [?:1.8.0_151]
at org.apache.flink.runtime.concurrent.FutureUtils$1.onComplete(FutureUtils.java:442) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.dispatch.OnComplete.internal(Future.scala:258) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.dispatch.OnComplete.internal(Future.scala:256) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:186) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:183) [flink-dist_2.11-1.4.0.jar:1.4.0]
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) [cti-report-flink.jar:?]
at org.apache.flink.runtime.concurrent.Executors$DirectExecutionContext.execute(Executors.java:84) [flink-dist_2.11-1.4.0.jar:1.4.0]
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) [cti-report-flink.jar:?]
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) [cti-report-flink.jar:?]
at scala.concurrent.Promise$class.complete(Promise.scala:55) [cti-report-flink.jar:?]
at scala.concurrent.impl.Promise$DefaultPromise.complete(Promise.scala:153) [cti-report-flink.jar:?]
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235) [cti-report-flink.jar:?]
at scala.concurrent.Future$$anonfun$map$1.apply(Future.scala:235) [cti-report-flink.jar:?]
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:32) [cti-report-flink.jar:?]
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.processBatch$1(BatchingExecutor.scala:63) [cti-report-flink.jar:?]
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply$mcV$sp(BatchingExecutor.scala:78) [cti-report-flink.jar:?]
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply(BatchingExecutor.scala:55) [cti-report-flink.jar:?]
at scala.concurrent.BatchingExecutor$Batch$$anonfun$run$1.apply(BatchingExecutor.scala:55) [cti-report-flink.jar:?]
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72) [cti-report-flink.jar:?]
at scala.concurrent.BatchingExecutor$Batch.run(BatchingExecutor.scala:54) [cti-report-flink.jar:?]
at scala.concurrent.Future$InternalCallbackExecutor$.unbatchedExecute(Future.scala:599) [cti-report-flink.jar:?]
at scala.concurrent.BatchingExecutor$class.execute(BatchingExecutor.scala:106) [cti-report-flink.jar:?]
at scala.concurrent.Future$InternalCallbackExecutor$.execute(Future.scala:597) [cti-report-flink.jar:?]
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:40) [cti-report-flink.jar:?]
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:248) [cti-report-flink.jar:?]
at akka.pattern.PromiseActorRef.$bang(AskSupport.scala:534) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.remote.DefaultMessageDispatcher.dispatch(Endpoint.scala:97) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.remote.EndpointReader$$anonfun$receive$2.applyOrElse(Endpoint.scala:982) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.actor.Actor$class.aroundReceive(Actor.scala:502) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:446) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:526) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.actor.ActorCell.invoke(ActorCell.scala:495) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:257) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.dispatch.Mailbox.run(Mailbox.scala:224) [flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.dispatch.Mailbox.exec(Mailbox.scala:234) [flink-dist_2.11-1.4.0.jar:1.4.0]
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) [cti-report-flink.jar:?]
at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) [cti-report-flink.jar:?]
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) [cti-report-flink.jar:?]
at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) [cti-report-flink.jar:?]
Caused by: java.io.IOException: TaskManager log files are unavailable. Log file could not be found at /mnt/disk1/log/hadoop-yarn/containers/application_1556227576661_152377/container_1556227576661_152377_01_000012/taskmanager.log.
at org.apache.flink.runtime.taskmanager.TaskManager.org$apache$flink$runtime$taskmanager$TaskManager$$handleRequestTaskManagerLog(TaskManager.scala:848) ~[flink-dist_2.11-1.4.0.jar:1.4.0]
at org.apache.flink.runtime.taskmanager.TaskManager$$anonfun$handleMessage$1.applyOrElse(TaskManager.scala:331) ~[flink-dist_2.11-1.4.0.jar:1.4.0]
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36) ~[cti-report-flink.jar:?]
at org.apache.flink.runtime.LeaderSessionMessageFilter$$anonfun$receive$1.applyOrElse(LeaderSessionMessageFilter.scala:49) ~[flink-dist_2.11-1.4.0.jar:1.4.0]
at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36) ~[cti-report-flink.jar:?]
at org.apache.flink.runtime.LogMessages$$anon$1.apply(LogMessages.scala:33) ~[flink-dist_2.11-1.4.0.jar:1.4.0]
at org.apache.flink.runtime.LogMessages$$anon$1.apply(LogMessages.scala:28) ~[flink-dist_2.11-1.4.0.jar:1.4.0]
at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123) ~[cti-report-flink.jar:?]
at org.apache.flink.runtime.LogMessages$$anon$1.applyOrElse(LogMessages.scala:28) ~[flink-dist_2.11-1.4.0.jar:1.4.0]
at akka.actor.Actor$class.aroundReceive(Actor.scala:502) ~[flink-dist_2.11-1.4.0.jar:1.4.0]
at org.apache.flink.runtime.taskmanager.TaskManager.aroundReceive(TaskManager.scala:121) ~[flink-dist_2.11-1.4.0.jar:1.4.0]
... 9 more
2019-07-05T10:52:13,518 INFO [yarn-jobmanager-future-thread-1] org.apache.flink.runtime.blob.BlobClient - Downloading null/t-ee89bf6e220711e63561c1971e55f402831e541d-7bd4fc0a3f68e4f73cab56a2f2f11371 from localhost/127.0.0.1:46701
After the Flink job had been running for some time, the JobManager suddenly reported that a node (TaskManager) had been lost. For this kind of problem, the first step is to find out which TaskManager failed and inspect its taskmanager.log. The log eventually showed:

GC overhead limit exceeded

This meant the job was running out of memory somewhere. After some digging, the cause turned out to be a ConcurrentHashMap in the job whose entries kept accumulating and were never removed. The map had originally been written for a MapReduce program, where this was not a problem because the job finishes and the JVM exits; in a continuously running Flink job, however, its memory footprint grows without bound and eventually triggers the OutOfMemoryError. My fix was to replace the ConcurrentHashMap with a Guava cache, which resolved the problem.
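Below is a minimal sketch of that kind of replacement, not the actual code from the job: the class name, key/value types, and the eviction settings (maximumSize, expireAfterWrite) are illustrative assumptions. The point is that a Guava Cache bounds the number of entries and evicts old ones automatically, whereas the original ConcurrentHashMap only ever grew.

```java
import java.util.concurrent.TimeUnit;

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

public class DedupState {

    // Before: entries were only ever added, so in a long-running Flink job
    // the map grew without bound and eventually caused the OOM.
    // private final ConcurrentHashMap<String, Long> seen = new ConcurrentHashMap<>();

    // After: a bounded Guava cache; old entries are evicted automatically.
    // The size cap and expiry below are illustrative assumptions.
    private final Cache<String, Long> seen = CacheBuilder.newBuilder()
            .maximumSize(100_000)                      // cap the number of entries
            .expireAfterWrite(30, TimeUnit.MINUTES)    // drop stale entries
            .build();

    public void record(String key) {
        seen.put(key, System.currentTimeMillis());
    }

    public Long lookup(String key) {
        return seen.getIfPresent(key);  // null if absent or already evicted
    }
}
```

Bounding the cache trades a complete lookup table for a stable memory footprint, which is the right trade-off for a streaming job that is meant to run indefinitely.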