Flink清理状态异常排查

1. 异常信息

Exception in thread "main" org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
    at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:146)
    at org.apache.flink.runtime.minicluster.MiniCluster.executeJobBlocking(MiniCluster.java:638)
    at org.apache.flink.streaming.api.environment.LocalStreamEnvironment.execute(LocalStreamEnvironment.java:123)
    at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1509)
    at org.apache.flink.streaming.api.scala.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.scala:645)
    at org.learn.StateWordCount$.main(StateWordCount.scala:50)
    at org.learn.StateWordCount.main(StateWordCount.scala)
Caused by: TimerException{java.util.ConcurrentModificationException}
    at org.apache.flink.streaming.runtime.tasks.SystemProcessingTimeService$TriggerTask.run(SystemProcessingTimeService.java:288)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
    at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.util.ConcurrentModificationException
    at java.util.HashMap$HashIterator.nextNode(HashMap.java:1442)
    at java.util.HashMap$KeyIterator.next(HashMap.java:1466)
    at org.learn.function.WordCountProcessFunction.onTimer(WordCountProcessFunction.scala:43)
    at org.apache.flink.streaming.api.operators.KeyedProcessOperator.invokeUserFunction(KeyedProcessOperator.java:94)
    at org.apache.flink.streaming.api.operators.KeyedProcessOperator.onProcessingTime(KeyedProcessOperator.java:78)
    at org.apache.flink.streaming.api.operators.InternalTimerServiceImpl.onProcessingTime(InternalTimerServiceImpl.java:239)
    at org.apache.flink.streaming.runtime.tasks.SystemProcessingTimeService$TriggerTask.run(SystemProcessingTimeService.java:285)
    ... 7 more
    

报错位置是 org.learn.function.WordCountProcessFunction.onTimer(WordCountProcessFunction.scala:43)

报错原因是java.util.ConcurrentModificationException

2. 代码

package org.learn.function

import org.apache.flink.api.common.state.{MapState, MapStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.util.Collector

class WordCountProcessFunction extends KeyedProcessFunction[String, (String, Int), (String, Int)] {
  // Keyed word counter, kept as the failing example: onTimer below throws ConcurrentModificationException.
  private var mapState: MapState[String, (String, Int)] = _ // word -> (word, accumulated count)
  private var timerState: MapState[Long, Long] = _ // timestamp -> same timestamp (filled in processElement)
  // Creates the two MapState handles from their descriptors.
  override def open(parameters: Configuration): Unit = {
    var mapStateDesc = new MapStateDescriptor[String, (String, Int)]("valueStateDesc", classOf[String], classOf[(String, Int)])
    mapState = getRuntimeContext.getMapState(mapStateDesc)

    val timerStateDesc = new MapStateDescriptor[Long, Long]("timerStateDesc", classOf[Long], classOf[Long])
    timerState = getRuntimeContext.getMapState(timerStateDesc)
  }
  // Adds value._2 to the count stored under value._1; for a word without a stored count it also registers a timer.
  override def processElement(value: (String, Int), ctx: KeyedProcessFunction[String, (String, Int), (String, Int)]#Context, out: Collector[(String, Int)]): Unit = {
    // A null lookup result is treated as "no count stored for this word yet".
    var currentState: (String, Int) = mapState.get(value._1)
    if (null == currentState) {
      currentState = (value._1, 0)

      // TTL time
      val ttlTime: Long = System.currentTimeMillis() - 30 * 1000 // deliberately a time in the past (30 s ago)
      ctx.timerService().registerProcessingTimeTimer(ttlTime)
      timerState.put(ttlTime, ttlTime)
      timerState.put(ttlTime - 10, ttlTime - 10) // second key, 10 ms earlier than the first
    }

    var newState: (String, Int) = (currentState._1, currentState._2 + value._2)
    mapState.put(value._1, newState)
  }
  // Timer callback: prints timerState, removes keys earlier than the current time, then clears mapState.
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[String, (String, Int), (String, Int)]#OnTimerContext, out: Collector[(String, Int)]): Unit = {

    System.out.println("clear..." + " timestamp: " + timestamp + " currentTime: " + System.currentTimeMillis() + " timerState: ")
    val iter = timerState.keys().iterator()
    while (iter.hasNext) {
      val key = iter.next() // WordCountProcessFunction.scala:43 in the stack trace: the exception surfaces here
      System.out.println("key: " + key + " value: " + timerState.get(key))
      if (key < System.currentTimeMillis()) {
        timerState.remove(key) // removes through the state, not through iter; the following iter.next() then fails
      }
    }

    mapState.clear()
  }
}

第 43 行:val key = iter.next()

错误原因:利用迭代器遍历 map 时,如果在遍历过程中调用 map.remove(Object key) 做移除操作,那么迭代器下一次调用 next() 时就会抛出 java.util.ConcurrentModificationException 异常。

改正方法:把循环中的 timerState.remove(key) 换成迭代器自身的 remove 方法 iter.remove() 来做移除操作,则不会抛出该异常信息。

3. 源码

以 HashMap 为例,看看源码。

  • 进入java.util.HashMap.java

    public class HashMap extends AbstractMap
        implements Map, Cloneable, Serializable {
        
        
        // HashMap's remove method
        // Delegates to removeNode with value = null, matchValue = false and movable = true,
        // and returns the removed node's value, or null when removeNode returns null.
        // NOTE(review): K and V are used in this excerpt although the class header shown here
        // declares no type parameters - the generic arguments look stripped from the quoted source.
        public V remove(Object key) {
            Node e;
            return (e = removeNode(hash(key), key, null, false, true)) == null ?
                null : e.value;
        }
        
        
        // Looks up the node for (hash, key) in bucket (n - 1) & hash, unlinks it and returns it;
        // returns null when the table is empty, the bucket is empty, or no node matches.
        // When matchValue is true the node is only removed if its value also equals `value`.
        final Node removeNode(int hash, Object key, Object value,
                                   boolean matchValue, boolean movable) {
            Node[] tab; Node p; int n, index;
            if ((tab = table) != null && (n = tab.length) > 0 &&
                (p = tab[index = (n - 1) & hash]) != null) {
                Node node = null, e; K k; V v;
                // Case 1: the first node of the bucket is the match.
                if (p.hash == hash &&
                    ((k = p.key) == key || (key != null && key.equals(k))))
                    node = p;
                else if ((e = p.next) != null) {
                    // Case 2: search the rest of the bucket (tree lookup or linked-list walk).
                    if (p instanceof TreeNode)
                        node = ((TreeNode)p).getTreeNode(hash, key);
                    else {
                        do {
                            if (e.hash == hash &&
                                ((k = e.key) == key ||
                                 (key != null && key.equals(k)))) {
                                node = e;
                                break;
                            }
                            p = e; // p trails e so that p is the predecessor of the matched node
                        } while ((e = e.next) != null);
                    }
                }
                if (node != null && (!matchValue || (v = node.value) == value ||
                                     (value != null && value.equals(v)))) {
                    if (node instanceof TreeNode)
                        ((TreeNode)node).removeTreeNode(this, tab, movable);
                    else if (node == p)
                        tab[index] = node.next;
                    else
                        p.next = node.next;
                    // Every successful removal bumps modCount; HashIterator compares this counter
                    // with its expectedModCount and throws ConcurrentModificationException on mismatch.
                    ++modCount;
                    --size;
                    afterNodeRemoval(node);
                    return node;
                }
            }
            return null;
        }
        
        
      // Key iterator: next() returns the key of the node produced by HashIterator.nextNode(),
      // so the modCount check inside nextNode() runs on every next() call.
      final class KeyIterator extends HashIterator
            implements Iterator {
            public final K next() { return nextNode().key; }
        }
    
    
        // Inner class
        // Base iterator of HashMap. It copies modCount into expectedModCount when it is created;
        // nextNode() and remove() throw ConcurrentModificationException when the two differ.
        abstract class HashIterator {
            Node next;        // next entry to return
            Node current;     // current entry
            int expectedModCount;  // for fast-fail
            int index;             // current slot
    
            // Snapshots modCount and positions `next` on the first non-null bucket, if any.
            HashIterator() {
                expectedModCount = modCount;
                Node[] t = table;
                current = next = null;
                index = 0;
                if (t != null && size > 0) { // advance to first entry
                    do {} while (index < t.length && (next = t[index++]) == null);
                }
            }
    
            public final boolean hasNext() {
                return next != null;
            }
    
            // Returns the pending node and advances to the following one.
            // Throws ConcurrentModificationException when modCount no longer equals the snapshot,
            // and NoSuchElementException when there is no pending node.
            final Node nextNode() {
                Node[] t;
                Node e = next;
                if (modCount != expectedModCount)
                    throw new ConcurrentModificationException();
                if (e == null)
                    throw new NoSuchElementException();
                // End of the current bucket chain: scan forward to the next non-null bucket.
                if ((next = (current = e).next) == null && (t = table) != null) {
                    do {} while (index < t.length && (next = t[index++]) == null);
                }
                return e;
            }
    
            // The iterator's remove method
            // Removes the node last returned by nextNode() through removeNode, then copies the
            // new modCount into expectedModCount so that later nextNode() calls pass the check.
            public final void remove() {
                Node p = current;
                if (p == null)
                    throw new IllegalStateException();
                if (modCount != expectedModCount)
                    throw new ConcurrentModificationException();
                current = null;
                K key = p.key;
                removeNode(hash(key), key, null, false, false);
                expectedModCount = modCount;
            }
        }
    
    1. 调用迭代器的 next() 方法,进而调用 nextNode() 方法
    2. nextNode() 方法中会进行判断,如果 modCount != expectedModCount,则抛出java.util.ConcurrentModificationException 异常
    3. 如果调用 HashMap.remove() 方法,则进而会调用 removeNode() 方法,在 removeNode() 方法的最后,会对 modCount+1,此时后面再调用迭代器的 next() 方法时,就会抛出java.util.ConcurrentModificationException异常
    4. 如果调用迭代器的 remove() 方法,该方法在调用 removeNode() 之后会执行 expectedModCount = modCount,使两者重新相等,此时后面再调用迭代器的 next() 方法时,不会抛出异常

你可能感兴趣的:(Flink清理状态异常排查)