基于log4j2简易实现日志告警

需求

系统报ERROR错误时,能实时做到消息通知。

思路

当前项目比较小,不想过多的依赖额外的第三方组件。
项目在ERROR时,都会打印ERROR日志,所以可以在log4j接收到ERROR日志请求时,发送通知消息。

实践

Filter是log4j2的扩展点,从图中(图片来自《如何编写Log4j2脱敏插件》)的流程可以看到,Filter分别可以在全局、Logger、Appender三个地方做过滤。
WX20200525-195412@2x.png
三个地方对应的log4j.xml配置地方如下:


    
    
        
            
                
                 
            
        
    
    
        
             
            
        
    

log4j提供了过滤器的基类AbstractFilter

  • 全局过滤器的入口方法是filter(Logger logger, Level level, Marker marker, String msg, Object... params),其中msg是填充参数之前的内容,params是参数列表,包含Throwable对象。
  • Logger、Appender过滤器的入口方法是filter(final LogEvent event),通过event.getMessage().getFormattedMessage()取到填充参数之后的内容,通过event.getThrown()获取异常对象。

代码

@Plugin(name = "ErrorNotifyFilter", category = Node.CATEGORY, elementType = Filter.ELEMENT_TYPE, printObject = true)
public class ErrorNotifyLog4j2Filter extends AbstractFilter {
    // 默认打印异常堆栈长度
    public static final int DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH = 3;
    // 默认消息最长长度
    // 2020-07-09 测试企业微信允许最大长度为4000
    public static final int DEFAULT_MAX_MSG_LENGTH = 4000;
    private String projectName;
    private List rtxReceivers;

    private boolean noPrintExceptionStack;
    private int printExceptionStackLength;
    private int maxMsgLength;
    private Object lock = new Object();

    private ErrorNotifyLog4j2Filter(String projectName, String rtxReceivers,
                                    boolean noPrintExceptionStack, int printExceptionStackLength, int maxMsgLength) {
        super();
        this.projectName = projectName;
        this.rtxReceivers = Lists.newArrayList(rtxReceivers.split(","));
        this.noPrintExceptionStack = noPrintExceptionStack;
        this.printExceptionStackLength = printExceptionStackLength <= 0 ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength;
        this.maxMsgLength = (maxMsgLength <= 0 || maxMsgLength > DEFAULT_MAX_MSG_LENGTH) ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength;
    }

    @Override
    public Result filter(LogEvent event) {
        notify(event.getLevel(), event.getMessage().getFormattedMessage(), event.getThrown());
        return super.filter(event);
    }

    @Override
    public Result filter(Logger logger, Level level, Marker marker, Message msg, Throwable t) {
        notify(level, msg.getFormattedMessage(), t);
        return super.filter(logger, level, marker, msg, t);
    }

    @Override
    public Result filter(Logger logger, Level level, Marker marker, Object msg, Throwable t) {
        notify(level, msg == null ? "" : msg.toString(), t);
        return super.filter(logger, level, marker, msg, t);
    }

    @Override
    public Result filter(Logger logger, Level level, Marker marker, String msg, Object... params) {
        notify(level, msg, getExceptionParam(params));
        return super.filter(logger, level, marker, msg, params);
    }

    /**
     * @param level
     * @param msg
     * @param t
     * @author 
     * @date 
     */
    private void notify(Level level, String msg, Throwable t) {
        try {
            if (level == null || level.intLevel() != Level.ERROR.intLevel()) {
                return;
            }
            if (StringUtils.isBlank(msg) && t == null) {
                return;
            }
            Log4j2AsyncExecutor.executorService.submit(() -> {
                try {
                    String notifyMsg = getNotifyMsg(msg, t);                 MessageUtil.postMessage(Lists.newArrayList(MessageTypeEnum.RTX),
                            rtxReceivers,
                            getNotifyTitle(),
                            notifyMsg);
                } catch (Exception ignoreException) {
                    ignoreException.printStackTrace();
                }
            });
        } catch (Throwable ignoreException) {
            ignoreException.printStackTrace();
        }
    }

    /**
     * @param params
     * @return java.lang.Throwable
     * @author 
     * @date 
     */
    private Throwable getExceptionParam(Object... params) {
        if (params == null || params.length == 0) {
            return null;
        }
        for (Object param : params) {
            if (param instanceof Throwable) {
                return (Throwable) param;
            }
        }
        return null;
    }

    /**
     * 如果开启堆栈信息,能让告警更清晰,但同样的也就降低了性能
     *
     * @param msg
     * @param t
     * @return java.lang.String
     * @author 
     * @date 
     */
    private String getNotifyMsg(String msg, Throwable t) {
        String errorMsg = "信息:" + (msg == null ? "" : msg);
        String exceptionMsg = "";
        if (t != null) {
            exceptionMsg += "\n异常:" + t.toString();
            if (noPrintExceptionStack) {
                return errorMsg + exceptionMsg;
            }
            StackTraceElement[] stackTraceElements = t.getStackTrace();
            if (stackTraceElements == null || stackTraceElements.length == 0) {
                return errorMsg + exceptionMsg;
            }
            int length = Math.min(stackTraceElements.length, printExceptionStackLength);
            for (int i = 0; i < length; i++) {
                exceptionMsg += "\n" + stackTraceElements[i];
            }
        }
        String allMsg = errorMsg + exceptionMsg;
        return allMsg.length() > maxMsgLength ? allMsg.substring(0, maxMsgLength) : allMsg;
    }

    /**
     * @return java.lang.String
     * @author 
     * @date
     */
    private String getNotifyTitle() {
        String actualActiveProfiles = getActiveProfiles();
        StringBuilder ret = new StringBuilder("项目异常告警 ");
        if (StringUtils.isNotBlank(actualActiveProfiles)) {
            ret.append("【").append(actualActiveProfiles).append("】");
        }
        ret.append("【").append(projectName).append("】");
        ret.append("【").append(IPUtils.getLocalHostIP()).append("】");
        ret.append("【").append(DateUtils.formatDateTime(new Date())).append("】");
        return ret.toString();
    }

    /**
     * @return java.lang.String
     * @author 
     * @date 
     */
    private String getActiveProfiles() {
        String[] activeProfiles = SpringContextUtil.getApplicationContext() == null ? null : SpringContextUtil.getActiveProfile();
        if (activeProfiles == null || activeProfiles.length == 0) {
            return "";
        }
        List actualActiveProfiles = Arrays.stream(activeProfiles)
                // 部分项目的profile中有用到include,include中含有"-",过滤掉include的数据
                .filter(str -> !str.contains("-"))
                .collect(Collectors.toList());
        return StringUtils.join(actualActiveProfiles, ",");
    }

    /**
     * @return com.tscm.purchase.interfacer.log4j2.ErrorNotifyFilter
     * @author 
     * @date 2020-05-14 18:37
     */
    @PluginFactory
    public static ErrorNotifyLog4j2Filter createFilter(@PluginAttribute("projectName") final String projectName,                                                  @PluginAttribute("rtxReceivers") final String rtxReceivers,                                                                    @PluginAttribute("noPrintExceptionStack") final Boolean noPrintExceptionStack,                                                  @PluginAttribute("printExceptionStackLength") final Integer printExceptionStackLength,                                                @PluginAttribute("maxMsgLength") final Integer maxMsgLength) {
        return new ErrorNotifyLog4j2Filter(projectName, rtxReceivers,
                noPrintExceptionStack == null ? false : noPrintExceptionStack.booleanValue(),
                printExceptionStackLength == null ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength,
                maxMsgLength == null ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength);
    }
}

/**
 * Holds the executor on which alert messages are sent and shuts it down when the Spring
 * container destroys this bean.
 */
@Component
public class Log4j2AsyncExecutor {
    // Tune the pool settings to the needs of your own project.
    public static ExecutorService executorService = newSingleWorkerPool();

    /**
     * Creates the pool used for sending alerts: exactly one worker thread, no keep-alive, a
     * {@link LinkedBlockingQueue} created without a capacity argument, and threads named
     * {@code log4j2-async-executor-pool-N}.
     */
    private static ExecutorService newSingleWorkerPool() {
        return new ThreadPoolExecutor(
                1,
                1,
                0L,
                TimeUnit.MILLISECONDS,
                new LinkedBlockingQueue<>(),
                new ThreadFactoryBuilder().setNameFormat("log4j2-async-executor-pool-%d").build());
    }

    /**
     * Stops the pool through {@code ThreadUtils.shutdown} with a 5-second argument and then clears
     * the static reference. Does nothing when the reference has already been cleared.
     */
    @PreDestroy
    public synchronized void shutdown() {
        if (executorService == null) {
            return;
        }
        ThreadUtils.shutdown(executorService, 5, TimeUnit.SECONDS);
        executorService = null;
    }
}

这里要特别说明:为什么要有Log4j2AsyncExecutor这个类?
因为发送告警是异步的,用了线程池,那在进程停止时,肯定要销毁线程池;而AbstractFilter虽然实现了LifeCycle接口,有stop方法,但是实际上stop方法并不会被调用到;所以这里依赖了Spring的PreDestroy来销毁。

配置

告警肯定是要取填充参数之后的内容,所以Filter放在Logger、Appender里会更合适;但代码里同时也兼容放在全局配置里的过滤器。


    ......
    
        
            
                
                
            
            ......
        
    
    
        
            ......
            
        
    

优化

以上功能虽然实现了基本的日志告警,但是这里有两个地方有问题:

  1. 虽然filter在异常部分都catch了,但是通知类MessageUtil.postMessage是平台封装好的调用企业微信的功能,里面异常地方用了log.error。这里假设通知失败,比如服务器网络异常,导致postMessage方法打印了ERROR日志,再次进入告警模块,然后又继续失败,继续告警,导致无限循环。 (假如通知模块没有调用log.error,那这里就不会出问题)。
  2. 如果系统在短时间内出现大量异常,那就会发送大量的通知。应该对通知做频次过滤,这里过滤分两种:

    1. N秒内只能通知M次,这个控制比较简单,这里如果前M次都是相同的异常A,M+1是异常B,会导致B异常不会被通知。
    2. N秒内,相同的内容只能通知M次,这个控制虽然相对比较复杂,但是可以避免上面的那个问题。

根据内容做频次过滤,同时还能解决第一个问题:因为无限循环告警时,每次告警的内容是一样的,所以第二次告警时就会被频次过滤阻断掉。
这里频次过滤没必要做得多复杂,N秒内一次即可,这时候可以用基于时间过期的缓存来快速实现。

代码

@Plugin(name = "ErrorNotifyFilter", category = Node.CATEGORY, elementType = Filter.ELEMENT_TYPE, printObject = true)
public class ErrorNotifyLog4j2Filter extends AbstractFilter {
    // 默认打印异常堆栈长度
    public static final int DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH = 3;
    // 默认消息最长长度
    // 2020-07-09 测试企业微信允许最大长度为4000
    public static final int DEFAULT_MAX_MSG_LENGTH = 4000;
    private String projectName;
    private List rtxReceivers;
    private LimitCacher limitCacher;
    private boolean noPrintExceptionStack;
    private int printExceptionStackLength;
    private int maxMsgLength;
    private Object lock = new Object();

    private ErrorNotifyLog4j2Filter(String projectName, String rtxReceivers, int limitSecond, int cacheMaxSize,
                                    boolean noPrintExceptionStack, int printExceptionStackLength, int maxMsgLength) {
        super();
        this.projectName = projectName;
        this.rtxReceivers = Lists.newArrayList(rtxReceivers.split(","));
        this.noPrintExceptionStack = noPrintExceptionStack;
        this.printExceptionStackLength = printExceptionStackLength <= 0 ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength;
        this.maxMsgLength = (maxMsgLength <= 0 || maxMsgLength > DEFAULT_MAX_MSG_LENGTH) ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength;
        this.limitCacher = new LimitCacher(limitSecond, cacheMaxSize);
    }

    @Override
    public Result filter(LogEvent event) {
        notify(event.getLevel(), event.getMessage().getFormattedMessage(), event.getThrown());
        return super.filter(event);
    }

    @Override
    public Result filter(Logger logger, Level level, Marker marker, Message msg, Throwable t) {
        notify(level, msg.getFormattedMessage(), t);
        return super.filter(logger, level, marker, msg, t);
    }

    @Override
    public Result filter(Logger logger, Level level, Marker marker, Object msg, Throwable t) {
        notify(level, msg == null ? "" : msg.toString(), t);
        return super.filter(logger, level, marker, msg, t);
    }

    @Override
    public Result filter(Logger logger, Level level, Marker marker, String msg, Object... params) {
        notify(level, msg, getExceptionParam(params));
        return super.filter(logger, level, marker, msg, params);
    }

    /**
     * @param level
     * @param msg
     * @param t
     * @author 
     * @date 
     */
    private void notify(Level level, String msg, Throwable t) {
        try {
            if (level == null || level.intLevel() != Level.ERROR.intLevel()) {
                return;
            }
            if (StringUtils.isBlank(msg) && t == null) {
                return;
            }
            Log4j2AsyncExecutor.executorService.submit(() -> {
                try {
                    String notifyMsg = getNotifyMsg(msg, t);
                    if (isLimited(notifyMsg)) {
                        // 以下两种场景会被限制
                        // 1. MessageUtil.postMessag方法有问题(比如网络出问题导致调用企业微信失败),里面调用了log.error,会导致无限信号。
                        // 2. 系统在短时间之内大量异常。
                        // 打印到console。
                        System.out.println("notifyMsg[" + notifyMsg + "] is limited");
                        return;
                    }
                    MessageUtil.postMessage(Lists.newArrayList(MessageTypeEnum.RTX),
                            rtxReceivers,
                            getNotifyTitle(),
                            notifyMsg);
                } catch (Exception ignoreException) {
                    ignoreException.printStackTrace();
                }
            });
        } catch (Throwable ignoreException) {
            ignoreException.printStackTrace();
        }
    }

    /**
     * @param params
     * @return java.lang.Throwable
     * @author 
     * @date 
     */
    private Throwable getExceptionParam(Object... params) {
        if (params == null || params.length == 0) {
            return null;
        }
        for (Object param : params) {
            if (param instanceof Throwable) {
                return (Throwable) param;
            }
        }
        return null;
    }

    /**
     * 如果开启堆栈信息,能让告警更清晰,但同样的也就降低了性能
     *
     * @param msg
     * @param t
     * @return java.lang.String
     * @author 
     * @date 
     */
    private String getNotifyMsg(String msg, Throwable t) {
        String errorMsg = "信息:" + (msg == null ? "" : msg);
        String exceptionMsg = "";
        if (t != null) {
            exceptionMsg += "\n异常:" + t.toString();
            if (noPrintExceptionStack) {
                return errorMsg + exceptionMsg;
            }
            StackTraceElement[] stackTraceElements = t.getStackTrace();
            if (stackTraceElements == null || stackTraceElements.length == 0) {
                return errorMsg + exceptionMsg;
            }
            int length = Math.min(stackTraceElements.length, printExceptionStackLength);
            for (int i = 0; i < length; i++) {
                exceptionMsg += "\n" + stackTraceElements[i];
            }
        }
        String allMsg = errorMsg + exceptionMsg;
        return allMsg.length() > maxMsgLength ? allMsg.substring(0, maxMsgLength) : allMsg;
    }

    /**
     * @return java.lang.String
     * @author 
     * @date
     */
    private String getNotifyTitle() {
        String actualActiveProfiles = getActiveProfiles();
        StringBuilder ret = new StringBuilder("项目异常告警 ");
        if (StringUtils.isNotBlank(actualActiveProfiles)) {
            ret.append("【").append(actualActiveProfiles).append("】");
        }
        ret.append("【").append(projectName).append("】");
        ret.append("【").append(IPUtils.getLocalHostIP()).append("】");
        ret.append("【").append(DateUtils.formatDateTime(new Date())).append("】");
        return ret.toString();
    }

    /**
     * 是否被限制
     *
     * @param notifyMsg
     * @return boolean
     * @author 
     * @date 
     */
    private boolean isLimited(String notifyMsg) throws ExecutionException {
        // 根据内容限制
        String encodeMsg = DigestUtils.md5Hex(notifyMsg);
        String value = limitCacher.getCache(encodeMsg);
        if (value == LimitCacher.DEFAILT_VALUE) {
            return true;
        }
        synchronized (lock) {
            value = limitCacher.getCache(encodeMsg);
            if (value == LimitCacher.DEFAILT_VALUE) {
                return true;
            }
            limitCacher.put(encodeMsg, LimitCacher.DEFAILT_VALUE);
            return false;
        }
    }

    /**
     * @return java.lang.String
     * @author 
     * @date 
     */
    private String getActiveProfiles() {
        String[] activeProfiles = SpringContextUtil.getApplicationContext() == null ? null : SpringContextUtil.getActiveProfile();
        if (activeProfiles == null || activeProfiles.length == 0) {
            return "";
        }
        List actualActiveProfiles = Arrays.stream(activeProfiles)
                // 部分项目的profile中有用到include,include中含有"-",过滤掉include的数据
                .filter(str -> !str.contains("-"))
                .collect(Collectors.toList());
        return StringUtils.join(actualActiveProfiles, ",");
    }

    /**
     * @param projectName               项目名,必填
     * @param rtxReceivers              告警人,多个之间以","分隔,必填
     * @param limitSecond               如果瞬间有大量"相同内容"的异常,在limitSecond秒内只会告警一次
     * @param cacheMaxSize              配合limitSecond,缓存当前已告警的数量,指定最多存储多少条,防止无限制存储数据导致内存溢出
     * @param noPrintExceptionStack     异常信息内是否不打印堆栈信息
     * @param printExceptionStackLength 如果打印堆栈信息,打印多少行,取Math.min(printExceptionStackLength, 堆栈长度)
     * @param maxMsgLength              消息内容长度限制,默认4000,当前测试"企业微信接口"超过4000就发送失败
     * @return com.tencent.tscm.common.log4j2.ErrorNotifyLog4j2Filter
     * @author 
     * @date
     */
    @PluginFactory
    public static ErrorNotifyLog4j2Filter createFilter(@PluginAttribute("projectName") final String projectName,                                                  @PluginAttribute("rtxReceivers") final String rtxReceivers,                                                   @PluginAttribute("limitSecond") final Integer limitSecond,                                                   @PluginAttribute("cacheMaxSize") final Integer cacheMaxSize,                                                  @PluginAttribute("noPrintExceptionStack") final Boolean noPrintExceptionStack,                                                  @PluginAttribute("printExceptionStackLength") final Integer printExceptionStackLength,                                                @PluginAttribute("maxMsgLength") final Integer maxMsgLength) {
        return new ErrorNotifyLog4j2Filter(projectName, rtxReceivers,
                limitSecond == null || limitSecond <= 0 ? LimitCacher.DEFAULT_LIMIT_SECOND : limitSecond,
                cacheMaxSize == null || cacheMaxSize <= 0 ? LimitCacher.DEFAULT_CACHE_MAX_SIZE : cacheMaxSize,
                noPrintExceptionStack == null ? false : noPrintExceptionStack.booleanValue(),
                printExceptionStackLength == null ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength,
                maxMsgLength == null ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength);
    }
}

/**
 * Base class for in-process caches backed by a Guava {@link LoadingCache}.
 *
 * <p>Subclasses supply the value for a missing key through {@link #loadData(Object)}; every
 * {@link Cache} operation is forwarded to the wrapped cache.
 *
 * @param <K> key type
 * @param <V> value type
 */
@Slf4j
public abstract class AbstractLocalCache<K, V> implements Cache<K, V> {
    // Default interval passed to refreshAfterWrite by the constructors that build their own CacheBuilder.
    protected static long expireAfterWriteDuration = 30;
    // Unit of the default interval.
    protected static TimeUnit timeUnit = TimeUnit.MINUTES;
    protected LoadingCache<K, V> cache;

    /** Creates a cache that refreshes entries after the default interval. */
    public AbstractLocalCache() {
        this(expireAfterWriteDuration, timeUnit);
    }

    /** Creates a cache that refreshes entries after the default interval, reloading on {@code executor}. */
    public AbstractLocalCache(Executor executor) {
        this(expireAfterWriteDuration, timeUnit, executor);
    }

    /** Creates a cache that refreshes entries after the given interval. */
    public AbstractLocalCache(long expireAfterWriteDuration, TimeUnit timeUnit) {
        this(CacheBuilder.newBuilder().refreshAfterWrite(expireAfterWriteDuration, timeUnit));
    }

    /** Creates a cache that refreshes entries after the given interval, reloading on {@code executor}. */
    public AbstractLocalCache(long expireAfterWriteDuration, TimeUnit timeUnit, Executor executor) {
        this(CacheBuilder.newBuilder().refreshAfterWrite(expireAfterWriteDuration, timeUnit), executor);
    }

    /** Creates a cache from a caller-configured builder; values come from {@link #loadData(Object)}. */
    public AbstractLocalCache(CacheBuilder<Object, Object> cacheBuilder) {
        cache = cacheBuilder.build(new CacheLoader<K, V>() {
            @Override
            public V load(K key) throws Exception {
                return loadData(key);
            }
        });
    }

    /**
     * Creates a cache from a caller-configured builder whose reloads are wrapped with
     * {@link CacheLoader#asyncReloading} on {@code executor}.
     */
    public AbstractLocalCache(CacheBuilder<Object, Object> cacheBuilder, Executor executor) {
        cache = cacheBuilder.build(CacheLoader.asyncReloading(new CacheLoader<K, V>() {
            @Override
            public V load(K key) throws Exception {
                return loadData(key);
            }

            @Override
            public ListenableFuture<V> reload(K key, V oldValue) throws Exception {
                try {
                    return super.reload(key, oldValue);
                } catch (Exception e) {
                    // Keep serving the previous value when the reload fails.
                    log.error("reload cache[{}] error, return oldVlue[{}]!", key, oldValue, e);
                    return Futures.immediateFuture(oldValue);
                }
            }
        }, executor));
    }

    /** Creates a cache with the default refresh interval and a caller-supplied loader. */
    public AbstractLocalCache(CacheLoader<K, V> cacheLoader) {
        this(CacheBuilder.newBuilder().refreshAfterWrite(expireAfterWriteDuration, timeUnit), cacheLoader);
    }

    /** Creates a cache with the default refresh interval and a caller-supplied loader reloading on {@code executor}. */
    public AbstractLocalCache(CacheLoader<K, V> cacheLoader, Executor executor) {
        this(CacheBuilder.newBuilder().refreshAfterWrite(expireAfterWriteDuration, timeUnit), cacheLoader, executor);
    }

    /** Creates a cache from a caller-configured builder and loader. */
    public AbstractLocalCache(CacheBuilder<Object, Object> cacheBuilder, CacheLoader<K, V> cacheLoader) {
        this(cacheBuilder.build(cacheLoader));
    }

    /** Creates a cache from a caller-configured builder and a loader reloading on {@code executor}. */
    public AbstractLocalCache(CacheBuilder<Object, Object> cacheBuilder, CacheLoader<K, V> cacheLoader,
                              Executor executor) {
        this(cacheBuilder.build(CacheLoader.asyncReloading(cacheLoader, executor)));
    }

    /** Wraps an already built cache. */
    public AbstractLocalCache(LoadingCache<K, V> cache) {
        this.cache = cache;
    }

    /**
     * Produces the value for a key that the cache has to load.
     *
     * @param key key being loaded
     * @return value to cache for {@code key}
     */
    protected abstract V loadData(K key);

    /**
     * Returns the value for {@code key}, loading it first when necessary.
     *
     * @param key key to look up
     * @return cached or freshly loaded value
     * @throws ExecutionException when loading the value fails with a checked exception
     */
    public V getCache(K key) throws ExecutionException {
        return cache.get(key);
    }

    /**
     * Asks the wrapped cache to refresh the value for {@code key}.
     *
     * @param key key to refresh
     */
    public void refresh(K key) {
        cache.refresh(key);
    }

    @Override
    public V getIfPresent(Object o) {
        return cache.getIfPresent(o);
    }

    @Override
    public V get(K k, Callable<? extends V> callable) throws ExecutionException {
        return cache.get(k, callable);
    }

    @Override
    public ImmutableMap<K, V> getAllPresent(Iterable<?> iterable) {
        return cache.getAllPresent(iterable);
    }

    @Override
    public void put(K k, V v) {
        cache.put(k, v);
    }

    @Override
    public void putAll(Map<? extends K, ? extends V> map) {
        cache.putAll(map);
    }

    @Override
    public void invalidate(Object o) {
        cache.invalidate(o);
    }

    @Override
    public void invalidateAll(Iterable<?> iterable) {
        cache.invalidateAll(iterable);
    }

    @Override
    public void invalidateAll() {
        cache.invalidateAll();
    }

    @Override
    public long size() {
        return cache.size();
    }

    @Override
    public CacheStats stats() {
        return cache.stats();
    }

    @Override
    public ConcurrentMap<K, V> asMap() {
        return cache.asMap();
    }

    @Override
    public void cleanUp() {
        cache.cleanUp();
    }
}

/**
 * Size-bounded, time-expiring cache used as a "seen recently" marker store.
 *
 * <p>A key that is absent or expired loads as {@link #NULL_VALUE}; callers {@code put}
 * {@link #DEFAILT_VALUE} to mark a key, and the mark disappears once the entry expires.
 */
public class LimitCacher extends AbstractLocalCache<String, String> {
    /** Marker stored by callers for a key that is currently limited. */
    public static final String DEFAILT_VALUE = "DEFAILT_VALUE";
    /** Value loaded for a key that carries no marker. */
    public static final String NULL_VALUE = "NULL_VALUE";
    /** Default length of the limit window, in seconds. */
    public static final int DEFAULT_LIMIT_SECOND = 1;
    /** Default maximum number of keys kept at once. */
    public static final int DEFAULT_CACHE_MAX_SIZE = 1000;

    /**
     * @param expireAfterWriteDuration seconds after the last write at which an entry expires
     * @param maximumSize              maximum number of entries kept at once
     */
    public LimitCacher(long expireAfterWriteDuration, int maximumSize) {
        // expireAfterWrite (not refreshAfterWrite) so that stale markers are actually evicted
        // instead of lingering until the size bound pushes them out.
        super(CacheBuilder.newBuilder()
                .maximumSize(maximumSize)
                .expireAfterWrite(expireAfterWriteDuration, TimeUnit.SECONDS));
    }

    /** Loads {@link #NULL_VALUE} for every key, meaning "no marker present". */
    @Override
    protected String loadData(String key) {
        return NULL_VALUE;
    }
}

避坑

NEUTRAL和ACCEPT的区别是:

  1. 如果配置的是ACCEPT,则表示过滤通过且不走后续的Filter。
  2. 如果配置的是NEUTRAL,也通过并继续后续的Filter。

备注

本篇文章的做法,是告警跟项目在同一进程内,这样有不少缺点,比如:

  1. 如果告警功能有问题,比如造成内存泄漏,可能会影响到项目正常运行。
  2. 升级不方便:功能改进/BUG修复时,业务方需要升级jar包,这样会给业务方造成困扰;而且无法统一各个业务方的版本,后续版本升级时,还需要考虑各个版本的兼容情况。

综上,本篇文章的做法,比较适合小系统。

你可能感兴趣的:(java,log4j,告警,监控)