需求
系统报ERROR错误时,能实时做到消息通知。
思路
当前项目比较小,不想过多的依赖额外的第三方组件。
项目在ERROR时,都会打印ERROR日志,所以可以在log4j接收到ERROR日志请求时,发送通知消息。
实践
Filter
是log4j2的扩展点,从图中(图片来自如何编写Log4j2脱敏插件)流程可以看到,Filter
分别可以在全局
、Logger
、Appender
三个地方做过滤。
三个地方对应的log4j.xml
配置地方如下:
log4j提供了过滤器的基类AbstractFilter
:
全局过滤器
入口方法是filter(Logger logger, Level level, Marker marker, String msg, Object... params)
,msg
是填充参数之前的内容,params
是参数列表,包含Throwable
对象。Logger
和Appender
入口方式是filter(final LogEvent event)
,通过event.getMessage().getFormattedMessage()
取到填充参数之后的内容,通过event.getThrown()
获取异常对象。
代码
@Plugin(name = "ErrorNotifyFilter", category = Node.CATEGORY, elementType = Filter.ELEMENT_TYPE, printObject = true)
public class ErrorNotifyLog4j2Filter extends AbstractFilter {
// 默认打印异常堆栈长度
public static final int DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH = 3;
// 默认消息最长长度
// 2020-07-09 测试企业微信允许最大长度为4000
public static final int DEFAULT_MAX_MSG_LENGTH = 4000;
private String projectName;
private List rtxReceivers;
private boolean noPrintExceptionStack;
private int printExceptionStackLength;
private int maxMsgLength;
private Object lock = new Object();
private ErrorNotifyLog4j2Filter(String projectName, String rtxReceivers,
boolean noPrintExceptionStack, int printExceptionStackLength, int maxMsgLength) {
super();
this.projectName = projectName;
this.rtxReceivers = Lists.newArrayList(rtxReceivers.split(","));
this.noPrintExceptionStack = noPrintExceptionStack;
this.printExceptionStackLength = printExceptionStackLength <= 0 ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength;
this.maxMsgLength = (maxMsgLength <= 0 || maxMsgLength > DEFAULT_MAX_MSG_LENGTH) ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength;
}
@Override
public Result filter(LogEvent event) {
notify(event.getLevel(), event.getMessage().getFormattedMessage(), event.getThrown());
return super.filter(event);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, Message msg, Throwable t) {
notify(level, msg.getFormattedMessage(), t);
return super.filter(logger, level, marker, msg, t);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, Object msg, Throwable t) {
notify(level, msg == null ? "" : msg.toString(), t);
return super.filter(logger, level, marker, msg, t);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, String msg, Object... params) {
notify(level, msg, getExceptionParam(params));
return super.filter(logger, level, marker, msg, params);
}
/**
* @param level
* @param msg
* @param t
* @author
* @date
*/
private void notify(Level level, String msg, Throwable t) {
try {
if (level == null || level.intLevel() != Level.ERROR.intLevel()) {
return;
}
if (StringUtils.isBlank(msg) && t == null) {
return;
}
Log4j2AsyncExecutor.executorService.submit(() -> {
try {
String notifyMsg = getNotifyMsg(msg, t); MessageUtil.postMessage(Lists.newArrayList(MessageTypeEnum.RTX),
rtxReceivers,
getNotifyTitle(),
notifyMsg);
} catch (Exception ignoreException) {
ignoreException.printStackTrace();
}
});
} catch (Throwable ignoreException) {
ignoreException.printStackTrace();
}
}
/**
* @param params
* @return java.lang.Throwable
* @author
* @date
*/
private Throwable getExceptionParam(Object... params) {
if (params == null || params.length == 0) {
return null;
}
for (Object param : params) {
if (param instanceof Throwable) {
return (Throwable) param;
}
}
return null;
}
/**
* 如果开启堆栈信息,能让告警更清晰,但同样的也就降低了性能
*
* @param msg
* @param t
* @return java.lang.String
* @author
* @date
*/
private String getNotifyMsg(String msg, Throwable t) {
String errorMsg = "信息:" + (msg == null ? "" : msg);
String exceptionMsg = "";
if (t != null) {
exceptionMsg += "\n异常:" + t.toString();
if (noPrintExceptionStack) {
return errorMsg + exceptionMsg;
}
StackTraceElement[] stackTraceElements = t.getStackTrace();
if (stackTraceElements == null || stackTraceElements.length == 0) {
return errorMsg + exceptionMsg;
}
int length = Math.min(stackTraceElements.length, printExceptionStackLength);
for (int i = 0; i < length; i++) {
exceptionMsg += "\n" + stackTraceElements[i];
}
}
String allMsg = errorMsg + exceptionMsg;
return allMsg.length() > maxMsgLength ? allMsg.substring(0, maxMsgLength) : allMsg;
}
/**
* @return java.lang.String
* @author
* @date
*/
private String getNotifyTitle() {
String actualActiveProfiles = getActiveProfiles();
StringBuilder ret = new StringBuilder("项目异常告警 ");
if (StringUtils.isNotBlank(actualActiveProfiles)) {
ret.append("【").append(actualActiveProfiles).append("】");
}
ret.append("【").append(projectName).append("】");
ret.append("【").append(IPUtils.getLocalHostIP()).append("】");
ret.append("【").append(DateUtils.formatDateTime(new Date())).append("】");
return ret.toString();
}
/**
* @return java.lang.String
* @author
* @date
*/
private String getActiveProfiles() {
String[] activeProfiles = SpringContextUtil.getApplicationContext() == null ? null : SpringContextUtil.getActiveProfile();
if (activeProfiles == null || activeProfiles.length == 0) {
return "";
}
List actualActiveProfiles = Arrays.stream(activeProfiles)
// 部分项目的profile中有用到include,include中含有"-",过滤掉include的数据
.filter(str -> !str.contains("-"))
.collect(Collectors.toList());
return StringUtils.join(actualActiveProfiles, ",");
}
/**
* @return com.tscm.purchase.interfacer.log4j2.ErrorNotifyFilter
* @author
* @date 2020-05-14 18:37
*/
@PluginFactory
public static ErrorNotifyLog4j2Filter createFilter(@PluginAttribute("projectName") final String projectName, @PluginAttribute("rtxReceivers") final String rtxReceivers, @PluginAttribute("noPrintExceptionStack") final Boolean noPrintExceptionStack, @PluginAttribute("printExceptionStackLength") final Integer printExceptionStackLength, @PluginAttribute("maxMsgLength") final Integer maxMsgLength) {
return new ErrorNotifyLog4j2Filter(projectName, rtxReceivers,
noPrintExceptionStack == null ? false : noPrintExceptionStack.booleanValue(),
printExceptionStackLength == null ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength,
maxMsgLength == null ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength);
}
}
@Component
public class Log4j2AsyncExecutor {
// 根据自己项目的情况配置线程池信息
public static ExecutorService executorService = new ThreadPoolExecutor(1,
1,
0L,
TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<>(),
new ThreadFactoryBuilder().setNameFormat("log4j2-async-executor-pool-%d").build());
/**
* @author
* @date
*/
@PreDestroy
public synchronized void shutdown() {
if (executorService != null) {
ThreadUtils.shutdown(executorService, 5, TimeUnit.SECONDS);
executorService = null;
}
}
}
这里要特别说明:为什么要有Log4j2AsyncExecutor
这个类?
因为发送告警是异步的,用了线程池,那在进程停止时,肯定要销毁线程池;而AbstractFilter
虽然实现了LifeCycle
接口,有stop
方法,但是实际上stop
方法并不会被调用到;所以这里依赖了Spring的PreDestroy
来销毁。
配置
告警肯定是要取填充参数之后的内容
,所以Filter
放在Logger
或Appender
里会更合适;但代码里同时也兼容放在全局
配置里的过滤器。
......
......
......
优化
以上功能虽然实现了基本的日志告警,但是里有两个地方有问题:
- 虽然
filter
在异常部分都catch
了,但是通知类MessageUtil.postMessage
是平台封装好的调用企业微信的功能,里面异常地方用了log.error
。这里假设通知失败,比如服务器网络异常,导致postMessage
方法打印了ERROR
日志,再次进入告警模块,然后又继续失败,继续告警,导致无限循环。 (假如通知模块没有调用log.error
,那这里就不会出问题)。 如果系统在短时间内出现大量异常,那就会发送大量的通知。应该对通知做频次过滤,这里过滤分两种:
- N秒内只能通知M次,这个控制比较简单,这里如果前M次都是相同的异常A,M+1是异常B,会导致B异常不会被通知。
- N秒内,相同的内容只能通知M次,这个控制虽然相对比较复杂,但是可以避免上面的那个问题。
根据内容做频次过滤,同时还能解决第一个问题,因为无限循环告警时,每次告警的内容是一样的,所以第二次告警是就会被过阻断调,频次过滤。
这里频次过滤没必要做的多复杂,N秒内一次即可,这时候可以用基于时间过期的缓存来快速实现。
代码
@Plugin(name = "ErrorNotifyFilter", category = Node.CATEGORY, elementType = Filter.ELEMENT_TYPE, printObject = true)
public class ErrorNotifyLog4j2Filter extends AbstractFilter {
// 默认打印异常堆栈长度
public static final int DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH = 3;
// 默认消息最长长度
// 2020-07-09 测试企业微信允许最大长度为4000
public static final int DEFAULT_MAX_MSG_LENGTH = 4000;
private String projectName;
private List rtxReceivers;
private LimitCacher limitCacher;
private boolean noPrintExceptionStack;
private int printExceptionStackLength;
private int maxMsgLength;
private Object lock = new Object();
private ErrorNotifyLog4j2Filter(String projectName, String rtxReceivers, int limitSecond, int cacheMaxSize,
boolean noPrintExceptionStack, int printExceptionStackLength, int maxMsgLength) {
super();
this.projectName = projectName;
this.rtxReceivers = Lists.newArrayList(rtxReceivers.split(","));
this.noPrintExceptionStack = noPrintExceptionStack;
this.printExceptionStackLength = printExceptionStackLength <= 0 ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength;
this.maxMsgLength = (maxMsgLength <= 0 || maxMsgLength > DEFAULT_MAX_MSG_LENGTH) ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength;
this.limitCacher = new LimitCacher(limitSecond, cacheMaxSize);
}
@Override
public Result filter(LogEvent event) {
notify(event.getLevel(), event.getMessage().getFormattedMessage(), event.getThrown());
return super.filter(event);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, Message msg, Throwable t) {
notify(level, msg.getFormattedMessage(), t);
return super.filter(logger, level, marker, msg, t);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, Object msg, Throwable t) {
notify(level, msg == null ? "" : msg.toString(), t);
return super.filter(logger, level, marker, msg, t);
}
@Override
public Result filter(Logger logger, Level level, Marker marker, String msg, Object... params) {
notify(level, msg, getExceptionParam(params));
return super.filter(logger, level, marker, msg, params);
}
/**
* @param level
* @param msg
* @param t
* @author
* @date
*/
private void notify(Level level, String msg, Throwable t) {
try {
if (level == null || level.intLevel() != Level.ERROR.intLevel()) {
return;
}
if (StringUtils.isBlank(msg) && t == null) {
return;
}
Log4j2AsyncExecutor.executorService.submit(() -> {
try {
String notifyMsg = getNotifyMsg(msg, t);
if (isLimited(notifyMsg)) {
// 以下两种场景会被限制
// 1. MessageUtil.postMessag方法有问题(比如网络出问题导致调用企业微信失败),里面调用了log.error,会导致无限信号。
// 2. 系统在短时间之内大量异常。
// 打印到console。
System.out.println("notifyMsg[" + notifyMsg + "] is limited");
return;
}
MessageUtil.postMessage(Lists.newArrayList(MessageTypeEnum.RTX),
rtxReceivers,
getNotifyTitle(),
notifyMsg);
} catch (Exception ignoreException) {
ignoreException.printStackTrace();
}
});
} catch (Throwable ignoreException) {
ignoreException.printStackTrace();
}
}
/**
* @param params
* @return java.lang.Throwable
* @author
* @date
*/
private Throwable getExceptionParam(Object... params) {
if (params == null || params.length == 0) {
return null;
}
for (Object param : params) {
if (param instanceof Throwable) {
return (Throwable) param;
}
}
return null;
}
/**
* 如果开启堆栈信息,能让告警更清晰,但同样的也就降低了性能
*
* @param msg
* @param t
* @return java.lang.String
* @author
* @date
*/
private String getNotifyMsg(String msg, Throwable t) {
String errorMsg = "信息:" + (msg == null ? "" : msg);
String exceptionMsg = "";
if (t != null) {
exceptionMsg += "\n异常:" + t.toString();
if (noPrintExceptionStack) {
return errorMsg + exceptionMsg;
}
StackTraceElement[] stackTraceElements = t.getStackTrace();
if (stackTraceElements == null || stackTraceElements.length == 0) {
return errorMsg + exceptionMsg;
}
int length = Math.min(stackTraceElements.length, printExceptionStackLength);
for (int i = 0; i < length; i++) {
exceptionMsg += "\n" + stackTraceElements[i];
}
}
String allMsg = errorMsg + exceptionMsg;
return allMsg.length() > maxMsgLength ? allMsg.substring(0, maxMsgLength) : allMsg;
}
/**
* @return java.lang.String
* @author
* @date
*/
private String getNotifyTitle() {
String actualActiveProfiles = getActiveProfiles();
StringBuilder ret = new StringBuilder("项目异常告警 ");
if (StringUtils.isNotBlank(actualActiveProfiles)) {
ret.append("【").append(actualActiveProfiles).append("】");
}
ret.append("【").append(projectName).append("】");
ret.append("【").append(IPUtils.getLocalHostIP()).append("】");
ret.append("【").append(DateUtils.formatDateTime(new Date())).append("】");
return ret.toString();
}
/**
* 是否被限制
*
* @param notifyMsg
* @return boolean
* @author
* @date
*/
private boolean isLimited(String notifyMsg) throws ExecutionException {
// 根据内容限制
String encodeMsg = DigestUtils.md5Hex(notifyMsg);
String value = limitCacher.getCache(encodeMsg);
if (value == LimitCacher.DEFAILT_VALUE) {
return true;
}
synchronized (lock) {
value = limitCacher.getCache(encodeMsg);
if (value == LimitCacher.DEFAILT_VALUE) {
return true;
}
limitCacher.put(encodeMsg, LimitCacher.DEFAILT_VALUE);
return false;
}
}
/**
* @return java.lang.String
* @author
* @date
*/
private String getActiveProfiles() {
String[] activeProfiles = SpringContextUtil.getApplicationContext() == null ? null : SpringContextUtil.getActiveProfile();
if (activeProfiles == null || activeProfiles.length == 0) {
return "";
}
List actualActiveProfiles = Arrays.stream(activeProfiles)
// 部分项目的profile中有用到include,include中含有"-",过滤掉include的数据
.filter(str -> !str.contains("-"))
.collect(Collectors.toList());
return StringUtils.join(actualActiveProfiles, ",");
}
/**
* @param projectName 项目名,必填
* @param rtxReceivers 告警人,多个之间以","分隔,必填
* @param limitSecond 如果瞬间有大量"相同内容"的异常,在limitSecond秒内只会告警一次
* @param cacheMaxSize 配合limitSecond,缓存当前已告警的数量,指定最多存储多少条,防止无限制存储数据导致内存溢出
* @param noPrintExceptionStack 异常信息内是否不打印堆栈信息
* @param printExceptionStackLength 如果打印堆栈信息,打印多少行,取Math.min(printExceptionStackLength, 堆栈长度)
* @param maxMsgLength 消息内容长度限制,默认4000,当前测试"企业微信接口"超过4000就发送失败
* @return com.tencent.tscm.common.log4j2.ErrorNotifyLog4j2Filter
* @author
* @date
*/
@PluginFactory
public static ErrorNotifyLog4j2Filter createFilter(@PluginAttribute("projectName") final String projectName, @PluginAttribute("rtxReceivers") final String rtxReceivers, @PluginAttribute("limitSecond") final Integer limitSecond, @PluginAttribute("cacheMaxSize") final Integer cacheMaxSize, @PluginAttribute("noPrintExceptionStack") final Boolean noPrintExceptionStack, @PluginAttribute("printExceptionStackLength") final Integer printExceptionStackLength, @PluginAttribute("maxMsgLength") final Integer maxMsgLength) {
return new ErrorNotifyLog4j2Filter(projectName, rtxReceivers,
limitSecond == null || limitSecond <= 0 ? LimitCacher.DEFAULT_LIMIT_SECOND : limitSecond,
cacheMaxSize == null || cacheMaxSize <= 0 ? LimitCacher.DEFAULT_CACHE_MAX_SIZE : cacheMaxSize,
noPrintExceptionStack == null ? false : noPrintExceptionStack.booleanValue(),
printExceptionStackLength == null ? DEFAULT_PRINT_EXCEPTIONSTACK_LENGTH : printExceptionStackLength,
maxMsgLength == null ? DEFAULT_MAX_MSG_LENGTH : maxMsgLength);
}
}
@Slf4j
public abstract class AbstractLocalCache implements Cache {
//缓存多久超时
protected static long expireAfterWriteDuration = 30;
//时间单位
protected static TimeUnit timeUnit = TimeUnit.MINUTES;
protected LoadingCache cache;
public AbstractLocalCache() {
this(expireAfterWriteDuration, timeUnit);
}
public AbstractLocalCache(Executor executor) {
this(expireAfterWriteDuration, timeUnit, executor);
}
public AbstractLocalCache(long expireAfterWriteDuration, TimeUnit timeUnit) {
this(CacheBuilder.newBuilder().refreshAfterWrite(expireAfterWriteDuration, timeUnit));
}
public AbstractLocalCache(long expireAfterWriteDuration, TimeUnit timeUnit, Executor executor) {
this(CacheBuilder.newBuilder().refreshAfterWrite(expireAfterWriteDuration, timeUnit), executor);
}
public AbstractLocalCache(CacheBuilder cacheBuilder) {
cache = cacheBuilder.build(new CacheLoader() {
@Override
public V load(K key) throws Exception {
return loadData(key);
}
});
}
public AbstractLocalCache(CacheBuilder cacheBuilder, Executor executor) {
cache = cacheBuilder.build(CacheLoader.asyncReloading(new CacheLoader() {
@Override
public V load(K key) throws Exception {
return loadData(key);
}
@Override
public ListenableFuture reload(K key, V oldValue) throws Exception {
try {
return super.reload(key, oldValue);
} catch (Exception e) {
log.error("reload cache[{}] error, return oldVlue[{}]!", key, oldValue, e);
return Futures.immediateFuture(oldValue);
}
}
}, executor));
}
public AbstractLocalCache(CacheLoader cacheLoader) {
this(CacheBuilder.newBuilder().refreshAfterWrite(expireAfterWriteDuration, timeUnit), cacheLoader);
}
public AbstractLocalCache(CacheLoader cacheLoader, Executor executor) {
this(CacheBuilder.newBuilder().refreshAfterWrite(expireAfterWriteDuration, timeUnit), cacheLoader, executor);
}
public AbstractLocalCache(CacheBuilder cacheBuilder, CacheLoader cacheLoader) {
this(cacheBuilder.build(cacheLoader));
}
public AbstractLocalCache(CacheBuilder cacheBuilder, CacheLoader cacheLoader, Executor executor) {
this(cacheBuilder.build(CacheLoader.asyncReloading(cacheLoader, executor)));
}
public AbstractLocalCache(LoadingCache cache) {
this.cache = cache;
}
/**
* 加载缓存
*
* @param key
* @return
* @author
* @date 2017/11/8 14:13
*/
protected abstract V loadData(K key);
/**
* 获取缓存
*
* @param key
* @return
* @throws ExecutionException
* @author
* @date 2017/11/8 14:13
*/
public V getCache(K key) throws ExecutionException {
return cache.get(key);
}
/**
* 刷新
*
* @param key
* @author
* @date 2017/12/29 11:42
*/
public void refresh(K key) {
cache.refresh(key);
}
@Override
public V getIfPresent(Object o) {
return cache.getIfPresent(o);
}
@Override
public V get(K k, Callable extends V> callable) throws ExecutionException {
return cache.get(k, callable);
}
@Override
public ImmutableMap getAllPresent(Iterable> iterable) {
return cache.getAllPresent(iterable);
}
@Override
public void put(K k, V v) {
cache.put(k, v);
}
@Override
public void putAll(Map extends K, ? extends V> map) {
cache.putAll(map);
}
@Override
public void invalidate(Object o) {
cache.invalidate(o);
}
@Override
public void invalidateAll(Iterable> iterable) {
cache.invalidateAll(iterable);
}
@Override
public void invalidateAll() {
cache.invalidateAll();
}
@Override
public long size() {
return cache.size();
}
@Override
public CacheStats stats() {
return cache.stats();
}
@Override
public ConcurrentMap asMap() {
return cache.asMap();
}
@Override
public void cleanUp() {
cache.cleanUp();
}
}
public class LimitCacher extends AbstractLocalCache {
public static final String DEFAILT_VALUE = "DEFAILT_VALUE";
public static final String NULL_VALUE = "NULL_VALUE";
public static final int DEFAULT_LIMIT_SECOND = 1;
public static final int DEFAULT_CACHE_MAX_SIZE = 1000;
public LimitCacher(long expireAfterWriteDuration, int maximumSize) {
super(CacheBuilder.newBuilder().maximumSize(maximumSize).refreshAfterWrite(expireAfterWriteDuration, TimeUnit.SECONDS));
}
@Override
protected String loadData(String key) {
return NULL_VALUE;
}
}
避坑
NEUTRAL
和ACCEPT
的区别是:
- 如果配置的是
ACCEPT
,则表示过滤通过且不走后续的Filter。 - 如果配置的是
NEUTRAL
,也通过并继续后续的Filter。
备注
本篇文章的做法,是告警跟项目在同一进程内,这样有不少缺点,比如:
- 如果告警功能有问题,比如造成内存泄漏,可能会影响到项目正常运行。
- 升级不方便:功能改进/BUG修复时,业务方需要升级的jar包,这样会给业务方造成困扰;而且无法统一各个业务方的版本,后续版本升级时,还需要考虑各个版本的兼容情况。
综上,本篇文章的做法,比较适合小系统。