My teammates called this "too elegant" | Sensitive word filtering and masking
Business background
The goal is to implement sensitive word filtering and masking for IM private chat, live room bullet comments (danmaku), the public screen, and lobby broadcast messages.
Requirements analysis and breakdown
- Define and import the sensitive word lexicon
- Implement DFA-based sensitive word matching
- Define blacklist and whitelist lexicons
Implementation approach
- Wrap everything in a non-intrusive component so teammates can reuse it easily
- Let the caller choose between filtering (reject) and masking (replace) at the call site
- Use Spring EL expressions to dynamically read and modify bean property values (a standalone sketch of this mechanism follows this list)
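To make the last point concrete, here is a minimal, self-contained sketch of the SpEL mechanism the interceptor below relies on: reading and overwriting a property of an object through an expression string. The Bo class and its name field are purely illustrative.

```java
import org.springframework.expression.Expression;
import org.springframework.expression.ExpressionParser;
import org.springframework.expression.spel.standard.SpelExpressionParser;
import org.springframework.expression.spel.support.StandardEvaluationContext;

public class SpelDemo {

    // Illustrative bean; SpEL write access only needs a standard getter/setter
    public static class Bo {
        private String name = "hello world";
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
    }

    public static void main(String[] args) {
        Bo bo = new Bo();

        ExpressionParser parser = new SpelExpressionParser();
        StandardEvaluationContext context = new StandardEvaluationContext();
        // Expose the object under the variable name "bo", so "#bo.name" resolves to bo.getName()
        context.setVariable("bo", bo);

        Expression expression = parser.parseExpression("#bo.name");
        String value = expression.getValue(context, String.class); // dynamic read
        expression.setValue(context, "***");                       // dynamic write, calls bo.setName("***")

        System.out.println(value + " -> " + bo.getName());         // prints: hello world -> ***
    }
}
```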
Implementation
Importing the blacklist and whitelist lexicons
One sensitive word per line. You can find a lexicon online or build your own.
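For instance, a blacklist.txt placed on the classpath could look like this (the entries are placeholders, not a real lexicon):

```text
badword1
badword2
another banned phrase
```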
Wrapping the lexicon in utility classes
```java
public class SensitiveWordUtil {

    /**
     * Lexicon context
     */
    public static final WordContext CONTENT = new WordContext();

    public static final WordFilter WORD_FILTER = new WordFilter(CONTENT);
}
```
```java
public class WordContext {

    /**
     * Sensitive word dictionary (the DFA trie)
     */
    private final Map wordMap = new HashMap(1024);

    /**
     * Whether the context has been initialized
     */
    private boolean init;

    /**
     * Blacklist file
     */
    private final String blackList;

    /**
     * Whitelist file
     */
    private final String whiteList;

    public WordContext() {
        this.blackList = "/blacklist.txt";
        this.whiteList = "/whitelist.txt";
        initKeyWord();
    }

    public WordContext(String blackList, String whiteList) {
        this.blackList = blackList;
        this.whiteList = whiteList;
        initKeyWord();
    }

    /**
     * Get the initialized sensitive word map
     *
     * @return the word map
     */
    public Map getWordMap() {
        return wordMap;
    }

    /**
     * Initialization
     */
    private synchronized void initKeyWord() {
        try {
            if (!init) {
                // Load the blacklist into the HashMap
                addWord(readWordFile(blackList), WordType.BLACK);
                // Load the whitelist into the HashMap as well
                addWord(readWordFile(whiteList), WordType.WHITE);
            }
            init = true;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Read the word list and build the DFA model, e.g.:
     * 中 = { isEnd = 0, 国 = { isEnd = 1, 人 = { isEnd = 0, 民 = { isEnd = 1 } }, 男 = { isEnd = 0, 人 = { isEnd = 1 } } } }
     * 五 = { isEnd = 0, 星 = { isEnd = 0, 红 = { isEnd = 0, 旗 = { isEnd = 1 } } } }
     */
    public void addWord(Iterable<String> wordList, WordType wordType) {
        Map nowMap;
        Map<String, String> newWorMap;
        // Iterate over the word list
        for (String key : wordList) {
            nowMap = wordMap;
            for (int i = 0; i < key.length(); i++) {
                // Current character
                char keyChar = key.charAt(i);
                // Look up the child node
                Object wordMap = nowMap.get(keyChar);
                // If the node exists, descend into it
                if (wordMap != null) {
                    nowMap = (Map) wordMap;
                } else {
                    // Otherwise create a new node and mark isEnd = 0, since it is not the last character
                    newWorMap = new HashMap<>(4);
                    newWorMap.put("isEnd", String.valueOf(EndType.HAS_NEXT.ordinal()));
                    nowMap.put(keyChar, newWorMap);
                    nowMap = newWorMap;
                }
                if (i == key.length() - 1) {
                    // Last character: mark the end of a word and record its type
                    nowMap.put("isEnd", String.valueOf(EndType.IS_END.ordinal()));
                    nowMap.put("isWhiteWord", String.valueOf(wordType.ordinal()));
                }
            }
        }
    }

    /**
     * Remove words at runtime
     *
     * @param wordList word list
     * @param wordType WordType.BLACK for blacklist, WordType.WHITE for whitelist
     */
    public void removeWord(Iterable<String> wordList, WordType wordType) {
        Map nowMap;
        for (String key : wordList) {
            List<Map> cacheList = new ArrayList<>();
            nowMap = wordMap;
            for (int i = 0; i < key.length(); i++) {
                char keyChar = key.charAt(i);
                Object map = nowMap.get(keyChar);
                if (map != null) {
                    nowMap = (Map) map;
                    cacheList.add(nowMap);
                } else {
                    return;
                }
                if (i == key.length() - 1) {
                    char[] keys = key.toCharArray();
                    boolean cleanable = false;
                    char lastChar = 0;
                    for (int j = cacheList.size() - 1; j >= 0; j--) {
                        Map cacheMap = cacheList.get(j);
                        if (j == cacheList.size() - 1) {
                            if (String.valueOf(WordType.BLACK.ordinal()).equals(cacheMap.get("isWhiteWord"))) {
                                if (wordType == WordType.WHITE) {
                                    return;
                                }
                            }
                            if (String.valueOf(WordType.WHITE.ordinal()).equals(cacheMap.get("isWhiteWord"))) {
                                if (wordType == WordType.BLACK) {
                                    return;
                                }
                            }
                            cacheMap.remove("isWhiteWord");
                            cacheMap.remove("isEnd");
                            if (cacheMap.size() == 0) {
                                cleanable = true;
                                continue;
                            }
                        }
                        if (cleanable) {
                            Object isEnd = cacheMap.get("isEnd");
                            if (String.valueOf(EndType.IS_END.ordinal()).equals(isEnd)) {
                                cleanable = false;
                            }
                            cacheMap.remove(lastChar);
                        }
                        lastChar = keys[j];
                    }
                    if (cleanable) {
                        wordMap.remove(lastChar);
                    }
                }
            }
        }
    }

    /**
     * Read the word file and collect its lines into a Set
     */
    private Set<String> readWordFile(String file) throws Exception {
        Set<String> set;
        // Character encoding
        String encoding = "UTF-8";
        try (InputStreamReader read = new InputStreamReader(
                this.getClass().getResourceAsStream(file), encoding)) {
            set = new HashSet<>();
            BufferedReader bufferedReader = new BufferedReader(read);
            String txt;
            // Read the file line by line into the set
            while ((txt = bufferedReader.readLine()) != null) {
                set.add(txt);
            }
        }
        // The stream is closed by try-with-resources
        return set;
    }
}
```
```java
public class WordFilter {

    /**
     * Sensitive word map (the DFA trie)
     */
    private final Map wordMap;

    /**
     * Constructor
     */
    public WordFilter(WordContext context) {
        this.wordMap = context.getWordMap();
    }

    /**
     * Replace sensitive words
     *
     * @param text input text
     */
    public String replace(final String text) {
        return replace(text, 0, '*');
    }

    /**
     * Replace sensitive words
     *
     * @param text   input text
     * @param symbol replacement symbol
     */
    public String replace(final String text, final char symbol) {
        return replace(text, 0, symbol);
    }

    /**
     * Replace sensitive words
     *
     * @param text   input text
     * @param skip   allowed gap (skip distance) between matched characters
     * @param symbol replacement symbol
     */
    public String replace(final String text, final int skip, final char symbol) {
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                if (!fi.isWhiteWord()) {
                    for (int j : fi.getIndex()) {
                        charset[j] = symbol;
                    }
                } else {
                    i += fi.getIndex().size() - 1;
                }
            }
        }
        return new String(charset);
    }

    /**
     * Whether the text contains a sensitive word
     *
     * @param text input text
     */
    public boolean include(final String text) {
        return include(text, 0);
    }

    /**
     * Whether the text contains a sensitive word
     *
     * @param text input text
     * @param skip allowed gap (skip distance) between matched characters
     */
    public boolean include(final String text, final int skip) {
        boolean include = false;
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                if (fi.isWhiteWord()) {
                    i += fi.getIndex().size() - 1;
                } else {
                    include = true;
                    break;
                }
            }
        }
        return include;
    }

    /**
     * Count the sensitive words in the text
     *
     * @param text input text
     */
    public int wordCount(final String text) {
        return wordCount(text, 0);
    }

    /**
     * Count the sensitive words in the text
     *
     * @param text input text
     * @param skip allowed gap (skip distance) between matched characters
     */
    public int wordCount(final String text, final int skip) {
        int count = 0;
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                if (fi.isWhiteWord()) {
                    i += fi.getIndex().size() - 1;
                } else {
                    count++;
                }
            }
        }
        return count;
    }

    /**
     * List the sensitive words found in the text
     *
     * @param text input text
     */
    public List<String> wordList(final String text) {
        return wordList(text, 0);
    }

    /**
     * List the sensitive words found in the text
     *
     * @param text input text
     * @param skip allowed gap (skip distance) between matched characters
     */
    public List<String> wordList(final String text, final int skip) {
        List<String> wordList = new ArrayList<>();
        char[] charset = text.toCharArray();
        for (int i = 0; i < charset.length; i++) {
            FlagIndex fi = getFlagIndex(charset, i, skip);
            if (fi.isFlag()) {
                if (fi.isWhiteWord()) {
                    i += fi.getIndex().size() - 1;
                } else {
                    StringBuilder builder = new StringBuilder();
                    for (int j : fi.getIndex()) {
                        char word = text.charAt(j);
                        builder.append(word);
                    }
                    wordList.add(builder.toString());
                }
            }
        }
        return wordList;
    }

    /**
     * Walk the trie from the given start position and return the match result
     *
     * @param charset input text
     * @param begin   start index
     * @param skip    allowed gap (skip distance) between matched characters
     */
    private FlagIndex getFlagIndex(final char[] charset, final int begin, final int skip) {
        FlagIndex fi = new FlagIndex();
        Map current = wordMap;
        boolean flag = false;
        int count = 0;
        List<Integer> index = new ArrayList<>();
        for (int i = begin; i < charset.length; i++) {
            char word = charset[i];
            Map mapTree = (Map) current.get(word);
            if (count > skip || (i == begin && Objects.isNull(mapTree))) {
                break;
            }
            if (Objects.nonNull(mapTree)) {
                current = mapTree;
                count = 0;
                index.add(i);
            } else {
                count++;
                if (flag && count > skip) {
                    break;
                }
            }
            if ("1".equals(current.get("isEnd"))) {
                flag = true;
            }
            if ("1".equals(current.get("isWhiteWord"))) {
                fi.setWhiteWord(true);
                break;
            }
        }
        fi.setFlag(flag);
        fi.setIndex(index);
        return fi;
    }
}
```
Note: this source code comes from https://gitee.com/humingzhang/wordfilter. I won't paste the remaining classes referenced above (WordType, EndType, FlagIndex, and so on) here; if you're interested, check out the repository yourself.
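Before wiring this into an annotation, here is a quick feel for the raw API, a minimal sketch that only uses the methods shown above (the sample text, the words, and the output are illustrative):

```java
import java.util.Collections;
import java.util.List;

public class WordFilterDemo {

    public static void main(String[] args) {
        // Use the shared instances from the utility holder defined above
        WordFilter filter = SensitiveWordUtil.WORD_FILTER;

        String text = "some user message containing badword1";

        boolean hit = filter.include(text);          // is there any blacklisted word?
        int count = filter.wordCount(text);          // how many?
        List<String> words = filter.wordList(text);  // which ones?
        String masked = filter.replace(text);        // mask each matched character with '*'

        System.out.println(hit + " / " + count + " / " + words);
        System.out.println(masked);

        // The lexicon can also be adjusted at runtime through the context
        SensitiveWordUtil.CONTENT.addWord(Collections.singletonList("newbadword"), WordType.BLACK);
        SensitiveWordUtil.CONTENT.removeWord(Collections.singletonList("newbadword"), WordType.BLACK);
    }
}
```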
Sensitive word annotation and method interception
```java
@Retention(value = RetentionPolicy.RUNTIME)
@Target(value = {ElementType.METHOD})
public @interface SensitiveWordFilter {

    /**
     * Content to check, as SpEL expressions
     */
    String[] content();

    /**
     * Filter type
     */
    SensitiveWordFilterType filterType() default SensitiveWordFilterType.FILTER;
}
```
```java
@AllArgsConstructor
public enum SensitiveWordFilterType {

    /**
     * Filter
     */
    FILTER,

    /**
     * Replace / mask (desensitize)
     */
    REPLACE,
    ;
}
```
```java
@Slf4j
public class SensitiveWordInterceptor implements MethodInterceptor {

    private static final ParameterNameDiscoverer NAME_DISCOVERER = new DefaultParameterNameDiscoverer();

    private static final ExpressionParser PARSER = new SpelExpressionParser();

    private BeanResolver beanResolver;

    public SensitiveWordInterceptor(BeanFactory beanFactory) {
        this.beanResolver = new BeanFactoryResolver(beanFactory);
    }

    @Override
    public Object invoke(MethodInvocation invocation) throws Throwable {
        Class<?> cls = AopProxyUtils.ultimateTargetClass(invocation.getThis());
        if (!cls.equals(invocation.getThis().getClass())) {
            return invocation.proceed();
        }
        SensitiveWordFilter sensitiveWordFilter = invocation.getMethod().getAnnotation(SensitiveWordFilter.class);
        StandardEvaluationContext context = new MethodBasedEvaluationContext(null, invocation.getMethod(),
                invocation.getArguments(), NAME_DISCOVERER);
        context.setBeanResolver(beanResolver);

        String[] contentKeys = sensitiveWordFilter.content();
        // Use an array-aware empty check here; StringUtils.isEmpty does not handle a String[]
        if (ArrayUtils.isEmpty(contentKeys)) {
            log.warn("Nothing to filter: content is empty.");
            return invocation.proceed();
        }
        for (String key : contentKeys) {
            String content = PARSER.parseExpression(key).getValue(context, String.class);
            if (StringUtils.isBlank(content)) {
                continue;
            }
            boolean include = SensitiveWordUtil.WORD_FILTER.include(StringUtils.deleteWhitespace(content));
            if (sensitiveWordFilter.filterType().equals(SensitiveWordFilterType.FILTER)) {
                if (include) {
                    log.error("Content contains sensitive words, throwing exception | key:{} | content:{}", key, content);
                    throw new SensitiveWordException(SensitiveWordCode.CONTAINS_SENSITIVE_WORD);
                }
            } else if (sensitiveWordFilter.filterType().equals(SensitiveWordFilterType.REPLACE)) {
                if (include) {
                    // Write the masked value back into the argument through the same SpEL expression
                    PARSER.parseExpression(key).setValue(context,
                            SensitiveWordUtil.WORD_FILTER.replace(StringUtils.deleteWhitespace(content)));
                    log.error("Content contains sensitive words, masked | key:{} | content:{}", key, content);
                }
            }
        }
        return invocation.proceed();
    }
}
```
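The SensitiveWordException and SensitiveWordCode referenced above are not part of the pasted code; a minimal sketch of what they could look like (the names come from the interceptor, everything else is an assumption):

```java
/**
 * Hypothetical error code enum; only the constant referenced above is defined.
 */
public enum SensitiveWordCode {
    CONTAINS_SENSITIVE_WORD("The content contains sensitive words");

    private final String message;

    SensitiveWordCode(String message) {
        this.message = message;
    }

    public String getMessage() {
        return message;
    }
}

/**
 * Hypothetical business exception carrying the error code.
 */
public class SensitiveWordException extends RuntimeException {

    private final SensitiveWordCode code;

    public SensitiveWordException(SensitiveWordCode code) {
        super(code.getMessage());
        this.code = code;
    }

    public SensitiveWordCode getCode() {
        return code;
    }
}
```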
```java
public class SensitiveWordAnnotationAdvisor extends AbstractPointcutAdvisor implements BeanFactoryAware {

    private final Advice advice;

    private final Pointcut pointcut = AnnotationMatchingPointcut.forMethodAnnotation(SensitiveWordFilter.class);

    public SensitiveWordAnnotationAdvisor(@NonNull SensitiveWordInterceptor sensitiveWordInterceptor, int order) {
        this.advice = sensitiveWordInterceptor;
        setOrder(order);
    }

    @Override
    public Pointcut getPointcut() {
        return this.pointcut;
    }

    @Override
    public Advice getAdvice() {
        return this.advice;
    }

    @Override
    public void setBeanFactory(BeanFactory beanFactory) throws BeansException {
        if (this.advice instanceof BeanFactoryAware) {
            ((BeanFactoryAware) this.advice).setBeanFactory(beanFactory);
        }
    }
}
```
```java
@Configuration
public class SensitiveWordFilterAutoConfiguration {

    @Bean
    @ConditionalOnMissingBean
    public SensitiveWordInterceptor sensitiveWordInterceptor(BeanFactory beanFactory) {
        return new SensitiveWordInterceptor(beanFactory);
    }

    @Bean
    @ConditionalOnMissingBean
    public SensitiveWordAnnotationAdvisor sensitiveWordAnnotationAdvisor(SensitiveWordInterceptor sensitiveWordInterceptor) {
        return new SensitiveWordAnnotationAdvisor(sensitiveWordInterceptor, Ordered.LOWEST_PRECEDENCE);
    }
}
```
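If this is shipped as a standalone starter jar rather than picked up by component scanning, the auto-configuration class also needs to be registered so Spring Boot can find it. On Spring Boot 2.x that is usually a META-INF/spring.factories entry like the one below (the package name is an assumption):

```properties
org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
  com.example.sensitiveword.SensitiveWordFilterAutoConfiguration
```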
OK, the code is done. Next, let's see whether it is actually elegant and intuitive to use.
Filtering: throw an exception
```java
@SensitiveWordFilter(content = {"#bo.name", "#bo.intro"}, filterType = SensitiveWordFilterType.FILTER)
public void update(LiveRoomUpdateBo bo) {
}
```
Masking: proceed with desensitized data
```java
@SensitiveWordFilter(content = {"#bo.name", "#bo.intro"}, filterType = SensitiveWordFilterType.REPLACE)
public void update(LiveRoomUpdateBo bo) {
}
```
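The #bo.name and #bo.intro expressions resolve against the method parameter named bo, so LiveRoomUpdateBo needs matching getters and setters (the setters are what allow REPLACE mode to write the masked value back). A minimal sketch with only the two fields implied by the expressions; the real class certainly has more:

```java
import lombok.Data;

@Data
public class LiveRoomUpdateBo {

    /**
     * Live room name
     */
    private String name;

    /**
     * Live room introduction
     */
    private String intro;
}
```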
Summary
Because this approach is elegant and non-intrusive, the same pattern can be extended to many other use cases, such as permission checks and distributed locks.
That said, AOP proxy-based method interception is not something to use everywhere: too much reflection will inevitably hurt interface performance.