敏感词过滤工具类

2018-08-01  本文已影响0人  皮皮咕
/**
 * Sensitive-word filter backed by a character trie (a DFA built from the dictionary words).
 *
 * <p>The dictionary is loaded once, while this class is being initialised. It is read as
 * UTF-8 text with one word per line. If the dictionary cannot be found or read, the error
 * is logged and the filter behaves as if the dictionary were empty, i.e.
 * {@link #contains(String)} always returns {@code false}.
 *
 * <p>The trie is only written during class initialisation and only read afterwards.
 *
 * @author Administrator
 * @create 2018/7/31
 */
public class SensitiveWordUtil {

    private static final Logger logger = LoggerFactory.getLogger(SensitiveWordUtil.class);

    /** Dictionary location on disk, resolved against the working directory. */
    private static final String filePath = "src/main/resources/dictionary.txt";

    /**
     * Classpath location tried when {@link #filePath} does not exist. Under the standard
     * Maven layout the file above is packaged at the classpath root, so this lets the
     * filter keep working when the application is not started from the project directory.
     */
    private static final String CLASSPATH_DICTIONARY = "/dictionary.txt";

    /**
     * Shortest-match rule. Example: dictionary ["中国", "中国人"], text "我是中国人",
     * result: 我是[中国]人.
     */
    public static final int MIN_MATCH_TYPE = 1;

    /**
     * Longest-match rule. Example: dictionary ["中国", "中国人"], text "我是中国人",
     * result: 我是[中国人].
     */
    public static final int MAX_MATCH_TYPE = 2;

    /**
     * Misspelled name of {@link #MAX_MATCH_TYPE}, kept so existing callers still compile.
     *
     * @deprecated use {@link #MAX_MATCH_TYPE}
     */
    @Deprecated
    public static final int MAX_MaTCH_TYPE = MAX_MATCH_TYPE;

    /** Root of the trie. It is never null, so lookups are safe even with no dictionary. */
    private static final TrieNode ROOT = new TrieNode();

    static {
        // Build the trie once, at class-initialisation time.
        initSensitiveWordMap(getSensitiveWordSet());
    }

    /** One trie state: outgoing transitions plus a marker for "a dictionary word ends here". */
    private static final class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<>();
        private boolean end;
    }

    /**
     * Reads the dictionary, preferring the on-disk file and falling back to the classpath.
     *
     * <p>Lines are trimmed and blank lines are skipped.
     *
     * @return the dictionary words; empty (never {@code null}) if the dictionary is
     *         missing or unreadable
     */
    private static Set<String> getSensitiveWordSet() {
        Set<String> sensitiveWordSet = new HashSet<>();
        File file = new File(filePath);
        try (InputStream inputStream = file.isFile()
                ? new FileInputStream(file)
                : SensitiveWordUtil.class.getResourceAsStream(CLASSPATH_DICTIONARY)) {
            if (inputStream == null) {
                logger.error("获取本地敏感词库出错: {}", filePath);
                return sensitiveWordSet;
            }
            // Fully qualified so that no additional import is required by this class.
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(inputStream, java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    String word = line.trim();
                    if (!word.isEmpty()) {
                        sensitiveWordSet.add(word);
                    }
                }
            }
            logger.info("敏感词的数量:{}", sensitiveWordSet.size());
            return sensitiveWordSet;
        } catch (Exception e) {
            // Deliberately broad: this runs inside the static initialiser, and letting any
            // exception escape would make the whole class unusable.
            logger.error("获取本地敏感词库出错", e);
            return new HashSet<>();
        }
    }

    /**
     * Inserts every dictionary word into the trie rooted at {@link #ROOT}.
     *
     * @param sensitiveWordSet dictionary words; {@code null}, empty sets, and
     *                         {@code null}/empty entries are ignored
     */
    private static void initSensitiveWordMap(Set<String> sensitiveWordSet) {
        if (sensitiveWordSet == null) {
            return;
        }
        for (String word : sensitiveWordSet) {
            if (word == null || word.isEmpty()) {
                continue;
            }
            TrieNode node = ROOT;
            for (int i = 0; i < word.length(); i++) {
                Character key = word.charAt(i);
                TrieNode next = node.children.get(key);
                if (next == null) {
                    next = new TrieNode();
                    node.children.put(key, next);
                }
                node = next;
            }
            // Mark only the final node; a prefix shared with longer words keeps its children.
            node.end = true;
        }
    }

    /**
     * Checks whether a dictionary word starts exactly at {@code beginIndex}.
     *
     * @param txt        text to scan
     * @param beginIndex index in {@code txt} at which the word must start
     * @param matchType  {@link #MIN_MATCH_TYPE} to stop at the first complete word; any
     *                   other value continues and reports the longest complete word
     * @return length of the matched word, or 0 if no dictionary word starts at
     *         {@code beginIndex}
     */
    private static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        // Length of the last COMPLETE word seen. Characters that merely extend a prefix of
        // some longer word must not be counted unless that longer word is completed too.
        int matchedLength = 0;
        TrieNode node = ROOT;
        for (int i = beginIndex; i < txt.length(); i++) {
            node = node.children.get(txt.charAt(i));
            if (node == null) {
                break;
            }
            if (node.end) {
                matchedLength = i - beginIndex + 1;
                if (MIN_MATCH_TYPE == matchType) {
                    break;
                }
            }
        }
        return matchedLength;
    }

    /**
     * Collects the distinct sensitive words found in the text, scanning left to right and
     * resuming after each match so that matches do not overlap.
     *
     * @param txt       text to scan; must not be {@code null}
     * @param matchType matching rule, {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return the sensitive words found; empty if there are none
     */
    private static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> sensitiveWordSet = new HashSet<>();
        for (int i = 0; i < txt.length(); i++) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                sensitiveWordSet.add(txt.substring(i, i + length));
                // Minus one because the loop header increments i again.
                i = i + length - 1;
            }
        }
        logger.info("语句中包含敏感词的个数为:{}。包含:{}", sensitiveWordSet.size(), sensitiveWordSet);
        return sensitiveWordSet;
    }

    /**
     * Tells whether the text contains at least one dictionary word.
     *
     * @param txt text to check; {@code null} is treated as containing nothing
     * @return {@code true} if a sensitive word occurs in {@code txt}, otherwise {@code false}
     */
    public static boolean contains(String txt) {
        logger.info("待检测的字符串为:{}", txt);
        if (txt == null) {
            return false;
        }
        return !getSensitiveWord(txt, MAX_MATCH_TYPE).isEmpty();
    }

}
上一篇 下一篇

猜你喜欢

热点阅读