敏感词过滤工具类
2018-08-01 本文已影响0人
皮皮咕
/**
 * Sensitive-word filter backed by a DFA (character trie) that is built once,
 * in the static initializer, from a local dictionary file with one word per line.
 *
 * <p>If the dictionary cannot be loaded the error is logged and the filter behaves
 * as if the dictionary were empty: {@link #contains(String)} returns {@code false}
 * instead of failing.
 *
 * <p>The trie is only written during class initialization and is read-only afterwards.
 *
 * <p>Created 2018/7/31.
 *
 * @author Administrator
 */
public class SensitiveWordUtil {
    private static final Logger logger = LoggerFactory.getLogger(SensitiveWordUtil.class);

    /** Dictionary location, resolved against the process working directory. */
    private static final String filePath = "src/main/resources/dictionary.txt";

    /**
     * Classpath location tried when {@link #filePath} does not exist, for example
     * when the process is not started from the project root.
     */
    private static final String CLASSPATH_DICTIONARY = "/dictionary.txt";

    /**
     * Shortest-match rule. With dictionary ["中国", "中国人"] and text "我是中国人"
     * the reported word is "中国".
     */
    public static final int MIN_MATCH_TYPE = 1;

    /**
     * Longest-match rule. With dictionary ["中国", "中国人"] and text "我是中国人"
     * the reported word is "中国人".
     */
    public static final int MAX_MATCH_TYPE = 2;

    /**
     * Misspelled alias kept so existing callers continue to compile.
     *
     * @deprecated use {@link #MAX_MATCH_TYPE}
     */
    @Deprecated
    public static final int MAX_MaTCH_TYPE = MAX_MATCH_TYPE;

    /** One state of the DFA: outgoing transitions plus an "a word ends here" marker. */
    private static final class DfaNode {
        private final Map<Character, DfaNode> next = new HashMap<>(2);
        private boolean end;
    }

    /**
     * Root transitions of the DFA, keyed by the first character of each word.
     * Starts out empty so lookups are safe even when the dictionary failed to load.
     */
    private static Map<Character, DfaNode> sensitiveWordMap = new HashMap<>();

    static {
        // Build the DFA once, when the class is first used.
        initSensitiveWordMap(getSensitiveWordSet());
    }

    /**
     * Opens the dictionary, preferring the file on disk and falling back to the classpath.
     *
     * @return an open stream that the caller must close
     * @throws java.io.IOException if the dictionary is found in neither location
     */
    private static InputStream openDictionary() throws java.io.IOException {
        File file = new File(filePath);
        if (file.isFile()) {
            return new FileInputStream(file);
        }
        InputStream resource = SensitiveWordUtil.class.getResourceAsStream(CLASSPATH_DICTIONARY);
        if (resource == null) {
            throw new java.io.FileNotFoundException(
                    "dictionary not found at " + file.getAbsolutePath()
                            + " or on the classpath at " + CLASSPATH_DICTIONARY);
        }
        return resource;
    }

    /**
     * Reads the dictionary as UTF-8 text, one word per line.
     *
     * <p>Lines are trimmed, blank lines are skipped and a leading byte-order mark is
     * removed, so that such artifacts do not produce entries that can never match.
     *
     * @return the dictionary words; an empty set (never {@code null}) if loading failed
     */
    private static Set<String> getSensitiveWordSet() {
        Set<String> words = new HashSet<>();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                openDictionary(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.isEmpty() && line.charAt(0) == '\uFEFF') {
                    line = line.substring(1);
                }
                line = line.trim();
                if (!line.isEmpty()) {
                    words.add(line);
                }
            }
            logger.info("敏感词的数量:" + words.size());
            return words;
        } catch (Exception e) {
            // Called from the static initializer: an escaping exception would make the
            // whole class unusable, so log it and continue with an empty dictionary.
            logger.error("获取本地敏感词库出错", e);
            return new HashSet<>();
        }
    }

    /**
     * Converts the dictionary into the DFA stored in {@link #sensitiveWordMap}.
     *
     * @param sensitiveWordSet dictionary words; {@code null} or empty leaves the DFA untouched
     */
    private static void initSensitiveWordMap(Set<String> sensitiveWordSet) {
        if (sensitiveWordSet == null || sensitiveWordSet.isEmpty()) {
            return;
        }
        // Pre-size the root level to reduce rehashing while the dictionary is inserted.
        Map<Character, DfaNode> root = new HashMap<>(sensitiveWordSet.size());
        for (String word : sensitiveWordSet) {
            if (word == null || word.isEmpty()) {
                continue;
            }
            Map<Character, DfaNode> level = root;
            DfaNode node = null;
            for (int i = 0; i < word.length(); i++) {
                Character ch = word.charAt(i);
                node = level.get(ch);
                if (node == null) {
                    node = new DfaNode();
                    level.put(ch, node);
                }
                level = node.next;
            }
            // The node reached by the last character marks the end of a complete word.
            node.end = true;
        }
        sensitiveWordMap = root;
    }

    /**
     * Checks whether a dictionary word starts at {@code beginIndex}.
     *
     * <p>The returned length is always that of a complete dictionary word: characters
     * walked beyond the last word end (a prefix of some longer word that is not completed
     * by the text) are not counted. Single-character words are matched as well.
     *
     * @param txt        text to scan
     * @param beginIndex index in {@code txt} at which the word must start
     * @param matchType  {@link #MIN_MATCH_TYPE} to stop at the first word end; any other
     *                   value keeps going and reports the longest word
     * @return length of the matched word, or 0 if no word starts at {@code beginIndex}
     */
    private static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        int matchedLength = 0;
        Map<Character, DfaNode> level = sensitiveWordMap;
        for (int i = beginIndex; i < txt.length(); i++) {
            DfaNode node = level.get(txt.charAt(i));
            if (node == null) {
                // No transition for this character: the walk ends here.
                break;
            }
            if (node.end) {
                // Remember the length up to this word end only.
                matchedLength = i - beginIndex + 1;
                if (MIN_MATCH_TYPE == matchType) {
                    break;
                }
            }
            level = node.next;
        }
        return matchedLength;
    }

    /**
     * Collects the distinct dictionary words that occur in the text.
     *
     * @param txt       text to scan; {@code null} or empty yields an empty set
     * @param matchType {@link #MIN_MATCH_TYPE} or {@link #MAX_MATCH_TYPE}
     * @return the words found, never {@code null}
     */
    private static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> found = new HashSet<>();
        if (txt == null || txt.isEmpty()) {
            return found;
        }
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                // Resume scanning right after the matched word.
                i += length;
            } else {
                i++;
            }
        }
        logger.info("语句中包含敏感词的个数为:" + found.size() + "。包含:" + found);
        return found;
    }

    /**
     * Tells whether the text contains at least one dictionary word.
     *
     * @param txt text to check; {@code null} is treated as containing nothing
     * @return {@code true} if a sensitive word is found, {@code false} otherwise
     */
    public static boolean contains(String txt) {
        logger.info("待检测的字符串为:" + txt);
        if (txt == null) {
            return false;
        }
        return !getSensitiveWord(txt, MAX_MATCH_TYPE).isEmpty();
    }
}