敏感词过滤算法(2)
2020-09-04 本文已影响0人
筑梦之队
去年写过一篇文章,名字叫做《敏感词过滤算法》,介绍了为什么需要敏感词过滤算法?以及使用什么方式来实现的。其中提到三种实现方法,其中最重要的是DFA算法。
当时给出的DFA算法的实现方式较为复杂;而我一贯推崇简单的解决问题的模型和代码实现。因为越复杂的模型和代码实现,意味着更多的BUG和更困难的维护。
果然不出所料,在最近一个游戏上线的过程中,就发现了其中的一个BUG。于是,在解决完BUG后,我重新梳理了一下代码的实现逻辑,发现可以进行优化。于是就进行了重构。
新的核心代码如下:
dfa.go
package dfaUtil
/*
DFA util, is used to verify whether a sentence has invalid words.
The underlying data structure is trie.
https://en.wikipedia.org/wiki/Trie
*/
// dfa util
type DFAUtil struct {
// The root node
root *trieNode
}
// 由于go不支持tuple,所以为了避免定义多余的struct,特别使用两个list来分别返回匹配的索引的上界和下界
// 在处理此方法的返回值时,需要两者配合使用
func (this *DFAUtil) searcSentence(sentence string) (startIndexList, endIndexList []int) {
// Point current node to the root node and initialize some variables.
currNode := this.root
start, end, valid := 0, 0, false
// Iterate the setence to handle each letter
sentenceRuneList := []rune(sentence)
for i := 0; i < len(sentenceRuneList); {
// If the letter can be found in current node's children, then continue to find along this path.
if child, exists := currNode.children[sentenceRuneList[i]]; exists {
// If the letter is end of a word, then it's a valid match.
// Then set valid to true, and assign the index to the end variable.
if child.isEndOfWord {
end = i
valid = true
}
// If the child doesn't have any child, it means it's the end of a path. Then add the last valid index pair into list.
// And continue to handle the next letter from the root node.
// Otherwise, continue to handle along this path.
if len(child.children) == 0 {
startIndexList = append(startIndexList, start)
endIndexList = append(endIndexList, end)
currNode = this.root
// Reset variables, and starts from the next letter.
start, end, valid = i+1, 0, false
} else {
currNode = child
}
// Handle the next letter.
i++
} else {
// When the letter can't be found in current node's children, there are two possibilities:
// 1. There is already a valid match index pair. Then add them to list. And rehandle this letter again from the root node.
// 2. There is no valid match index pair. Then continue to handle next letter from the root node.
if valid {
startIndexList = append(startIndexList, start)
endIndexList = append(endIndexList, end)
currNode = this.root
start, end, valid = i, 0, false
} else {
currNode = this.root
start, end, valid = i+1, 0, false
i++
}
}
}
return
}
// Insert new word into object
func (this *DFAUtil) InsertWord(word []rune) {
currNode := this.root
for _, c := range word {
if cildNode, exist := currNode.children[c]; !exist {
cildNode = newtrieNode()
currNode.children[c] = cildNode
currNode = cildNode
} else {
currNode = cildNode
}
}
currNode.isEndOfWord = true
}
// Check if there is any word in the trie that starts with the given prefix.
func (this *DFAUtil) StartsWith(prefix []rune) bool {
currNode := this.root
for _, c := range prefix {
if cildNode, exist := currNode.children[c]; !exist {
return false
} else {
currNode = cildNode
}
}
return true
}
// Judge if input sentence contains some special caracter
// Return:
// Matc or not
func (this *DFAUtil) IsMatch(sentence string) bool {
startIndexList, _ := this.searcSentence(sentence)
return len(startIndexList) > 0
}
// Handle sentence. Use specified caracter to replace those sensitive caracters.
// input: Input sentence
// replaceCh: candidate
// Return:
// Sentence after manipulation
func (this *DFAUtil) HandleWord(sentence string, replaceCh rune) string {
startIndexList, endIndexList := this.searcSentence(sentence)
if len(startIndexList) == 0 {
return sentence
}
// Manipulate
sentenceList := []rune(sentence)
for i := 0; i < len(startIndexList); i++ {
for index := startIndexList[i]; index <= endIndexList[i]; index++ {
sentenceList[index] = replaceCh
}
}
return string(sentenceList)
}
// Create new DfaUtil object
// wordList:word list
func NewDFAUtil(wordList []string) *DFAUtil {
this := &DFAUtil{
root: newtrieNode(),
}
for _, word := range wordList {
wordRuneList := []rune(word)
if len(wordRuneList) > 0 {
this.InsertWord(wordRuneList)
}
}
return this
}
完整的代码,请参考:https://github.com/Jordanzuo/goutil/tree/master/dfaUtil