前缀树（字典树/Trie）Java实现和应用

2020-08-30 本文已影响0人 xiaogp

摘要： 前缀树，字典树，插入查询逻辑，Java实现，时间复杂度分析

前缀树介绍

Trie树又被称为前缀树、字典树，把单词的字母逐个插入一棵树中。对于英文单词，每个节点是a-z之间的一个字母；对于全是数字的字符串，字符集就是0-9。每一个节点包含三个元素：节点对应的字符value、存储子节点信息的Map（字符 -> 节点对象）、是否为词尾的标志end。

前缀树.png
Trie的核心思想是空间换时间。利用字符串的公共前缀来降低查询时间的开销以达到提高效率的目的,最大限度地减少无谓的字符串比较。
常用在字符串检索过滤

前缀树逻辑

插入逻辑

前缀树插入逻辑.png

查询逻辑

前缀树查询逻辑.png

前缀树Java实现

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Stream;


/**
 * Keyword matcher backed by a prefix tree (trie).
 *
 * <p>Keywords are loaded once, one per line, from the file passed to the first
 * {@link #getInstance(String)} call; more can be added at any time with
 * {@link #addWord(String)}. The query methods report whether (and which) keywords
 * occur as substrings of a text.
 *
 * <p>The trie is stored in plain {@link HashMap}s without synchronization, so calling
 * {@link #addWord(String)} concurrently with queries is not safe.
 */
public class TrieCheckUtil {
    // volatile so the double-checked locking in getInstance publishes a fully built instance.
    private static volatile TrieCheckUtil instance;
    // Created eagerly: addWord is public and static, so it can run before getInstance.
    private static final TrieNode root = new TrieNode();

    /**
     * Loads every line of the given file as a keyword.
     *
     * @param siteNamePath path of the keyword file, one keyword per line
     */
    private TrieCheckUtil(String siteNamePath) {
        // Files.lines keeps the file open until the stream is closed.
        try (Stream<String> lines = Files.lines(Paths.get(siteNamePath))) {
            lines.forEach(TrieCheckUtil::addWord);
        } catch (IOException | RuntimeException e) {
            // Deliberately best effort: a missing or unreadable file leaves the trie with
            // whatever was loaded so far instead of failing construction.
            System.err.println("TrieCheckUtil: failed to load keywords from " + siteNamePath);
            e.printStackTrace();
        }
    }

    /**
     * Returns the shared instance, creating it on first use.
     *
     * @param siteNamePath keyword file to load; only read by the call that creates the
     *                     instance, later calls ignore it
     * @return the singleton matcher
     */
    public static TrieCheckUtil getInstance(String siteNamePath) {
        if (instance == null) {
            synchronized (TrieCheckUtil.class) {
                if (instance == null) {
                    instance = new TrieCheckUtil(siteNamePath);
                }
            }
        }
        return instance;
    }

    /**
     * Adds one keyword to the shared trie.
     *
     * @param word keyword to add; {@code null} and empty strings are ignored
     */
    public static void addWord(String word) {
        if (word == null || word.length() == 0) {
            return;
        }
        // Always start from the shared root and walk/create one node per character.
        TrieNode current = root;
        for (int i = 0; i < word.length(); i++) {
            current = current.add(word.charAt(i));
        }
        current.end = true;
    }

    /**
     * One trie node.
     * value: the character stored in this node
     * child: child nodes keyed by their character
     * end:   whether a keyword ends at this node
     */
    private static class TrieNode {
        private char value;
        private final Map<Character, TrieNode> child = new HashMap<>();
        private boolean end = false;

        TrieNode() {
        }

        /**
         * Returns the child for {@code newChar}, creating it if it does not exist yet.
         */
        TrieNode add(char newChar) {
            TrieNode next = child.get(newChar);
            if (next == null) {
                next = new TrieNode();
                next.value = newChar;
                child.put(newChar, next);
            }
            return next;
        }
    }

    /**
     * Finds the shortest keyword that starts exactly at {@code start}.
     *
     * @return the exclusive end index of that keyword, or -1 if none starts there
     */
    private static int matchEnd(String text, int start) {
        TrieNode node = root;
        for (int i = start; i < text.length(); i++) {
            node = node.child.get(text.charAt(i));
            if (node == null) {
                return -1;
            }
            if (node.end) {
                return i + 1;
            }
        }
        return -1;
    }

    /**
     * Tells whether any keyword occurs as a substring of {@code text}.
     *
     * @param text text to scan; {@code null} or empty yields {@code false}
     * @return {@code true} if at least one keyword is contained in the text
     */
    public boolean isContains(String text) {
        if (text == null || text.length() == 0) {
            return false;
        }
        // Every start position is tried on its own, so a failed partial match can
        // never hide a keyword that begins inside it (e.g. "aab" inside "aaab").
        for (int start = 0; start < text.length(); start++) {
            if (matchEnd(text, start) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the first keyword found in {@code text}: the one with the leftmost start,
     * and the shortest one if several keywords start there.
     *
     * @param text text to scan
     * @return the matched keyword, or {@code null} if the text is {@code null}, empty,
     *         or contains no keyword
     */
    public String getContainsItem(String text) {
        if (text == null || text.length() == 0) {
            return null;
        }
        for (int start = 0; start < text.length(); start++) {
            int end = matchEnd(text, start);
            if (end >= 0) {
                return text.substring(start, end);
            }
        }
        return null;
    }

    /**
     * Collects all non-overlapping keywords in {@code text}, scanning left to right.
     * After a hit the scan resumes right behind the matched keyword.
     *
     * @param text text to scan
     * @return the matched keywords in order of appearance; empty (never {@code null})
     *         when nothing matches or the text is {@code null} or empty
     */
    public List<String> getContainsItem2(String text) {
        List<String> res = new ArrayList<>();
        if (text == null || text.length() == 0) {
            return res;
        }
        int start = 0;
        while (start < text.length()) {
            int end = matchEnd(text, start);
            if (end < 0) {
                start++;
            } else {
                res.add(text.substring(start, end));
                start = end;
            }
        }
        return res;
    }
}

前缀树应用

使用前缀树过滤黑白名单域名，使用已知的白名单的域名关键词构建前缀树，判断url中是否包含域名关键词

public static void main(String[] args) {
        // Build the matcher from the whitelist keyword file, then probe each candidate.
        TrieCheckUtil matcher = TrieCheckUtil.getInstance("src/main/resources/whitesitename.txt");
        String[] candidates = {
                // full URL; the article expects true
                "http://www.chinamoney.com.cn/dqs/cm-s-notice-query/fileDownLoad.do?contentId=1417967&priority=0&mode=open",
                // bare domain; the article expects true
                "chinamoney.com.cn",
                // truncated domain; the article expects false
                "chinamoney.com.c"
        };
        for (String candidate : candidates) {
            System.out.println(matcher.isContains(candidate));
        }
    }
}

也可以直接捕获命中的字符串，通过记录开始index和结束index得到：开始index默认为0，直到当前层查不到该字符时才重置（根节点下也没有该字符则重置为index+1，根节点下存在该字符则重置为index）；结束index为遇到end标志节点时的index，截取子串时取到index+1。

public static void main(String[] args) {
        // Load the keyword file once, then run a yes/no check and a capture query.
        TrieCheckUtil matcher = TrieCheckUtil.getInstance("/home/xinjijian.txt");

        boolean hit = matcher.isContains("我是比特币吗");
        System.out.println(hit);  // the article expects: true

        String captured = matcher.getContainsItem("我区块链2号线");
        System.out.println(captured);  // the article expects: 区块链
    }

也可以找到多个存在的关键字元素：在找到一个关键字元素后，把当前节点重置为root并重置startIndex，即可继续查找后面的关键字。

public static void main(String[] args) {
        // TrieCheckUtil only offers getInstance(String path) and the static addWord(String);
        // there is no TrieTree type and no (name, List) factory, so obtain the singleton
        // from a keyword file and register the demo keywords one by one.
        // NOTE(review): the file path is taken from the previous example - confirm it is the intended source.
        TrieCheckUtil trieTree = TrieCheckUtil.getInstance("/home/xinjijian.txt");
        for (String keyword : Arrays.asList("区块链", "智慧城市", "央行", "是吗")) {
            TrieCheckUtil.addWord(keyword);
        }
        List<String> res2 = trieTree.getContainsItem2("我是央行区块链之王");
        System.out.println(res2);  // the article expects: [央行, 区块链]
    }

时间复杂度分析

设平均查询的query词长n，白名单m条记录，平均长度k,
简单查询：一个query需要遍历每一条白名单，对每条白名单调用contains方法判断query是否包含它；contains方法遍历query，找到与白名单首字符一致的位置后，再继续比较后续字符序列，contains的复杂度按O(n)计，整体复杂度是O(mn)
前缀树查询: 一个query，将这个query从头到尾遍历，每个字符在前缀树中判断，每步操作都是取下一个节点和判断是否是end，单步时间复杂度是O(1)，单趟扫描的整体时间复杂度是O(n)，与白名单条数m无关。注意：单趟扫描在匹配失败时可能漏掉从失败前缀内部开始的关键词；若要保证不漏，需要从每个起点重新匹配，最坏复杂度为O(nk)，仍与m无关。