Algorithms

后缀数组

2018-03-22  本文已影响0人  null12

一、定义

对于长度为n的文本串T[1...n],T的后缀是指从第i个字符开始到T的末尾所形成的子串T[i...n],1 ≤ i ≤ n ;

1-1-1 字符串的后缀

后缀数组就是对文本串T的所有后缀排序后得到的字符串数组。
特点:原字符串中的任意子串都是后缀数组中的某个后缀字符串的前缀

1-1-2 字符串的后缀数组

API定义:

1-1-3 后缀数组的API定义

二、实现

2.1 数据结构定义

由于实际应用中,文本字符串很大(对于长度为N的文本字符串,其后缀数组大小为N),如果将文本的每个后缀字符串都直接保存到后缀数组中,会占用大量空间,所以可以仅保存每个后缀数组在原文本串中的起始索引。

后缀数组定义:

public class SuffixArray {
    private Suffix[] suffixes;   
    public SuffixArray(String text) {
        int n = text.length();
        this.suffixes = new Suffix[n];
        for (int i = 0; i < n; i++)
            suffixes[i] = new Suffix(text, i);
        Arrays.sort(suffixes);
    }
 
    /**
     * 内部类,主要起优化作用,避免保存大量后缀字符串
     */
    private static class Suffix implements Comparable<Suffix> {
        private final String text; // 原文本串
        private final int index;   // 在原文本串中的起始索引
 
        private Suffix(String text, int index) {
            this.text = text;
            this.index = index;
    }
 
        private int length() {
            return text.length() - index;
        }
 
        // 返回当前后缀字符串的第i个字符
        private char charAt(int i) {
            return text.charAt(index + i);
        }
 
        public int compareTo(Suffix that) {
            if (this == that)
                return 0;
            int n = Math.min(this.length(), that.length());
            for (int i = 0; i < n; i++) {
                if (this.charAt(i) < that.charAt(i))
                    return -1;
                else if (this.charAt(i) > that.charAt(i))
                    return +1;
            }
            return this.length() - that.length();
        }
 
        public String toString() {
            return text.substring(index);
        }
    }
    
    public int length() {
        return suffixes.length;
    }
}

2.2 API实现

/**
 * 返回后缀字符串suffixes[i]在原文本串中的起始索引
 */
public int index(int i) {
    if (i < 0 || i >= suffixes.length)
        throw new IllegalArgumentException();
    return suffixes[i].index;
}
 
/**
 * 返回后缀字符串suffixes[i]和suffixes[i-1]的最大公共前缀长度
 */
public int lcp(int i) {
    if (i < 1 || i >= suffixes.length)
        throw new IllegalArgumentException();
    Suffix s = suffixes[i];
    Suffix t = suffixes[i - 1];
    int n = Math.min(s.length(), t.length());
    for (int i = 0; i < n; i++) {
        if (s.charAt(i) != t.charAt(i))
            return i;
    }
    return n;
}
 
/**
 * 返回后缀字符串suffixes[i]
 */
public String select(int i) {
    if (i < 0 || i >= suffixes.length)
        throw new IllegalArgumentException();
    return suffixes[i].toString();
}
 
/**
 * 返回小于键key的后缀数量
 */
public int rank(String key) {
    int lo = 0, hi = suffixes.length - 1;
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        int cmp = compare(key, suffixes[mid]);
        if (cmp < 0)
            hi = mid - 1;
        else if (cmp > 0)
            lo = mid + 1;
        else
            return mid;
    }
    return lo;
}

2.3 完整源码

public class SuffixArray {
    private Suffix[] suffixes;

    public SuffixArray(String text) {
        int n = text.length();
        this.suffixes = new Suffix[n];
        for (int i = 0; i < n; i++)
            suffixes[i] = new Suffix(text, i);
        Arrays.sort(suffixes);
    }

    private static class Suffix implements Comparable<Suffix> {
        private final String text;
        private final int index;

        private Suffix(String text, int index) {
            this.text = text;
            this.index = index;
        }
        private int length() {
            return text.length() - index;
        }
        private char charAt(int i) {
            return text.charAt(index + i);
        }

        public int compareTo(Suffix that) {
            if (this == that) return 0;  // optimization
            int n = Math.min(this.length(), that.length());
            for (int i = 0; i < n; i++) {
                if (this.charAt(i) < that.charAt(i)) return -1;
                if (this.charAt(i) > that.charAt(i)) return +1;
            }
            return this.length() - that.length();
        }

        public String toString() {
            return text.substring(index);
        }
    }

    /**
     * Returns the length of the input string.
     * @return the length of the input string
     */
    public int length() {
        return suffixes.length;
    }

    /**
     * Returns the index into the original string of the <em>i</em>th smallest suffix.
     * That is, {@code text.substring(sa.index(i))} is the <em>i</em>th smallest suffix.
     * @param i an integer between 0 and <em>n</em>-1
     * @return the index into the original string of the <em>i</em>th smallest suffix
     * @throws java.lang.IllegalArgumentException unless {@code 0 <= i < n}
     */
    public int index(int i) {
        if (i < 0 || i >= suffixes.length) throw new IllegalArgumentException();
        return suffixes[i].index;
    }


    /**
     * Returns the length of the longest common prefix of the <em>i</em>th
     * smallest suffix and the <em>i</em>-1st smallest suffix.
     * @param i an integer between 1 and <em>n</em>-1
     * @return the length of the longest common prefix of the <em>i</em>th
     * smallest suffix and the <em>i</em>-1st smallest suffix.
     * @throws java.lang.IllegalArgumentException unless {@code 1 <= i < n}
     */
    public int lcp(int i) {
        if (i < 1 || i >= suffixes.length) throw new IllegalArgumentException();
        return lcpSuffix(suffixes[i], suffixes[i-1]);
    }

    // longest common prefix of s and t
    private static int lcpSuffix(Suffix s, Suffix t) {
        int n = Math.min(s.length(), t.length());
        for (int i = 0; i < n; i++) {
            if (s.charAt(i) != t.charAt(i)) return i;
        }
        return n;
    }

    /**
     * Returns the <em>i</em>th smallest suffix as a string.
     * @param i the index
     * @return the <em>i</em> smallest suffix as a string
     * @throws java.lang.IllegalArgumentException unless {@code 0 <= i < n}
     */
    public String select(int i) {
        if (i < 0 || i >= suffixes.length) throw new IllegalArgumentException();
        return suffixes[i].toString();
    }

    /**
     * Returns the number of suffixes strictly less than the {@code query} string.
     * We note that {@code rank(select(i))} equals {@code i} for each {@code i}
     * between 0 and <em>n</em>-1.
     * @param query the query string
     * @return the number of suffixes strictly less than {@code query}
     */
    public int rank(String query) {
        int lo = 0, hi = suffixes.length - 1;
        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;
            int cmp = compare(query, suffixes[mid]);
            if (cmp < 0) hi = mid - 1;
            else if (cmp > 0) lo = mid + 1;
            else return mid;
        }
        return lo;
    }

    // compare query string to suffix
    private static int compare(String query, Suffix suffix) {
        int n = Math.min(query.length(), suffix.length());
        for (int i = 0; i < n; i++) {
            if (query.charAt(i) < suffix.charAt(i)) return -1;
            if (query.charAt(i) > suffix.charAt(i)) return +1;
        }
        return query.length() - suffix.length();
    }

    /**
     * Unit tests the {@code SuffixArray} data type.
     *
     * @param args the command-line arguments
     */
    public static void main(String[] args) {
        String s = StdIn.readAll().replaceAll("\\s+", " ").trim();
        SuffixArray suffix = new SuffixArray(s);

        // StdOut.println("rank(" + args[0] + ") = " + suffix.rank(args[0]));

        StdOut.println("  i ind lcp rnk select");
        StdOut.println("---------------------------");

        for (int i = 0; i < s.length(); i++) {
            int index = suffix.index(i);
            String ith = "\"" + s.substring(index, Math.min(index + 50, s.length())) + "\"";
            assert s.substring(index).equals(suffix.select(i));
            int rank = suffix.rank(s.substring(index));
            if (i == 0) {
                StdOut.printf("%3d %3d %3s %3d %s\n", i, index, "-", rank, ith);
            }
            else {
                int lcp = suffix.lcp(i);
                StdOut.printf("%3d %3d %3d %3d %s\n", i, index, lcp, rank, ith);
            }
        }
    }
}

三、性能优化

后缀数组在构造时,性能消耗主要在于构造时的后缀字符串排序。最坏情况下,源文本字符串的所有字符都相同,此时排序所需时间会达到平方级别,因此可以采用三向字符串快速排序的方法进行优化。

优化有两方面:

  1. 不再保存实际的后缀数组对象,而是用一个索引数组index[i]表示,这样做可以节省空间
  2. 将构造时的排序方法替换为“三向字符串快速排序”,这种排序方法对文本中出现大量重复字符的情况有较好的性能(提升至线性)。

关键源码:

public class SuffixArrayX {
    private static final int CUTOFF = 5; // cutoff to insertion sort
    private final char[] text;   // 保存文本字符串
    private final int[] index;   // index[i] = j表示排序后第i个后缀字符串为text.substring(j)
    private final int n;         // number of characters in text
 
    public SuffixArrayX(String text) {
        n = text.length();
        text = text + '\0';
        this.text = text.toCharArray();
        this.index = new int[n];
        for (int i = 0; i < n; i++)
            index[i] = i;
 
        // 对后缀字符串数组text[0..n-1],text[1..n-1],...,text[n-1..n-1]进行三向字符串快速排序
        // text[index[i]...n-1]表示当前排在第i位的后缀字符串
        sort(0, n - 1, 0);
    }
 
    /**
     * 三向字符串快速排序
     * @param lo 起始后缀字符串text[index[lo]...N-1]
     * @param hi 结束后缀字符串text[index[hi]...N-1]
     * @param d  待排序字符位置
     */
    private void sort(int lo, int hi, int d) {
        // 对于小数组,切换为插入排序
        if (hi <= lo + CUTOFF) {
            insertion(lo, hi, d);
            return;
        }
 
        int lt = lo, gt = hi;
        char v = text[index[lo] + d];
        int i = lo + 1;
        while (i <= gt) {
            char t = text[index[i] + d];
            if (t < v)
                exch(lt++, i++);
            else if (t > v)
                exch(i, gt--);
            else
                i++;
        }
        // a[lo..lt-1] < v = a[lt..gt] < a[gt+1..hi].
        sort(lo, lt - 1, d);
        if (v > 0)
            sort(lt, gt, d + 1);
        sort(gt + 1, hi, d);
    }
}

完整的优化后的后缀数组源码:

public class SuffixArrayX {
    private static final int CUTOFF = 5; // cutoff to insertion sort
    private final char[] text; // 保存文本字符串
    private final int[] index; // index[i] = j表示排序后第i个后缀字符串为text.substring(j)
    private final int n; // number of characters in text

    public SuffixArrayX(String text) {
        n = text.length();
        text = text + '\0';
        this.text = text.toCharArray();
        this.index = new int[n];
        for (int i = 0; i < n; i++)
            index[i] = i;

        // 对后缀字符串数组text[0...n-1],text[1...n-1],...,text[n-1...n-1]进行三向字符串快速排序
        // text[index[i]...n-1]表示当前排在第i位的后缀字符串
        sort(0, n - 1, 0);
    }

    /**
     * 三向字符串快速排序
     * 
     * @param lo
     *            起始后缀字符串text[index[lo]...N-1]
     * @param hi
     *            结束后缀字符串text[index[hi]...N-1]
     * @param d
     *            待排序字符位置
     */
    private void sort(int lo, int hi, int d) {
        // 对于小数组,切换为插入排序
        if (hi <= lo + CUTOFF) {
            insertion(lo, hi, d);
            return;
        }

        int lt = lo, gt = hi;
        char v = text[index[lo] + d];
        int i = lo + 1;
        while (i <= gt) {
            char t = text[index[i] + d];
            if (t < v)
                exch(lt++, i++);
            else if (t > v)
                exch(i, gt--);
            else
                i++;
        }

        // a[lo..lt-1] < v = a[lt..gt] < a[gt+1..hi].
        sort(lo, lt - 1, d);
        if (v > 0)
            sort(lt, gt, d + 1);
        sort(gt + 1, hi, d);
    }

    // sort from a[lo] to a[hi], starting at the dth character
    private void insertion(int lo, int hi, int d) {
        for (int i = lo; i <= hi; i++)
            for (int j = i; j > lo && less(index[j], index[j - 1], d); j--)
                exch(j, j - 1);
    }

    // is text[i+d..n) < text[j+d..n) ?
    private boolean less(int i, int j, int d) {
        if (i == j)
            return false;
        i = i + d;
        j = j + d;
        while (i < n && j < n) {
            if (text[i] < text[j])
                return true;
            if (text[i] > text[j])
                return false;
            i++;
            j++;
        }
        return i > j;
    }

    private void exch(int i, int j) {
        int swap = index[i];
        index[i] = index[j];
        index[j] = swap;
    }

    public int length() {
        return n;
    }

    /**
     * Returns the index into the original string of the <em>i</em>th smallest
     * suffix. That is, {@code text.substring(sa.index(i))} is the <em>i</em>
     * smallest suffix.
     * 
     * @param i
     *            an integer between 0 and <em>n</em>-1
     * @return the index into the original string of the <em>i</em>th smallest
     *         suffix
     * @throws java.lang.IllegalArgumentException
     *             unless {@code 0 <=i < n}
     */
    public int index(int i) {
        if (i < 0 || i >= n)
            throw new IllegalArgumentException();
        return index[i];
    }

    /**
     * Returns the length of the longest common prefix of the <em>i</em>th
     * smallest suffix and the <em>i</em>-1st smallest suffix.
     * 
     * @param i
     *            an integer between 1 and <em>n</em>-1
     * @return the length of the longest common prefix of the <em>i</em>th
     *         smallest suffix and the <em>i</em>-1st smallest suffix.
     * @throws java.lang.IllegalArgumentException
     *             unless {@code 1 <= i < n}
     */
    public int lcp(int i) {
        if (i < 1 || i >= n)
            throw new IllegalArgumentException();
        return lcp(index[i], index[i - 1]);
    }

    // longest common prefix of text[i..n) and text[j..n)
    private int lcp(int i, int j) {
        int length = 0;
        while (i < n && j < n) {
            if (text[i] != text[j])
                return length;
            i++;
            j++;
            length++;
        }
        return length;
    }

    /**
     * Returns the <em>i</em>th smallest suffix as a string.
     * 
     * @param i
     *            the index
     * @return the <em>i</em> smallest suffix as a string
     * @throws java.lang.IllegalArgumentException
     *             unless {@code 0 <= i < n}
     */
    public String select(int i) {
        if (i < 0 || i >= n)
            throw new IllegalArgumentException();
        return new String(text, index[i], n - index[i]);
    }

    /**
     * Returns the number of suffixes strictly less than the {@code query}
     * string. We note that {@code rank(select(i))} equals {@code i} for each
     * {@code i} between 0 and <em>n</em>-1.
     * 
     * @param query
     *            the query string
     * @return the number of suffixes strictly less than {@code query}
     */
    public int rank(String query) {
        int lo = 0, hi = n - 1;
        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;
            int cmp = compare(query, index[mid]);
            if (cmp < 0)
                hi = mid - 1;
            else if (cmp > 0)
                lo = mid + 1;
            else
                return mid;
        }
        return lo;
    }

    // is query < text[i..n) ?
    private int compare(String query, int i) {
        int m = query.length();
        int j = 0;
        while (i < n && j < m) {
            if (query.charAt(j) != text[i])
                return query.charAt(j) - text[i];
            i++;
            j++;

        }
        if (i < n)
            return -1;
        if (j < m)
            return +1;
        return 0;
    }

    /**
     * Unit tests the {@code SuffixArrayx} data type.
     *
     * @param args
     *            the command-line arguments
     */
    public static void main(String[] args) {
        String s = StdIn.readAll().replaceAll("\n", " ").trim();
        SuffixArrayX suffix1 = new SuffixArrayX(s);
        SuffixArray suffix2 = new SuffixArray(s);
        boolean check = true;
        for (int i = 0; check && i < s.length(); i++) {
            if (suffix1.index(i) != suffix2.index(i)) {
                StdOut.println("suffix1(" + i + ") = " + suffix1.index(i));
                StdOut.println("suffix2(" + i + ") = " + suffix2.index(i));
                String ith = "\"" + s.substring(suffix1.index(i), Math.min(suffix1.index(i) + 50, s.length())) + "\"";
                String jth = "\"" + s.substring(suffix2.index(i), Math.min(suffix2.index(i) + 50, s.length())) + "\"";
                StdOut.println(ith);
                StdOut.println(jth);
                check = false;
            }
        }

        StdOut.println("  i ind lcp rnk  select");
        StdOut.println("---------------------------");

        for (int i = 0; i < s.length(); i++) {
            int index = suffix2.index(i);
            String ith = "\"" + s.substring(index, Math.min(index + 50, s.length())) + "\"";
            int rank = suffix2.rank(s.substring(index));
            assert s.substring(index).equals(suffix2.select(i));
            if (i == 0) {
                StdOut.printf("%3d %3d %3s %3d  %s\n", i, index, "-", rank, ith);
            } else {
                // int lcp = suffix.lcp(suffix2.index(i), suffix2.index(i-1));
                int lcp = suffix2.lcp(i);
                StdOut.printf("%3d %3d %3d %3d  %s\n", i, index, lcp, rank, ith);
            }
        }
    }
}

四、性能分析

对于长度为N的文本串,构造后缀数组:

上一篇 下一篇

猜你喜欢

热点阅读