AC自动机及多模式匹配

2016-05-20  本文已影响804人  胡哈哈哈
////////////////////////////////////////////////////////////////////////////////
/*
1 const vector<string> &patterns: several pattern strings;
2 const string &s: original strings;
3 vector<string> &answer: the patterns which are matched in the original strings;
4 return the number of patterns which are matched.
*/
////////////////////////////////////////////////////////////////////////////////
#include <iostream>
#include <vector>
#include <queue>
#include <string>
#include <unordered_set>
#define ALPH_NUM 26
using namespace std;

struct trieTreeNode {
    vector<trieTreeNode*> next;
    bool mark;
    trieTreeNode *fail;
    trieTreeNode(): next(26, nullptr), mark(false), fail(nullptr) {}
};

trieTreeNode *createAcAutomation(const vector<string> &patterns);
int findPatterns(vector<string> &answer, trieTreeNode *root, const string &s);
void makeFoundPatterns(vector<string> &answer, unordered_set<trieTreeNode*> &save, trieTreeNode *root, string pattern);
inline char turn_char(int index);
int multiPatternsMatchingByAcAutomation(const vector<string> &patterns, const string &s, vector<string> &answer);

trieTreeNode *createAcAutomation(const vector<string> &patterns) {
    trieTreeNode *root = new trieTreeNode(), *ptr, *cur;
    for (int i = 0; i != patterns.size(); ++i) {
        cur = root;
        for (int k = 0; k != patterns[i].size(); ++k) {
            int index = patterns[i][k] - 'a';
            if (!cur->next[index])
                cur->next[index] = new trieTreeNode();
            cur = cur->next[index];
        }
        cur->mark = true;
    }
    queue<trieTreeNode*> makeFail;
    makeFail.push(root);
    while (!makeFail.empty()) {
        cur = makeFail.front(); makeFail.pop();
        for (int i = 0; i != ALPH_NUM; ++i) {
            if (cur->next[i]) {
                for (ptr = cur->fail; ptr && !ptr->next[i]; ptr = ptr->fail);
                cur->next[i]->fail = ptr ? ptr->next[i] : root;
                makeFail.push(cur->next[i]);
            }
        }
    }
    return root;
}

int findPatterns(vector<string> &answer, trieTreeNode *root, const string &s) {
    int count = 0;
    string pattern;
    unordered_set<trieTreeNode*> save;
    trieTreeNode *cur = root;
    for (int i = 0; i != s.size(); ) {
        int index = s[i] - 'a';
        if (!cur) {
            cur = root; ++i;
        }
        else if (cur->next[index]) {
            cur = cur->next[index];
            if (cur->mark) {
                ++count;
                save.insert(cur);
            }
            ++i;
        }
        else {
            cur = cur->fail;
            if (cur && cur->mark) {
                ++count;
                save.insert(cur);
            }
        }
    }
    makeFoundPatterns(answer, save, root, pattern);
    return count;
}

void makeFoundPatterns(vector<string> &answer, unordered_set<trieTreeNode*> &save,
    trieTreeNode *root, string pattern) {
    unordered_set<trieTreeNode*>::iterator it = save.find(root);
    if (it != save.end())
        answer.push_back(pattern);
    for (int i = 0; i != ALPH_NUM; ++i) {
        if (root->next[i]) {
            string t(pattern);
            t.push_back(turn_char(i));
            makeFoundPatterns(answer, save, root->next[i], t);
        }
    }
}

inline char turn_char(int index) {
    return 'a' + index;
}

int multiPatternsMatchingByAcAutomation(const vector<string> &patterns, const string &s, vector<string> &answer) {
    trieTreeNode *root = createAcAutomation(patterns);
    return findPatterns(answer, root, s);
}

上一篇下一篇

猜你喜欢

热点阅读