Aho-Corasick Algorithm

Description

The Aho-Corasick algorithm is used for efficiently searching for multiple patterns in a given text. It constructs a finite state machine (trie) that allows simultaneous search for all patterns in linear time relative to the size of the patterns and the text.

C++ Code

#include <iostream>
#include <queue>
#include <unordered_map>
#include <vector>
using namespace std;

struct TrieNode {
    unordered_map<char, TrieNode*> children;
    TrieNode* fail;
    vector<int> output;

    TrieNode() : fail(nullptr) {}
    ~TrieNode() {
        for (auto& kv : children) {
            delete kv.second;
        }
    }
};

class AhoCorasick {
private:
    TrieNode* root;

public:
    AhoCorasick() {
        root = new TrieNode();
    }

    ~AhoCorasick() {
        delete root;
    }

    void insert(string pattern, int index) {
        TrieNode* current = root;
        for (char c : pattern) {
            if (!current->children[c]) {
                current->children[c] = new TrieNode();
            }
            current = current->children[c];
        }
        current->output.push_back(index);
    }

    void buildAutomaton() {
        queue<TrieNode*> q;
        for (auto& kv : root->children) {
            kv.second->fail = root;
            q.push(kv.second);
        }

        while (!q.empty()) {
            TrieNode* current = q.front();
            q.pop();

            for (auto& kv : current->children) {
                char c = kv.first;
                TrieNode* child = kv.second;
                q.push(child);

                TrieNode* fail = current->fail;
                while (fail != root && !fail->children.count(c)) {
                    fail = fail->fail;
                }
                child->fail = fail->children[c] ? fail->children[c] : root;
                child->output.insert(child->output.end(), child->fail->output.begin(), child->fail->output.end());
            }
        }
    }

    void searchPatterns(string text) {
        TrieNode* current = root;
        for (int i = 0; i < text.length(); ++i) {
            while (current != root && !current->children.count(text[i])) {
                current = current->fail;
            }
            if (current->children.count(text[i])) {
                current = current->children[text[i]];
            } else {
                current = root;
            }
            if (!current->output.empty()) {
                for (int patternIndex : current->output) {
                    cout << "Pattern found at index " << i - patternIndex << ": " << patternIndex << endl;
                }
            }
        }
    }
};

int main() {
    AhoCorasick ac;

    vector<string> patterns = {"he", "she", "his", "hers"};
    for (int i = 0; i < patterns.size(); ++i) {
        ac.insert(patterns[i], i);
    }

    ac.buildAutomaton();

    string text = "ahishers";
    ac.searchPatterns(text);

    return 0;
}

Time and Space Complexity

Operation Time Complexity Space Complexity
Aho-Corasick Algorithm O(n + m + z) O(m)

Where:

  • n: Length of the text to be searched.
  • m: Total length of all patterns.
  • z: Number of occurrences of patterns in the text.

← Back to Homepage