C++ Program to Implement Aho-Corasick Algorithm for Pattern Searching

n computer science, the Aho–Corasick string matching algorithm is a string searching algorithm, it is a kind of dictionary-matching algorithm that locates elements of a finite set of strings (the “dictionary”) within an input text. It matches all patterns simultaneously. The complexity of the algorithm is linear in the length of the patterns plus the length of the searched text plus the number of output matches. Note that because all matches are found, there can be a quadratic number of matches if every substring matches (e.g. dictionary = a, aa, aaa, aaaa and input string is aaaa).

Here is source code of the C++ Program to Implement Aho-Corasick Algorithm for String Matching. The C++ program is successfully compiled and run on a Linux system. The program output is also shown below.

  1. using namespace std;
  2. #include <algorithm>
  3. #include <iostream>
  4. #include <iterator>
  5. #include <numeric>
  6. #include <sstream>
  7. #include <fstream>
  8. #include <cassert>
  9. #include <climits>
  10. #include <cstdlib>
  11. #include <cstring>
  12. #include <string>
  13. #include <cstdio>
  14. #include <vector>
  15. #include <cmath>
  16. #include <queue>
  17. #include <deque>
  18. #include <stack>
  19. #include <list>
  20. #include <map>
  21. #include <set>
  22.  
  23. #define foreach(x, v) for (typeof (v).begin() x=(v).begin(); x !=(v).end(); ++x)
  24. #define For(i, a, b) for (int i=(a); i<(b); ++i)
  25. #define D(x) cout << #x " is " << x << endl
  26.  
  27. const int MAXS = 6 * 50 + 10; // Max number of states in the matching machine.
  28. // Should be equal to the sum of the length of all keywords.
  29.  
  30. const int MAXC = 26; // Number of characters in the alphabet.
  31.  
  32. int out[MAXS]; // Output for each state, as a bitwise mask.
  33. int f[MAXS]; // Failure function
  34. int g[MAXS][MAXC]; // Goto function, or -1 if fail.
  35.  
  36. int buildMatchingMachine(const vector<string> &words, char lowestChar = 'a',
  37.         char highestChar = 'z')
  38. {
  39.     memset(out, 0, sizeof out);
  40.     memset(f, -1, sizeof f);
  41.     memset(g, -1, sizeof g);
  42.     int states = 1; // Initially, we just have the 0 state
  43.     for (int i = 0; i < words.size(); ++i)
  44.     {
  45.         const string &keyword = words[i];
  46.         int currentState = 0;
  47.         for (int j = 0; j < keyword.size(); ++j)
  48.         {
  49.             int c = keyword[j] - lowestChar;
  50.             if (g[currentState][c] == -1)
  51.             { // Allocate a new node
  52.                 g[currentState][c] = states++;
  53.             }
  54.             currentState = g[currentState][c];
  55.         }
  56.         out[currentState] |= (1 << i); // There's a match of keywords[i] at node currentState.
  57.     }
  58.     // State 0 should have an outgoing edge for all characters.
  59.     for (int c = 0; c < MAXC; ++c)
  60.     {
  61.         if (g[0][c] == -1)
  62.         {
  63.             g[0][c] = 0;
  64.         }
  65.     }
  66.  
  67.     // Now, let's build the failure function
  68.     queue<int> q;
  69.     for (int c = 0; c <= highestChar - lowestChar; ++c)
  70.     { // Iterate over every possible input
  71.     // All nodes s of depth 1 have f[s] = 0
  72.         if (g[0][c] != -1 and g[0][c] != 0)
  73.         {
  74.             f[g[0][c]] = 0;
  75.             q.push(g[0][c]);
  76.         }
  77.     }
  78.     while (q.size())
  79.     {
  80.         int state = q.front();
  81.         q.pop();
  82.         for (int c = 0; c <= highestChar - lowestChar; ++c)
  83.         {
  84.             if (g[state][c] != -1)
  85.             {
  86.                 int failure = f[state];
  87.                 while (g[failure][c] == -1)
  88.                 {
  89.                     failure = f[failure];
  90.                 }
  91.                 failure = g[failure][c];
  92.                 f[g[state][c]] = failure;
  93.                 out[g[state][c]] |= out[failure]; // Merge out values
  94.                 q.push(g[state][c]);
  95.             }
  96.         }
  97.     }
  98.  
  99.     return states;
  100. }
  101. int findNextState(int currentState, char nextInput, char lowestChar = 'a')
  102. {
  103.     int answer = currentState;
  104.     int c = nextInput - lowestChar;
  105.     while (g[answer][c] == -1)
  106.         answer = f[answer];
  107.     return g[answer][c];
  108. }
  109.  
  110. int main()
  111. {
  112.     vector<string> keywords;
  113.     keywords.push_back("he");
  114.     keywords.push_back("she");
  115.     keywords.push_back("hers");
  116.     keywords.push_back("his");
  117.     string text = "ahishers";
  118.     buildMatchingMachine(keywords, 'a', 'z');
  119.     int currentState = 0;
  120.     for (int i = 0; i < text.size(); ++i)
  121.     {
  122.         currentState = findNextState(currentState, text[i], 'a');
  123.         if (out[currentState] == 0)
  124.             continue; // Nothing new, let's move on to the next character.
  125.         for (int j = 0; j < keywords.size(); ++j)
  126.         {
  127.             if (out[currentState] & (1 << j))
  128.             { // Matched keywords[j]
  129.                 cout << "Keyword " << keywords[j] << " appears from " << i
  130.                         - keywords[j].size() + 1 << " to " << i << endl;
  131.             }
  132.         }
  133.     }
  134.     return 0;
  135. }

Output:

$ g++ Aho–Corasick.cpp
$ a.out
 
Keyword his appears from 1 to 3
Keyword he appears from 4 to 5
Keyword she appears from 3 to 5
Keyword hers appears from 4 to 7
 
------------------
(program exited with code: 0)
Press return to continue

Sanfoundry Global Education & Learning Series – 1000 C++ Programs.

advertisement
advertisement

Here’s the list of Best Books in C++ Programming, Data Structures and Algorithms.

If you find any mistake above, kindly email to [email protected]

advertisement
advertisement
Subscribe to our Newsletters (Subject-wise). Participate in the Sanfoundry Certification contest to get free Certificate of Merit. Join our social networks below and stay updated with latest contests, videos, internships and jobs!

Youtube | Telegram | LinkedIn | Instagram | Facebook | Twitter | Pinterest
Manish Bhojasia - Founder & CTO at Sanfoundry
Manish Bhojasia, a technology veteran with 20+ years @ Cisco & Wipro, is Founder and CTO at Sanfoundry. He lives in Bangalore, and focuses on development of Linux Kernel, SAN Technologies, Advanced C, Data Structures & Alogrithms. Stay connected with him at LinkedIn.

Subscribe to his free Masterclasses at Youtube & discussions at Telegram SanfoundryClasses.