C++ Program to Find Most Frequent Words in a File

«
»
This C++ Program Finds the Most Frequent Words in a File.

Here is source code of the C++ Program to Find k Most Frequent Words in a File. The C++ program is successfully compiled and run on a Linux system. The program output is also shown below.

  1. /* 
  2.  * C++ Program to Find k Most Frequent Words in a File
  3.  */
  4. #include <iostream>
  5. #include <cstdio>
  6. #include <cstdlib>
  7. #include <cstring>
  8. #include <cctype>
  9. # define MAX_CHARS 26
  10. # define MAX_WORD_SIZE 30
  11. using namespace std; 
  12. /* 
  13.  * Trie Node Declaration
  14.  */
  15. struct TrieNode
  16. {
  17.     bool isEnd;
  18.     unsigned frequency;
  19.     int indexMinHeap;
  20.     TrieNode* child[MAX_CHARS];
  21. };
  22.  
  23. /* 
  24.  * Min Heap Node
  25.  */
  26. struct MinHeapNode
  27. {
  28.     TrieNode* root;
  29.     unsigned frequency;
  30.     char* word;
  31. };
  32.  
  33. /* 
  34.  * Min Heap
  35.  */
  36. struct MinHeap
  37. {
  38.     unsigned capacity;
  39.     int count;
  40.     MinHeapNode* array;
  41. };
  42.  
  43. /* 
  44.  * create a new Trie node
  45.  */
  46. TrieNode* newTrieNode()
  47. {
  48.     TrieNode* trieNode = new TrieNode;
  49.     trieNode->isEnd = 0;
  50.     trieNode->frequency = 0;
  51.     trieNode->indexMinHeap = -1;
  52.     for (int i = 0; i < MAX_CHARS; ++i)
  53.         trieNode->child[i] = NULL;
  54.     return trieNode;
  55. }
  56. /* 
  57.  * create a Min Heap of given capacity
  58.  */ 
  59. MinHeap* createMinHeap(int capacity)
  60. {
  61.     MinHeap* minHeap = new MinHeap;
  62.     minHeap->capacity = capacity;
  63.     minHeap->count  = 0;
  64.     minHeap->array = new MinHeapNode [minHeap->capacity];
  65.     return minHeap;
  66. }
  67. /* 
  68.  * swap two min heap nodes.
  69.  */  
  70. void swapMinHeapNodes (MinHeapNode* a, MinHeapNode* b)
  71. {
  72.     MinHeapNode temp = *a;
  73.     *a = *b;
  74.     *b = temp;
  75. }
  76.  
  77. /* 
  78.  * min heapify
  79.  */
  80. void minHeapify(MinHeap* minHeap, int idx)
  81. {
  82.     int left, right, smallest;
  83.     left = 2 * idx + 1;
  84.     right = 2 * idx + 2;
  85.     smallest = idx;
  86.     if (left < minHeap->count && minHeap->array[left].frequency < minHeap->array[smallest].frequency)
  87.         smallest = left;
  88.     if (right < minHeap->count && minHeap->array[right].frequency < minHeap->array[smallest].frequency)
  89.         smallest = right;
  90.     if (smallest != idx)
  91.     {
  92.         minHeap->array[smallest].root->indexMinHeap = idx;
  93.         minHeap->array[idx].root->indexMinHeap = smallest;
  94.         swapMinHeapNodes (&minHeap->array[smallest], &minHeap->array[idx]);
  95.         minHeapify(minHeap, smallest);
  96.     }
  97. }
  98.  
  99. /* 
  100.  * build a heap
  101.  */
  102. void buildMinHeap(MinHeap* minHeap)
  103. {
  104.     int n, i;
  105.     n = minHeap->count - 1;
  106.     for (i = ( n - 1 ) / 2; i >= 0; --i)
  107.         minHeapify(minHeap, i);
  108. }
  109.  
  110. /* 
  111.  * inserts a word to heap
  112.  */
  113. void insertInMinHeap(MinHeap* minHeap, TrieNode** root, const char* word)
  114. {
  115.     if ((*root)->indexMinHeap != -1)
  116.     {
  117.         ++( minHeap->array[(*root)->indexMinHeap].frequency);
  118.         minHeapify(minHeap, (*root)->indexMinHeap);
  119.     }
  120.     else if (minHeap->count < minHeap->capacity)
  121.     {
  122.         int count = minHeap->count;
  123.         minHeap->array[count].frequency = (*root)->frequency;
  124.         minHeap->array[count].word = new char [strlen( word ) + 1];
  125.         strcpy(minHeap->array[count].word, word);
  126.         minHeap->array[count].root = *root;
  127.         (*root)->indexMinHeap = minHeap->count;
  128.         ++( minHeap->count );
  129.         buildMinHeap( minHeap );
  130.     }
  131.     else if ((*root)->frequency > minHeap->array[0].frequency)
  132.     {
  133.         minHeap->array[0].root->indexMinHeap = -1;
  134.         minHeap->array[0].root = *root;
  135.         minHeap->array[0].root->indexMinHeap = 0;
  136.         minHeap->array[0].frequency = (*root)->frequency;
  137.         delete [] minHeap->array[0].word;
  138.         minHeap->array[0]. word = new char [strlen( word ) + 1];
  139.         strcpy( minHeap->array[0].word, word );
  140.         minHeapify (minHeap, 0);
  141.     }
  142. }
  143. /* 
  144.  * Inserts a new word to both Trie and Heap
  145.  */ 
  146. void insertUtil (TrieNode** root, MinHeap* minHeap, const char* word, const char* dupWord)
  147. {
  148.     if (*root == NULL)
  149.         *root = newTrieNode();
  150.     if (*word != '\0')
  151.         insertUtil (&((*root)->child[tolower( *word ) - 97]), minHeap, word + 1, dupWord);
  152.     else
  153.     {
  154.         if ((*root)->isEnd)
  155.             ++((*root)->frequency);
  156.         else
  157.         {
  158.             (*root)->isEnd = 1;
  159.             (*root)->frequency = 1;
  160.         }
  161.         insertInMinHeap(minHeap, root, dupWord);
  162.     }
  163. }
  164. /* 
  165.  * add a word to Trie & min heap
  166.  */  
  167. void insertTrieAndHeap(const char *word, TrieNode** root, MinHeap* minHeap)
  168. {
  169.     insertUtil(root, minHeap, word, word);
  170. }
  171.  
  172. /* 
  173.  * Display Min Heap
  174.  */
  175. void displayMinHeap(MinHeap* minHeap)
  176. {
  177.     int i;
  178.     for (i = 0; i < minHeap->count; ++i)
  179.     {
  180.        cout<<minHeap->array[i].word<<" : "<<minHeap->array[i].frequency<<endl;
  181.     }
  182. }
  183. /* 
  184.  * takes a file as input, add words to heap and Trie
  185.  */ 
  186. void printKMostFreq(FILE* fp, int k)
  187. {
  188.     MinHeap* minHeap = createMinHeap(k);
  189.     TrieNode* root = NULL;
  190.     char buffer[MAX_WORD_SIZE];
  191.     while (fscanf(fp, "%s", buffer) != EOF)
  192.         insertTrieAndHeap(buffer, &root, minHeap);
  193.     displayMinHeap(minHeap);
  194. }
  195. /* 
  196.  * Main
  197.  */ 
  198. int main()
  199. {
  200.     int k = 5;
  201.     FILE *fp = fopen("file.txt", "r");
  202.     if (fp == NULL)
  203.         printf ("File doesn't exist ");
  204.     else
  205.         printKMostFreq (fp, k);
  206.     return 0;
  207. }

advertisement
$ g++ mosty_frequent.cpp
$ a.out
---------------------file.txt----------------------
Bessie can score a point in the game by picking two of the dots and drawing a straight line between them; however, she is not allowed to draw a line if she has already drawn another line parallel to it. Bessie would like to know her chances of winning, so she has asked you to help find the maximum score she can obtain.
------------------------------------------------------
 
the : 3
line : 3
she : 4
to : 4
a : 3
 
------------------
(program exited with code: 1)
Press return to continue

Sanfoundry Global Education & Learning Series – 1000 C++ Programs.

advertisement
If you wish to look at all C++ Programming examples, go to C++ Programs.

advertisement
advertisement
advertisement
Manish Bhojasia, a technology veteran with 20+ years @ Cisco & Wipro, is Founder and CTO at Sanfoundry. He is Linux Kernel Developer & SAN Architect and is passionate about competency developments in these areas. He lives in Bangalore and delivers focused training sessions to IT professionals in Linux Kernel, Linux Debugging, Linux Device Drivers, Linux Networking, Linux Storage, Advanced C Programming, SAN Storage Technologies, SCSI Internals & Storage Protocols such as iSCSI & Fiber Channel. Stay connected with him @ LinkedIn