This C++ Program Finds the Most Frequent Words in a File.
Here is source code of the C++ Program to Find k Most Frequent Words in a File. The C++ program is successfully compiled and run on a Linux system. The program output is also shown below.
/*
* C++ Program to Find k Most Frequent Words in a File
*/
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cctype>
# define MAX_CHARS 26
# define MAX_WORD_SIZE 30
using namespace std;
/*
* Trie Node Declaration
*/
struct TrieNode
{
bool isEnd;
unsigned frequency;
int indexMinHeap;
TrieNode* child[MAX_CHARS];
};
/*
* Min Heap Node
*/
struct MinHeapNode
{
TrieNode* root;
unsigned frequency;
char* word;
};
/*
* Min Heap
*/
struct MinHeap
{
unsigned capacity;
int count;
MinHeapNode* array;
};
/*
* create a new Trie node
*/
TrieNode* newTrieNode()
{
TrieNode* trieNode = new TrieNode;
trieNode->isEnd = 0;
trieNode->frequency = 0;
trieNode->indexMinHeap = -1;
for (int i = 0; i < MAX_CHARS; ++i)
trieNode->child[i] = NULL;
return trieNode;
}
/*
* create a Min Heap of given capacity
*/
MinHeap* createMinHeap(int capacity)
{
MinHeap* minHeap = new MinHeap;
minHeap->capacity = capacity;
minHeap->count = 0;
minHeap->array = new MinHeapNode [minHeap->capacity];
return minHeap;
}
/*
* swap two min heap nodes.
*/
void swapMinHeapNodes (MinHeapNode* a, MinHeapNode* b)
{
MinHeapNode temp = *a;
*a = *b;
*b = temp;
}
/*
* min heapify
*/
void minHeapify(MinHeap* minHeap, int idx)
{
int left, right, smallest;
left = 2 * idx + 1;
right = 2 * idx + 2;
smallest = idx;
if (left < minHeap->count && minHeap->array[left].frequency < minHeap->array[smallest].frequency)
smallest = left;
if (right < minHeap->count && minHeap->array[right].frequency < minHeap->array[smallest].frequency)
smallest = right;
if (smallest != idx)
{
minHeap->array[smallest].root->indexMinHeap = idx;
minHeap->array[idx].root->indexMinHeap = smallest;
swapMinHeapNodes (&minHeap->array[smallest], &minHeap->array[idx]);
minHeapify(minHeap, smallest);
}
}
/*
* build a heap
*/
void buildMinHeap(MinHeap* minHeap)
{
int n, i;
n = minHeap->count - 1;
for (i = ( n - 1 ) / 2; i >= 0; --i)
minHeapify(minHeap, i);
}
/*
* inserts a word to heap
*/
void insertInMinHeap(MinHeap* minHeap, TrieNode** root, const char* word)
{
if ((*root)->indexMinHeap != -1)
{
++( minHeap->array[(*root)->indexMinHeap].frequency);
minHeapify(minHeap, (*root)->indexMinHeap);
}
else if (minHeap->count < minHeap->capacity)
{
int count = minHeap->count;
minHeap->array[count].frequency = (*root)->frequency;
minHeap->array[count].word = new char [strlen( word ) + 1];
strcpy(minHeap->array[count].word, word);
minHeap->array[count].root = *root;
(*root)->indexMinHeap = minHeap->count;
++( minHeap->count );
buildMinHeap( minHeap );
}
else if ((*root)->frequency > minHeap->array[0].frequency)
{
minHeap->array[0].root->indexMinHeap = -1;
minHeap->array[0].root = *root;
minHeap->array[0].root->indexMinHeap = 0;
minHeap->array[0].frequency = (*root)->frequency;
delete [] minHeap->array[0].word;
minHeap->array[0]. word = new char [strlen( word ) + 1];
strcpy( minHeap->array[0].word, word );
minHeapify (minHeap, 0);
}
}
/*
* Inserts a new word to both Trie and Heap
*/
void insertUtil (TrieNode** root, MinHeap* minHeap, const char* word, const char* dupWord)
{
if (*root == NULL)
*root = newTrieNode();
if (*word != '\0')
insertUtil (&((*root)->child[tolower( *word ) - 97]), minHeap, word + 1, dupWord);
else
{
if ((*root)->isEnd)
++((*root)->frequency);
else
{
(*root)->isEnd = 1;
(*root)->frequency = 1;
}
insertInMinHeap(minHeap, root, dupWord);
}
}
/*
* add a word to Trie & min heap
*/
void insertTrieAndHeap(const char *word, TrieNode** root, MinHeap* minHeap)
{
insertUtil(root, minHeap, word, word);
}
/*
* Display Min Heap
*/
void displayMinHeap(MinHeap* minHeap)
{
int i;
for (i = 0; i < minHeap->count; ++i)
{
cout<<minHeap->array[i].word<<" : "<<minHeap->array[i].frequency<<endl;
}
}
/*
* takes a file as input, add words to heap and Trie
*/
void printKMostFreq(FILE* fp, int k)
{
MinHeap* minHeap = createMinHeap(k);
TrieNode* root = NULL;
char buffer[MAX_WORD_SIZE];
while (fscanf(fp, "%s", buffer) != EOF)
insertTrieAndHeap(buffer, &root, minHeap);
displayMinHeap(minHeap);
}
/*
* Main
*/
int main()
{
int k = 5;
FILE *fp = fopen("file.txt", "r");
if (fp == NULL)
printf ("File doesn't exist ");
else
printKMostFreq (fp, k);
return 0;
}
$ g++ mosty_frequent.cpp $ a.out ---------------------file.txt---------------------- Bessie can score a point in the game by picking two of the dots and drawing a straight line between them; however, she is not allowed to draw a line if she has already drawn another line parallel to it. Bessie would like to know her chances of winning, so she has asked you to help find the maximum score she can obtain. ------------------------------------------------------ the : 3 line : 3 she : 4 to : 4 a : 3 ------------------ (program exited with code: 1) Press return to continue
Sanfoundry Global Education & Learning Series – 1000 C++ Programs.
Sanfoundry Certification Contest of the Month is Live. 100+ Subjects. Participate Now!
advertisement
advertisement
If you wish to look at all C++ Programming examples, go to C++ Programs.
Next Steps:
- Get Free Certificate of Merit in C++ Programming
- Participate in C++ Programming Certification Contest
- Become a Top Ranker in C++ Programming
- Take C++ Programming Tests
- Chapterwise Practice Tests: Chapter 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
- Chapterwise Mock Tests: Chapter 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
Related Posts:
- Practice Computer Science MCQs
- Apply for Information Technology Internship
- Buy Programming Books
- Apply for C++ Internship
- Practice Programming MCQs