Java Program to Implement Min Hash

«
»
This is a java program to implement Min Hash. In computer science, MinHash (or the min-wise independent permutations locality sensitive hashing scheme) is a technique for quickly estimating how similar two sets are.

Here is the source code of the Java Program to Implement Min Hash. The Java program is successfully compiled and run on a Windows system. The program output is also shown below.

  1.  
  2. package com.sanfoundry.datastructures;
  3.  
  4. import java.util.HashMap;
  5. import java.util.HashSet;
  6. import java.util.Map;
  7. import java.util.Random;
  8. import java.util.Set;
  9.  
  10. public class MinHash<T>
  11. {
  12.     private int hash[];
  13.     private int numHash;
  14.  
  15.     public MinHash(int numHash)
  16.     {
  17.         this.numHash = numHash;
  18.         hash = new int[numHash];
  19.         Random r = new Random(11);
  20.         for (int i = 0; i < numHash; i++)
  21.         {
  22.             int a = (int) r.nextInt();
  23.             int b = (int) r.nextInt();
  24.             int c = (int) r.nextInt();
  25.             int x = hash(a * b * c, a, b, c);
  26.             hash[i] = x;
  27.         }
  28.     }
  29.  
  30.     public double similarity(Set<T> set1, Set<T> set2)
  31.     {
  32.         int numSets = 2;
  33.         Map<T, boolean[]> bitMap = buildBitMap(set1, set2);
  34.         int[][] minHashValues = initializeHashBuckets(numSets, numHash);
  35.         computeMinHashForSet(set1, 0, minHashValues, bitMap);
  36.         computeMinHashForSet(set2, 1, minHashValues, bitMap);
  37.         return computeSimilarityFromSignatures(minHashValues, numHash);
  38.     }
  39.  
  40.     private static int[][] initializeHashBuckets(int numSets,
  41.             int numHashFunctions)
  42.     {
  43.         int[][] minHashValues = new int[numSets][numHashFunctions];
  44.         for (int i = 0; i < numSets; i++)
  45.         {
  46.             for (int j = 0; j < numHashFunctions; j++)
  47.             {
  48.                 minHashValues[i][j] = Integer.MAX_VALUE;
  49.             }
  50.         }
  51.         return minHashValues;
  52.     }
  53.  
  54.     private static double computeSimilarityFromSignatures(
  55.             int[][] minHashValues, int numHashFunctions)
  56.     {
  57.         int identicalMinHashes = 0;
  58.         for (int i = 0; i < numHashFunctions; i++)
  59.         {
  60.             if (minHashValues[0][i] == minHashValues[1][i])
  61.             {
  62.                 identicalMinHashes++;
  63.             }
  64.         }
  65.         return (1.0 * identicalMinHashes) / numHashFunctions;
  66.     }
  67.  
  68.     private static int hash(int x, int a, int b, int c)
  69.     {
  70.         int hashValue = (int) ((a * (x >> 4) + b * x + c) & 131071);
  71.         return Math.abs(hashValue);
  72.     }
  73.  
  74.     private void computeMinHashForSet(Set<T> set, int setIndex,
  75.             int[][] minHashValues, Map<T, boolean[]> bitArray)
  76.     {
  77.         int index = 0;
  78.         for (T element : bitArray.keySet())
  79.         {
  80.             /*
  81.              * for every element in the bit array
  82.              */
  83.             for (int i = 0; i < numHash; i++)
  84.             {
  85.                 /*
  86.                  * for every hash
  87.                  */
  88.                 if (set.contains(element))
  89.                 {
  90.                     /*
  91.                      * if the set contains the element
  92.                      */
  93.                     int hindex = hash[index];
  94.                     if (hindex < minHashValues[setIndex][index])
  95.                     {
  96.                         /*
  97.                          * if current hash is smaller than the existing hash in
  98.                          * the slot then replace with the smaller hash value
  99.                          */
  100.                         minHashValues[setIndex][i] = hindex;
  101.                     }
  102.                 }
  103.             }
  104.             index++;
  105.         }
  106.     }
  107.  
  108.     public Map<T, boolean[]> buildBitMap(Set<T> set1, Set<T> set2)
  109.     {
  110.         Map<T, boolean[]> bitArray = new HashMap<T, boolean[]>();
  111.         for (T t : set1)
  112.         {
  113.             bitArray.put(t, new boolean[] { true, false });
  114.         }
  115.         for (T t : set2)
  116.         {
  117.             if (bitArray.containsKey(t))
  118.             {
  119.                 // item is not present in set1
  120.                 bitArray.put(t, new boolean[] { true, true });
  121.             }
  122.             else if (!bitArray.containsKey(t))
  123.             {
  124.                 // item is not present in set1
  125.                 bitArray.put(t, new boolean[] { false, true });
  126.             }
  127.         }
  128.         return bitArray;
  129.     }
  130.  
  131.     public static void main(String[] args)
  132.     {
  133.         Set<String> set1 = new HashSet<String>();
  134.         set1.add("FRANCISCO");
  135.         set1.add("MISSION");
  136.         set1.add("SAN");
  137.         Set<String> set2 = new HashSet<String>();
  138.         set2.add("FRANCISCO");
  139.         set2.add("MISSION");
  140.         set2.add("SAN");
  141.         set2.add("USA");
  142.         MinHash<String> minHash = new MinHash<String>(set1.size() + set2.size());
  143.         System.out.println("Set1 : " + set1);
  144.         System.out.println("Set2 : " + set2);
  145.         System.out.println("Similarity between two sets: "
  146.                 + minHash.similarity(set1, set2));
  147.     }
  148. }

Output:

advertisement
$ javac MinHash.java
$ java MinHash
 
Set1 : [SAN, MISSION, FRANCISCO]
Set2 : [SAN, USA, MISSION, FRANCISCO]
Similarity between two sets: 1.0

Sanfoundry Global Education & Learning Series – 1000 Java Programs.

Sanfoundry Certification Contest of the Month is Live. 100+ Subjects. Participate Now!
advertisement
advertisement

Here’s the list of Best Books in Java Programming, Data Structures and Algorithms.

advertisement
advertisement
Subscribe to our Newsletters (Subject-wise). Participate in the Sanfoundry Certification contest to get free Certificate of Merit. Join our social networks below and stay updated with latest contests, videos, internships and jobs!

Youtube | Telegram | LinkedIn | Instagram | Facebook | Twitter | Pinterest
Manish Bhojasia - Founder & CTO at Sanfoundry
Manish Bhojasia, a technology veteran with 20+ years @ Cisco & Wipro, is Founder and CTO at Sanfoundry. He lives in Bangalore, and focuses on development of Linux Kernel, SAN Technologies, Advanced C, Data Structures & Alogrithms. Stay connected with him at LinkedIn.

Subscribe to his free Masterclasses at Youtube & technical discussions at Telegram SanfoundryClasses.