Skip to content

Commit

Permalink
BAEL-980 lsh code (eugenp#2047)
Browse files Browse the repository at this point in the history
* BAEL-980 lsh code

* BAEL-980 rename test
  • Loading branch information
tomekl007 authored and KevinGilmore committed Jun 18, 2017
1 parent 4ca15c9 commit 0144471
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 0 deletions.
7 changes: 7 additions & 0 deletions libraries/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,12 @@
<artifactId>opennlp-tools</artifactId>
<version>1.8.0</version>
</dependency>
<dependency>
<groupId>info.debatty</groupId>
<artifactId>java-lsh</artifactId>
<version>${java-lsh.version}</version>
</dependency>


</dependencies>
<properties>
Expand All @@ -371,6 +377,7 @@
<serenity.plugin.version>1.4.0</serenity.plugin.version>
<jUnitParams.version>1.1.0</jUnitParams.version>
<netty.version>4.1.10.Final</netty.version>
<java-lsh.version>0.10</java-lsh.version>
</properties>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package com.baeldung.lsh;

import info.debatty.java.lsh.LSHMinHash;
import org.junit.Ignore;
import org.junit.Test;

import java.util.Arrays;

import static org.assertj.core.api.Assertions.assertThat;


/**
 * Example test showing how {@link LSHMinHash} assigns boolean vectors to buckets.
 *
 * <p>Three five-element vectors are hashed with the same {@code LSHMinHash} instance and
 * the entries at the last stage index of the resulting arrays are compared.
 */
public class LocalSensitiveHashingUnitTest {

    /**
     * Hashes one all-{@code true} vector and two mostly-{@code false} vectors, then checks
     * that the all-{@code true} vector's last-stage value differs from both of the others,
     * while the two mostly-{@code false} vectors end up with values that are "close"
     * according to {@link #isCloseOrEqual(int, int, int)}.
     *
     * <p>The test is ignored: as the annotation message states, the input is so small
     * that the outcome may vary between runs.
     */
    @Ignore("for simplicity of the example number of input vectors is very low, that's why LSH may yield non deterministic results")
    @Test
    public void givenNVectors_whenPerformLSH_thenShouldCalculateSameHashForSimilarVectors() {
        // given
        final int stageCount = 4;
        final int bucketCount = 10;
        final int vectorLength = 5;

        final boolean[] allSet = {true, true, true, true, true};
        final boolean[] oneBitSet = {false, false, false, true, false};
        final boolean[] twoBitsSet = {false, false, true, true, false};

        final LSHMinHash minHash = new LSHMinHash(stageCount, bucketCount, vectorLength);

        // when
        final int[] hashOfAllSet = minHash.hash(allSet);
        final int[] hashOfOneBit = minHash.hash(oneBitSet);
        final int[] hashOfTwoBits = minHash.hash(twoBitsSet);

        // Dump the raw hash arrays so the example output can be inspected by hand.
        System.out.println(Arrays.toString(hashOfAllSet));
        System.out.println(Arrays.toString(hashOfOneBit));
        System.out.println(Arrays.toString(hashOfTwoBits));

        // then
        // Only the entry at the final stage index is compared.
        final int finalStage = stageCount - 1;
        final int bucketOfAllSet = hashOfAllSet[finalStage];
        final int bucketOfOneBit = hashOfOneBit[finalStage];
        final int bucketOfTwoBits = hashOfTwoBits[finalStage];

        assertThat(bucketOfAllSet).isNotEqualTo(bucketOfOneBit);
        assertThat(bucketOfAllSet).isNotEqualTo(bucketOfTwoBits);
        assertThat(isCloseOrEqual(bucketOfOneBit, bucketOfTwoBits, bucketCount)).isTrue();
    }

    /**
     * Tells whether two values lie less than half of {@code numberOfBuckets} apart.
     *
     * @param left            first value to compare
     * @param right           second value to compare
     * @param numberOfBuckets total bucket count; half of it (integer division) is the
     *                        exclusive upper bound for the allowed distance
     * @return {@code true} when the absolute difference is strictly below the bound
     */
    private boolean isCloseOrEqual(int left, int right, int numberOfBuckets) {
        final int distance = Math.abs(left - right);
        final int exclusiveLimit = numberOfBuckets / 2;
        return distance < exclusiveLimit;
    }
}

0 comments on commit 0144471

Please sign in to comment.