Skip to content

Commit

Permalink
Made MinHash a base class and LSHBanding a derived class
Browse files Browse the repository at this point in the history
  • Loading branch information
Deanamic committed Dec 15, 2018
1 parent 65971cc commit a7f6e89
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 60 deletions.
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
CC = g++
CFLAGS = -O2 -DLOCAL -g -fsanitize=undefined,address -Wall -Wshadow -std=c++14
OBJ = bin/main.o bin/parser.o bin/AhoCorasick.o bin/PolyHash.o bin/MinHash.o bin/Jaccard.o bin/JaccardAhoCorasick.o
OBJ = bin/main.o bin/parser.o bin/AhoCorasick.o bin/PolyHash.o bin/MinHash.o bin/Jaccard.o bin/JaccardAhoCorasick.o bin/LSHBanding.o
EXE = bin/LSH.exe

bin/LSH.exe: $(OBJ)
Expand All @@ -21,6 +21,9 @@ bin/PolyHash.o: src/PolyHash.cc inc/PolyHash.h
bin/MinHash.o: src/MinHash.cc inc/MinHash.h inc/PolyHash.h
$(CC) -o bin/MinHash.o -c src/MinHash.cc $(CFLAGS) -I ./inc

bin/LSHBanding.o: src/LSHBanding.cc inc/LSHBanding.h inc/MinHash.h
$(CC) -o bin/LSHBanding.o -c src/LSHBanding.cc $(CFLAGS) -I ./inc

bin/Jaccard.o: src/Jaccard.cc inc/Jaccard.h
$(CC) -o bin/Jaccard.o -c src/Jaccard.cc $(CFLAGS) -I ./inc

Expand Down
16 changes: 16 additions & 0 deletions inc/LSHBanding.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef LSHBANDING_H
#define LSHBANDING_H
#include "MinHash.h"
#include <set>

/**
 * Locality-sensitive hashing by banding, built on a MinHash signature matrix.
 *
 * Derives publicly from MinHash so the base-class interface (for example
 * getJaccard) stays callable on an LSHBanding object; the default access for
 * a `class` base would be private and would hide it from callers.
 */
class LSHBanding : public MinHash {
public:
    /// Forwards both arguments unchanged to the MinHash constructor.
    LSHBanding(vector<vector<string>> shinglesMatrix, int r);

    /// Returns the pairs of document indices reported as similar for
    /// `threshold`; the band width used internally is derived from `threshold`.
    vector <pair<int,int>> getSimilarDocuments(double threshold);

private:
    /// Collects every pair of documents that falls into the same hash bucket
    /// in at least one band of `bandWidth` consecutive signature rows.
    vector<pair<int, int> > getCandidatesLSH(int bandWidth);

    /// Picks the number of signature rows per band for `threshold`
    /// (returns -1 when `threshold` is below 0.1).
    int calculateBandWidth(double threshold);

    /// Hashes signature rows [init, fin) of document `docIdx` through
    /// PolyHash, constructed with `mod` and evaluated at `value`.
    int hashVector(int docIdx, int init, int fin, int value, int mod);
};

#endif
7 changes: 2 additions & 5 deletions inc/MinHash.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,18 @@
#include <vector>
#include <string>
#include <unordered_set>
#include <math.h>
#include "PolyHash.h"
using namespace std;

// Holds a signature matrix for a collection of documents and estimates
// pairwise similarity from it.
class MinHash {
public:
// shinglesMatrix has one entry per document (its size becomes numDoc);
// r is stored as numPerm.
MinHash(vector<vector<string>> shinglesMatrix, int r);
// Picks a band width from `threshold`, collects LSH candidate pairs and
// returns the candidates accepted by the getJaccard check.
vector < pair < int, int > > getSimilarDocuments(double threshold);
// Fraction of the numPerm signature rows on which the two documents hold the
// same value.
double getJaccard(int docIdx1, int docIdx2);
private:
// Members below are protected so that derived classes (LSHBanding reads
// numDoc, numPerm and signatureMatrix) can use them.
protected:
// Number of documents (outer size of the constructor's shinglesMatrix).
int numDoc;
// Number of signature rows (the constructor's r argument).
int numPerm;
// Indexed as signatureMatrix[row][document].
vector<vector<int> > signatureMatrix;
// Pairs of documents that share a hash bucket in at least one band of
// `bandWidth` signature rows.
vector<pair<int, int> > getCandidatesLSH(int bandWidth);
// Number of signature rows per band chosen for `threshold`.
int calculateBandWidth(double threshold);
// PolyHash of signature rows [init, fin) of document docIdx, constructed with
// `mod` and evaluated at `value`.
int hashVector(int docIdx, int init, int fin, int value, int mod);
};

#endif
51 changes: 51 additions & 0 deletions src/LSHBanding.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#include "LSHBanding.h"

// Builds the banding front end. All construction work is delegated to the
// MinHash base-class constructor, which receives both arguments unchanged.
LSHBanding::LSHBanding(vector<vector<string>> shinglesMatrix, int r)
    : MinHash(shinglesMatrix, r)
{
}

// Chooses how many signature rows go into one band for the given threshold.
//
// Returns -1 when `threshold` is below the 0.1 margin. Otherwise returns the
// smallest band width r in [1, numPerm) for which (1/b)^(1/r), with
// b = numPerm / r (integer division), exceeds threshold - 0.1; falls back to
// 1 when no width qualifies.
int LSHBanding::calculateBandWidth(double threshold){
    const double margin = 0.1;
    if (threshold < margin) return -1;

    const double target = threshold - margin;
    int rows = 1;
    while (rows < numPerm) {
        const int bands = numPerm / rows;
        const double estimate = pow(1.0 / double(bands), 1.0 / double(rows));
        if (estimate > target) return rows;
        ++rows;
    }
    return 1;
}

int LSHBanding::hashVector(int docIdx, int init, int fin, int value, int mod){
vector<int> v(fin-init);
for (int i = init; i < fin; i++) v[i-init] = signatureMatrix[i][docIdx];
PolyHash P = PolyHash(v,mod);
return P.evaluate(value);
}

// Collects candidate pairs of documents using banding.
//
// The signature rows are split into numPerm / bandWidth bands of `bandWidth`
// rows each. Within a band every document is hashed into one of 7919 buckets
// (hashVector evaluated at 107 with modulus 7919); a document is paired with
// every document already in its bucket. Pairs are stored as (smaller index,
// larger index), de-duplicated across bands, and returned in sorted order.
vector<pair<int,int>> LSHBanding::getCandidatesLSH(int bandWidth){
    set<pair<int,int>> uniquePairs;
    const int numBands = numPerm / bandWidth;
    for (int band = 0; band < numBands; ++band) {
        // Fresh bucket table for each band.
        vector<vector<int>> buckets(7919);
        for (int doc = 0; doc < numDoc; ++doc) {
            // Assumes hashVector yields an index inside [0, 7919) - TODO confirm.
            const int key = hashVector(doc, band * bandWidth, (band + 1) * bandWidth, 107, 7919);
            vector<int>& bucket = buckets[key];
            for (int other : bucket) {
                uniquePairs.insert({min(other, doc), max(other, doc)});
            }
            bucket.push_back(doc);
        }
    }
    return vector<pair<int,int>>(uniquePairs.begin(), uniquePairs.end());
}

/**
 * Finds the pairs of documents whose estimated Jaccard similarity reaches
 * `threshold`.
 *
 * LSH banding proposes candidate pairs; each candidate is then checked
 * against the signature-based estimate returned by getJaccard().
 *
 * @param threshold minimum estimated similarity for a pair to be reported.
 * @return the accepted candidate pairs, in the order getCandidatesLSH
 *         produced them. Empty when calculateBandWidth rejects the threshold
 *         (it returns -1 for thresholds below 0.1).
 */
vector <pair<int,int>> LSHBanding::getSimilarDocuments(double threshold){
    const int bandWidth = calculateBandWidth(threshold);
    // A non-positive band width is the "unusable threshold" sentinel; no
    // banding is possible, so report nothing instead of dividing by it.
    if (bandWidth <= 0) return {};

    vector <pair<int,int>> similar;
    for (const auto& candidate : getCandidatesLSH(bandWidth)) {
        // Compare against the threshold. Testing the double for truthiness
        // would accept every candidate with a non-zero estimate.
        if (getJaccard(candidate.first, candidate.second) >= threshold) {
            similar.push_back(candidate);
        }
    }
    return similar;
}

53 changes: 1 addition & 52 deletions src/MinHash.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
#include "MinHash.h"
#include "PolyHash.h"
#include <iostream>
#include <set>
#include <math.h>

MinHash::MinHash(vector<vector<string>> shinglesMatrix, int r) : numDoc(shinglesMatrix.size()), numPerm(r){
vector<unordered_set<string>> foundShingles(numDoc);
vector<string> shingles;
Expand All @@ -28,57 +25,9 @@ MinHash::MinHash(vector<vector<string>> shinglesMatrix, int r) : numDoc(shingles
}
}

// Chooses how many signature rows go into one band for the given threshold.
//
// When `threshold` is below the 0.1 margin a message is printed and 1 is
// returned. Otherwise returns the smallest band width r in [1, numPerm) for
// which (1/b)^(1/r), with b = numPerm / r (integer division), exceeds
// threshold - 0.1; falls back to 1 when no width qualifies.
int MinHash::calculateBandWidth(double threshold){
    const double margin = 0.1;
    if (threshold < margin) {
        cout << "threshold too low" << endl;
        return 1;
    }

    const double target = threshold - margin;
    for (int rows = 1; rows < numPerm; ++rows) {
        const double base = 1.0 / double(numPerm / rows);
        const double exponent = 1.0 / double(rows);
        if (pow(base, exponent) > target) return rows;
    }
    return 1;
}

/**
 * Finds the pairs of documents whose estimated Jaccard similarity reaches
 * `threshold`.
 *
 * Prints the chosen band width, gathers LSH candidate pairs for it, and keeps
 * the candidates whose getJaccard() estimate is at least `threshold`.
 *
 * @param threshold minimum estimated similarity for a pair to be reported.
 * @return the accepted candidate pairs, in the order getCandidatesLSH
 *         produced them.
 */
vector < pair < int , int > > MinHash::getSimilarDocuments(double threshold){
    const int bandWidth = calculateBandWidth(threshold);
    cout << "BandWidth = " << bandWidth << endl;

    vector < pair < int, int > > similar;
    for (const auto& candidate : getCandidatesLSH(bandWidth)) {
        // Compare against the threshold. Testing the double for truthiness
        // would accept every candidate with a non-zero estimate.
        if (getJaccard(candidate.first, candidate.second) >= threshold) {
            similar.push_back(candidate);
        }
    }
    return similar;
}

double MinHash::getJaccard(int docIdx1, int docIdx2) {
int cnt = 0;
for(int i = 0; i < numPerm; ++i) if(signatureMatrix[i][docIdx1] == signatureMatrix[i][docIdx2]) ++cnt;
return cnt/double(numPerm);
}

int MinHash::hashVector(int docIdx, int init, int fin, int value, int mod){
vector<int> v(fin-init);
for (int i = init; i < fin; i++) v[i-init] = signatureMatrix[i][docIdx];
PolyHash P = PolyHash(v,mod);
return P.evaluate(value);
}

// Collects candidate pairs of documents using banding.
//
// The signature rows are split into numPerm / bandWidth bands of `bandWidth`
// rows each. Within a band every document is hashed into one of 7919 buckets
// (hashVector evaluated at 107 with modulus 7919); a document is paired with
// every document already in its bucket. Pairs are stored as (smaller index,
// larger index), de-duplicated across bands, and returned in sorted order.
vector<pair<int, int> > MinHash::getCandidatesLSH(int bandWidth){
    set < pair < int , int > > uniquePairs;
    const int numBands = numPerm / bandWidth;
    for (int band = 0; band < numBands; ++band) {
        // Fresh bucket table for each band.
        vector < vector < int > > buckets(7919);
        for (int doc = 0; doc < numDoc; ++doc) {
            // Assumes hashVector yields an index inside [0, 7919) - TODO confirm.
            const int key = hashVector(doc, band * bandWidth, (band + 1) * bandWidth, 107, 7919);
            vector<int>& bucket = buckets[key];
            for (int other : bucket) {
                uniquePairs.insert({min(other, doc), max(other, doc)});
            }
            bucket.push_back(doc);
        }
    }
    return vector < pair < int , int > >(uniquePairs.begin(), uniquePairs.end());
}
7 changes: 5 additions & 2 deletions src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "JaccardAhoCorasick.h"
#include "MinHash.h"
#include "Jaccard.h"
#include "LSHBanding.h"
using namespace std;

int main(int argc, char *argv[]) {
Expand All @@ -24,12 +25,14 @@ int main(int argc, char *argv[]) {
string s2 = doc2.getDocument();

JaccardAhoCorasick JAC({v1,v2},{s1,s2});
MinHash M({v1,v2}, 10000);
MinHash M({v1,v2}, 5000);
LSHBanding LSH({v1,v2},5000);
Jaccard J({v1,v2});
cout << JAC.getJaccard(0,1) << endl;
cout << J.getJaccard(0,1) << endl;
cout << M.getJaccard(0,1) << endl;
double threshold = 0.18;
auto candidates = M.getSimilarDocuments(threshold);
auto candidates = LSH.getSimilarDocuments(threshold);
cout << "List of documents more similar than " << threshold << endl;
for (pair<int,int>& p : candidates){
cout << p .first << " " << p.second << ' ' << M.getJaccard(p.first, p.second) << endl;
Expand Down

0 comments on commit a7f6e89

Please sign in to comment.