Skip to content

Commit

Permalink
flake8 canonical.py
Browse files Browse the repository at this point in the history
  • Loading branch information
fgregg committed Feb 23, 2018
1 parent ccfe965 commit ee35612
Showing 1 changed file with 32 additions and 24 deletions.
56 changes: 32 additions & 24 deletions dedupe/canonical.py
Original file line number Diff line number Diff line change
@@ -1,63 +1,71 @@
import numpy
from affinegap import normalizedAffineGapDistance as comparator


def getCentroid(attribute_variants, comparator):
    """
    Takes in a list of attribute values for a field,
    evaluates the centroid using the comparator,
    & returns the centroid (i.e. the 'best' value for the field)

    The centroid is the value whose average distance to every value in
    the list (itself included, at distance 0) is smallest.

    attribute_variants -- non-empty sequence of values for one field
    comparator -- callable taking two values and returning a numeric
                  distance; it is only evaluated once per unordered
                  pair, so it is treated as symmetric

    Raises ValueError if attribute_variants is empty.
    """
    n = len(attribute_variants)
    if n == 0:
        # Without this guard numpy fails later with an opaque
        # "zero-size array" ValueError; fail early and clearly instead.
        raise ValueError(
            'getCentroid requires at least one attribute value')

    distance_matrix = numpy.zeros([n, n])

    # populate distance matrix by looping through elements of the lower
    # matrix triangle and mirroring each distance; the diagonal stays 0
    for i in range(n):
        for j in range(i):
            distance = comparator(attribute_variants[i],
                                  attribute_variants[j])
            distance_matrix[i, j] = distance_matrix[j, i] = distance

    average_distance = distance_matrix.mean(0)

    # there can be ties for minimum, average distance string
    min_dist_indices = numpy.where(
        average_distance == average_distance.min())[0]

    if len(min_dist_indices) > 1:
        centroid = breakCentroidTie(attribute_variants, min_dist_indices)
    else:
        centroid_index = min_dist_indices[0]
        centroid = attribute_variants[centroid_index]

    return centroid


def breakCentroidTie(attribute_variants, min_dist_indices):
    """
    Finds centroid when there are multiple values w/ min avg distance
    (e.g. any dupe cluster of 2) right now this selects the first
    among a set of ties, but can be modified to break ties in strings
    by selecting the longest string

    attribute_variants -- sequence of candidate values for one field
    min_dist_indices -- indices into attribute_variants of the tied
                        candidates; must contain at least one index

    Returns the value at the first tied index.
    """
    return attribute_variants[min_dist_indices[0]]


def getCanonicalRep(record_cluster):
    """
    Given a list of records within a duplicate cluster, constructs a
    canonical representation of the cluster by finding canonical
    values for each field

    record_cluster -- list of dict-like records; the field names are
                      taken from the first record, and every record is
                      indexed by each of those names

    Returns a dict mapping each field name to its canonical value, or
    to '' when no record has a non-empty value for that field. An
    empty cluster yields an empty dict.
    """
    canonical_rep = {}

    # nothing to summarise; avoids an IndexError on record_cluster[0]
    if not record_cluster:
        return canonical_rep

    keys = record_cluster[0].keys()

    for key in keys:
        # assume non-empty values always better than empty value
        # for canonical record
        key_values = [record[key]
                      for record in record_cluster
                      if record[key]]
        if key_values:
            canonical_rep[key] = getCentroid(key_values, comparator)
        else:
            canonical_rep[key] = ''

    return canonical_rep

0 comments on commit ee35612

Please sign in to comment.