Commit

Various hacks to deal with the need to search very deep into raw_match ordered list
dpwe committed Oct 26, 2014
1 parent cc7f132 commit ec42a4b
Showing 4 changed files with 89 additions and 70 deletions.
audfprint.py: 2 changes (1 addition & 1 deletion)
@@ -249,7 +249,7 @@ def setup_matcher(args):
     """Create a new matcher objects, set parameters from docopt structure"""
     matcher = audfprint_match.Matcher()
     matcher.window = int(args['--match-win'])
-    matcher.threshold = int(args['--min-count'])
+    matcher.threshcount = int(args['--min-count'])
     matcher.max_returns = int(args['--max-matches'])
     matcher.search_depth = int(args['--search-depth'])
     matcher.sort_by_time = args['--sortbytime']
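The one-line rename matters because match_hashes now reads self.threshcount rather than self.threshold. A minimal sketch of the same wiring, with a hand-built dict standing in for the docopt args structure (the values shown are hypothetical):

    import audfprint_match

    # hypothetical stand-in for the docopt args structure
    args = {'--match-win': '2', '--min-count': '20', '--max-matches': '1',
            '--search-depth': '10000', '--sortbytime': False}

    matcher = audfprint_match.Matcher()
    matcher.window = int(args['--match-win'])
    matcher.threshcount = int(args['--min-count'])  # was matcher.threshold
    matcher.max_returns = int(args['--max-matches'])
    matcher.search_depth = int(args['--search-depth'])
    matcher.sort_by_time = args['--sortbytime']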
audfprint_match.py: 131 changes (70 additions & 61 deletions)
@@ -14,21 +14,15 @@
 import audfprint_analyze
 import matplotlib.pyplot as plt

+from scipy import stats
+
 def log(message):
     """ log info with stats """
-    print time.ctime(), \
-        "physmem=", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, \
-        "utime=", resource.getrusage(resource.RUSAGE_SELF).ru_utime, \
-        message
-
-def find_mode(data, window=0):
-    """ Find the (mode, count) of a set of data
-        including a tolerance window +/- window if > 0
-    """
-    vals = np.unique(data)
-    counts = [len([x for x in data if abs(x-val) <= window]) for val in vals]
-    bestix = np.argmax(counts)
-    return (vals[bestix], counts[bestix])
+    #print time.ctime(), \
+    #    "physmem=", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, \
+    #    "utime=", resource.getrusage(resource.RUSAGE_SELF).ru_utime, \
+    #    message
+    pass

 def locmax(vec, indices=False):
     """ Return a boolean vector of which points in vec are local maxima.
@@ -53,19 +47,14 @@ def find_modes(data, threshold=5, window=0):
         pairs for every mode greater than or equal to threshold.
         Only local maxima in counts are returned.
     """
-    vals = np.unique(data)
-    #counts = [len([x for x in data if abs(x-val) <= window]) for val in vals]
-    #counts = np.array([np.sum(np.abs(data - val) <= window) for val in vals])
-    counts = np.sum(np.abs(np.subtract.outer(vals, data)) <= window, axis=1)
-    # Put them into an actual vector
-    minval = min(vals)
-    fullvector = np.zeros(max(vals-minval)+1)
-    fullvector[vals-minval] = counts
+    # TODO: Ignores window at present
+    datamin = np.amin(data)
+    fullvector = np.bincount(data - datamin)
     # Find local maxima
-    localmaxes = np.nonzero(locmax(fullvector) &
-                            (fullvector >= threshold))[0].tolist()
-    return [(localmax+minval, fullvector[localmax]) for localmax in localmaxes]
+    localmaxes = np.nonzero(np.logical_and(locmax(fullvector),
+                                           np.greater_equal(fullvector,
+                                                            threshold)))[0]
+    return localmaxes + datamin, fullvector[localmaxes]

 class Matcher(object):
     """Provide matching for audfprint fingerprint queries to hash table"""
@@ -86,6 +75,8 @@ def __init__(self):
         self.verbose = False
         # Do illustration?
         self.illustrate = False
+        # Careful counts?
+        self.exact_count = False

     def match_hashes(self, ht, hashes, hashesfor=None):
         """ Match audio against fingerprint hash table.
@@ -94,9 +85,9 @@ def match_hashes(self, ht, hashes, hashesfor=None):
             hit (0=top hit).
         """
         # find the implicated id, time pairs from hash table
-        #log("nhashes=%d" % np.shape(hashes)[0])
+        log("nhashes=%d" % np.shape(hashes)[0])
         hits = ht.get_hits(hashes)
-        #log("nhits=%d" % np.shape(hits)[0])
+        log("nhits=%d" % np.shape(hits)[0])
         ## Sorted list of all the track ids that got hits
         #idlist = np.r_[-1, sorted([id for id, time, hash, otime in hits]), -1]
         ## Counts of unique entries in the sorted list
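Note that this hunk uncomments the log() calls while the first hunk stubs log() out to pass, so the calls now pay only for formatting their message and print nothing. For reference, the logger these calls used to reach, reconstructed verbatim from the commented-out body (the resource module is Unix-only):

    import resource
    import time

    def log(message):
        """ log info with stats """
        print time.ctime(), \
            "physmem=", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss, \
            "utime=", resource.getrusage(resource.RUSAGE_SELF).ru_utime, \
            message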
@@ -123,48 +114,66 @@ def match_hashes(self, ht, hashes, hashesfor=None):
         #counts = np.sum(np.equal.outer(ids, allids), axis=1)
         # much faster, and doesn't explode memory
         counts = np.bincount(allids)[ids]
-        #log("max(counts)=%d" % np.amax(counts))
+        log("max(counts)=%d" % np.amax(counts))

         # Find all the actual hits for a the most popular ids
         bestcountsids = sorted(zip(counts, ids), reverse=True)
+        maxdepth = np.minimum(np.count_nonzero(np.greater(counts, self.threshcount)), self.search_depth)
         # Try the top N results
         results = []
-        for rawcount, tid in bestcountsids[:self.search_depth]:
-            #modescounts = find_modes([time for (id, time, hash, otime) in hits
-            #                          if id == tid],
-            #                         window=window, threshold=threshcount)
-            modescounts = find_modes(alltimes[np.nonzero(allids == tid)[0]],
-                                     window=self.window,
-                                     threshold=self.threshcount)
-            for (mode, filtcount) in modescounts:
-                #matchhashes = [((otime), hash)
-                #               for (id, time, hash, otime) in hits
-                #               if id == tid and abs(time - mode) <= window]
-                ## matchhashes may include repeats because multiple
-                ## ref hashes may match a single query hash under window.
-                ## Uniqify:
-                #matchhashes = sorted(list(set(matchhashes)))
-                matchix = np.nonzero((allids == tid) &
-                                     (np.abs(alltimes-mode) <= self.window))[0]
-                matchhasheshash = np.unique(allotimes[matchix]
-                                            + maxotime*allhashes[matchix])
-                matchhashes = [(hash_ % maxotime, hash_ / maxotime)
-                               for hash_ in matchhasheshash]
-                # much, much faster
-                filtcount = len(matchhashes)
-                results.append((tid, filtcount, mode, rawcount, matchhashes))
+        if not self.exact_count:
+            mintime = np.amin(alltimes)
+            alltimes -= mintime
+            for rawcount, tid in bestcountsids[:maxdepth]:
+                tidtimes = alltimes[allids==tid]
+                if np.amax(np.bincount(tidtimes)) <= self.threshcount:
+                    continue
+                #mode, count = stats.mode(tidtimes)
+                #mode = int(mode[0])
+                modes, counts = find_modes(tidtimes, self.threshcount)
+                if len(modes):
+                    mode = modes[np.nonzero(np.equal(counts,
+                                                     np.amax(counts)))[0][0]]
+                    count = np.count_nonzero(np.less_equal(
+                        np.abs(tidtimes - mode), self.window))
+                    log("tid %d raw %d count %d" % (tid, rawcount, count))
+                    if count > self.threshcount:
+                        results.append((tid, count, mode+mintime, rawcount))
+                        if count > rawcount/4:
+                            break
+        else:
+            for rawcount, tid in bestcountsids[:maxdepth]:
+                modes, counts = find_modes(alltimes[np.nonzero(allids == tid)[0]],
+                                           window=self.window,
+                                           threshold=self.threshcount)
+                for (mode, filtcount) in zip(modes, counts):
+                    # matchhashes may include repeats because multiple
+                    # ref hashes may match a single query hash under window.
+                    # Uniqify:
+                    #matchhashes = sorted(list(set(matchhashes)))
+                    matchix = np.nonzero((allids == tid) &
+                                         (np.abs(alltimes-mode) <= self.window))[0]
+                    matchhasheshash = np.unique(allotimes[matchix]
+                                                + maxotime*allhashes[matchix])
+                    matchhashes = [(hash_ % maxotime, hash_ / maxotime)
+                                   for hash_ in matchhasheshash]
+                    # much, much faster
+                    filtcount = len(matchhashes)
+                    if filtcount >= self.threshcount:
+                        if hashesfor is not None:
+                            results.append((tid, filtcount, mode, rawcount, matchhashes))
+                        else:
+                            results.append((tid, filtcount, mode, rawcount))

         results = sorted(results, key=lambda x: x[1], reverse=True)
-        # Make sure again to return only those meeting threshcount (needed??)
-        shortresults = [(tid, filtcnt, mode, rawcount)
-                        for (tid, filtcnt, mode,
-                             rawcount, matchhashes) in results
-                        if filtcnt >= self.threshcount]
-
-        if hashesfor is not None:
-            return shortresults, results[hashesfor][4]
+        if hashesfor is None:
+            return results
         else:
-            return shortresults
+            hashesforhashes = results[hashesfor][4]
+            results = [(tid, filtcnt, mode, rawcount)
+                       for (tid, filtcnt, mode,
+                            rawcount, matchhashes) in results]
+            return results, results[hashesfor][4]

     def match_file(self, analyzer, ht, filename, number=None):
         """ Read in an audio file, calculate its landmarks, query against
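The core of the commit is the new default path in match_hashes (exact_count=False): cap the search depth at the number of track ids whose raw hit count clears threshcount, score each candidate with a cheap bincount mode over its time skews, and break out early once a candidate's filtered count exceeds a quarter of its raw count. A stripped-down sketch of that control flow over parallel (id, time-skew) arrays; the function name and signature here are illustrative, not the module's API:

    import numpy as np

    def best_match(allids, alltimes, threshcount=5, window=1, search_depth=100):
        # per-id raw vote counts in one bincount, not an outer equality test
        ids = np.unique(allids)
        counts = np.bincount(allids)[ids]
        bestcountsids = sorted(zip(counts, ids), reverse=True)
        # only descend as far as there are ids that could pass the threshold
        maxdepth = min(np.count_nonzero(counts > threshcount), search_depth)
        results = []
        for rawcount, tid in bestcountsids[:maxdepth]:
            tidtimes = alltimes[allids == tid]
            timehist = np.bincount(tidtimes - tidtimes.min())
            if timehist.max() <= threshcount:
                continue  # no time skew is popular enough: skip cheaply
            mode = timehist.argmax() + tidtimes.min()
            count = np.count_nonzero(np.abs(tidtimes - mode) <= window)
            if count > threshcount:
                results.append((tid, count, mode, rawcount))
                if count > rawcount / 4:
                    break  # good enough: stop searching deeper
        return sorted(results, key=lambda r: r[1], reverse=True)

    allids = np.array([0] * 4 + [1] * 8)
    alltimes = np.array([5, 9, 2, 7, 3, 3, 3, 3, 3, 3, 8, 1])
    print(best_match(allids, alltimes, threshcount=4))  # tid 1: count 6 at skew 3, raw 8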
config.txt: 2 changes (2 additions & 0 deletions)
@@ -11,4 +11,6 @@ ncores: 4
 [dpwe_matcher]
 density: 70
 fanout: 8
+search_depth: 10000
+min_count: 20
 ncores: 4
dpwe_matcher.py: 24 changes (16 additions & 8 deletions)
@@ -46,8 +46,10 @@
 resultFile = sys.argv[3]

 # Default params
-defaults = {'density': "70",
-            'fanout': "8",
+defaults = {'density': "70",
+            'fanout': "8",
+            'search_depth': "10000",
+            'min_count': "20",
             'ncores': "4"}

 # Parse input file
@@ -62,18 +64,24 @@

 density = config.getint(section, 'density')
 fanout = config.getint(section, 'fanout')
+search_depth = config.getint(section, 'search_depth')
+min_count = config.getint(section, 'min_count')
 ncores = config.getint(section, 'ncores')

-print sys.argv[0], "density:", density, "fanout:", fanout, "ncores:", ncores
+print sys.argv[0], "density:", density, "fanout:", fanout, \
+    "search_depth", search_depth, "min_count", min_count, \
+    "ncores:", ncores

 # Run the command
-argv = ["audfprint", "match",
-        "-d", os.path.join(dir4db, "data.fpdb"),
-        "--density", str(density),
+argv = ["audfprint", "match",
+        "-d", os.path.join(dir4db, "data.fpdb"),
+        "--density", str(density),
         "--fanout", str(fanout),
+        "--search-depth", str(search_depth),
+        "--min-count", str(min_count),
         "--ncores", str(ncores),
-        "--verbose", 0,
-        "--opfile", resultFile,
+        "--verbose", 0,
+        "--opfile", resultFile,
         "--list", fileList4query]

 # Run audfprint
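The two new knobs flow from config.txt through ConfigParser into audfprint CLI flags, with the defaults dict covering older config files that lack the keys. A minimal sketch of that round trip (Python 2, like the script; how the script actually constructs its parser is collapsed out of this diff, so the constructor call here is an assumption):

    import ConfigParser  # Python 2 module name

    defaults = {'density': "70", 'fanout': "8",
                'search_depth': "10000", 'min_count': "20", 'ncores': "4"}
    # assumed construction: seeding the parser with defaults makes getint()
    # fall back to them when config.txt omits a key
    config = ConfigParser.ConfigParser(defaults)
    config.read("config.txt")

    section = 'dpwe_matcher'
    search_depth = config.getint(section, 'search_depth')
    min_count = config.getint(section, 'min_count')

    argv = ["audfprint", "match",
            "--search-depth", str(search_depth),
            "--min-count", str(min_count)]
    print " ".join(argv)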
