Skip to content

Commit

Permalink
implemented audfprint_match, see dotest block therein
Browse files Browse the repository at this point in the history
  • Loading branch information
dpwe committed May 26, 2014
1 parent 5194126 commit eefa119
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 14 deletions.
50 changes: 41 additions & 9 deletions audfprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,19 +192,51 @@ def landmarks2hashes(landmarks):
| (dtime & dtmask)) )
for time, bin1, bin2, dtime in landmarks ]

import hash_table

# NOTE(review): leftover debug flag -- it is re-assigned to False further
# down before the inline test block, so this assignment is dead.
test = True
def ingest(ht, filename):
    """ Read an audio file and add it to the database
    :params:
      ht : HashTable object
        the hash table to add to
      filename : str
        name of the soundfile to add
    :returns:
      dur : float
        the duration of the track
      nhashes : int
        the number of hashes it mapped into
    """
    # Fingerprinting operates at a fixed low sample rate.
    target_sr = 11025
    waveform, sr = librosa.load(filename, sr=target_sr)
    # peaks -> landmarks -> packed hashes, then store under this filename
    peaks = find_peaks(waveform, sr)
    hashes = landmarks2hashes(peaks2landmarks(peaks))
    ht.store(filename, hashes)
    duration = len(waveform) / float(sr)
    return (duration, len(hashes))

import glob, time

def glob2hashtable(pattern):
""" Build a hash table from the files matching a glob pattern """
ht = hash_table.HashTable()
filelist = glob.glob(pattern)
initticks = time.clock()
totdur = 0.0
tothashes = 0
for file in filelist:
print "ingesting ", file, " ..."
dur, nhash = ingest(ht, file)
totdur += dur
tothashes += nhash
elapsedtime = time.clock() - initticks
print "Added",tothashes,"(",tothashes/float(totdur),"hashes/sec) at ", elapsedtime/totdur, "x RT"
return ht

# Ad-hoc smoke test for the fingerprint pipeline; disabled by default.
test = False
if test:
# dev-machine-local test file -- TODO confirm path before enabling
fn = '/Users/dpwe/Downloads/carol11k.wav'
d, sr = librosa.load(fn, sr=11025)
# fingerprint only the first 30 s of audio
hashes = landmarks2hashes(peaks2landmarks(find_peaks(d[:30*sr], sr)))
ht = hash_table.HashTable()

print len(hashes)
for hash in hashes[:40]:
print hash
ingest(ht, fn)

# NOTE(review): the remainder duplicates work done by ingest() above --
# looks like leftover code from before ingest() existed.
import hash_table
ht = hash_table.HashTable()
ht.store(fn, hashes)
ht.save('httest.pickle', {'version': 20140525})

67 changes: 67 additions & 0 deletions audfprint_match.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
audfprint_match.py
Fingerprint matching code for audfprint
2014-05-26 Dan Ellis [email protected]
"""
import librosa
import audfprint
import numpy as np

def find_mode(data, window=0):
    """ Find the (mode, count) of a set of data, counting every point
        within +/- window of a candidate value if window > 0.
        Ties go to the smallest candidate value.
    """
    candidates = np.unique(data)
    # tally, for each candidate, how many points fall inside its window
    tallies = [sum(1 for point in data if abs(point - candidate) <= window)
               for candidate in candidates]
    # np.argmax returns the FIRST maximum, i.e. the smallest tied value
    winner = int(np.argmax(tallies))
    return (candidates[winner], tallies[winner])

def match_hashes(ht, hashes):
    """ Match audio against fingerprint hash table.
    Return top N matches as (id, filteredmatches, timoffs, rawmatches)
    """
    # every (track id, time delta) pair implicated by the query hashes
    hits = ht.get_hits(hashes)
    # sorted id list with -1 sentinels so every run has two boundaries
    sorted_ids = np.r_[-1, sorted([trackid for trackid, dtime in hits]), -1]
    # positions where the sorted id changes -> run boundaries
    boundaries = np.nonzero(sorted_ids[:-1] != sorted_ids[1:])[0]
    # run lengths = raw hit counts per id; the id of each run sits just
    # after its cumulative end
    run_lengths = np.diff(boundaries)
    run_ids = sorted_ids[np.cumsum(run_lengths)]

    # rank ids by raw hit count, most popular first
    ranked = sorted(zip(run_lengths, run_ids), reverse=True)
    # refine the top 100 candidates by their most consistent time offset
    results = []
    for nraw, trackid in ranked[:100]:
        offsets = [dtime for candid, dtime in hits if candid == trackid]
        mode, nfiltered = find_mode(offsets, window=1)
        results.append((trackid, nfiltered, mode, nraw))
    # order by filtered (time-consistent) match count
    results.sort(key=lambda r: r[1], reverse=True)
    return results

def match_file(ht, filename):
""" Read in an audio file, calculate its landmarks, query against hash table. Return top N matches as (id, filterdmatchcount, timeoffs, rawmatchcount)
"""
d, sr = librosa.load(filename, sr=11025)
# Collect landmarks offset by 0..3 quarter-windows
t_win = 0.04644
win = int(np.round(sr * t_win))
qwin = win/4
hq = audfprint.landmarks2hashes(audfprint.peaks2landmarks(audfprint.find_peaks(d, sr)))
hq += audfprint.landmarks2hashes(audfprint.peaks2landmarks(audfprint.find_peaks(d[qwin:], sr)))
hq += audfprint.landmarks2hashes(audfprint.peaks2landmarks(audfprint.find_peaks(d[2*qwin:], sr)))
hq += audfprint.landmarks2hashes(audfprint.peaks2landmarks(audfprint.find_peaks(d[3*qwin:], sr)))
print "Analyzed",filename,"to",len(hq),"hashes"
# Run query
return match_hashes(ht, hq)


# Module-level smoke test: builds a database from a local album, then
# matches a query file against it.  NOTE(review): runs at import time
# while dotest is True, with dev-machine-local paths.
dotest = True
if dotest:
pat = '/Users/dpwe/projects/shazam/Nine_Lives/*mp3'
qry = 'query.mp3'
# build the reference fingerprint database from all matching mp3s
ht = audfprint.glob2hashtable(pat)
rslts = match_file(ht, qry)
# analysis hop in seconds, to convert the frame offset to a time
t_hop = 0.02322
print "Matched", qry, "as", ht.names[rslts[0][0]], "at", t_hop*float(rslts[0][2]), "with", rslts[0][1], "of", rslts[0][3], "hashes"
17 changes: 12 additions & 5 deletions hash_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,18 @@ def get_entry(self, hash):
return idtimelist

def get_hits(self, hashes):
    """ Return a list of (id, delta_time) pairs associated with each
        element in hashes list of (time, hash)
    :params:
      hashes : list of (time, hash) pairs for the query audio
    :returns:
      list of (id, rtime - time) for every stored entry matching a query
      hash; a true match makes this time delta near-constant.
    """
    # NOTE(review): dropped the superseded pre-delta implementation that
    # the diff left as dead code after its return statement.
    iddtimelist = []
    for time, hash in hashes:
        iddtimelist += [(id, rtime - time)
                        for id, rtime in self.get_entry(hash)]
    return iddtimelist

def save(self, name, params=[]):
    """ Save hash table to file <name>, including optional additional params
    :params:
      name : str
        filename to write the pickle to
      params : dict
        arbitrary extra metadata stored on the table before pickling
    """
    # NOTE(review): mutable default arg kept for interface compatibility;
    # it is only assigned, never mutated, so it is harmless here.
    self.params = params
    self.version = 20140525
    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be
    # opened in binary mode ('w' corrupts the stream on Windows and
    # fails outright under Python 3).
    with open(name, 'wb') as f:
        pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
    self.dirty = False
Expand All @@ -112,3 +115,7 @@ def load(self, name):
self.hashesperid = temp.hashesperid
self.dirty = False
return params

def totalhashes(self):
    """ Return the total count of hashes stored in the table
    :returns:
      total number of entries, summed over self.counts
    """
    # self.counts presumably holds per-hash-bucket occupancy counts
    # maintained by store() -- TODO confirm against the full class
    return np.sum(self.counts)

0 comments on commit eefa119

Please sign in to comment.