
Commit

Suspicious results: Seems worse than whole sentences
mdda committed Feb 11, 2018
1 parent 5ff980f commit 27b59ad
Showing 1 changed file with 200 additions and 14 deletions.
214 changes: 200 additions & 14 deletions notebooks/2-CNN/8-Speech/SpeechAnalysis_Alignment.ipynb
@@ -533,7 +533,7 @@
"def word_range_mask(w):\n",
" mask = np.zeros_like(mel_sym)\n",
" for i in word_to_idx.get(w, []):\n",
" t_min = (txt_starts[i]-txt_err[i])\n",
" t_min = txt_starts[i]-txt_err[i]\n",
" if t_min<0: t_min=0\n",
" \n",
" if i+1 < txt_starts.shape[0]:\n",
@@ -959,7 +959,7 @@
"# For every word (n_occurrences>0) create embedding via masks\n",
"# NB: masks depend on current txt_starts, txt_err\n",
"\n",
"def create_word_embeddings():\n",
"def create_word_embeddings(ignore_rare=True, ignore_frequent=True):\n",
" word_embed = dict()\n",
" \n",
" overall_bins= np.bincount(mel_sym)\n",
@@ -969,12 +969,12 @@
" #for w in words_freq_ordered:\n",
" for w in word_to_idx.keys():\n",
" n = len(word_to_idx[w])\n",
" if n<2: \n",
" if ignore_rare and n<2: \n",
" # Not enough for any logic operations to make a difference...\n",
" continue \n",
" \n",
" word_mask=word_range_mask(w)\n",
" if np.sum(word_mask) > 0.80*word_mask.shape[0]:\n",
" if ignore_frequent and np.sum(word_mask)>0.80*word_mask.shape[0]:\n",
" # Too broad to be worthwhile... (includes #EOS)\n",
" continue \n",
" \n",
@@ -1408,7 +1408,7 @@
"metadata": {},
"outputs": [],
"source": [
"i=59\n",
"i=65\n",
"print( sentence_spans_embedding[i,:])\n",
"print( sentence_spans_embedding[i+1,:])\n",
"print( np.dot(sentence_spans_embedding[i,:], sentence_spans_embedding[i+1,:]) )\n",
@@ -1470,7 +1470,7 @@
"metadata": {},
"outputs": [],
"source": [
"j=103\n",
"j=100\n",
"#print( audio_spans[j]['t_start'], audio_spans[j]['t_end'], )\n",
"play_audio_span(audio_spans[j], autoplay=True) \n",
"#play_audio_span(audio_spans[j+1]) "
@@ -1696,8 +1696,18 @@
" period_in_sec=matching_period, ending=True)\n",
"\n",
"#overall_emb\n",
"print(audio_spans_embedding_starts[100])\n",
"print(audio_spans_embedding_ends[100])"
"j=103\n",
"print(audio_spans_embedding_starts[j])\n",
"print(audio_spans_embedding_ends[j])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#sentence_spans[3]"
]
},
{
@@ -1707,7 +1717,7 @@
"outputs": [],
"source": [
"def create_sentence_embedding_matching(word_embedding, period_in_sec=None, \n",
" beginning=False, ending=False):\n",
" beginning=False, ending=False, debug_i=-1):\n",
" ss_embedding = np.zeros( (len(sentence_spans), embedding_dim))\n",
" for i, s in enumerate(sentence_spans):\n",
" span_emb = np.zeros( (embedding_dim,) )\n",
@@ -1718,6 +1728,8 @@
" for j, w in enumerate(s['span']):\n",
" if w in word_embedding:\n",
" span_emb += word_embedding[w]\n",
" if i==debug_i:\n",
" print('start : '+w)\n",
" if txt_starts[ txt_starts_i+j ]>t_max:\n",
" break\n",
"\n",
@@ -1730,24 +1742,29 @@
" for j, w in enumerate(s['span'][::-1]): # Go backwards\n",
" if w in word_embedding:\n",
" span_emb += word_embedding[w]\n",
" if txt_starts[ txt_starts_i-j ]<t_min:\n",
" if i==debug_i:\n",
" print('end : '+w)\n",
" if txt_starts[ txt_starts_i-1-j ]<t_min:\n",
" break\n",
"\n",
" norm = np.linalg.norm(span_emb)\n",
" if norm>0.:\n",
" span_emb /= norm\n",
" else:\n",
" print(\"No embeddings found for sentence \"+str(i))\n",
" span_emb = word_embedding['marriage'] # Aribitrary to avoid ==0\n",
" ss_embedding[i, :] = span_emb\n",
" return ss_embedding\n",
"\n",
"# Test this once\n",
"i=3\n",
"\n",
"#matching_period=4.0\n",
"sentence_spans_embedding_starts = create_sentence_embedding_matching(word_embedding, \n",
" period_in_sec=matching_period, beginning=True)\n",
" period_in_sec=matching_period, beginning=True, debug_i=i)\n",
"sentence_spans_embedding_ends = create_sentence_embedding_matching(word_embedding, \n",
" period_in_sec=matching_period, ending=True)\n",
" period_in_sec=matching_period, ending=True, debug_i=i)\n",
"\n",
"i=65\n",
"print( txt_starts[ sentence_spans[i]['t_end']+1] \n",
" - txt_starts[ sentence_spans[i]['t_start'] ])\n",
"print(sentence_spans_embedding_starts[i])\n",
@@ -1761,6 +1778,92 @@
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List neighbouring sentences that have the lowest dot products\n",
"def get_sorted_sentence_span_match_contrasts():\n",
" match_contrast=[]\n",
" for i in range(0, len(sentence_spans)-2): \n",
" match_contrast.append( (\n",
" np.dot(sentence_spans_embedding_ends[i,:], \n",
" sentence_spans_embedding_starts[i+1,:]),\n",
" i, i+1)\n",
" )\n",
" return sorted(match_contrast)\n",
"\n",
"get_sorted_sentence_span_match_contrasts()[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Loop through audio_spans within 'striking range' of a given sentence start\n",
"# And find the dot product with that sentence\n",
"\n",
"def return_sentence_vs_audio_dots_match(i, beginning=False, ending=False):\n",
" #print(sentence_spans[i])\n",
" t_start = sentence_spans[i]['t_start']\n",
" t_end = sentence_spans[i]['t_end']\n",
" if t_end+1 < len(txt_starts):\n",
" t_end += 1\n",
" \n",
" #print( txt_starts[ t_start ], txt_err[ t_start ] )\n",
" #print( txt_starts[ t_end ], txt_err[ t_end ] )\n",
" \n",
" t_min = (txt_starts[ t_start ] - txt_err[ t_start ])/fft_step\n",
" t_max = (txt_starts[ t_end ] + txt_err[ t_end ] )/fft_step\n",
" #print(t_min, t_max)\n",
" \n",
" a_arr, dots = [],[]\n",
" for a_i, a in enumerate(audio_spans):\n",
" if a['t_start']>t_max or a['t_end']<t_min:\n",
" continue\n",
" a_arr.append(a_i)\n",
" if ending: # Match the audio_span ending with sentence ending\n",
" dots.append(np.dot(sentence_spans_embedding_ends[i, :], \n",
" audio_spans_embedding_ends[a_i, :] ))\n",
" if beginning: # Match the audio_span beginning with sentence beginning\n",
" dots.append(np.dot(sentence_spans_embedding_starts[i, :], \n",
" audio_spans_embedding_starts[a_i, :] ))\n",
" #print(dots)\n",
" return a_arr, dots\n",
"\n",
"def show_audio_span_period_matches(i):\n",
" # i set in previous cell : 'contrasting adjacent sentences'\n",
" x,y = return_sentence_vs_audio_dots_match(i, ending=True)\n",
" plt.plot(x,y, 'b-*')\n",
"\n",
" x,y = return_sentence_vs_audio_dots_match(i+1, beginning=True)\n",
" plt.plot(x,y, 'r-*')\n",
"\n",
" plt.grid(True)\n",
" plt.title(\"Looking for blue peak followed by a red peak\")\n",
" plt.show()\n",
" \n",
"show_audio_span_period_matches(i)"
]
},
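If a programmatic first guess is wanted before listening, the best-scoring audio span for the start of a given sentence can be read off the same dot products (a minimal sketch built on the function above; the notebook itself picks the peak by eye from the plot and confirms it by ear):

    import numpy as np

    def best_matching_audio_span_for_start(i):
        # Candidate index into audio_spans whose start best matches the start of sentence i
        a_arr, dots = return_sentence_vs_audio_dots_match(i, beginning=True)
        if not a_arr:
            return None
        return a_arr[int(np.argmax(dots))]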
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
@@ -1776,7 +1879,90 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Loop starts here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"txt_starts, txt_err = create_starts_and_errs(current_starts)\n",
"\n",
"# Now shift the timings, so that sentence starts hit audio starts exactly\n",
"s_to_a = sentence_ends_find_nearest_audio_gaps()\n",
"\n",
"# This starts-dict aligns the sentences to the nearest audio\n",
"tmp_starts=s_to_a_to_starts(s_to_a, known_starts=current_starts)\n",
"\n",
"# This doesn't update the txt_errs, which are probably sort-of-right\n",
"txt_starts, txt_err_ignore = create_starts_and_errs( tmp_starts )\n",
"\n",
"plt.plot(txt_starts, 'b')\n",
"plt.plot(txt_err*10., 'r')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"word_embedding = create_word_embeddings(ignore_rare=False, ignore_frequent=False)\n",
"\n",
"sentence_spans_embedding_starts = create_sentence_embedding_matching(word_embedding, \n",
" period_in_sec=matching_period, beginning=True)\n",
"sentence_spans_embedding_ends = create_sentence_embedding_matching(word_embedding, \n",
" period_in_sec=matching_period, ending=True)\n",
"\n",
"get_sorted_sentence_span_match_contrasts()[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"i=40 # This is the last number in the tuple (higher one)\n",
"print(str(i-1)+') '+' '.join(sentence_spans[i-1]['span']) )\n",
"show_audio_span_period_matches(i-1)\n",
"print(str(i )+') '+' '.join(sentence_spans[i ]['span']) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"j=3 # Look for start of second, red, audio_span (easier to spot)\n",
"play_audio_span(audio_spans[j], autoplay=True) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add the j found to the current_starts\n",
"current_starts[sentence_spans[i]['t_start']] = (\n",
" audio_spans[ j ]['t_start']*fft_step, txt_err_min \n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now loop around"
]
},
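Taken together, the cells between "# Loop starts here" and "# Now loop around" form one manual refinement pass. Collected in one place, an iteration looks roughly like this (an outline built from the notebook's own helpers; i and j are still chosen by hand from the plot and by listening):

    # 1. Rebuild word timings from the current anchor points
    txt_starts, txt_err = create_starts_and_errs(current_starts)

    # 2. Snap sentence boundaries to the nearest audio gaps (keep the old error estimates)
    s_to_a = sentence_ends_find_nearest_audio_gaps()
    tmp_starts = s_to_a_to_starts(s_to_a, known_starts=current_starts)
    txt_starts, _ = create_starts_and_errs(tmp_starts)

    # 3. Rebuild embeddings and list the worst-matching adjacent sentence pairs
    word_embedding = create_word_embeddings(ignore_rare=False, ignore_frequent=False)
    sentence_spans_embedding_starts = create_sentence_embedding_matching(
        word_embedding, period_in_sec=matching_period, beginning=True)
    sentence_spans_embedding_ends = create_sentence_embedding_matching(
        word_embedding, period_in_sec=matching_period, ending=True)
    print(get_sorted_sentence_span_match_contrasts()[:10])

    # 4. For a bad pair (i = the higher index), plot the candidates, listen to
    #    audio_spans[j] to confirm the start of sentence i, then anchor it:
    # current_starts[sentence_spans[i]['t_start']] = (audio_spans[j]['t_start']*fft_step, txt_err_min)

    # 5. Repeat from step 1 until the remaining contrasts look reasonable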
{
"cell_type": "code",
