
Commit

Suspicious results: Seems worse than whole sentences
mdda committed Feb 11, 2018
1 parent 5ff980f commit 27b59ad
Showing 1 changed file with 200 additions and 14 deletions.
214 changes: 200 additions & 14 deletions notebooks/2-CNN/8-Speech/SpeechAnalysis_Alignment.ipynb
@@ -533,7 +533,7 @@
"def word_range_mask(w):\n",
" mask = np.zeros_like(mel_sym)\n",
" for i in word_to_idx.get(w, []):\n",
" t_min = (txt_starts[i]-txt_err[i])\n",
" t_min = txt_starts[i]-txt_err[i]\n",
" if t_min<0: t_min=0\n",
" \n",
" if i+1 < txt_starts.shape[0]:\n",
@@ -959,7 +959,7 @@
"# For every word (n_occurrences>0) create embedding via masks\n",
"# NB: masks depend on current txt_starts, txt_err\n",
"\n",
"def create_word_embeddings():\n",
"def create_word_embeddings(ignore_rare=True, ignore_frequent=True):\n",
" word_embed = dict()\n",
" \n",
" overall_bins= np.bincount(mel_sym)\n",
@@ -969,12 +969,12 @@
" #for w in words_freq_ordered:\n",
" for w in word_to_idx.keys():\n",
" n = len(word_to_idx[w])\n",
" if n<2: \n",
" if ignore_rare and n<2: \n",
" # Not enough for any logic operations to make a difference...\n",
" continue \n",
" \n",
" word_mask=word_range_mask(w)\n",
" if np.sum(word_mask) > 0.80*word_mask.shape[0]:\n",
" if ignore_frequent and np.sum(word_mask)>0.80*word_mask.shape[0]:\n",
" # Too broad to be worthwhile... (includes #EOS)\n",
" continue \n",
" \n",
@@ -1408,7 +1408,7 @@
"metadata": {},
"outputs": [],
"source": [
"i=59\n",
"i=65\n",
"print( sentence_spans_embedding[i,:])\n",
"print( sentence_spans_embedding[i+1,:])\n",
"print( np.dot(sentence_spans_embedding[i,:], sentence_spans_embedding[i+1,:]) )\n",
@@ -1470,7 +1470,7 @@
"metadata": {},
"outputs": [],
"source": [
"j=103\n",
"j=100\n",
"#print( audio_spans[j]['t_start'], audio_spans[j]['t_end'], )\n",
"play_audio_span(audio_spans[j], autoplay=True) \n",
"#play_audio_span(audio_spans[j+1]) "
@@ -1696,8 +1696,18 @@
" period_in_sec=matching_period, ending=True)\n",
"\n",
"#overall_emb\n",
"print(audio_spans_embedding_starts[100])\n",
"print(audio_spans_embedding_ends[100])"
"j=103\n",
"print(audio_spans_embedding_starts[j])\n",
"print(audio_spans_embedding_ends[j])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#sentence_spans[3]"
]
},
{
@@ -1707,7 +1717,7 @@
"outputs": [],
"source": [
"def create_sentence_embedding_matching(word_embedding, period_in_sec=None, \n",
" beginning=False, ending=False):\n",
" beginning=False, ending=False, debug_i=-1):\n",
" ss_embedding = np.zeros( (len(sentence_spans), embedding_dim))\n",
" for i, s in enumerate(sentence_spans):\n",
" span_emb = np.zeros( (embedding_dim,) )\n",
@@ -1718,6 +1728,8 @@
" for j, w in enumerate(s['span']):\n",
" if w in word_embedding:\n",
" span_emb += word_embedding[w]\n",
" if i==debug_i:\n",
" print('start : '+w)\n",
" if txt_starts[ txt_starts_i+j ]>t_max:\n",
" break\n",
"\n",
@@ -1730,24 +1742,29 @@
" for j, w in enumerate(s['span'][::-1]): # Go backwards\n",
" if w in word_embedding:\n",
" span_emb += word_embedding[w]\n",
" if txt_starts[ txt_starts_i-j ]<t_min:\n",
" if i==debug_i:\n",
" print('end : '+w)\n",
" if txt_starts[ txt_starts_i-1-j ]<t_min:\n",
" break\n",
"\n",
" norm = np.linalg.norm(span_emb)\n",
" if norm>0.:\n",
" span_emb /= norm\n",
" else:\n",
" print(\"No embeddings found for sentence \"+str(i))\n",
" span_emb = word_embedding['marriage'] # Aribitrary to avoid ==0\n",
" ss_embedding[i, :] = span_emb\n",
" return ss_embedding\n",
"\n",
"# Test this once\n",
"i=3\n",
"\n",
"#matching_period=4.0\n",
"sentence_spans_embedding_starts = create_sentence_embedding_matching(word_embedding, \n",
" period_in_sec=matching_period, beginning=True)\n",
" period_in_sec=matching_period, beginning=True, debug_i=i)\n",
"sentence_spans_embedding_ends = create_sentence_embedding_matching(word_embedding, \n",
" period_in_sec=matching_period, ending=True)\n",
" period_in_sec=matching_period, ending=True, debug_i=i)\n",
"\n",
"i=65\n",
"print( txt_starts[ sentence_spans[i]['t_end']+1] \n",
" - txt_starts[ sentence_spans[i]['t_start'] ])\n",
"print(sentence_spans_embedding_starts[i])\n",
@@ -1761,6 +1778,92 @@
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List neighbouring sentences that have the lowest dot products\n",
"def get_sorted_sentence_span_match_contrasts():\n",
" match_contrast=[]\n",
" for i in range(0, len(sentence_spans)-2): \n",
" match_contrast.append( (\n",
" np.dot(sentence_spans_embedding_ends[i,:], \n",
" sentence_spans_embedding_starts[i+1,:]),\n",
" i, i+1)\n",
" )\n",
" return sorted(match_contrast)\n",
"\n",
"get_sorted_sentence_span_match_contrasts()[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Loop through audio_spans within 'striking range' of a given sentence start\n",
"# And find the dot product with that sentence\n",
"\n",
"def return_sentence_vs_audio_dots_match(i, beginning=False, ending=False):\n",
" #print(sentence_spans[i])\n",
" t_start = sentence_spans[i]['t_start']\n",
" t_end = sentence_spans[i]['t_end']\n",
" if t_end+1 < len(txt_starts):\n",
" t_end += 1\n",
" \n",
" #print( txt_starts[ t_start ], txt_err[ t_start ] )\n",
" #print( txt_starts[ t_end ], txt_err[ t_end ] )\n",
" \n",
" t_min = (txt_starts[ t_start ] - txt_err[ t_start ])/fft_step\n",
" t_max = (txt_starts[ t_end ] + txt_err[ t_end ] )/fft_step\n",
" #print(t_min, t_max)\n",
" \n",
" a_arr, dots = [],[]\n",
" for a_i, a in enumerate(audio_spans):\n",
" if a['t_start']>t_max or a['t_end']<t_min:\n",
" continue\n",
" a_arr.append(a_i)\n",
" if ending: # Match the audio_span ending with sentence ending\n",
" dots.append(np.dot(sentence_spans_embedding_ends[i, :], \n",
" audio_spans_embedding_ends[a_i, :] ))\n",
" if beginning: # Match the audio_span beginning with sentence beginning\n",
" dots.append(np.dot(sentence_spans_embedding_starts[i, :], \n",
" audio_spans_embedding_starts[a_i, :] ))\n",
" #print(dots)\n",
" return a_arr, dots\n",
"\n",
"def show_audio_span_period_matches(i):\n",
" # i set in previous cell : 'contrasting adjacent sentences'\n",
" x,y = return_sentence_vs_audio_dots_match(i, ending=True)\n",
" plt.plot(x,y, 'b-*')\n",
"\n",
" x,y = return_sentence_vs_audio_dots_match(i+1, beginning=True)\n",
" plt.plot(x,y, 'r-*')\n",
"\n",
" plt.grid(True)\n",
" plt.title(\"Looking for blue peak followed by a red peak\")\n",
" plt.show()\n",
" \n",
"show_audio_span_period_matches(i)"
]
},
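If a programmatic first guess is wanted before listening, the best-scoring audio span for the start of a given sentence can be read off the same dot products (a minimal sketch built on the function above; the notebook itself picks the peak by eye from the plot and confirms it by ear):

    import numpy as np

    def best_matching_audio_span_for_start(i):
        # Candidate index into audio_spans whose start best matches the start of sentence i
        a_arr, dots = return_sentence_vs_audio_dots_match(i, beginning=True)
        if not a_arr:
            return None
        return a_arr[int(np.argmax(dots))]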
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
@@ -1776,7 +1879,90 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Loop starts here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"txt_starts, txt_err = create_starts_and_errs(current_starts)\n",
"\n",
"# Now shift the timings, so that sentence starts hit audio starts exactly\n",
"s_to_a = sentence_ends_find_nearest_audio_gaps()\n",
"\n",
"# This starts-dict aligns the sentences to the nearest audio\n",
"tmp_starts=s_to_a_to_starts(s_to_a, known_starts=current_starts)\n",
"\n",
"# This doesn't update the txt_errs, which are probably sort-of-right\n",
"txt_starts, txt_err_ignore = create_starts_and_errs( tmp_starts )\n",
"\n",
"plt.plot(txt_starts, 'b')\n",
"plt.plot(txt_err*10., 'r')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"word_embedding = create_word_embeddings(ignore_rare=False, ignore_frequent=False)\n",
"\n",
"sentence_spans_embedding_starts = create_sentence_embedding_matching(word_embedding, \n",
" period_in_sec=matching_period, beginning=True)\n",
"sentence_spans_embedding_ends = create_sentence_embedding_matching(word_embedding, \n",
" period_in_sec=matching_period, ending=True)\n",
"\n",
"get_sorted_sentence_span_match_contrasts()[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"i=40 # This is the last number in the tuple (higher one)\n",
"print(str(i-1)+') '+' '.join(sentence_spans[i-1]['span']) )\n",
"show_audio_span_period_matches(i-1)\n",
"print(str(i )+') '+' '.join(sentence_spans[i ]['span']) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"j=3 # Look for start of second, red, audio_span (easier to spot)\n",
"play_audio_span(audio_spans[j], autoplay=True) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Add the j found to the current_starts\n",
"current_starts[sentence_spans[i]['t_start']] = (\n",
" audio_spans[ j ]['t_start']*fft_step, txt_err_min \n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Now loop around"
]
},
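Taken together, the cells between "# Loop starts here" and "# Now loop around" form one manual refinement pass. Collected in one place, an iteration looks roughly like this (an outline built from the notebook's own helpers; i and j are still chosen by hand from the plot and by listening):

    # 1. Rebuild word timings from the current anchor points
    txt_starts, txt_err = create_starts_and_errs(current_starts)

    # 2. Snap sentence boundaries to the nearest audio gaps (keep the old error estimates)
    s_to_a = sentence_ends_find_nearest_audio_gaps()
    tmp_starts = s_to_a_to_starts(s_to_a, known_starts=current_starts)
    txt_starts, _ = create_starts_and_errs(tmp_starts)

    # 3. Rebuild embeddings and list the worst-matching adjacent sentence pairs
    word_embedding = create_word_embeddings(ignore_rare=False, ignore_frequent=False)
    sentence_spans_embedding_starts = create_sentence_embedding_matching(
        word_embedding, period_in_sec=matching_period, beginning=True)
    sentence_spans_embedding_ends = create_sentence_embedding_matching(
        word_embedding, period_in_sec=matching_period, ending=True)
    print(get_sorted_sentence_span_match_contrasts()[:10])

    # 4. For a bad pair (i = the higher index), plot the candidates, listen to
    #    audio_spans[j] to confirm the start of sentence i, then anchor it:
    # current_starts[sentence_spans[i]['t_start']] = (audio_spans[j]['t_start']*fft_step, txt_err_min)

    # 5. Repeat from step 1 until the remaining contrasts look reasonable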
{
"cell_type": "code",
