Add Zamia setup

wenzhu888 · Feb 20, 2019 · 067721e · 067721e
1 parent da78aa2
commit 067721e
Show file tree

Hide file tree

Showing 10 changed files with 223 additions and 1 deletion.
diff --git a/examples/setups/README.md b/examples/setups/README.md
@@ -4,7 +4,12 @@ Example Setups
 ## aspire
 
 Batteries included example setup for demonstrating the use of PyKaldi with
-pre-trained ASpIRE chain models.
+pre-trained ASpIRE chain tdnn models.
+
+## zamia
+
+Batteries included example setup for demonstrating the use of PyKaldi with
+pre-trained Zamia chain tdnn-f models.
 
 ## ltsv-based-vad
 

diff --git a/examples/setups/zamia/README.md b/examples/setups/zamia/README.md
@@ -0,0 +1,103 @@
+# Models
+
+Download and extract Zamia chain tdnn-f models by running the following command
+inside this directory.
+
+    ./models.sh
+
+This script uses `wget` and `tar` executables, so make sure they are installed.
+
+# Kaldi Setup
+
+You need a Kaldi installation for running this example setup. If you installed
+PyKaldi from source (or if you are using PyKaldi docker image), then you can
+simply run the following inside this directory to add Kaldi executables to your
+`PATH`.
+
+    source path.sh
+
+If you did not install PyKaldi from source, then you will need to install Kaldi
+and edit the first line of `path.sh` before sourcing it.
+
+# Data Preparation
+
+There is an example data setup inside the `data` directory. You can skip the
+rest of this section if you simply want to run the example setup.
+
+If you want to decode or align your own recordings, you can edit the files in
+`data/test` directory. For decoding, you need to provide `data/test/wav.scp` and
+`data/test/spk2utt` files. For alignment, you also need to provide
+`data/test/text`.
+
+## List of Recordings
+
+The list of utterances `data/test/wav.scp` has the format:
+
+    utt1 /path/to/utt1.wav
+    utt2 /path/to/utt2.wav
+    utt3 /path/to/utt3.wav
+    utt4 /path/to/utt4.wav
+    ...
+
+Note that each utterance should be a relatively short recording, that is,
+seconds long rather than minutes long. If you want to decode long recordings,
+you can segment them into short utterances using a speech activity detection
+system.
+
+Also, make sure the wav files are single channel, 16-bit PCM files. If your
+audio files are in a different file format, you can use `ffmpeg` and `sox` to
+convert them to the required format.
+
+## Speaker to Utterance Map
+
+The speaker to utterance map `data/test/spk2utt` has the format:
+
+    spk1 utt1 utt4 ...
+    spk2 utt2 utt3 ...
+    ...
+
+If no speaker information is available, make it an identity mapping:
+
+    utt1 utt1
+    utt2 utt2
+    utt3 utt3
+    utt4 utt4
+    ...
+
+## Transcripts (needed for alignment)
+
+The list of transcripts in `data/test/text` has the format:
+
+    utt1 transcript of first utterance
+    utt2 transcript of second utterance
+    utt3 transcript of third utterance
+    utt4 transcript of fourth utterance
+    ...
+
+Note that these should be tokenized transcripts and all of the tokens should be
+in the system vocabulary `data/lang/words.txt`.
+
+# ASR
+
+You can decode the utterances listed in `data/test/wav.scp` with the following
+command.
+
+    ./decode.py
+
+The decoding script will print a bunch of logs to stderr.
+
+Decoding outputs are written to `out/test/decode.out`.
+
+# Alignment
+
+You can align the utterances listed in `data/test/wav.scp` with the transcripts
+listed in `data/test/text` using the following command.
+
+    ./align.py
+
+The alignment script will print a bunch of logs to stderr.
+
+Frame-level alignments are written to `out/test/align.out`.
+
+Phone-level alignments are written to `out/test/phone_align.out`.
+
+Word-level alignments are written to `out/test/word_align.out`.
diff --git a/examples/setups/zamia/align.py b/examples/setups/zamia/align.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+"""Align the utterances of data/test against their transcripts.
+
+Features and i-vectors are read from Kaldi pipelines, paired with the
+transcripts in data/test/text, and frame-, phone- and word-level alignments
+are written to out/test.
+"""
+
+from __future__ import print_function
+
+from kaldi.alignment import NnetAligner
+from kaldi.fstext import SymbolTable
+from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
+from kaldi.nnet3 import NnetSimpleComputationOptions
+from kaldi.util.table import SequentialMatrixReader
+
+# Construct aligner
+decodable_opts = NnetSimpleComputationOptions()
+decodable_opts.acoustic_scale = 1.0
+decodable_opts.frame_subsampling_factor = 3
+decodable_opts.frames_per_chunk = 150
+aligner = NnetAligner.from_files(
+    "exp/nnet3_chain/tdnn_f/final.mdl",
+    "exp/nnet3_chain/tdnn_f/tree",
+    "data/lang/L.fst",
+    "data/lang/words.txt",
+    "data/lang/phones/disambig.int",
+    decodable_opts=decodable_opts)
+phones = SymbolTable.read_text("data/lang/phones.txt")
+wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
+                                     "data/lang/phones/word_boundary.int")
+
+# Define feature pipelines as Kaldi rspecifiers. Adjacent string literals are
+# concatenated, so the MFCC output below is piped into the i-vector extractor.
+feats_rspec = (
+    "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
+)
+ivectors_rspec = (
+    "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
+    "ivector-extract-online2 --config=conf/ivector_extractor.conf ark:data/test/spk2utt ark:- ark:- |"
+)
+
+# Align wav files. The three inputs are consumed in lockstep, so they must
+# list the same utterances in the same order.
+with SequentialMatrixReader(feats_rspec) as f, \
+     SequentialMatrixReader(ivectors_rspec) as i, \
+     open("data/test/text") as t, \
+     open("out/test/align.out", "w") as a, \
+     open("out/test/phone_align.out", "w") as p, \
+     open("out/test/word_align.out", "w") as w:
+    for (fkey, feats), (ikey, ivectors), line in zip(f, i, t):
+        fields = line.strip().split(None, 1)
+        if len(fields) != 2:
+            raise ValueError("Malformed transcript line: %r" % line)
+        tkey, text = fields
+        # Check keys explicitly: an assert is stripped under `python -O`, and
+        # mismatched inputs would then silently produce bogus alignments.
+        if not fkey == ikey == tkey:
+            raise ValueError(
+                "Utterance key mismatch: feats=%r ivectors=%r text=%r"
+                % (fkey, ikey, tkey))
+        out = aligner.align((feats, ivectors), text)
+        print(fkey, out["alignment"], file=a)
+        phone_alignment = aligner.to_phone_alignment(out["alignment"], phones)
+        print(fkey, phone_alignment, file=p)
+        word_alignment = aligner.to_word_alignment(out["best_path"], wb_info)
+        print(fkey, word_alignment, file=w)
diff --git a/examples/setups/zamia/decode.py b/examples/setups/zamia/decode.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+"""Decode the utterances of data/test with the pre-trained chain model.
+
+Features and i-vectors are read from Kaldi pipelines and the recognized text
+of every utterance is written to out/test/decode.out.
+"""
+
+from __future__ import print_function
+
+from kaldi.asr import NnetLatticeFasterRecognizer
+from kaldi.decoder import LatticeFasterDecoderOptions
+from kaldi.nnet3 import NnetSimpleComputationOptions
+from kaldi.util.table import SequentialMatrixReader
+
+# Construct recognizer
+decoder_opts = LatticeFasterDecoderOptions()
+decoder_opts.beam = 13
+decoder_opts.max_active = 7000
+decodable_opts = NnetSimpleComputationOptions()
+decodable_opts.acoustic_scale = 1.0
+decodable_opts.frame_subsampling_factor = 3
+decodable_opts.frames_per_chunk = 150
+asr = NnetLatticeFasterRecognizer.from_files(
+    "exp/nnet3_chain/tdnn_f/final.mdl",
+    "exp/nnet3_chain/tdnn_f/graph/HCLG.fst",
+    "data/lang/words.txt",
+    decoder_opts=decoder_opts,
+    decodable_opts=decodable_opts)
+
+# Define feature pipelines as Kaldi rspecifiers. Adjacent string literals are
+# concatenated, so the MFCC output below is piped into the i-vector extractor.
+feats_rspec = (
+    "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
+)
+ivectors_rspec = (
+    "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
+    "ivector-extract-online2 --config=conf/ivector_extractor.conf ark:data/test/spk2utt ark:- ark:- |"
+)
+
+# Decode wav files
+with SequentialMatrixReader(feats_rspec) as f, \
+     SequentialMatrixReader(ivectors_rspec) as i, \
+     open("out/test/decode.out", "w") as o:
+    for (key, feats), (ikey, ivectors) in zip(f, i):
+        # The two readers are consumed in lockstep and nothing else guarantees
+        # they yield the same utterance at each step, so verify the keys agree
+        # instead of silently pairing features with the wrong i-vectors.
+        if key != ikey:
+            raise ValueError(
+                "Utterance key mismatch: feats=%r ivectors=%r"
+                % (key, ikey))
+        out = asr.decode((feats, ivectors))
+        print(key, out["text"], file=o)
diff --git a/examples/setups/zamia/models.sh b/examples/setups/zamia/models.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#
+# Download the pre-trained Zamia model archive and arrange its contents into
+# the directory layout that decode.py and align.py read from.
+
+# Stop at the first failing command; otherwise a failed download would be
+# followed by rm/mv commands operating on paths that do not exist.
+set -euo pipefail
+
+model=kaldi-generic-en-tdnn_f-r20180901
+
+wget "http://goofy.zamia.org/zamia-speech/asr-models/${model}.tar.xz"
+tar -xJf "${model}.tar.xz"
+rm -f "${model}.tar.xz"
+
+# Move the archive's README aside first so that the bulk move below does not
+# overwrite this setup's own README.md.
+mv "${model}/README.md" README-ZAMIA.md
+mv "${model}"/* .
+rm -rf "${model}"
+
+mkdir -p exp/nnet3_chain
+mv model exp/nnet3_chain/tdnn_f
+mv extractor exp/nnet3_chain/.
+mv ivectors_test_hires exp/nnet3_chain/.
+# NOTE: `ln -r` (relative link targets) is a GNU coreutils extension.
+ln -sr exp/nnet3_chain/ivectors_test_hires/conf/ivector_extractor.conf conf/.
+ln -sr ../aspire/data/test data/.
diff --git a/examples/setups/zamia/out/test/align.out b/examples/setups/zamia/out/test/align.out
@@ -0,0 +1 @@
+utt1 [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18742, 18741, 18741, 18741, 8152, 8151, 8151, 15350, 15349, 15349, 15349, 15349, 2, 18094, 18093, 18093, 18093, 17442, 17441, 17441, 17441, 17441, 17441, 7414, 7413, 7413, 7413, 7413, 17072, 17071, 17071, 11384, 11383, 11383, 11383, 11383, 11383, 10666, 10665, 10665, 10665, 10665, 10665, 14554, 14553, 14553, 14553, 14553, 14553, 14553, 14553, 14553, 17028, 2, 1, 10656, 10655, 10655, 10655, 10655, 10655, 892, 891, 891, 891, 891, 19408, 19407, 19407, 2, 16310, 16309, 16309, 16309, 5602, 5601, 12612, 12611, 12611, 12611, 16470, 16469, 16254, 16253, 16253, 4526, 4525, 4525, 19466, 19465, 8608, 8607, 15238, 15237, 15237, 2, 1, 1, 1, 1, 1, 9230, 9229, 9229, 9229, 9229, 9229, 9229, 6370, 18284, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
diff --git a/examples/setups/zamia/out/test/decode.out b/examples/setups/zamia/out/test/decode.out
@@ -0,0 +1 @@
+utt1 one two three four five six seven eight
diff --git a/examples/setups/zamia/out/test/phone_align.out b/examples/setups/zamia/out/test/phone_align.out
@@ -0,0 +1 @@
+utt1 [('SIL', 0, 42), ('w_B', 42, 4), ("'a_I", 46, 3), ('n_E', 49, 5), ('SIL', 54, 1), ('t_B', 55, 4), ("'u_E", 59, 6), ('T_B', 65, 5), ('r_I', 70, 3), ("'i_E", 73, 6), ('f_B', 79, 6), ("'o_I", 85, 9), ('r_E', 94, 1), ('SIL', 95, 2), ('f_B', 97, 6), ("'aI_I", 103, 5), ('v_E', 108, 3), ('SIL', 111, 1), ('s_B', 112, 4), ("'I_I", 116, 2), ('k_I', 118, 4), ('s_E', 122, 2), ('s_B', 124, 3), ("'E_I", 127, 3), ('v_I', 130, 2), ('a_I', 132, 2), ('n_E', 134, 3), ('SIL', 137, 6), ("'e_B", 143, 7), ('I_I', 150, 1), ('t_E', 151, 9), ('SIL', 160, 44)]
diff --git a/examples/setups/zamia/out/test/word_align.out b/examples/setups/zamia/out/test/word_align.out
@@ -0,0 +1 @@
+utt1 [('<eps>', 0, 42), ('one', 42, 12), ('<eps>', 54, 1), ('two', 55, 10), ('three', 65, 14), ('four', 79, 16), ('<eps>', 95, 2), ('five', 97, 14), ('<eps>', 111, 1), ('six', 112, 12), ('seven', 124, 13), ('<eps>', 137, 6), ('eight', 143, 17), ('<eps>', 160, 44)]
diff --git a/examples/setups/zamia/path.sh b/examples/setups/zamia/path.sh
@@ -0,0 +1,10 @@
+# Point KALDI_ROOT at the Kaldi tree under tools/ (relative to this directory)
+# and load Kaldi's common path setup. Meant to be sourced from this directory.
+export KALDI_ROOT="$(pwd)/../../../tools/kaldi"
+if [ ! -f "$KALDI_ROOT/tools/config/common_path.sh" ]; then
+  echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!"
+  # When sourced, `return` stops here without closing the caller's shell;
+  # when executed directly `return` fails, so fall back to `exit`.
+  return 1 2>/dev/null || exit 1
+fi
+. "$KALDI_ROOT/tools/config/common_path.sh"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		utt1 [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 18742, 18741, 18741, 18741, 8152, 8151, 8151, 15350, 15349, 15349, 15349, 15349, 2, 18094, 18093, 18093, 18093, 17442, 17441, 17441, 17441, 17441, 17441, 7414, 7413, 7413, 7413, 7413, 17072, 17071, 17071, 11384, 11383, 11383, 11383, 11383, 11383, 10666, 10665, 10665, 10665, 10665, 10665, 14554, 14553, 14553, 14553, 14553, 14553, 14553, 14553, 14553, 17028, 2, 1, 10656, 10655, 10655, 10655, 10655, 10655, 892, 891, 891, 891, 891, 19408, 19407, 19407, 2, 16310, 16309, 16309, 16309, 5602, 5601, 12612, 12611, 12611, 12611, 16470, 16469, 16254, 16253, 16253, 4526, 4525, 4525, 19466, 19465, 8608, 8607, 15238, 15237, 15237, 2, 1, 1, 1, 1, 1, 9230, 9229, 9229, 9229, 9229, 9229, 9229, 6370, 18284, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 18283, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		utt1 [('SIL', 0, 42), ('w_B', 42, 4), ("'a_I", 46, 3), ('n_E', 49, 5), ('SIL', 54, 1), ('t_B', 55, 4), ("'u_E", 59, 6), ('T_B', 65, 5), ('r_I', 70, 3), ("'i_E", 73, 6), ('f_B', 79, 6), ("'o_I", 85, 9), ('r_E', 94, 1), ('SIL', 95, 2), ('f_B', 97, 6), ("'aI_I", 103, 5), ('v_E', 108, 3), ('SIL', 111, 1), ('s_B', 112, 4), ("'I_I", 116, 2), ('k_I', 118, 4), ('s_E', 122, 2), ('s_B', 124, 3), ("'E_I", 127, 3), ('v_I', 130, 2), ('a_I', 132, 2), ('n_E', 134, 3), ('SIL', 137, 6), ("'e_B", 143, 7), ('I_I', 150, 1), ('t_E', 151, 9), ('SIL', 160, 44)]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		utt1 [('<eps>', 0, 42), ('one', 42, 12), ('<eps>', 54, 1), ('two', 55, 10), ('three', 65, 14), ('four', 79, 16), ('<eps>', 95, 2), ('five', 97, 14), ('<eps>', 111, 1), ('six', 112, 12), ('seven', 124, 13), ('<eps>', 137, 6), ('eight', 143, 17), ('<eps>', 160, 44)]