Skip to content

Commit

Permalink
update readme and prepare to work with Tacotron2
Browse files Browse the repository at this point in the history
  • Loading branch information
吴梦林 authored and 吴梦林 committed Jan 11, 2019
1 parent 37c151c commit 7f17e2b
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 3 deletions.
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

CC=gcc
CFLAGS+=-Wall -W -Wextra -Wno-unused-function -O3 -g -I../include
ifeq ($(taco),1)
CFLAGS += -DTACOTRON2
endif


AVX2:=$(shell cat /proc/cpuinfo | grep -c avx2)
AVX:=$(shell cat /proc/cpuinfo | grep -c avx)
Expand Down
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# LPCNet



Low complexity implementation of the WaveRNN-based LPCNet algorithm, as described in:

J.-M. Valin, J. Skoglund, [LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://jmvalin.ca/papers/lpcnet_icassp2019.pdf), *Submitted for ICASSP 2019*, arXiv:1810.11846.
Expand All @@ -12,6 +14,8 @@ The BSD licensed software is written in C and Python/Keras. For training, a GTX

This software is an open source starting point for WaveRNN-based speech synthesis and coding.

__NOTE__: This repository aims to work with Tacotron2, using LPCNet as the vocoder.

# Quickstart

1. Set up a Keras system with GPU.
Expand Down Expand Up @@ -47,16 +51,33 @@ This software is an open source starting point for WaveRNN-based speech synthesi
make test_lpcnet
./dump_data -test test_input.s16 test_features.f32
./test_lpcnet test_features.f32 test.s16
ffmpeg -f s16le -ar 16k -ac 1 -i test.s16 test-out.wav
```

# Speech Material for Training
# Speech Material for Training LPCNet

Suitable training material can be obtained from the [McGill University Telecommunications & Signal Processing Laboratory](http://www-mmsp.ece.mcgill.ca/Documents/Data/). Download the ISO and extract the 16k-LP7 directory, the src/concat.sh script can be used to generate a headerless file of training samples.
```
cd 16k-LP7
sh /path/to/concat.sh
```

# Speech Material for Training Tacotron2
Although the model uses 55-dimensional features when training LPCNet, only 20 of those features are used as input when synthesizing audio. Enable the TACOTRON2 macro in the Makefile to extract the features used for training Tacotron2. Also note that, when training Tacotron2, you should generate an independent feature file for every audio clip, rather than concatenating all features into a single file as is done when training LPCNet.
```bash
#preprocessing
./header_removal.sh
make dump_data taco=1 # Define TACOTRON2 macro
./feature_extract.sh
```
```bash
#synthesis
make test_lpcnet taco=1 # Define TACOTRON2 macro
./test_lpcnet test_features.f32 test.s16
ffmpeg -f s16le -ar 16k -ac 1 -i test.s16 test-out.wav

```

# Reading Further

1. [LPCNet: DSP-Boosted Neural Speech Synthesis](https://people.xiph.org/~jm/demo/lpcnet/)
Expand Down
6 changes: 6 additions & 0 deletions feature_extract.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Extract one Tacotron2 feature file per utterance: run dump_data
# (built with taco=1 so the TACOTRON2 macro is defined) on every
# headerless .s16 file and write a matching .f32 feature file.
# NOTE(review): the previous header comment was copy-pasted from
# concat.sh and described concatenation, which this script does not do.
for i in mandarin_female/wavs/*.s16
do
    # Quote expansions so filenames containing spaces survive word splitting.
    ./dump_data -test "$i" "mandarin_female/feature_extract/${i##*/}.f32"
done
6 changes: 6 additions & 0 deletions header_removal.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Convert every source .wav into headerless 16 kHz mono signed 16-bit
# PCM (.s16) with sox, stripping the WAV header so dump_data can read
# the raw samples directly.
# NOTE(review): the previous header comment was copy-pasted from
# concat.sh; this script converts per-file, it does not concatenate.
for i in /data/dataset/mandarin_female/wavs/*.wav
do
    # Quote expansions so filenames containing spaces survive word splitting.
    sox "$i" -r 16000 -c 1 -t sw - > "mandarin_female/wavs/${i##*/}.s16"
done
9 changes: 8 additions & 1 deletion src/dump_data.c
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ int main(int argc, char **argv) {
float Ex[NB_BANDS], Ep[NB_BANDS];
float Exp[NB_BANDS];
float features[NB_FEATURES];
float taco_features[NB_BANDS+2];
float E=0;
int silent;
for (i=0;i<FRAME_SIZE;i++) x[i] = tmp[i];
Expand Down Expand Up @@ -335,7 +336,13 @@ int main(int argc, char **argv) {
}
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
compute_frame_features(st, X, P, Ex, Ep, Exp, features, x);
fwrite(features, sizeof(float), NB_FEATURES, ffeat);
//fwrite(features, sizeof(float), NB_FEATURES, ffeat);
for (i=0; i < NB_BANDS; i++) {
taco_features[i] = features[i];
}
taco_features[NB_BANDS]=features[36];
taco_features[NB_BANDS+1]=features[37];
fwrite(taco_features, sizeof(float), (NB_BANDS+2), ffeat);
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
if (fpcm) write_audio(st, pcm, noise_std, fpcm);
Expand Down
13 changes: 12 additions & 1 deletion src/test_lpcnet.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,24 @@ int main(int argc, char **argv) {
}

while (1) {
float in_features[NB_TOTAL_FEATURES];

float features[NB_FEATURES];
short pcm[FRAME_SIZE];

#ifndef TACOTRON2
float in_features[NB_TOTAL_FEATURES];
fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin);
if (feof(fin)) break;
RNN_COPY(features, in_features, NB_FEATURES);
RNN_CLEAR(&features[18], 18);
#else
float in_features[NB_BANDS+2];
fread(in_features, sizeof(features[0]), NB_BANDS+2, fin);
if (feof(fin)) break;
RNN_COPY(features, in_features, NB_BANDS);
RNN_CLEAR(&features[18], 18);
RNN_COPY(features+36, in_features+NB_BANDS, 2);
#endif
lpcnet_synthesize(net, pcm, features, FRAME_SIZE);
fwrite(pcm, sizeof(pcm[0]), FRAME_SIZE, fout);
}
Expand Down
3 changes: 3 additions & 0 deletions src/train_yl_lpc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Launch LPCNet training on the pre-extracted feature/data pair,
# pinning the job to a single GPU.
# Usage: ./train_yl_lpc.sh <gpu_index>
# Quote "$1" so an empty/whitespace argument is passed through intact.
CUDA_VISIBLE_DEVICES="$1" python train_lpcnet.py ../yl_features.f32 ../yl_data.u8
0 comments on commit 7f17e2b

Please sign in to comment.