Skip to content

Commit

Permalink
update readme and prepare to work with Tacotron2
Browse files Browse the repository at this point in the history
  • Loading branch information
吴梦林 authored and 吴梦林 committed Jan 11, 2019
1 parent 37c151c commit 7f17e2b
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 3 deletions.
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

CC=gcc
CFLAGS+=-Wall -W -Wextra -Wno-unused-function -O3 -g -I../include
ifeq ($(taco),1)
CFLAGS += -DTACOTRON2
endif


AVX2:=$(shell cat /proc/cpuinfo | grep -c avx2)
AVX:=$(shell cat /proc/cpuinfo | grep -c avx)
Expand Down
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# LPCNet



Low complexity implementation of the WaveRNN-based LPCNet algorithm, as described in:

J.-M. Valin, J. Skoglund, [LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://jmvalin.ca/papers/lpcnet_icassp2019.pdf), *Submitted for ICASSP 2019*, arXiv:1810.11846.
Expand All @@ -12,6 +14,8 @@ The BSD licensed software is written in C and Python/Keras. For training, a GTX

This software is an open source starting point for WaveRNN-based speech synthesis and coding.

__NOTE__: This repository aims to work with Tacotron2, using LPCNet as the vocoder.

# Quickstart

1. Set up a Keras system with GPU.
Expand Down Expand Up @@ -47,16 +51,33 @@ This software is an open source starting point for WaveRNN-based speech synthesi
make test_lpcnet
./dump_data -test test_input.s16 test_features.f32
./test_lpcnet test_features.f32 test.s16
ffmpeg -f s16le -ar 16k -ac 1 -i test.s16 test-out.wav
```

# Speech Material for Training
# Speech Material for Training LPCNet

Suitable training material can be obtained from the [McGill University Telecommunications & Signal Processing Laboratory](http://www-mmsp.ece.mcgill.ca/Documents/Data/). Download the ISO and extract the 16k-LP7 directory, the src/concat.sh script can be used to generate a headerless file of training samples.
```
cd 16k-LP7
sh /path/to/concat.sh
```

# Speech Material for Training Tacotron2
Although the model uses 55-dimensional features when training LPCNet, only 20 of those features are used as input when synthesizing audio. Enable the TACOTRON2 macro in the Makefile to extract the features used for training Tacotron2. Also note that, when training Tacotron2, you should generate an independent feature file for every audio clip, rather than concatenating all features into a single file as is done when training LPCNet.
```bash
#preprocessing
./header_removal.sh
make dump_data taco=1 # Define TACOTRON2 macro
./feature_extract.sh
```
```bash
#synthesis
make test_lpcnet taco=1 # Define TACOTRON2 macro
./test_lpcnet test_features.f32 test.s16
ffmpeg -f s16le -ar 16k -ac 1 -i test.s16 test-out.wav

```

# Reading Further

1. [LPCNet: DSP-Boosted Neural Speech Synthesis](https://people.xiph.org/~jm/demo/lpcnet/)
Expand Down
6 changes: 6 additions & 0 deletions feature_extract.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Extract one Tacotron2 feature file per utterance: run dump_data
# (built with taco=1 so the TACOTRON2 macro is defined) on every
# headerless .s16 file and write a matching .f32 feature file.
# NOTE(review): the previous header comment was copy-pasted from
# concat.sh and described concatenation, which this script does not do.
for i in mandarin_female/wavs/*.s16
do
    # Quote expansions so filenames containing spaces survive word splitting.
    ./dump_data -test "$i" "mandarin_female/feature_extract/${i##*/}.f32"
done
6 changes: 6 additions & 0 deletions header_removal.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/sh
# Convert every source .wav into headerless 16 kHz mono signed 16-bit
# PCM (.s16) with sox, stripping the WAV header so dump_data can read
# the raw samples directly.
# NOTE(review): the previous header comment was copy-pasted from
# concat.sh; this script converts per-file, it does not concatenate.
for i in /data/dataset/mandarin_female/wavs/*.wav
do
    # Quote expansions so filenames containing spaces survive word splitting.
    sox "$i" -r 16000 -c 1 -t sw - > "mandarin_female/wavs/${i##*/}.s16"
done
9 changes: 8 additions & 1 deletion src/dump_data.c
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ int main(int argc, char **argv) {
float Ex[NB_BANDS], Ep[NB_BANDS];
float Exp[NB_BANDS];
float features[NB_FEATURES];
float taco_features[NB_BANDS+2];
float E=0;
int silent;
for (i=0;i<FRAME_SIZE;i++) x[i] = tmp[i];
Expand Down Expand Up @@ -335,7 +336,13 @@ int main(int argc, char **argv) {
}
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
compute_frame_features(st, X, P, Ex, Ep, Exp, features, x);
fwrite(features, sizeof(float), NB_FEATURES, ffeat);
//fwrite(features, sizeof(float), NB_FEATURES, ffeat);
for (i=0; i < NB_BANDS; i++) {
taco_features[i] = features[i];
}
taco_features[NB_BANDS]=features[36];
taco_features[NB_BANDS+1]=features[37];
fwrite(taco_features, sizeof(float), (NB_BANDS+2), ffeat);
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
if (fpcm) write_audio(st, pcm, noise_std, fpcm);
Expand Down
13 changes: 12 additions & 1 deletion src/test_lpcnet.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,24 @@ int main(int argc, char **argv) {
}

while (1) {
float in_features[NB_TOTAL_FEATURES];

float features[NB_FEATURES];
short pcm[FRAME_SIZE];

#ifndef TACOTRON2
float in_features[NB_TOTAL_FEATURES];
fread(in_features, sizeof(features[0]), NB_TOTAL_FEATURES, fin);
if (feof(fin)) break;
RNN_COPY(features, in_features, NB_FEATURES);
RNN_CLEAR(&features[18], 18);
#else
float in_features[NB_BANDS+2];
fread(in_features, sizeof(features[0]), NB_BANDS+2, fin);
if (feof(fin)) break;
RNN_COPY(features, in_features, NB_BANDS);
RNN_CLEAR(&features[18], 18);
RNN_COPY(features+36, in_features+NB_BANDS, 2);
#endif
lpcnet_synthesize(net, pcm, features, FRAME_SIZE);
fwrite(pcm, sizeof(pcm[0]), FRAME_SIZE, fout);
}
Expand Down
3 changes: 3 additions & 0 deletions src/train_yl_lpc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Launch LPCNet training on the pre-extracted feature/data pair,
# pinning the job to a single GPU.
# Usage: ./train_yl_lpc.sh <gpu_index>
# Quote "$1" so an empty/whitespace argument is passed through intact.
CUDA_VISIBLE_DEVICES="$1" python train_lpcnet.py ../yl_features.f32 ../yl_data.u8
0 comments on commit 7f17e2b

Please sign in to comment.