
Initial commit
bshall committed Apr 19, 2020
0 parents commit 52e3493
Showing 20 changed files with 896 additions and 0 deletions.
116 changes: 116 additions & 0 deletions .gitignore
@@ -0,0 +1,116 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Pycharm project settings
.idea

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# Preprocessed data
*.npy

# Model checkpoints
checkpoints/

# Datasets
datasets/
107 changes: 107 additions & 0 deletions README.md
@@ -0,0 +1,107 @@
# VQ-VAE for Acoustic Unit Discovery and Voice Conversion

Train and evaluate models for the ZeroSpeech challenges.
Voice conversion samples can be found [here](https://bshall.github.io/ZeroSpeech/).
Pretrained weights for the 2019 English and surprise models can be found here.

<p align="center">
<img width="384" height="563" alt="VQ-VAE for Acoustic Unit Discovery"
src="https://raw.githubusercontent.com/bshall/ZeroSpeech/master/model.png">
</p>

# Quick Start

## Requirements

1. Ensure you have Python 3 and PyTorch 1.4 or greater.

2. Install [NVIDIA/apex](https://github.com/NVIDIA/apex) for mixed precision training.

3. Install pip dependencies:
```
pip install -r requirements.txt
```
4. For evaluation, install [bootphon/zerospeech2020](https://github.com/bootphon/zerospeech2020).
## Training
1. Download and extract the [ZeroSpeech2020 datasets](https://download.zerospeech.com/).
2. Preprocess audio and extract train/test log-Mel spectrograms:
```
python preprocess.py in_dir=/path/to/dataset dataset=[2019/english or 2019/surprise]
```
Note: `in_dir` must be the path to the `2019` folder.
For `dataset` choose between `2019/english` or `2019/surprise`.
Other datasets will be added in the future.
```
e.g. python preprocess.py in_dir=../datasets/2020/2019 dataset=2019/english
```
3. Train the model:
```
python train.py checkpoint_dir=path/to/checkpoint_dir dataset=[2019/english or 2019/surprise]
```
```
e.g. python train.py checkpoint_dir=checkpoints/2019english dataset=2019/english
```
## Evaluation
### Voice conversion
```
python convert.py checkpoint=path/to/checkpoint in_dir=path/to/wavs out_dir=path/to/out_dir synthesis_list=path/to/synthesis_list dataset=[2019/english or 2019/surprise]
```
Note: the `synthesis_list` is a `json` file:
```
[
[
"english/test/S002_0379088085",
"V002",
"V002_0379088085"
]
]
```
containing a list of entries with a) the path (relative to `in_dir`) of the source `wav` file;
b) the target speaker (see `datasets/2019/english/speakers.json` for a list of options);
and c) the target file name.
```
e.g. python convert.py checkpoint=checkpoints/2019english/model.ckpt-500000.pt in_dir=../datasets/2020/2019 out_dir=submission/2019/english/test synthesis_list=datasets/2019/english/synthesis.json dataset=2019/english
```
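When converting many utterances, the synthesis list can be generated with a short script rather than written by hand. A minimal sketch (the `pairs` data and the output-naming convention are illustrative, inferred from the example entry above, not code from the repo):

```python
import json

# Hypothetical (source utterance, target speaker) pairs; paths are relative
# to in_dir and target speakers come from speakers.json.
pairs = [
    ("english/test/S002_0379088085", "V002"),
]

# Name each output file as <target speaker>_<utterance id>.
synthesis_list = [
    [src, speaker, f"{speaker}_{src.split('_')[-1]}"]
    for src, speaker in pairs
]

with open("synthesis.json", "w") as f:
    json.dump(synthesis_list, f, indent=4)
```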
### ABX Score
1. Encode test data for evaluation:
```
python encode.py checkpoint=path/to/checkpoint out_dir=path/to/out_dir dataset=[2019/english or 2019/surprise]
```
```
e.g. python encode.py checkpoint=checkpoints/2019english/model.ckpt-500000.pt out_dir=submission/2019/english/test dataset=2019/english
```
2. Run ABX evaluation script (see [bootphon/zerospeech2020](https://github.com/bootphon/zerospeech2020)).
For example, the ABX score for the pretrained english model (available here) is:
```
{
"2019": {
"english": {
"scores": {
"abx": 14.043611615570672,
"bitrate": 412.2387509949519
},
"details_bitrate": {
"test": 412.2387509949519
},
"details_abx": {
"test": {
"cosine": 14.043611615570672,
"KL": 50.0,
"levenshtein": 35.927825062038984
}
}
}
}
}
```
9 changes: 9 additions & 0 deletions config/convert.yaml
@@ -0,0 +1,9 @@
defaults:
- dataset: 2019/english
- preprocessing: default
- model: default

synthesis_list: ???
in_dir: ???
out_dir: ???
checkpoint: ???
5 changes: 5 additions & 0 deletions config/dataset/2019/english.yaml
@@ -0,0 +1,5 @@
dataset:
dataset: 2019
language: english
path: 2019/english
n_speakers: 102
5 changes: 5 additions & 0 deletions config/dataset/2019/surprise.yaml
@@ -0,0 +1,5 @@
dataset:
dataset: 2019
language: surprise
path: 2019/surprise
n_speakers: 113
7 changes: 7 additions & 0 deletions config/encode.yaml
@@ -0,0 +1,7 @@
defaults:
- dataset: 2019/english
- preprocessing: default
- model: default

checkpoint: ???
out_dir: ???
21 changes: 21 additions & 0 deletions config/model/default.yaml
@@ -0,0 +1,21 @@
model:
encoder:
class: model.Encoder
params:
in_channels: ${preprocessing.n_mels}
channels: 768
n_embeddings: 512
embedding_dim: 64
jitter: 0.5
decoder:
class: model.Decoder
params:
in_channels: ${model.encoder.params.embedding_dim}
conditioning_channels: 128
n_speakers: ${dataset.n_speakers}
speaker_embedding_dim: 64
mu_embedding_dim: 256
rnn_channels: 896
fc_channels: 256
bits: ${preprocessing.bits}
hop_length: ${preprocessing.hop_length}
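Each `class`/`params` node above resolves to an object by importing the named class and passing the params as keyword arguments. A hypothetical helper sketching that mechanism (demonstrated with a standard-library class standing in for `model.Encoder`, whose code is not shown here):

```python
import importlib

def instantiate(spec, **overrides):
    # Resolve "package.ClassName" to the class object, then call it
    # with the config's params (plus any runtime overrides).
    module_name, class_name = spec["class"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(**{**spec["params"], **overrides})

# Demo with a stdlib class in place of model.Encoder:
spec = {"class": "fractions.Fraction", "params": {"numerator": 1, "denominator": 3}}
frac = instantiate(spec)
```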
5 changes: 5 additions & 0 deletions config/preprocessing.yaml
@@ -0,0 +1,5 @@
defaults:
- dataset: 2019/english
- preprocessing: default

in_dir: ???
10 changes: 10 additions & 0 deletions config/preprocessing/default.yaml
@@ -0,0 +1,10 @@
preprocessing:
sr: 16000
n_fft: 2048
n_mels: 80
fmin: 50
preemph: 0.97
top_db: 80
hop_length: 160
win_length: 400
bits: 8
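The parameters above imply 25 ms analysis windows (400 samples) with a 10 ms hop (160 samples) at 16 kHz. A small numpy sketch of the first preprocessing steps they describe (pre-emphasis and framing arithmetic only; the full log-Mel pipeline presumably uses a library such as librosa):

```python
import numpy as np

sr, hop_length, win_length, preemph = 16000, 160, 400, 0.97

wav = np.zeros(sr)  # one second of silent audio as a stand-in signal

# Pre-emphasis filter: y[t] = x[t] - 0.97 * x[t-1]
emphasized = np.append(wav[0], wav[1:] - preemph * wav[:-1])

# Number of full analysis windows that fit in one second of audio.
n_frames = 1 + (len(emphasized) - win_length) // hop_length
```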
8 changes: 8 additions & 0 deletions config/train.yaml
@@ -0,0 +1,8 @@
defaults:
- dataset: 2019/english
- preprocessing: default
- model: default
- training: default

resume: False
checkpoint_dir: ???
13 changes: 13 additions & 0 deletions config/training/default.yaml
@@ -0,0 +1,13 @@
training:
batch_size: 52
sample_frames: 32
n_steps: 500000
optimizer:
lr: 4e-4
scheduler:
milestones:
- 300000
- 400000
gamma: 0.5
checkpoint_interval: 20000
n_workers: 8
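The scheduler above halves the learning rate at 300k and 400k steps. A plain-Python sketch of the resulting schedule (mirroring PyTorch's `MultiStepLR` semantics; `lr_at` is an illustrative helper, not repo code):

```python
def lr_at(step, base_lr=4e-4, milestones=(300_000, 400_000), gamma=0.5):
    # lr is multiplied by gamma once for each milestone already reached.
    return base_lr * gamma ** sum(step >= m for m in milestones)
```

So training runs at 4e-4 until step 300k, at 2e-4 until 400k, and at 1e-4 for the remainder of the 500k steps.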