Training, evaluation, and inference of neural phonetic posteriorgrams (PPGs) in PyTorch
An inference-only installation with our best model is pip-installable
pip install ppgs
To perform training, install the training dependencies and FFmpeg.
pip install ppgs[train]
conda install -c conda-forge 'ffmpeg<5'
If you wish to use the Charsiu representation, download the code, install both inference and training dependencies, and install Charsiu as a Git submodule.
# Clone
git clone [email protected]:interactiveaudiolab/ppgs
cd ppgs/
# Install dependencies
pip install -e .[train]
conda install -c conda-forge 'ffmpeg<5'
# Download Charsiu
git submodule init
git submodule update
import ppgs
# Load speech audio at correct sample rate
audio = ppgs.load.audio(audio_file)
# Choose a GPU index to use for inference. Set to None to use the CPU.
gpu = 0
# Infer PPGs
ppg = ppgs.from_audio(audio, ppgs.SAMPLE_RATE, gpu=gpu)
def from_audio(
audio: torch.Tensor,
sample_rate: Union[int, float],
checkpoint: Optional[Union[str, bytes, os.PathLike]] = None,
gpu: Optional[int] = None
) -> torch.Tensor:
"""Infer ppgs from audio
Arguments
audio
Batched audio to process
shape=(batch, 1, samples)
sample_rate
Audio sampling rate
checkpoint
The checkpoint file
gpu
The index of the GPU to use for inference
Returns
ppgs
Phonetic posteriorgrams
shape=(batch, len(ppgs.PHONEMES), frames)
"""
def from_file(
file: Union[str, bytes, os.PathLike],
checkpoint: Optional[Union[str, bytes, os.PathLike]] = None,
gpu: Optional[int] = None
) -> torch.Tensor:
"""Infer ppgs from an audio file
Arguments
file
The audio file
checkpoint
The checkpoint file
gpu
The index of the GPU to use for inference
Returns
ppgs
Phonetic posteriorgram
shape=(len(ppgs.PHONEMES), frames)
"""
def from_file_to_file(
audio_file: Union[str, bytes, os.PathLike],
output_file: Union[str, bytes, os.PathLike],
checkpoint: Optional[Union[str, bytes, os.PathLike]] = None,
gpu: Optional[int] = None
) -> None:
"""Infer ppg from an audio file and save to a torch tensor file
Arguments
audio_file
The audio file
output_file
The .pt file to save PPGs
checkpoint
The checkpoint file
gpu
The index of the GPU to use for inference
"""
def from_files_to_files(
audio_files: List[Union[str, bytes, os.PathLike]],
output_files: List[Union[str, bytes, os.PathLike]],
checkpoint: Optional[Union[str, bytes, os.PathLike]] = None,
num_workers: int = ppgs.NUM_WORKERS,
gpu: Optional[int] = None,
max_frames: int = ppgs.MAX_INFERENCE_FRAMES
) -> None:
"""Infer ppgs from audio files and save to torch tensor files
Arguments
audio_files
The audio files
output_files
The .pt files to save PPGs
checkpoint
The checkpoint file
num_workers
Number of CPU threads for multiprocessing
gpu
The index of the GPU to use for inference
max_frames
The maximum number of frames on the GPU at once
"""
def from_paths_to_paths(
input_paths: List[Union[str, bytes, os.PathLike]],
output_paths: Optional[List[Union[str, bytes, os.PathLike]]] = None,
extensions: Optional[List[str]] = None,
checkpoint: Optional[Union[str, bytes, os.PathLike]] = None,
num_workers: int = ppgs.NUM_WORKERS,
gpu: Optional[int] = None,
max_frames: int = ppgs.MAX_INFERENCE_FRAMES
) -> None:
"""Infer ppgs from audio files and save to torch tensor files
Arguments
input_paths
Paths to audio files and/or directories
output_paths
The one-to-one corresponding outputs
extensions
Extensions to glob for in directories
checkpoint
The checkpoint file
num_workers
Number of CPU threads for multiprocessing
gpu
The index of the GPU to use for inference
max_frames
The maximum number of frames on the GPU at once
"""
usage: python -m ppgs
[-h]
[--input_paths INPUT_PATHS [INPUT_PATHS ...]]
[--output_paths OUTPUT_PATHS [OUTPUT_PATHS ...]]
[--extensions EXTENSIONS [EXTENSIONS ...]]
[--checkpoint CHECKPOINT]
[--num-workers NUM_WORKERS]
[--gpu GPU]
[--max-frames MAX_FRAMES]
arguments:
--input_paths INPUT_PATHS [INPUT_PATHS ...]
Paths to audio files and/or directories
optional arguments:
-h, --help
Show this help message and exit
--output_paths OUTPUT_PATHS [OUTPUT_PATHS ...]
The one-to-one corresponding output paths
--extensions EXTENSIONS [EXTENSIONS ...]
Extensions to glob for in directories
--checkpoint CHECKPOINT
The checkpoint file
--num-workers NUM_WORKERS
Number of CPU threads for multiprocessing
--gpu GPU
The index of the GPU to use for inference. Defaults to CPU.
--max-frames MAX_FRAMES
The maximum number of frames on the GPU at once
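For example, an equivalent command-line invocation (paths hypothetical):

python -m ppgs \
    --input_paths speech/ \
    --output_paths ppg/ \
    --extensions wav \
    --gpu 0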
To compute the proposed normalized Jensen-Shannon divergence pronunciation
distance between two PPGs, use ppgs.distance().
def distance(
ppgX: torch.Tensor,
ppgY: torch.Tensor,
reduction: str = 'mean',
normalize: bool = True
) -> torch.Tensor:
"""Compute the pronunciation distance between two aligned PPGs
Arguments
ppgX
Input PPG X
shape=(len(ppgs.PHONEMES), frames)
ppgY
Input PPG Y to compare with PPG X
shape=(len(ppgs.PHONEMES), frames)
reduction
Reduction to apply to the output. One of ['mean', 'none', 'sum'].
normalize
Apply similarity based normalization
Returns
Normalized Jensen-Shannon divergence between PPGs
"""
def interpolate(
ppgX: torch.Tensor,
ppgY: torch.Tensor,
interp: Union[float, torch.Tensor]
) -> torch.Tensor:
"""Spherical linear interpolation
Arguments
ppgX
Input PPG X
shape=(len(ppgs.PHONEMES), frames)
ppgY
Input PPG Y
shape=(len(ppgs.PHONEMES), frames)
interp
Interpolation values
scalar float OR shape=(frames,)
Returns
Interpolated PPGs
shape=(len(ppgs.PHONEMES), frames)
"""
import ppgs
# Get PPGs to edit
ppg = ppgs.from_file(audio_file, gpu=gpu)
# Constant-ratio time-stretching (slowing down)
grid = ppgs.edit.grid.constant(ppg, ratio=0.8)
slow = ppgs.edit.grid.sample(ppg, grid)
# Stretch to a desired length (e.g., 100 frames)
grid = ppgs.edit.grid.of_length(ppg, 100)
fixed = ppgs.edit.grid.sample(ppg, grid)
def constant(ppg: torch.Tensor, ratio: float) -> torch.Tensor:
"""Create a grid for constant-ratio time-stretching
Arguments
ppg
Input PPG
ratio
Time-stretching ratio; lower is slower
Returns
Constant-ratio grid for time-stretching ppg
"""
def from_alignments(
source: pypar.Alignment,
target: pypar.Alignment,
sample_rate: int = ppgs.SAMPLE_RATE,
hopsize: int = ppgs.HOPSIZE
) -> torch.Tensor:
"""Create time-stretch grid to convert source alignment to target
Arguments
source
Forced alignment of PPG to stretch
target
Forced alignment of target PPG
sample_rate
Audio sampling rate
hopsize
Hopsize in samples
Returns
Grid for time-stretching source PPG
"""
def of_length(ppg: torch.Tensor, length: int) -> torch.Tensor:
"""Create time-stretch grid to resample PPG to a specified length
Arguments
ppg
Input PPG
length
Target length
Returns
Grid of specified length for time-stretching ppg
"""
def sample(ppg: torch.Tensor, grid: torch.Tensor) -> torch.Tensor:
"""Grid-based PPG interpolation
Arguments
ppg
Input PPG
grid
Grid of desired length; each item is a float-valued index into ppg
Returns
Interpolated PPG
"""
def reallocate(
ppg: torch.Tensor,
source: str,
target: str,
value: Optional[float] = None
) -> torch.Tensor:
"""Reallocate probability from source phoneme to target phoneme
Arguments
ppg
Input PPG
shape=(len(ppgs.PHONEMES), frames)
source
Source phoneme
target
Target phoneme
value
Max amount to reallocate. If None, reallocates all probability.
Returns
Edited PPG
"""
def regex(
ppg: torch.Tensor,
source_phonemes: List[str],
target_phonemes: List[str]
) -> torch.Tensor:
"""Regex match and replace (via swap) for phoneme sequences
Arguments
ppg
Input PPG
shape=(len(ppgs.PHONEMES), frames)
source_phonemes
Source phoneme sequence
target_phonemes
Target phoneme sequence
Returns
Edited PPG
"""
def shift(ppg: torch.Tensor, phoneme: str, value: float) -> torch.Tensor:
"""Shift probability of a phoneme and reallocate proportionally
Arguments
ppg
Input PPG
shape=(len(ppgs.PHONEMES), frames)
phoneme
Input phoneme
value
Maximal shift amount
Returns
Edited PPG
"""
def swap(ppg: torch.Tensor, phonemeA: str, phonemeB: str) -> torch.Tensor:
"""Swap the probabilities of two phonemes
Arguments
ppg
Input PPG
shape=(len(ppgs.PHONEMES), frames)
phonemeA
Input phoneme A
phonemeB
Input phoneme B
Returns
Edited PPG
"""
Downloads, unzips, and formats datasets. Stores datasets in data/datasets/.
Stores formatted datasets in data/cache/.
N.B. Common Voice and TIMIT cannot be automatically downloaded. You must
manually download the tarballs and place them in data/sources/commonvoice
or data/sources/timit, respectively, prior to running the following.
python -m ppgs.data.download --datasets <datasets>
Prepares representations for training. Representations are stored in data/cache/.
python -m ppgs.data.preprocess \
--datasets <datasets> \
--representations <representations> \
--gpu <gpu> \
--num-workers <workers>
Partitions a dataset. You should not need to run this, as the partitions
used in our work are provided for each dataset in ppgs/assets/partitions/.
python -m ppgs.partition --datasets <datasets>
Trains a model. Checkpoints and logs are stored in runs/. You may want to
run accelerate config first to configure which devices are used for training.
CUDA_VISIBLE_DEVICES=<gpus> accelerate launch -m ppgs.train \
--config <config> \
--dataset <dataset>
If the config file has been previously run, the most recent checkpoint will automatically be loaded and training will resume from that checkpoint.
You can monitor training via tensorboard.
tensorboard --logdir runs/ --port <port> --load_fast true
To use the torchutil notification system to receive notifications for long
jobs (download, preprocess, train, and evaluate), set the
PYTORCH_NOTIFICATION_URL environment variable to a supported webhook as
explained in the Apprise documentation.
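For example, to route notifications to a (hypothetical) Slack webhook using Apprise's slack:// URL scheme:

export PYTORCH_NOTIFICATION_URL='slack://tokenA/tokenB/tokenC'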
Performs objective evaluation of phoneme accuracy. Results are stored in eval/.
python -m ppgs.evaluate \
--config <name> \
--datasets <datasets> \
--checkpoint <checkpoint> \
--gpus <gpus>
C. Churchwell, M. Morrison, and B. Pardo, "High-Fidelity Neural Phonetic Posteriorgrams," Submitted to ICASSP 2024, April 2024.
@inproceedings{churchwell2024high,
title={High-Fidelity Neural Phonetic Posteriorgrams},
author={Churchwell, Cameron and Morrison, Max and Pardo, Bryan},
booktitle={Submitted to ICASSP 2024},
month={April},
year={2024}
}