Skip to content

Commit

Permalink
Gradient clipping experiments
Browse files · Browse the repository at this point in the history
Former-commit-id: 71264d6
Former-commit-id: 1a2f69f
  • Loading branch information
maxrmorrison committed Dec 22, 2023
1 parent 1345e5d commit 887457d
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 11 deletions.
30 changes: 30 additions & 0 deletions config/w2v2fb-buckets1-050000-7layer-2048channel-clipinf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Module being configured
MODULE = 'ppgs'

# Name of this configuration
CONFIG = 'w2v2fb-buckets1-050000-7layer-2048channel-clipinf'

# Partition training examples into this many buckets to reduce padding
BUCKETS = 1

# Gradient clipping strategy; valid choices are
# 'autoclip', 'inf', 'l1', 'l2', and 'skip'
GRADIENT_CLIPPING_METHOD = 'inf'

# Threshold at which gradients are clipped
# (interpreted as a percentile when the method is autoclip)
GRADIENT_CLIPPING_THRESHOLD = 10000

# Width of the network's hidden layers
HIDDEN_CHANNELS = 2048

# Number of channels in the input representation
INPUT_CHANNELS = 768

# Upper bound on the number of frames in one training batch
MAX_TRAINING_FRAMES = 50000

# Depth of the network (number of hidden layers)
NUM_HIDDEN_LAYERS = 7

# Name of the input representation
REPRESENTATION = 'w2v2fb'
30 changes: 30 additions & 0 deletions config/w2v2fb-buckets1-100000-6layer-1024channel-autoclip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Module being configured
MODULE = 'ppgs'

# Name of this configuration
CONFIG = 'w2v2fb-buckets1-100000-6layer-1024channel-autoclip'

# Partition training examples into this many buckets to reduce padding
BUCKETS = 1

# Gradient clipping strategy; valid choices are
# 'autoclip', 'inf', 'l1', 'l2', and 'skip'
GRADIENT_CLIPPING_METHOD = 'autoclip'

# Threshold at which gradients are clipped
# (interpreted as a percentile when the method is autoclip)
GRADIENT_CLIPPING_THRESHOLD = .9

# Width of the network's hidden layers
HIDDEN_CHANNELS = 1024

# Number of channels in the input representation
INPUT_CHANNELS = 768

# Upper bound on the number of frames in one training batch
MAX_TRAINING_FRAMES = 100000

# Depth of the network (number of hidden layers)
NUM_HIDDEN_LAYERS = 6

# Name of the input representation
REPRESENTATION = 'w2v2fb'
30 changes: 30 additions & 0 deletions config/w2v2fb-buckets1-100000-6layer-1024channel-clipskip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Module being configured
MODULE = 'ppgs'

# Name of this configuration
CONFIG = 'w2v2fb-buckets1-100000-6layer-1024channel-clipskip'

# Partition training examples into this many buckets to reduce padding
BUCKETS = 1

# Gradient clipping strategy; valid choices are
# 'autoclip', 'inf', 'l1', 'l2', and 'skip'
GRADIENT_CLIPPING_METHOD = 'skip'

# Threshold at which gradients are clipped
# (interpreted as a percentile when the method is autoclip)
GRADIENT_CLIPPING_THRESHOLD = 10000

# Width of the network's hidden layers
HIDDEN_CHANNELS = 1024

# Number of channels in the input representation
INPUT_CHANNELS = 768

# Upper bound on the number of frames in one training batch
MAX_TRAINING_FRAMES = 100000

# Depth of the network (number of hidden layers)
NUM_HIDDEN_LAYERS = 6

# Name of the input representation
REPRESENTATION = 'w2v2fb'
30 changes: 30 additions & 0 deletions config/w2v2fb-buckets1-150000-5layer-512channel-clipinf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Module being configured
MODULE = 'ppgs'

# Name of this configuration
CONFIG = 'w2v2fb-buckets1-150000-5layer-512channel-clipinf'

# Partition training examples into this many buckets to reduce padding
BUCKETS = 1

# Gradient clipping strategy; valid choices are
# 'autoclip', 'inf', 'l1', 'l2', and 'skip'
GRADIENT_CLIPPING_METHOD = 'inf'

# Threshold at which gradients are clipped
# (interpreted as a percentile when the method is autoclip)
GRADIENT_CLIPPING_THRESHOLD = 10000

# Width of the network's hidden layers
HIDDEN_CHANNELS = 512

# Number of channels in the input representation
INPUT_CHANNELS = 768

# Upper bound on the number of frames in one training batch
MAX_TRAINING_FRAMES = 150000

# Depth of the network (number of hidden layers)
NUM_HIDDEN_LAYERS = 5

# Name of the input representation
REPRESENTATION = 'w2v2fb'
2 changes: 1 addition & 1 deletion ppgs/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@
# NUM_WORKERS = int(os.cpu_count() / max(1, len(GPUtil.getGPUs())))
# except ValueError:
# NUM_WORKERS = os.cpu_count()
NUM_WORKERS = 12
NUM_WORKERS = 8

# Seed for all random number generators
RANDOM_SEED = 1234
Expand Down
20 changes: 10 additions & 10 deletions ppgs/train/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

# import accelerate
import matplotlib
import numpy as np
import torch
import torchutil

Expand Down Expand Up @@ -151,7 +152,7 @@ def train(dataset, directory=ppgs.RUNS_DIR / ppgs.CONFIG, gpu=None):
if p.grad is not None:
max_grad = max(max_grad, p.grad.data.max())
total_norm += p.grad.data.norm(2)
total_norm = total_norm ** (1. / 2)
total_norm = (total_norm ** (1. / 2)).item()
torchutil.tensorboard.update(
directory,
step,
Expand All @@ -163,7 +164,12 @@ def train(dataset, directory=ppgs.RUNS_DIR / ppgs.CONFIG, gpu=None):
if ppgs.GRADIENT_CLIPPING_METHOD is not None:

# Just skip the update
if ppgs.GRADIENT_CLIPPING_METHOD == 'skip':
if (
ppgs.GRADIENT_CLIPPING_METHOD == 'skip' and
max_grad > ppgs.GRADIENT_CLIPPING_THRESHOLD and
step > 1000
):
print(f'{max_grad} exceeds threshold of {ppgs.GRADIENT_CLIPPING_THRESHOLD}. Skipping.')
continue

# Unscale gradients
Expand Down Expand Up @@ -214,17 +220,11 @@ def train(dataset, directory=ppgs.RUNS_DIR / ppgs.CONFIG, gpu=None):

# Log VRAM utilization
# index = accelerator.device.index
index = device.index
print(torch.cuda.memory_summary(index))
scalars = {
'max_allocated (GB)': torch.cuda.max_memory_allocated(
index) / (1024 ** 3),
'max_reserved (GB)': torch.cuda.max_memory_reserved(
index) / (1024 ** 3)}
print(torch.cuda.memory_summary(device.index))
torchutil.tensorboard.update(
directory,
step,
scalars=scalars)
scalars=torchutil.cuda.utilization(device, 'MB'))

# Clear cache to make space for evaluation tensors
del train_loss
Expand Down

0 comments on commit 887457d

Please sign in to comment.