Skip to content

Commit

Permalink
Gradient clipping experiments
Browse files · Browse the repository at this point in the history
Former-commit-id: 71264d6
Former-commit-id: 1a2f69f
  • Loading branch information
maxrmorrison committed Dec 22, 2023
1 parent 1345e5d commit 887457d
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 11 deletions.
30 changes: 30 additions & 0 deletions config/w2v2fb-buckets1-050000-7layer-2048channel-clipinf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Module being configured
MODULE = 'ppgs'

# Name of this configuration
CONFIG = 'w2v2fb-buckets1-050000-7layer-2048channel-clipinf'

# Partition training examples into this many buckets to reduce padding
BUCKETS = 1

# Gradient clipping strategy; valid choices are
# 'autoclip', 'inf', 'l1', 'l2', and 'skip'
GRADIENT_CLIPPING_METHOD = 'inf'

# Threshold at which gradients are clipped
# (interpreted as a percentile when the method is autoclip)
GRADIENT_CLIPPING_THRESHOLD = 10000

# Width of the network's hidden layers
HIDDEN_CHANNELS = 2048

# Number of channels in the input representation
INPUT_CHANNELS = 768

# Upper bound on the number of frames in one training batch
MAX_TRAINING_FRAMES = 50000

# Depth of the network (number of hidden layers)
NUM_HIDDEN_LAYERS = 7

# Name of the input representation
REPRESENTATION = 'w2v2fb'
30 changes: 30 additions & 0 deletions config/w2v2fb-buckets1-100000-6layer-1024channel-autoclip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Module being configured
MODULE = 'ppgs'

# Name of this configuration
CONFIG = 'w2v2fb-buckets1-100000-6layer-1024channel-autoclip'

# Partition training examples into this many buckets to reduce padding
BUCKETS = 1

# Gradient clipping strategy; valid choices are
# 'autoclip', 'inf', 'l1', 'l2', and 'skip'
GRADIENT_CLIPPING_METHOD = 'autoclip'

# Threshold at which gradients are clipped
# (interpreted as a percentile when the method is autoclip)
GRADIENT_CLIPPING_THRESHOLD = .9

# Width of the network's hidden layers
HIDDEN_CHANNELS = 1024

# Number of channels in the input representation
INPUT_CHANNELS = 768

# Upper bound on the number of frames in one training batch
MAX_TRAINING_FRAMES = 100000

# Depth of the network (number of hidden layers)
NUM_HIDDEN_LAYERS = 6

# Name of the input representation
REPRESENTATION = 'w2v2fb'
30 changes: 30 additions & 0 deletions config/w2v2fb-buckets1-100000-6layer-1024channel-clipskip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Module being configured
MODULE = 'ppgs'

# Name of this configuration
CONFIG = 'w2v2fb-buckets1-100000-6layer-1024channel-clipskip'

# Partition training examples into this many buckets to reduce padding
BUCKETS = 1

# Gradient clipping strategy; valid choices are
# 'autoclip', 'inf', 'l1', 'l2', and 'skip'
GRADIENT_CLIPPING_METHOD = 'skip'

# Threshold at which gradients are clipped
# (interpreted as a percentile when the method is autoclip)
GRADIENT_CLIPPING_THRESHOLD = 10000

# Width of the network's hidden layers
HIDDEN_CHANNELS = 1024

# Number of channels in the input representation
INPUT_CHANNELS = 768

# Upper bound on the number of frames in one training batch
MAX_TRAINING_FRAMES = 100000

# Depth of the network (number of hidden layers)
NUM_HIDDEN_LAYERS = 6

# Name of the input representation
REPRESENTATION = 'w2v2fb'
30 changes: 30 additions & 0 deletions config/w2v2fb-buckets1-150000-5layer-512channel-clipinf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Module being configured
MODULE = 'ppgs'

# Name of this configuration
CONFIG = 'w2v2fb-buckets1-150000-5layer-512channel-clipinf'

# Partition training examples into this many buckets to reduce padding
BUCKETS = 1

# Gradient clipping strategy; valid choices are
# 'autoclip', 'inf', 'l1', 'l2', and 'skip'
GRADIENT_CLIPPING_METHOD = 'inf'

# Threshold at which gradients are clipped
# (interpreted as a percentile when the method is autoclip)
GRADIENT_CLIPPING_THRESHOLD = 10000

# Width of the network's hidden layers
HIDDEN_CHANNELS = 512

# Number of channels in the input representation
INPUT_CHANNELS = 768

# Upper bound on the number of frames in one training batch
MAX_TRAINING_FRAMES = 150000

# Depth of the network (number of hidden layers)
NUM_HIDDEN_LAYERS = 5

# Name of the input representation
REPRESENTATION = 'w2v2fb'
2 changes: 1 addition & 1 deletion ppgs/config/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@
# NUM_WORKERS = int(os.cpu_count() / max(1, len(GPUtil.getGPUs())))
# except ValueError:
# NUM_WORKERS = os.cpu_count()
NUM_WORKERS = 12
NUM_WORKERS = 8

# Seed for all random number generators
RANDOM_SEED = 1234
Expand Down
20 changes: 10 additions & 10 deletions ppgs/train/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

# import accelerate
import matplotlib
import numpy as np
import torch
import torchutil

Expand Down Expand Up @@ -151,7 +152,7 @@ def train(dataset, directory=ppgs.RUNS_DIR / ppgs.CONFIG, gpu=None):
if p.grad is not None:
max_grad = max(max_grad, p.grad.data.max())
total_norm += p.grad.data.norm(2)
total_norm = total_norm ** (1. / 2)
total_norm = (total_norm ** (1. / 2)).item()
torchutil.tensorboard.update(
directory,
step,
Expand All @@ -163,7 +164,12 @@ def train(dataset, directory=ppgs.RUNS_DIR / ppgs.CONFIG, gpu=None):
if ppgs.GRADIENT_CLIPPING_METHOD is not None:

# Just skip the update
if ppgs.GRADIENT_CLIPPING_METHOD == 'skip':
if (
ppgs.GRADIENT_CLIPPING_METHOD == 'skip' and
max_grad > ppgs.GRADIENT_CLIPPING_THRESHOLD and
step > 1000
):
print(f'{max_grad} exceeds threshold of {ppgs.GRADIENT_CLIPPING_THRESHOLD}. Skipping.')
continue

# Unscale gradients
Expand Down Expand Up @@ -214,17 +220,11 @@ def train(dataset, directory=ppgs.RUNS_DIR / ppgs.CONFIG, gpu=None):

# Log VRAM utilization
# index = accelerator.device.index
index = device.index
print(torch.cuda.memory_summary(index))
scalars = {
'max_allocated (GB)': torch.cuda.max_memory_allocated(
index) / (1024 ** 3),
'max_reserved (GB)': torch.cuda.max_memory_reserved(
index) / (1024 ** 3)}
print(torch.cuda.memory_summary(device.index))
torchutil.tensorboard.update(
directory,
step,
scalars=scalars)
scalars=torchutil.cuda.utilization(device, 'MB'))

# Clear cache to make space for evaluation tensors
del train_loss
Expand Down

0 comments on commit 887457d

Please sign in to comment.