diff --git a/litgpt/utils.py b/litgpt/utils.py
index 9b580d6146..e964564da7 100644
--- a/litgpt/utils.py
+++ b/litgpt/utils.py
@@ -39,7 +39,7 @@ def num_parameters(module: nn.Module, requires_grad: Optional[bool] = None) -> i
         if requires_grad is None or p.requires_grad == requires_grad:
             if hasattr(p, "quant_state"):
                 # bitsandbytes 4bit layer support
-                total += math.prod(p.quant_state[1])
+                total += math.prod(p.quant_state.shape)
             else:
                 total += p.numel()
     return total
diff --git a/pyproject.toml b/pyproject.toml
index f13f193b80..d827b2f9c0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ license = { file = "LICENSE" }
 
 dependencies = [
     "torch>=2.2.0",
-    "lightning @ git+https://github.com/Lightning-AI/lightning@b19c3a961c79028d7c39a4f1ff1c2df991406d1d",
+    "lightning @ git+https://github.com/Lightning-AI/lightning@75553845c6bbcc305fbae38a46ef4e532e4ac85a",
     # TODO: install from PyPI when https://github.com/omni-us/jsonargparse/pull/466 is released
     "jsonargparse[signatures] @ git+https://github.com/omni-us/jsonargparse",
 ]
@@ -32,8 +32,7 @@ test = [
     "protobuf",
 ]
 all = [
-    "bitsandbytes==0.41.0",  # quantization
-    "scipy",  # required by bitsandbytes
+    "bitsandbytes==0.42.0",  # quantization
     "sentencepiece",  # llama-based models
     "tokenizers",  # pythia, falcon, redpajama
     "datasets",  # eval
diff --git a/tests/test_model.py b/tests/test_model.py
index 2919e51fe2..dd4e249d8c 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -1,6 +1,7 @@
 # Copyright Lightning AI. Licensed under the Apache License 2.0, see LICENSE file.
 
 import sys
+from copy import deepcopy
 from functools import partial
 from pathlib import Path
 from urllib.request import urlretrieve
@@ -698,7 +699,7 @@ def test_model_kv_cache_amp():
 
 
 @RunIf(min_cuda_gpus=1)
-@pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs])
+@pytest.mark.parametrize("config", deepcopy(config_module.configs), ids=[c["name"] for c in config_module.configs])
 @torch.inference_mode()
 def test_sdpa_choice(config):
     from torch.backends.cuda import (
@@ -754,7 +755,7 @@ def assert_sdpa_backend(original_fn, q, k, v, mask):
 
 
 @RunIf(min_cuda_gpus=1)
-@pytest.mark.parametrize("config", config_module.configs, ids=[c["name"] for c in config_module.configs])
+@pytest.mark.parametrize("config", deepcopy(config_module.configs), ids=[c["name"] for c in config_module.configs])
 @torch.inference_mode()
 def test_sdpa_choice_kv_cache(config):
     from torch.backends.cuda import (
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 837ad9c737..ac92ffcab4 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -158,7 +158,6 @@ def test_num_parameters():
 
 @RunIf(min_cuda_gpus=1)
 @pytest.mark.parametrize("mode", ["nf4", "nf4-dq", "fp4", "fp4-dq", "int8", "int8-training"])
-@pytest.mark.skip("To be fixed")
 def test_num_parameters_bitsandbytes(mode):
     from lightning.fabric.plugins import BitsandbytesPrecision
diff --git a/tutorials/quantize.md b/tutorials/quantize.md
index 9292711347..6a823f2f02 100644
--- a/tutorials/quantize.md
+++ b/tutorials/quantize.md
@@ -46,7 +46,7 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check
 Uses the normalized float 4 (nf4) data type. This is recommended over "fp4" based on the paper's experimental results and theoretical analysis.
 
 ```bash
-pip install scipy bitsandbytes  # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes
 litgpt generate base --quantize bnb.nf4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256
 ...
@@ -62,7 +62,7 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check
 In average, this amounts to about 0.37 bits per parameter (approximately 3 GB for a 65B model).
 
 ```bash
-pip install scipy bitsandbytes  # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes
 litgpt generate base --quantize bnb.nf4-dq --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256
 ...
@@ -77,7 +77,7 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check
 Uses pure FP4 quantization.
 
 ```bash
-pip install scipy bitsandbytes  # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes
 litgpt generate base --quantize bnb.fp4 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256
 ...
@@ -93,7 +93,7 @@ Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check
 In average, this amounts to about 0.37 bits per parameter (approximately 3 GB for a 65B model).
 
 ```bash
-pip install scipy bitsandbytes  # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes
 litgpt generate base --quantize bnb.fp4-dq --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision bf16-true --max_new_tokens 256
 ...
@@ -106,7 +106,7 @@ Memory used: 5.38 GB
 Enabled with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). Check out the [paper](https://arxiv.org/abs/2110.02861) to learn more about how it works.
 
 ```bash
-pip install scipy bitsandbytes  # scipy is required until https://github.com/TimDettmers/bitsandbytes/pull/525 is released
+pip install bitsandbytes
 litgpt generate base --quantize bnb.int8 --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision 16-true --max_new_tokens 256
 ...
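Note on the `litgpt/utils.py` hunk: bitsandbytes 0.42 replaced the old tuple-style `quant_state` (where index 1 held the original tensor shape) with a `QuantState` object that exposes the shape as an attribute, which is why `math.prod(p.quant_state[1])` becomes `math.prod(p.quant_state.shape)`. Below is a minimal sketch of the counting logic for illustration only; the `count_params` name is hypothetical, the 4-bit branch assumes bitsandbytes >= 0.42 on a CUDA device, and the CPU-only usage line exercises just the unquantized path.

```python
import math

import torch.nn as nn


def count_params(module: nn.Module) -> int:
    """Hypothetical sketch mirroring the num_parameters() logic in litgpt/utils.py."""
    total = 0
    for p in module.parameters():
        if hasattr(p, "quant_state"):
            # bitsandbytes 4-bit params are stored packed, so p.numel() undercounts;
            # with bitsandbytes >= 0.42, quant_state is a QuantState object whose
            # .shape attribute holds the original (unpacked) tensor shape.
            total += math.prod(p.quant_state.shape)
        else:
            total += p.numel()
    return total


# CPU-only usage example (no quantized layers, so only the else-branch runs):
print(count_params(nn.Linear(4, 8)))  # 32 weights + 8 biases = 40
```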