tests/test_batch.py

import torch
import pytest
import warnings
from pathlib import Path
import lightning as L
import litgpt
from litgpt.generate.base import (
    next_token,
    batched_next_token,
    batched_generate_fn,
    generate_fn,
)
from litgpt.api import LLM, GPT
from litgpt.scripts.download import download_from_hub
from tests.conftest import RunIf


warnings.filterwarnings("ignore")


def create_llm(tmp_path, batch_size, max_seq_length, device) -> tuple[LLM, GPT]:

    L.seed_everything(42)

    model_name = "microsoft/phi-2"
    download_from_hub(repo_id=model_name, tokenizer_only=True, checkpoint_dir=tmp_path)

    llm: LLM = LLM.load(
        model_name,
        tokenizer_dir=Path(tmp_path / model_name),
        init="random",
    )
    model: GPT = llm.model
    model.set_kv_cache(
        batch_size=batch_size, max_seq_length=max_seq_length, device=device
    )

    return llm, model


@pytest.mark.skipif(not torch.cuda.is_available(), reason="Test requires a GPU.")
def test_batched_equivalence(tmp_path):

    model_name = "microsoft/phi-2"
    download_from_hub(repo_id=model_name, tokenizer_only=True, checkpoint_dir=tmp_path)

    device = "cuda:0"
    batch_size = 3
    sample_kwargs = {"top_k": 1}

    llm: LLM = LLM.load(
        model_name,
        tokenizer_dir=Path(tmp_path / model_name),
        init="random",
    )
    model: GPT = llm.model
    model.set_kv_cache(batch_size=1, max_seq_length=50, device=device)

    input_pos_1 = torch.tensor(
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=torch.int64, device=device
    )
    input_pos_2 = torch.tensor([10], dtype=torch.int64, device=device)

    x = torch.tensor(
        [43993, 25, 1867, 466, 32660, 17485, 4483, 30, 198, 26410],
        device=device,
        dtype=torch.int64,
    )

    batch_x1 = torch.stack([x] * batch_size, dim=0)

    # Single token generation baseline
    tok_1 = next_token(model, input_pos_1, x.unsqueeze(0), **sample_kwargs)
    tok_2 = next_token(model, input_pos_2, tok_1.unsqueeze(0), **sample_kwargs)

    assert tok_1.ndim == 1
    assert tok_2.ndim == 1
    assert tok_1.size(0) == 1
    assert tok_2.size(0) == 1

    # Switch to batched generation
    model.clear_kv_cache()
    model.set_kv_cache(batch_size=batch_size, max_seq_length=50, device="cuda:0")

    toks_1: torch.Tensor = batched_next_token(
        model, input_pos_1, batch_x1, sample_kwargs
    )
    toks_2: torch.Tensor = batched_next_token(model, input_pos_2, toks_1, sample_kwargs)

    assert toks_1.ndim == 2
    assert toks_2.ndim == 2
    assert toks_1.size(0) == batch_size
    assert toks_2.size(0) == batch_size

    # Assert that single and batched next token generation are equivalent
    assert all(t == tok_1 for t in toks_1), f"{tok_1} != {toks_1}"
    assert all(t == tok_2 for t in toks_2), f"{tok_2} != {toks_2}"


@RunIf(min_cuda_gpus=1)
def test_simple_batch():
    old_allow_tf32 = torch.backends.cuda.matmul.allow_tf32
    torch.backends.cuda.matmul.allow_tf32 = False
    config = litgpt.Config.from_name(
        "microsoft/phi-2", padded_vocab_size=10000, n_layer=2, n_head=8, n_embd=256
    )
    with torch.device("cuda"):
        m = litgpt.GPT(config).requires_grad_(False).eval()
        x0 = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 7]])
        input_pos0 = torch.tensor([[0, 1, 2, 3], [0, 1, 2, 2]])
        x1 = torch.tensor([[1], [2]])
        input_pos1 = torch.tensor([[4], [3]])

    with torch.device("cuda"):
        m.set_kv_cache(2)
    outs0 = m(x0, input_pos0)
    outs1 = m(x1, input_pos1)

    with torch.device("cuda"):
        m.set_kv_cache(1)

    outs0_ref0 = m(x0[:1], input_pos0[0])
    outs1_ref0 = m(x1[:1], input_pos1[0])

    with torch.device("cuda"):
        m.set_kv_cache(1)

    outs0_ref1 = m(x0[1:], input_pos0[1])
    outs1_ref1 = m(x1[1:], input_pos1[1])

    outs0_ref = torch.cat([outs0_ref0, outs0_ref1])
    outs1_ref = torch.cat([outs1_ref0, outs1_ref1])

    print(outs0_ref - outs0)
    print(outs0.shape)
    torch.testing.assert_close(outs0, outs0_ref)
    torch.testing.assert_close(outs1, outs1_ref)
    torch.backends.cuda.matmul.allow_tf32 = old_allow_tf32


@RunIf(min_cuda_gpus=1)
def test_batch_generate(tmp_path):

    torch.use_deterministic_algorithms(True)

    device = "cuda:0"
    batch_size = 3
    sample_kwargs = {"top_k": 1}
    llm, model = create_llm(tmp_path, batch_size, 50, device)

    batch_x = torch.tensor(
        [
            [43993, 25, 1867, 466, 32660, 17485, 4483, 30, 198, 26410],
            [25, 1867, 466, 32660, 17485, 4483, 30, 198, 26410, 7596],
            [1867, 466, 32660, 17485, 4483, 30, 198, 26410, 7596, 7596],
        ],
        device=device,
        dtype=torch.int64,
    )

    # Generate tokens
    tokens = []
    for l in batched_generate_fn(
        model,
        prompts=batch_x,
        max_returned_tokens=50,
        sample_args=sample_kwargs,
        include_prompt=True,
        include_eos=False,
    ):
        tokens.append([t.item() if t is not None else None for t in l])

    def find_unique_stop(triplets):
        # Initialize a dictionary to count all number occurrences
        number_count = {}

        # Count occurrences of each number across all positions
        for triplet in triplets:
            for num in triplet:
                number_count[num] = number_count.get(num, 0) + 1

        # Initialize lists to store unique numbers for each position
        unique_first = []
        unique_second = []
        unique_third = []

        # Check each triplet
        for a, b, c in triplets:
            if number_count[a] == 1:
                unique_first.append(a)
            if number_count[b] == 1:
                unique_second.append(b)
            if number_count[c] == 1:
                unique_third.append(c)

        import random  # Seeded earlier

        random.shuffle(unique_first)
        random.shuffle(unique_second)
        random.shuffle(unique_third)
        return [unique_first[0], unique_second[0], unique_third[0]]

    # Now that we know the randomly generated tokens, sample some tokens to stop each stream at.
    stops = find_unique_stop(tokens[batch_x.size(1) :])
    first_stream = [t[0] for t in tokens if t[0] is not None]
    second_stream = [t[1] for t in tokens if t[1] is not None]
    third_stream = [t[2] for t in tokens if t[2] is not None]

    # Let's slice the streams at the stop tokens.
    stop_idxes = [
        first_stream.index(stops[0]),
        second_stream.index(stops[1]),
        third_stream.index(stops[2]),
    ]

    # While we're at it, grab the last token that would be generated before stopping.
    last_tokens = [
        first_stream[stop_idxes[0] - 1],
        second_stream[stop_idxes[1] - 1],
        third_stream[stop_idxes[2] - 1],
    ]

    for t in tokens:
        print(t)

    # Now we generate again, stopping early at the stop tokens.
    tokens = []
    for l in batched_generate_fn(
        model,
        prompts=batch_x,
        max_returned_tokens=50,
        stop_tokens=[(s,) for s in stops],
        sample_args=sample_kwargs,
        include_prompt=True,
        include_eos=False,
    ):
        tokens.append([t.item() if t is not None else None for t in l])

    # Finally, assert that the streams are correct.

    first_stream = [t[0] for t in tokens if t[0] is not None]
    print(first_stream)
    print(len(first_stream), stop_idxes[0])
    assert len(first_stream) == stop_idxes[0]
    assert first_stream[-1] == last_tokens[0]

    second_stream = [t[1] for t in tokens if t[1] is not None]
    print(second_stream)
    print(len(second_stream), stop_idxes[1])
    assert len(second_stream) == stop_idxes[1]
    assert second_stream[-1] == last_tokens[1]

    third_stream = [t[2] for t in tokens if t[2] is not None]
    print(third_stream)
    print(len(third_stream), stop_idxes[2])
    assert len(third_stream) == stop_idxes[2]
    assert third_stream[-1] == last_tokens[2]

    torch.use_deterministic_algorithms(False)

    # for t in llm.tokenizer.decode_stream([torch.tensor(i) for i in first_stream]):
    #    print(t, end="", flush=True)
    # print()


@RunIf(min_cuda_gpus=1)
def test_batch_generate_equivalence(tmp_path):

    torch.use_deterministic_algorithms(True)

    device = "cuda:0"
    batch_size = 3
    sample_kwargs = {"top_k": 1}
    llm, model = create_llm(tmp_path, batch_size, 50, device)

    batch_x = torch.tensor(
        [
            [43993, 25, 1867, 466, 32660, 17485, 4483, 30, 198, 26410],
            [25, 1867, 466, 32660, 17485, 4483, 30, 198, 26410, 7596],
            [1867, 466, 32660, 17485, 4483, 30, 198, 26410, 7596, 7596],
        ],
        device=device,
        dtype=torch.int64,
    )

    # The other test tests the stop_tokens functionality much more exhaustively, we'll just generate and compare 50 tokens here.

    batch_tokens = []
    for l in batched_generate_fn(
        model,
        prompts=batch_x,
        max_returned_tokens=50,
        sample_args=sample_kwargs,
        include_prompt=False,
        include_eos=False,
    ):
        batch_tokens.append([t.item() if t is not None else None for t in l])

    first_stream = [t[0] for t in batch_tokens if t[0] is not None]

    batch_size = 1
    llm, model = create_llm(tmp_path, batch_size, 50, device)

    tokens = []
    for t in generate_fn(
        model,
        prompt=batch_x[0],
        max_returned_tokens=50,
        include_prompt=False,
        include_eos=False,
        **sample_kwargs,
    ):
        if t.size(0) == 1:
            tokens.append(t.item())
        else:
            tokens.extend(t.tolist())

    torch.use_deterministic_algorithms(False)

    # TODO: (apaz-cli) This consistency test doesn't actually work at the moment. It's inconsistent.
    # The output is really close... Something is going on here. For the moment, maybe this is close enough?
    # Enough at least that we can start prototyping.

    print(first_stream)
    print(tokens)
    # assert first_stream == tokens