
Commit 2905d38: Add prepare_dataset command (ServiceNow#38)

Co-authored-by: Joel Lamy-Poirier <[email protected]>

Parent: 7989595
20 files changed: +496 -56 lines

.dockerignore (+7)

```diff
@@ -1,4 +1,7 @@
+# Ignore everything by default
 *
+
+# Allow specific files and directories
 !setup.py
 !setup.cfg
 !Megatron-LM
@@ -7,3 +10,7 @@
 !tools
 !tests
 !pyproject.toml
+
+# Exclude Python cache directories and shared object files within included directories
+**/__pycache__/
+**/*.so
```

.github/workflows/ci.yaml (+2 -6)

```diff
@@ -57,12 +57,9 @@ jobs:
             ghcr.io/servicenow/fast-llm
           tags: |
             type=schedule
-            type=ref,event=branch
-            type=semver,pattern={{version}}
-            type=semver,pattern={{major}}.{{minor}}
-            type=semver,pattern={{major}}
+            type=pep440,pattern={{version}}
             type=sha
-            type=raw,value=latest,enabled={{github.ref == 'refs/heads/main'}}
+            type=raw,value=latest,enable={{is_default_branch}}
 
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
@@ -78,7 +75,6 @@ jobs:
         uses: docker/build-push-action@v6
         with:
           context: .
-          # push: ${{ github.event_name != 'pull_request' }}
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
```

Dockerfile (+28 -23)

```diff
@@ -1,34 +1,39 @@
 # syntax=docker/dockerfile:1.7-labs
 FROM nvcr.io/nvidia/pytorch:24.07-py3
 
-# Install git-lfs for Huggingface hub interaction and sudo for system adjustments
+# Install dependencies.
 RUN apt-get update \
-    && apt-get install --no-install-recommends -y git-lfs sudo util-linux \
+    && apt-get install --no-install-recommends -y acl git-lfs \
     && rm -rf /var/lib/apt/lists/* \
     && git lfs install
 
-# Add a user for Fast-LLM with sudo privileges for runtime adjustments
-ARG FAST_LLM_USER_ID=1000
-RUN useradd -m -u $FAST_LLM_USER_ID -s /bin/bash fast_llm \
-    && echo 'fast_llm ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
-
-USER fast_llm
+# Set the working directory.
 WORKDIR /app
+# Set the permission to 777 for all files and directories in `/app`, `/home` and python install directories:
+# 1. Create directories explicitly because docker uses the wrong permission for implicit creation.
+# 2. For the rest, set the default ACL to 777 for all users.
+RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/tools \
+    && setfacl -m d:u::rwx,d:g::rwx,d:o::rwx,u::rwx,g::rwx,o::rwx \
+    /app \
+    /home \
+    /usr \
+    /usr/local \
+    /usr/local/bin \
+    /usr/local/lib \
+    /usr/local/lib/python3.10 \
+    /usr/local/lib/python3.10/dist-packages \
+    /usr/local/lib/python3.10/dist-packages/__pycache__
 
-# Environment settings for Python and PATH
-ENV PYTHONPATH=/app:/app/Megatron-LM \
-    PATH=$PATH:/home/fast_llm/.local/bin/
-
-# Copy the dependency files and install dependencies
-COPY --chown=fast_llm setup.py setup.cfg pyproject.toml ./
-COPY --chown=fast_llm ./fast_llm/csrc/ fast_llm/csrc/
-RUN PIP_NO_INPUT=1 pip3 install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"
+# Copy dependency files with universal write permissions for all users.
+COPY --chmod=777 setup.py setup.cfg pyproject.toml ./
+COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/
 
-# Copy the rest of the code
-COPY --chown=fast_llm ./Megatron-LM Megatron-LM
-COPY --chown=fast_llm ./examples examples
-COPY --chown=fast_llm ./tests tests
-COPY --chown=fast_llm ./tools tools
+# Install dependencies within the virtual environment.
+RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]"
 
-# Copy the main source code for Fast-LLM
-COPY --exclude=./fast_llm/csrc/ --chown=fast_llm ./fast_llm/ fast_llm/
+# Copy the remaining source code with universal write permissions.
+COPY --chmod=777 ./Megatron-LM Megatron-LM
+COPY --chmod=777 ./examples examples
+COPY --chmod=777 ./tests tests
+COPY --chmod=777 ./tools tools
+COPY --chmod=777 --exclude=./fast_llm/csrc/ ./fast_llm/ fast_llm/
```

fast_llm/config.py (+8 -2)

```diff
@@ -301,15 +301,21 @@ def __setattr__(self, key, value):
                 # Allow setting the exact same object to facilitate setup of cross-dependencies.
                 # Ex. allow re-setting cross-dependencies of already validated sub-configs.
                 return
-            raise RuntimeError()
+            raise RuntimeError(
+                f"Cannot set attribute `{key}`"
+                f" in configuration class `{get_type_name(type(self))}` after validation."
+            )
         super().__setattr__(key, value)
 
     def __delattr__(self, key):
         """
         Make the class read-only after validation.
         """
         if getattr(self, "_validated", False):
-            raise RuntimeError()
+            raise RuntimeError(
+                f"Cannot delete attribute `{key}`"
+                f" in configuration class `{get_type_name(type(self))}` after validation."
+            )
         super().__delattr__(key)
 
     def validate(self, *, _is_validating=False):
```
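
The bare `RuntimeError()`s are replaced with messages naming the offending attribute and configuration class. For context, here is a minimal standalone sketch of the freeze-after-validation pattern these methods implement (the class and field below are illustrative, not Fast-LLM's actual `Config` API):

```python
class FrozenAfterValidation:
    """Sketch: a config object that becomes read-only once validated."""

    def __init__(self):
        self._validated = False
        self.learning_rate = 1e-4

    def validate(self):
        # Field checks would go here; the flag flips last, so this
        # assignment itself still passes through __setattr__.
        self._validated = True

    def __setattr__(self, key, value):
        if getattr(self, "_validated", False):
            raise RuntimeError(
                f"Cannot set attribute `{key}`"
                f" in configuration class `{type(self).__name__}` after validation."
            )
        super().__setattr__(key, value)

    def __delattr__(self, key):
        if getattr(self, "_validated", False):
            raise RuntimeError(
                f"Cannot delete attribute `{key}`"
                f" in configuration class `{type(self).__name__}` after validation."
            )
        super().__delattr__(key)


config = FrozenAfterValidation()
config.learning_rate = 3e-4  # allowed before validation
config.validate()
config.learning_rate = 1e-3  # raises RuntimeError with the descriptive message
```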

fast_llm/data/auto.py (+12)

```diff
@@ -0,0 +1,12 @@
+from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig
+from fast_llm.utils import Registry
+
+dataset_preparator_registry = Registry(
+    "DatasetPreparator",
+    {
+        dataset_preparator.preparator_name: dataset_preparator
+        for dataset_preparator in [
+            GPTMemmapDatasetPreparatorConfig,
+        ]
+    },
+)
```
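
The registry maps each config class's `preparator_name` to the class itself, so a command can be dispatched by name. `fast_llm.utils.Registry` is not part of this diff; the following is an assumed minimal equivalent, for illustration only:

```python
class Registry:
    """Assumed minimal stand-in for fast_llm.utils.Registry (not shown in this diff)."""

    def __init__(self, name: str, entries: dict):
        self._name = name
        self._entries = dict(entries)

    def __getitem__(self, key: str):
        # Fail with a message naming the registry, not just the missing key.
        if key not in self._entries:
            raise KeyError(f"Unknown {self._name}: {key}")
        return self._entries[key]
```

With such a registry, a CLI entry point can resolve a user-supplied preparator name to its config class, e.g. `dataset_preparator_registry[name]`, before handing off to the runnable machinery in `fast_llm/data/preparator/config.py` below.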

fast_llm/data/config.py (+1 -1)

```diff
@@ -107,7 +107,7 @@ def _validate(self):
 class TokenizerConfig(Config):
     """
     Configuration for the tokenizer.
-    Currently, the tokenizer is only needed for FIM.
+    The tokenizer is needed for FIM and dataset preparation.
     """
 
     format: str = Field(
```

fast_llm/data/gpt/memmap.py (+7 -17)

```diff
@@ -4,6 +4,8 @@
 import numpy as np
 
 from fast_llm.data.gpt.dataset import GPTIndexedDataset
+from fast_llm.data.preparator.gpt_memmap.config import MEMMAP_DTYPES, MEMMAP_DTYPES_INV, MEMMAP_INDEX_HEADER
+from fast_llm.engine.config_utils.data_type import DataType
 from fast_llm.utils import Assert, div, padded_cumsum
 
 
@@ -16,18 +18,6 @@ class GPTMemmapDataset(GPTIndexedDataset):
     See https://github.com/NVIDIA/Megatron-LM?tab=readme-ov-file#data-preprocessing for more details.
     """
 
-    _DTYPES = {
-        1: np.uint8,
-        2: np.int8,
-        3: np.int16,
-        4: np.int32,
-        5: np.int64,
-        6: np.float32,
-        7: np.float64,
-        8: np.uint16,
-    }
-    _INDEX_HEADER = b"MMIDIDX\x00\x00"
-
     def __init__(self, name: str, prefix: pathlib.Path | str):
         self._init(name, prefix)
 
@@ -37,10 +27,10 @@ def _init(self, name: str, prefix: pathlib.Path | str):
         self._prefix = pathlib.Path(prefix)
 
         with self._prefix.with_suffix(".idx").open("rb") as stream:
-            Assert.eq(stream.read(9), self._INDEX_HEADER)
+            Assert.eq(stream.read(9), MEMMAP_INDEX_HEADER)
             Assert.eq(struct.unpack("<Q", stream.read(8))[0], 1)
 
-            self._dtype = self._DTYPES[struct.unpack("<B", stream.read(1))[0]]
+            self._dtype = MEMMAP_DTYPES[struct.unpack("<B", stream.read(1))[0]].numpy
             self._num_documents = struct.unpack("<Q", stream.read(8))[0]
             _ = struct.unpack("<Q", stream.read(8))[0]
             offset = stream.tell()
@@ -106,13 +96,13 @@ def write_dataset(cls, prefix: pathlib.Path | str, documents: list[np.ndarray]):
         dtype = documents[0].dtype
         num_documents = len(documents)
         lengths = np.array([len(document) for document in documents], dtype=np.int32)
-        pointers = padded_cumsum(lengths[:-1].astype(np.int64) * 2)
+        pointers = padded_cumsum(lengths[:-1].astype(np.int64)) * np.dtype(dtype).itemsize
         prefix.parent.mkdir(parents=True, exist_ok=True)
         with prefix.with_suffix(".idx").open("wb") as stream:
-            stream.write(cls._INDEX_HEADER)
+            stream.write(MEMMAP_INDEX_HEADER)
             stream.write(struct.pack("<Q", 1))
             # Data type
-            stream.write(struct.pack("<B", {y: x for x, y in cls._DTYPES.items()}[dtype.type]))
+            stream.write(struct.pack("<B", MEMMAP_DTYPES_INV[DataType.from_numpy(dtype.type)]))
             # "Number of sequences", same as documents in our case.
             stream.write(struct.pack("<Q", num_documents))
             # "Number of documents", needs a +1 for some reason.
```

fast_llm/data/preparator/__init__.py

Whitespace-only changes.

fast_llm/data/preparator/config.py (+34)

```diff
@@ -0,0 +1,34 @@
+import abc
+import argparse
+import typing
+
+from fast_llm.config import config_class
+from fast_llm.engine.config_utils.runnable import RunnableConfig
+from fast_llm.utils import Assert
+
+
+@config_class()
+class DatasetPreparatorConfig(RunnableConfig):
+    preparator_name: typing.ClassVar[str]
+
+    @classmethod
+    def get_dataset_preparator_class(cls) -> type["DatasetPreparator"]:
+        raise NotImplementedError
+
+    def _get_runnable(self, parsed: argparse.Namespace) -> typing.Callable[[], None]:
+        dataset_preparator = self.get_dataset_preparator_class()(config=self)
+        return dataset_preparator.run
+
+
+class DatasetPreparator(abc.ABC):
+    _config: DatasetPreparatorConfig
+    config_class: typing.ClassVar[type[DatasetPreparatorConfig]] = DatasetPreparatorConfig
+
+    def __init__(self, config: DatasetPreparatorConfig) -> None:
+        Assert.custom(isinstance, config, self.config_class)
+        config.validate()
+        self._config = config
+
+    @abc.abstractmethod
+    def run(self) -> None:
+        raise NotImplementedError
```
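
`_get_runnable` instantiates the configured preparator and returns its bound `run`, which is what the new `prepare_dataset` command ultimately invokes. A hypothetical subclass showing how a concrete preparator plugs into this interface (the real GPT memmap preparator lives under `fast_llm/data/preparator/gpt_memmap/` and differs in detail; the `@config_class()` decorator is assumed to work on subclasses as it does on the base):

```python
import typing

from fast_llm.config import config_class
from fast_llm.data.preparator.config import DatasetPreparator, DatasetPreparatorConfig


@config_class()
class PrintingPreparatorConfig(DatasetPreparatorConfig):
    # Hypothetical name, chosen for this sketch only.
    preparator_name: typing.ClassVar[str] = "printing"

    @classmethod
    def get_dataset_preparator_class(cls) -> type["PrintingPreparator"]:
        return PrintingPreparator


class PrintingPreparator(DatasetPreparator):
    config_class: typing.ClassVar[type[PrintingPreparatorConfig]] = PrintingPreparatorConfig

    def run(self) -> None:
        # A real preparator would tokenize documents and write the memmap dataset here.
        print(f"Preparing dataset with {type(self._config).__name__}")
```

Adding `PrintingPreparatorConfig` to `dataset_preparator_registry` in `fast_llm/data/auto.py` above would expose it to the command line alongside the GPT memmap preparator.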

fast_llm/data/preparator/gpt_memmap/__init__.py

Whitespace-only changes.
