Skip to content

Commit

Permalink
fix: Beam integration. Comet ML. Poetry + PyTorch
Browse files Browse the repository at this point in the history
  • Loading branch information
iusztinpaul committed Aug 8, 2023
1 parent 433040b commit 10a3dd8
Show file tree
Hide file tree
Showing 12 changed files with 444 additions and 326 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,4 @@ cython_debug/

# Training
results/
model_cache/
File renamed without changes.
File renamed without changes.
File renamed without changes.

Large diffs are not rendered by default.

25 changes: 21 additions & 4 deletions modules/tools/train_finqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,23 +38,40 @@ def read_requirements(file_path):
runtime=Runtime(
cpu=4,
memory="32Gi",
gpu="T4",
gpu="A10G",
# TODO: Install requirements using Poetry & custom commands.
image=Image(python_version="python3.10", python_packages=requirements),
),
volumes=[
Volume(path="dataset", name="train_finqa_dataset"),
Volume(path="results", name="train_finqa_results"),
Volume(path="./dataset", name="train_finqa_dataset"),
Volume(path="./results", name="train_finqa_results"),
Volume(path="./model_cache", name="model_cache"),
],
)



@training_app.run()
def train():
# from training import initialize

# print(f"COMET_API_KEY: {os.environ.get('COMET_API_KEY')}")
# print(f"COMET_PROJECT_NAME: {os.environ.get('COMET_PROJECT_NAME')}")
# print(f"COMET_WORKSPACE: {os.environ.get('COMET_WORKSPACE')}")

# initialize()

print(f"COMET_API_KEY: {os.environ.get('COMET_API_KEY')}")
print(f"COMET_PROJECT_NAME: {os.environ.get('COMET_PROJECT_NAME')}")
print(f"COMET_WORKSPACE: {os.environ.get('COMET_WORKSPACE')}")

import torch
from training import utils

print(f"COMET_API_KEY: {os.environ.get('COMET_API_KEY')}")
print(f"COMET_PROJECT_NAME: {os.environ.get('COMET_PROJECT_NAME')}")
print(f"COMET_WORKSPACE: {os.environ.get('COMET_WORKSPACE')}")


if torch.cuda.is_available():
device_count = torch.cuda.device_count()

Expand Down
24 changes: 17 additions & 7 deletions modules/training/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@

from dotenv import load_dotenv, find_dotenv
from pathlib import Path
from training import api, constants, metrics, models


__all__ = ["api", "constants", "metrics", "models"]


logger = logging.getLogger(__name__)
Expand All @@ -34,11 +30,14 @@ def initialize_logger(
with open(config_path, "rt") as f:
config = yaml.safe_load(f.read())

# Make sure that existing loggers will still work.
config["disable_existing_loggers"] = False

logging.config.dictConfig(config)


@run_immediately_decorator
def initialize(logging_config_path: str ="logging.yaml"):
def initialize(logging_config_path: str = os.path.join("..", "logging.yaml")):
# Initialize logger.
try:
initialize_logger(config_path=logging_config_path)
Expand All @@ -49,7 +48,18 @@ def initialize(logging_config_path: str ="logging.yaml"):
logger.info("Initializing resources...")

# Initialize environment variables.
load_dotenv(find_dotenv())
env_file_path = find_dotenv(raise_error_if_not_found=True, usecwd=False)
logger.info(f"Loading environment variables from {env_file_path}")
load_dotenv(env_file_path)

# Enable logging of model checkpoints
# Enable logging of model checkpoints.
os.environ["COMET_LOG_ASSETS"] = "True"
# Set to OFFLINE to run an Offline Experiment or DISABLE to turn off logging
os.environ["COMET_MODE"] = "ONLINE"
# Find out more about Comet ML configuration here: https://www.comet.com/docs/v2/integrations/ml-frameworks/huggingface/#configure-comet-for-hugging-face


# TODO: Find a better way to initialize the logger and env vars before importing the rest of the packages.
from training import api, constants, metrics, models

__all__ = ["api", "constants", "metrics", "models"]
3 changes: 3 additions & 0 deletions modules/training/api/FinQATrainingAPI.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import os

from typing import Tuple

Expand Down Expand Up @@ -72,6 +73,8 @@ def load_model(self) -> Tuple[AutoModelForCausalLM, AutoTokenizer, PeftConfig]:
return model, tokenizer, peft_config

def train(self) -> SFTTrainer:
logger.info("Training model...")

# TODO: Handle this error: "Token indices sequence length is longer than the specified maximum sequence length
# for this model (2302 > 2048). Running this sequence through the model will result in indexing errors"
trainer = SFTTrainer(
Expand Down
2 changes: 1 addition & 1 deletion modules/training/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class Scope(Enum):
# TODO: Use Hydra as a configuration management tool.
# TODO: Configure this path instead of hardcoding it.
# ROOT_DIR = Path("/workspace")
ROOT_DIR = Path("..")
ROOT_DIR = Path("..") / ".."
# TODO: Fix this /dataset/dataset nested directory.
# ROOT_DATASET_DIR_DEFAULT = ROOT_DIR / "dataset" / "dataset"
ROOT_DATASET_DIR_DEFAULT = ROOT_DIR / "dataset"
Expand Down
1 change: 1 addition & 0 deletions modules/training/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def build_qlora_model(
load_in_4bit=True,
device_map="auto",
trust_remote_code=True,
cache_dir="./model_cache"
)

# TODO: Should we also enable kbit training? Check out what it does.
Expand Down
Loading

0 comments on commit 10a3dd8

Please sign in to comment.