fix: Beam integration. Comet ML. Poetry + PyTorch

HoangfVx · Aug 8, 2023 · 10a3dd8 · 10a3dd8
1 parent 433040b
commit 10a3dd8
Show file tree

Hide file tree

Showing 12 changed files with 444 additions and 326 deletions.
diff --git a/.gitignore b/.gitignore
@@ -175,3 +175,4 @@ cython_debug/
 
 # Training
 results/
+model_cache/
diff --git a/logging.yaml → modules/logging.yaml b/logging.yaml → modules/logging.yaml
diff --git a/notebooks/explore_dataset.ipynb → modules/notebooks/explore_dataset.ipynb b/notebooks/explore_dataset.ipynb → modules/notebooks/explore_dataset.ipynb
diff --git a/notebooks/train_dummy_dataset.ipynb → modules/notebooks/train_dummy_dataset.ipynb b/notebooks/train_dummy_dataset.ipynb → modules/notebooks/train_dummy_dataset.ipynb
diff --git a/notebooks/train_on_finqa.ipynb → modules/notebooks/train_on_finqa.ipynb b/notebooks/train_on_finqa.ipynb → modules/notebooks/train_on_finqa.ipynb
diff --git a/modules/tools/train_finqa.py b/modules/tools/train_finqa.py
@@ -38,23 +38,40 @@ def read_requirements(file_path):
     runtime=Runtime(
         cpu=4,
         memory="32Gi",
-        gpu="T4",
+        gpu="A10G",
         # TODO: Install requirements using Poetry & custom commands.
         image=Image(python_version="python3.10", python_packages=requirements),
     ),
     volumes=[
-        Volume(path="dataset", name="train_finqa_dataset"),
-        Volume(path="results", name="train_finqa_results"),
+        Volume(path="./dataset", name="train_finqa_dataset"),
+        Volume(path="./results", name="train_finqa_results"),
+        Volume(path="./model_cache", name="model_cache"),
         ],
 )
 
 
-
 @training_app.run()
 def train():
+    # from training import initialize
+
+    # print(f"COMET_API_KEY: {os.environ.get('COMET_API_KEY')}")
+    # print(f"COMET_PROJECT_NAME: {os.environ.get('COMET_PROJECT_NAME')}")
+    # print(f"COMET_WORKSPACE: {os.environ.get('COMET_WORKSPACE')}")
+
+    # initialize()
+
+    print(f"COMET_API_KEY: {os.environ.get('COMET_API_KEY')}")
+    print(f"COMET_PROJECT_NAME: {os.environ.get('COMET_PROJECT_NAME')}")
+    print(f"COMET_WORKSPACE: {os.environ.get('COMET_WORKSPACE')}")
+
     import torch
     from training import utils
 
+    print(f"COMET_API_KEY: {os.environ.get('COMET_API_KEY')}")
+    print(f"COMET_PROJECT_NAME: {os.environ.get('COMET_PROJECT_NAME')}")
+    print(f"COMET_WORKSPACE: {os.environ.get('COMET_WORKSPACE')}")
+
+
     if torch.cuda.is_available():
         device_count = torch.cuda.device_count()
 

diff --git a/modules/training/__init__.py b/modules/training/__init__.py
@@ -5,10 +5,6 @@
 
 from dotenv import load_dotenv, find_dotenv
 from pathlib import Path
-from training import api, constants, metrics, models
-
-
-__all__ = ["api", "constants", "metrics", "models"]
 
 
 logger = logging.getLogger(__name__)
@@ -34,11 +30,14 @@ def initialize_logger(
     with open(config_path, "rt") as f:
         config = yaml.safe_load(f.read())
 
+    # Make sure that existing logger will still work.
+    config["disable_existing_loggers"] = False
+
     logging.config.dictConfig(config)
 
 
 @run_immediately_decorator
-def initialize(logging_config_path: str ="logging.yaml"):
+def initialize(logging_config_path: str = os.path.join("..", "logging.yaml")):
      # Initialize logger.
     try:
         initialize_logger(config_path=logging_config_path)
@@ -49,7 +48,18 @@ def initialize(logging_config_path: str ="logging.yaml"):
     logger.info("Initializing resources...")
 
     # Initialize environment variables.
-    load_dotenv(find_dotenv())
+    env_file_path = find_dotenv(raise_error_if_not_found=True, usecwd=False)
+    logger.info(f"Loading environment variables from {env_file_path}")
+    load_dotenv(env_file_path)
 
-    # Enable logging of model checkpoints
+    # Enable logging of model checkpoints.
     os.environ["COMET_LOG_ASSETS"] = "True"
+    # Set to OFFLINE to run an Offline Experiment or DISABLE to turn off logging
+    os.environ["COMET_MODE"] = "ONLINE"
+    # Find out more about Comet ML configuration here: https://www.comet.com/docs/v2/integrations/ml-frameworks/huggingface/#configure-comet-for-hugging-face
+
+
+# TODO: Find a better way to initialize the logger and env vars before importing the rest of the packages.
+from training import api, constants, metrics, models
+
+__all__ = ["api", "constants", "metrics", "models"]
diff --git a/modules/training/api/FinQATrainingAPI.py b/modules/training/api/FinQATrainingAPI.py
@@ -1,4 +1,5 @@
 import logging
+import os
 
 from typing import Tuple
 
@@ -72,6 +73,8 @@ def load_model(self) -> Tuple[AutoModelForCausalLM, AutoTokenizer, PeftConfig]:
         return model, tokenizer, peft_config
 
     def train(self) -> SFTTrainer:
+        logger.info("Training model...")
+
         # TODO: Handle this error: "Token indices sequence length is longer than the specified maximum sequence length
         # for this model (2302 > 2048). Running this sequence through the model will result in indexing errors"
         trainer = SFTTrainer(

diff --git a/modules/training/constants.py b/modules/training/constants.py
@@ -13,7 +13,7 @@ class Scope(Enum):
 # TODO: Use Hydra as a configuration management tool.
 # TODO: Configure this path instead of hardcoding it.
 # ROOT_DIR = Path("/workspace")
-ROOT_DIR = Path("..")
+ROOT_DIR = Path("..") / ".."
 # TODO: Fix this /dataset/dataset nested directory.
 # ROOT_DATASET_DIR_DEFAULT = ROOT_DIR / "dataset" / "dataset"
 ROOT_DATASET_DIR_DEFAULT = ROOT_DIR / "dataset"

diff --git a/modules/training/models.py b/modules/training/models.py
@@ -44,6 +44,7 @@ def build_qlora_model(
         load_in_4bit=True,
         device_map="auto",
         trust_remote_code=True,
+        cache_dir="./model_cache"
     )
 
     # TODO: Should we also enable kbit training? Check out what it does.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -175,3 +175,4 @@ cython_debug/

		# Training
		results/
		model_cache/