fix: switch to individual decorators
AidanAbd committed Dec 13, 2024
1 parent 118105e commit c41dbce
Showing 3 changed files with 67 additions and 56 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -20,7 +20,7 @@ Types of changes

### Added

-* Add `l40s_gpu_task` to request L40s GPU instances
+* Add `g6e_xlarge_task`, `g6e_2xlarge_task`, `g6e_4xlarge_task`, `g6e_8xlarge_task`, `g6e_12xlarge_task`, `g6e_16xlarge_task`, and `g6e_24xlarge_task` to request G6e GPU instances.

## 2.54.10 - 2024-12-06

8 changes: 7 additions & 1 deletion src/latch/__init__.py
@@ -20,7 +20,13 @@
from latch.resources.tasks import (
    custom_memory_optimized_task,
    custom_task,
-    l40s_gpu_task,
+    g6e_xlarge_task,
+    g6e_2xlarge_task,
+    g6e_4xlarge_task,
+    g6e_8xlarge_task,
+    g6e_12xlarge_task,
+    g6e_16xlarge_task,
+    g6e_24xlarge_task,
    large_gpu_task,
    large_task,
    medium_task,
113 changes: 59 additions & 54 deletions src/latch/resources/tasks.py
@@ -629,76 +629,33 @@ def nextflow_runtime_task(cpu: int, memory: int, storage_gib: int = 50):
    return functools.partial(task, task_config=task_config)


-@dataclass
-class _L40sGPUInstanceSpec:
-    type: str
-    cpu: int
-    memory: int
-    gpus: int
-
-
-def l40s_gpu_task(cpu: int, memory: int, gpus: int, **kwargs):
-    """Creates a task configuration for L40s GPU instances.
-    Will choose the smallest instance that satisfies the resource requirements and will assign the entire instance to the task.
-    Args:
-        cpu: Number of vCPUs requested (4-96)
-        memory: Memory in GiB requested (32-768)
-        gpus: Number of GPUs requested (1 or 4)
-    """
-    instance_specs: list[_L40sGPUInstanceSpec] = [
-        _L40sGPUInstanceSpec(type="g6e.xlarge", cpu=4, memory=32, gpus=1),
-        _L40sGPUInstanceSpec(type="g6e.2xlarge", cpu=8, memory=64, gpus=1),
-        _L40sGPUInstanceSpec(type="g6e.4xlarge", cpu=16, memory=128, gpus=1),
-        _L40sGPUInstanceSpec(type="g6e.8xlarge", cpu=32, memory=256, gpus=1),
-        _L40sGPUInstanceSpec(type="g6e.12xlarge", cpu=48, memory=384, gpus=1),
-        _L40sGPUInstanceSpec(type="g6e.16xlarge", cpu=64, memory=512, gpus=4),
-        _L40sGPUInstanceSpec(type="g6e.24xlarge", cpu=96, memory=768, gpus=4),
-        # _L40sGPUInstanceSpec(type="g6e.48xlarge", cpu=192, memory=1536, gpus=8),
-        # # todo(aidan): add 48xlarge instance (need to debug why does not join cluster)
-    ]
-
-    selected_instance = None
-    for spec in instance_specs:
-        if (cpu <= spec.cpu and
-            memory <= spec.memory and
-            gpus <= spec.gpus):
-            selected_instance = spec
-            break
-
-    if not selected_instance:
-        raise ValueError(
-            f"No instance type available for requested resources: "
-            f"{cpu} vCPUs, {memory} GiB RAM, {gpus} GPUs. "
-            f"Maximum available: 96 vCPUs, 768 GiB RAM, 4 GPUs"
-        )
-
+def _get_l40s_pod(instance_type: str, cpu: int, memory: int, gpus: int) -> Pod:
+    """Helper function to create L40s GPU pod configurations."""
    primary_container = V1Container(name="primary")
    resources = V1ResourceRequirements(
        requests={
-            "cpu": str(selected_instance.cpu - 2),
-            "memory": f"{selected_instance.memory - 4}Gi",
-            "nvidia.com/gpu": str(selected_instance.gpus),
+            "cpu": str(cpu - 2),  # Reserve 2 cores for system processes
+            "memory": f"{memory - 4}Gi",  # Reserve 4GB for system processes
+            "nvidia.com/gpu": str(gpus),
            "ephemeral-storage": "4500Gi",
        },
        limits={
-            "cpu": str(selected_instance.cpu),
-            "memory": f"{selected_instance.memory}Gi",
-            "nvidia.com/gpu": str(selected_instance.gpus),
+            "cpu": str(cpu),
+            "memory": f"{memory}Gi",
+            "nvidia.com/gpu": str(gpus),
            "ephemeral-storage": "5000Gi",
        },
    )
    primary_container.resources = resources

-    pod_config = Pod(
+    return Pod(
        pod_spec=V1PodSpec(
            containers=[primary_container],
            tolerations=[
                V1Toleration(
                    effect="NoSchedule",
                    key="ng",
-                    value=selected_instance.type
+                    value=instance_type
                )
            ],
        ),
@@ -711,4 +668,52 @@ def l40s_gpu_task(cpu: int, memory: int, gpus: int, **kwargs):
        },
    )

-    return functools.partial(task, task_config=pod_config, **kwargs)
+
+g6e_xlarge_task = functools.partial(
+    task,
+    task_config=_get_l40s_pod("g6e.xlarge", cpu=4, memory=32, gpus=1)
+)
+"""4 vCPUs, 32 GiB RAM, 1 L40s GPU"""
+
+g6e_2xlarge_task = functools.partial(
+    task,
+    task_config=_get_l40s_pod("g6e.2xlarge", cpu=8, memory=64, gpus=1)
+)
+"""8 vCPUs, 64 GiB RAM, 1 L40s GPU"""
+
+g6e_4xlarge_task = functools.partial(
+    task,
+    task_config=_get_l40s_pod("g6e.4xlarge", cpu=16, memory=128, gpus=1)
+)
+"""16 vCPUs, 128 GiB RAM, 1 L40s GPU"""
+
+g6e_8xlarge_task = functools.partial(
+    task,
+    task_config=_get_l40s_pod("g6e.8xlarge", cpu=32, memory=256, gpus=1)
+)
+"""32 vCPUs, 256 GiB RAM, 1 L40s GPU"""
+
+g6e_12xlarge_task = functools.partial(
+    task,
+    task_config=_get_l40s_pod("g6e.12xlarge", cpu=48, memory=384, gpus=1)
+)
+"""48 vCPUs, 384 GiB RAM, 1 L40s GPU"""
+
+g6e_16xlarge_task = functools.partial(
+    task,
+    task_config=_get_l40s_pod("g6e.16xlarge", cpu=64, memory=512, gpus=4)
+)
+"""64 vCPUs, 512 GiB RAM, 4 L40s GPUs"""
+
+g6e_24xlarge_task = functools.partial(
+    task,
+    task_config=_get_l40s_pod("g6e.24xlarge", cpu=96, memory=768, gpus=4)
+)
+"""96 vCPUs, 768 GiB RAM, 4 L40s GPUs"""
+
+"""
+g6e_48xlarge_task = functools.partial(
+    task,
+    task_config=_get_l40s_pod("g6e.48xlarge", cpu=192, memory=1536, gpus=8)
+)
+192 vCPUs, 1536 GiB RAM, 8 L40s GPUs"""
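For context, a minimal usage sketch (not part of this commit's diff): each new name is a ready-made task decorator built from functools.partial(task, task_config=...), so a workflow function can request an entire g6e instance by applying it directly, the same way existing decorators such as large_gpu_task are used. The import of g6e_xlarge_task from latch matches the __init__.py change above; the function name and body below are hypothetical.

from latch import g6e_xlarge_task


@g6e_xlarge_task
def finetune_model(weights_path: str) -> str:
    # Hypothetical task body. It runs on a dedicated g6e.xlarge node
    # (4 vCPUs, 32 GiB RAM, 1 L40s GPU), minus the small CPU/memory
    # reservation made inside _get_l40s_pod for system processes.
    ...
    return weights_path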
