Skip to content

Commit

Permalink
handled erroneous data (hpcaitech#101)
Browse files Browse the repository at this point in the history
  • Loading branch information
FrankLeeeee authored May 14, 2024
1 parent 5929cae commit f73f756
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 13 deletions.
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,14 @@ cache/

# Secret files
hostfile
run.sh
gradio_cached_examples/
wandb/

# vae weights
eval/vae/flolpips/weights/

# npm
node_modules/
package-lock.json
package.json
15 changes: 6 additions & 9 deletions opensora/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,9 @@ def getitem(self, index):
return ret

def __getitem__(self, index):
    """Return the sample at *index*, or None if its underlying data is bad.

    Delegates to ``self.getitem``; any exception raised while loading the
    sample is treated as erroneous data. Instead of retrying with a random
    index, we return None and let the dataloader's collate function
    (``collate_fn_ignore_none``) drop the entry from the batch.

    Args:
        index: dataset index (format is whatever ``self.getitem`` expects —
            presumably the "index-num_frames-height-width" string used by the
            variable dataloader; TODO confirm against caller).

    Returns:
        The loaded sample, or None when loading fails.
    """
    try:
        return self.getitem(index)
    except Exception:
        # We return None here in case of erroneous data;
        # the collate function will handle it.
        return None
7 changes: 7 additions & 0 deletions opensora/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,10 @@ def resize_crop_to_fill(pil_image, image_size):
arr = np.array(image)
assert i + th <= arr.shape[0] and j + tw <= arr.shape[1]
return Image.fromarray(arr[i : i + th, j : j + tw])


def collate_fn_ignore_none(batch):
    """Collate *batch* with the default collator after dropping None entries.

    The dataset's ``__getitem__`` yields None when a sample's underlying
    data is erroneous; those placeholders are filtered out before
    ``torch.utils.data.default_collate`` assembles the batch.
    """
    valid_samples = [sample for sample in batch if sample is not None]
    return torch.utils.data.default_collate(valid_samples)
4 changes: 3 additions & 1 deletion scripts/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@

import torch
import torch.distributed as dist
import wandb
from colossalai.booster import Booster
from colossalai.cluster import DistCoordinator
from colossalai.nn.optimizer import HybridAdam
from colossalai.utils import get_current_device, set_seed
from tqdm import tqdm

import wandb
from opensora.acceleration.checkpoint import set_grad_checkpoint
from opensora.acceleration.parallel_states import get_data_parallel_group
from opensora.datasets import prepare_dataloader, prepare_variable_dataloader
from opensora.datasets.utils import collate_fn_ignore_none
from opensora.registry import DATASETS, MODELS, SCHEDULERS, build_module
from opensora.utils.ckpt_utils import load, model_gathering, model_sharding, record_model_param_shape, save
from opensora.utils.config_utils import define_experiment_workspace, parse_configs, save_training_config
Expand Down Expand Up @@ -97,6 +98,7 @@ def main():
drop_last=True,
pin_memory=True,
process_group=get_data_parallel_group(),
collate_fn=collate_fn_ignore_none,
)
if cfg.dataset.type == DEFAULT_DATASET_NAME:
dataloader = prepare_dataloader(**dataloader_args)
Expand Down

0 comments on commit f73f756

Please sign in to comment.