added pre-commit (hpcaitech#5)
* added pre-commit

* polish
FrankLeeeee authored Feb 22, 2024
1 parent 37a296a commit 9eaf57f
Showing 14 changed files with 175 additions and 243 deletions.
7 changes: 7 additions & 0 deletions .isort.cfg
@@ -0,0 +1,7 @@
[settings]
line_length = 120
multi_line_output=3
include_trailing_comma = true
ignore_comments = true
profile = black
honor_noqa = true
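The effect of these settings can be previewed without committing. A minimal sketch, assuming isort 5.x (whose Python API exposes `isort.code`); the snippet itself is hypothetical, with `profile` and `line_length` mirroring the config above:

```
import isort

# Hypothetical unsorted imports; profile/line_length match .isort.cfg above.
messy = "from typing import Union, List\nimport torch\nimport os\n"
print(isort.code(messy, profile="black", line_length=120))
# Expected: stdlib imports (os, typing) first, a blank line, then the
# third-party import (torch), with names inside from-imports sorted.
```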
32 changes: 32 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,32 @@
repos:

  - repo: https://github.com/PyCQA/autoflake
    rev: v2.2.1
    hooks:
      - id: autoflake
        name: autoflake (python)
        args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']

  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: sort all imports (python)

  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 23.9.1
    hooks:
      - id: black
        name: black formatter
        args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
      - id: check-yaml
      - id: check-merge-conflict
      - id: check-case-conflict
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: mixed-line-ending
        args: ['--fix=lf']
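With this file checked in, `pip install pre-commit` followed by `pre-commit install` registers the hooks to run on every `git commit`, and `pre-commit run --all-files` applies them to the entire tree; the formatting-only changes in the files below are presumably the output of such a run.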
4 changes: 2 additions & 2 deletions README.md
@@ -20,7 +20,7 @@ Here is an example of the captions file:
    },
    {
        "file": "video1.mp4",
-        "captions": ["a comparison of two opposing team football athletes"]
+        "captions": ["a comparison of two opposing team football athletes"]
    }
]
```
(The removed and added lines are textually identical; the new trailing-whitespace hook presumably stripped an invisible trailing space.)
@@ -46,4 +46,4 @@ How to run the script:
python preprocess_data.py /path/to/captions.json /path/to/video_dir /path/to/output_dir
```

-Note that this script needs to be run on a machine with a GPU. To avoid CUDA OOM, we filter out the videos that are too long.
+Note that this script needs to be run on a machine with a GPU. To avoid CUDA OOM, we filter out the videos that are too long.
(The two lines are textually identical; the end-of-file-fixer hook presumably added the missing final newline.)
28 changes: 7 additions & 21 deletions data_utils.py
@@ -1,8 +1,7 @@
import os
-from typing import Dict, List, Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union

import torch
-import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset as HFDataset
from datasets import dataset_dict, load_from_disk
@@ -35,9 +34,7 @@ def video2col(video_4d: torch.Tensor, patch_size: int) -> torch.Tensor:
    return torch.stack(out, dim=1).view(-1, c, patch_size, patch_size)


-def col2video(
-    patches: torch.Tensor, video_shape: Tuple[int, int, int, int]
-) -> torch.Tensor:
+def col2video(patches: torch.Tensor, video_shape: Tuple[int, int, int, int]) -> torch.Tensor:
    """
    Convert a 2D tensor of patches to a 4D video tensor.
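Only the signatures and docstring fragments of these two helpers are visible here, but they read as inverses of each other. A round-trip sketch, assuming both are importable from `data_utils` and that `patch_size` divides the frame height and width:

```
import torch

from data_utils import col2video, video2col

video = torch.randn(16, 3, 64, 64)          # hypothetical clip: (T, C, H, W)
patches = video2col(video, patch_size=16)   # (num_patches, C, 16, 16)
restored = col2video(patches, video.shape)  # back to (T, C, H, W)
assert torch.equal(restored, video)         # should hold if the two are exact inverses
```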
@@ -74,10 +71,7 @@ def pad_sequences(sequences: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    max_len = max([sequence.shape[0] for sequence in sequences])
    padded_sequences = [
-        F.pad(
-            sequence, [0] * (sequence.ndim - 1) * 2 + [0, max_len - sequence.shape[0]]
-        )
-        for sequence in sequences
+        F.pad(sequence, [0] * (sequence.ndim - 1) * 2 + [0, max_len - sequence.shape[0]]) for sequence in sequences
    ]
    padded_sequences = torch.stack(padded_sequences, dim=0)
    padding_mask = torch.zeros(
@@ -91,9 +85,7 @@ def pad_sequences(sequences: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
    return padded_sequences, padding_mask
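A usage sketch for the padding helper (the mask-filling logic sits in the folded lines, so the exact mask convention is an assumption here):

```
import torch

from data_utils import pad_sequences

a, b = torch.randn(5, 8), torch.randn(3, 8)  # hypothetical variable-length sequences
padded, mask = pad_sequences([a, b])
print(padded.shape)  # torch.Size([2, 5, 8]): b is right-padded with zeros along dim 0
print(mask.shape)    # presumably torch.Size([2, 5]), marking the padded positions
```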


-def patchify_batch(
-    videos: List[torch.Tensor], patch_size: int
-) -> Tuple[torch.Tensor, torch.Tensor]:
+def patchify_batch(videos: List[torch.Tensor], patch_size: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Patchify a batch of videos.
Args:
@@ -128,18 +120,14 @@ def make_batch(samples: List[dict], patch_size: int) -> dict:
    }


-def load_datasets(
-    dataset_paths: Union[PathType, List[PathType]], mode: str = "train"
-) -> Optional[DatasetType]:
+def load_datasets(dataset_paths: Union[PathType, List[PathType]], mode: str = "train") -> Optional[DatasetType]:
    """
    Load pre-tokenized dataset.
    Each instance of dataset is a dictionary with
    `{'input_ids': List[int], 'labels': List[int], sequence: str}` format.
    """
    mode_map = {"train": "train", "dev": "validation", "test": "test"}
-    assert mode in tuple(
-        mode_map
-    ), f"Unsupported mode {mode}, it must be in {tuple(mode_map)}"
+    assert mode in tuple(mode_map), f"Unsupported mode {mode}, it must be in {tuple(mode_map)}"

    if isinstance(dataset_paths, (str, os.PathLike)):
        dataset_paths = [dataset_paths]
@@ -148,9 +136,7 @@ def load_datasets(
    for ds_path in dataset_paths:
        ds_path = os.path.abspath(ds_path)
        assert os.path.exists(ds_path), f"Not existed file path {ds_path}"
-        ds_dict = load_from_disk(
-            dataset_path=ds_path, keep_in_memory=False
-        ).with_format("torch")
+        ds_dict = load_from_disk(dataset_path=ds_path, keep_in_memory=False).with_format("torch")
        if isinstance(ds_dict, HFDataset):
            datasets.append(ds_dict)
        else:
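A call sketch for the loader (paths are placeholders; how multiple datasets are merged happens in the folded lines):

```
from data_utils import load_datasets

# Hypothetical paths to datasets saved earlier with `Dataset.save_to_disk`.
train_ds = load_datasets(["/data/tokens_a", "/data/tokens_b"], mode="train")
val_ds = load_datasets("/data/tokens_a", mode="dev")  # "dev" maps to "validation" per mode_map
```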
14 changes: 4 additions & 10 deletions diffusion/__init__.py
@@ -9,13 +9,13 @@

def create_diffusion(
    timestep_respacing,
-    noise_schedule="linear",
+    noise_schedule="linear",
    use_kl=False,
    sigma_small=False,
    predict_xstart=False,
    learn_sigma=True,
    rescale_learned_sigmas=False,
-    diffusion_steps=1000
+    diffusion_steps=1000,
):
    betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
    if use_kl:
@@ -29,15 +29,9 @@ def create_diffusion(
    return SpacedDiffusion(
        use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
        betas=betas,
-        model_mean_type=(
-            gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
-        ),
+        model_mean_type=(gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X),
        model_var_type=(
-            (
-                gd.ModelVarType.FIXED_LARGE
-                if not sigma_small
-                else gd.ModelVarType.FIXED_SMALL
-            )
+            (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL)
            if not learn_sigma
            else gd.ModelVarType.LEARNED_RANGE
        ),
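Of the two removals in the signature hunk above, the first is invisible in plain text: the trailing-whitespace hook presumably stripped a trailing space after `noise_schedule="linear",`, while the second change adds black's trailing comma after `diffusion_steps=1000`. For context, this factory follows the IDDPM/DiT convention where `timestep_respacing` is a string; a hypothetical call:

```
from diffusion import create_diffusion

diffusion = create_diffusion(timestep_respacing="")    # full 1000-step training chain
sampler = create_diffusion(timestep_respacing="250")   # respaced to 250 sampling steps
```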
15 changes: 3 additions & 12 deletions diffusion/diffusion_utils.py
@@ -3,8 +3,8 @@
# ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
# IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py

-import torch as th
import numpy as np
+import torch as th


def normal_kl(mean1, logvar1, mean2, logvar2):
@@ -22,18 +22,9 @@ def normal_kl(mean1, logvar1, mean2, logvar2):

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for th.exp().
-    logvar1, logvar2 = [
-        x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
-        for x in (logvar1, logvar2)
-    ]
+    logvar1, logvar2 = [x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor) for x in (logvar1, logvar2)]

-    return 0.5 * (
-        -1.0
-        + logvar2
-        - logvar1
-        + th.exp(logvar1 - logvar2)
-        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
-    )
+    return 0.5 * (-1.0 + logvar2 - logvar1 + th.exp(logvar1 - logvar2) + ((mean1 - mean2) ** 2) * th.exp(-logvar2))


def approx_standard_normal_cdf(x):
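For reference, the one-liner that `normal_kl` now returns is the standard closed form for the KL divergence between two Gaussians, written in the log-variance terms the code uses:

$$
D_{\mathrm{KL}}\big(\mathcal{N}(\mu_1,\sigma_1^2)\,\|\,\mathcal{N}(\mu_2,\sigma_2^2)\big)
= \frac{1}{2}\Big(-1 + \log\sigma_2^2 - \log\sigma_1^2 + e^{\log\sigma_1^2 - \log\sigma_2^2} + (\mu_1 - \mu_2)^2\, e^{-\log\sigma_2^2}\Big)
$$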