 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+from __future__ import annotations
+
 import glob
 import hashlib
 import logging
 import os
 import shutil
 import urllib
 from enum import Enum
-from typing import Dict, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse

 import pandas as pd
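
The hunks below replace `typing` generics with built-in generics and PEP 604 unions (`str | list[str]`). On Python 3.9 and earlier that union syntax is only legal inside annotations when annotation evaluation is postponed, which is what the new `from __future__ import annotations` line provides. A minimal illustration of the interaction; the function below is a stand-in, not part of this PR:

```python
from __future__ import annotations  # PEP 563: annotations become lazily-evaluated strings


# Without the future import, `str | list[str]` raises TypeError at definition
# time on Python 3.9 and earlier; with it, the annotation is never evaluated.
def first_string(value: str | list[str]) -> str:
    return value if isinstance(value, str) else value[0]


print(first_string("a"))         # a
print(first_string(["b", "c"]))  # b
```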
@@ -54,16 +55,16 @@ def update_to(self, b=1, bsize=1, tsize=None):
             Total size (in tqdm units). If [default: None] remains unchanged.
         """
         if tsize is not None:
-            self.total = tsize
+            self.total = tsize  # noqa W0201
         self.update(b * bsize - self.n)  # will also set self.n = b * bsize


-def _list_of_strings(list_or_string: Union[str, List[str]]) -> List[str]:
+def _list_of_strings(list_or_string: str | list[str]) -> list[str]:
     """Helper function to accept single string or lists in config."""
     return [list_or_string] if isinstance(list_or_string, str) else list_or_string


-def _glob_multiple(pathnames: List[str], root_dir: str = None, recursive: bool = True) -> Set[str]:
+def _glob_multiple(pathnames: list[str], root_dir: str = None, recursive: bool = True) -> set[str]:
     """Recursive glob multiple patterns, returns set of matches.

     Note: glob's root_dir argument was added in python 3.10, not using it for compatibility.
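
The note above about `glob`'s `root_dir` parameter (added in Python 3.10) suggests the helper joins the root onto each pattern itself. A rough sketch of that compatibility approach, assuming matches are returned relative to `root_dir` the way `glob(..., root_dir=...)` would return them; this is an illustration, not the PR's implementation:

```python
import glob
import os


def glob_multiple_compat(pathnames, root_dir=None, recursive=True):
    """Emulate glob's root_dir= (3.10+) by prefixing each pattern with the root."""
    matches = set()
    for pattern in pathnames:
        full_pattern = os.path.join(root_dir, pattern) if root_dir else pattern
        for match in glob.glob(full_pattern, recursive=recursive):
            # Strip the root again so results look like glob(..., root_dir=root_dir).
            matches.add(os.path.relpath(match, root_dir) if root_dir else match)
    return matches
```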
@@ -117,7 +118,7 @@ class DatasetLoader:
     training.
     """

-    def __init__(self, config: DatasetConfig, cache_dir: Optional[str] = None):
+    def __init__(self, config: DatasetConfig, cache_dir: str | None = None):
         """Constructor."""
         self.config = config
         self.cache_dir = cache_dir if cache_dir else get_default_cache_location()
@@ -187,17 +188,18 @@ def state(self) -> DatasetState:
         return DatasetState.NOT_LOADED

     @property
-    def download_urls(self) -> List[str]:
+    def download_urls(self) -> list[str]:
         return _list_of_strings(self.config.download_urls)

     @property
-    def download_filenames(self) -> List[str]:
+    def download_filenames(self) -> list[str]:
         """Filenames for downloaded files inferred from download_urls."""
         if self.config.archive_filenames:
             return _list_of_strings(self.config.archive_filenames)
         return [os.path.basename(urlparse(url).path) for url in self.download_urls]

-    def get_mirror_download_paths(self, mirror: DatasetFallbackMirror):
+    @staticmethod
+    def get_mirror_download_paths(mirror: DatasetFallbackMirror):
         """Filenames for downloaded files inferred from mirror download_paths."""
         return _list_of_strings(mirror.download_paths)

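The `download_filenames` fallback above derives archive names from the URL path. A quick example of that `urlparse` + `basename` pattern; the URL is made up:

```python
import os
from urllib.parse import urlparse

url = "https://example.com/datasets/archive.tar.gz?token=abc123"
# Query string and host are dropped; only the last path component survives.
print(os.path.basename(urlparse(url).path))  # archive.tar.gz
```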
@@ -212,17 +214,17 @@ def description(self) -> str:
         return f"{self.config.name} {self.config.version}\n{self.config.description}"

     @property
-    def model_configs(self) -> Dict[str, Dict]:
+    def model_configs(self) -> dict[str, dict]:
         """Returns a dictionary of built-in model configs for this dataset."""
         return model_configs_for_dataset(self.config.name)

     @property
-    def best_model_config(self) -> Optional[Dict]:
+    def best_model_config(self) -> dict | None:
         """Returns the best built-in model config for this dataset, or None."""
         return self.model_configs.get("best")

     @property
-    def default_model_config(self) -> Optional[Dict]:
+    def default_model_config(self) -> dict | None:
         """Returns the default built-in model config for this dataset.

         This is a good first model which should train in under 10m on a current laptop without GPU acceleration.
@@ -252,7 +254,7 @@ def export(self, output_directory: str) -> None:
         else:
             shutil.copy2(source, destination)

-    def _download_and_process(self, kaggle_username=None, kaggle_key=None):
+    def _download_and_process(self, kaggle_username: str | None = None, kaggle_key: str | None = None):
         """Loads the dataset, downloading and processing it if needed.

         If dataset is already processed, does nothing.
@@ -283,14 +285,18 @@ def _download_and_process(self, kaggle_username=None, kaggle_key=None):
         except Exception:
             logger.exception("Failed to transform dataset")

-    def load(self, split=False, kaggle_username=None, kaggle_key=None) -> pd.DataFrame:
+    def load(
+        self, kaggle_username: str | None = None, kaggle_key: str | None = None, split: bool = False
+    ) -> pd.DataFrame | tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         """Loads the dataset, downloading and processing it if needed.

         Note: This method is also responsible for splitting the data, returning a single dataframe if split=False, and a
         3-tuple of train, val, test if split=True.

+        :param kaggle_username: (str) username on Kaggle platform
+        :param kaggle_key: (str) API key on Kaggle platform
         :param split: (bool) splits dataset along 'split' column if present. The split column should always have values
-        0: train, 1: validation, 2: test.
+            0: train, 1: validation, 2: test.
         """
         self._download_and_process(kaggle_username=kaggle_username, kaggle_key=kaggle_key)
         if self.state == DatasetState.TRANSFORMED:
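
For context on the reworked `load()` signature, a hedged usage sketch; the config object, cache directory, and variable names are placeholders, not values from this diff:

```python
# Hypothetical caller; `config` is assumed to be a DatasetConfig built elsewhere.
loader = DatasetLoader(config, cache_dir="/tmp/dataset_cache")

# split=False (default): one dataframe, optionally carrying a numeric `split` column.
df = loader.load()

# split=True: partitioned on the split column, 0 = train, 1 = validation, 2 = test.
train_df, val_df, test_df = loader.load(split=True)
```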
@@ -300,7 +306,7 @@ def load(self, split=False, kaggle_username=None, kaggle_key=None) -> pd.DataFra
         else:
             return dataset_df

-    def download(self, kaggle_username=None, kaggle_key=None) -> List[str]:
+    def download(self, kaggle_username: str | None = None, kaggle_key: str | None = None) -> list[str]:
         if not os.path.exists(self.raw_dataset_dir):
             os.makedirs(self.raw_dataset_dir)
         if self.is_kaggle_dataset:
@@ -347,7 +353,7 @@ def verify(self) -> None:
             digest = _sha256_digest(path)
             logger.info(f"  {filename}: {digest}")

-    def extract(self) -> List[str]:
+    def extract(self) -> list[str]:
         extracted_files = set()
         for download_filename in self.download_filenames:
             download_path = os.path.join(self.raw_dataset_dir, download_filename)
@@ -373,7 +379,7 @@ def transform(self) -> None:
         transformed_dataframe = self.transform_dataframe(unprocessed_dataframe)
         self.save_processed(transformed_dataframe)

-    def transform_files(self, file_paths: List[str]) -> List[str]:
+    def transform_files(self, file_paths: list[str]) -> list[str]:
         """Transform data files before loading to dataframe.

         Subclasses should override this method to process files before loading dataframe, calling the base class
@@ -409,7 +415,7 @@ def load_file_to_dataframe(self, file_path: str) -> pd.DataFrame:
         else:
             raise ValueError(f"Unsupported dataset file type: {file_extension}")

-    def load_files_to_dataframe(self, file_paths: List[str], root_dir=None) -> pd.DataFrame:
+    def load_files_to_dataframe(self, file_paths: list[str], root_dir=None) -> pd.DataFrame:
         """Loads a file or list of files and returns a dataframe.

         Subclasses may override this method to change the loader's behavior for groups of files.
@@ -439,7 +445,7 @@ def load_files_to_dataframe(self, file_paths: List[str], root_dir=None) -> pd.Da
             logger.warning(f"Error setting column names: {e}")
         return pd.concat(dataframes, ignore_index=True)

-    def load_unprocessed_dataframe(self, file_paths: List[str]) -> pd.DataFrame:
+    def load_unprocessed_dataframe(self, file_paths: list[str]) -> pd.DataFrame:
         """Load dataset files into a dataframe.

         Will use the list of data files in the dataset directory as a default if all of config's dataset_filenames,
@@ -451,7 +457,6 @@ def load_unprocessed_dataframe(self, file_paths: List[str]) -> pd.DataFrame:
             _list_of_strings(self.config.validation_filenames), root_dir=self.raw_dataset_dir
         )
         test_paths = _glob_multiple(_list_of_strings(self.config.test_filenames), root_dir=self.raw_dataset_dir)
-        dataframes = []
         if self.config.name == "hugging_face":
             dataframes = self._get_dataframe_with_fixed_splits_from_hf()
         else:
@@ -519,7 +524,8 @@ def get_mtime(self) -> float:
         """Last modified time of the processed dataset after downloading successfully."""
         return os.path.getmtime(self.processed_dataset_path)

-    def split(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+    @staticmethod
+    def split(dataset: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
         if SPLIT in dataset:
             dataset[SPLIT] = pd.to_numeric(dataset[SPLIT])
             training_set = dataset[dataset[SPLIT] == 0].drop(columns=[SPLIT])
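
The `split()` logic above partitions on a `SPLIT` column coerced to numeric. A self-contained sketch of that convention, with toy data standing in for a real dataset:

```python
import pandas as pd

SPLIT = "split"  # assumed to match the SPLIT constant used by the loader

df = pd.DataFrame({"feature": [10, 20, 30, 40], SPLIT: ["0", "0", "1", "2"]})
df[SPLIT] = pd.to_numeric(df[SPLIT])

# 0 = train, 1 = validation, 2 = test; the split column is dropped from each part.
training_set = df[df[SPLIT] == 0].drop(columns=[SPLIT])
validation_set = df[df[SPLIT] == 1].drop(columns=[SPLIT])
test_set = df[df[SPLIT] == 2].drop(columns=[SPLIT])
print(len(training_set), len(validation_set), len(test_set))  # 2 1 1
```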