Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DNN model training pipeline #6

Merged
merged 45 commits into from
Sep 13, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
5653112
nn architecture
dmitryduev Mar 17, 2021
23b3e59
util to work with dataset
dmitryduev Mar 17, 2021
2032bc8
util to work with dataset
dmitryduev Mar 17, 2021
87c7ec3
refactor taxonomy description and parsing
dmitryduev Mar 18, 2021
3c9f773
training pipeline
dmitryduev Mar 18, 2021
bc74784
training pipeline
dmitryduev Mar 18, 2021
0957b0b
training pipeline
dmitryduev Mar 18, 2021
7bec033
training pipeline
dmitryduev Mar 18, 2021
7701ace
training pipeline
dmitryduev Mar 18, 2021
ca51f49
training pipeline
dmitryduev Mar 18, 2021
8fd77ab
debugging training pipeline
dmitryduev Mar 18, 2021
cb7b5aa
debugging training pipeline
dmitryduev Mar 19, 2021
e2fa1eb
merge upstream/main
dmitryduev Mar 24, 2021
21b24f6
training works
dmitryduev Mar 24, 2021
9582b24
clean up commented code
dmitryduev Mar 24, 2021
b35789b
specify gsutil version in requirements
dmitryduev Mar 24, 2021
55c73c2
improve wording in taxonomy specs
dmitryduev Mar 24, 2021
40eb7f5
test training pipeline
dmitryduev Mar 25, 2021
9193ede
add reference to scope-i paper
dmitryduev Mar 25, 2021
ce7fae9
document training pipeline usage
dmitryduev Mar 25, 2021
c8fc864
parse boolean args
dmitryduev Mar 25, 2021
7b9bc48
d12 features
dmitryduev Mar 25, 2021
da3001b
different feature scaling options
dmitryduev Mar 25, 2021
48e382d
resolve conflict in config.defaults
dmitryduev Mar 26, 2021
688baeb
resolve conflict in config.defaults
dmitryduev Mar 26, 2021
e96d696
replace literal_eval with json.loads
dmitryduev Mar 26, 2021
edbadc6
fix t_0 for phase-folded lc plots
dmitryduev Mar 26, 2021
acf2ee8
Merge branch 'main' of github.com:ZwickyTransientFacility/scope into …
dmitryduev Mar 26, 2021
b67bf55
fix merge conflicts with master
dmitryduev Aug 1, 2021
1acf5e7
fix merge conflicts with master
dmitryduev Aug 1, 2021
7fabb76
lr -> learning rate
dmitryduev Aug 1, 2021
37f6e2a
parametrize GCS paths
dmitryduev Aug 1, 2021
cbca1ce
parametrize GCS paths
dmitryduev Aug 1, 2021
8a2f946
bugfix in feature standardization + d15 features and stats
dmitryduev Aug 12, 2021
0c6d661
document training parameters
dmitryduev Aug 12, 2021
c6e5fa0
document training parameters
dmitryduev Aug 12, 2021
f57c377
fix feature scaling/bookkeeping issues
dmitryduev Aug 13, 2021
d3830cb
use keras' functional api for now
dmitryduev Sep 13, 2021
300e10c
fix test
dmitryduev Sep 13, 2021
f06909d
fix test
dmitryduev Sep 13, 2021
d0d2a54
fix doc building
dmitryduev Sep 13, 2021
0bf5dee
fix doc building
dmitryduev Sep 13, 2021
27ce517
fix doc building
dmitryduev Sep 13, 2021
66647b1
fix doc building
dmitryduev Sep 13, 2021
e8cc5ae
fix typo in test.yml
dmitryduev Sep 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
training pipeline
  • Loading branch information
dmitryduev committed Mar 18, 2021
commit 0957b0bf59b73bf6054e25de1240cf5c43bd3098
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ repos:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/python/black
- repo: https://github.com/psf/black
rev: 20.8b1
hooks:
- id: black
Expand Down
70 changes: 35 additions & 35 deletions scope.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def status(message):
print(f"\r[✓] {message}")


def check_configs(config_wildcards: Sequence = ("config.*yaml", )):
def check_configs(config_wildcards: Sequence = ("config.*yaml",)):
"""
- Check if config files exist
- Offer to use the config files that match the wildcards
Expand Down Expand Up @@ -89,7 +89,6 @@ def check_configs(config_wildcards: Sequence = ("config.*yaml", )):


class Scope:

def __init__(self):
# check configuration
with status("Checking configuration"):
Expand Down Expand Up @@ -119,11 +118,11 @@ def __init__(self):
print("Kowalski not available")

def _get_nearest_gaia(
self,
positions: Sequence[Sequence[float]],
catalog: str = None,
max_distance: Union[float, int] = 5.0,
distance_units: str = "arcsec",
self,
positions: Sequence[Sequence[float]],
catalog: str = None,
max_distance: Union[float, int] = 5.0,
distance_units: str = "arcsec",
) -> pd.DataFrame:
"""Get nearest Gaia source for a set of given positions

Expand Down Expand Up @@ -158,23 +157,24 @@ def _get_nearest_gaia(
"phot_rp_mean_mag": 1,
"ra": 1,
"dec": 1,
}
},
}
}
},
},
"kwargs": {
"limit": 1
}
"kwargs": {"limit": 1},
}
response = self.kowalski.query(query=query)
gaia_nearest = [
v[0] for k, v in response.get("data").get(catalog).items()
if len(v) > 0
v[0] for k, v in response.get("data").get(catalog).items() if len(v) > 0
]
df = pd.DataFrame.from_records(gaia_nearest)

df["M"] = df["phot_g_mean_mag"] + 5 * np.log10(df["parallax"] * 0.001) + 5
df["Ml"] = df["phot_g_mean_mag"] + 5 * np.log10((df["parallax"] + df["parallax_error"]) * 0.001) + 5
df["Ml"] = (
df["phot_g_mean_mag"]
+ 5 * np.log10((df["parallax"] + df["parallax_error"]) * 0.001)
+ 5
)
df["BP-RP"] = df["phot_bp_mean_mag"] - df["phot_rp_mean_mag"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we using errors on BP/RP in any way? The question that was asked of Laurent in the first ZTF-Gaia talk was important: will they be presenting average colors with error-bars in DR3. He didn't know.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part of code wasn't touched in this PR, except for linting, so I think this should be moved into issues.
(I think we don't use it in this context).


return df
Expand Down Expand Up @@ -206,9 +206,7 @@ def _get_light_curve_data(
"object_coordinates": {
"cone_search_radius": cone_search_radius,
"cone_search_unit": cone_search_unit,
"radec": {
"target": [ra, dec]
}
"radec": {"target": [ra, dec]},
},
"catalogs": {
catalog: {
Expand All @@ -223,11 +221,11 @@ def _get_light_curve_data(
"data.ra": 1,
"data.dec": 1,
"data.programid": 1,
"data.catflags": 1
}
"data.catflags": 1,
},
}
}
}
},
},
}
response = self.kowalski.query(query=query)
light_curves_raw = response.get("data").get(catalog).get("target")
Expand Down Expand Up @@ -276,7 +274,7 @@ def doc(self):
path_static.mkdir(parents=True, exist_ok=True)
tdtax.write_viz(
make_tdtax_taxonomy(self.config["taxonomy"]),
outname=path_static / "taxonomy.html"
outname=path_static / "taxonomy.html",
)

# generate images for the Field Guide
Expand All @@ -288,7 +286,9 @@ def doc(self):
with status("Generating example light curves"):
path_doc_data = pathlib.Path(__file__).parent.absolute() / "doc" / "data"

for sample_object_name, sample_object in self.config["docs"]["field_guide"].items():
for sample_object_name, sample_object in self.config["docs"][
"field_guide"
].items():
sample_light_curves = self._get_light_curve_data(
ra=sample_object["coordinates"][0],
dec=sample_object["coordinates"][1],
Expand All @@ -304,7 +304,10 @@ def doc(self):
# example HR diagrams for all Golden sets
with status("Generating HR diagrams for Golden sets"):
path_gaia_hr_histogram = (
pathlib.Path(__file__).parent.absolute() / "doc" / "data" / "gaia_hr_histogram.dat"
pathlib.Path(__file__).parent.absolute()
/ "doc"
/ "data"
/ "gaia_hr_histogram.dat"
)
# stored as ra/decs in csv format under /data/golden
golden_sets = pathlib.Path(__file__).parent.absolute() / "data" / "golden"
Expand Down Expand Up @@ -376,7 +379,7 @@ def train(
path_dataset: str,
gpu: Optional[int] = None,
verbose: bool = False,
**kwargs
**kwargs,
):
"""Train classifier

Expand All @@ -393,18 +396,17 @@ def train(
features = self.config["features"][train_config["features"]]

ds = Dataset(
tag=tag,
path_dataset=path_dataset,
features=features,
verbose=verbose
tag=tag, path_dataset=path_dataset, features=features, verbose=verbose
)

label = train_config["label"]

# values from kwargs override those defined in config. if latter is absent, use reasonable default
threshold = kwargs.get("threshold", train_config.get("threshold", 0.5))
balance = kwargs.get("balance", train_config.get("balance", None))
weight_per_class = kwargs.get("weight_per_class", train_config.get("weight_per_class", False))
weight_per_class = kwargs.get(
"weight_per_class", train_config.get("weight_per_class", False)
)

test_size = kwargs.get("test_size", 0.1)
val_size = kwargs.get("val_size", 0.1)
Expand Down Expand Up @@ -485,16 +487,14 @@ def train(

# eval and save
stats = classifier.evaluate(
datasets['test'],
callbacks=[tfa.callbacks.TQDMProgressBar()],
verbose=0
datasets['test'], callbacks=[tfa.callbacks.TQDMProgressBar()], verbose=0
)
print(stats)

classifier.save(
output_path=f"models/{tag}",
output_format="hdf5",
tag=f'{datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")}'
tag=f'{datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")}',
)


Expand Down