Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DNN model training pipeline #6

Merged
merged 45 commits into from
Sep 13, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
5653112
nn architecture
dmitryduev Mar 17, 2021
23b3e59
util to work with dataset
dmitryduev Mar 17, 2021
2032bc8
util to work with dataset
dmitryduev Mar 17, 2021
87c7ec3
refactor taxonomy description and parsing
dmitryduev Mar 18, 2021
3c9f773
training pipeline
dmitryduev Mar 18, 2021
bc74784
training pipeline
dmitryduev Mar 18, 2021
0957b0b
training pipeline
dmitryduev Mar 18, 2021
7bec033
training pipeline
dmitryduev Mar 18, 2021
7701ace
training pipeline
dmitryduev Mar 18, 2021
ca51f49
training pipeline
dmitryduev Mar 18, 2021
8fd77ab
debugging training pipeline
dmitryduev Mar 18, 2021
cb7b5aa
debugging training pipeline
dmitryduev Mar 19, 2021
e2fa1eb
merge upstream/main
dmitryduev Mar 24, 2021
21b24f6
training works
dmitryduev Mar 24, 2021
9582b24
clean up commented code
dmitryduev Mar 24, 2021
b35789b
specify gsutil version in requirements
dmitryduev Mar 24, 2021
55c73c2
improve wording in taxonomy specs
dmitryduev Mar 24, 2021
40eb7f5
test training pipeline
dmitryduev Mar 25, 2021
9193ede
add reference to scope-i paper
dmitryduev Mar 25, 2021
ce7fae9
document training pipeline usage
dmitryduev Mar 25, 2021
c8fc864
parse boolean args
dmitryduev Mar 25, 2021
7b9bc48
d12 features
dmitryduev Mar 25, 2021
da3001b
different feature scaling options
dmitryduev Mar 25, 2021
48e382d
resolve conflict in config.defaults
dmitryduev Mar 26, 2021
688baeb
resolve conflict in config.defaults
dmitryduev Mar 26, 2021
e96d696
replace literal_eval with json.loads
dmitryduev Mar 26, 2021
edbadc6
fix t_0 for phase-folded lc plots
dmitryduev Mar 26, 2021
acf2ee8
Merge branch 'main' of github.com:ZwickyTransientFacility/scope into …
dmitryduev Mar 26, 2021
b67bf55
fix merge conflicts with master
dmitryduev Aug 1, 2021
1acf5e7
fix merge conflicts with master
dmitryduev Aug 1, 2021
7fabb76
lr -> learning rate
dmitryduev Aug 1, 2021
37f6e2a
parametrize GCS paths
dmitryduev Aug 1, 2021
cbca1ce
parametrize GCS paths
dmitryduev Aug 1, 2021
8a2f946
bugfix in feature standardization + d15 features and stats
dmitryduev Aug 12, 2021
0c6d661
document training parameters
dmitryduev Aug 12, 2021
c6e5fa0
document training parameters
dmitryduev Aug 12, 2021
f57c377
fix feature scaling/bookkeeping issues
dmitryduev Aug 13, 2021
d3830cb
use keras' functional api for now
dmitryduev Sep 13, 2021
300e10c
fix test
dmitryduev Sep 13, 2021
f06909d
fix test
dmitryduev Sep 13, 2021
d0d2a54
fix doc building
dmitryduev Sep 13, 2021
0bf5dee
fix doc building
dmitryduev Sep 13, 2021
27ce517
fix doc building
dmitryduev Sep 13, 2021
66647b1
fix doc building
dmitryduev Sep 13, 2021
e8cc5ae
fix typo in test.yml
dmitryduev Sep 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
training pipeline
  • Loading branch information
dmitryduev committed Mar 18, 2021
commit 0957b0bf59b73bf6054e25de1240cf5c43bd3098
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ repos:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/python/black
- repo: https://github.com/psf/black
rev: 20.8b1
hooks:
- id: black
Expand Down
70 changes: 35 additions & 35 deletions scope.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def status(message):
print(f"\r[✓] {message}")


def check_configs(config_wildcards: Sequence = ("config.*yaml", )):
def check_configs(config_wildcards: Sequence = ("config.*yaml",)):
"""
- Check if config files exist
- Offer to use the config files that match the wildcards
Expand Down Expand Up @@ -89,7 +89,6 @@ def check_configs(config_wildcards: Sequence = ("config.*yaml", )):


class Scope:

def __init__(self):
# check configuration
with status("Checking configuration"):
Expand Down Expand Up @@ -119,11 +118,11 @@ def __init__(self):
print("Kowalski not available")

def _get_nearest_gaia(
self,
positions: Sequence[Sequence[float]],
catalog: str = None,
max_distance: Union[float, int] = 5.0,
distance_units: str = "arcsec",
self,
positions: Sequence[Sequence[float]],
catalog: str = None,
max_distance: Union[float, int] = 5.0,
distance_units: str = "arcsec",
) -> pd.DataFrame:
"""Get nearest Gaia source for a set of given positions

Expand Down Expand Up @@ -158,23 +157,24 @@ def _get_nearest_gaia(
"phot_rp_mean_mag": 1,
"ra": 1,
"dec": 1,
}
},
}
}
},
},
"kwargs": {
"limit": 1
}
"kwargs": {"limit": 1},
}
response = self.kowalski.query(query=query)
gaia_nearest = [
v[0] for k, v in response.get("data").get(catalog).items()
if len(v) > 0
v[0] for k, v in response.get("data").get(catalog).items() if len(v) > 0
]
df = pd.DataFrame.from_records(gaia_nearest)

df["M"] = df["phot_g_mean_mag"] + 5 * np.log10(df["parallax"] * 0.001) + 5
df["Ml"] = df["phot_g_mean_mag"] + 5 * np.log10((df["parallax"] + df["parallax_error"]) * 0.001) + 5
df["Ml"] = (
df["phot_g_mean_mag"]
+ 5 * np.log10((df["parallax"] + df["parallax_error"]) * 0.001)
+ 5
)
df["BP-RP"] = df["phot_bp_mean_mag"] - df["phot_rp_mean_mag"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we using errors on BP/RP in any way? The question that was asked of Laurent in the first ZTF-Gaia talk was important: will they be presenting average colors with error-bars in DR3. He didn't know.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part of code wasn't touched in this PR, except for linting, so I think this should be moved into issues.
(I think we don't use it in this context).


return df
Expand Down Expand Up @@ -206,9 +206,7 @@ def _get_light_curve_data(
"object_coordinates": {
"cone_search_radius": cone_search_radius,
"cone_search_unit": cone_search_unit,
"radec": {
"target": [ra, dec]
}
"radec": {"target": [ra, dec]},
},
"catalogs": {
catalog: {
Expand All @@ -223,11 +221,11 @@ def _get_light_curve_data(
"data.ra": 1,
"data.dec": 1,
"data.programid": 1,
"data.catflags": 1
}
"data.catflags": 1,
},
}
}
}
},
},
}
response = self.kowalski.query(query=query)
light_curves_raw = response.get("data").get(catalog).get("target")
Expand Down Expand Up @@ -276,7 +274,7 @@ def doc(self):
path_static.mkdir(parents=True, exist_ok=True)
tdtax.write_viz(
make_tdtax_taxonomy(self.config["taxonomy"]),
outname=path_static / "taxonomy.html"
outname=path_static / "taxonomy.html",
)

# generate images for the Field Guide
Expand All @@ -288,7 +286,9 @@ def doc(self):
with status("Generating example light curves"):
path_doc_data = pathlib.Path(__file__).parent.absolute() / "doc" / "data"

for sample_object_name, sample_object in self.config["docs"]["field_guide"].items():
for sample_object_name, sample_object in self.config["docs"][
"field_guide"
].items():
sample_light_curves = self._get_light_curve_data(
ra=sample_object["coordinates"][0],
dec=sample_object["coordinates"][1],
Expand All @@ -304,7 +304,10 @@ def doc(self):
# example HR diagrams for all Golden sets
with status("Generating HR diagrams for Golden sets"):
path_gaia_hr_histogram = (
pathlib.Path(__file__).parent.absolute() / "doc" / "data" / "gaia_hr_histogram.dat"
pathlib.Path(__file__).parent.absolute()
/ "doc"
/ "data"
/ "gaia_hr_histogram.dat"
)
# stored as ra/decs in csv format under /data/golden
golden_sets = pathlib.Path(__file__).parent.absolute() / "data" / "golden"
Expand Down Expand Up @@ -376,7 +379,7 @@ def train(
path_dataset: str,
gpu: Optional[int] = None,
verbose: bool = False,
**kwargs
**kwargs,
):
"""Train classifier

Expand All @@ -393,18 +396,17 @@ def train(
features = self.config["features"][train_config["features"]]

ds = Dataset(
tag=tag,
path_dataset=path_dataset,
features=features,
verbose=verbose
tag=tag, path_dataset=path_dataset, features=features, verbose=verbose
)

label = train_config["label"]

# values from kwargs override those defined in config. if latter is absent, use reasonable default
threshold = kwargs.get("threshold", train_config.get("threshold", 0.5))
balance = kwargs.get("balance", train_config.get("balance", None))
weight_per_class = kwargs.get("weight_per_class", train_config.get("weight_per_class", False))
weight_per_class = kwargs.get(
"weight_per_class", train_config.get("weight_per_class", False)
)

test_size = kwargs.get("test_size", 0.1)
val_size = kwargs.get("val_size", 0.1)
Expand Down Expand Up @@ -485,16 +487,14 @@ def train(

# eval and save
stats = classifier.evaluate(
datasets['test'],
callbacks=[tfa.callbacks.TQDMProgressBar()],
verbose=0
datasets['test'], callbacks=[tfa.callbacks.TQDMProgressBar()], verbose=0
)
print(stats)

classifier.save(
output_path=f"models/{tag}",
output_format="hdf5",
tag=f'{datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")}'
tag=f'{datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")}',
)


Expand Down