From 0d264ca8295bcca835344e499e6fbf485a7e3c3b Mon Sep 17 00:00:00 2001 From: Riley Hales PhD <39097632+rileyhales@users.noreply.github.com> Date: Thu, 25 Apr 2024 08:41:53 -0600 Subject: [PATCH 01/11] enable forecast records plot --- docs/api-documentation/plots.rst | 2 +- geoglows/__init__.py | 2 +- geoglows/_plots/__init__.py | 2 ++ geoglows/_plots/plotly_forecasts.py | 10 ++++---- geoglows/_plots/plots.py | 39 +++++++++++++++++++++++------ 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/docs/api-documentation/plots.rst b/docs/api-documentation/plots.rst index 9c29282..95d0eac 100644 --- a/docs/api-documentation/plots.rst +++ b/docs/api-documentation/plots.rst @@ -4,4 +4,4 @@ geoglows.plots .. automodule:: geoglows.plots :members: - forecast, forecast_stats, forecast_ensembles, retrospective, annual_averages, monthly_averages, daily_averages, flow_duration_curve, corrected_retrospective, corrected_month_average, corrected_day_average, corrected_scatterplots + forecast, forecast_stats, forecast_ensembles, forecast_records, retrospective, annual_averages, monthly_averages, daily_averages, flow_duration_curve, corrected_retrospective, corrected_month_average, corrected_day_average, corrected_scatterplots diff --git a/geoglows/__init__.py b/geoglows/__init__.py index ca99c8f..b3ec040 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'METADATA_TABLE_PATH' ] -__version__ = '1.2.1' +__version__ = '1.3.0' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_plots/__init__.py b/geoglows/_plots/__init__.py index 2cf43c5..84233f5 100644 --- a/geoglows/_plots/__init__.py +++ b/geoglows/_plots/__init__.py @@ -2,6 +2,7 @@ forecast, forecast_stats, forecast_ensembles, + forecast_records, retrospective, daily_averages, monthly_averages, @@ -17,6 +18,7 @@ 'forecast', 'forecast_stats', 'forecast_ensembles', + 'forecast_records', 'retrospective', 'daily_averages', 'monthly_averages', diff --git a/geoglows/_plots/plotly_forecasts.py b/geoglows/_plots/plotly_forecasts.py index 464ab23..66802fc 100644 --- a/geoglows/_plots/plotly_forecasts.py +++ b/geoglows/_plots/plotly_forecasts.py @@ -259,12 +259,12 @@ def forecast_ensembles(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_tit return go.Figure(scatter_plots, layout=layout) -def forecast_records(recs: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_titles: list = False, ) -> go.Figure: +def forecast_records(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_titles: list = False, ) -> go.Figure: """ Makes the streamflow saved forecast data and metadata into a plotly plot Args: - recs: the csv response from forecast_records + df: the csv response from forecast_records rp_df: the csv response from return_periods plot_titles: a list of strings to place in the figure title. each list item will be on a new line. 
@@ -272,14 +272,14 @@ def forecast_records(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_tit
         plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
     """
     # Start processing the inputs
-    dates = recs.index.tolist()
+    dates = df.index.tolist()
     startdate = dates[0]
     enddate = dates[-1]
 
     plot_data = {
         'x_records': dates,
-        'recorded_flows': recs.dropna(axis=0).values.flatten(),
-        'y_max': max(recs.values),
+        'recorded_flows': df.dropna(axis=0).values.flatten(),
+        'y_max': np.nanmax(df.values),
     }
     if rp_df is not None:
         plot_data.update(rp_df.to_dict(orient='index').items())
diff --git a/geoglows/_plots/plots.py b/geoglows/_plots/plots.py
index 2c759c4..a886dcf 100644
--- a/geoglows/_plots/plots.py
+++ b/geoglows/_plots/plots.py
@@ -2,10 +2,17 @@
 import plotly.graph_objects as go
 
 from .format_tools import plotly_figure_to_html_plot
+from .plotly_bias_corrected import (
+    corrected_retrospective as plotly_corrected_retrospective,
+    corrected_month_average as plotly_corrected_month_average,
+    corrected_day_average as plotly_corrected_day_average,
+    corrected_scatterplots as plotly_corrected_scatterplots,
+)
 from .plotly_forecasts import (
     forecast as plotly_forecast,
     forecast_stats as plotly_forecast_stats,
-    forecast_ensembles as plotly_forecast_ensembles
+    forecast_ensembles as plotly_forecast_ensembles,
+    forecast_records as plotly_forecast_records,
 )
 from .plotly_retrospective import (
     retrospective as plotly_retrospective,
@@ -14,17 +21,12 @@
     annual_averages as plotly_annual_averages,
     flow_duration_curve as plotly_flow_duration_curve,
 )
-from .plotly_bias_corrected import (
-    corrected_retrospective as plotly_corrected_retrospective,
-    corrected_month_average as plotly_corrected_month_average,
-    corrected_day_average as plotly_corrected_day_average,
-    corrected_scatterplots as plotly_corrected_scatterplots,
-)
 
 __all__ = [
     'forecast',
     'forecast_stats',
     'forecast_ensembles',
+    'forecast_records',
 
     'retrospective',
     'daily_averages',
@@ -105,6 +107,29 @@ def forecast_ensembles(df: pd.DataFrame, *,
         raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
 
 
+def forecast_records(df: pd.DataFrame, *,
+                     plot_type: str = 'plotly',
+                     rp_df: pd.DataFrame = None,
+                     plot_titles: list = None, ) -> go.Figure:
+    """
+    Plots forecasted streamflow and optional return periods
+    Args:
+        df: the DataFrame of saved forecast records from geoglows.data.forecast_records
+        plot_type: either 'plotly' or 'html', the type of figure to return
+        rp_df: optional DataFrame of return period values from geoglows.data.return_periods
+        plot_titles: a list of strings to place in the figure title, each on a new line
+
+    Returns:
+        go.Figure
+    """
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_forecast_records(df, rp_df=rp_df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
 def retrospective(df: pd.DataFrame, *,
                   plot_type: str = 'plotly',
                   rp_df: pd.DataFrame = None,
From 5be2a0197f24ea08774cde4f8c55c101706383fa Mon Sep 17 00:00:00 2001
From: rileyhales
Date: Tue, 30 Apr 2024 21:53:32 -0600
Subject: [PATCH 02/11] loosen versions, correct warnings, xarray datasets
 unfiltered by river number
---
 docs/conf.py           |  2 +-
 geoglows/__init__.py   |  2 +-
 geoglows/data.py       | 35 ++++++++++++++++++++---------------
 geoglows/streamflow.py | 18 +++++++++---------
 requirements.txt       |  6 +++---
 setup.py               |  2 +-
 6 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index d09a5fc..1abade2 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -15,7 +15,7 @@ author = 'Riley Hales, PhD'
 
 # The full version, including alpha/beta/rc tags
-release = '1.2.0'
+release = '1.4.0'
 
 master_doc = 'index'
 
 #
-- General configuration --------------------------------------------------- diff --git a/geoglows/__init__.py b/geoglows/__init__.py index b3ec040..ba86d67 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'METADATA_TABLE_PATH' ] -__version__ = '1.3.0' +__version__ = '1.4.0' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/data.py b/geoglows/data.py index de09d7d..fe4d441 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -57,9 +57,10 @@ def from_aws(*args, **kwargs): s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) date = kwargs.get('date', False) if not date: - dates = sorted([x.split('/')[-1] for x in s3.ls(ODP_FORECAST_S3_BUCKET_URI)], reverse=True) - dates = [x.split('.')[0] for x in dates if x.endswith('.zarr')] # ignore the index.html file - dates = [x.replace('00.zarr', '') for x in dates] + zarr_vars = ['rivid', 'Qout', 'time', 'ensemble'] + dates = [s3.glob(os.path.join(ODP_FORECAST_S3_BUCKET_URI, f'*.zarr/{var}')) for var in zarr_vars] + dates = [set([d.split('/')[1].replace('.zarr', '') for d in date]) for date in dates] + dates = sorted(set.intersection(*dates), reverse=True) if product_name == 'dates': return pd.DataFrame(dict(dates=dates)) date = dates[0] @@ -121,7 +122,7 @@ def from_rest(*args, **kwargs): endpoint = f'https://{endpoint}' if not endpoint.startswith(('https://', 'http://')) else endpoint version = kwargs.get('version', DEFAULT_REST_ENDPOINT_VERSION) - assert version in ('v2', ), ValueError(f'Unrecognized model version parameter: {version}') + assert version in ('v2',), ValueError(f'Unrecognized model version parameter: {version}') product_name = function.__name__.replace("_", "").lower() @@ -180,6 +181,7 @@ def main(*args, **kwargs): if source == 'rest': return from_rest(*args, **kwargs) return from_aws(*args, **kwargs) + main.__doc__ = function.__doc__ # necessary for code documentation auto generators return main @@ -290,15 +292,10 @@ def retrospective(river_id: int or list, format: str = 'df') -> pd.DataFrame or """ s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/retrospective.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store).sel(rivid=river_id) + ds = xr.open_zarr(s3store) if format == 'xarray': return ds - return ds.to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') - - -def historical(*args, **kwargs): - """Alias for retrospective""" - return retrospective(*args, **kwargs) + return ds.sel(rivid=river_id).to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') def daily_averages(river_id: int or list) -> pd.DataFrame: @@ -343,24 +340,32 @@ def annual_averages(river_id: int or list) -> pd.DataFrame: return calc_annual_averages(df) -def return_periods(river_id: int or list, format: str = 'df') -> pd.DataFrame or xr.Dataset: +def return_periods(river_id: int or list, format: str = 'df', method: str = 'gumbel1') -> pd.DataFrame or xr.Dataset: """ Retrieves the return period thresholds based on a specified historic simulation forcing on a certain river_id. Args: river_id (int): the ID of a stream, should be a 9 digit integer format (str): the format to return the data, either 'df' or 'xarray'. default is 'df' + method (str): the method to use to estimate the return period thresholds. 
default is 'gumbel1' + + Changelog: + v1.4.0: adds method parameter for future expansion of multiple return period methods Returns: pd.DataFrame """ + rp_methods = { + 'gumbel1': 'gumbel1_return_period', + } + assert method in rp_methods, f'Unrecognized return period estimation method given: {method}' s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/return-periods.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store).sel(rivid=river_id) + ds = xr.open_zarr(s3store) if format == 'xarray': return ds - return (ds['return_period_flow'].to_dataframe().reset_index() - .pivot(index='rivid', columns='return_period', values='return_period_flow')) + return (ds.sel(rivid=river_id)[rp_methods[method]].to_dataframe().reset_index() + .pivot(index='rivid', columns='return_period', values=rp_methods[method])) # model config and supplementary data diff --git a/geoglows/streamflow.py b/geoglows/streamflow.py index a0cb642..28deb7e 100644 --- a/geoglows/streamflow.py +++ b/geoglows/streamflow.py @@ -42,7 +42,7 @@ def forecast_stats(reach_id: int, return_format: str = 'csv', forecast_date: str data = streamflow.rst.forecast_stats(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'ForecastStats/' # if you only wanted the url, quit here @@ -113,7 +113,7 @@ def forecast_warnings(region: str = 'all', return_format='csv', data = streamflow.rst.forecast_warnings('australia-geoglows') """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'ForecastWarnings/' # if you only wanted the url, quit here @@ -148,7 +148,7 @@ def forecast_records(reach_id: int, start_date: str = None, end_date: str = None data = streamflow.rst.forecast_warnings('australia-geoglows') """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'ForecastRecords/' # if you only wanted the url, quit here @@ -188,7 +188,7 @@ def historic_simulation(reach_id: int, return_format='csv', forcing='era_5', data = streamflow.rst.historic_simulation(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'HistoricSimulation/' # if you only wanted the url, quit here @@ -223,7 +223,7 @@ def daily_averages(reach_id: int, return_format='csv', forcing='era_5', data = streamflow.rst.seasonal_average(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'DailyAverages/' # if you only wanted the url, quit here @@ -258,7 +258,7 @@ def monthly_averages(reach_id: int, return_format='csv', forcing='era_5', data = streamflow.rst.seasonal_average(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'MonthlyAverages/' # if you only wanted the url, quit here @@ -293,7 +293,7 @@ def return_periods(reach_id: int, return_format='csv', forcing='era_5', data = streamflow.rst.return_periods(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'ReturnPeriods/' # if you only wanted the url, quit here @@ -324,7 +324,7 @@ def available_data(endpoint: 
str = ENDPOINT, return_format='json', s: requests.S data = streamflow.rst.available_data() """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'AvailableData/' # if you only wanted the url, quit here @@ -356,7 +356,7 @@ def available_dates(reach_id: int = None, region: str = None, return_format: str data = streamflow.rst.available_dates(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'AvailableDates/' # you need a region for the api call, so the user needs to provide one or a valid reach_id to get it from diff --git a/requirements.txt b/requirements.txt index 9afba53..22a86d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ -dask>=2024 +dask>=2022 fastparquet requests pandas>=1 plotly>=5 scipy>=1 -s3fs>=2024 +s3fs>=2022 numpy>=1 hydrostats HydroErr -xarray>=2024 +xarray>=2022 zarr \ No newline at end of file diff --git a/setup.py b/setup.py index 3547621..ba5a6a2 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ DESCRIPTION = 'Package for accessing data from the GEOGLOWS Hydrological Model' URL = 'https://data.geoglows.org' AUTHOR = 'Riley Hales PhD' -REQUIRES_PYTHON = '>=3.10.0' +REQUIRES_PYTHON = '>=3.7.0' LICENSE = 'BSD 3-Clause Clear License' with open("README.md", "r") as readme: From 12502d7c9ff8897c85ab18d0a0833a1bd14e8290 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Tue, 30 Apr 2024 22:41:42 -0600 Subject: [PATCH 03/11] put river id selector back in xarray dataset returns --- geoglows/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/geoglows/data.py b/geoglows/data.py index fe4d441..dae15b5 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -292,10 +292,10 @@ def retrospective(river_id: int or list, format: str = 'df') -> pd.DataFrame or """ s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/retrospective.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store) + ds = xr.open_zarr(s3store).sel(rivid=river_id) if format == 'xarray': return ds - return ds.sel(rivid=river_id).to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') + return ds.to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') def daily_averages(river_id: int or list) -> pd.DataFrame: @@ -361,10 +361,10 @@ def return_periods(river_id: int or list, format: str = 'df', method: str = 'gum assert method in rp_methods, f'Unrecognized return period estimation method given: {method}' s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/return-periods.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store) + ds = xr.open_zarr(s3store).sel(rivid=river_id) if format == 'xarray': return ds - return (ds.sel(rivid=river_id)[rp_methods[method]].to_dataframe().reset_index() + return (ds[rp_methods[method]].to_dataframe().reset_index() .pivot(index='rivid', columns='return_period', values=rp_methods[method])) From ee0f8c5aa27a7a3758fa016cc5051b6c061e9e7c Mon Sep 17 00:00:00 2001 From: Riley Hales PhD <39097632+rileyhales@users.noreply.github.com> Date: Thu, 2 May 2024 17:46:59 -0600 Subject: [PATCH 04/11] v1.5.0 (#35) * move decorators to file, add decorator for retrospective, log aws requests * increment docs version number --- 
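Reviewer note (illustrative only, not part of the committed patch): the sketch below shows how the
decorated data functions introduced by this patch are expected to be called, based on the signatures
and decorators in the diff that follows. The river ID and forecast date are hypothetical placeholder
values chosen only to satisfy the validation rules shown in the code.

    import geoglows

    river_id = 760000001  # hypothetical 9 digit GEOGLOWS river ID

    # forecast products are read from the AWS Open Data buckets by default
    forecast_df = geoglows.data.forecast(river_id)
    stats_df = geoglows.data.forecast_stats(river_id, date='20240425')  # hypothetical YYYYMMDD date

    # the same request can be routed through the REST API instead of S3
    forecast_rest = geoglows.data.forecast(river_id, data_source='rest')

    # retrospective products are wrapped by the new _retrospective decorator
    retro_df = geoglows.data.retrospective(river_id)
    rp_df = geoglows.data.return_periods(river_id, method='gumbel1')

Each call returns a pandas DataFrame by default; passing format='xarray' returns an xarray Dataset
for the AWS-backed products.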
 docs/conf.py                     |   2 +-
 geoglows/__init__.py             |   2 +-
 geoglows/_constants.py           |   5 +
 geoglows/_download_decorators.py | 230 +++++++++++++++++++++++++++++++
 geoglows/data.py                 | 207 +++------------------------
 5 files changed, 256 insertions(+), 190 deletions(-)
 create mode 100644 geoglows/_download_decorators.py

diff --git a/docs/conf.py b/docs/conf.py
index 1abade2..6c3694b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -15,7 +15,7 @@ author = 'Riley Hales, PhD'
 
 # The full version, including alpha/beta/rc tags
-release = '1.4.0'
+release = '1.5.0'
 
 master_doc = 'index'
 
 # -- General configuration ---------------------------------------------------
diff --git a/geoglows/__init__.py b/geoglows/__init__.py
index ba86d67..0a13e9f 100644
--- a/geoglows/__init__.py
+++ b/geoglows/__init__.py
@@ -12,6 +12,6 @@
     'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow',
     'METADATA_TABLE_PATH'
 ]
 
-__version__ = '1.4.0'
+__version__ = '1.5.0'
 __author__ = 'Riley Hales'
 __license__ = 'BSD 3-Clause Clear License'
diff --git a/geoglows/_constants.py b/geoglows/_constants.py
index a653b47..fd69956 100644
--- a/geoglows/_constants.py
+++ b/geoglows/_constants.py
@@ -1,5 +1,10 @@
 import os
 
+ODP_CORE_S3_BUCKET_URI = 's3://geoglows-v2'
+ODP_FORECAST_S3_BUCKET_URI = 's3://geoglows-v2-forecasts'
+ODP_RETROSPECTIVE_S3_BUCKET_URI = 's3://geoglows-v2-retrospective'
+ODP_S3_BUCKET_REGION = 'us-west-2'
+
 METADATA_TABLE_PATH = os.getenv(
     'PYGEOGLOWS_METADATA_TABLE_PATH',
     os.path.join(os.path.dirname(__file__), 'data', 'metadata-tables.parquet')
 )
diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py
new file mode 100644
index 0000000..fb4924d
--- /dev/null
+++ b/geoglows/_download_decorators.py
@@ -0,0 +1,230 @@
+import os
+import warnings
+from io import StringIO
+
+import pandas as pd
+import requests
+import s3fs
+import xarray as xr
+import numpy as np
+
+from .analyze import (
+    simple_forecast as calc_simple_forecast,
+    forecast_stats as calc_forecast_stats,
+)
+
+from ._constants import (
+    ODP_FORECAST_S3_BUCKET_URI,
+    ODP_RETROSPECTIVE_S3_BUCKET_URI,
+    ODP_S3_BUCKET_REGION,
+)
+
+DEFAULT_REST_ENDPOINT = 'https://geoglows.ecmwf.int/api/'
+DEFAULT_REST_ENDPOINT_VERSION = 'v2'  # 'v1, v2, latest'
+
+__all__ = [
+    '_forecast',
+    '_retrospective',
+]
+
+
+def _forecast(function):
+    def from_aws(*args, **kwargs):
+        product_name = function.__name__.replace("_", "").lower()
+        if product_name == 'forecastrecords':
+            warnings.warn('forecast_records are not available from the AWS Open Data Program.')
+            return from_rest(*args, **kwargs)
+
+        river_id = kwargs.get('river_id', '')
+        river_id = args[0] if len(args) > 0 else river_id
+
+        return_format = kwargs.get('format', 'df')
+        assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}'
+
+        if not kwargs.get('skip_log', False):
+            requests.post(f'{DEFAULT_REST_ENDPOINT}{DEFAULT_REST_ENDPOINT_VERSION}/log',
+                          json={'river_id': river_id, 'product': product_name, 'format': return_format},
+                          timeout=1, )  # short timeout- don't need the response, post only needs to be received
+
+        s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION))
+        date = kwargs.get('date', False)
+        if not date:
+            zarr_vars = ['rivid', 'Qout', 'time', 'ensemble']
+            dates = [s3.glob(os.path.join(ODP_FORECAST_S3_BUCKET_URI, f'*.zarr/{var}')) for var in zarr_vars]
+            dates = [set([d.split('/')[1].replace('.zarr', '') for d in date]) for date in dates]
+            dates = sorted(set.intersection(*dates), reverse=True)
+            if product_name == 'dates':
+                return pd.DataFrame(dict(dates=dates))
+            date = dates[0]
+        if len(date) == 8:
+            date = f'{date}00.zarr'
+        elif len(date) == 10:
+            date = f'{date}.zarr'
+        else:
+            raise ValueError('Date must be YYYYMMDD or YYYYMMDDHH format. Use dates() to view available data.')
+
+        s3store = s3fs.S3Map(root=f'{ODP_FORECAST_S3_BUCKET_URI}/{date}', s3=s3, check=False)
+
+        attrs = {
+            'source': 'geoglows',
+            'forecast_date': date[:8],
+            'retrieval_date': pd.Timestamp.now().strftime('%Y%m%d'),
+            'units': 'cubic meters per second',
+        }
+        ds = xr.open_zarr(s3store).sel(rivid=river_id)
+        if return_format == 'xarray' and product_name == 'forecastensembles':
+            ds = ds.rename({'time': 'datetime', 'rivid': 'river_id'})
+            ds.attrs = attrs
+            return ds
+        df = ds.to_dataframe().round(2).reset_index()
+
+        # rename columns to match the REST API
+        if isinstance(river_id, int) or isinstance(river_id, np.int64):
+            df = df.pivot(index='time', columns='ensemble', values='Qout')
+        else:
+            df = df.pivot(index=['time', 'rivid'], columns='ensemble', values='Qout')
+            df.index.names = ['time', 'river_id']
+        df = df[sorted(df.columns)]
+        df.columns = [f'ensemble_{str(x).zfill(2)}' for x in df.columns]
+
+        if product_name == 'forecast':
+            df = calc_simple_forecast(df)
+        elif product_name == 'forecaststats':
+            df = calc_forecast_stats(df)
+
+        if return_format == 'df':
+            return df
+        ds = df.to_xarray()
+        ds.attrs = attrs
+        return ds
+
+    def from_rest(*args, **kwargs):
+        # update the default values set by the function unless the user has already specified them
+        for key, value in function.__kwdefaults__.items() if function.__kwdefaults__ else []:
+            if key not in kwargs:
+                kwargs[key] = value
+
+        return_format = kwargs.get('format', 'csv')
+        assert return_format in ('csv', 'json', 'url'), f'Unsupported format requested: {return_format}'
+
+        # parse out the information necessary to build a request url
+        endpoint = kwargs.get('endpoint', DEFAULT_REST_ENDPOINT)
+        endpoint = endpoint[:-1] if endpoint[-1] == '/' else endpoint
+        endpoint = endpoint + '/api' if not endpoint.endswith('/api') else endpoint
+        endpoint = f'https://{endpoint}' if not endpoint.startswith(('https://', 'http://')) else endpoint
+
+        version = kwargs.get('version', DEFAULT_REST_ENDPOINT_VERSION)
+        assert version in ('v2',), ValueError(f'Unrecognized model version parameter: {version}')
+
+        product_name = function.__name__.replace("_", "").lower()
+
+        river_id = args[0] if len(args) > 0 else None
+        river_id = kwargs.get('river_id', '') if not river_id else river_id
+        if isinstance(river_id, list):
+            raise ValueError('Multiple river_ids are not available via REST API or on v1. '
+                             'Use data_source="aws" for multiple river_ids.')
+        river_id = int(river_id) if river_id else None
+        if river_id and version == 'v2':
+            assert 1_000_000_000 > river_id >= 110_000_000, ValueError('River ID must be a 9 digit integer')
+
+        # request parameter validation before submitting
+        for key in ('endpoint', 'version', 'river_id'):
+            if key in kwargs:
+                del kwargs[key]
+        for key, value in list(kwargs.items()):  # iterate over a copy so keys can be deleted safely
+            if value is None:
+                del kwargs[key]
+        for date in ('date', 'start_date', 'end_date'):
+            if date in kwargs:
+                assert len(str(kwargs[date])) == 8 or len(
+                    str(kwargs[date])) == 10, f'Invalid date format: {kwargs[date]}'
+        if 'format' in kwargs and kwargs['format'] != 'json':
+            del kwargs['format']
+        kwargs['source'] = kwargs.get('source', 'pygeoglows')  # allow using default for specific apps which override
+        params = '&'.join([f'{key}={value}' for key, value in kwargs.items()])
+
+        # piece together the request url
+        request_url = f'{endpoint}/{version}/{product_name}'  # build the base url
+        request_url = f'{request_url}/{river_id}' if river_id else request_url  # add the river_id if it exists
+        request_url = f'{request_url}?{params}'  # add the query parameters
+
+        if return_format == 'url':
+            return request_url.replace(f'source={kwargs["source"]}', '')
+
+        response = requests.get(request_url)
+
+        if response.status_code != 200:
+            raise RuntimeError('Received an error from the REST API: ' + response.text)
+
+        if return_format == 'csv':
+            df = pd.read_csv(StringIO(response.text))
+            if 'datetime' in df.columns:
+                df['datetime'] = pd.to_datetime(df['datetime'])
+                df = df.set_index('datetime')
+            return df
+        elif return_format == 'json':
+            return response.json()
+        else:
+            raise ValueError(f'Unsupported return format requested: {return_format}')
+
+    def main(*args, **kwargs):
+        source = kwargs.get('data_source', 'aws')
+        assert source in ('rest', 'aws'), ValueError(f'Unrecognized data source requested: {source}')
+        if source == 'rest':
+            return from_rest(*args, **kwargs)
+        return from_aws(*args, **kwargs)
+
+    main.__doc__ = function.__doc__  # necessary for code documentation auto generators
+    return main
+
+
+def _retrospective(function):
+    def main(*args, **kwargs):
+        product_name = function.__name__.replace("_", "-").lower()
+
+        river_id = args[0] if len(args) > 0 else None
+        river_id = kwargs.get('river_id', '') if not river_id else river_id
+
+        return_format = kwargs.get('format', 'df')
+        assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}'
+
+        method = kwargs.get('method', 'gumbel1')
+
+        if not kwargs.get('skip_log', False):
+            requests.post(f'{DEFAULT_REST_ENDPOINT}{DEFAULT_REST_ENDPOINT_VERSION}/log',
+                          timeout=1,  # short timeout because we don't need the response, post just needs to be received
+                          json={'river_id': river_id, 'product': product_name, 'format': return_format})
+
+        s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION))
+        s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/{product_name}.zarr', s3=s3, check=False)
+        ds = xr.open_zarr(s3store)
+        try:
+            ds = ds.sel(rivid=river_id)
+        except Exception:
+            raise ValueError(f'River ID(s) not found in the retrospective dataset: {river_id}')
+        if return_format == 'xarray':
+            return ds
+        if product_name == 'retrospective':
+            return (
+                ds
+                .to_dataframe()
+                .reset_index()
+                .set_index('time')
+                .pivot(columns='rivid', values='Qout')
+            )
+        if product_name == 'return-periods':
+            rp_methods = {
+                'gumbel1': 'gumbel1_return_period',
+            }
+            assert method
in rp_methods, f'Unrecognized return period estimation method given: {method}' + return ( + ds + [rp_methods[method]] + .to_dataframe() + .reset_index() + .pivot(index='rivid', columns='return_period', values=rp_methods[method]) + ) + raise ValueError(f'Unsupported product requested: {product_name}') + + main.__doc__ = function.__doc__ # necessary for code documentation auto generators + return main diff --git a/geoglows/data.py b/geoglows/data.py index dae15b5..dcf8409 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -1,17 +1,13 @@ import os import warnings -from io import StringIO import pandas as pd -import requests -import s3fs import xarray as xr -import numpy as np from ._constants import METADATA_TABLE_PATH +from ._download_decorators import _forecast, _retrospective + from .analyze import ( - simple_forecast as calc_simple_forecast, - forecast_stats as calc_forecast_stats, daily_averages as calc_daily_averages, monthly_averages as calc_monthly_averages, annual_averages as calc_annual_averages, @@ -33,161 +29,9 @@ 'metadata_tables', ] -DEFAULT_REST_ENDPOINT = 'https://geoglows.ecmwf.int/api/' -DEFAULT_REST_ENDPOINT_VERSION = 'v2' # 'v1, v2, latest' -ODP_CORE_S3_BUCKET_URI = 's3://geoglows-v2' -ODP_FORECAST_S3_BUCKET_URI = 's3://geoglows-v2-forecasts' -ODP_RETROSPECTIVE_S3_BUCKET_URI = 's3://geoglows-v2-retrospective' -ODP_S3_BUCKET_REGION = 'us-west-2' - - -def _forecast_endpoint_decorator(function): - def from_aws(*args, **kwargs): - product_name = function.__name__.replace("_", "").lower() - if product_name == 'forecastrecords': - warnings.warn('forecast_records are not available from the AWS Open Data Program.') - return from_rest(*args, **kwargs) - - river_id = kwargs.get('river_id', '') - river_id = args[0] if len(args) > 0 else river_id - - return_format = kwargs.get('format', 'df') - assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}' - - s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) - date = kwargs.get('date', False) - if not date: - zarr_vars = ['rivid', 'Qout', 'time', 'ensemble'] - dates = [s3.glob(os.path.join(ODP_FORECAST_S3_BUCKET_URI, f'*.zarr/{var}')) for var in zarr_vars] - dates = [set([d.split('/')[1].replace('.zarr', '') for d in date]) for date in dates] - dates = sorted(set.intersection(*dates), reverse=True) - if product_name == 'dates': - return pd.DataFrame(dict(dates=dates)) - date = dates[0] - if len(date) == 8: - date = f'{date}00.zarr' - elif len(date) == 10: - date = f'{date}.zarr' - else: - raise ValueError('Date must be YYYYMMDD or YYYYMMDDHH format. 
Use dates() to view available data.') - - s3store = s3fs.S3Map(root=f'{ODP_FORECAST_S3_BUCKET_URI}/{date}', s3=s3, check=False) - - attrs = { - 'source': 'geoglows', - 'forecast_date': date[:8], - 'retrieval_date': pd.Timestamp.now().strftime('%Y%m%d'), - 'units': 'cubic meters per second', - } - ds = xr.open_zarr(s3store).sel(rivid=river_id) - if return_format == 'xarray' and product_name == 'forecastensembles': - ds = ds.rename({'time': 'datetime', 'rivid': 'river_id'}) - ds.attrs = attrs - return ds - df = ds.to_dataframe().round(2).reset_index() - - # rename columns to match the REST API - if isinstance(river_id, int) or isinstance(river_id, np.int64): - df = df.pivot(index='time', columns='ensemble', values='Qout') - else: - df = df.pivot(index=['time', 'rivid'], columns='ensemble', values='Qout') - df.index.names = ['time', 'river_id'] - df = df[sorted(df.columns)] - df.columns = [f'ensemble_{str(x).zfill(2)}' for x in df.columns] - - if product_name == 'forecast': - df = calc_simple_forecast(df) - elif product_name == 'forecaststats': - df = calc_forecast_stats(df) - - if return_format == 'df': - return df - ds = df.to_xarray() - ds.attrs = attrs - return ds - - def from_rest(*args, **kwargs): - # update the default values set by the function unless the user has already specified them - for key, value in function.__kwdefaults__.items() if function.__kwdefaults__ else []: - if key not in kwargs: - kwargs[key] = value - - return_format = kwargs.get('format', 'csv') - assert return_format in ('csv', 'json', 'url'), f'Unsupported format requested: {return_format}' - - # parse out the information necessary to build a request url - endpoint = kwargs.get('endpoint', DEFAULT_REST_ENDPOINT) - endpoint = endpoint[:-1] if endpoint[-1] == '/' else endpoint - endpoint = endpoint + '/api' if not endpoint.endswith('/api') else endpoint - endpoint = f'https://{endpoint}' if not endpoint.startswith(('https://', 'http://')) else endpoint - - version = kwargs.get('version', DEFAULT_REST_ENDPOINT_VERSION) - assert version in ('v2',), ValueError(f'Unrecognized model version parameter: {version}') - - product_name = function.__name__.replace("_", "").lower() - - river_id = args[0] if len(args) > 0 else None - river_id = kwargs.get('river_id', '') if not river_id else river_id - if isinstance(river_id, list): - raise ValueError('Multiple river_ids are not available via REST API or on v1. 
' - 'Use data_source="aws" and version="v2" for multiple river_ids.') - river_id = int(river_id) if river_id else None - if river_id and version == 'v2': - assert 1_000_000_000 > river_id >= 110_000_000, ValueError('River ID must be a 9 digit integer') - - # request parameter validation before submitting - for key in ('endpoint', 'version', 'river_id'): - if key in kwargs: - del kwargs[key] - for key, value in kwargs.items(): - if value is None: - del kwargs[key] - for date in ('date', 'start_date', 'end_date'): - if date in kwargs: - assert len(str(kwargs[date])) == 8 or len( - str(kwargs[date])) == 10, f'Invalid date format: {kwargs[date]}' - if 'format' in kwargs and kwargs['format'] != 'json': - del kwargs['format'] - kwargs['source'] = kwargs.get('source', 'pygeoglows') # allow using default for specific apps which override - params = '&'.join([f'{key}={value}' for key, value in kwargs.items()]) - - # piece together the request url - request_url = f'{endpoint}/{version}/{product_name}' # build the base url - request_url = f'{request_url}/{river_id}' if river_id else request_url # add the river_id if it exists - request_url = f'{request_url}?{params}' # add the query parameters - - if return_format == 'url': - return request_url.replace(f'source={kwargs["source"]}', '') - - response = requests.get(request_url) - - if response.status_code != 200: - raise RuntimeError('Received an error from the REST API: ' + response.text) - - if return_format == 'csv': - df = pd.read_csv(StringIO(response.text)) - if 'datetime' in df.columns: - df['datetime'] = pd.to_datetime(df['datetime']) - df = df.set_index('datetime') - return df - elif return_format == 'json': - return response.json() - else: - raise ValueError(f'Unsupported return format requested: {return_format}') - - def main(*args, **kwargs): - source = kwargs.get('data_source', 'aws') - assert source in ('rest', 'aws'), ValueError(f'Unrecognized data source requested: {source}') - if source == 'rest': - return from_rest(*args, **kwargs) - return from_aws(*args, **kwargs) - - main.__doc__ = function.__doc__ # necessary for code documentation auto generators - return main - # Forecast data and derived products -@_forecast_endpoint_decorator +@_forecast def dates(**kwargs) -> dict or str: """ Gets a list of available forecast product dates @@ -204,7 +48,7 @@ def dates(**kwargs) -> dict or str: pass -@_forecast_endpoint_decorator +@_forecast def forecast(*, river_id: int, date: str, format: str, data_source: str, **kwargs) -> pd.DataFrame or xr.Dataset: """ @@ -222,7 +66,7 @@ def forecast(*, river_id: int, date: str, format: str, data_source: str, pass -@_forecast_endpoint_decorator +@_forecast def forecast_stats(*, river_id: int, date: str, format: str, data_source: str, **kwargs) -> pd.DataFrame or xr.Dataset: """ @@ -241,7 +85,7 @@ def forecast_stats(*, river_id: int, date: str, format: str, data_source: str, pass -@_forecast_endpoint_decorator +@_forecast def forecast_ensembles(*, river_id: int, date: str, format: str, data_source: str, **kwargs) -> pd.DataFrame or xr.Dataset: """ @@ -259,7 +103,7 @@ def forecast_ensembles(*, river_id: int, date: str, format: str, data_source: st pass -@_forecast_endpoint_decorator +@_forecast def forecast_records(*, river_id: int, start_date: str, end_date: str, format: str, **kwargs) -> pd.DataFrame or dict or str: """ @@ -278,7 +122,8 @@ def forecast_records(*, river_id: int, start_date: str, end_date: str, format: s # Retrospective simulation and derived products -def retrospective(river_id: int or 
list, format: str = 'df') -> pd.DataFrame or xr.Dataset: +@_retrospective +def retrospective(river_id: int or list, *, format: str = 'df') -> pd.DataFrame or xr.Dataset: """ Retrieves the retrospective simulation of streamflow for a given river_id from the AWS Open Data Program GEOGLOWS V2 S3 bucket @@ -290,15 +135,10 @@ def retrospective(river_id: int or list, format: str = 'df') -> pd.DataFrame or Returns: pd.DataFrame """ - s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) - s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/retrospective.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store).sel(rivid=river_id) - if format == 'xarray': - return ds - return ds.to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') + pass -def daily_averages(river_id: int or list) -> pd.DataFrame: +def daily_averages(river_id: int or list, **kwargs) -> pd.DataFrame: """ Retrieves daily average streamflow for a given river_id @@ -308,11 +148,11 @@ def daily_averages(river_id: int or list) -> pd.DataFrame: Returns: pd.DataFrame """ - df = retrospective(river_id) + df = retrospective(river_id, **kwargs) return calc_daily_averages(df) -def monthly_averages(river_id: int or list) -> pd.DataFrame: +def monthly_averages(river_id: int or list, **kwargs) -> pd.DataFrame: """ Retrieves monthly average streamflow for a given river_id @@ -322,11 +162,11 @@ def monthly_averages(river_id: int or list) -> pd.DataFrame: Returns: pd.DataFrame """ - df = retrospective(river_id) + df = retrospective(river_id, **kwargs) return calc_monthly_averages(df) -def annual_averages(river_id: int or list) -> pd.DataFrame: +def annual_averages(river_id: int or list, **kwargs) -> pd.DataFrame: """ Retrieves annual average streamflow for a given river_id @@ -336,11 +176,12 @@ def annual_averages(river_id: int or list) -> pd.DataFrame: Returns: pd.DataFrame """ - df = retrospective(river_id) + df = retrospective(river_id, **kwargs) return calc_annual_averages(df) -def return_periods(river_id: int or list, format: str = 'df', method: str = 'gumbel1') -> pd.DataFrame or xr.Dataset: +@_retrospective +def return_periods(river_id: int or list, *, format: str = 'df', method: str = 'gumbel1') -> pd.DataFrame or xr.Dataset: """ Retrieves the return period thresholds based on a specified historic simulation forcing on a certain river_id. 
@@ -355,17 +196,7 @@ def return_periods(river_id: int or list, format: str = 'df', method: str = 'gum Returns: pd.DataFrame """ - rp_methods = { - 'gumbel1': 'gumbel1_return_period', - } - assert method in rp_methods, f'Unrecognized return period estimation method given: {method}' - s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) - s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/return-periods.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store).sel(rivid=river_id) - if format == 'xarray': - return ds - return (ds[rp_methods[method]].to_dataframe().reset_index() - .pivot(index='rivid', columns='return_period', values=rp_methods[method])) + pass # model config and supplementary data From 9b174a2df8f38d8cd68813e35e430976c5dd3e62 Mon Sep 17 00:00:00 2001 From: Biplov Bhandari Date: Fri, 10 May 2024 09:57:55 -0500 Subject: [PATCH 05/11] fix the parquet link to the metatable (#36) * fix the parquet link to the metatable add metatable as an argument to the stream latlon_to_river function * update docs --- .gitignore | 3 ++- geoglows/data.py | 11 ++++++++--- geoglows/streams.py | 5 +++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 010e8d0..f5f4f81 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ dev.ipynb geoglows.egg-info dist .pypirc -*.parquet \ No newline at end of file +*.parquet +*.pyc diff --git a/geoglows/data.py b/geoglows/data.py index dcf8409..f635888 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -200,24 +200,29 @@ def return_periods(river_id: int or list, *, format: str = 'df', method: str = ' # model config and supplementary data -def metadata_tables(columns: list = None) -> pd.DataFrame: +def metadata_tables(columns: list = None, metadata_table_path: str = None) -> pd.DataFrame: """ Retrieves the master table of rivers metadata and properties as a pandas DataFrame Args: columns (list): optional subset of columns names to read from the parquet + metadata_table_path (str): optional path to a local copy of the metadata table Returns: pd.DataFrame """ if os.path.exists(METADATA_TABLE_PATH): return pd.read_parquet(METADATA_TABLE_PATH, columns=columns) + + if metadata_table_path: + return pd.read_parquet(metadata_table_path, columns=columns) + warn = f""" - Local copy of geoglows v2 metadata table not found. You should download a copy for optimal performance and + Local copy of geoglows v2 metadata table not found. You should download a copy for optimal performance and to make the data available when you are offline. A copy of the table will be cached at {METADATA_TABLE_PATH}. Alternatively, set the environment variable PYGEOGLOWS_METADATA_TABLE_PATH to the path of the table. 
""" warnings.warn(warn) - df = pd.read_parquet('https://geoglows-v2.s3-website-us-west-2.amazonaws.com/tables/package-metadata-table.parquet') + df = pd.read_parquet('http://geoglows-v2.s3-website-us-west-2.amazonaws.com/tables/package-metadata-table.parquet') os.makedirs(os.path.dirname(METADATA_TABLE_PATH), exist_ok=True) df.to_parquet(METADATA_TABLE_PATH) return df[columns] if columns else df diff --git a/geoglows/streams.py b/geoglows/streams.py index d6f19b5..919d86f 100644 --- a/geoglows/streams.py +++ b/geoglows/streams.py @@ -22,17 +22,18 @@ def river_to_vpu(river_id: int) -> int: ) -def latlon_to_river(lat: float, lon: float) -> int: +def latlon_to_river(lat: float, lon: float, metadata_table_path: str = None) -> int: """ Gives the River ID number whose outlet is nearest the given lat and lon Args: lat (float): a latitude lon (float): a longitude + metadata_table_path (str): optional path to the local metadata table Returns: int: a 9 digit integer that is a valid GEOGLOWS River ID number """ - df = metadata_tables(columns=['LINKNO', 'lat', 'lon']) + df = metadata_tables(columns=['LINKNO', 'lat', 'lon'], metadata_table_path=metadata_table_path) df['dist'] = ((df['lat'] - lat) ** 2 + (df['lon'] - lon) ** 2) ** 0.5 return df.loc[lambda x: x['dist'] == df['dist'].min(), 'LINKNO'].values[0] From 23d0a1170f6075cf9f22b09dcfa2fde44103a08a Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 10 May 2024 09:02:37 -0600 Subject: [PATCH 06/11] update version, docs, add arg to other streams functions --- docs/index.rst | 32 ++++++++++++++++++++++---------- geoglows/__init__.py | 6 +++--- geoglows/_constants.py | 10 ++++++++++ geoglows/data.py | 19 +++++++++---------- geoglows/streams.py | 10 ++++++---- 5 files changed, 50 insertions(+), 27 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 68e44e1..8081813 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,19 +7,31 @@ geoglows .. image:: https://anaconda.org/conda-forge/geoglows/badges/latest_release_date.svg :target: https://anaconda.org/geoglows/geoglows -The geoglows Python package enables access to data, API's, and code developed for the `GEOGLOWS Streamflow Model `_. +The geoglows Python package enables access to data, API's, and code developed for the `GEOGLOWS Hydrology Model `_. Read more about GEOGLOWS at ``_ -For demos, tutorials, and other training materials for GEOGLOWS and the geoglows Python packge, please visit -``_. +For demos, tutorials, and other training materials for GEOGLOWS and the geoglows Python packge, please visit ``_. -About GEOGLOWS ECMWF Streamflow -=============================== -GEOGLOWS ECMWF Streamflow Project: This project provides access to the results of a hydrological model that is run each -day. The model is based on a group of unique weather forecasts, known as an ensemble, from ECMWF. Each unique -precipitation forecast, known as an ensemble member, produces a unique streamflow forecast. There are 52 members of the -ensemble that drives the model each day. The ERA-5 historical precipitation dataset to also used to produce a -retrospective streamflow on each river. `Read more here `_. +Supplemental Data +================= +Some functions in this package will help you browse the metadata for the model to identify river locations, names, and +other information. It is available online at ``_. +If you do not already have a copy downloaded, the code will fetch it for you and cache a copy in the same directory that +the source code of the package is installed in. 
+ +It is more efficient to save this file yourself and reuse it so that you do not have to download it every time your python +environment is recreated, the package version is updated, and so on. You may do this in several ways. + +1. You can set the environment variable `PYGEOGLOWS_METADATA_TABLE_PATH` **before** importing the package. +2. Call the `geoglows.set_metadata_table_path` function to set the path the path at any time **after** importing. +3. Pass the path to the table to functions that use it. + +About the GEOGLOWS Hydrology Model +================================== +The GEOGLOWS Hydrology Model is run each day at midnight (UTC +00). The model is based on the ECMWF ENS and HRES ensemble +of meteorology and land surface model forecasts. There are 51 members of the ensemble that drives the model each day. +The ERA5 reanalysis dataset is also used to produce a retrospective simulation on each river. The model provides river in +units m^3/s over the preceeding interval (1, 3, or 24 hours depending on the dataset). `Read more here `_. .. toctree:: :caption: Table of Contents diff --git a/geoglows/__init__.py b/geoglows/__init__.py index 0a13e9f..2cd7b09 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -6,12 +6,12 @@ import geoglows.tables import geoglows.streamflow -from ._constants import METADATA_TABLE_PATH +from ._constants import get_metadata_table_path, set_metadata_table_path __all__ = [ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', - 'METADATA_TABLE_PATH' + 'get_metadata_table_path', 'set_metadata_table_path', ] -__version__ = '1.5.0' +__version__ = '1.6.0' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_constants.py b/geoglows/_constants.py index fd69956..50ff8f0 100644 --- a/geoglows/_constants.py +++ b/geoglows/_constants.py @@ -9,3 +9,13 @@ 'PYGEOGLOWS_METADATA_TABLE_PATH', os.path.join(os.path.dirname(__file__), 'data', 'metadata-tables.parquet') ) + + +def get_metadata_table_path() -> str: + return METADATA_TABLE_PATH + + +def set_metadata_table_path(path: str) -> str: + global METADATA_TABLE_PATH + METADATA_TABLE_PATH = path + return METADATA_TABLE_PATH diff --git a/geoglows/data.py b/geoglows/data.py index f635888..1dea868 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -4,7 +4,7 @@ import pandas as pd import xarray as xr -from ._constants import METADATA_TABLE_PATH +from ._constants import get_metadata_table_path from ._download_decorators import _forecast, _retrospective from .analyze import ( @@ -210,19 +210,18 @@ def metadata_tables(columns: list = None, metadata_table_path: str = None) -> pd Returns: pd.DataFrame """ - if os.path.exists(METADATA_TABLE_PATH): - return pd.read_parquet(METADATA_TABLE_PATH, columns=columns) - if metadata_table_path: return pd.read_parquet(metadata_table_path, columns=columns) - + metadata_table_path = get_metadata_table_path() + if os.path.exists(metadata_table_path): + return pd.read_parquet(metadata_table_path, columns=columns) warn = f""" - Local copy of geoglows v2 metadata table not found. You should download a copy for optimal performance and - to make the data available when you are offline. A copy of the table will be cached at {METADATA_TABLE_PATH}. - Alternatively, set the environment variable PYGEOGLOWS_METADATA_TABLE_PATH to the path of the table. + Local copy of geoglows v2 metadata table not found. + A copy of the table has been cached at {metadata_table_path} which you can move as desired. 
+ You should set the environment variable PYGEOGLOWS_METADATA_TABLE_PATH or provide the metadata_table_path argument. """ warnings.warn(warn) df = pd.read_parquet('http://geoglows-v2.s3-website-us-west-2.amazonaws.com/tables/package-metadata-table.parquet') - os.makedirs(os.path.dirname(METADATA_TABLE_PATH), exist_ok=True) - df.to_parquet(METADATA_TABLE_PATH) + os.makedirs(os.path.dirname(metadata_table_path), exist_ok=True) + df.to_parquet(metadata_table_path) return df[columns] if columns else df diff --git a/geoglows/streams.py b/geoglows/streams.py index 919d86f..2ba94d9 100644 --- a/geoglows/streams.py +++ b/geoglows/streams.py @@ -5,18 +5,19 @@ __all__ = ['river_to_vpu', 'latlon_to_river', 'river_to_latlon', ] -def river_to_vpu(river_id: int) -> int: +def river_to_vpu(river_id: int, metadata_table_path: str = None) -> int: """ Gives the VPU number for a given River ID number Args: river_id (int): a 9 digit integer that is a valid GEOGLOWS River ID number + metadata_table_path (str): optional path to the local metadata table Returns: int: a 3 digit integer that is the VPU number for the given River ID number """ return ( - metadata_tables(columns=['LINKNO', 'VPUCode']) + metadata_tables(columns=['LINKNO', 'VPUCode'], metadata_table_path=metadata_table_path) .loc[lambda x: x['LINKNO'] == river_id, 'VPUCode'] .values[0] ) @@ -38,18 +39,19 @@ def latlon_to_river(lat: float, lon: float, metadata_table_path: str = None) -> return df.loc[lambda x: x['dist'] == df['dist'].min(), 'LINKNO'].values[0] -def river_to_latlon(river_id: int) -> np.ndarray: +def river_to_latlon(river_id: int, metadata_table_path: str = None) -> np.ndarray: """ Gives the lat and lon of the outlet of the river with the given River ID number Args: river_id (int): a 9 digit integer that is a valid GEOGLOWS River ID number + metadata_table_path (str): optional path to the local metadata table Returns: np.ndarray: a numpy array of floats, [lat, lon] """ return ( - metadata_tables(columns=['LINKNO', 'lat', 'lon']) + metadata_tables(columns=['LINKNO', 'lat', 'lon'], metadata_table_path=metadata_table_path) .loc[lambda x: x['LINKNO'] == river_id, ['lat', 'lon']] .values[0] ) From 8b3729d879643561dc40908985d873c81d2597b6 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 10 May 2024 09:03:26 -0600 Subject: [PATCH 07/11] correct docs version number --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 6c3694b..332595a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ author = 'Riley Hales, PhD' # The full version, including alpha/beta/rc tags -release = '1.5.0' +release = '1.6.0' master_doc = 'index' # -- General configuration --------------------------------------------------- From 2121219fa1ccc7e9b2e30c994a604c4f02215189 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 13 May 2024 22:31:49 -0600 Subject: [PATCH 08/11] correct retrieving return periods with lists or arrays --- docs/conf.py | 2 +- geoglows/__init__.py | 2 +- geoglows/_download_decorators.py | 13 +++++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 332595a..41da386 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ author = 'Riley Hales, PhD' # The full version, including alpha/beta/rc tags -release = '1.6.0' +release = '1.6.1' master_doc = 'index' # -- General configuration --------------------------------------------------- diff --git a/geoglows/__init__.py b/geoglows/__init__.py index 2cd7b09..e6a6d92 100644 
--- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'get_metadata_table_path', 'set_metadata_table_path', ] -__version__ = '1.6.0' +__version__ = '1.6.1' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py index fb4924d..6e01e0f 100644 --- a/geoglows/_download_decorators.py +++ b/geoglows/_download_decorators.py @@ -35,8 +35,9 @@ def from_aws(*args, **kwargs): warnings.warn('forecast_records are not available from the AWS Open Data Program.') return from_rest(*args, **kwargs) - river_id = kwargs.get('river_id', '') - river_id = args[0] if len(args) > 0 else river_id + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') + if river_id is None or river_id == '': + raise ValueError('River ID must be provided to retrieve forecast data.') return_format = kwargs.get('format', 'df') assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}' @@ -118,8 +119,7 @@ def from_rest(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() - river_id = args[0] if len(args) > 0 else None - river_id = kwargs.get('river_id', '') if not river_id else river_id + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') if isinstance(river_id, list): raise ValueError('Multiple river_ids are not available via REST API or on v1. ' 'Use data_source="aws" for multiple river_ids.') @@ -182,8 +182,9 @@ def _retrospective(function): def main(*args, **kwargs): product_name = function.__name__.replace("_", "-").lower() - river_id = args[0] if len(args) > 0 else None - river_id = kwargs.get('river_id', '') if not river_id else river_id + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') + if river_id is None or river_id == '': + raise ValueError('River ID must be provided to retrieve retrospective data.') return_format = kwargs.get('format', 'df') assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}' From 14ca779584a1a3c825d75f4242a3c4728e5000d2 Mon Sep 17 00:00:00 2001 From: Riley Hales PhD <39097632+rileyhales@users.noreply.github.com> Date: Wed, 22 May 2024 14:17:09 -0600 Subject: [PATCH 09/11] correct errors in s3 file pathing specific to windows (#37) * correct errors in s3 file pathing specific to windows * update param validation for rest endpoints --- docs/conf.py | 2 +- environment.yaml | 1 + geoglows/__init__.py | 2 +- geoglows/_download_decorators.py | 40 ++++++++++++++++++-------------- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 41da386..6f9a35d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ author = 'Riley Hales, PhD' # The full version, including alpha/beta/rc tags -release = '1.6.1' +release = '1.6.2' master_doc = 'index' # -- General configuration --------------------------------------------------- diff --git a/environment.yaml b/environment.yaml index 6908fc6..0ab021c 100644 --- a/environment.yaml +++ b/environment.yaml @@ -3,6 +3,7 @@ channels: - conda-forge dependencies: - python>=3 + - cftime - dask >=2024 - fastparquet - HydroErr diff --git a/geoglows/__init__.py b/geoglows/__init__.py index e6a6d92..a0ee1f5 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'get_metadata_table_path', 
'set_metadata_table_path', ] -__version__ = '1.6.1' +__version__ = '1.6.2' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py index 6e01e0f..501ea70 100644 --- a/geoglows/_download_decorators.py +++ b/geoglows/_download_decorators.py @@ -1,23 +1,21 @@ -import os import warnings from io import StringIO +import numpy as np import pandas as pd import requests import s3fs import xarray as xr -import numpy as np - -from .analyze import ( - simple_forecast as calc_simple_forecast, - forecast_stats as calc_forecast_stats, -) from ._constants import ( ODP_FORECAST_S3_BUCKET_URI, ODP_RETROSPECTIVE_S3_BUCKET_URI, ODP_S3_BUCKET_REGION, ) +from .analyze import ( + simple_forecast as calc_simple_forecast, + forecast_stats as calc_forecast_stats, +) DEFAULT_REST_ENDPOINT = 'https://geoglows.ecmwf.int/api/' DEFAULT_REST_ENDPOINT_VERSION = 'v2' # 'v1, v2, latest' @@ -29,14 +27,22 @@ def _forecast(function): + def _river_id_is_iterable(river_id): + return bool( + isinstance(river_id, list) or + isinstance(river_id, tuple) or + isinstance(river_id, set) or + isinstance(river_id, np.ndarray) + ) + def from_aws(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() if product_name == 'forecastrecords': warnings.warn('forecast_records are not available from the AWS Open Data Program.') return from_rest(*args, **kwargs) - river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') - if river_id is None or river_id == '': + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) + if river_id is None: raise ValueError('River ID must be provided to retrieve forecast data.') return_format = kwargs.get('format', 'df') @@ -51,7 +57,7 @@ def from_aws(*args, **kwargs): date = kwargs.get('date', False) if not date: zarr_vars = ['rivid', 'Qout', 'time', 'ensemble'] - dates = [s3.glob(os.path.join(ODP_FORECAST_S3_BUCKET_URI, f'*.zarr/{var}')) for var in zarr_vars] + dates = [s3.glob(ODP_FORECAST_S3_BUCKET_URI + '/' + f'*.zarr/{var}') for var in zarr_vars] dates = [set([d.split('/')[1].replace('.zarr', '') for d in date]) for date in dates] dates = sorted(set.intersection(*dates), reverse=True) if product_name == 'dates': @@ -119,11 +125,11 @@ def from_rest(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() - river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') - if isinstance(river_id, list): - raise ValueError('Multiple river_ids are not available via REST API or on v1. ' - 'Use data_source="aws" for multiple river_ids.') - river_id = int(river_id) if river_id else None + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) + if river_id is None: + raise ValueError('River ID must be provided to retrieve forecast data.') + if not isinstance(river_id, (int, np.int64, )): + raise ValueError('Multiple river_ids are not available via REST API. 
Provide a single 9 digit integer.') if river_id and version == 'v2': assert 1_000_000_000 > river_id >= 110_000_000, ValueError('River ID must be a 9 digit integer') @@ -182,8 +188,8 @@ def _retrospective(function): def main(*args, **kwargs): product_name = function.__name__.replace("_", "-").lower() - river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') - if river_id is None or river_id == '': + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) + if river_id is None: raise ValueError('River ID must be provided to retrieve retrospective data.') return_format = kwargs.get('format', 'df') From 52c8ea94a4da137c01e6a06d210e24fc4ed3d157 Mon Sep 17 00:00:00 2001 From: Riley Hales PhD <39097632+rileyhales@users.noreply.github.com> Date: Thu, 30 May 2024 11:40:26 -0600 Subject: [PATCH 10/11] Fix date list (#38) * correct bug getting date list * increment version number --- docs/conf.py | 2 +- geoglows/__init__.py | 2 +- geoglows/_download_decorators.py | 12 ++---------- geoglows/data.py | 5 ++++- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 6f9a35d..a822a70 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ author = 'Riley Hales, PhD' # The full version, including alpha/beta/rc tags -release = '1.6.2' +release = '1.6.3' master_doc = 'index' # -- General configuration --------------------------------------------------- diff --git a/geoglows/__init__.py b/geoglows/__init__.py index a0ee1f5..95e484e 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'get_metadata_table_path', 'set_metadata_table_path', ] -__version__ = '1.6.2' +__version__ = '1.6.3' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py index 501ea70..3bfde37 100644 --- a/geoglows/_download_decorators.py +++ b/geoglows/_download_decorators.py @@ -27,14 +27,6 @@ def _forecast(function): - def _river_id_is_iterable(river_id): - return bool( - isinstance(river_id, list) or - isinstance(river_id, tuple) or - isinstance(river_id, set) or - isinstance(river_id, np.ndarray) - ) - def from_aws(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() if product_name == 'forecastrecords': @@ -42,7 +34,7 @@ def from_aws(*args, **kwargs): return from_rest(*args, **kwargs) river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) - if river_id is None: + if river_id is None and product_name != 'dates': raise ValueError('River ID must be provided to retrieve forecast data.') return_format = kwargs.get('format', 'df') @@ -126,7 +118,7 @@ def from_rest(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) - if river_id is None: + if river_id is None and product_name != 'dates': raise ValueError('River ID must be provided to retrieve forecast data.') if not isinstance(river_id, (int, np.int64, )): raise ValueError('Multiple river_ids are not available via REST API. 
Provide a single 9 digit integer.') diff --git a/geoglows/data.py b/geoglows/data.py index 1dea868..cf5f8fa 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -5,7 +5,7 @@ import xarray as xr from ._constants import get_metadata_table_path -from ._download_decorators import _forecast, _retrospective +from ._download_decorators import _forecast, _retrospective, DEFAULT_REST_ENDPOINT, DEFAULT_REST_ENDPOINT_VERSION from .analyze import ( daily_averages as calc_daily_averages, @@ -27,6 +27,9 @@ 'return_periods', 'metadata_tables', + + 'DEFAULT_REST_ENDPOINT', + 'DEFAULT_REST_ENDPOINT_VERSION', ] From 97eb2850b0746dc02c8771a98b2478fe47bd5a06 Mon Sep 17 00:00:00 2001 From: Riley Hales <39097632+rileyhales@users.noreply.github.com> Date: Thu, 18 Jul 2024 21:52:21 -0600 Subject: [PATCH 11/11] Time zone index (#39) * add time zone information to data retrievals * optional decade average bars on annual averages plot --- geoglows/__init__.py | 2 +- geoglows/_download_decorators.py | 6 ++- geoglows/_plots/__init__.py | 2 + geoglows/_plots/format_tools.py | 13 +++++ geoglows/_plots/plotly_forecasts.py | 24 ++++++--- geoglows/_plots/plotly_retrospective.py | 66 +++++++++++++++++-------- geoglows/_plots/plots.py | 6 ++- 7 files changed, 86 insertions(+), 33 deletions(-) diff --git a/geoglows/__init__.py b/geoglows/__init__.py index 95e484e..5cdc1c4 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'get_metadata_table_path', 'set_metadata_table_path', ] -__version__ = '1.6.3' +__version__ = '1.7.0' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py index 3bfde37..23e702f 100644 --- a/geoglows/_download_decorators.py +++ b/geoglows/_download_decorators.py @@ -76,6 +76,7 @@ def from_aws(*args, **kwargs): ds.attrs = attrs return ds df = ds.to_dataframe().round(2).reset_index() + df['time'] = pd.to_datetime(df['time'], utc=True) # rename columns to match the REST API if isinstance(river_id, int) or isinstance(river_id, np.int64): @@ -159,6 +160,7 @@ def from_rest(*args, **kwargs): if 'datetime' in df.columns: df['datetime'] = pd.to_datetime(df['datetime']) df = df.set_index('datetime') + df.index = df.index.tz_localize('UTC') return df elif return_format == 'json': return response.json() @@ -204,13 +206,15 @@ def main(*args, **kwargs): if return_format == 'xarray': return ds if product_name == 'retrospective': - return ( + df = ( ds .to_dataframe() .reset_index() .set_index('time') .pivot(columns='rivid', values='Qout') ) + df.index = df.index.tz_localize('UTC') + return df if product_name == 'return-periods': rp_methods = { 'gumbel1': 'gumbel1_return_period', diff --git a/geoglows/_plots/__init__.py b/geoglows/_plots/__init__.py index 84233f5..e387110 100644 --- a/geoglows/_plots/__init__.py +++ b/geoglows/_plots/__init__.py @@ -12,6 +12,8 @@ corrected_month_average, corrected_day_average, corrected_scatterplots, + + plotly_figure_to_html_plot as plotly_figure_to_html, ) __all__ = [ diff --git a/geoglows/_plots/format_tools.py b/geoglows/_plots/format_tools.py index 41cdbf2..c714556 100644 --- a/geoglows/_plots/format_tools.py +++ b/geoglows/_plots/format_tools.py @@ -1,3 +1,6 @@ +import datetime + +import pytz from plotly.offline import plot as offline_plot @@ -26,3 +29,13 @@ def plotly_figure_to_html_plot(figure, include_plotlyjs: bool = False, ) -> str: output_type='div', 
include_plotlyjs=include_plotlyjs ) + + +def timezone_label(timezone: str = None): + timezone = str(timezone) if timezone is not None else 'UTC' + # get the number of hours the timezone is offset from UTC + now = datetime.datetime.now(pytz.timezone(timezone)) + utc_offset = now.utcoffset().total_seconds() / 3600 + # convert float number of hours to HH:MM format + utc_offset = f'{int(utc_offset):+03d}:{int((utc_offset % 1) * 60):02d}' + return f'Datetime ({timezone} {utc_offset})' diff --git a/geoglows/_plots/plotly_forecasts.py b/geoglows/_plots/plotly_forecasts.py index 66802fc..bc3c178 100644 --- a/geoglows/_plots/plotly_forecasts.py +++ b/geoglows/_plots/plotly_forecasts.py @@ -2,7 +2,7 @@ import pandas as pd import plotly.graph_objects as go -from .format_tools import build_title +from .format_tools import build_title, timezone_label from .plotly_helpers import _rperiod_scatters __all__ = [ @@ -35,7 +35,7 @@ def forecast(df: pd.DataFrame, *, ), go.Scatter( name='Uncertainty Bounds', - x=np.concatenate([df.index.values, df.index.values[::-1]]), + x=np.concatenate([df.index, df.index[::-1]]), y=np.concatenate([df['flow_uncertainty_upper'], df['flow_uncertainty_lower'][::-1]]), legendgroup='uncertainty', showlegend=True, @@ -67,7 +67,11 @@ def forecast(df: pd.DataFrame, *, layout = go.Layout( title=build_title('Forecasted Streamflow', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [df.index[0], df.index[-1]]}, + xaxis={ + 'title': timezone_label(df.index.tz), + 'range': [df.index[0], df.index[-1]], + 'hoverformat': '%d %b %Y %X', + }, ) return go.Figure(scatter_traces, layout=layout) @@ -179,9 +183,9 @@ def forecast_stats(df: pd.DataFrame, *, 'range': [0, 'auto'] }, xaxis={ - 'title': 'Date (UTC +0:00)', + 'title': timezone_label(df.index.tz), 'range': [startdate, enddate], - 'hoverformat': '%b %d %Y', + 'hoverformat': '%d %b %Y %X', 'tickformat': '%b %d %Y' }, ) @@ -250,9 +254,9 @@ def forecast_ensembles(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_tit title=build_title('Ensemble Predicted Streamflow', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, xaxis={ - 'title': 'Date (UTC +0:00)', + 'title': timezone_label(df.index.tz), 'range': [startdate, enddate], - 'hoverformat': '%b %d %Y', + 'hoverformat': '%d %b %Y %X', 'tickformat': '%b %d %Y' }, ) @@ -297,6 +301,10 @@ def forecast_records(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_title layout = go.Layout( title=build_title('Previous Forecasted Streamflow', plot_titles=plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate]}, + xaxis={ + 'title': timezone_label(df.index.tz), + 'range': [startdate, enddate], + 'hoverformat': '%d %b %Y %X', + }, ) return go.Figure(scatter_plots, layout=layout) diff --git a/geoglows/_plots/plotly_retrospective.py b/geoglows/_plots/plotly_retrospective.py index 3115a50..710f233 100644 --- a/geoglows/_plots/plotly_retrospective.py +++ b/geoglows/_plots/plotly_retrospective.py @@ -1,8 +1,9 @@ +import numpy as np import pandas as pd import plotly.graph_objs as go import scipy.stats -from .format_tools import build_title +from .format_tools import build_title, timezone_label from .plotly_helpers import _rperiod_scatters __all__ = [ @@ -15,29 +16,27 @@ ] -def retrospective(retro: pd.DataFrame, *, - rp_df: pd.DataFrame = None, plot_titles: dict = None, ) -> go.Figure: +def retrospective(df: pd.DataFrame, *, 
rp_df: pd.DataFrame = None, plot_titles: dict = None, ) -> go.Figure: """ Makes the streamflow ensemble data and metadata into a plotly plot Args: - retro: the csv response from historic_simulation + df: the csv response from historic_simulation rp_df: the csv response from return_periods - plot_type: either 'json', 'plotly', or 'html' (default plotly) plot_titles: (dict) Extra info to show on the title of the plot. For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} Return: plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method """ - dates = retro.index.tolist() + dates = df.index.tolist() startdate = dates[0] enddate = dates[-1] plot_data = { 'x_datetime': dates, - 'y_flow': retro.values.flatten(), - 'y_max': retro.values.max(), + 'y_flow': df.values.flatten(), + 'y_max': df.values.max(), } if rp_df is not None: plot_data.update(rp_df.to_dict(orient='index').items()) @@ -56,9 +55,9 @@ def retrospective(retro: pd.DataFrame, *, title=build_title('Retrospective Streamflow Simulation', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, xaxis={ - 'title': 'Date (UTC +0:00)', + 'title': timezone_label(df.index.tz), 'range': [startdate, enddate], - 'hoverformat': '%b %d %Y', + 'hoverformat': '%d %b %Y', 'tickformat': '%Y' }, ) @@ -94,18 +93,19 @@ def daily_averages(dayavg: pd.DataFrame, plot_titles: list = None, plot_type: st layout = go.Layout( title=build_title('Daily Average Streamflow (Simulated)', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'hoverformat': '%b %d', 'tickformat': '%b'}, + xaxis={'title': 'Date', 'hoverformat': '%b %d', 'tickformat': '%b'}, ) return go.Figure(scatter_plots, layout=layout) -def monthly_averages(monavg: pd.DataFrame, titles: dict = None, plot_titles: list = None, plot_type: str = 'plotly') -> go.Figure: +def monthly_averages(monavg: pd.DataFrame, plot_titles: list = None, + plot_type: str = 'plotly') -> go.Figure: """ Makes the daily_averages data and metadata into a plotly plot Args: monavg: the csv response from monthly_averages - titles: (dict) Extra info to show on the title of the plot. For example: + plot_titles: (dict) Extra info to show on the title of the plot. For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} plot_type: either 'plotly', or 'html' (default plotly) @@ -133,7 +133,7 @@ def monthly_averages(monavg: pd.DataFrame, titles: dict = None, plot_titles: lis return go.Figure(scatter_plots, layout=layout) -def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figure: +def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, decade_averages: bool = False) -> go.Figure: """ Makes the annual_averages data and metadata into a plotly plot @@ -141,6 +141,7 @@ def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figur df: the csv response from annual_averages plot_titles: (dict) Extra info to show on the title of the plot. 
For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} + decade_averages: (bool) if True, will plot the average flow for each decade Return: plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method @@ -153,6 +154,29 @@ def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figur line=dict(color='blue') ), ] + + if decade_averages: + # get a list of decades covered by the data in the index + first_year = str(int(df.index[0]))[:-1] + '0' + last_year = str(int(df.index[-1]))[:-1] + '9' + first_year = int(first_year) + last_year = int(last_year) + decades = [decade for decade in range(int(first_year), int(last_year) + 1, 10)] + for idx, decade in enumerate(decades): + decade_values = df[np.logical_and(df.index.astype(int) >= decade, df.index.astype(int) < decade + 10)] + mean_flow = decade_values.values.flatten().mean() + scatter_plots.append( + go.Scatter( + name=f'{decade}s: {mean_flow:.2f} m3/s', + x=[decade_values.index[0], decade_values.index[-1]], + y=mean_flow * np.ones(2), + line=dict(color='red'), + hoverinfo='name', + legendgroup='decade_averages', + legendgrouptitle=dict(text='Decade Averages') + ) + ) + layout = go.Layout( title=build_title('Annual Average Streamflow (Simulated)', plot_titles), yaxis={'title': 'Streamflow (m3/s)'}, @@ -161,12 +185,12 @@ def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figur return go.Figure(scatter_plots, layout=layout) -def flow_duration_curve(hist: pd.DataFrame, plot_titles: dict = None, plot_type: str = 'plotly') -> go.Figure: +def flow_duration_curve(df: pd.DataFrame, plot_titles: dict = None, plot_type: str = 'plotly') -> go.Figure: """ Makes the streamflow ensemble data and metadata into a plotly plot Args: - hist: the csv response from historic_simulation + df: the dataframe response from data.retrospective plot_titles: (dict) Extra info to show on the title of the plot. For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} plot_type: either 'json', 'plotly', or 'html' (default plotly) @@ -178,7 +202,7 @@ def flow_duration_curve(hist: pd.DataFrame, plot_titles: dict = None, plot_type: raise ValueError('invalid plot_type specified. pick json, plotly, plotly_scatters, or html') # process the hist dataframe to create the flow duration curve - sorted_hist = hist.values.flatten() + sorted_hist = df.values.flatten() sorted_hist.sort() # ranks data from smallest to largest @@ -212,12 +236,12 @@ def flow_duration_curve(hist: pd.DataFrame, plot_titles: dict = None, plot_type: return go.Figure(scatter_plots, layout=layout) -def daily_stats(hist: pd.DataFrame, *, plot_titles: dict = None, plot_type: str = 'plotly') -> go.Figure: +def daily_stats(df: pd.DataFrame, *, plot_titles: dict = None, plot_type: str = 'plotly') -> go.Figure: """ Plots a graph with statistics for each day of year Args: - hist: dataframe of values to plot + df: dataframe of values to plot plot_titles: (dict) Extra info to show on the title of the plot. 
For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} plot_type: either 'plotly' (python object, default), 'plotly_scatters', or 'html' @@ -226,7 +250,7 @@ def daily_stats(hist: pd.DataFrame, *, plot_titles: dict = None, plot_type: str plot of the graph of the low flows """ - stats_df = daily_stats(hist) + stats_df = daily_stats(df) data = [ go.Scatter( @@ -241,7 +265,7 @@ def daily_stats(hist: pd.DataFrame, *, plot_titles: dict = None, plot_type: str layout = go.Layout( title=build_title('Daily Average Streamflow (Simulated)', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'hoverformat': '%b %d', 'tickformat': '%b'}, + xaxis={'title': timezone_label(df.index.tz), 'hoverformat': '%b %d', 'tickformat': '%b'}, ) return go.Figure(data=data, layout=layout) diff --git a/geoglows/_plots/plots.py b/geoglows/_plots/plots.py index a886dcf..504c233 100644 --- a/geoglows/_plots/plots.py +++ b/geoglows/_plots/plots.py @@ -200,7 +200,8 @@ def monthly_averages(df: pd.DataFrame, *, def annual_averages(df: pd.DataFrame, *, plot_type: str = 'plotly', - plot_titles: list = None, ) -> go.Figure: + plot_titles: list = None, + decade_averages: bool = False, ) -> go.Figure: """ Makes a plotly figure of the annual average flows @@ -208,12 +209,13 @@ def annual_averages(df: pd.DataFrame, *, df: a dataframe of the annual average flows plot_type: either plotly or html plot_titles: additional key-value pairs to display in the title of the figure + decade_averages: if True, the figure will include the average flows for each decade Returns: go.Figure """ if plot_type in ('plotly', 'html'): - figure = plotly_annual_averages(df, plot_titles=plot_titles) + figure = plotly_annual_averages(df, plot_titles=plot_titles, decade_averages=decade_averages) if plot_type == 'html': return plotly_figure_to_html_plot(figure) return figure
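
Taken together, patches 09 through 11 tighten river ID validation and make every returned
datetime index timezone aware. Below is a minimal usage sketch of the resulting v1.7.0
behavior. It assumes the decorated functions above are re-exported through the public
geoglows.data and geoglows.plots modules as in earlier releases, and the river ID is a
hypothetical 9 digit integer inside the validated range [110_000_000, 1_000_000_000):

    import geoglows

    river_id = 710_000_000  # hypothetical ID; lists of IDs are only accepted with data_source='aws'

    # Retrospective simulation: the index is now tz-localized to UTC (patch 11)
    df = geoglows.data.retrospective(river_id)
    print(df.index.tz)  # UTC

    # Build annual averages with an integer year index, which is what the
    # decade-average logic in plotly_retrospective.py expects
    annual = df.groupby(df.index.year).mean()

    # decade_averages=True adds one mean-flow trace per decade, grouped under
    # the 'Decade Averages' legend group
    fig = geoglows.plots.annual_averages(annual, decade_averages=True)
    fig.show()

With a UTC index, the x axis title produced by the new timezone_label helper reads
'Datetime (UTC +00:00)'; passing a frame localized to another zone changes the label
and offset accordingly.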