From 0d264ca8295bcca835344e499e6fbf485a7e3c3b Mon Sep 17 00:00:00 2001 From: Riley Hales PhD <39097632+rileyhales@users.noreply.github.com> Date: Thu, 25 Apr 2024 08:41:53 -0600 Subject: [PATCH 01/11] enable forecast records plot --- docs/api-documentation/plots.rst | 2 +- geoglows/__init__.py | 2 +- geoglows/_plots/__init__.py | 2 ++ geoglows/_plots/plotly_forecasts.py | 10 ++++---- geoglows/_plots/plots.py | 39 +++++++++++++++++++++++------ 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/docs/api-documentation/plots.rst b/docs/api-documentation/plots.rst index 9c29282..95d0eac 100644 --- a/docs/api-documentation/plots.rst +++ b/docs/api-documentation/plots.rst @@ -4,4 +4,4 @@ geoglows.plots .. automodule:: geoglows.plots :members: - forecast, forecast_stats, forecast_ensembles, retrospective, annual_averages, monthly_averages, daily_averages, flow_duration_curve, corrected_retrospective, corrected_month_average, corrected_day_average, corrected_scatterplots + forecast, forecast_stats, forecast_ensembles, forecast_records, retrospective, annual_averages, monthly_averages, daily_averages, flow_duration_curve, corrected_retrospective, corrected_month_average, corrected_day_average, corrected_scatterplots diff --git a/geoglows/__init__.py b/geoglows/__init__.py index ca99c8f..b3ec040 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'METADATA_TABLE_PATH' ] -__version__ = '1.2.1' +__version__ = '1.3.0' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_plots/__init__.py b/geoglows/_plots/__init__.py index 2cf43c5..84233f5 100644 --- a/geoglows/_plots/__init__.py +++ b/geoglows/_plots/__init__.py @@ -2,6 +2,7 @@ forecast, forecast_stats, forecast_ensembles, + forecast_records, retrospective, daily_averages, monthly_averages, @@ -17,6 +18,7 @@ 'forecast', 'forecast_stats', 'forecast_ensembles', + 'forecast_records', 'retrospective', 'daily_averages', 'monthly_averages', diff --git a/geoglows/_plots/plotly_forecasts.py b/geoglows/_plots/plotly_forecasts.py index 464ab23..66802fc 100644 --- a/geoglows/_plots/plotly_forecasts.py +++ b/geoglows/_plots/plotly_forecasts.py @@ -259,12 +259,12 @@ def forecast_ensembles(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_tit return go.Figure(scatter_plots, layout=layout) -def forecast_records(recs: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_titles: list = False, ) -> go.Figure: +def forecast_records(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_titles: list = False, ) -> go.Figure: """ Makes the streamflow saved forecast data and metadata into a plotly plot Args: - recs: the csv response from forecast_records + df: the csv response from forecast_records rp_df: the csv response from return_periods plot_titles: a list of strings to place in the figure title. each list item will be on a new line. 
@@ -272,14 +272,14 @@ def forecast_records(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_tit
         plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method
     """
     # Start processing the inputs
-    dates = recs.index.tolist()
+    dates = df.index.tolist()
     startdate = dates[0]
     enddate = dates[-1]
 
     plot_data = {
         'x_records': dates,
-        'recorded_flows': recs.dropna(axis=0).values.flatten(),
-        'y_max': max(recs.values),
+        'recorded_flows': df.dropna(axis=0).values.flatten(),
+        'y_max': np.nanmax(df.values),
     }
     if rp_df is not None:
         plot_data.update(rp_df.to_dict(orient='index').items())
diff --git a/geoglows/_plots/plots.py b/geoglows/_plots/plots.py
index 2c759c4..a886dcf 100644
--- a/geoglows/_plots/plots.py
+++ b/geoglows/_plots/plots.py
@@ -2,10 +2,17 @@
 import plotly.graph_objects as go
 
 from .format_tools import plotly_figure_to_html_plot
+from .plotly_bias_corrected import (
+    corrected_retrospective as plotly_corrected_retrospective,
+    corrected_month_average as plotly_corrected_month_average,
+    corrected_day_average as plotly_corrected_day_average,
+    corrected_scatterplots as plotly_corrected_scatterplots,
+)
 from .plotly_forecasts import (
     forecast as plotly_forecast,
     forecast_stats as plotly_forecast_stats,
-    forecast_ensembles as plotly_forecast_ensembles
+    forecast_ensembles as plotly_forecast_ensembles,
+    forecast_records as plotly_forecast_records,
 )
 from .plotly_retrospective import (
     retrospective as plotly_retrospective,
@@ -14,17 +21,12 @@
     annual_averages as plotly_annual_averages,
     flow_duration_curve as plotly_flow_duration_curve,
 )
-from .plotly_bias_corrected import (
-    corrected_retrospective as plotly_corrected_retrospective,
-    corrected_month_average as plotly_corrected_month_average,
-    corrected_day_average as plotly_corrected_day_average,
-    corrected_scatterplots as plotly_corrected_scatterplots,
-)
 
 __all__ = [
     'forecast',
     'forecast_stats',
     'forecast_ensembles',
+    'forecast_records',
 
     'retrospective',
     'daily_averages',
@@ -105,6 +107,29 @@ def forecast_ensembles(df: pd.DataFrame, *,
         raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
 
 
+def forecast_records(df: pd.DataFrame, *,
+                     plot_type: str = 'plotly',
+                     rp_df: pd.DataFrame = None,
+                     plot_titles: list = None, ) -> go.Figure:
+    """
+    Plots forecasted streamflow and optional return periods
+    Args:
+        df: the DataFrame of saved forecast records from geoglows.data.forecast_records
+        plot_type: either 'plotly' or 'html', the type of figure to return
+        rp_df: optional DataFrame of return period values from geoglows.data.return_periods
+        plot_titles: a list of strings to place in the figure title, each on a new line
+
+    Returns:
+        go.Figure
+    """
+    if plot_type in ('plotly', 'html'):
+        figure = plotly_forecast_records(df, rp_df=rp_df, plot_titles=plot_titles)
+        if plot_type == 'html':
+            return plotly_figure_to_html_plot(figure)
+        return figure
+    raise NotImplementedError(f'Plot type "{plot_type}" is not supported.')
+
+
 def retrospective(df: pd.DataFrame, *,
                   plot_type: str = 'plotly',
                   rp_df: pd.DataFrame = None,
From 5be2a0197f24ea08774cde4f8c55c101706383fa Mon Sep 17 00:00:00 2001
From: rileyhales
Date: Tue, 30 Apr 2024 21:53:32 -0600
Subject: [PATCH 02/11] loosen versions, correct warnings, xarray datasets
 unfiltered by river number
---
 docs/conf.py           |  2 +-
 geoglows/__init__.py   |  2 +-
 geoglows/data.py       | 35 ++++++++++++++++++++---------------
 geoglows/streamflow.py | 18 +++++++++---------
 requirements.txt       |  6 +++---
 setup.py               |  2 +-
 6 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index d09a5fc..1abade2 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -15,7 +15,7 @@ author = 'Riley Hales, PhD'
 
 # The full version, including alpha/beta/rc tags
-release = '1.2.0'
+release = '1.4.0'
 
 master_doc = 'index'
 
 #
-- General configuration --------------------------------------------------- diff --git a/geoglows/__init__.py b/geoglows/__init__.py index b3ec040..ba86d67 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'METADATA_TABLE_PATH' ] -__version__ = '1.3.0' +__version__ = '1.4.0' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/data.py b/geoglows/data.py index de09d7d..fe4d441 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -57,9 +57,10 @@ def from_aws(*args, **kwargs): s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) date = kwargs.get('date', False) if not date: - dates = sorted([x.split('/')[-1] for x in s3.ls(ODP_FORECAST_S3_BUCKET_URI)], reverse=True) - dates = [x.split('.')[0] for x in dates if x.endswith('.zarr')] # ignore the index.html file - dates = [x.replace('00.zarr', '') for x in dates] + zarr_vars = ['rivid', 'Qout', 'time', 'ensemble'] + dates = [s3.glob(os.path.join(ODP_FORECAST_S3_BUCKET_URI, f'*.zarr/{var}')) for var in zarr_vars] + dates = [set([d.split('/')[1].replace('.zarr', '') for d in date]) for date in dates] + dates = sorted(set.intersection(*dates), reverse=True) if product_name == 'dates': return pd.DataFrame(dict(dates=dates)) date = dates[0] @@ -121,7 +122,7 @@ def from_rest(*args, **kwargs): endpoint = f'https://{endpoint}' if not endpoint.startswith(('https://', 'http://')) else endpoint version = kwargs.get('version', DEFAULT_REST_ENDPOINT_VERSION) - assert version in ('v2', ), ValueError(f'Unrecognized model version parameter: {version}') + assert version in ('v2',), ValueError(f'Unrecognized model version parameter: {version}') product_name = function.__name__.replace("_", "").lower() @@ -180,6 +181,7 @@ def main(*args, **kwargs): if source == 'rest': return from_rest(*args, **kwargs) return from_aws(*args, **kwargs) + main.__doc__ = function.__doc__ # necessary for code documentation auto generators return main @@ -290,15 +292,10 @@ def retrospective(river_id: int or list, format: str = 'df') -> pd.DataFrame or """ s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/retrospective.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store).sel(rivid=river_id) + ds = xr.open_zarr(s3store) if format == 'xarray': return ds - return ds.to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') - - -def historical(*args, **kwargs): - """Alias for retrospective""" - return retrospective(*args, **kwargs) + return ds.sel(rivid=river_id).to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') def daily_averages(river_id: int or list) -> pd.DataFrame: @@ -343,24 +340,32 @@ def annual_averages(river_id: int or list) -> pd.DataFrame: return calc_annual_averages(df) -def return_periods(river_id: int or list, format: str = 'df') -> pd.DataFrame or xr.Dataset: +def return_periods(river_id: int or list, format: str = 'df', method: str = 'gumbel1') -> pd.DataFrame or xr.Dataset: """ Retrieves the return period thresholds based on a specified historic simulation forcing on a certain river_id. Args: river_id (int): the ID of a stream, should be a 9 digit integer format (str): the format to return the data, either 'df' or 'xarray'. default is 'df' + method (str): the method to use to estimate the return period thresholds. 
default is 'gumbel1' + + Changelog: + v1.4.0: adds method parameter for future expansion of multiple return period methods Returns: pd.DataFrame """ + rp_methods = { + 'gumbel1': 'gumbel1_return_period', + } + assert method in rp_methods, f'Unrecognized return period estimation method given: {method}' s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/return-periods.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store).sel(rivid=river_id) + ds = xr.open_zarr(s3store) if format == 'xarray': return ds - return (ds['return_period_flow'].to_dataframe().reset_index() - .pivot(index='rivid', columns='return_period', values='return_period_flow')) + return (ds.sel(rivid=river_id)[rp_methods[method]].to_dataframe().reset_index() + .pivot(index='rivid', columns='return_period', values=rp_methods[method])) # model config and supplementary data diff --git a/geoglows/streamflow.py b/geoglows/streamflow.py index a0cb642..28deb7e 100644 --- a/geoglows/streamflow.py +++ b/geoglows/streamflow.py @@ -42,7 +42,7 @@ def forecast_stats(reach_id: int, return_format: str = 'csv', forecast_date: str data = streamflow.rst.forecast_stats(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'ForecastStats/' # if you only wanted the url, quit here @@ -113,7 +113,7 @@ def forecast_warnings(region: str = 'all', return_format='csv', data = streamflow.rst.forecast_warnings('australia-geoglows') """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'ForecastWarnings/' # if you only wanted the url, quit here @@ -148,7 +148,7 @@ def forecast_records(reach_id: int, start_date: str = None, end_date: str = None data = streamflow.rst.forecast_warnings('australia-geoglows') """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'ForecastRecords/' # if you only wanted the url, quit here @@ -188,7 +188,7 @@ def historic_simulation(reach_id: int, return_format='csv', forcing='era_5', data = streamflow.rst.historic_simulation(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'HistoricSimulation/' # if you only wanted the url, quit here @@ -223,7 +223,7 @@ def daily_averages(reach_id: int, return_format='csv', forcing='era_5', data = streamflow.rst.seasonal_average(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'DailyAverages/' # if you only wanted the url, quit here @@ -258,7 +258,7 @@ def monthly_averages(reach_id: int, return_format='csv', forcing='era_5', data = streamflow.rst.seasonal_average(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'MonthlyAverages/' # if you only wanted the url, quit here @@ -293,7 +293,7 @@ def return_periods(reach_id: int, return_format='csv', forcing='era_5', data = streamflow.rst.return_periods(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'ReturnPeriods/' # if you only wanted the url, quit here @@ -324,7 +324,7 @@ def available_data(endpoint: 
str = ENDPOINT, return_format='json', s: requests.S data = streamflow.rst.available_data() """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'AvailableData/' # if you only wanted the url, quit here @@ -356,7 +356,7 @@ def available_dates(reach_id: int = None, region: str = None, return_format: str data = streamflow.rst.available_dates(12341234) """ - warnings.warn(DEPRECATIONWARNING, DeprecationWarning) + warnings.warn(DEPRECATIONWARNING, DeprecationWarning, stacklevel=2) method = 'AvailableDates/' # you need a region for the api call, so the user needs to provide one or a valid reach_id to get it from diff --git a/requirements.txt b/requirements.txt index 9afba53..22a86d1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ -dask>=2024 +dask>=2022 fastparquet requests pandas>=1 plotly>=5 scipy>=1 -s3fs>=2024 +s3fs>=2022 numpy>=1 hydrostats HydroErr -xarray>=2024 +xarray>=2022 zarr \ No newline at end of file diff --git a/setup.py b/setup.py index 3547621..ba5a6a2 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ DESCRIPTION = 'Package for accessing data from the GEOGLOWS Hydrological Model' URL = 'https://data.geoglows.org' AUTHOR = 'Riley Hales PhD' -REQUIRES_PYTHON = '>=3.10.0' +REQUIRES_PYTHON = '>=3.7.0' LICENSE = 'BSD 3-Clause Clear License' with open("README.md", "r") as readme: From 12502d7c9ff8897c85ab18d0a0833a1bd14e8290 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Tue, 30 Apr 2024 22:41:42 -0600 Subject: [PATCH 03/11] put river id selector back in xarray dataset returns --- geoglows/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/geoglows/data.py b/geoglows/data.py index fe4d441..dae15b5 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -292,10 +292,10 @@ def retrospective(river_id: int or list, format: str = 'df') -> pd.DataFrame or """ s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/retrospective.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store) + ds = xr.open_zarr(s3store).sel(rivid=river_id) if format == 'xarray': return ds - return ds.sel(rivid=river_id).to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') + return ds.to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') def daily_averages(river_id: int or list) -> pd.DataFrame: @@ -361,10 +361,10 @@ def return_periods(river_id: int or list, format: str = 'df', method: str = 'gum assert method in rp_methods, f'Unrecognized return period estimation method given: {method}' s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/return-periods.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store) + ds = xr.open_zarr(s3store).sel(rivid=river_id) if format == 'xarray': return ds - return (ds.sel(rivid=river_id)[rp_methods[method]].to_dataframe().reset_index() + return (ds[rp_methods[method]].to_dataframe().reset_index() .pivot(index='rivid', columns='return_period', values=rp_methods[method])) From ee0f8c5aa27a7a3758fa016cc5051b6c061e9e7c Mon Sep 17 00:00:00 2001 From: Riley Hales PhD <39097632+rileyhales@users.noreply.github.com> Date: Thu, 2 May 2024 17:46:59 -0600 Subject: [PATCH 04/11] v1.5.0 (#35) * move decorators to file, add decorator for retrospective, log aws requests * increment docs version number --- 
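Reviewer note (illustrative only, not part of the committed patch): the sketch below shows how the
decorated data functions introduced by this patch are expected to be called, based on the signatures
and decorators in the diff that follows. The river ID and forecast date are hypothetical placeholder
values chosen only to satisfy the validation rules shown in the code.

    import geoglows

    river_id = 760000001  # hypothetical 9 digit GEOGLOWS river ID

    # forecast products are read from the AWS Open Data buckets by default
    forecast_df = geoglows.data.forecast(river_id)
    stats_df = geoglows.data.forecast_stats(river_id, date='20240425')  # hypothetical YYYYMMDD date

    # the same request can be routed through the REST API instead of S3
    forecast_rest = geoglows.data.forecast(river_id, data_source='rest')

    # retrospective products are wrapped by the new _retrospective decorator
    retro_df = geoglows.data.retrospective(river_id)
    rp_df = geoglows.data.return_periods(river_id, method='gumbel1')

Each call returns a pandas DataFrame by default; passing format='xarray' returns an xarray Dataset
for the AWS-backed products.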
 docs/conf.py                     |   2 +-
 geoglows/__init__.py             |   2 +-
 geoglows/_constants.py           |   5 +
 geoglows/_download_decorators.py | 230 +++++++++++++++++++++++++++++++
 geoglows/data.py                 | 207 +++------------------------
 5 files changed, 256 insertions(+), 190 deletions(-)
 create mode 100644 geoglows/_download_decorators.py

diff --git a/docs/conf.py b/docs/conf.py
index 1abade2..6c3694b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -15,7 +15,7 @@ author = 'Riley Hales, PhD'
 
 # The full version, including alpha/beta/rc tags
-release = '1.4.0'
+release = '1.5.0'
 
 master_doc = 'index'
 
 # -- General configuration ---------------------------------------------------
diff --git a/geoglows/__init__.py b/geoglows/__init__.py
index ba86d67..0a13e9f 100644
--- a/geoglows/__init__.py
+++ b/geoglows/__init__.py
@@ -12,6 +12,6 @@
     'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow',
     'METADATA_TABLE_PATH'
 ]
 
-__version__ = '1.4.0'
+__version__ = '1.5.0'
 __author__ = 'Riley Hales'
 __license__ = 'BSD 3-Clause Clear License'
diff --git a/geoglows/_constants.py b/geoglows/_constants.py
index a653b47..fd69956 100644
--- a/geoglows/_constants.py
+++ b/geoglows/_constants.py
@@ -1,5 +1,10 @@
 import os
 
+ODP_CORE_S3_BUCKET_URI = 's3://geoglows-v2'
+ODP_FORECAST_S3_BUCKET_URI = 's3://geoglows-v2-forecasts'
+ODP_RETROSPECTIVE_S3_BUCKET_URI = 's3://geoglows-v2-retrospective'
+ODP_S3_BUCKET_REGION = 'us-west-2'
+
 METADATA_TABLE_PATH = os.getenv(
     'PYGEOGLOWS_METADATA_TABLE_PATH',
     os.path.join(os.path.dirname(__file__), 'data', 'metadata-tables.parquet')
 )
diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py
new file mode 100644
index 0000000..fb4924d
--- /dev/null
+++ b/geoglows/_download_decorators.py
@@ -0,0 +1,230 @@
+import os
+import warnings
+from io import StringIO
+
+import pandas as pd
+import requests
+import s3fs
+import xarray as xr
+import numpy as np
+
+from .analyze import (
+    simple_forecast as calc_simple_forecast,
+    forecast_stats as calc_forecast_stats,
+)
+
+from ._constants import (
+    ODP_FORECAST_S3_BUCKET_URI,
+    ODP_RETROSPECTIVE_S3_BUCKET_URI,
+    ODP_S3_BUCKET_REGION,
+)
+
+DEFAULT_REST_ENDPOINT = 'https://geoglows.ecmwf.int/api/'
+DEFAULT_REST_ENDPOINT_VERSION = 'v2'  # 'v1, v2, latest'
+
+__all__ = [
+    '_forecast',
+    '_retrospective',
+]
+
+
+def _forecast(function):
+    def from_aws(*args, **kwargs):
+        product_name = function.__name__.replace("_", "").lower()
+        if product_name == 'forecastrecords':
+            warnings.warn('forecast_records are not available from the AWS Open Data Program.')
+            return from_rest(*args, **kwargs)
+
+        river_id = kwargs.get('river_id', '')
+        river_id = args[0] if len(args) > 0 else river_id
+
+        return_format = kwargs.get('format', 'df')
+        assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}'
+
+        if not kwargs.get('skip_log', False):
+            requests.post(f'{DEFAULT_REST_ENDPOINT}{DEFAULT_REST_ENDPOINT_VERSION}/log',
+                          json={'river_id': river_id, 'product': product_name, 'format': return_format},
+                          timeout=1, )  # short timeout- don't need the response, post only needs to be received
+
+        s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION))
+        date = kwargs.get('date', False)
+        if not date:
+            zarr_vars = ['rivid', 'Qout', 'time', 'ensemble']
+            dates = [s3.glob(os.path.join(ODP_FORECAST_S3_BUCKET_URI, f'*.zarr/{var}')) for var in zarr_vars]
+            dates = [set([d.split('/')[1].replace('.zarr', '') for d in date]) for date in dates]
+            dates = sorted(set.intersection(*dates), reverse=True)
+            if product_name == 'dates':
+                return pd.DataFrame(dict(dates=dates))
+            date = dates[0]
+        if len(date) == 8:
+            date = f'{date}00.zarr'
+        elif len(date) == 10:
+            date = f'{date}.zarr'
+        else:
+            raise ValueError('Date must be YYYYMMDD or YYYYMMDDHH format. Use dates() to view available data.')
+
+        s3store = s3fs.S3Map(root=f'{ODP_FORECAST_S3_BUCKET_URI}/{date}', s3=s3, check=False)
+
+        attrs = {
+            'source': 'geoglows',
+            'forecast_date': date[:8],
+            'retrieval_date': pd.Timestamp.now().strftime('%Y%m%d'),
+            'units': 'cubic meters per second',
+        }
+        ds = xr.open_zarr(s3store).sel(rivid=river_id)
+        if return_format == 'xarray' and product_name == 'forecastensembles':
+            ds = ds.rename({'time': 'datetime', 'rivid': 'river_id'})
+            ds.attrs = attrs
+            return ds
+        df = ds.to_dataframe().round(2).reset_index()
+
+        # rename columns to match the REST API
+        if isinstance(river_id, int) or isinstance(river_id, np.int64):
+            df = df.pivot(index='time', columns='ensemble', values='Qout')
+        else:
+            df = df.pivot(index=['time', 'rivid'], columns='ensemble', values='Qout')
+            df.index.names = ['time', 'river_id']
+        df = df[sorted(df.columns)]
+        df.columns = [f'ensemble_{str(x).zfill(2)}' for x in df.columns]
+
+        if product_name == 'forecast':
+            df = calc_simple_forecast(df)
+        elif product_name == 'forecaststats':
+            df = calc_forecast_stats(df)
+
+        if return_format == 'df':
+            return df
+        ds = df.to_xarray()
+        ds.attrs = attrs
+        return ds
+
+    def from_rest(*args, **kwargs):
+        # update the default values set by the function unless the user has already specified them
+        for key, value in function.__kwdefaults__.items() if function.__kwdefaults__ else []:
+            if key not in kwargs:
+                kwargs[key] = value
+
+        return_format = kwargs.get('format', 'csv')
+        assert return_format in ('csv', 'json', 'url'), f'Unsupported format requested: {return_format}'
+
+        # parse out the information necessary to build a request url
+        endpoint = kwargs.get('endpoint', DEFAULT_REST_ENDPOINT)
+        endpoint = endpoint[:-1] if endpoint[-1] == '/' else endpoint
+        endpoint = endpoint + '/api' if not endpoint.endswith('/api') else endpoint
+        endpoint = f'https://{endpoint}' if not endpoint.startswith(('https://', 'http://')) else endpoint
+
+        version = kwargs.get('version', DEFAULT_REST_ENDPOINT_VERSION)
+        assert version in ('v2',), ValueError(f'Unrecognized model version parameter: {version}')
+
+        product_name = function.__name__.replace("_", "").lower()
+
+        river_id = args[0] if len(args) > 0 else None
+        river_id = kwargs.get('river_id', '') if not river_id else river_id
+        if isinstance(river_id, list):
+            raise ValueError('Multiple river_ids are not available via REST API or on v1. '
+                             'Use data_source="aws" for multiple river_ids.')
+        river_id = int(river_id) if river_id else None
+        if river_id and version == 'v2':
+            assert 1_000_000_000 > river_id >= 110_000_000, ValueError('River ID must be a 9 digit integer')
+
+        # request parameter validation before submitting
+        for key in ('endpoint', 'version', 'river_id'):
+            if key in kwargs:
+                del kwargs[key]
+        for key, value in list(kwargs.items()):  # iterate over a copy so keys can be deleted safely
+            if value is None:
+                del kwargs[key]
+        for date in ('date', 'start_date', 'end_date'):
+            if date in kwargs:
+                assert len(str(kwargs[date])) == 8 or len(
+                    str(kwargs[date])) == 10, f'Invalid date format: {kwargs[date]}'
+        if 'format' in kwargs and kwargs['format'] != 'json':
+            del kwargs['format']
+        kwargs['source'] = kwargs.get('source', 'pygeoglows')  # allow using default for specific apps which override
+        params = '&'.join([f'{key}={value}' for key, value in kwargs.items()])
+
+        # piece together the request url
+        request_url = f'{endpoint}/{version}/{product_name}'  # build the base url
+        request_url = f'{request_url}/{river_id}' if river_id else request_url  # add the river_id if it exists
+        request_url = f'{request_url}?{params}'  # add the query parameters
+
+        if return_format == 'url':
+            return request_url.replace(f'source={kwargs["source"]}', '')
+
+        response = requests.get(request_url)
+
+        if response.status_code != 200:
+            raise RuntimeError('Received an error from the REST API: ' + response.text)
+
+        if return_format == 'csv':
+            df = pd.read_csv(StringIO(response.text))
+            if 'datetime' in df.columns:
+                df['datetime'] = pd.to_datetime(df['datetime'])
+                df = df.set_index('datetime')
+            return df
+        elif return_format == 'json':
+            return response.json()
+        else:
+            raise ValueError(f'Unsupported return format requested: {return_format}')
+
+    def main(*args, **kwargs):
+        source = kwargs.get('data_source', 'aws')
+        assert source in ('rest', 'aws'), ValueError(f'Unrecognized data source requested: {source}')
+        if source == 'rest':
+            return from_rest(*args, **kwargs)
+        return from_aws(*args, **kwargs)
+
+    main.__doc__ = function.__doc__  # necessary for code documentation auto generators
+    return main
+
+
+def _retrospective(function):
+    def main(*args, **kwargs):
+        product_name = function.__name__.replace("_", "-").lower()
+
+        river_id = args[0] if len(args) > 0 else None
+        river_id = kwargs.get('river_id', '') if not river_id else river_id
+
+        return_format = kwargs.get('format', 'df')
+        assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}'
+
+        method = kwargs.get('method', 'gumbel1')
+
+        if not kwargs.get('skip_log', False):
+            requests.post(f'{DEFAULT_REST_ENDPOINT}{DEFAULT_REST_ENDPOINT_VERSION}/log',
+                          timeout=1,  # short timeout because we don't need the response, post just needs to be received
+                          json={'river_id': river_id, 'product': product_name, 'format': return_format})
+
+        s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION))
+        s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/{product_name}.zarr', s3=s3, check=False)
+        ds = xr.open_zarr(s3store)
+        try:
+            ds = ds.sel(rivid=river_id)
+        except Exception:
+            raise ValueError(f'River ID(s) not found in the retrospective dataset: {river_id}')
+        if return_format == 'xarray':
+            return ds
+        if product_name == 'retrospective':
+            return (
+                ds
+                .to_dataframe()
+                .reset_index()
+                .set_index('time')
+                .pivot(columns='rivid', values='Qout')
+            )
+        if product_name == 'return-periods':
+            rp_methods = {
+                'gumbel1': 'gumbel1_return_period',
+            }
+            assert method
in rp_methods, f'Unrecognized return period estimation method given: {method}' + return ( + ds + [rp_methods[method]] + .to_dataframe() + .reset_index() + .pivot(index='rivid', columns='return_period', values=rp_methods[method]) + ) + raise ValueError(f'Unsupported product requested: {product_name}') + + main.__doc__ = function.__doc__ # necessary for code documentation auto generators + return main diff --git a/geoglows/data.py b/geoglows/data.py index dae15b5..dcf8409 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -1,17 +1,13 @@ import os import warnings -from io import StringIO import pandas as pd -import requests -import s3fs import xarray as xr -import numpy as np from ._constants import METADATA_TABLE_PATH +from ._download_decorators import _forecast, _retrospective + from .analyze import ( - simple_forecast as calc_simple_forecast, - forecast_stats as calc_forecast_stats, daily_averages as calc_daily_averages, monthly_averages as calc_monthly_averages, annual_averages as calc_annual_averages, @@ -33,161 +29,9 @@ 'metadata_tables', ] -DEFAULT_REST_ENDPOINT = 'https://geoglows.ecmwf.int/api/' -DEFAULT_REST_ENDPOINT_VERSION = 'v2' # 'v1, v2, latest' -ODP_CORE_S3_BUCKET_URI = 's3://geoglows-v2' -ODP_FORECAST_S3_BUCKET_URI = 's3://geoglows-v2-forecasts' -ODP_RETROSPECTIVE_S3_BUCKET_URI = 's3://geoglows-v2-retrospective' -ODP_S3_BUCKET_REGION = 'us-west-2' - - -def _forecast_endpoint_decorator(function): - def from_aws(*args, **kwargs): - product_name = function.__name__.replace("_", "").lower() - if product_name == 'forecastrecords': - warnings.warn('forecast_records are not available from the AWS Open Data Program.') - return from_rest(*args, **kwargs) - - river_id = kwargs.get('river_id', '') - river_id = args[0] if len(args) > 0 else river_id - - return_format = kwargs.get('format', 'df') - assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}' - - s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) - date = kwargs.get('date', False) - if not date: - zarr_vars = ['rivid', 'Qout', 'time', 'ensemble'] - dates = [s3.glob(os.path.join(ODP_FORECAST_S3_BUCKET_URI, f'*.zarr/{var}')) for var in zarr_vars] - dates = [set([d.split('/')[1].replace('.zarr', '') for d in date]) for date in dates] - dates = sorted(set.intersection(*dates), reverse=True) - if product_name == 'dates': - return pd.DataFrame(dict(dates=dates)) - date = dates[0] - if len(date) == 8: - date = f'{date}00.zarr' - elif len(date) == 10: - date = f'{date}.zarr' - else: - raise ValueError('Date must be YYYYMMDD or YYYYMMDDHH format. 
Use dates() to view available data.') - - s3store = s3fs.S3Map(root=f'{ODP_FORECAST_S3_BUCKET_URI}/{date}', s3=s3, check=False) - - attrs = { - 'source': 'geoglows', - 'forecast_date': date[:8], - 'retrieval_date': pd.Timestamp.now().strftime('%Y%m%d'), - 'units': 'cubic meters per second', - } - ds = xr.open_zarr(s3store).sel(rivid=river_id) - if return_format == 'xarray' and product_name == 'forecastensembles': - ds = ds.rename({'time': 'datetime', 'rivid': 'river_id'}) - ds.attrs = attrs - return ds - df = ds.to_dataframe().round(2).reset_index() - - # rename columns to match the REST API - if isinstance(river_id, int) or isinstance(river_id, np.int64): - df = df.pivot(index='time', columns='ensemble', values='Qout') - else: - df = df.pivot(index=['time', 'rivid'], columns='ensemble', values='Qout') - df.index.names = ['time', 'river_id'] - df = df[sorted(df.columns)] - df.columns = [f'ensemble_{str(x).zfill(2)}' for x in df.columns] - - if product_name == 'forecast': - df = calc_simple_forecast(df) - elif product_name == 'forecaststats': - df = calc_forecast_stats(df) - - if return_format == 'df': - return df - ds = df.to_xarray() - ds.attrs = attrs - return ds - - def from_rest(*args, **kwargs): - # update the default values set by the function unless the user has already specified them - for key, value in function.__kwdefaults__.items() if function.__kwdefaults__ else []: - if key not in kwargs: - kwargs[key] = value - - return_format = kwargs.get('format', 'csv') - assert return_format in ('csv', 'json', 'url'), f'Unsupported format requested: {return_format}' - - # parse out the information necessary to build a request url - endpoint = kwargs.get('endpoint', DEFAULT_REST_ENDPOINT) - endpoint = endpoint[:-1] if endpoint[-1] == '/' else endpoint - endpoint = endpoint + '/api' if not endpoint.endswith('/api') else endpoint - endpoint = f'https://{endpoint}' if not endpoint.startswith(('https://', 'http://')) else endpoint - - version = kwargs.get('version', DEFAULT_REST_ENDPOINT_VERSION) - assert version in ('v2',), ValueError(f'Unrecognized model version parameter: {version}') - - product_name = function.__name__.replace("_", "").lower() - - river_id = args[0] if len(args) > 0 else None - river_id = kwargs.get('river_id', '') if not river_id else river_id - if isinstance(river_id, list): - raise ValueError('Multiple river_ids are not available via REST API or on v1. 
' - 'Use data_source="aws" and version="v2" for multiple river_ids.') - river_id = int(river_id) if river_id else None - if river_id and version == 'v2': - assert 1_000_000_000 > river_id >= 110_000_000, ValueError('River ID must be a 9 digit integer') - - # request parameter validation before submitting - for key in ('endpoint', 'version', 'river_id'): - if key in kwargs: - del kwargs[key] - for key, value in kwargs.items(): - if value is None: - del kwargs[key] - for date in ('date', 'start_date', 'end_date'): - if date in kwargs: - assert len(str(kwargs[date])) == 8 or len( - str(kwargs[date])) == 10, f'Invalid date format: {kwargs[date]}' - if 'format' in kwargs and kwargs['format'] != 'json': - del kwargs['format'] - kwargs['source'] = kwargs.get('source', 'pygeoglows') # allow using default for specific apps which override - params = '&'.join([f'{key}={value}' for key, value in kwargs.items()]) - - # piece together the request url - request_url = f'{endpoint}/{version}/{product_name}' # build the base url - request_url = f'{request_url}/{river_id}' if river_id else request_url # add the river_id if it exists - request_url = f'{request_url}?{params}' # add the query parameters - - if return_format == 'url': - return request_url.replace(f'source={kwargs["source"]}', '') - - response = requests.get(request_url) - - if response.status_code != 200: - raise RuntimeError('Received an error from the REST API: ' + response.text) - - if return_format == 'csv': - df = pd.read_csv(StringIO(response.text)) - if 'datetime' in df.columns: - df['datetime'] = pd.to_datetime(df['datetime']) - df = df.set_index('datetime') - return df - elif return_format == 'json': - return response.json() - else: - raise ValueError(f'Unsupported return format requested: {return_format}') - - def main(*args, **kwargs): - source = kwargs.get('data_source', 'aws') - assert source in ('rest', 'aws'), ValueError(f'Unrecognized data source requested: {source}') - if source == 'rest': - return from_rest(*args, **kwargs) - return from_aws(*args, **kwargs) - - main.__doc__ = function.__doc__ # necessary for code documentation auto generators - return main - # Forecast data and derived products -@_forecast_endpoint_decorator +@_forecast def dates(**kwargs) -> dict or str: """ Gets a list of available forecast product dates @@ -204,7 +48,7 @@ def dates(**kwargs) -> dict or str: pass -@_forecast_endpoint_decorator +@_forecast def forecast(*, river_id: int, date: str, format: str, data_source: str, **kwargs) -> pd.DataFrame or xr.Dataset: """ @@ -222,7 +66,7 @@ def forecast(*, river_id: int, date: str, format: str, data_source: str, pass -@_forecast_endpoint_decorator +@_forecast def forecast_stats(*, river_id: int, date: str, format: str, data_source: str, **kwargs) -> pd.DataFrame or xr.Dataset: """ @@ -241,7 +85,7 @@ def forecast_stats(*, river_id: int, date: str, format: str, data_source: str, pass -@_forecast_endpoint_decorator +@_forecast def forecast_ensembles(*, river_id: int, date: str, format: str, data_source: str, **kwargs) -> pd.DataFrame or xr.Dataset: """ @@ -259,7 +103,7 @@ def forecast_ensembles(*, river_id: int, date: str, format: str, data_source: st pass -@_forecast_endpoint_decorator +@_forecast def forecast_records(*, river_id: int, start_date: str, end_date: str, format: str, **kwargs) -> pd.DataFrame or dict or str: """ @@ -278,7 +122,8 @@ def forecast_records(*, river_id: int, start_date: str, end_date: str, format: s # Retrospective simulation and derived products -def retrospective(river_id: int or 
list, format: str = 'df') -> pd.DataFrame or xr.Dataset: +@_retrospective +def retrospective(river_id: int or list, *, format: str = 'df') -> pd.DataFrame or xr.Dataset: """ Retrieves the retrospective simulation of streamflow for a given river_id from the AWS Open Data Program GEOGLOWS V2 S3 bucket @@ -290,15 +135,10 @@ def retrospective(river_id: int or list, format: str = 'df') -> pd.DataFrame or Returns: pd.DataFrame """ - s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) - s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/retrospective.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store).sel(rivid=river_id) - if format == 'xarray': - return ds - return ds.to_dataframe().reset_index().set_index('time').pivot(columns='rivid', values='Qout') + pass -def daily_averages(river_id: int or list) -> pd.DataFrame: +def daily_averages(river_id: int or list, **kwargs) -> pd.DataFrame: """ Retrieves daily average streamflow for a given river_id @@ -308,11 +148,11 @@ def daily_averages(river_id: int or list) -> pd.DataFrame: Returns: pd.DataFrame """ - df = retrospective(river_id) + df = retrospective(river_id, **kwargs) return calc_daily_averages(df) -def monthly_averages(river_id: int or list) -> pd.DataFrame: +def monthly_averages(river_id: int or list, **kwargs) -> pd.DataFrame: """ Retrieves monthly average streamflow for a given river_id @@ -322,11 +162,11 @@ def monthly_averages(river_id: int or list) -> pd.DataFrame: Returns: pd.DataFrame """ - df = retrospective(river_id) + df = retrospective(river_id, **kwargs) return calc_monthly_averages(df) -def annual_averages(river_id: int or list) -> pd.DataFrame: +def annual_averages(river_id: int or list, **kwargs) -> pd.DataFrame: """ Retrieves annual average streamflow for a given river_id @@ -336,11 +176,12 @@ def annual_averages(river_id: int or list) -> pd.DataFrame: Returns: pd.DataFrame """ - df = retrospective(river_id) + df = retrospective(river_id, **kwargs) return calc_annual_averages(df) -def return_periods(river_id: int or list, format: str = 'df', method: str = 'gumbel1') -> pd.DataFrame or xr.Dataset: +@_retrospective +def return_periods(river_id: int or list, *, format: str = 'df', method: str = 'gumbel1') -> pd.DataFrame or xr.Dataset: """ Retrieves the return period thresholds based on a specified historic simulation forcing on a certain river_id. 
@@ -355,17 +196,7 @@ def return_periods(river_id: int or list, format: str = 'df', method: str = 'gum Returns: pd.DataFrame """ - rp_methods = { - 'gumbel1': 'gumbel1_return_period', - } - assert method in rp_methods, f'Unrecognized return period estimation method given: {method}' - s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=ODP_S3_BUCKET_REGION)) - s3store = s3fs.S3Map(root=f'{ODP_RETROSPECTIVE_S3_BUCKET_URI}/return-periods.zarr', s3=s3, check=False) - ds = xr.open_zarr(s3store).sel(rivid=river_id) - if format == 'xarray': - return ds - return (ds[rp_methods[method]].to_dataframe().reset_index() - .pivot(index='rivid', columns='return_period', values=rp_methods[method])) + pass # model config and supplementary data From 9b174a2df8f38d8cd68813e35e430976c5dd3e62 Mon Sep 17 00:00:00 2001 From: Biplov Bhandari Date: Fri, 10 May 2024 09:57:55 -0500 Subject: [PATCH 05/11] fix the parquet link to the metatable (#36) * fix the parquet link to the metatable add metatable as an argument to the stream latlon_to_river function * update docs --- .gitignore | 3 ++- geoglows/data.py | 11 ++++++++--- geoglows/streams.py | 5 +++-- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 010e8d0..f5f4f81 100644 --- a/.gitignore +++ b/.gitignore @@ -7,4 +7,5 @@ dev.ipynb geoglows.egg-info dist .pypirc -*.parquet \ No newline at end of file +*.parquet +*.pyc diff --git a/geoglows/data.py b/geoglows/data.py index dcf8409..f635888 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -200,24 +200,29 @@ def return_periods(river_id: int or list, *, format: str = 'df', method: str = ' # model config and supplementary data -def metadata_tables(columns: list = None) -> pd.DataFrame: +def metadata_tables(columns: list = None, metadata_table_path: str = None) -> pd.DataFrame: """ Retrieves the master table of rivers metadata and properties as a pandas DataFrame Args: columns (list): optional subset of columns names to read from the parquet + metadata_table_path (str): optional path to a local copy of the metadata table Returns: pd.DataFrame """ if os.path.exists(METADATA_TABLE_PATH): return pd.read_parquet(METADATA_TABLE_PATH, columns=columns) + + if metadata_table_path: + return pd.read_parquet(metadata_table_path, columns=columns) + warn = f""" - Local copy of geoglows v2 metadata table not found. You should download a copy for optimal performance and + Local copy of geoglows v2 metadata table not found. You should download a copy for optimal performance and to make the data available when you are offline. A copy of the table will be cached at {METADATA_TABLE_PATH}. Alternatively, set the environment variable PYGEOGLOWS_METADATA_TABLE_PATH to the path of the table. 
""" warnings.warn(warn) - df = pd.read_parquet('https://geoglows-v2.s3-website-us-west-2.amazonaws.com/tables/package-metadata-table.parquet') + df = pd.read_parquet('http://geoglows-v2.s3-website-us-west-2.amazonaws.com/tables/package-metadata-table.parquet') os.makedirs(os.path.dirname(METADATA_TABLE_PATH), exist_ok=True) df.to_parquet(METADATA_TABLE_PATH) return df[columns] if columns else df diff --git a/geoglows/streams.py b/geoglows/streams.py index d6f19b5..919d86f 100644 --- a/geoglows/streams.py +++ b/geoglows/streams.py @@ -22,17 +22,18 @@ def river_to_vpu(river_id: int) -> int: ) -def latlon_to_river(lat: float, lon: float) -> int: +def latlon_to_river(lat: float, lon: float, metadata_table_path: str = None) -> int: """ Gives the River ID number whose outlet is nearest the given lat and lon Args: lat (float): a latitude lon (float): a longitude + metadata_table_path (str): optional path to the local metadata table Returns: int: a 9 digit integer that is a valid GEOGLOWS River ID number """ - df = metadata_tables(columns=['LINKNO', 'lat', 'lon']) + df = metadata_tables(columns=['LINKNO', 'lat', 'lon'], metadata_table_path=metadata_table_path) df['dist'] = ((df['lat'] - lat) ** 2 + (df['lon'] - lon) ** 2) ** 0.5 return df.loc[lambda x: x['dist'] == df['dist'].min(), 'LINKNO'].values[0] From 23d0a1170f6075cf9f22b09dcfa2fde44103a08a Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 10 May 2024 09:02:37 -0600 Subject: [PATCH 06/11] update version, docs, add arg to other streams functions --- docs/index.rst | 32 ++++++++++++++++++++++---------- geoglows/__init__.py | 6 +++--- geoglows/_constants.py | 10 ++++++++++ geoglows/data.py | 19 +++++++++---------- geoglows/streams.py | 10 ++++++---- 5 files changed, 50 insertions(+), 27 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 68e44e1..8081813 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,19 +7,31 @@ geoglows .. image:: https://anaconda.org/conda-forge/geoglows/badges/latest_release_date.svg :target: https://anaconda.org/geoglows/geoglows -The geoglows Python package enables access to data, API's, and code developed for the `GEOGLOWS Streamflow Model `_. +The geoglows Python package enables access to data, API's, and code developed for the `GEOGLOWS Hydrology Model `_. Read more about GEOGLOWS at ``_ -For demos, tutorials, and other training materials for GEOGLOWS and the geoglows Python packge, please visit -``_. +For demos, tutorials, and other training materials for GEOGLOWS and the geoglows Python packge, please visit ``_. -About GEOGLOWS ECMWF Streamflow -=============================== -GEOGLOWS ECMWF Streamflow Project: This project provides access to the results of a hydrological model that is run each -day. The model is based on a group of unique weather forecasts, known as an ensemble, from ECMWF. Each unique -precipitation forecast, known as an ensemble member, produces a unique streamflow forecast. There are 52 members of the -ensemble that drives the model each day. The ERA-5 historical precipitation dataset to also used to produce a -retrospective streamflow on each river. `Read more here `_. +Supplemental Data +================= +Some functions in this package will help you browse the metadata for the model to identify river locations, names, and +other information. It is available online at ``_. +If you do not already have a copy downloaded, the code will fetch it for you and cache a copy in the same directory that +the source code of the package is installed in. 
+ +It is more efficient to save this file yourself and reuse it so that you do not have to download it every time your python +environment is recreated, the package version is updated, and so on. You may do this in several ways. + +1. You can set the environment variable `PYGEOGLOWS_METADATA_TABLE_PATH` **before** importing the package. +2. Call the `geoglows.set_metadata_table_path` function to set the path the path at any time **after** importing. +3. Pass the path to the table to functions that use it. + +About the GEOGLOWS Hydrology Model +================================== +The GEOGLOWS Hydrology Model is run each day at midnight (UTC +00). The model is based on the ECMWF ENS and HRES ensemble +of meteorology and land surface model forecasts. There are 51 members of the ensemble that drives the model each day. +The ERA5 reanalysis dataset is also used to produce a retrospective simulation on each river. The model provides river in +units m^3/s over the preceeding interval (1, 3, or 24 hours depending on the dataset). `Read more here `_. .. toctree:: :caption: Table of Contents diff --git a/geoglows/__init__.py b/geoglows/__init__.py index 0a13e9f..2cd7b09 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -6,12 +6,12 @@ import geoglows.tables import geoglows.streamflow -from ._constants import METADATA_TABLE_PATH +from ._constants import get_metadata_table_path, set_metadata_table_path __all__ = [ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', - 'METADATA_TABLE_PATH' + 'get_metadata_table_path', 'set_metadata_table_path', ] -__version__ = '1.5.0' +__version__ = '1.6.0' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_constants.py b/geoglows/_constants.py index fd69956..50ff8f0 100644 --- a/geoglows/_constants.py +++ b/geoglows/_constants.py @@ -9,3 +9,13 @@ 'PYGEOGLOWS_METADATA_TABLE_PATH', os.path.join(os.path.dirname(__file__), 'data', 'metadata-tables.parquet') ) + + +def get_metadata_table_path() -> str: + return METADATA_TABLE_PATH + + +def set_metadata_table_path(path: str) -> str: + global METADATA_TABLE_PATH + METADATA_TABLE_PATH = path + return METADATA_TABLE_PATH diff --git a/geoglows/data.py b/geoglows/data.py index f635888..1dea868 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -4,7 +4,7 @@ import pandas as pd import xarray as xr -from ._constants import METADATA_TABLE_PATH +from ._constants import get_metadata_table_path from ._download_decorators import _forecast, _retrospective from .analyze import ( @@ -210,19 +210,18 @@ def metadata_tables(columns: list = None, metadata_table_path: str = None) -> pd Returns: pd.DataFrame """ - if os.path.exists(METADATA_TABLE_PATH): - return pd.read_parquet(METADATA_TABLE_PATH, columns=columns) - if metadata_table_path: return pd.read_parquet(metadata_table_path, columns=columns) - + metadata_table_path = get_metadata_table_path() + if os.path.exists(metadata_table_path): + return pd.read_parquet(metadata_table_path, columns=columns) warn = f""" - Local copy of geoglows v2 metadata table not found. You should download a copy for optimal performance and - to make the data available when you are offline. A copy of the table will be cached at {METADATA_TABLE_PATH}. - Alternatively, set the environment variable PYGEOGLOWS_METADATA_TABLE_PATH to the path of the table. + Local copy of geoglows v2 metadata table not found. + A copy of the table has been cached at {metadata_table_path} which you can move as desired. 
+ You should set the environment variable PYGEOGLOWS_METADATA_TABLE_PATH or provide the metadata_table_path argument. """ warnings.warn(warn) df = pd.read_parquet('http://geoglows-v2.s3-website-us-west-2.amazonaws.com/tables/package-metadata-table.parquet') - os.makedirs(os.path.dirname(METADATA_TABLE_PATH), exist_ok=True) - df.to_parquet(METADATA_TABLE_PATH) + os.makedirs(os.path.dirname(metadata_table_path), exist_ok=True) + df.to_parquet(metadata_table_path) return df[columns] if columns else df diff --git a/geoglows/streams.py b/geoglows/streams.py index 919d86f..2ba94d9 100644 --- a/geoglows/streams.py +++ b/geoglows/streams.py @@ -5,18 +5,19 @@ __all__ = ['river_to_vpu', 'latlon_to_river', 'river_to_latlon', ] -def river_to_vpu(river_id: int) -> int: +def river_to_vpu(river_id: int, metadata_table_path: str = None) -> int: """ Gives the VPU number for a given River ID number Args: river_id (int): a 9 digit integer that is a valid GEOGLOWS River ID number + metadata_table_path (str): optional path to the local metadata table Returns: int: a 3 digit integer that is the VPU number for the given River ID number """ return ( - metadata_tables(columns=['LINKNO', 'VPUCode']) + metadata_tables(columns=['LINKNO', 'VPUCode'], metadata_table_path=metadata_table_path) .loc[lambda x: x['LINKNO'] == river_id, 'VPUCode'] .values[0] ) @@ -38,18 +39,19 @@ def latlon_to_river(lat: float, lon: float, metadata_table_path: str = None) -> return df.loc[lambda x: x['dist'] == df['dist'].min(), 'LINKNO'].values[0] -def river_to_latlon(river_id: int) -> np.ndarray: +def river_to_latlon(river_id: int, metadata_table_path: str = None) -> np.ndarray: """ Gives the lat and lon of the outlet of the river with the given River ID number Args: river_id (int): a 9 digit integer that is a valid GEOGLOWS River ID number + metadata_table_path (str): optional path to the local metadata table Returns: np.ndarray: a numpy array of floats, [lat, lon] """ return ( - metadata_tables(columns=['LINKNO', 'lat', 'lon']) + metadata_tables(columns=['LINKNO', 'lat', 'lon'], metadata_table_path=metadata_table_path) .loc[lambda x: x['LINKNO'] == river_id, ['lat', 'lon']] .values[0] ) From 8b3729d879643561dc40908985d873c81d2597b6 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 10 May 2024 09:03:26 -0600 Subject: [PATCH 07/11] correct docs version number --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 6c3694b..332595a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ author = 'Riley Hales, PhD' # The full version, including alpha/beta/rc tags -release = '1.5.0' +release = '1.6.0' master_doc = 'index' # -- General configuration --------------------------------------------------- From 2121219fa1ccc7e9b2e30c994a604c4f02215189 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 13 May 2024 22:31:49 -0600 Subject: [PATCH 08/11] correct retrieving return periods with lists or arrays --- docs/conf.py | 2 +- geoglows/__init__.py | 2 +- geoglows/_download_decorators.py | 13 +++++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 332595a..41da386 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ author = 'Riley Hales, PhD' # The full version, including alpha/beta/rc tags -release = '1.6.0' +release = '1.6.1' master_doc = 'index' # -- General configuration --------------------------------------------------- diff --git a/geoglows/__init__.py b/geoglows/__init__.py index 2cd7b09..e6a6d92 100644 
--- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'get_metadata_table_path', 'set_metadata_table_path', ] -__version__ = '1.6.0' +__version__ = '1.6.1' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py index fb4924d..6e01e0f 100644 --- a/geoglows/_download_decorators.py +++ b/geoglows/_download_decorators.py @@ -35,8 +35,9 @@ def from_aws(*args, **kwargs): warnings.warn('forecast_records are not available from the AWS Open Data Program.') return from_rest(*args, **kwargs) - river_id = kwargs.get('river_id', '') - river_id = args[0] if len(args) > 0 else river_id + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') + if river_id is None or river_id == '': + raise ValueError('River ID must be provided to retrieve forecast data.') return_format = kwargs.get('format', 'df') assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}' @@ -118,8 +119,7 @@ def from_rest(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() - river_id = args[0] if len(args) > 0 else None - river_id = kwargs.get('river_id', '') if not river_id else river_id + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') if isinstance(river_id, list): raise ValueError('Multiple river_ids are not available via REST API or on v1. ' 'Use data_source="aws" for multiple river_ids.') @@ -182,8 +182,9 @@ def _retrospective(function): def main(*args, **kwargs): product_name = function.__name__.replace("_", "-").lower() - river_id = args[0] if len(args) > 0 else None - river_id = kwargs.get('river_id', '') if not river_id else river_id + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') + if river_id is None or river_id == '': + raise ValueError('River ID must be provided to retrieve retrospective data.') return_format = kwargs.get('format', 'df') assert return_format in ('df', 'xarray'), f'Unsupported return format requested: {return_format}' From 14ca779584a1a3c825d75f4242a3c4728e5000d2 Mon Sep 17 00:00:00 2001 From: Riley Hales PhD <39097632+rileyhales@users.noreply.github.com> Date: Wed, 22 May 2024 14:17:09 -0600 Subject: [PATCH 09/11] correct errors in s3 file pathing specific to windows (#37) * correct errors in s3 file pathing specific to windows * update param validation for rest endpoints --- docs/conf.py | 2 +- environment.yaml | 1 + geoglows/__init__.py | 2 +- geoglows/_download_decorators.py | 40 ++++++++++++++++++-------------- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 41da386..6f9a35d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ author = 'Riley Hales, PhD' # The full version, including alpha/beta/rc tags -release = '1.6.1' +release = '1.6.2' master_doc = 'index' # -- General configuration --------------------------------------------------- diff --git a/environment.yaml b/environment.yaml index 6908fc6..0ab021c 100644 --- a/environment.yaml +++ b/environment.yaml @@ -3,6 +3,7 @@ channels: - conda-forge dependencies: - python>=3 + - cftime - dask >=2024 - fastparquet - HydroErr diff --git a/geoglows/__init__.py b/geoglows/__init__.py index e6a6d92..a0ee1f5 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'get_metadata_table_path', 
'set_metadata_table_path', ] -__version__ = '1.6.1' +__version__ = '1.6.2' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py index 6e01e0f..501ea70 100644 --- a/geoglows/_download_decorators.py +++ b/geoglows/_download_decorators.py @@ -1,23 +1,21 @@ -import os import warnings from io import StringIO +import numpy as np import pandas as pd import requests import s3fs import xarray as xr -import numpy as np - -from .analyze import ( - simple_forecast as calc_simple_forecast, - forecast_stats as calc_forecast_stats, -) from ._constants import ( ODP_FORECAST_S3_BUCKET_URI, ODP_RETROSPECTIVE_S3_BUCKET_URI, ODP_S3_BUCKET_REGION, ) +from .analyze import ( + simple_forecast as calc_simple_forecast, + forecast_stats as calc_forecast_stats, +) DEFAULT_REST_ENDPOINT = 'https://geoglows.ecmwf.int/api/' DEFAULT_REST_ENDPOINT_VERSION = 'v2' # 'v1, v2, latest' @@ -29,14 +27,22 @@ def _forecast(function): + def _river_id_is_iterable(river_id): + return bool( + isinstance(river_id, list) or + isinstance(river_id, tuple) or + isinstance(river_id, set) or + isinstance(river_id, np.ndarray) + ) + def from_aws(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() if product_name == 'forecastrecords': warnings.warn('forecast_records are not available from the AWS Open Data Program.') return from_rest(*args, **kwargs) - river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') - if river_id is None or river_id == '': + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) + if river_id is None: raise ValueError('River ID must be provided to retrieve forecast data.') return_format = kwargs.get('format', 'df') @@ -51,7 +57,7 @@ def from_aws(*args, **kwargs): date = kwargs.get('date', False) if not date: zarr_vars = ['rivid', 'Qout', 'time', 'ensemble'] - dates = [s3.glob(os.path.join(ODP_FORECAST_S3_BUCKET_URI, f'*.zarr/{var}')) for var in zarr_vars] + dates = [s3.glob(ODP_FORECAST_S3_BUCKET_URI + '/' + f'*.zarr/{var}') for var in zarr_vars] dates = [set([d.split('/')[1].replace('.zarr', '') for d in date]) for date in dates] dates = sorted(set.intersection(*dates), reverse=True) if product_name == 'dates': @@ -119,11 +125,11 @@ def from_rest(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() - river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') - if isinstance(river_id, list): - raise ValueError('Multiple river_ids are not available via REST API or on v1. ' - 'Use data_source="aws" for multiple river_ids.') - river_id = int(river_id) if river_id else None + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) + if river_id is None: + raise ValueError('River ID must be provided to retrieve forecast data.') + if not isinstance(river_id, (int, np.int64, )): + raise ValueError('Multiple river_ids are not available via REST API. 
Provide a single 9 digit integer.') if river_id and version == 'v2': assert 1_000_000_000 > river_id >= 110_000_000, ValueError('River ID must be a 9 digit integer') @@ -182,8 +188,8 @@ def _retrospective(function): def main(*args, **kwargs): product_name = function.__name__.replace("_", "-").lower() - river_id = args[0] if len(args) > 0 else kwargs.get('river_id', '') - if river_id is None or river_id == '': + river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) + if river_id is None: raise ValueError('River ID must be provided to retrieve retrospective data.') return_format = kwargs.get('format', 'df') From 52c8ea94a4da137c01e6a06d210e24fc4ed3d157 Mon Sep 17 00:00:00 2001 From: Riley Hales PhD <39097632+rileyhales@users.noreply.github.com> Date: Thu, 30 May 2024 11:40:26 -0600 Subject: [PATCH 10/11] Fix date list (#38) * correct bug getting date list * increment version number --- docs/conf.py | 2 +- geoglows/__init__.py | 2 +- geoglows/_download_decorators.py | 12 ++---------- geoglows/data.py | 5 ++++- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 6f9a35d..a822a70 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,7 +15,7 @@ author = 'Riley Hales, PhD' # The full version, including alpha/beta/rc tags -release = '1.6.2' +release = '1.6.3' master_doc = 'index' # -- General configuration --------------------------------------------------- diff --git a/geoglows/__init__.py b/geoglows/__init__.py index a0ee1f5..95e484e 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'get_metadata_table_path', 'set_metadata_table_path', ] -__version__ = '1.6.2' +__version__ = '1.6.3' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py index 501ea70..3bfde37 100644 --- a/geoglows/_download_decorators.py +++ b/geoglows/_download_decorators.py @@ -27,14 +27,6 @@ def _forecast(function): - def _river_id_is_iterable(river_id): - return bool( - isinstance(river_id, list) or - isinstance(river_id, tuple) or - isinstance(river_id, set) or - isinstance(river_id, np.ndarray) - ) - def from_aws(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() if product_name == 'forecastrecords': @@ -42,7 +34,7 @@ def from_aws(*args, **kwargs): return from_rest(*args, **kwargs) river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) - if river_id is None: + if river_id is None and product_name != 'dates': raise ValueError('River ID must be provided to retrieve forecast data.') return_format = kwargs.get('format', 'df') @@ -126,7 +118,7 @@ def from_rest(*args, **kwargs): product_name = function.__name__.replace("_", "").lower() river_id = args[0] if len(args) > 0 else kwargs.get('river_id', None) - if river_id is None: + if river_id is None and product_name != 'dates': raise ValueError('River ID must be provided to retrieve forecast data.') if not isinstance(river_id, (int, np.int64, )): raise ValueError('Multiple river_ids are not available via REST API. 
Provide a single 9 digit integer.') diff --git a/geoglows/data.py b/geoglows/data.py index 1dea868..cf5f8fa 100644 --- a/geoglows/data.py +++ b/geoglows/data.py @@ -5,7 +5,7 @@ import xarray as xr from ._constants import get_metadata_table_path -from ._download_decorators import _forecast, _retrospective +from ._download_decorators import _forecast, _retrospective, DEFAULT_REST_ENDPOINT, DEFAULT_REST_ENDPOINT_VERSION from .analyze import ( daily_averages as calc_daily_averages, @@ -27,6 +27,9 @@ 'return_periods', 'metadata_tables', + + 'DEFAULT_REST_ENDPOINT', + 'DEFAULT_REST_ENDPOINT_VERSION', ] From 97eb2850b0746dc02c8771a98b2478fe47bd5a06 Mon Sep 17 00:00:00 2001 From: Riley Hales <39097632+rileyhales@users.noreply.github.com> Date: Thu, 18 Jul 2024 21:52:21 -0600 Subject: [PATCH 11/11] Time zone index (#39) * add time zone information to data retrievals * optional decade average bars on annual averages plot --- geoglows/__init__.py | 2 +- geoglows/_download_decorators.py | 6 ++- geoglows/_plots/__init__.py | 2 + geoglows/_plots/format_tools.py | 13 +++++ geoglows/_plots/plotly_forecasts.py | 24 ++++++--- geoglows/_plots/plotly_retrospective.py | 66 +++++++++++++++++-------- geoglows/_plots/plots.py | 6 ++- 7 files changed, 86 insertions(+), 33 deletions(-) diff --git a/geoglows/__init__.py b/geoglows/__init__.py index 95e484e..5cdc1c4 100644 --- a/geoglows/__init__.py +++ b/geoglows/__init__.py @@ -12,6 +12,6 @@ 'bias', 'plots', 'data', 'analyze', 'streams', 'tables', 'streamflow', 'get_metadata_table_path', 'set_metadata_table_path', ] -__version__ = '1.6.3' +__version__ = '1.7.0' __author__ = 'Riley Hales' __license__ = 'BSD 3-Clause Clear License' diff --git a/geoglows/_download_decorators.py b/geoglows/_download_decorators.py index 3bfde37..23e702f 100644 --- a/geoglows/_download_decorators.py +++ b/geoglows/_download_decorators.py @@ -76,6 +76,7 @@ def from_aws(*args, **kwargs): ds.attrs = attrs return ds df = ds.to_dataframe().round(2).reset_index() + df['time'] = pd.to_datetime(df['time'], utc=True) # rename columns to match the REST API if isinstance(river_id, int) or isinstance(river_id, np.int64): @@ -159,6 +160,7 @@ def from_rest(*args, **kwargs): if 'datetime' in df.columns: df['datetime'] = pd.to_datetime(df['datetime']) df = df.set_index('datetime') + df.index = df.index.tz_localize('UTC') return df elif return_format == 'json': return response.json() @@ -204,13 +206,15 @@ def main(*args, **kwargs): if return_format == 'xarray': return ds if product_name == 'retrospective': - return ( + df = ( ds .to_dataframe() .reset_index() .set_index('time') .pivot(columns='rivid', values='Qout') ) + df.index = df.index.tz_localize('UTC') + return df if product_name == 'return-periods': rp_methods = { 'gumbel1': 'gumbel1_return_period', diff --git a/geoglows/_plots/__init__.py b/geoglows/_plots/__init__.py index 84233f5..e387110 100644 --- a/geoglows/_plots/__init__.py +++ b/geoglows/_plots/__init__.py @@ -12,6 +12,8 @@ corrected_month_average, corrected_day_average, corrected_scatterplots, + + plotly_figure_to_html_plot as plotly_figure_to_html, ) __all__ = [ diff --git a/geoglows/_plots/format_tools.py b/geoglows/_plots/format_tools.py index 41cdbf2..c714556 100644 --- a/geoglows/_plots/format_tools.py +++ b/geoglows/_plots/format_tools.py @@ -1,3 +1,6 @@ +import datetime + +import pytz from plotly.offline import plot as offline_plot @@ -26,3 +29,13 @@ def plotly_figure_to_html_plot(figure, include_plotlyjs: bool = False, ) -> str: output_type='div', 
include_plotlyjs=include_plotlyjs ) + + +def timezone_label(timezone: str = None): + timezone = str(timezone) if timezone is not None else 'UTC' + # get the number of hours the timezone is offset from UTC + now = datetime.datetime.now(pytz.timezone(timezone)) + utc_offset = now.utcoffset().total_seconds() / 3600 + # convert float number of hours to HH:MM format + utc_offset = f'{int(utc_offset):+03d}:{int((utc_offset % 1) * 60):02d}' + return f'Datetime ({timezone} {utc_offset})' diff --git a/geoglows/_plots/plotly_forecasts.py b/geoglows/_plots/plotly_forecasts.py index 66802fc..bc3c178 100644 --- a/geoglows/_plots/plotly_forecasts.py +++ b/geoglows/_plots/plotly_forecasts.py @@ -2,7 +2,7 @@ import pandas as pd import plotly.graph_objects as go -from .format_tools import build_title +from .format_tools import build_title, timezone_label from .plotly_helpers import _rperiod_scatters __all__ = [ @@ -35,7 +35,7 @@ def forecast(df: pd.DataFrame, *, ), go.Scatter( name='Uncertainty Bounds', - x=np.concatenate([df.index.values, df.index.values[::-1]]), + x=np.concatenate([df.index, df.index[::-1]]), y=np.concatenate([df['flow_uncertainty_upper'], df['flow_uncertainty_lower'][::-1]]), legendgroup='uncertainty', showlegend=True, @@ -67,7 +67,11 @@ def forecast(df: pd.DataFrame, *, layout = go.Layout( title=build_title('Forecasted Streamflow', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [df.index[0], df.index[-1]]}, + xaxis={ + 'title': timezone_label(df.index.tz), + 'range': [df.index[0], df.index[-1]], + 'hoverformat': '%d %b %Y %X', + }, ) return go.Figure(scatter_traces, layout=layout) @@ -179,9 +183,9 @@ def forecast_stats(df: pd.DataFrame, *, 'range': [0, 'auto'] }, xaxis={ - 'title': 'Date (UTC +0:00)', + 'title': timezone_label(df.index.tz), 'range': [startdate, enddate], - 'hoverformat': '%b %d %Y', + 'hoverformat': '%d %b %Y %X', 'tickformat': '%b %d %Y' }, ) @@ -250,9 +254,9 @@ def forecast_ensembles(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_tit title=build_title('Ensemble Predicted Streamflow', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, xaxis={ - 'title': 'Date (UTC +0:00)', + 'title': timezone_label(df.index.tz), 'range': [startdate, enddate], - 'hoverformat': '%b %d %Y', + 'hoverformat': '%d %b %Y %X', 'tickformat': '%b %d %Y' }, ) @@ -297,6 +301,10 @@ def forecast_records(df: pd.DataFrame, *, rp_df: pd.DataFrame = None, plot_title layout = go.Layout( title=build_title('Previous Forecasted Streamflow', plot_titles=plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'range': [startdate, enddate]}, + xaxis={ + 'title': timezone_label(df.index.tz), + 'range': [startdate, enddate], + 'hoverformat': '%d %b %Y %X', + }, ) return go.Figure(scatter_plots, layout=layout) diff --git a/geoglows/_plots/plotly_retrospective.py b/geoglows/_plots/plotly_retrospective.py index 3115a50..710f233 100644 --- a/geoglows/_plots/plotly_retrospective.py +++ b/geoglows/_plots/plotly_retrospective.py @@ -1,8 +1,9 @@ +import numpy as np import pandas as pd import plotly.graph_objs as go import scipy.stats -from .format_tools import build_title +from .format_tools import build_title, timezone_label from .plotly_helpers import _rperiod_scatters __all__ = [ @@ -15,29 +16,27 @@ ] -def retrospective(retro: pd.DataFrame, *, - rp_df: pd.DataFrame = None, plot_titles: dict = None, ) -> go.Figure: +def retrospective(df: pd.DataFrame, *, 
rp_df: pd.DataFrame = None, plot_titles: dict = None, ) -> go.Figure: """ Makes the streamflow ensemble data and metadata into a plotly plot Args: - retro: the csv response from historic_simulation + df: the csv response from historic_simulation rp_df: the csv response from return_periods - plot_type: either 'json', 'plotly', or 'html' (default plotly) plot_titles: (dict) Extra info to show on the title of the plot. For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} Return: plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method """ - dates = retro.index.tolist() + dates = df.index.tolist() startdate = dates[0] enddate = dates[-1] plot_data = { 'x_datetime': dates, - 'y_flow': retro.values.flatten(), - 'y_max': retro.values.max(), + 'y_flow': df.values.flatten(), + 'y_max': df.values.max(), } if rp_df is not None: plot_data.update(rp_df.to_dict(orient='index').items()) @@ -56,9 +55,9 @@ def retrospective(retro: pd.DataFrame, *, title=build_title('Retrospective Streamflow Simulation', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, xaxis={ - 'title': 'Date (UTC +0:00)', + 'title': timezone_label(df.index.tz), 'range': [startdate, enddate], - 'hoverformat': '%b %d %Y', + 'hoverformat': '%d %b %Y', 'tickformat': '%Y' }, ) @@ -94,18 +93,19 @@ def daily_averages(dayavg: pd.DataFrame, plot_titles: list = None, plot_type: st layout = go.Layout( title=build_title('Daily Average Streamflow (Simulated)', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'hoverformat': '%b %d', 'tickformat': '%b'}, + xaxis={'title': 'Date', 'hoverformat': '%b %d', 'tickformat': '%b'}, ) return go.Figure(scatter_plots, layout=layout) -def monthly_averages(monavg: pd.DataFrame, titles: dict = None, plot_titles: list = None, plot_type: str = 'plotly') -> go.Figure: +def monthly_averages(monavg: pd.DataFrame, plot_titles: list = None, + plot_type: str = 'plotly') -> go.Figure: """ Makes the daily_averages data and metadata into a plotly plot Args: monavg: the csv response from monthly_averages - titles: (dict) Extra info to show on the title of the plot. For example: + plot_titles: (dict) Extra info to show on the title of the plot. For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} plot_type: either 'plotly', or 'html' (default plotly) @@ -133,7 +133,7 @@ def monthly_averages(monavg: pd.DataFrame, titles: dict = None, plot_titles: lis return go.Figure(scatter_plots, layout=layout) -def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figure: +def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, decade_averages: bool = False) -> go.Figure: """ Makes the annual_averages data and metadata into a plotly plot @@ -141,6 +141,7 @@ def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figur df: the csv response from annual_averages plot_titles: (dict) Extra info to show on the title of the plot. 
For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} + decade_averages: (bool) if True, will plot the average flow for each decade Return: plotly.GraphObject: plotly object, especially for use with python notebooks and the .show() method @@ -153,6 +154,29 @@ def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figur line=dict(color='blue') ), ] + + if decade_averages: + # get a list of decades covered by the data in the index + first_year = str(int(df.index[0]))[:-1] + '0' + last_year = str(int(df.index[-1]))[:-1] + '9' + first_year = int(first_year) + last_year = int(last_year) + decades = [decade for decade in range(int(first_year), int(last_year) + 1, 10)] + for idx, decade in enumerate(decades): + decade_values = df[np.logical_and(df.index.astype(int) >= decade, df.index.astype(int) < decade + 10)] + mean_flow = decade_values.values.flatten().mean() + scatter_plots.append( + go.Scatter( + name=f'{decade}s: {mean_flow:.2f} m3/s', + x=[decade_values.index[0], decade_values.index[-1]], + y=mean_flow * np.ones(2), + line=dict(color='red'), + hoverinfo='name', + legendgroup='decade_averages', + legendgrouptitle=dict(text='Decade Averages') + ) + ) + layout = go.Layout( title=build_title('Annual Average Streamflow (Simulated)', plot_titles), yaxis={'title': 'Streamflow (m3/s)'}, @@ -161,12 +185,12 @@ def annual_averages(df: pd.DataFrame, *, plot_titles: list = None, ) -> go.Figur return go.Figure(scatter_plots, layout=layout) -def flow_duration_curve(hist: pd.DataFrame, plot_titles: dict = None, plot_type: str = 'plotly') -> go.Figure: +def flow_duration_curve(df: pd.DataFrame, plot_titles: dict = None, plot_type: str = 'plotly') -> go.Figure: """ Makes the streamflow ensemble data and metadata into a plotly plot Args: - hist: the csv response from historic_simulation + df: the dataframe response from data.retrospective plot_titles: (dict) Extra info to show on the title of the plot. For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} plot_type: either 'json', 'plotly', or 'html' (default plotly) @@ -178,7 +202,7 @@ def flow_duration_curve(hist: pd.DataFrame, plot_titles: dict = None, plot_type: raise ValueError('invalid plot_type specified. pick json, plotly, plotly_scatters, or html') # process the hist dataframe to create the flow duration curve - sorted_hist = hist.values.flatten() + sorted_hist = df.values.flatten() sorted_hist.sort() # ranks data from smallest to largest @@ -212,12 +236,12 @@ def flow_duration_curve(hist: pd.DataFrame, plot_titles: dict = None, plot_type: return go.Figure(scatter_plots, layout=layout) -def daily_stats(hist: pd.DataFrame, *, plot_titles: dict = None, plot_type: str = 'plotly') -> go.Figure: +def daily_stats(df: pd.DataFrame, *, plot_titles: dict = None, plot_type: str = 'plotly') -> go.Figure: """ Plots a graph with statistics for each day of year Args: - hist: dataframe of values to plot + df: dataframe of values to plot plot_titles: (dict) Extra info to show on the title of the plot. 
For example: {'River ID': 1234567, 'Drainage Area': '1000km^2'} plot_type: either 'plotly' (python object, default), 'plotly_scatters', or 'html' @@ -226,7 +250,7 @@ def daily_stats(hist: pd.DataFrame, *, plot_titles: dict = None, plot_type: str plot of the graph of the low flows """ - stats_df = daily_stats(hist) + stats_df = daily_stats(df) data = [ go.Scatter( @@ -241,7 +265,7 @@ def daily_stats(hist: pd.DataFrame, *, plot_titles: dict = None, plot_type: str layout = go.Layout( title=build_title('Daily Average Streamflow (Simulated)', plot_titles), yaxis={'title': 'Streamflow (m3/s)', 'range': [0, 'auto']}, - xaxis={'title': 'Date (UTC +0:00)', 'hoverformat': '%b %d', 'tickformat': '%b'}, + xaxis={'title': timezone_label(df.index.tz), 'hoverformat': '%b %d', 'tickformat': '%b'}, ) return go.Figure(data=data, layout=layout) diff --git a/geoglows/_plots/plots.py b/geoglows/_plots/plots.py index a886dcf..504c233 100644 --- a/geoglows/_plots/plots.py +++ b/geoglows/_plots/plots.py @@ -200,7 +200,8 @@ def monthly_averages(df: pd.DataFrame, *, def annual_averages(df: pd.DataFrame, *, plot_type: str = 'plotly', - plot_titles: list = None, ) -> go.Figure: + plot_titles: list = None, + decade_averages: bool = False, ) -> go.Figure: """ Makes a plotly figure of the annual average flows @@ -208,12 +209,13 @@ def annual_averages(df: pd.DataFrame, *, df: a dataframe of the annual average flows plot_type: either plotly or html plot_titles: additional key-value pairs to display in the title of the figure + decade_averages: if True, the figure will include the average flows for each decade Returns: go.Figure """ if plot_type in ('plotly', 'html'): - figure = plotly_annual_averages(df, plot_titles=plot_titles) + figure = plotly_annual_averages(df, plot_titles=plot_titles, decade_averages=decade_averages) if plot_type == 'html': return plotly_figure_to_html_plot(figure) return figure
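
Taken together, patches 09 through 11 tighten river ID validation and make every returned
datetime index timezone aware. Below is a minimal usage sketch of the resulting v1.7.0
behavior. It assumes the decorated functions above are re-exported through the public
geoglows.data and geoglows.plots modules as in earlier releases, and the river ID is a
hypothetical 9 digit integer inside the validated range [110_000_000, 1_000_000_000):

    import geoglows

    river_id = 710_000_000  # hypothetical ID; lists of IDs are only accepted with data_source='aws'

    # Retrospective simulation: the index is now tz-localized to UTC (patch 11)
    df = geoglows.data.retrospective(river_id)
    print(df.index.tz)  # UTC

    # Build annual averages with an integer year index, which is what the
    # decade-average logic in plotly_retrospective.py expects
    annual = df.groupby(df.index.year).mean()

    # decade_averages=True adds one mean-flow trace per decade, grouped under
    # the 'Decade Averages' legend group
    fig = geoglows.plots.annual_averages(annual, decade_averages=True)
    fig.show()

With a UTC index, the x axis title produced by the new timezone_label helper reads
'Datetime (UTC +00:00)'; passing a frame localized to another zone changes the label
and offset accordingly.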