added new fn, added tests, ran black

ianozsvald · Oct 24, 2023 · 1d873e4 · 1d873e4
1 parent ddd15c5
commit 1d873e4
Show file tree

Hide file tree

Showing 6 changed files with 119 additions and 27 deletions.
diff --git a/README.md b/README.md
@@ -328,12 +328,19 @@ client # show client details
 
 Use `dask.config.set(temporary_directory='/path/to/tmp')` to set temp folder if we run out of disk (h/t https://stackoverflow.com/questions/40042748/how-to-specify-the-directory-that-dask-uses-for-temporary-files https://docs.dask.org/en/stable/configuration.html). To confirm this open the Web UI (maybe on http://127.0.0.1:8787/status), go to Info, logs, then check early output to see the entry for "Directory".
 
-# Conda
+# Conda or Pip
 
 * `conda config --show` to list all config
 * `conda info` to list general configuration
 * prefer channel_priority to be strict https://conda-forge.org/
 
+## Pip 
+
+```
+$ conda create -n notes_to_self python=3.12
+$ conda activate notes_to_self
+$ pip install -r requirements.txt
+```
+
 ## Conda for this environment
 
 ```
@@ -342,6 +349,17 @@ $ conda create -n notes_to_self python=3.10 pandas matplotlib jupyterlab altair
 $ pytest *.py
 ```
 
+## testing and coverage
+
+```
+pytest
+
+coverage run -m pytest 
+coverage html
+firefox htmlcov/index.html
+coverage erase
+```
+
 # Shell
 
 ## link

diff --git a/labelling.py b/labelling.py
@@ -27,7 +27,7 @@ def format_to_base_10(
     else:
         sgn = ""
     if m >= 0:
-        short_form = num / 1000.0 ** m
+        short_form = num / 1000.0**m
         short_form_template = f"{short_form:.{precision}f}"
         if precision > 0 and trim_0_decimals:
             if short_form == int(short_form):

diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,8 @@
+numpy
+pandas
+altair
+matplotlib
+pytest
+ipython
+black
+coverage
diff --git a/simpler_pandas.py b/simpler_pandas.py
@@ -25,20 +25,44 @@
 }
 
 
+def to_datetime_helper(ser, format="%b %Y", trim_at=10):
+    """Convert `ser` with `pd.to_datetime` and print the original strings that failed
+    Failures are coerced to NaT, at most `trim_at` of them are printed
+    The default `format` of `%b %Y` matches e.g. 'Jan 2023'
+    `to_datetime` seems to skip whitespace"""
+    converted = pd.to_datetime(ser, errors="coerce", format=format)
+    failed = converted.isna()  # NaT marks an item that did not convert
+    nbr_failed = failed.sum()
+    print(f"{nbr_failed} errors seen in conversion")
+    if nbr_failed > trim_at:
+        print(f"{nbr_failed} is too many errors, trimming to {trim_at}")
+    # the running count of failures lets us keep only the first `trim_at` of them
+    to_show = failed & (failed.cumsum() <= trim_at)
+    for idx, value in ser[to_show].items():
+        print(f"Row {idx} '{value}'")
+    return converted
+
+
 def check_series_is_ordered(ser, ascending=True):
     """Check 1 series is ascending"""
-    assert ascending==True, "Haven't done descending yet, nor tested this"
-    return (ser.shift()[1:].reset_index(drop=True) >= ser[:-1].reset_index(drop=True)).all()
+    assert ascending == True, "Haven't done descending yet, nor tested this"
+    # each value vs its predecessor; shift()[1:] was the same data as ser[:-1]
+    values = ser.to_numpy()
+    return (values[1:] >= values[:-1]).all()
 
 
 def show_df_details(df):
     """Dig into _data hidden attribute, note is_consolidated check can be slow first time"""
-    print(
-        f"""is view {df._data.is_view}, is consolidated {df._data.is_consolidated()}, single block {df._data.is_single_block}"""
-        f""", numeric mixed {df._data.is_numeric_mixed_type}"""
-    )
-    print(f"""{df._data.nblocks} blocks looking like:""")
-    print(df._data.blocks)
+    import warnings
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        # 2023 we get DeprecationWarning: DataFrame._data is deprecated and will be removed in a future version. Use public APIs instead.
+        print(
+            f"""is view {df._data.is_view}, is consolidated {df._data.is_consolidated()}, single block {df._data.is_single_block}"""
+        )
+        print(f"""{df._data.nblocks} blocks looking like:""")
+        print(df._data.blocks)
 
 
 def sanity_check(df):
@@ -201,3 +225,5 @@ def apply_labelling(ser, format_fn=None, **kwargs):
     counted_vc = counted.value_counts()
     counted_vc.index = apply_labelling(counted_vc.index, format_to_base_10, prefix="$")
     show_all(counted_vc)
+
+    show_df_details(df)
diff --git a/simpler_stats.py b/simpler_stats.py
@@ -19,6 +19,7 @@
 
 
 def calculate_ci(arr):
+    """Calculate a 90% CI"""
     p = arr.mean()
     q = 1 - p
     n = arr.shape[0]
@@ -28,28 +29,52 @@ def calculate_ci(arr):
     return p - se_95, p, p + se_95
 
 
-def calculate_bootstrap_ci(arr, repeats=1000):
-    """Build repeats' worth of bootstrap samples, calculate percentiles"""
-    pc2_5_idx = int(repeats * 0.025)
-    pc50_idx = int(repeats * 0.5)
-    pc97_5_idx = int(repeats * 0.975)
-    percentiles = (np.array([0.025, 0.5, 0.975]) * repeats).astype(int)
+def calculate_bootstraps(arr, repeats=1000, agg_fn=np.sum):
+    """Calculate a bootstrap statistic (default `sum`)
+    Given an array calculate `repeats` bootstrap samples
+    taking the `agg_fn` of each and return `repeats` results"""
     n = arr.shape[0]
-    means = []
+    aggs = []
     for it in range(repeats):
         mask = rng.integers(0, n, n)
-        arr2 = arr[mask]
-        means.append(arr2.mean())
-    means = np.array(means)
-    means.sort()
-    print(
-        f"Bootstrap mean {means[pc50_idx]:0.3f}, 2.5th CI {means[pc2_5_idx]:0.3f}, 97.5th CI {means[pc97_5_idx]:0.3f}"
-    )
-    return means[percentiles]
+        aggs.append(agg_fn(arr[mask]))
+    aggs = np.array(aggs)
+    return aggs
+
+
+def calculate_bootstrap_ci(
+    arr, percentiles=(0.025, 0.5, 0.975), repeats=1000, agg_fn=np.mean
+):
+    """Bootstrap CI for `agg_fn` (default mean), `percentiles` are fractions e.g. 0.5
+    Runs `repeats` bootstraps of `arr`, sorts them and returns the values at each
+    percentile's position, a percentile of 1.0 is clamped to the largest value"""
+    perc = np.array([min(int(p * repeats), repeats - 1) for p in percentiles])
+    aggs = calculate_bootstraps(arr, repeats, agg_fn)
+    aggs.sort()
+    return aggs[perc]
+
+
+def test_calculate_bootstrap():
+    arr = np.ones(10)
+    bootstraps = calculate_bootstraps(arr, repeats=10_000)
+    # given 10 * 1 in arr, we expect 10 as the result (no variance)
+    np.testing.assert_equal(bootstraps[0], 10)
+    assert np.var(bootstraps) == 0, "Not expecting any variance"
+    assert bootstraps.shape == (10_000,)
+
+    HIGH = 100  # exclusive upper bound of the random values
+    SIZE = 100  # number of random values drawn
+    arr = np.random.randint(0, high=HIGH, size=SIZE)
+    bootstraps = calculate_bootstraps(arr, repeats=1_000, agg_fn=np.sum)
+    assert bootstraps.min() >= 0
+    assert bootstraps.max() <= SIZE * HIGH
+    bootstraps = calculate_bootstraps(arr, repeats=1_000, agg_fn=np.mean)
+    assert bootstraps.min() >= 0
+    assert bootstraps.max() <= HIGH  # a mean is bounded by the values, not by SIZE
 
 
 if __name__ == "__main__":
     arr = rng.binomial(1, 0.5, 1000)
     arr = arr < 0.01
     print(calculate_ci(arr))
-    print(calculate_bootstrap_ci(arr))
+    print(calculate_bootstrap_ci(arr, repeats=10_000))
diff --git a/test_simpler_pandas.py b/test_simpler_pandas.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from datetime import datetime
 
 from simpler_pandas import (
     make_bin_edges,
@@ -12,12 +13,26 @@
     label_interval,
     show_df_details,
     show_all,
+    to_datetime_helper,
 )
 from labelling import format_to_base_10
 
 # TODO value_counts_pct has no tests yet
 
 
+def test_to_datetime_helper():
+    """Valid strings give month-start datetimes, unparseable ones give NaT"""
+    months = [datetime(2023, 1, 1), datetime(2024, 2, 1)]
+    assert (to_datetime_helper(pd.Series(["Jan 2023", "Feb 2024"])) == months).all()
+
+    # NaT never equals anything, so an element-wise check cannot cover the
+    # failed row - count the rows and the NaT entries instead
+    with_bad_row = pd.Series(["Jan 2023", "Feb 2024", "xx"])
+    converted = to_datetime_helper(with_bad_row)
+    assert len(converted) == 3
+    assert converted.isna().sum() == 1
+
+
 def test_show_all(capsys):
     # TODO run coverage, currently we don't test everything
     df = pd.DataFrame({"a": [1, 2, 3]})
@@ -35,7 +50,7 @@ def test_show_df_details(capsys):
     show_df_details(df)
     captured = capsys.readouterr()
     assert "is view False" in captured.out
-    assert "numeric mixed True" in captured.out
+    assert "is consolidated True, single block True" in captured.out
 
 
 # TODO replace with warns check https://docs.pytest.org/en/latest/how-to/capture-warnings.html#warns