Skip to content

Commit

Permalink
added new fn, added tests, ran black
Browse files Browse the repository at this point in the history
  • Loading branch information
ianozsvald committed Oct 24, 2023
1 parent ddd15c5 commit 1d873e4
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 27 deletions.
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -328,12 +328,19 @@ client # show client details

Use `dask.config.set(temporary_directory='/path/to/tmp')` to set the temp folder if we run out of disk space (h/t https://stackoverflow.com/questions/40042748/how-to-specify-the-directory-that-dask-uses-for-temporary-files https://docs.dask.org/en/stable/configuration.html). To confirm the setting, open the Web UI (maybe on http://127.0.0.1:8787/status), go to Info, then Logs, and check the early output for the "Directory" entry.

# Conda
# Conda or Pip

* `conda config --show` to list all config
* `conda info` to list general configuration
* prefer channel_priority to be strict https://conda-forge.org/

## Pip

```
$ conda create -n notes_to_self python=3.12
$ conda activate notes_to_self
$ pip install -r requirements.txt
```

## Conda for this environment

```
Expand All @@ -342,6 +349,17 @@ $ conda create -n notes_to_self python=3.10 pandas matplotlib jupyterlab altair
$ pytest *.py
```

## testing and coverage

```
pytest
coverage run -m pytest
coverage html
firefox htmlcov/index.html
coverage erase
```

# Shell

## link
Expand Down
2 changes: 1 addition & 1 deletion labelling.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def format_to_base_10(
else:
sgn = ""
if m >= 0:
short_form = num / 1000.0 ** m
short_form = num / 1000.0**m
short_form_template = f"{short_form:.{precision}f}"
if precision > 0 and trim_0_decimals:
if short_form == int(short_form):
Expand Down
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
numpy
pandas
altair
matplotlib
pytest
ipython
black
coverage
42 changes: 34 additions & 8 deletions simpler_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,44 @@
}


def to_datetime_helper(ser, format="%b %Y", trim_at=10):
    """Convert `ser` to datetimes and show the original strings that failed.

    Values that cannot be parsed with `format` become NaT in the result and
    are printed next to their index label so they can be inspected.
    A `format` of `%b %Y` corresponds to e.g. 'Jan 2023'.
    `to_datetime` seems to skip whitespace.

    Parameters
    ----------
    ser : pd.Series holding the original (typically string) values
    format : strptime-style format handed to `pd.to_datetime`
    trim_at : maximum number of failing rows to print

    Returns
    -------
    pd.Series of datetimes, NaT where the conversion failed or where the
    input was already missing.
    """
    ser_nat = pd.to_datetime(ser, errors="coerce", format=format)
    # only a NaT that came from a non-missing input is a conversion error,
    # a value that was already missing has nothing to report
    mask = ser_nat.isna() & ser.notna()
    nbr_errors = mask.sum()
    print(f"{nbr_errors} errors seen in conversion")
    if nbr_errors > trim_at:
        print(f"{nbr_errors} is too many errors, trimming to {trim_at}")
    # keep the failures up until the trim point, built as a new mask rather
    # than mutating `mask` in place
    mask_to_show = mask & (mask.cumsum() <= trim_at)
    for idx, value in ser[mask_to_show].items():
        print(f"Row {idx} '{value}'")
    return ser_nat


def check_series_is_ordered(ser, ascending=True):
    """Check that 1 series is ordered, equal neighbouring values are allowed.

    The earlier comparison lined up `ser.shift()[1:]` with `ser[:-1]`, which
    are the same elements, so it compared the series with itself and could
    not detect an unordered series. The pandas monotonic properties compare
    each value with its predecessor instead.

    Parameters
    ----------
    ser : pd.Series (or pd.Index) to check
    ascending : True to check for non-decreasing order, False to check for
        non-increasing order

    Returns
    -------
    bool, True if `ser` is ordered in the requested direction.
    """
    if ascending:
        return bool(ser.is_monotonic_increasing)
    return bool(ser.is_monotonic_decreasing)


def show_df_details(df):
    """Print block-level storage details of `df`.

    Digs into the hidden `_data` attribute, note the is_consolidated check
    can be slow the first time.
    """
    import warnings

    with warnings.catch_warnings():
        # 2023 we get DeprecationWarning: DataFrame._data is deprecated and will be removed in a future version. Use public APIs instead.
        # silence only that category so unrelated warnings still surface
        warnings.simplefilter("ignore", DeprecationWarning)
        # fetch once so the deprecated accessor is only touched a single time
        mgr = df._data
        print(
            f"""is view {mgr.is_view}, is consolidated {mgr.is_consolidated()}, single block {mgr.is_single_block}"""
        )
        print(f"""{mgr.nblocks} blocks looking like:""")
        print(mgr.blocks)


def sanity_check(df):
Expand Down Expand Up @@ -201,3 +225,5 @@ def apply_labelling(ser, format_fn=None, **kwargs):
counted_vc = counted.value_counts()
counted_vc.index = apply_labelling(counted_vc.index, format_to_base_10, prefix="$")
show_all(counted_vc)

show_df_details(df)
57 changes: 41 additions & 16 deletions simpler_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


def calculate_ci(arr):
"""Calculate a 90% CI"""
p = arr.mean()
q = 1 - p
n = arr.shape[0]
Expand All @@ -28,28 +29,52 @@ def calculate_ci(arr):
return p - se_95, p, p + se_95


def calculate_bootstraps(arr, repeats=1000, agg_fn=np.sum, seed=None):
    """Calculate a bootstrap statistic (default `sum`)

    Given an array calculate `repeats` bootstrap samples (each drawn with
    replacement and the same length as `arr`) taking the `agg_fn` of each
    and return `repeats` results.

    Parameters
    ----------
    arr : array-like, resampled along its first axis
    repeats : number of bootstrap samples to draw
    agg_fn : callable applied to each resampled array
    seed : optional seed, if given a fresh generator is built from it so the
        draws are reproducible, otherwise the module-level `rng` is used

    Returns
    -------
    np.ndarray holding the `repeats` aggregated values, in draw order.

    Raises
    ------
    ValueError if `arr` is empty.
    """
    # positional indexing on a plain array, a pandas object would otherwise
    # be indexed by label
    values = np.asarray(arr)
    n = values.shape[0]
    if n == 0:
        raise ValueError("Cannot bootstrap an empty array")
    generator = rng if seed is None else np.random.default_rng(seed)
    aggs = [agg_fn(values[generator.integers(0, n, n)]) for _ in range(repeats)]
    return np.array(aggs)


def calculate_bootstrap_ci(
    arr, percentiles=(0.025, 0.5, 0.975), repeats=1000, agg_fn=np.mean, seed=None
):
    """Bootstrap CI

    Given percentiles, calculate a repeated statistic (default is the mean)
    on the bootstrap and return the values at the matching percentiles.

    Parameters
    ----------
    arr : array-like to bootstrap
    percentiles : fractions in [0, 1], the default gives the 2.5th, 50th and
        97.5th percentiles
    repeats : number of bootstrap samples to draw
    agg_fn : callable applied to each bootstrap sample
    seed : optional seed passed on to `calculate_bootstraps`

    Returns
    -------
    np.ndarray with one value per requested percentile.

    Raises
    ------
    ValueError if a percentile lies outside [0, 1] or `arr` is empty.
    """
    fractions = np.asarray(percentiles, dtype=float)
    if ((fractions < 0) | (fractions > 1)).any():
        raise ValueError("percentiles must be fractions between 0 and 1")
    # a fraction of 1.0 would index one past the end, clamp to the last value
    perc = np.minimum((fractions * repeats).astype(int), repeats - 1)
    aggs = np.sort(calculate_bootstraps(arr, repeats, agg_fn, seed=seed))
    return aggs[perc]


def test_calculate_bootstrap():
    """Check `calculate_bootstraps` on a constant array and on random ints."""
    arr = np.ones(10)
    bootstraps = calculate_bootstraps(arr, repeats=10_000)
    # given 10 * 1 in arr, we expect 10 as the result (no variance)
    np.testing.assert_equal(bootstraps[0], 10)
    assert np.var(bootstraps) == 0, "Not expecting any variance"
    assert bootstraps.shape == (10_000,)

    HIGH = 100
    SIZE = 100
    arr = np.random.randint(0, high=HIGH, size=SIZE)
    # randint excludes `high` so the largest possible drawn value is HIGH - 1
    largest = HIGH - 1
    bootstraps = calculate_bootstraps(arr, repeats=1_000, agg_fn=np.sum)
    assert bootstraps.min() >= 0
    assert bootstraps.max() <= SIZE * largest
    bootstraps = calculate_bootstraps(arr, repeats=1_000, agg_fn=np.mean)
    assert bootstraps.min() >= 0
    # a mean is bounded by the value range, not by the number of items
    assert bootstraps.max() <= largest


if __name__ == "__main__":
    # demo run: 1000 draws of 0 or 1 from the module-level generator
    arr = rng.binomial(1, 0.5, 1000)
    # turn the 0/1 draws into booleans, True where the draw was 0
    arr = arr < 0.01
    print(calculate_ci(arr))
    print(calculate_bootstrap_ci(arr))
    print(calculate_bootstrap_ci(arr, repeats=10_000))
17 changes: 16 additions & 1 deletion test_simpler_pandas.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from datetime import datetime

from simpler_pandas import (
make_bin_edges,
Expand All @@ -12,12 +13,26 @@
label_interval,
show_df_details,
show_all,
to_datetime_helper,
)
from labelling import format_to_base_10

# TODO value_counts_pct has no tests yet


def test_to_datetime_helper():
    """Check parsed values and where the NaT lands for an unparseable item."""
    res = to_datetime_helper(pd.Series(["Jan 2023", "Feb 2024"]))
    expected = [datetime(2023, 1, 1), datetime(2024, 2, 1)]
    assert (res == expected).all()

    # it is harder to do a comparison on a series as we can't also
    # compare a pd.NaT along with valid values, we need to use .isna()
    # so check the position of the NaT and compare the valid values separately
    res = to_datetime_helper(pd.Series(["Jan 2023", "Feb 2024", "xx"]))
    assert len(res) == 3
    assert res.isna().tolist() == [False, False, True]
    assert (res.iloc[:2] == expected).all()


def test_show_all(capsys):
# TODO run coverage, currently we don't test everything
df = pd.DataFrame({"a": [1, 2, 3]})
Expand All @@ -35,7 +50,7 @@ def test_show_df_details(capsys):
show_df_details(df)
captured = capsys.readouterr()
assert "is view False" in captured.out
assert "numeric mixed True" in captured.out
assert "is consolidated True, single block True" in captured.out


# TODO replace with warns check https://docs.pytest.org/en/latest/how-to/capture-warnings.html#warns
Expand Down

0 comments on commit 1d873e4

Please sign in to comment.