Merge branch 'master' of github.com:optimdata/chrony

optimdata · Dec 3, 2015 · 5b16267 · 5b16267
2 parents 8ffd58a + e5866bb
commit 5b16267
Show file tree

Hide file tree

Showing 7 changed files with 127 additions and 26 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,15 @@
+language: python
+
+python:
+  - 3.4
+
+env:
+
+install:
+  - pip install -r requirements.txt
+  - pip install coveralls
+
+script:
+  - nosetests --with-cover --cover-package chrony chrony
+
+after_success: coveralls
diff --git a/README.md b/README.md
@@ -1,5 +1,8 @@
 # chrony
 
+[![Build Status](https://travis-ci.org/optimdata/chrony.svg?branch=master)](https://travis-ci.org/optimdata/chrony)
+[![Coverage Status](https://coveralls.io/repos/optimdata/chrony/badge.svg?branch=master&service=github)](https://coveralls.io/github/optimdata/chrony?branch=master)
+
 Timeseries analysis tools with specific focus on timespans. Built on top of pandas.
 
 ## tldr
@@ -82,3 +85,8 @@ Check out tests for examples.
 A **timespan** is a row of a `pandas.DataFrame` which represents a period of time between two fixed points. These are represented using a beg and an end column.
 
 
+### Development
+
+#### Tests
+
+    nosetests chrony --with-coverage --cover-package chrony
diff --git a/chrony/charting.py b/chrony/charting.py
@@ -4,10 +4,7 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
-
-
-def compute_category_index(categories):
-    return {category: index + 1 for index, category in enumerate(sorted(set(categories)))}
+from .core import compute_category_index
 
 
 def plot_events(categories, xmin, xmax, labels=None, xlim=None, linewidth=10):

diff --git a/chrony/core.py b/chrony/core.py
@@ -2,3 +2,23 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import numpy as np
+import pandas as pd
+
+
+def compute_category_index(categories):
+    '''
+        Map every distinct category to its 1-based rank in sorted order.
+
+        Duplicates are collapsed first, so ['b', 'a', 'b'] gives {'a': 1, 'b': 2} and an empty input gives {}.
+    '''
+    ranked = enumerate(sorted(set(categories)), start=1)
+    return {category: rank for rank, category in ranked}
+
+
+def weighted_interpolate(serie, weights):
+    '''
+        Fill the missing values of ``serie`` by interpolating along the cumulated ``weights``.
+
+        Between two known values, a missing position receives
+        ``before + fraction * (after - before)`` where ``fraction`` is the share of the
+        cumulated weight already consumed since the previous known value, relative to the
+        cumulated weight separating the two known values.
+
+        ``serie`` and ``weights`` are expected to share the same index. Known values are
+        returned unchanged. Positions located before the first or after the last known
+        value stay missing, and so do gaps over which the cumulated weight does not move
+        (0 / 0).
+    '''
+    known = serie.notnull()
+    # Closest known value before / after each position.
+    value_before = serie.ffill()
+    value_after = serie.bfill()
+    cumulated = weights.cumsum()
+    # Cumulated weight sampled at the known positions only, then spread to the
+    # neighbouring gaps. ``where`` yields a float series directly, which avoids the
+    # object-dtype scratch series that ``pd.Series(None, ...)`` can produce.
+    anchors = cumulated.where(known)
+    weight_before = anchors.ffill()
+    weight_after = anchors.bfill()
+    # At known positions this is 0 / 0 (NaN); those entries are restored by ``update`` below.
+    fraction = (cumulated - weight_before) / (weight_after - weight_before)
+    result = value_before + fraction * (value_after - value_before)
+    result.update(serie)
+    return result
diff --git a/chrony/tests.py b/chrony/tests.py
@@ -8,9 +8,9 @@
 import pytz
 import unittest
 
-from .charting import compute_category_index
+from .core import compute_category_index, weighted_interpolate
 from .exceptions import BadLengthsError, BegPosteriorToEndError, OverlapError, NotSortedError, HasTimezoneError, IntegrityError
-from .timespans import audit_timespan, describe_timespan, to_stamps, to_spans, compute_segments
+from .timespans import audit_timespan, describe_timespan, to_stamps, to_spans, compute_segments, clean_overlap_timespan, fill_na_dataframe
 
 pd.set_option('display.width', 1000)
 
@@ -45,6 +45,12 @@ def test_all(self):
         begs = pd.date_range('1970-1-1', freq='d', periods=2).to_series().reset_index(drop=True)
         ends = pd.date_range('1970-1-2', freq='d', periods=2).to_series().reset_index(drop=True)
         describe_timespan(begs, ends)
+        describe_timespan(pd.Series(), pd.Series())
+        self.assertIsNone(audit_timespan(pd.Series(), pd.Series()))
+        begs = pd.date_range('1970-1-1', freq='d', periods=2).to_series().reset_index(drop=True)
+        ends = pd.date_range('1970-1-3', freq='d', periods=2).to_series().reset_index(drop=True)
+        ret = pd.to_datetime(['1970-1-2', '1970-1-4']).to_series().reset_index(drop=True)
+        pd.util.testing.assert_series_equal(ret, clean_overlap_timespan(begs, ends))
         # self.assertTrue(pd.Series().equals(describe_timespan(begs, ends)))
 
     def test_merge(self):
@@ -75,6 +81,15 @@ def test_merge(self):
             'value_d': [10., 20., 30.],
             'value_s': ['10', '20', '30']
         }, columns=stamp_columns)
+        df2b = pd.DataFrame({
+            'ts': pd.to_datetime(['2015-1-1', '2015-1-2', '2015-1-3']),
+            'beg_state_d': [1., 2., -1.],
+            'end_state_d': [-1., 1., 2.],
+            'beg_state_s': ['1', '2', 'UNDEFINED'],
+            'end_state_s': ['UNDEFINED', '1', '2'],
+            'value_d': [10., 20., 30.],
+            'value_s': ['10', '20', '30']
+        }, columns=stamp_columns)
         df3 = pd.DataFrame(
             to_stamps(
                 df1,
@@ -84,6 +99,8 @@ def test_merge(self):
             columns=stamp_columns
         )
         pd.util.testing.assert_frame_equal(df3, df2)
+        fill_na_dataframe(df3)
+        pd.util.testing.assert_frame_equal(df3, df2b)
         df4 = pd.DataFrame(
             to_spans(
                 df3,
@@ -120,8 +137,14 @@ def test_compute_segments(self):
         )
 
 
-class ChartingCase(unittest.TestCase):
+class CoreCase(unittest.TestCase):
     def test_all(self):
         self.assertTrue(compute_category_index([]) == {})
         self.assertTrue(compute_category_index(['a']) == {'a': 1})
         self.assertTrue(compute_category_index(['b', 'a', 'b']) == {'a': 1, 'b': 2})
+
+    def test_weighted_interpolate(self):
+        s = pd.Series([0, np.nan, np.nan, 1, np.nan, np.nan, np.nan, 2])
+        w = pd.Series([0, 1, 0, 1, 1, 2, 0, 1])
+        r = pd.Series([0, .5, .5, 1, 1.25, 1.75, 1.75, 2])
+        pd.util.testing.assert_series_equal(weighted_interpolate(s, w), r)
diff --git a/chrony/timespans.py b/chrony/timespans.py
@@ -8,18 +8,19 @@
 
 
 def audit_timespan(begs, ends):
+    if begs.empty and ends.empty:
+        return
     if begs.dt.tz or ends.dt.tz:
         raise HasTimezoneError
     if len(begs) != len(ends):
         raise BadLengthsError
     for beg, end in zip(begs, ends):
         if beg > end:
             raise BegPosteriorToEndError
-    for i in range(len(begs) - 1):
-        if begs[i + 1] < begs[i]:
-            raise NotSortedError
-        if ends[i] > begs[i + 1]:
-            raise OverlapError('At row %s end %s is posterior to %s' % (i, ends[i], begs[i + 1]))
+    if (begs < begs.shift()).sum():
+        raise NotSortedError
+    if (ends.shift() > begs)[1:].sum():
+        raise OverlapError
 
 
 def audit_timespan_print(begs, ends):
@@ -42,6 +43,9 @@ def audit_timespan_print(begs, ends):
 
 
 def describe_timespan(begs, ends):
+    if begs.empty and ends.empty:
+        print('Empty series')
+        return
     contiguous_transitions = (begs == ends.shift()).sum()
     coverage = (ends - begs).sum().total_seconds() / (ends[len(ends) - 1] - begs[0]).total_seconds()
     metrics = (
@@ -56,6 +60,23 @@ def describe_timespan(begs, ends):
     return retval
 
 
+def clean_overlap_timespan(begs, ends):
+    '''
+        Return, for each row, the earlier of its own end and the beg of the following row.
+
+        The last row has no following beg (the shifted value is missing) and the row-wise minimum skips missing values, so its own end is returned.
+    '''
+    next_begs = begs.shift(-1)
+    candidates = pd.DataFrame({'ts_end': ends, 'ts_end_shifted': next_begs})
+    return candidates.min(axis=1)
+
+
+def fill_na_series(series):
+    '''
+        Replace, in place, the missing values of ``series`` with a sentinel.
+
+        Object-dtype series receive the string 'UNDEFINED'; every other dtype receives -1. Nothing is returned.
+    '''
+    holds_objects = series.dtype.char == 'O'
+    sentinel = 'UNDEFINED' if holds_objects else -1
+    series.fillna(sentinel, inplace=True)
+
+
+def fill_na_dataframe(df):
+    '''
+        Replace, in place, the missing values of every column of ``df`` whose name starts with 'beg_' or 'end_'.
+
+        The replacement value is chosen per column by ``fill_na_series``. Columns with any other prefix are left untouched. Nothing is returned.
+    '''
+    for column in df.columns:
+        if column.startswith(('beg_', 'end_')):
+            # ``df[column]`` may be a copy rather than a view of the frame (always the
+            # case under pandas Copy-on-Write), in which case filling it in place would
+            # silently leave ``df`` unchanged. Fill a detached copy and write it back.
+            filled = df[column].copy()
+            fill_na_series(filled)
+            df[column] = filled
+
+
 def to_stamps(df, state_columns, value_columns, beg_col='ts_beg', end_col='ts_end'):
     '''
         Convert an frame representing periods (eg each row has a beg and end) to a frame representing change of periods.
@@ -140,20 +161,21 @@ def to_spans(df, state_columns, value_columns, beg_col='ts_beg', end_col='ts_end
     return pd.DataFrame(dict(list(df_beg.to_dict('series').items()) + list(df_end.to_dict('series').items())))
 
 
-def merge_spans(spans, stamps, columns_states):
-    for key in ('beg', 'end'):
-        spans['ts'] = spans['ts_%s' % key]
-        spans = pd.merge(stamps, spans, how='outer', on='ts')
-        spans.set_index('ts', inplace=True)
-        spans.sort_index(inplace=True)
-        for column in columns_states:
-            spans['%s_%s' % (column, key)] = spans.pop(column).interpolate(method='time')
-            spans['%s_%s' % (column, key)].fillna(method='ffill', inplace=True)
-            spans['%s_%s' % (column, key)].fillna(method='bfill', inplace=True)
-        spans.reset_index(inplace=True)
-        spans.pop('ts')
-        spans = spans[~pd.isnull(spans['ts_%s' % key])]
-    return spans
+# def merge_spans(left, right):
+
+    # for key in ('beg', 'end'):
+    #     spans['ts'] = spans['ts_%s' % key]
+    #     spans = pd.merge(stamps, spans, how='outer', on='ts')
+    #     spans.set_index('ts', inplace=True)
+    #     spans.sort_index(inplace=True)
+    #     for column in columns_states:
+    #         spans['%s_%s' % (column, key)] = spans.pop(column).interpolate(method='time')
+    #         spans['%s_%s' % (column, key)].fillna(method='ffill', inplace=True)
+    #         spans['%s_%s' % (column, key)].fillna(method='bfill', inplace=True)
+    #     spans.reset_index(inplace=True)
+    #     spans.pop('ts')
+    #     spans = spans[~pd.isnull(spans['ts_%s' % key])]
+    # return spans
 
 
 def compute_segments(df, columns):

diff --git a/setup.py b/setup.py
@@ -0,0 +1,16 @@
+# -*- coding: utf-8 -*-
+
+from distutils.core import setup
+from setuptools import find_packages
+
+# Package metadata for the chrony distribution.
+# NOTE(review): `setup` is imported from distutils above while `find_packages` (and the
+# `include_package_data` option) belong to setuptools; distutils was removed from the
+# standard library in Python 3.12 -- consider importing `setup` from setuptools too.
+setup(
+    name='chrony',
+    version='0.1.0',
+    author='Guillaume Thomas',
+    author_email='[email protected]',
+    license='LICENSE',  # NOTE(review): looks like a file name rather than a license identifier -- confirm
+    description='Timeseries analysis tools with specific focus on timespans. Built on top of pandas.',
+    url='https://github.com/optimdata/chrony',
+    include_package_data=True,
+    packages=find_packages(),  # package list is discovered by setuptools rather than listed by hand
+)