Commit a4191c6

Merge branch 'master' of https://github.com/jreback/pandas into pytablesv4

jreback committed Nov 27, 2012
2 parents 4b76509 + f6d48ca
Showing 53 changed files with 1,343 additions and 672 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -11,4 +11,4 @@ install:

script:
- python setup.py build_ext install
- nosetests --exe -w /tmp pandas.tests
- nosetests --exe -w /tmp -A "not slow" pandas
12 changes: 11 additions & 1 deletion RELEASE.rst
@@ -31,21 +31,30 @@ pandas 0.10.0

- Add error handling to Series.str.encode/decode (#2276)
- Add ``where`` and ``mask`` to Series (#2337)
- Grouped histogram via `by` keyword in Series/DataFrame.hist (#2186)
- Support optional ``min_periods`` keyword in ``corr`` and ``cov``
for both Series and DataFrame (#2002)

**API Changes**

- ``names`` handling in file parsing: if explicit column ``names`` are passed,
  the ``header`` argument will be respected. If there is an existing header
  row, this can rename the columns. To fix legacy code, pass ``header=None``
  when passing ``names`` (see the sketch after this list)
- DataFrame selection using a boolean frame now preserves input shape
- If function passed to Series.apply yields a Series, result will be a
DataFrame (#2316)
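
A minimal sketch of the legacy fix (``data.csv`` and the column names are
hypothetical)::

    import pandas as pd

    # pre-0.10 behavior: explicit names implied a headerless file; now make
    # that explicit by passing header=None
    df = pd.read_csv('data.csv', names=['a', 'b', 'c'], header=None)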

**Improvements to existing features**

- Grouped histogram via `by` keyword in Series/DataFrame.hist (#2186)
- Add ``nrows`` option to DataFrame.from_records for iterators (#1794)
- Unstack/reshape algorithm rewrite to avoid high memory use in cases where
the number of observed key-tuples is much smaller than the total possible
number that could occur (#2278). Also improves performance in most cases.
- Support duplicate columns in DataFrame.from_records (#2179)
- Add ``normalize`` option to Series/DataFrame.asfreq (#2137)
- SparseSeries and SparseDataFrame construction from empty and scalar
values now no longer create dense ndarrays unnecessarily (#2322)

**Bug fixes**

@@ -62,6 +71,7 @@ pandas 0.10.0
(#2295)
- Respect dtype=object in DataFrame constructor (#2291)
- Fix DatetimeIndex.join bug with tz-aware indexes and how='outer' (#2317)
- ``pop(...)`` and ``del`` work with DataFrames with duplicate columns (#2349)

pandas 0.9.1
============
58 changes: 43 additions & 15 deletions doc/source/computation.rst
@@ -62,6 +62,21 @@ among the series in the DataFrame, also excluding NA/null values.
frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
frame.cov()
``DataFrame.cov`` also supports an optional ``min_periods`` keyword that
specifies the required minimum number of observations for each column pair
in order to have a valid result.

.. ipython:: python

   frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c'])
   frame.ix[:5, 'a'] = np.nan
   frame.ix[5:10, 'b'] = np.nan
   frame.cov()
   frame.cov(min_periods=12)

.. _computation.correlation:

Correlation
@@ -97,6 +112,19 @@ All of these are currently computed using pairwise complete observations.
Note that non-numeric columns will be automatically excluded from the
correlation calculation.

Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword:

.. ipython:: python

   frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c'])
   frame.ix[:5, 'a'] = np.nan
   frame.ix[5:10, 'b'] = np.nan
   frame.corr()
   frame.corr(min_periods=12)

A related method ``corrwith`` is implemented on DataFrame to compute the
correlation between like-labeled Series contained in different DataFrame
objects.
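
A minimal hedged sketch of ``corrwith`` (``df1`` and ``df2`` are illustrative
frames with partially overlapping columns):

.. ipython:: python

   df1 = DataFrame(randn(10, 3), columns=['a', 'b', 'c'])
   df2 = DataFrame(randn(10, 3), columns=['b', 'c', 'd'])
   df1.corrwith(df2)
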
@@ -290,9 +318,9 @@ columns using ``ix`` indexing:
Expanding window moment functions
---------------------------------

A common alternative to rolling statistics is to use an *expanding* window,
which yields the value of the statistic with all the data available up to that
point in time. As these calculations are a special case of rolling statistics,
they are implemented in pandas such that the following two calls are equivalent:

.. ipython:: python
@@ -301,7 +329,7 @@ they are implemented in pandas such that the following two calls are equivalent:
expanding_mean(df)[:5]

Like the ``rolling_`` functions, the following methods are included in the
``pandas`` namespace or can be located in ``pandas.stats.moments``.

.. csv-table::
@@ -324,12 +352,12 @@ Like the ``rolling_`` functions, the following methods are included in the
``expanding_corr``, Correlation (binary)
``expanding_corr_pairwise``, Pairwise correlation of DataFrame columns

Aside from not having a ``window`` parameter, these functions have the same
interfaces as their ``rolling_`` counterpart. Like above, the parameters they
all accept are:

- ``min_periods``: threshold of non-null data points to require. Defaults to
  minimum needed to compute statistic. No ``NaNs`` will be output once
  ``min_periods`` non-null data points have been seen.
- ``freq``: optionally specify a :ref:`frequency string <timeseries.alias>`
  or :ref:`DateOffset <timeseries.offsets>` to pre-conform the data to.
@@ -338,15 +366,15 @@ all accept are:

.. note::

   The ``rolling_`` and ``expanding_`` functions do not return a ``NaN`` if
   there are at least ``min_periods`` non-null values in the current window.
   This differs from ``cumsum``, ``cumprod``, ``cummax``, and ``cummin``,
   which return ``NaN`` in the output wherever a ``NaN`` is encountered in
   the input.
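
A minimal sketch of the difference, assuming a short Series with one missing
value:

.. ipython:: python

   sn = Series([1., 2., np.nan, 4.])
   expanding_mean(sn)  # no NaN once min_periods (default 1) values are seen
   sn.cumsum()         # NaN appears in the output wherever the input has NaN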

An expanding window statistic will be more stable (and less responsive) than
its rolling window counterpart as the increasing window size decreases the
relative impact of an individual data point. As an example, here is the
``expanding_mean`` output for the previous time series dataset:

.. ipython:: python
72 changes: 64 additions & 8 deletions doc/source/indexing.rst
@@ -190,6 +190,7 @@ Using a boolean vector to index a Series works exactly as in a numpy ndarray:
s[s > 0]
s[(s < 0) & (s > -0.5)]
s[(s < -1) | (s > 1)]

You may select rows from a DataFrame using a boolean vector the same length as
the DataFrame's index (for example, something derived from one of the columns
@@ -231,22 +232,77 @@ Note, with the :ref:`advanced indexing <indexing.advanced>` ``ix`` method, you
may select along more than one axis using boolean vectors combined with other
indexing expressions.

Indexing a DataFrame with a boolean DataFrame
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Where and Masking
~~~~~~~~~~~~~~~~~

You may wish to set values on a DataFrame based on some boolean criteria
derived from itself or another DataFrame or set of DataFrames. This can be done
intuitively like so:
Selecting values from a Series with a boolean vector generally returns a subset of the data.
To guarantee that selection output has the same shape as the original data, you can use the
``where`` method in ``Series`` and ``DataFrame``.

.. ipython:: python

   # return only the selected rows
   s[s > 0]

   # return a Series of the same shape as the original
   s.where(s > 0)

Selecting values from a DataFrame with a boolean criterion now also preserves
input data shape. ``where`` is used under the hood as the implementation.

.. ipython:: python

   # return a DataFrame of the same shape as the original
   # this is equivalent to ``df.where(df < 0)``
   df[df < 0]

In addition, ``where`` takes an optional ``other`` argument for replacement of values where the
condition is False, in the returned copy.

.. ipython:: python

   df.where(df < 0, -df)

You may wish to set values based on some boolean criteria.
This can be done intuitively like so:

.. ipython:: python

   s2 = s.copy()
   s2[s2 < 0] = 0
   s2

   df2 = df.copy()
   df2 < 0
   df2[df2 < 0] = 0
   df2

Note that such an operation requires that the boolean DataFrame is indexed
exactly the same.
Furthermore, ``where`` aligns the input boolean condition (ndarray or
DataFrame), such that partial selection with setting is possible. This is
analogous to partial setting via ``.ix`` (but on the contents rather than the
axis labels).

.. ipython:: python

   df2 = df.copy()
   df2[df2[1:4] > 0] = 3
   df2

By default, ``where`` returns a modified copy of the data. There is an optional parameter ``inplace``
so that the original data can be modified without creating a copy:

.. ipython:: python

   df_orig = df.copy()
   df_orig.where(df > 0, -df, inplace=True);
   df_orig

``mask`` is the inverse boolean operation of ``where``.

.. ipython:: python

   s.mask(s >= 0)
   df.mask(df >= 0)

Take Methods
12 changes: 8 additions & 4 deletions doc/source/r_interface.rst
@@ -15,10 +15,14 @@ rpy2 / R interface
If your computer has R and rpy2 (> 2.2) installed (which will be left to the
reader), you will be able to leverage the below functionality. On Windows,
doing this is quite an ordeal at the moment, but users on Unix-like systems
should find it quite easy. rpy2 evolves in time and the current interface is
designed for the 2.2.x series, and we recommend to use over other series
unless you are prepared to fix parts of the code. Released packages are available
in PyPi, but should the latest code in the 2.2.x series be wanted it can be obtained with:
should find it quite easy. rpy2 evolves over time and is currently
approaching its 2.3 release, while the current interface is designed for the
2.2.x series. We recommend using 2.2.x over other series unless you are
prepared to fix parts of the code. That said, rpy2 2.3.0 introduces
improvements such as a better R-Python bridge memory management layer, so it
might be a good idea to bite the bullet and submit patches for the few minor
differences that need to be fixed.


::

22 changes: 11 additions & 11 deletions pandas/core/algorithms.py
@@ -117,16 +117,17 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
"""
values = np.asarray(values)
is_datetime = com.is_datetime64_dtype(values)
hash_klass, values = _get_data_algo(values, _hashtables)
(hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)

uniques = []
table = hash_klass(len(values))
labels, counts = table.get_labels(values, uniques, 0, na_sentinel)
uniques = vec_klass()
labels = table.get_labels(values, uniques, 0, na_sentinel)

labels = com._ensure_platform_int(labels)

uniques = com._asarray_tuplesafe(uniques)
if sort and len(counts) > 0:
uniques = uniques.to_array()

if sort and len(uniques) > 0:
sorter = uniques.argsort()
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))
@@ -136,12 +137,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
np.putmask(labels, mask, -1)

uniques = uniques.take(sorter)
counts = counts.take(sorter)

if is_datetime:
uniques = np.array(uniques, dtype='M8[ns]')
uniques = uniques.view('M8[ns]')

return labels, uniques, counts
return labels, uniques
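
A minimal sketch of the updated calling convention (illustrative, not part of
the commit; note the ``counts`` return value is gone)::

    import numpy as np
    from pandas.core.algorithms import factorize

    labels, uniques = factorize(np.array(['b', 'a', 'b', 'c']), sort=True)
    # labels  -> array([1, 0, 1, 2])
    # uniques -> array(['a', 'b', 'c'], dtype=object)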


def value_counts(values, sort=True, ascending=False):
@@ -325,7 +325,7 @@ def group_position(*args):
}

_hashtables = {
'float64': lib.Float64HashTable,
'int64': lib.Int64HashTable,
'generic': lib.PyObjectHashTable
'float64': (lib.Float64HashTable, lib.Float64Vector),
'int64': (lib.Int64HashTable, lib.Int64Vector),
'generic': (lib.PyObjectHashTable, lib.ObjectVector)
}
4 changes: 2 additions & 2 deletions pandas/core/categorical.py
@@ -53,9 +53,9 @@ def from_array(cls, data):
labels, levels = data.factorize()
else:
try:
labels, levels, _ = factorize(data, sort=True)
labels, levels = factorize(data, sort=True)
except TypeError:
labels, levels, _ = factorize(data, sort=False)
labels, levels = factorize(data, sort=False)

return Categorical(labels, levels,
name=getattr(data, 'name', None))
19 changes: 18 additions & 1 deletion pandas/core/common.py
@@ -587,7 +587,7 @@ def _is_bool_indexer(key):

def _default_index(n):
from pandas.core.index import Int64Index
values = np.arange(n)
values = np.arange(n, dtype=np.int64)
result = values.view(Int64Index)
result.name = None
return result
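
A hedged note on the explicit dtype (an assumption about intent:
``np.arange(n)`` defaults to the platform integer, which is 32-bit on some
builds and thus cannot be viewed as an ``Int64Index``)::

    import numpy as np

    np.arange(5).dtype                  # platform int; int32 on 32-bit builds
    np.arange(5, dtype=np.int64).dtype  # always int64
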
@@ -699,6 +699,23 @@ def iterpairs(seq):

return itertools.izip(seq_it, seq_it_next)

def split_ranges(mask):
    """ Generates tuples of ranges which cover all True values in mask
    >>> list(split_ranges([1, 0, 0, 1, 0]))
    [(0, 1), (3, 4)]
    """
    ranges = [(0, len(mask))]

    for pos, val in enumerate(mask):
        if not val:  # this pos should be omitted, split off the prefix range
            r = ranges.pop()
            if pos > r[0]:  # yield non-zero range
                yield (r[0], pos)
            if pos + 1 < len(mask):  # save the rest for processing
                ranges.append((pos + 1, len(mask)))
    if ranges:
        yield ranges[-1]
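
A quick illustrative call (not part of the commit) showing how contiguous runs
of True are reported::

    list(split_ranges([True, True, False, True, True, True]))
    # -> [(0, 2), (3, 6)]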

def indent(string, spaces=4):
dent = ' ' * spaces
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/format.py
@@ -79,7 +79,7 @@ def _get_footer(self):
if getattr(self.series.index, 'freq', None):
footer += 'Freq: %s' % self.series.index.freqstr

if footer and self.series.name:
if footer and self.series.name is not None:
footer += ', '

series_name = com.pprint_thing(self.series.name)
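
A hedged illustration of the pitfall in the old truthiness check (assumption:
a Series legitimately named ``0`` has a falsy name)::

    from pandas import Series

    s = Series([1, 2], name=0)   # falsy but valid name
    bool(s.name)                 # False -- the old check skipped the separator
    s.name is not None           # True  -- the corrected check includes it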