
Commit

BUG: Incorrect dtypes inferred on datetimelike looking series & on xs slices (GH9477)
jreback committed Feb 24, 2015
1 parent 9796a9f commit cdf611a
Showing 7 changed files with 60 additions and 32 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
@@ -246,6 +246,7 @@ Bug Fixes
 - Fixed bug in ``to_sql`` ``dtype`` argument not accepting an instantiated
   SQLAlchemy type (:issue:`9083`).
 - Bug in ``.loc`` partial setting with a ``np.datetime64`` (:issue:`9516`)
+- Incorrect dtypes inferred on datetimelike looking series & on xs slices (:issue:`9477`)
 
 - Items in ``Categorical.unique()`` (and ``s.unique()`` if ``s`` is of dtype ``category``) now appear in the order in which they are originally found, not in sorted order (:issue:`9331`). This is now consistent with the behavior for other dtypes in pandas.

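A minimal illustration of the behaviour this entry describes, taken from the test added later in this commit: when dtype=object is given explicitly, datetimelike-looking values must not be re-inferred.

    from pandas import Series, Timestamp

    # dtype=object is given explicitly, so no datetimelike inference
    # should be applied (GH 9477)
    s = Series([Timestamp('20130101'), 'NOV'], dtype=object)
    assert s.dtype == object
    assert s.iloc[0] == Timestamp('20130101')
    assert s.iloc[1] == 'NOV'
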
9 changes: 5 additions & 4 deletions pandas/core/common.py
@@ -2030,6 +2030,7 @@ def _possibly_infer_to_datetimelike(value, convert_dates=False):
     Parameters
     ----------
     value : np.array
     convert_dates : boolean, default False
        if True try really hard to convert dates (such as datetime.date), other
        leave inferred dtype 'date' alone
@@ -2068,9 +2069,9 @@ def _try_timedelta(v):
     inferred_type = lib.infer_dtype(sample)
 
     if inferred_type in ['datetime', 'datetime64'] or (convert_dates and inferred_type in ['date']):
-        value = _try_datetime(v)
+        value = _try_datetime(v).reshape(shape)
     elif inferred_type in ['timedelta', 'timedelta64']:
-        value = _try_timedelta(v)
+        value = _try_timedelta(v).reshape(shape)
 
     # its possible to have nulls intermixed within the datetime or timedelta
     # these will in general have an inferred_type of 'mixed', so have to try
@@ -2081,9 +2082,9 @@ def _try_timedelta(v):
     elif inferred_type in ['mixed']:
 
         if lib.is_possible_datetimelike_array(_ensure_object(v)):
-            value = _try_timedelta(v)
+            value = _try_timedelta(v).reshape(shape)
             if lib.infer_dtype(value) in ['mixed']:
-                value = _try_datetime(v)
+                value = _try_datetime(v).reshape(shape)
 
     return value

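The new .reshape(shape) calls matter because _possibly_infer_to_datetimelike can now be handed a 2-D object block: inference runs on a flattened view, so the converted values have to be put back into the block's original shape. A rough sketch of that idea using only public API (pd.to_datetime stands in for the private _try_datetime helper, so the names differ from the code above):

    import numpy as np
    import pandas as pd

    # a 2-D object block whose values all look like datetimes
    values = np.array([['2016-01-22', '2019-09-07'],
                       ['2016-01-23', '2019-09-08']], dtype=object)

    shape = values.shape
    flat = values.ravel()                       # infer/convert on a 1-D view
    converted = pd.to_datetime(flat).values     # datetime64[ns] ndarray
    converted = converted.reshape(shape)        # restore the original block shape

    assert converted.dtype == np.dtype('M8[ns]')
    assert converted.shape == shape
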
25 changes: 20 additions & 5 deletions pandas/core/frame.py
@@ -27,7 +27,7 @@
                                 _default_index, _maybe_upcast, is_sequence,
                                 _infer_dtype_from_scalar, _values_from_object,
                                 is_list_like, _get_dtype, _maybe_box_datetimelike,
-                                is_categorical_dtype)
+                                is_categorical_dtype, is_object_dtype, _possibly_infer_to_datetimelike)
 from pandas.core.generic import NDFrame, _shared_docs
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import (maybe_droplevels,
@@ -396,7 +396,15 @@ def _get_axes(N, K, index=index, columns=columns):
                 raise_with_traceback(e)
 
         index, columns = _get_axes(*values.shape)
-        return create_block_manager_from_blocks([values.T], [columns, index])
+        values = values.T
+
+        # if we don't have a dtype specified, then try to convert objects
+        # on the entire block; this is to convert if we have datetimelike's
+        # embedded in an object type
+        if dtype is None and is_object_dtype(values):
+            values = _possibly_infer_to_datetimelike(values)
+
+        return create_block_manager_from_blocks([values], [columns, index])
 
     @property
     def axes(self):
@@ -1537,7 +1545,7 @@ def _sizeof_fmt(num, size_qualifier):
             # cases (e.g., it misses categorical data even with object
             # categories)
             size_qualifier = ('+' if 'object' in counts
-                              or self.index.dtype.kind == 'O' else '')
+                              or is_object_dtype(self.index) else '')
             mem_usage = self.memory_usage(index=True).sum()
             lines.append("memory usage: %s\n" %
                          _sizeof_fmt(mem_usage, size_qualifier))
@@ -2257,6 +2265,8 @@ def reindexer(value):
 
         elif (isinstance(value, Index) or is_sequence(value)):
             from pandas.core.series import _sanitize_index
+
+            # turn me into an ndarray
             value = _sanitize_index(value, self.index, copy=False)
             if not isinstance(value, (np.ndarray, Index)):
                 if isinstance(value, list) and len(value) > 0:
@@ -2267,6 +2277,11 @@ def reindexer(value):
                 value = value.copy().T
             else:
                 value = value.copy()
+
+            # possibly infer to datetimelike
+            if is_object_dtype(value.dtype):
+                value = _possibly_infer_to_datetimelike(value.ravel()).reshape(value.shape)
+
         else:
             # upcast the scalar
             dtype, value = _infer_dtype_from_scalar(value)
@@ -2341,7 +2356,7 @@ def lookup(self, row_labels, col_labels):
         for i, (r, c) in enumerate(zip(row_labels, col_labels)):
             result[i] = self.get_value(r, c)
 
-        if result.dtype == 'O':
+        if is_object_dtype(result):
             result = lib.maybe_convert_objects(result)
 
         return result
@@ -4232,7 +4247,7 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
             values = self.values
             result = f(values)
 
-            if result.dtype == np.object_:
+            if is_object_dtype(result.dtype):
                 try:
                     if filter_type is None or filter_type == 'numeric':
                         result = result.astype(np.float64)
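
With the _init_ndarray change above, a 2-D object ndarray handed to the DataFrame constructor is scanned for embedded datetimelikes only when no dtype is given. A small sketch of the intended behaviour after this commit (the dtype comments state the expectation, not captured output):

    import numpy as np
    import pandas as pd

    arr = np.array([[pd.Timestamp('2016-01-22'), pd.Timestamp('2019-09-07')],
                    [pd.Timestamp('2016-01-23'), pd.Timestamp('2019-09-08')]],
                   dtype=object)

    # no dtype given: the object block is inspected and converted,
    # so the columns come out as datetime64[ns]
    df = pd.DataFrame(arr)

    # dtype=object given explicitly: no inference is attempted,
    # so the columns stay object
    df_obj = pd.DataFrame(arr, dtype=object)
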
7 changes: 5 additions & 2 deletions pandas/core/generic.py
@@ -1467,8 +1467,11 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True):
             if not is_list_like(new_values) or self.ndim == 1:
                 return _maybe_box_datetimelike(new_values)
 
-            result = Series(new_values, index=self.columns,
-                            name=self.index[loc])
+            result = Series(new_values,
+                            index=self.columns,
+                            name=self.index[loc],
+                            copy=copy,
+                            dtype=new_values.dtype)
 
         else:
             result = self.iloc[loc]
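
The xs fix passes the cross-section's dtype (and copy) straight into the Series constructor, so a row sliced out of a mixed-dtype frame stays object instead of being re-inferred to datetime. An illustration adapted from the test added in this commit (the test itself exercises .loc, which goes through the same cross-section path):

    import pandas as pd

    belly = '216 3T19'.split()
    wing1 = '2T15 4H19'.split()
    wing2 = '416 4T20'.split()
    mat = pd.to_datetime('2016-01-22 2019-09-07'.split())
    df = pd.DataFrame({'wing1': wing1, 'wing2': wing2, 'mat': mat}, index=belly)

    # a row of a mixed frame is an object Series; before this fix the
    # datetimelike-looking values could cause the row to be re-inferred
    row = df.xs('3T19')
    assert row.dtype == object
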
23 changes: 3 additions & 20 deletions pandas/core/internals.py
@@ -13,8 +13,8 @@
                                 ABCSparseSeries, _infer_dtype_from_scalar,
                                 is_null_datelike_scalar, _maybe_promote,
                                 is_timedelta64_dtype, is_datetime64_dtype,
-                                _possibly_infer_to_datetimelike, array_equivalent,
-                                _maybe_convert_string_to_object, is_categorical)
+                                array_equivalent, _maybe_convert_string_to_object,
+                                is_categorical)
 from pandas.core.index import Index, MultiIndex, _ensure_index
 from pandas.core.indexing import maybe_convert_indices, length_of_indexer
 from pandas.core.categorical import Categorical, maybe_to_categorical
@@ -2074,25 +2074,8 @@ def make_block(values, placement, klass=None, ndim=None,
         klass = ComplexBlock
     elif is_categorical(values):
         klass = CategoricalBlock
-
     else:
-
-        # we want to infer here if its a datetimelike if its object type
-        # this is pretty strict in that it requires a datetime/timedelta
-        # value IN addition to possible nulls/strings
-        # an array of ONLY strings will not be inferred
-        if np.prod(values.shape):
-            result = _possibly_infer_to_datetimelike(values)
-            vtype = result.dtype.type
-            if issubclass(vtype, np.datetime64):
-                klass = DatetimeBlock
-                values = result
-            elif (issubclass(vtype, np.timedelta64)):
-                klass = TimeDeltaBlock
-                values = result
-
-        if klass is None:
-            klass = ObjectBlock
+        klass = ObjectBlock
 
     return klass(values, ndim=ndim, fastpath=fastpath,
                  placement=placement)
4 changes: 3 additions & 1 deletion pandas/io/packers.py
@@ -490,7 +490,9 @@ def decode(obj):
         index = obj['index']
         return globals()[obj['klass']](unconvert(obj['data'], dtype,
                                                  obj['compress']),
-                                       index=index, name=obj['name'])
+                                       index=index,
+                                       dtype=dtype,
+                                       name=obj['name'])
     elif typ == 'block_manager':
         axes = obj['axes']
 
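The decode change feeds the stored dtype back into the Series constructor so that unpacking does not re-run dtype inference on object data. A hedged sketch of a round-trip using the experimental msgpack API of that era (to_msgpack/read_msgpack were later removed from pandas; this is not captured from the commit itself):

    import pandas as pd
    from pandas import Series, Timestamp

    s = Series([Timestamp('20130101'), 'NOV'], dtype=object)

    # pack to bytes and unpack again; the stored dtype should be honoured
    packed = s.to_msgpack()
    result = pd.read_msgpack(packed)

    assert result.dtype == object
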
23 changes: 23 additions & 0 deletions pandas/tests/test_series.py
@@ -569,6 +569,7 @@ def test_scalar_conversion(self):
         self.assertEqual(int(Series([1.])), 1)
         self.assertEqual(long(Series([1.])), 1)
 
+
     def test_astype(self):
         s = Series(np.random.randn(5),name='foo')
 
@@ -778,6 +779,28 @@ def test_constructor_dtype_nocast(self):
         s2[1] = 5
         self.assertEqual(s[1], 5)
 
+    def test_constructor_datelike_coercion(self):
+
+        # GH 9477
+        # incorrectly inferring on datetimelike-looking data when object dtype is specified
+        s = Series([Timestamp('20130101'), 'NOV'], dtype=object)
+        self.assertEqual(s.iloc[0], Timestamp('20130101'))
+        self.assertEqual(s.iloc[1], 'NOV')
+        self.assertTrue(s.dtype == object)
+
+        # the dtype was being reset on the slicing and re-inferred to datetime
+        # even though the blocks are mixed
+        belly = '216 3T19'.split()
+        wing1 = '2T15 4H19'.split()
+        wing2 = '416 4T20'.split()
+        mat = pd.to_datetime('2016-01-22 2019-09-07'.split())
+        df = pd.DataFrame({'wing1': wing1, 'wing2': wing2, 'mat': mat}, index=belly)
+
+        result = df.loc['3T19']
+        self.assertTrue(result.dtype == object)
+        result = df.loc['216']
+        self.assertTrue(result.dtype == object)
+
     def test_constructor_dtype_datetime64(self):
         import pandas.tslib as tslib
 
