TST: Test for float --> int in the middle of object dtype

bingo · Nov 1, 2013 · 1ea301e · 1ea301e
1 parent f490600
commit 1ea301e
Show file tree

Hide file tree

Showing 6 changed files with 14 additions and 2 deletions.
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1839,6 +1839,13 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`.
        df1.to_excel(writer, sheet_name='Sheet1')
        df2.to_excel(writer, sheet_name='Sheet2')
 
+.. note:: Wringing a little more performance out of ``read_excel``
+    Internally, Excel stores all numeric data as floats. Because this can
+    produce unexpected behavior when reading in data, pandas defaults to trying
+    to convert integral floats to integers if it doesn't lose information
+    (``1.0 --> 1``).  You can pass ``convert_float=False`` to disable this
+    behavior, which may give a slight performance improvement.
+
 .. _io.excel.writers:
 
 Excel writer engines

diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -207,6 +207,8 @@ Improvements to existing features
     closed])
   - Fixed bug in `tools.plotting.andrews_curves` so that lines are drawn grouped
     by color as expected.
+  - ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int
+    by default. (:issue:`5394`)
 
 API Changes
 ~~~~~~~~~~~

diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -273,7 +273,8 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
                     elif typ == XL_CELL_BOOLEAN:
                         value = bool(value)
                     elif convert_float and typ == XL_CELL_NUMBER:
-                        # Excel 'numbers' are always floats
+                        # GH5394 - Excel 'numbers' are always floats
+                        # it's a minimal perf hit and less surprising
                         val = int(value)
                         if val == value:
                             value = val

diff --git a/pandas/io/tests/data/test_types.xls b/pandas/io/tests/data/test_types.xls
diff --git a/pandas/io/tests/data/test_types.xlsx b/pandas/io/tests/data/test_types.xlsx
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -315,7 +315,8 @@ def test_reader_special_dtypes(self):
             ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
             ("BoolCol", [True, False, True, True, False]),
             ("StrCol", [1, 2, 3, 4, 5]),
-            ("Str2Col", ["a", "b", "c", "d", "e"]),
+            # GH5394 - this is why convert_float isn't vectorized
+            ("Str2Col", ["a", 3, "c", "d", "e"]),
             ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
                          datetime(1905, 1, 1), datetime(2013, 12, 14),
                          datetime(2015, 3, 14)])
@@ -332,6 +333,7 @@ def test_reader_special_dtypes(self):
         # if not coercing number, then int comes in as float
         float_expected = expected.copy()
         float_expected["IntCol"] = float_expected["IntCol"].astype(float)
+        float_expected.loc[1, "Str2Col"] = 3.0
         for path in (xls_path, xlsx_path):
             actual = read_excel(path, 'Sheet1', convert_float=False)
             tm.assert_frame_equal(actual, float_expected)