Change default of float_precision for read_csv and read_table to "high" (pandas-dev#36228)
adana315 · Sep 13, 2020 · a3c4dc8 · a3c4dc8
1 parent 5ad15f8
commit a3c4dc8
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 8 deletions.
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -96,6 +96,19 @@ For example:
    buffer = io.BytesIO()
    data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
 
+.. _whatsnew_read_csv_table_precision_default:
+
+Change in default floating precision for ``read_csv`` and ``read_table``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For the C parsing engine, the methods :meth:`read_csv` and :meth:`read_table` previously defaulted to a parser that
+could read floating point numbers slightly incorrectly with respect to the last bit in precision.
+The option ``float_precision="high"`` has always been available to avoid this issue.
+Beginning with this version, the default is now to use the more accurate parser by making
+``float_precision=None`` correspond to the high precision parser, and the new option
+``float_precision="legacy"`` to use the legacy parser. The change to using the higher precision
+parser by default should have no impact on performance. (:issue:`17154`)
+
 .. _whatsnew_120.enhancements.other:
 
 Other enhancements

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -476,10 +476,13 @@ cdef class TextReader:
         if float_precision == "round_trip":
             # see gh-15140
             self.parser.double_converter = round_trip
-        elif float_precision == "high":
+        elif float_precision == "legacy":
+            self.parser.double_converter = xstrtod
+        elif float_precision == "high" or float_precision is None:
             self.parser.double_converter = precise_xstrtod
         else:
-            self.parser.double_converter = xstrtod
+            raise ValueError(f'Unrecognized float_precision option: '
+                             f'{float_precision}')
 
         if isinstance(dtype, dict):
             dtype = {k: pandas_dtype(dtype[k])

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -338,9 +338,9 @@
     option can improve performance because there is no longer any I/O overhead.
 float_precision : str, optional
     Specifies which converter the C engine should use for floating-point
-    values. The options are `None` for the ordinary converter,
-    `high` for the high-precision converter, and `round_trip` for the
-    round-trip converter.
+    values. The options are `None` or `high` for the default high-precision
+    converter, `legacy` for the original lower precision pandas converter, and
+    `round_trip` for the round-trip converter.
 
 Returns
 -------
@@ -2284,6 +2284,7 @@ def TextParser(*args, **kwds):
         values. The options are None for the ordinary converter,
         'high' for the high-precision converter, and 'round_trip' for the
         round-trip converter.
+        .. versionchanged:: 1.2
     """
     kwds["engine"] = "python"
     return TextFileReader(*args, **kwds)

diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
@@ -160,7 +160,9 @@ def test_precise_conversion(c_parser_only):
         # 25 decimal digits of precision
         text = f"a\n{num:.25}"
 
-        normal_val = float(parser.read_csv(StringIO(text))["a"][0])
+        normal_val = float(
+            parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
+        )
         precise_val = float(
             parser.read_csv(StringIO(text), float_precision="high")["a"][0]
         )
@@ -608,7 +610,7 @@ def test_unix_style_breaks(c_parser_only):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
+@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
 @pytest.mark.parametrize(
     "data,thousands,decimal",
     [
@@ -646,7 +648,7 @@ def test_1000_sep_with_decimal(
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
+@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
 @pytest.mark.parametrize(
     "value,expected",
     [
@@ -702,3 +704,22 @@ def test_1000_sep_decimal_float_precision(
     )
     val = df.iloc[0, 0]
     assert val == expected
+
+
+def test_float_precision_options(c_parser_only):
+    # GH 17154, 36228
+    parser = c_parser_only
+    s = "foo\n243.164\n"
+    df = parser.read_csv(StringIO(s))
+    df2 = parser.read_csv(StringIO(s), float_precision="high")
+
+    tm.assert_frame_equal(df, df2)
+
+    df3 = parser.read_csv(StringIO(s), float_precision="legacy")
+
+    assert not df.iloc[0, 0] == df3.iloc[0, 0]
+
+    msg = "Unrecognized float_precision option: junk"
+
+    with pytest.raises(ValueError, match=msg):
+        parser.read_csv(StringIO(s), float_precision="junk")