Skip to content

Commit

Permalink
Change default of float_precision for read_csv and read_table to "hig…
Browse files Browse the repository at this point in the history
  • Loading branch information
Dr-Irv authored Sep 13, 2020
1 parent 5ad15f8 commit a3c4dc8
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 8 deletions.
13 changes: 13 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,19 @@ For example:
buffer = io.BytesIO()
data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
.. _whatsnew_read_csv_table_precision_default:

Change in default floating precision for ``read_csv`` and ``read_table``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For the C parsing engine, the methods :meth:`read_csv` and :meth:`read_table` previously defaulted to a parser that
could read floating point numbers slightly incorrectly with respect to the last bit in precision.
The option ``float_precision="high"`` has always been available to avoid this issue.
Beginning with this version, the default is now to use the more accurate parser by making
``float_precision=None`` correspond to the high precision parser, and the new option
``float_precision="legacy"`` to use the legacy parser. The change to using the higher precision
parser by default should have no impact on performance. (:issue:`17154`)

.. _whatsnew_120.enhancements.other:

Other enhancements
Expand Down
7 changes: 5 additions & 2 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -476,10 +476,13 @@ cdef class TextReader:
if float_precision == "round_trip":
# see gh-15140
self.parser.double_converter = round_trip
elif float_precision == "high":
elif float_precision == "legacy":
self.parser.double_converter = xstrtod
elif float_precision == "high" or float_precision is None:
self.parser.double_converter = precise_xstrtod
else:
self.parser.double_converter = xstrtod
raise ValueError(f'Unrecognized float_precision option: '
f'{float_precision}')

if isinstance(dtype, dict):
dtype = {k: pandas_dtype(dtype[k])
Expand Down
7 changes: 4 additions & 3 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,9 +338,9 @@
option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
Specifies which converter the C engine should use for floating-point
values. The options are `None` for the ordinary converter,
`high` for the high-precision converter, and `round_trip` for the
round-trip converter.
values. The options are `None` or `high` for the ordinary converter,
`legacy` for the original lower precision pandas converter, and
`round_trip` for the round-trip converter.
Returns
-------
Expand Down Expand Up @@ -2284,6 +2284,7 @@ def TextParser(*args, **kwds):
values. The options are None or 'high' for the ordinary converter,
'legacy' for the original lower precision pandas converter, and
'round_trip' for the round-trip converter.

.. versionchanged:: 1.2
"""
kwds["engine"] = "python"
return TextFileReader(*args, **kwds)
Expand Down
27 changes: 24 additions & 3 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,9 @@ def test_precise_conversion(c_parser_only):
# 25 decimal digits of precision
text = f"a\n{num:.25}"

normal_val = float(parser.read_csv(StringIO(text))["a"][0])
normal_val = float(
parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
)
precise_val = float(
parser.read_csv(StringIO(text), float_precision="high")["a"][0]
)
Expand Down Expand Up @@ -608,7 +610,7 @@ def test_unix_style_breaks(c_parser_only):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
@pytest.mark.parametrize(
"data,thousands,decimal",
[
Expand Down Expand Up @@ -646,7 +648,7 @@ def test_1000_sep_with_decimal(
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
@pytest.mark.parametrize(
"value,expected",
[
Expand Down Expand Up @@ -702,3 +704,22 @@ def test_1000_sep_decimal_float_precision(
)
val = df.iloc[0, 0]
assert val == expected


def test_float_precision_options(c_parser_only):
    """Check how the C parser reacts to each ``float_precision`` option.

    Asserts three things for the single-column input ``243.164``:

    * the default (no ``float_precision`` given) and ``"high"`` produce
      identical frames;
    * ``"legacy"`` produces a different parsed value than the default;
    * an unrecognized option raises ``ValueError`` with a descriptive
      message.
    """
    # GH 17154, 36228
    parser = c_parser_only
    s = "foo\n243.164\n"
    df = parser.read_csv(StringIO(s))
    df2 = parser.read_csv(StringIO(s), float_precision="high")

    # Default and "high" are expected to select the same converter.
    tm.assert_frame_equal(df, df2)

    df3 = parser.read_csv(StringIO(s), float_precision="legacy")

    # The legacy converter is expected to parse this literal differently.
    assert df.iloc[0, 0] != df3.iloc[0, 0]

    msg = "Unrecognized float_precision option: junk"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(s), float_precision="junk")

0 comments on commit a3c4dc8

Please sign in to comment.