BUG: NaN values not converted to Stata missing values
Stata does not correctly handle NaNs, so these must be replaced with Stata
missing values (. by default).  The fix checks floating point columns for NaN
and replaces any found with the Stata numeric code for (.).  Previously only
one of the file-writing code paths handled this case, via a last-minute check
at write time; that check has been removed since the replacement now happens
up front.
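
A minimal round-trip sketch of the intended behavior (the file name and values
here are illustrative, not part of the change):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'f32': pd.Series([1.0, np.nan], dtype=np.float32),
                       'f64': pd.Series([np.nan, 2.0], dtype=np.float64)})
    df.to_stata('nan_example.dta')   # NaNs are written as the Stata missing value (.)
    back = pd.read_stata('nan_example.dta')
    # Stata stores '.' rather than IEEE NaN; pandas maps it back to NaN on read,
    # so the float columns round-trip with their missing entries intact.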

The write_index option was also being silently ignored because it was never
passed through to StataWriter.  This has been fixed, and a number of tests that
relied on the incorrect behavior have been corrected.
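
For the write_index fix, a short sketch of the expected behavior once the
keyword is forwarded to StataWriter (the path name is illustrative):

    import pandas as pd

    df = pd.DataFrame({'x': [1.0, 2.0]})
    df.to_stata('no_index.dta', write_index=False)
    cols = pd.read_stata('no_index.dta').columns
    # Previously the keyword never reached StataWriter, so an 'index' column was
    # written regardless; with the fix only the 'x' column should appear.
    assert 'index' not in cols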

Also contains additional tests covering edge cases uncovered while developing
the fix.
bashtage committed Mar 23, 2014
1 parent 66cf19a commit 88c4c55
Showing 4 changed files with 129 additions and 22 deletions.
1 change: 1 addition & 0 deletions doc/source/release.rst
@@ -269,6 +269,7 @@ Bug Fixes
- Bug in ``DataFrame.to_stata`` when columns have non-string names (:issue:`4558`)
- Bug in compat with ``np.compress``, surfaced in (:issue:`6658`)
- Bug in binary operations with a rhs of a Series not aligning (:issue:`6681`)
- Bug in ``DataFrame.to_stata`` which incorrectly handles nan values and ignores the ``write_index`` keyword argument (:issue:`6685`)

pandas 0.13.1
-------------
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Expand Up @@ -1258,7 +1258,8 @@ def to_stata(
from pandas.io.stata import StataWriter
writer = StataWriter(fname, self, convert_dates=convert_dates,
encoding=encoding, byteorder=byteorder,
time_stamp=time_stamp, data_label=data_label)
time_stamp=time_stamp, data_label=data_label,
write_index=write_index)
writer.write_file()

@Appender(fmt.docstring_to_string, indents=1)
23 changes: 17 additions & 6 deletions pandas/io/stata.py
@@ -990,8 +990,6 @@ def _dtype_to_stata_type(dtype):
return chr(255)
elif dtype == np.float32:
return chr(254)
elif dtype == np.int64:
return chr(253)
elif dtype == np.int32:
return chr(253)
elif dtype == np.int16:
@@ -1025,8 +1023,6 @@ def _dtype_to_default_stata_fmt(dtype):
return "%10.0g"
elif dtype == np.float32:
return "%9.0g"
elif dtype == np.int64:
return "%9.0g"
elif dtype == np.int32:
return "%12.0g"
elif dtype == np.int8 or dtype == np.int16:
@@ -1108,6 +1104,21 @@ def _write(self, to_write):
self._file.write(to_write)


def _replace_nans(self, data):
"""Checks floating point data columns for nans, and replaces these with
the generic Stata for missing value (.)"""
for c in data:
dtype = data[c].dtype
if dtype in (np.float32, np.float64):
if dtype == np.float32:
replacement = self.MISSING_VALUES['f']
else:
replacement = self.MISSING_VALUES['d']
data[c] = data[c].fillna(replacement)

return data

def _check_column_names(self, data):
"""Checks column names to ensure that they are valid Stata column names.
This includes checks for:
@@ -1197,6 +1208,8 @@ def __iter__(self):
data = _cast_to_stata_types(data)
# Ensure column names are strings
data = self._check_column_names(data)
# Replace NaNs with Stata missing values
data = self._replace_nans(data)
self.datarows = DataFrameRowIter(data)
self.nobs, self.nvar = data.shape
self.data = data
@@ -1340,8 +1353,6 @@ def _write_data_dates(self):
var = _pad_bytes(var, typ)
self._write(var)
else:
if isnull(var): # this only matters for floats
var = MISSING_VALUES[TYPE_MAP[typ]]
self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var))

def _null_terminate(self, s, as_string=False):
124 changes: 109 additions & 15 deletions pandas/io/tests/test_stata.py
@@ -13,7 +13,8 @@
import pandas as pd
from pandas.core.frame import DataFrame, Series
from pandas.io.parsers import read_csv
from pandas.io.stata import read_stata, StataReader, InvalidColumnName
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
PossiblePrecisionLoss)
import pandas.util.testing as tm
from pandas.util.misc import is_little_endian
from pandas import compat
@@ -142,8 +143,7 @@ def test_read_dta2(self):
parsed_117 = self.read_dta(self.dta2_117)
# 113 is buggy due to limits in date format support in Stata
# parsed_113 = self.read_dta(self.dta2_113)

np.testing.assert_equal(
tm.assert_equal(
len(w), 1) # should get a warning for that format.

# buggy test because of the NaT comparison on certain platforms
@@ -206,7 +206,7 @@ def test_read_write_dta5(self):
original.index.name = 'index'

with tm.ensure_clean() as path:
original.to_stata(path, None, False)
original.to_stata(path, None)
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
original)
@@ -221,7 +221,7 @@ def test_write_dta6(self):
original['quarter'] = original['quarter'].astype(np.int32)

with tm.ensure_clean() as path:
original.to_stata(path, None, False)
original.to_stata(path, None)
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
original)
@@ -257,7 +257,7 @@ def test_read_write_dta10(self):
original['integer'] = original['integer'].astype(np.int32)

with tm.ensure_clean() as path:
original.to_stata(path, {'datetime': 'tc'}, False)
original.to_stata(path, {'datetime': 'tc'})
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
original)
@@ -295,9 +295,9 @@ def test_read_write_dta11(self):

with tm.ensure_clean() as path:
with warnings.catch_warnings(record=True) as w:
original.to_stata(path, None, False)
np.testing.assert_equal(
len(w), 1) # should get a warning for that format.
original.to_stata(path, None)
# should get a warning for that format.
tm.assert_equal(len(w), 1)

written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
@@ -324,13 +324,12 @@ def test_read_write_dta12(self):

with tm.ensure_clean() as path:
with warnings.catch_warnings(record=True) as w:
original.to_stata(path, None, False)
np.testing.assert_equal(
len(w), 1) # should get a warning for that format.
original.to_stata(path, None)
tm.assert_equal(len(w), 1) # should get a warning for that format.

written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)

def test_read_write_dta13(self):
s1 = Series(2**9, dtype=np.int16)
s2 = Series(2**17, dtype=np.int32)
@@ -366,7 +365,7 @@ def test_read_write_reread_dta14(self):
tm.assert_frame_equal(parsed_114, parsed_115)

with tm.ensure_clean() as path:
parsed_114.to_stata(path, {'date_td': 'td'}, write_index=False)
parsed_114.to_stata(path, {'date_td': 'td'})
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114)

@@ -406,7 +405,7 @@ def test_numeric_column_names(self):
with warnings.catch_warnings(record=True) as w:
tm.assert_produces_warning(original.to_stata(path), InvalidColumnName)
# should produce a single warning
np.testing.assert_equal(len(w), 1)
tm.assert_equal(len(w), 1)

written_and_read_again = self.read_dta(path)
written_and_read_again = written_and_read_again.set_index('index')
@@ -415,7 +414,102 @@
written_and_read_again.columns = map(convert_col_name, columns)
tm.assert_frame_equal(original, written_and_read_again)

def test_nan_to_missing_value(self):
s1 = Series(np.arange(4.0), dtype=np.float32)
s2 = Series(np.arange(4.0), dtype=np.float64)
s1[::2] = np.nan
s2[1::2] = np.nan
original = DataFrame({'s1': s1, 's2': s2})
original.index.name = 'index'
with tm.ensure_clean() as path:
original.to_stata(path)
written_and_read_again = self.read_dta(path)
written_and_read_again = written_and_read_again.set_index('index')
tm.assert_frame_equal(written_and_read_again, original)

def test_no_index(self):
columns = ['x', 'y']
original = DataFrame(np.reshape(np.arange(10.0), (5, 2)),
columns=columns)
original.index.name = 'index_not_written'
with tm.ensure_clean() as path:
original.to_stata(path, write_index=False)
written_and_read_again = self.read_dta(path)
tm.assertRaises(KeyError,
lambda: written_and_read_again['index_not_written'])

def test_string_no_dates(self):
s1 = Series(['a', 'A longer string'])
s2 = Series([1.0, 2.0], dtype=np.float64)
original = DataFrame({'s1': s1, 's2': s2})
original.index.name = 'index'
with tm.ensure_clean() as path:
original.to_stata(path)
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
original)

def test_large_value_conversion(self):
s0 = Series([1, 99], dtype=np.int8)
s1 = Series([1, 127], dtype=np.int8)
s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
original.index.name = 'index'
with tm.ensure_clean() as path:
with warnings.catch_warnings(record=True) as w:
tm.assert_produces_warning(original.to_stata(path),
PossiblePrecisionLoss)
# should produce a single warning
tm.assert_equal(len(w), 1)

written_and_read_again = self.read_dta(path)
modified = original.copy()
modified['s1'] = Series(modified['s1'], dtype=np.int16)
modified['s2'] = Series(modified['s2'], dtype=np.int32)
modified['s3'] = Series(modified['s3'], dtype=np.float64)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
modified)

def test_dates_invalid_column(self):
original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
original.index.name = 'index'
with tm.ensure_clean() as path:
with warnings.catch_warnings(record=True) as w:
tm.assert_produces_warning(original.to_stata(path, {0: 'tc'}),
InvalidColumnName)
tm.assert_equal(len(w), 1)

written_and_read_again = self.read_dta(path)
modified = original.copy()
modified.columns = ['_0']
tm.assert_frame_equal(written_and_read_again.set_index('index'),
modified)

def test_date_export_formats(self):
columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
conversions = dict(((c, c) for c in columns))
data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
original = DataFrame([data], columns=columns)
original.index.name = 'index'
expected_values = [datetime(2006, 11, 20, 23, 13, 20), # Time
datetime(2006, 11, 20), # Day
datetime(2006, 11, 19), # Week
datetime(2006, 11, 1), # Month
datetime(2006, 10, 1), # Quarter year
datetime(2006, 7, 1), # Half year
datetime(2006, 1, 1)] # Year

expected = DataFrame([expected_values], columns=columns)
expected.index.name = 'index'
with tm.ensure_clean() as path:
original.to_stata(path, conversions)
written_and_read_again = self.read_dta(path)
tm.assert_frame_equal(written_and_read_again.set_index('index'),
expected)


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
