Skip to content

Commit

Permalink
Merge pull request numpy#13491 from eric-wieser/npy-2.1
Browse files Browse the repository at this point in the history
 BUG/ENH: Create npy format 3.0 to support extended unicode characters in dtypes
  • Loading branch information
mattip authored May 19, 2019
2 parents b74e419 + ecb4024 commit a86c7e5
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 54 deletions.
6 changes: 6 additions & 0 deletions doc/release/1.17.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,18 @@ Floating point scalars implement ``as_integer_ratio`` to match the builtin float
This returns a (numerator, denominator) pair, which can be used to construct a
`fractions.Fraction`.

``.npy`` files support unicode field names
------------------------------------------
A new format version of 3.0 has been introduced, which enables structured types
with non-latin1 field names. This is used automatically when needed.

`numpy.packbits` and `numpy.unpackbits` accept an ``order`` keyword
-------------------------------------------------------------------
The ``order`` keyword defaults to ``big``, and will order the **bits**
accordingly. For ``'big'`` 3 will become ``[0, 0, 0, 0, 0, 0, 1, 1]``, and
``[1, 1, 0, 0, 0, 0, 0, 0]`` for ``little``


Improvements
============

Expand Down
132 changes: 78 additions & 54 deletions numpy/lib/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,13 @@
"The next 4 bytes form a little-endian unsigned int: the length of the header
data HEADER_LEN."
Format Version 3.0
------------------
This version replaces the ASCII string (which in practice was latin1) with
a utf8-encoded string, so supports structured types with any unicode field
names.
Notes
-----
The ``.npy`` format, including motivation for creating it and a comparison of
Expand All @@ -162,7 +169,7 @@
import warnings
from numpy.lib.utils import safe_eval
from numpy.compat import (
asbytes, asstr, isfileobj, long, os_fspath, pickle
isfileobj, long, os_fspath, pickle
)


Expand All @@ -173,10 +180,16 @@

# difference between version 1.0 and 2.0 is a 4 byte (I) header length
# instead of 2 bytes (H) allowing storage of large structured arrays
_header_size_info = {
(1, 0): ('<H', 'latin1'),
(2, 0): ('<I', 'latin1'),
(3, 0): ('<I', 'utf8'),
}


def _check_version(version):
if version not in [(1, 0), (2, 0), None]:
msg = "we only support format version (1,0) and (2, 0), not %s"
if version not in [(1, 0), (2, 0), (3, 0), None]:
msg = "we only support format version (1,0), (2,0), and (3,0), not %s"
raise ValueError(msg % (version,))

def magic(major, minor):
Expand Down Expand Up @@ -326,6 +339,56 @@ def header_data_from_array_1_0(array):
d['descr'] = dtype_to_descr(array.dtype)
return d


def _wrap_header(header, version):
"""
Takes a stringified header, and attaches the prefix and padding to it
"""
import struct
assert version is not None
fmt, encoding = _header_size_info[version]
if not isinstance(header, bytes): # always true on python 3
header = header.encode(encoding)
hlen = len(header) + 1
padlen = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize(fmt) + hlen) % ARRAY_ALIGN)
try:
header_prefix = magic(*version) + struct.pack(fmt, hlen + padlen)
except struct.error:
msg = "Header length {} too big for version={}".format(hlen, version)
raise ValueError(msg)

# Pad the header with spaces and a final newline such that the magic
# string, the header-length short and the header are aligned on a
# ARRAY_ALIGN byte boundary. This supports memory mapping of dtypes
# aligned up to ARRAY_ALIGN on systems like Linux where mmap()
# offset must be page-aligned (i.e. the beginning of the file).
return header_prefix + header + b' '*padlen + b'\n'


def _wrap_header_guess_version(header):
"""
Like `_wrap_header`, but chooses an appropriate version given the contents
"""
try:
return _wrap_header(header, (1, 0))
except ValueError:
pass

try:
ret = _wrap_header(header, (2, 0))
except UnicodeEncodeError:
pass
else:
warnings.warn("Stored array in format 2.0. It can only be"
"read by NumPy >= 1.9", UserWarning, stacklevel=2)
return ret

header = _wrap_header(header, (3, 0))
warnings.warn("Stored array in format 3.0. It can only be"
"read by NumPy >= 1.17", UserWarning, stacklevel=2)
return header


def _write_array_header(fp, d, version=None):
""" Write the header for an array and returns the version used
Expand All @@ -339,48 +402,19 @@ def _write_array_header(fp, d, version=None):
None means use oldest that works
explicit version will raise a ValueError if the format does not
allow saving this data. Default: None
Returns
-------
version : tuple of int
the file version which needs to be used to store the data
"""
import struct
header = ["{"]
for key, value in sorted(d.items()):
# Need to use repr here, since we eval these when reading
header.append("'%s': %s, " % (key, repr(value)))
header.append("}")
header = "".join(header)
header = asbytes(_filter_header(header))

hlen = len(header) + 1 # 1 for newline
padlen_v1 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<H') + hlen) % ARRAY_ALIGN)
padlen_v2 = ARRAY_ALIGN - ((MAGIC_LEN + struct.calcsize('<I') + hlen) % ARRAY_ALIGN)

# Which version(s) we write depends on the total header size; v1 has a max of 65535
if hlen + padlen_v1 < 2**16 and version in (None, (1, 0)):
version = (1, 0)
header_prefix = magic(1, 0) + struct.pack('<H', hlen + padlen_v1)
topad = padlen_v1
elif hlen + padlen_v2 < 2**32 and version in (None, (2, 0)):
version = (2, 0)
header_prefix = magic(2, 0) + struct.pack('<I', hlen + padlen_v2)
topad = padlen_v2
header = _filter_header(header)
if version is None:
header = _wrap_header_guess_version(header)
else:
msg = "Header length %s too big for version=%s"
msg %= (hlen, version)
raise ValueError(msg)

# Pad the header with spaces and a final newline such that the magic
# string, the header-length short and the header are aligned on a
# ARRAY_ALIGN byte boundary. This supports memory mapping of dtypes
# aligned up to ARRAY_ALIGN on systems like Linux where mmap()
# offset must be page-aligned (i.e. the beginning of the file).
header = header + b' '*topad + b'\n'

fp.write(header_prefix)
header = _wrap_header(header, version)
fp.write(header)
return version

def write_array_header_1_0(fp, d):
""" Write the header for an array using the 1.0 format.
Expand Down Expand Up @@ -483,7 +517,7 @@ def _filter_header(s):
Parameters
----------
s : byte string
s : string
Npy file header.
Returns
Expand All @@ -501,7 +535,7 @@ def _filter_header(s):
tokens = []
last_token_was_number = False
# adding newline as python 2.7.5 workaround
string = asstr(s) + "\n"
string = s + "\n"
for token in tokenize.generate_tokens(StringIO(string).readline):
token_type = token[0]
token_string = token[1]
Expand All @@ -523,16 +557,15 @@ def _read_array_header(fp, version):
# Read an unsigned, little-endian short int which has the length of the
# header.
import struct
if version == (1, 0):
hlength_type = '<H'
elif version == (2, 0):
hlength_type = '<I'
else:
hinfo = _header_size_info.get(version)
if hinfo is None:
raise ValueError("Invalid version {!r}".format(version))
hlength_type, encoding = hinfo

hlength_str = _read_bytes(fp, struct.calcsize(hlength_type), "array header length")
header_length = struct.unpack(hlength_type, hlength_str)[0]
header = _read_bytes(fp, header_length, "array header")
header = header.decode(encoding)

# The header is a pretty-printed string representation of a literal
# Python dictionary with trailing newlines padded to a ARRAY_ALIGN byte
Expand Down Expand Up @@ -607,12 +640,7 @@ def write_array(fp, array, version=None, allow_pickle=True, pickle_kwargs=None):
"""
_check_version(version)
used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
version)
# this warning can be removed when 1.9 has aged enough
if version != (2, 0) and used_ver == (2, 0):
warnings.warn("Stored array in format 2.0. It can only be"
"read by NumPy >= 1.9", UserWarning, stacklevel=2)
_write_array_header(fp, header_data_from_array_1_0(array), version)

if array.itemsize == 0:
buffersize = 0
Expand Down Expand Up @@ -815,11 +843,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
# If we got here, then it should be safe to create the file.
fp = open(os_fspath(filename), mode+'b')
try:
used_ver = _write_array_header(fp, d, version)
# this warning can be removed when 1.9 has aged enough
if version != (2, 0) and used_ver == (2, 0):
warnings.warn("Stored array in format 2.0. It can only be"
"read by NumPy >= 1.9", UserWarning, stacklevel=2)
_write_array_header(fp, d, version)
offset = fp.tell()
finally:
fp.close()
Expand Down
25 changes: 25 additions & 0 deletions numpy/lib/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@
import numpy as np
from numpy.testing import (
assert_, assert_array_equal, assert_raises, assert_raises_regex,
assert_warns
)
from numpy.lib import format

Expand Down Expand Up @@ -938,3 +939,27 @@ def test_empty_npz():
fname = os.path.join(tempdir, "nothing.npz")
np.savez(fname)
np.load(fname)


def test_unicode_field_names():
# gh-7391
arr = np.array([
(1, 3),
(1, 2),
(1, 3),
(1, 2)
], dtype=[
('int', int),
(u'\N{CJK UNIFIED IDEOGRAPH-6574}\N{CJK UNIFIED IDEOGRAPH-5F62}', int)
])
fname = os.path.join(tempdir, "unicode.npy")
with open(fname, 'wb') as f:
format.write_array(f, arr, version=(3, 0))
with open(fname, 'rb') as f:
arr2 = format.read_array(f)
assert_array_equal(arr, arr2)

# notifies the user that 3.0 is selected
with open(fname, 'wb') as f:
with assert_warns(UserWarning):
format.write_array(f, arr, version=None)

0 comments on commit a86c7e5

Please sign in to comment.