Skip to content

Commit

Permalink
Merge pull request numpy#4765 from juliantaylor/npyformat-2.0
Browse files Browse the repository at this point in the history
ENH: add storage format 2.0 with 4 byte header size
  • Loading branch information
charris committed Jun 2, 2014
2 parents 5e7e24e + fc50d27 commit e6f4366
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 34 deletions.
8 changes: 8 additions & 0 deletions doc/release/1.9.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,14 @@ comparison when the numpy version goes to 1.10.devel. For example::
>>> if NumpyVersion(np.__version__) < '1.10.0'):
... print('Wow, that is an old NumPy version!')

Allow saving arrays with large number of named columns
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The numpy storage format 1.0 only allowed the array header to have a total size
of 65535 bytes. This can be exceeded by structured arrays with a large number
of columns. A new format 2.0 has been added which extends the header size to 4
GiB. `np.save` will automatically save in 2.0 format if the data requires it,
else it will always use the more compatible 1.0 format.


Improvements
============
Expand Down
171 changes: 139 additions & 32 deletions numpy/lib/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@
import numpy
import sys
import io
import warnings
from numpy.lib.utils import safe_eval
from numpy.compat import asbytes, isfileobj, long, basestring

Expand All @@ -151,6 +152,14 @@
MAGIC_LEN = len(MAGIC_PREFIX) + 2
BUFFER_SIZE = 2 ** 18 #size of buffer for reading npz files in bytes

# difference between version 1.0 and 2.0 is a 4 byte (I) header length
# instead of 2 bytes (H) allowing storage of large structured arrays

def _check_version(version):
if version not in [(1, 0), (2, 0), None]:
msg = "we only support format version (1,0) and (2, 0), not %s"
raise ValueError(msg % (version,))

def magic(major, minor):
""" Return the magic string for the given file format version.
Expand Down Expand Up @@ -258,15 +267,23 @@ def header_data_from_array_1_0(array):
d['descr'] = dtype_to_descr(array.dtype)
return d

def write_array_header_1_0(fp, d):
""" Write the header for an array using the 1.0 format.
def _write_array_header(fp, d, version=None):
""" Write the header for an array and returns the version used
Parameters
----------
fp : filelike object
d : dict
This has the appropriate entries for writing its string representation
to the header of the file.
version: tuple or None
None means use oldest that works
explicit version will raise a ValueError if the format does not
allow saving this data. Default: None
Returns
-------
version : tuple of int
the file version which needs to be used to store the data
"""
import struct
header = ["{"]
Expand All @@ -282,11 +299,52 @@ def write_array_header_1_0(fp, d):
current_header_len = MAGIC_LEN + 2 + len(header) + 1 # 1 for the newline
topad = 16 - (current_header_len % 16)
header = asbytes(header + ' '*topad + '\n')
if len(header) >= (256*256):
raise ValueError("header does not fit inside %s bytes" % (256*256))
header_len_str = struct.pack('<H', len(header))

if len(header) >= (256*256) and version == (1, 0):
raise ValueError("header does not fit inside %s bytes required by the"
" 1.0 format" % (256*256))
if len(header) < (256*256):
header_len_str = struct.pack('<H', len(header))
version = (1, 0)
elif len(header) < (2**32):
header_len_str = struct.pack('<I', len(header))
version = (2, 0)
else:
raise ValueError("header does not fit inside 4 GiB required by "
"the 2.0 format")

fp.write(magic(*version))
fp.write(header_len_str)
fp.write(header)
return version

def write_array_header_1_0(fp, d):
""" Write the header for an array using the 1.0 format.
Parameters
----------
fp : filelike object
d : dict
This has the appropriate entries for writing its string representation
to the header of the file.
"""
_write_array_header(fp, d, (1, 0))


def write_array_header_2_0(fp, d):
""" Write the header for an array using the 2.0 format.
The 2.0 format allows storing very large structured arrays.
.. versionadded:: 1.9.0
Parameters
----------
fp : filelike object
d : dict
This has the appropriate entries for writing its string representation
to the header of the file.
"""
_write_array_header(fp, d, (2, 0))

def read_array_header_1_0(fp):
"""
Expand Down Expand Up @@ -316,13 +374,59 @@ def read_array_header_1_0(fp):
ValueError
If the data is invalid.
"""
_read_array_header(fp, version=(1, 0))

def read_array_header_2_0(fp):
"""
Read an array header from a filelike object using the 2.0 file format
version.
This will leave the file object located just after the header.
.. versionadded:: 1.9.0
Parameters
----------
fp : filelike object
A file object or something with a `.read()` method like a file.
Returns
-------
shape : tuple of int
The shape of the array.
fortran_order : bool
The array data will be written out directly if it is either C-contiguous
or Fortran-contiguous. Otherwise, it will be made contiguous before
writing it out.
dtype : dtype
The dtype of the file's data.
Raises
------
ValueError
If the data is invalid.
"""
_read_array_header(fp, version=(2, 0))

def _read_array_header(fp, version):
"""
see read_array_header_1_0
"""
# Read an unsigned, little-endian short int which has the length of the
# header.
import struct
hlength_str = _read_bytes(fp, 2, "array header length")
header_length = struct.unpack('<H', hlength_str)[0]
header = _read_bytes(fp, header_length, "array header")
if version == (1, 0):
hlength_str = _read_bytes(fp, 2, "array header length")
header_length = struct.unpack('<H', hlength_str)[0]
header = _read_bytes(fp, header_length, "array header")
elif version == (2, 0):
hlength_str = _read_bytes(fp, 4, "array header length")
header_length = struct.unpack('<I', hlength_str)[0]
header = _read_bytes(fp, header_length, "array header")
else:
raise ValueError("Invalid version %r" % version)

# The header is a pretty-printed string representation of a literal Python
# dictionary with trailing newlines padded to a 16-byte boundary. The keys
Expand Down Expand Up @@ -359,7 +463,7 @@ def read_array_header_1_0(fp):

return d['shape'], d['fortran_order'], dtype

def write_array(fp, array, version=(1, 0)):
def write_array(fp, array, version=None):
"""
Write an array to an NPY file, including a header.
Expand All @@ -374,8 +478,9 @@ def write_array(fp, array, version=(1, 0)):
method.
array : ndarray
The array to write to disk.
version : (int, int), optional
The version number of the format. Default: (1, 0)
version : (int, int) or None, optional
The version number of the format. None means use the oldest supported
version that is able to store the data. Default: None
Raises
------
Expand All @@ -387,11 +492,13 @@ def write_array(fp, array, version=(1, 0)):
are not picklable.
"""
if version != (1, 0):
msg = "we only support format version (1,0), not %s"
raise ValueError(msg % (version,))
fp.write(magic(*version))
write_array_header_1_0(fp, header_data_from_array_1_0(array))
_check_version(version)
used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
version)
# this warning can be removed when 1.9 has aged enough
if version != (2, 0) and used_ver == (2, 0):
warnings.warn("Stored array in format 2.0. It can only be"
"read by NumPy >= 1.9", UserWarning)

# Set buffer size to 16 MiB to hide the Python loop overhead.
buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)
Expand Down Expand Up @@ -440,10 +547,8 @@ def read_array(fp):
"""
version = read_magic(fp)
if version != (1, 0):
msg = "only support version (1,0) of file format, not %r"
raise ValueError(msg % (version,))
shape, fortran_order, dtype = read_array_header_1_0(fp)
_check_version(version)
shape, fortran_order, dtype = _read_array_header(fp, version)
if len(shape) == 0:
count = 1
else:
Expand Down Expand Up @@ -486,7 +591,7 @@ def read_array(fp):


def open_memmap(filename, mode='r+', dtype=None, shape=None,
fortran_order=False, version=(1, 0)):
fortran_order=False, version=None):
"""
Open a .npy file as a memory-mapped array.
Expand All @@ -513,9 +618,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
Whether the array should be Fortran-contiguous (True) or
C-contiguous (False, the default) if we are creating a new file
in "write" mode.
version : tuple of int (major, minor)
version : tuple of int (major, minor) or None
If the mode is a "write" mode, then this is the version of the file
format used to create the file. Default: (1,0)
format used to create the file.
None means use the oldest supported version that is able to store the
data. Default: None
Returns
-------
Expand All @@ -541,9 +648,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
if 'w' in mode:
# We are creating the file, not reading it.
# Check if we ought to create the file.
if version != (1, 0):
msg = "only support version (1,0) of file format, not %r"
raise ValueError(msg % (version,))
_check_version(version)
# Ensure that the given dtype is an authentic dtype object rather than
# just something that can be interpreted as a dtype object.
dtype = numpy.dtype(dtype)
Expand All @@ -558,8 +663,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
# If we got here, then it should be safe to create the file.
fp = open(filename, mode+'b')
try:
fp.write(magic(*version))
write_array_header_1_0(fp, d)
used_ver = _write_array_header(fp, d, version)
# this warning can be removed when 1.9 has aged enough
if version != (2, 0) and used_ver == (2, 0):
warnings.warn("Stored array in format 2.0. It can only be"
"read by NumPy >= 1.9", UserWarning)
offset = fp.tell()
finally:
fp.close()
Expand All @@ -568,10 +676,9 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
fp = open(filename, 'rb')
try:
version = read_magic(fp)
if version != (1, 0):
msg = "only support version (1,0) of file format, not %r"
raise ValueError(msg % (version,))
shape, fortran_order, dtype = read_array_header_1_0(fp)
_check_version(version)

shape, fortran_order, dtype = _read_array_header(fp, version)
if dtype.hasobject:
msg = "Array can't be memory-mapped: Python objects in dtype."
raise ValueError(msg)
Expand Down
57 changes: 55 additions & 2 deletions numpy/lib/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@
import os
import shutil
import tempfile
import warnings
from io import BytesIO

import numpy as np
Expand Down Expand Up @@ -521,19 +522,71 @@ def test_compressed_roundtrip():
assert_array_equal(arr, arr1)


def test_write_version_1_0():
def test_version_2_0():
f = BytesIO()
# requires more than 2 byte for header
dt = [(("%d" % i) * 100, float) for i in range(500)]
d = np.ones(1000, dtype=dt)

format.write_array(f, d, version=(2, 0))
with warnings.catch_warnings(record=True) as w:
warnings.filterwarnings('always', '', UserWarning)
format.write_array(f, d)
assert_(w[0].category is UserWarning)

f.seek(0)
n = format.read_array(f)
assert_array_equal(d, n)

# 1.0 requested but data cannot be saved this way
assert_raises(ValueError, format.write_array, f, d, (1, 0))


def test_version_2_0_memmap():
# requires more than 2 byte for header
dt = [(("%d" % i) * 100, float) for i in range(500)]
d = np.ones(1000, dtype=dt)
tf = tempfile.mktemp('', 'mmap', dir=tempdir)

# 1.0 requested but data cannot be saved this way
assert_raises(ValueError, format.open_memmap, tf, mode='w+', dtype=d.dtype,
shape=d.shape, version=(1, 0))

ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
shape=d.shape, version=(2, 0))
ma[...] = d
del ma

with warnings.catch_warnings(record=True) as w:
warnings.filterwarnings('always', '', UserWarning)
ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
shape=d.shape, version=None)
assert_(w[0].category is UserWarning)
ma[...] = d
del ma

ma = format.open_memmap(tf, mode='r')
assert_array_equal(ma, d)


def test_write_version():
f = BytesIO()
arr = np.arange(1)
# These should pass.
format.write_array(f, arr, version=(1, 0))
format.write_array(f, arr)

format.write_array(f, arr, version=None)
format.write_array(f, arr)

format.write_array(f, arr, version=(2, 0))
format.write_array(f, arr)

# These should all fail.
bad_versions = [
(1, 1),
(0, 0),
(0, 1),
(2, 0),
(2, 2),
(255, 255),
]
Expand Down

0 comments on commit e6f4366

Please sign in to comment.