Merge pull request numpy#4765 from juliantaylor/npyformat-2.0

ENH: add storage format 2.0 with 4 byte header size
jonathanrocher · Jun 2, 2014 · e6f4366 · e6f4366
2 parents 5e7e24e + fc50d27
commit e6f4366
Show file tree

Hide file tree

Showing 3 changed files with 202 additions and 34 deletions.
diff --git a/doc/release/1.9.0-notes.rst b/doc/release/1.9.0-notes.rst
@@ -196,6 +196,14 @@ comparison when the numpy version goes to 1.10.devel. For example::
     >>> if NumpyVersion(np.__version__) < '1.10.0'):
     ...     print('Wow, that is an old NumPy version!')
 
+Allow saving arrays with large number of named columns
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The numpy storage format 1.0 only allowed the array header to have a total size
+of 65535 bytes. This can be exceeded by structured arrays with a large number
+of columns. A new format 2.0 has been added which extends the header size to 4
+GiB. `np.save` will automatically save in 2.0 format if the data requires it,
+else it will always use the more compatible 1.0 format.
+
 
 Improvements
 ============

diff --git a/numpy/lib/format.py b/numpy/lib/format.py
@@ -139,6 +139,7 @@
 import numpy
 import sys
 import io
+import warnings
 from numpy.lib.utils import safe_eval
 from numpy.compat import asbytes, isfileobj, long, basestring
 
@@ -151,6 +152,14 @@
 MAGIC_LEN = len(MAGIC_PREFIX) + 2
 BUFFER_SIZE = 2 ** 18 #size of buffer for reading npz files in bytes
 
+# difference between version 1.0 and 2.0 is a 4 byte (I) header length
+# instead of 2 bytes (H) allowing storage of large structured arrays
+
+def _check_version(version):
+    if version not in [(1, 0), (2, 0), None]:
+        msg = "we only support format version (1,0) and (2, 0), not %s"
+        raise ValueError(msg % (version,))
+
 def magic(major, minor):
     """ Return the magic string for the given file format version.
 
@@ -258,15 +267,23 @@ def header_data_from_array_1_0(array):
     d['descr'] = dtype_to_descr(array.dtype)
     return d
 
-def write_array_header_1_0(fp, d):
-    """ Write the header for an array using the 1.0 format.
+def _write_array_header(fp, d, version=None):
+    """ Write the header for an array and returns the version used
 
     Parameters
     ----------
     fp : filelike object
     d : dict
         This has the appropriate entries for writing its string representation
         to the header of the file.
+    version: tuple or None
+        None means use oldest that works
+        explicit version will raise a ValueError if the format does not
+        allow saving this data.  Default: None
+    Returns
+    -------
+    version : tuple of int
+        the file version which needs to be used to store the data
     """
     import struct
     header = ["{"]
@@ -282,11 +299,52 @@ def write_array_header_1_0(fp, d):
     current_header_len = MAGIC_LEN + 2 + len(header) + 1  # 1 for the newline
     topad = 16 - (current_header_len % 16)
     header = asbytes(header + ' '*topad + '\n')
-    if len(header) >= (256*256):
-        raise ValueError("header does not fit inside %s bytes" % (256*256))
-    header_len_str = struct.pack('<H', len(header))
+
+    if len(header) >= (256*256) and version == (1, 0):
+        raise ValueError("header does not fit inside %s bytes required by the"
+                         " 1.0 format" % (256*256))
+    if len(header) < (256*256):
+        header_len_str = struct.pack('<H', len(header))
+        version = (1, 0)
+    elif len(header) < (2**32):
+        header_len_str = struct.pack('<I', len(header))
+        version = (2, 0)
+    else:
+        raise ValueError("header does not fit inside 4 GiB required by "
+                         "the 2.0 format")
+
+    fp.write(magic(*version))
     fp.write(header_len_str)
     fp.write(header)
+    return version
+
+def write_array_header_1_0(fp, d):
+    """ Write the header for an array using the 1.0 format.
+
+    Parameters
+    ----------
+    fp : filelike object
+    d : dict
+        This has the appropriate entries for writing its string representation
+        to the header of the file.
+    """
+    _write_array_header(fp, d, (1, 0))
+
+
+def write_array_header_2_0(fp, d):
+    """ Write the header for an array using the 2.0 format.
+        The 2.0 format allows storing very large structured arrays.
+
+    .. versionadded:: 1.9.0
+
+    Parameters
+    ----------
+    fp : filelike object
+    d : dict
+        This has the appropriate entries for writing its string representation
+        to the header of the file.
+    """
+    _write_array_header(fp, d, (2, 0))
 
 def read_array_header_1_0(fp):
     """
@@ -316,13 +374,59 @@ def read_array_header_1_0(fp):
     ValueError
         If the data is invalid.
 
+    """
+    _read_array_header(fp, version=(1, 0))
+
+def read_array_header_2_0(fp):
+    """
+    Read an array header from a filelike object using the 2.0 file format
+    version.
+
+    This will leave the file object located just after the header.
+
+    .. versionadded:: 1.9.0
+
+    Parameters
+    ----------
+    fp : filelike object
+        A file object or something with a `.read()` method like a file.
+
+    Returns
+    -------
+    shape : tuple of int
+        The shape of the array.
+    fortran_order : bool
+        The array data will be written out directly if it is either C-contiguous
+        or Fortran-contiguous. Otherwise, it will be made contiguous before
+        writing it out.
+    dtype : dtype
+        The dtype of the file's data.
+
+    Raises
+    ------
+    ValueError
+        If the data is invalid.
+
+    """
+    _read_array_header(fp, version=(2, 0))
+
+def _read_array_header(fp, version):
+    """
+    see read_array_header_1_0
     """
     # Read an unsigned, little-endian short int which has the length of the
     # header.
     import struct
-    hlength_str = _read_bytes(fp, 2, "array header length")
-    header_length = struct.unpack('<H', hlength_str)[0]
-    header = _read_bytes(fp, header_length, "array header")
+    if version == (1, 0):
+        hlength_str = _read_bytes(fp, 2, "array header length")
+        header_length = struct.unpack('<H', hlength_str)[0]
+        header = _read_bytes(fp, header_length, "array header")
+    elif version == (2, 0):
+        hlength_str = _read_bytes(fp, 4, "array header length")
+        header_length = struct.unpack('<I', hlength_str)[0]
+        header = _read_bytes(fp, header_length, "array header")
+    else:
+        raise ValueError("Invalid version %r" % version)
 
     # The header is a pretty-printed string representation of a literal Python
     # dictionary with trailing newlines padded to a 16-byte boundary. The keys
@@ -359,7 +463,7 @@ def read_array_header_1_0(fp):
 
     return d['shape'], d['fortran_order'], dtype
 
-def write_array(fp, array, version=(1, 0)):
+def write_array(fp, array, version=None):
     """
     Write an array to an NPY file, including a header.
 
@@ -374,8 +478,9 @@ def write_array(fp, array, version=(1, 0)):
         method.
     array : ndarray
         The array to write to disk.
-    version : (int, int), optional
-        The version number of the format.  Default: (1, 0)
+    version : (int, int) or None, optional
+        The version number of the format. None means use the oldest supported
+        version that is able to store the data.  Default: None
 
     Raises
     ------
@@ -387,11 +492,13 @@ def write_array(fp, array, version=(1, 0)):
         are not picklable.
 
     """
-    if version != (1, 0):
-        msg = "we only support format version (1,0), not %s"
-        raise ValueError(msg % (version,))
-    fp.write(magic(*version))
-    write_array_header_1_0(fp, header_data_from_array_1_0(array))
+    _check_version(version)
+    used_ver = _write_array_header(fp, header_data_from_array_1_0(array),
+                                   version)
+    # this warning can be removed when 1.9 has aged enough
+    if version != (2, 0) and used_ver == (2, 0):
+        warnings.warn("Stored array in format 2.0. It can only be"
+                      "read by NumPy >= 1.9", UserWarning)
 
     # Set buffer size to 16 MiB to hide the Python loop overhead.
     buffersize = max(16 * 1024 ** 2 // array.itemsize, 1)
@@ -440,10 +547,8 @@ def read_array(fp):
 
     """
     version = read_magic(fp)
-    if version != (1, 0):
-        msg = "only support version (1,0) of file format, not %r"
-        raise ValueError(msg % (version,))
-    shape, fortran_order, dtype = read_array_header_1_0(fp)
+    _check_version(version)
+    shape, fortran_order, dtype = _read_array_header(fp, version)
     if len(shape) == 0:
         count = 1
     else:
@@ -486,7 +591,7 @@ def read_array(fp):
 
 
 def open_memmap(filename, mode='r+', dtype=None, shape=None,
-                fortran_order=False, version=(1, 0)):
+                fortran_order=False, version=None):
     """
     Open a .npy file as a memory-mapped array.
 
@@ -513,9 +618,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         Whether the array should be Fortran-contiguous (True) or
         C-contiguous (False, the default) if we are creating a new file
         in "write" mode.
-    version : tuple of int (major, minor)
+    version : tuple of int (major, minor) or None
         If the mode is a "write" mode, then this is the version of the file
-        format used to create the file.  Default: (1,0)
+        format used to create the file.
+        None means use the oldest supported version that is able to store the
+        data.  Default: None
 
     Returns
     -------
@@ -541,9 +648,7 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
     if 'w' in mode:
         # We are creating the file, not reading it.
         # Check if we ought to create the file.
-        if version != (1, 0):
-            msg = "only support version (1,0) of file format, not %r"
-            raise ValueError(msg % (version,))
+        _check_version(version)
         # Ensure that the given dtype is an authentic dtype object rather than
         # just something that can be interpreted as a dtype object.
         dtype = numpy.dtype(dtype)
@@ -558,8 +663,11 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         # If we got here, then it should be safe to create the file.
         fp = open(filename, mode+'b')
         try:
-            fp.write(magic(*version))
-            write_array_header_1_0(fp, d)
+            used_ver = _write_array_header(fp, d, version)
+            # this warning can be removed when 1.9 has aged enough
+            if version != (2, 0) and used_ver == (2, 0):
+                warnings.warn("Stored array in format 2.0. It can only be"
+                              "read by NumPy >= 1.9", UserWarning)
             offset = fp.tell()
         finally:
             fp.close()
@@ -568,10 +676,9 @@ def open_memmap(filename, mode='r+', dtype=None, shape=None,
         fp = open(filename, 'rb')
         try:
             version = read_magic(fp)
-            if version != (1, 0):
-                msg = "only support version (1,0) of file format, not %r"
-                raise ValueError(msg % (version,))
-            shape, fortran_order, dtype = read_array_header_1_0(fp)
+            _check_version(version)
+
+            shape, fortran_order, dtype = _read_array_header(fp, version)
             if dtype.hasobject:
                 msg = "Array can't be memory-mapped: Python objects in dtype."
                 raise ValueError(msg)

diff --git a/numpy/lib/tests/test_format.py b/numpy/lib/tests/test_format.py
@@ -280,6 +280,7 @@
 import os
 import shutil
 import tempfile
+import warnings
 from io import BytesIO
 
 import numpy as np
@@ -521,19 +522,71 @@ def test_compressed_roundtrip():
     assert_array_equal(arr, arr1)
 
 
-def test_write_version_1_0():
+def test_version_2_0():
+    f = BytesIO()
+    # requires more than 2 byte for header
+    dt = [(("%d" % i) * 100, float) for i in range(500)]
+    d = np.ones(1000, dtype=dt)
+
+    format.write_array(f, d, version=(2, 0))
+    with warnings.catch_warnings(record=True) as w:
+        warnings.filterwarnings('always', '', UserWarning)
+        format.write_array(f, d)
+        assert_(w[0].category is UserWarning)
+
+    f.seek(0)
+    n = format.read_array(f)
+    assert_array_equal(d, n)
+
+    # 1.0 requested but data cannot be saved this way
+    assert_raises(ValueError, format.write_array, f, d, (1, 0))
+
+
+def test_version_2_0_memmap():
+    # requires more than 2 byte for header
+    dt = [(("%d" % i) * 100, float) for i in range(500)]
+    d = np.ones(1000, dtype=dt)
+    tf = tempfile.mktemp('', 'mmap', dir=tempdir)
+
+    # 1.0 requested but data cannot be saved this way
+    assert_raises(ValueError, format.open_memmap, tf, mode='w+', dtype=d.dtype,
+                            shape=d.shape, version=(1, 0))
+
+    ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
+                            shape=d.shape, version=(2, 0))
+    ma[...] = d
+    del ma
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.filterwarnings('always', '', UserWarning)
+        ma = format.open_memmap(tf, mode='w+', dtype=d.dtype,
+                                shape=d.shape, version=None)
+        assert_(w[0].category is UserWarning)
+        ma[...] = d
+        del ma
+
+    ma = format.open_memmap(tf, mode='r')
+    assert_array_equal(ma, d)
+
+
+def test_write_version():
     f = BytesIO()
     arr = np.arange(1)
     # These should pass.
     format.write_array(f, arr, version=(1, 0))
     format.write_array(f, arr)
 
+    format.write_array(f, arr, version=None)
+    format.write_array(f, arr)
+
+    format.write_array(f, arr, version=(2, 0))
+    format.write_array(f, arr)
+
     # These should all fail.
     bad_versions = [
         (1, 1),
         (0, 0),
         (0, 1),
-        (2, 0),
         (2, 2),
         (255, 255),
     ]