ENH: Better handling of MultiIndex with Excel

Added merged cell formatting for MultiIndex and Hierarchical Rows. Issue pandas-dev#5254.
bingo · Nov 5, 2013 · ae37d22 · ae37d22
1 parent 4d8cd16
commit ae37d22
Show file tree

Hide file tree

Showing 9 changed files with 256 additions and 144 deletions.
diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt
@@ -8,7 +8,7 @@ numexpr==2.1
 tables==2.3.1
 matplotlib==1.1.1
 openpyxl==1.6.2
-xlsxwriter==0.4.3
+xlsxwriter==0.4.6
 xlrd==0.9.2
 patsy==0.1.0
 html5lib==1.0b2

diff --git a/ci/requirements-2.7_LOCALE.txt b/ci/requirements-2.7_LOCALE.txt
@@ -2,7 +2,7 @@ python-dateutil
 pytz==2013b
 xlwt==0.7.5
 openpyxl==1.6.2
-xlsxwriter==0.4.3
+xlsxwriter==0.4.6
 xlrd==0.9.2
 numpy==1.6.1
 cython==0.19.1

diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt
@@ -1,7 +1,7 @@
 python-dateutil==2.1
 pytz==2013b
 openpyxl==1.6.2
-xlsxwriter==0.4.3
+xlsxwriter==0.4.6
 xlrd==0.9.2
 numpy==1.7.1
 cython==0.19.1

diff --git a/ci/requirements-3.3.txt b/ci/requirements-3.3.txt
@@ -1,7 +1,7 @@
 python-dateutil==2.2
 pytz==2013b
 openpyxl==1.6.2
-xlsxwriter==0.4.3
+xlsxwriter==0.4.6
 xlrd==0.9.2
 html5lib==1.0b2
 numpy==1.8.0

diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -209,6 +209,11 @@ Improvements to existing features
     by color as expected.
   - ``read_excel()`` now tries to convert integral floats (like ``1.0``) to int
     by default. (:issue:`5394`)
+  - Excel writers now have an option ``merge_cells`` in ``to_excel()``,
+    enabled by default, to merge cells in MultiIndex and Hierarchical Rows.
+    Note: Excel files written with merged MultiIndex and Hierarchical Rows
+    can no longer be round-tripped. Set ``merge_cells`` to ``False`` to
+    restore the previous behaviour. (:issue:`5254`)
 
 API Changes
 ~~~~~~~~~~~

diff --git a/pandas/core/format.py b/pandas/core/format.py
@@ -1213,7 +1213,7 @@ def __init__(self, row, col, val,
                             "right": "thin",
                             "bottom": "thin",
                             "left": "thin"},
-                "alignment": {"horizontal": "center"}}
+                "alignment": {"horizontal": "center", "vertical": "top"}}
 
 
 class ExcelFormatter(object):
@@ -1237,10 +1237,12 @@ class ExcelFormatter(object):
             Column label for index column(s) if desired. If None is given, and
             `header` and `index` are True, then the index names are used. A
             sequence should be given if the DataFrame uses MultiIndex.
+    merge_cells : boolean, default False
+            Format MultiIndex and Hierarchical Rows as merged cells.
     """
 
     def __init__(self, df, na_rep='', float_format=None, cols=None,
-                 header=True, index=True, index_label=None):
+                 header=True, index=True, index_label=None, merge_cells=False):
         self.df = df
         self.rowcounter = 0
         self.na_rep = na_rep
@@ -1251,6 +1253,7 @@ def __init__(self, df, na_rep='', float_format=None, cols=None,
         self.index = index
         self.index_label = index_label
         self.header = header
+        self.merge_cells = merge_cells
 
     def _format_value(self, val):
         if lib.checknull(val):
@@ -1264,29 +1267,44 @@ def _format_header_mi(self):
         if not(has_aliases or self.header):
             return
 
-        levels = self.columns.format(sparsify=True, adjoin=False,
-                                     names=False)
-        # level_lenghts = _get_level_lengths(levels)
-        coloffset = 1
-        if isinstance(self.df.index, MultiIndex):
-            coloffset = len(self.df.index[0])
-
-        # for lnum, (records, values) in enumerate(zip(level_lenghts,
-        #                                              levels)):
-        #     name = self.columns.names[lnum]
-        #     yield ExcelCell(lnum, coloffset, name, header_style)
-        #     for i in records:
-        #         if records[i] > 1:
-        #             yield ExcelCell(lnum,coloffset + i + 1, values[i],
-        #                     header_style, lnum, coloffset + i + records[i])
-        #         else:
-        # yield ExcelCell(lnum, coloffset + i + 1, values[i], header_style)
-
-        #     self.rowcounter = lnum
+        columns = self.columns
+        level_strs = columns.format(sparsify=True, adjoin=False, names=False)
+        level_lengths = _get_level_lengths(level_strs)
+        coloffset = 0
         lnum = 0
-        for i, values in enumerate(zip(*levels)):
-            v = ".".join(map(com.pprint_thing, values))
-            yield ExcelCell(lnum, coloffset + i, v, header_style)
+
+        if isinstance(self.df.index, MultiIndex):
+            coloffset = len(self.df.index[0]) - 1
+
+        if self.merge_cells:
+            # Format multi-index as merged cells.
+            for lnum in range(len(level_lengths)):
+                name = columns.names[lnum]
+                yield ExcelCell(lnum, coloffset, name, header_style)
+
+            for lnum, (spans, levels, labels) in enumerate(zip(level_lengths,
+                                                               columns.levels,
+                                                               columns.labels)
+                                                           ):
+                values = levels.take(labels)
+                for i in spans:
+                    if spans[i] > 1:
+                        yield ExcelCell(lnum,
+                                        coloffset + i + 1,
+                                        values[i],
+                                        header_style,
+                                        lnum,
+                                        coloffset + i + spans[i])
+                    else:
+                        yield ExcelCell(lnum,
+                                        coloffset + i + 1,
+                                        values[i],
+                                        header_style)
+        else:
+            # Use the legacy format, with dots to indicate levels.
+            for i, values in enumerate(zip(*level_strs)):
+                v = ".".join(map(com.pprint_thing, values))
+                yield ExcelCell(lnum, coloffset + i + 1, v, header_style)
 
         self.rowcounter = lnum
 
@@ -1354,14 +1372,17 @@ def _format_regular_rows(self):
                 index_label = self.df.index.names[0]
 
             if index_label and self.header is not False:
-                # add to same level as column names
-                # if isinstance(self.df.columns, MultiIndex):
-                #     yield ExcelCell(self.rowcounter, 0,
-                #                 index_label, header_style)
-                #     self.rowcounter += 1
-                # else:
-                yield ExcelCell(self.rowcounter - 1, 0,
-                                index_label, header_style)
+                if self.merge_cells:
+                    yield ExcelCell(self.rowcounter,
+                                    0,
+                                    index_label,
+                                    header_style)
+                    self.rowcounter += 1
+                else:
+                    yield ExcelCell(self.rowcounter - 1,
+                                    0,
+                                    index_label,
+                                    header_style)
 
             # write index_values
             index_values = self.df.index
@@ -1383,7 +1404,7 @@ def _format_hierarchical_rows(self):
             self.rowcounter += 1
 
         gcolidx = 0
-        # output index and index_label?
+
         if self.index:
             index_labels = self.df.index.names
             # check for aliases
@@ -1394,29 +1415,60 @@ def _format_hierarchical_rows(self):
             # if index labels are not empty go ahead and dump
             if (any(x is not None for x in index_labels)
                     and self.header is not False):
-                # if isinstance(self.df.columns, MultiIndex):
-                #     self.rowcounter += 1
-                # else:
-                self.rowcounter -= 1
+
+                if not self.merge_cells:
+                    self.rowcounter -= 1
+
                 for cidx, name in enumerate(index_labels):
-                    yield ExcelCell(self.rowcounter, cidx,
-                                    name, header_style)
+                    yield ExcelCell(self.rowcounter,
+                                    cidx,
+                                    name,
+                                    header_style)
                 self.rowcounter += 1
 
-            for indexcolvals in zip(*self.df.index):
-                for idx, indexcolval in enumerate(indexcolvals):
-                    yield ExcelCell(self.rowcounter + idx, gcolidx,
-                                    indexcolval, header_style)
-                gcolidx += 1
+            if self.merge_cells:
+                # Format hierarchical rows as merged cells.
+                level_strs = self.df.index.format(sparsify=True, adjoin=False,
+                                                  names=False)
+                level_lengths = _get_level_lengths(level_strs)
+
+                for spans, levels, labels in zip(level_lengths,
+                                                 self.df.index.levels,
+                                                 self.df.index.labels):
+                    values = levels.take(labels)
+                    for i in spans:
+                        if spans[i] > 1:
+                            yield ExcelCell(self.rowcounter + i,
+                                            gcolidx,
+                                            values[i],
+                                            header_style,
+                                            self.rowcounter + i + spans[i] - 1,
+                                            gcolidx)
+                        else:
+                            yield ExcelCell(self.rowcounter + i,
+                                            gcolidx,
+                                            values[i],
+                                            header_style)
+                    gcolidx += 1
+
+            else:
+                # Format hierarchical rows with non-merged values.
+                for indexcolvals in zip(*self.df.index):
+                    for idx, indexcolval in enumerate(indexcolvals):
+                        yield ExcelCell(self.rowcounter + idx,
+                                        gcolidx,
+                                        indexcolval,
+                                        header_style)
+                    gcolidx += 1
 
         for colidx in range(len(self.columns)):
             series = self.df.iloc[:, colidx]
             for i, val in enumerate(series):
                 yield ExcelCell(self.rowcounter + i, gcolidx + colidx, val)
 
     def get_formatted_cells(self):
-        for cell in itertools.chain(self._format_header(), self._format_body()
-                                    ):
+        for cell in itertools.chain(self._format_header(),
+                                    self._format_body()):
             cell.val = self._format_value(cell.val)
             yield cell
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1130,7 +1130,8 @@ def to_csv(self, path_or_buf, sep=",", na_rep='', float_format=None,
 
     def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
                  float_format=None, cols=None, header=True, index=True,
-                 index_label=None, startrow=0, startcol=0, engine=None):
+                 index_label=None, startrow=0, startcol=0, engine=None,
+                 merge_cells=True):
         """
         Write DataFrame to a excel sheet
 
@@ -1161,13 +1162,15 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
             write engine to use - you can also set this via the options
             ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and
             ``io.excel.xlsm.writer``.
-
+        merge_cells : boolean, default True
+            Write MultiIndex and Hierarchical Rows as merged cells.
 
         Notes
         -----
         If passing an existing ExcelWriter object, then the sheet will be added
         to the existing workbook.  This can be used to save different
         DataFrames to one workbook
+
         >>> writer = ExcelWriter('output.xlsx')
         >>> df1.to_excel(writer,'Sheet1')
         >>> df2.to_excel(writer,'Sheet2')
@@ -1185,7 +1188,8 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
                                        header=header,
                                        float_format=float_format,
                                        index=index,
-                                       index_label=index_label)
+                                       index_label=index_label,
+                                       merge_cells=merge_cells)
         formatted_cells = formatter.get_formatted_cells()
         excel_writer.write_cells(formatted_cells, sheet_name,
                                  startrow=startrow, startcol=startcol)