Merge pull request gecko984#14 from gecko984/return_value

Return value
gokceneraslan · Sep 6, 2020 · f966f59 · f966f59
2 parents 147ddee + cd2dd94
commit f966f59
Show file tree

Hide file tree

Showing 5 changed files with 107 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -19,14 +19,14 @@ package, bottom row is supervenn diagrams:
 Python 2.7 or 3.6+ with `numpy` and `matplotlib`.
 
 ### Basic usage 
-The main entry point is the eponymous supervenn function, that takes a list of python `set`s as its first and only
-required argument:
+The main entry point is the eponymous `supervenn` function. It takes a list of python `set`s as its first and only
+required argument and returns a `SupervennPlot` object.
 ```python
 from supervenn import supervenn
 sets = [{1, 2, 3, 4}, {3, 4, 5}, {1, 6, 7, 8}]
 supervenn(sets, side_plots=False)
 ```
-<img src="https://i.imgur.com/BQrrcEl.png" width=400>
+<img src="https://i.imgur.com/aAOP6dq.png" width=330>
 
 Each row represents a set, the order from bottom to top is the same as in the `sets` list. Overlapping parts correspond
 to set intersections.
@@ -43,14 +43,14 @@ array plotted) to minimize the number of parts the sets are broken into. In the
 supervenn([{1, 2}, {2, 3}, {1, 3}], side_plots=False)
 ```
 
-<img src="https://i.imgur.com/2QV0zou.png" width="400">
+<img src="https://i.imgur.com/8aTSg2A.png" width="330">
 
 By default, additional *side plots* are also displayed:
 
 ```python
 supervenn(sets)
 ```
-<img src="https://i.imgur.com/na3YAn0.png" width=400>
+<img src="https://i.imgur.com/9IhLBcK.png" width=330>
 Here, the numbers on the right are the set sizes (cardinalities), and the numbers on the top show how many sets this
 intersection is a part of. The grey bars represent the same numbers visually.
 
@@ -63,7 +63,7 @@ sets = [{1, 2, 3, 4}, {3, 4, 5}, {1, 6, 7, 8}]
 labels = ['alice', 'bob', 'third party']
 supervenn(sets, labels)
 ```
-<img src="https://i.imgur.com/YlPKs7u.png" width=290>
+<img src="https://i.imgur.com/YlPKs7u.png" width=330>
 
 #### Change size and dpi of the plot
 Create a new figure and plot into it:
@@ -83,6 +83,11 @@ Use the `ax` argument:
 supervenn(sets, ax=my_axis)
 ```
 
+#### Access the figure and axes objects of the plot
+Use the `.figure` and `.axes` attributes of the object returned by `supervenn()`. The `axes` attribute is
+organized as a dict with descriptive strings for keys: `main`, `top_side_plot`, `right_side_plot`, `unused`. 
+If `side_plots=False`, the dict has only one key `main`.
+
 #### Save the plot to an image file
 
 ```python
@@ -96,7 +101,7 @@ Use the `chunks_ordering` argument. The following options are available:
 - `'minimize gaps'`: default, use an optimization algorithm to find an order of columns with fewer
 gaps in each row;
 - `'size'`: bigger chunks go first;
-- `'occurence'`: chunks that are in more sets go first;
+- `'occurrence'`: chunks that are in more sets go first;
 - `'random'`: randomly shuffle the columns.
 
 To reverse the order (e.g. you want smaller chunks to go first), pass `reverse_chunks_order=False` (by default
@@ -115,6 +120,19 @@ algorithm is that now gaps are minimized in columns instead of rows, and they ar
 To reverse the order (e.g. you want smaller sets to go first), pass `reverse_sets_order=False` (by default
 it's `True`) 
 
+#### Inspect the chunks' contents
+`supervenn(sets, ...)` returns a `SupervennPlot` object, which has a `chunks` attribute.
+It is a `dict` with `frozenset`s of set indices as keys, and chunks as values. For example, 
+`my_supervenn_object.chunks[frozenset([0, 2])]` is the chunk with all the items that are in `sets[0]` and
+`sets[2]`, but not in any of the other sets.
+
+There is also a `get_chunk(set_indices)` method that is slightly more convenient, because you
+can pass a `list` or any other iterable of indices instead of a `frozenset`. For example:
+`my_supervenn_object.get_chunk([0, 2])`. 
+
+If you have a good idea of a more convenient method of chunks lookup, let me know and I'll
+implement it as well.
+
 #### Make the plot prettier if sets and/or chunks are very different in size
 Use the `widths_minmax_ratio` argument, with a value between 0.01 and 1. Consider the following example
 ```python
@@ -123,7 +141,7 @@ supervenn(sets, side_plots=False)
 ```
 <img src="https://i.imgur.com/i05lgNU.png" width=330>
 
-The bottom left corner is unreadable.
+Annotations in the bottom left corner are unreadable.
 
 One solution is to trade exact chunk proportionality for readability. This is done by making small chunks visually
 larger. To be exact, a linear function is applied to the chunk sizes, with slope and intercept chosen so that the
@@ -139,6 +157,7 @@ The image now looks clean, but chunks of size 1 to 3 look almost the same.
 
 <img src="https://i.imgur.com/cIp42uD.png" width=330>
 
+
 #### Avoid clutter in the X axis annotations
 - Use the `min_width_for_annotation` argument to hide annotations for chunks smaller than this value. 
 ```python
@@ -158,7 +177,7 @@ import matplotlib.pyplot as plt
 with plt.style.context('bmh'):
     supervenn([{1,2,3}, {3,4}])
 ```
-<img src="https://i.imgur.com/yEUChI4.png" width="285">
+<img src="https://i.imgur.com/yEUChI4.png" width="330">
 
 
 #### Change side plots size and color

diff --git a/setup.py b/setup.py
@@ -17,7 +17,7 @@
     name='supervenn',
     license='MIT',
     description='supervenn is a tool for visualization of relations of many sets using matplotlib',
-    version='0.2.2',
+    version='0.3.0',
     long_description='See https://github.com/gecko984/supervenn/blob/master/README.md',
     url='https://github.com/gecko984/supervenn',
     packages=setuptools.find_packages(),

diff --git a/supervenn/_algorithms.py b/supervenn/_algorithms.py
@@ -123,13 +123,14 @@
 from collections import defaultdict
 import datetime
 from itertools import permutations
+import warnings
 
 import numpy as np
 
 HUGE_NUMBER = 1e10 # can fail for weighted! FIXME
 DEFAULT_MAX_BRUTEFORCE_SIZE = 8
 BRUTEFORCE_SIZE_HARD_LIMIT = 12
-DEFAULT_SEEDS = 1000
+DEFAULT_SEEDS = 10000
 DEFAULT_NOISE_PROB = 0.0075
 DEFAULT_MAX_NOISE = 1.1
 
@@ -158,9 +159,9 @@ def get_total_gaps_in_rows(arr, row_weights=None):
     return rowwise_gaps_counts.dot(row_weights)
 
 
-def get_chunks_and_composition_array(sets):
+def break_into_chunks(sets):
     """
-    Let us have a collection {S_1, ..., S_n} of finite sets and U the union of all these sets.
+    Let us have a collection {S_1, ..., S_n} of finite sets and U be the union of all these sets.
     For a given subset C = {i_1, ..., i_k} of indices {1, ..., n}, define the 'chunk', corresponding to C, as the set
     of elements of U, that belong to S_i_1, ..., S_i_k, but not to any of the other sets.
     For example, for a collection of two sets {S_1, S_2}, there can be max three chunks: S_1 - S_2, S_2 - S_1, S1 & S_2.
@@ -169,14 +170,11 @@ def get_chunks_and_composition_array(sets):
     In general, the number of possible non-empty chunks for a collection of n sets is equal to min(|U|, 2^n - 1).
     Any chunk lies either completely inside or completely outside each of the sets S_1, ..., S_n.
 
-    This function takes a list of sets as its only argument and returns two objects:
-    - list of all chunks (each chunk is a set of items)
-    - a numpy.array A of zeros and ones with len(sets) rows and len(chunks) columns,
-    where A[i, j] == 1 <=> sets[i] includes chunks[j].
+    This function takes a list of sets as its only argument and returns a dict with frozensets of indices as keys and
+    chunks as values.
     :param sets: list of sets
-    :return: chunks - list of sets, arr - numpy.array
+    :return: chunks_dict - dict with frozensets as keys and sets as values.
     """
-
     if not sets:
         raise ValueError('Sets list is empty.')
 
@@ -185,20 +183,32 @@ def get_chunks_and_composition_array(sets):
     if not all_items:
         raise ValueError('All sets are empty')
 
-    # Each chunk is characterized by its occurence pattern, which is a unique subset of indices of our sets.
+    # Each chunk is characterized by its occurrence pattern, which is a unique subset of indices of our sets.
     # E.g. chunk with signature {1, 2, 5} is exactly the set of items such that they belong to sets 1, 2, 5, and
     # don't belong to any of the other sets.
     # Build a dict with signatures as keys (as frozensets), and sets of items as values.
-    occurences = defaultdict(set)
+    chunks_dict = defaultdict(set)
     for item in all_items:
-        occurence_pattern = frozenset({i for i, set_ in enumerate(sets) if item in set_})
-        occurences[occurence_pattern].add(item)
+        occurrence_pattern = frozenset({i for i, set_ in enumerate(sets) if item in set_})
+        chunks_dict[occurrence_pattern].add(item)
+    return dict(chunks_dict)
+
 
-    chunks_count = len(occurences)
+def get_chunks_and_composition_array(sets):
+    """
+    Take a list of sets and return:
+    - list of all chunks (each chunk is a set of items)
+    - a numpy.array A of zeros and ones with len(sets) rows and len(chunks) columns,
+    where A[i, j] == 1 <=> sets[i] includes chunks[j].
+    :param sets: list of sets
+    :return: chunks - list of sets, arr - numpy.array, chunks_dict - dict w
+    """
+    chunks_dict = break_into_chunks(sets)
+    chunks_count = len(chunks_dict)
     chunks = []
     arr = np.zeros((len(sets), chunks_count), dtype=int)
 
-    for idx, (sets_indices, items) in enumerate(occurences.items()):
+    for idx, (sets_indices, items) in enumerate(chunks_dict.items()):
         chunks.append(items)
         arr[list(sets_indices), idx] = 1
 
@@ -374,10 +384,14 @@ def get_permutations(chunks, composition_array, chunks_ordering='minimize gaps',
         'array': composition_array,
         'row_weights': None,
         'ordering': chunks_ordering,
-        'allowed_orderings': ['size', 'occurence', 'random', 'minimize gaps'],
+        'allowed_orderings': ['size', 'occurrence', 'random', 'minimize gaps'] + ['occurence'],  # todo remove with typo
         'reverse': reverse_chunks_order
     }
 
+    if chunks_ordering == 'occurence':
+        warnings.warn('Please use chunks_ordering="occurrence" (with double "r") instead of "occurence" (spelling fixed '
+                      'in 0.3.0). The incorrect variant is still supported, but will be removed in a future version')
+
     sets_case = {
         'sizes': set_sizes,
         'param': 'sets_ordering',
@@ -397,7 +411,7 @@ def get_permutations(chunks, composition_array, chunks_ordering='minimize gaps',
 
         if case['ordering'] == 'size':
             permutation = np.argsort(case['sizes'])
-        elif case['ordering'] in ['occurence', 'chunk count']:
+        elif case['ordering'] in ['occurrence', 'chunk count'] + ['occurence']:
             permutation = np.argsort(case['array'].sum(0))
         elif case['ordering'] == 'random':
             permutation = np.array(range(len(case['sizes'])))

diff --git a/supervenn/_plots.py b/supervenn/_plots.py
@@ -10,6 +10,7 @@
 import matplotlib.pyplot as plt
 
 from supervenn._algorithms import (
+    break_into_chunks,
     get_chunks_and_composition_array,
     get_permutations,
     DEFAULT_MAX_BRUTEFORCE_SIZE,
@@ -20,6 +21,45 @@
 DEFAULT_FONTSIZE = 12
 
 
+class SupervennPlot(object):
+    """
+    Attributes
+    ----------
+    axes: dict
+        a dict containing all the plot's axes under descriptive keys: 'main', 'top_side_plot', 'right_side_plot',
+        'unused' (the small empty square in the top right corner)
+    figure: matplotlib.figure.Figure
+        figure containing the plot.
+    chunks: dict
+        a dictionary allowing to get the contents of chunks. It has frozensets of set indices as keys and chunks
+        as values.
+
+    Methods
+    -------
+    get_chunk(set_indices)
+        get a chunk by the indices of its defining sets without converting them to a frozenset first
+   """
+
+    def __init__(self, axes, figure, chunks_dict):
+        self.axes = axes
+        self.figure = figure
+        self.chunks = chunks_dict
+
+    def get_chunk(self, set_indices):
+        """
+        Get the contents of a chunk defined by the sets indicated by set_indices. Indices refer to the original sets
+        order as it was passed to the supervenn() function (any reordering of sets due to use of sets_ordering params
+        is ignored).
+        For example .get_chunk([1, 5]) will return the chunk containing all the items that are in
+        sets[1] and sets[5], but not in any of the other sets, where sets is the list passed to the
+        supervenn() function.
+        :param set_indices: iterable of integers, referring to positions in sets list, as passed into supervenn().
+        :return: chunk with items, that are in each of the sets with indices from set_indices, but not in any of the
+        other sets.
+        """
+        return self.chunks[frozenset(set_indices)]
+
+
 def get_alternated_ys(ys_count, low, high):
     """
     A helper function generating y-positions for x-axis annotations, useful when some annotations positioned along the
@@ -325,7 +365,10 @@ def supervenn(sets, set_annotations=None, figsize=None, side_plots=True,
     :param bar_height: height of cell fill as a fraction of row height, a number in (0, 1).
     :param bar_alpha: alpha for cell fills.
     :param bar_align: vertical alignment of bars, 'edge' (default) or 'center'. Only matters when bar_height < 1.
-    :param color_cycle: a list of set colors, given as names of matplotlib named colors, or hex codes (e.g. '#1f77b4').
+    :param color_cycle: a list of set colors, given as names of matplotlib named colors, or hex codes (e.g. '#1f77b4').
+
+    :return: SupervennPlot instance with attributes `axes`, `figure`, `chunks`
+        and method `get_chunk(set_indices)`. See the docstring of the returned object.
     """
 
     if figsize is not None or dpi is not None:
@@ -404,3 +447,4 @@ def supervenn(sets, set_annotations=None, figsize=None, side_plots=True,
         plt.ylim(ylim)
 
     plt.sca(axes['main'])
+    return SupervennPlot(axes, plt.gcf(), break_into_chunks(sets))  # todo: break_into_chunks is called twice, fix
diff --git a/supervenn/tests/algorithms_test.py b/supervenn/tests/algorithms_test.py
@@ -295,15 +295,15 @@ def test_order_chunks_size_ascending(self):
     def test_order_chunks_occurence_descending(self):
         for _ in range(10):
             sets = make_random_sets()
-            _, composition_matrix = _get_ordered_chunks_and_composition_array(sets, chunks_ordering='occurence')
+            _, composition_matrix = _get_ordered_chunks_and_composition_array(sets, chunks_ordering='occurrence')
             occurences = composition_matrix.sum(0)
             occurences_descending = is_ascending(occurences[::-1])
             self.assertTrue(occurences_descending)
 
     def test_order_chunks_occurence_ascending(self):
         for _ in range(10):
             sets = make_random_sets()
-            _, composition_matrix = _get_ordered_chunks_and_composition_array(sets, chunks_ordering='occurence',
+            _, composition_matrix = _get_ordered_chunks_and_composition_array(sets, chunks_ordering='occurrence',
                                                                               reverse_chunks_order=False)
             occurences = composition_matrix.sum(0)
             occurences_ascending = is_ascending(occurences)