forked from microsoft/CNTK
Commit
Showing 4 changed files with 327 additions and 0 deletions.
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""
Network optimization algorithms.
"""
@@ -0,0 +1,156 @@
import cntk
from cntk.ops.functions import BlockFunction
from cntk.variables import Parameter
from cntk.ops import times
from cntk.internal import _as_tuple
from cntk.layers.blocks import _initializer_for, _INFERRED, identity
from cntk.layers.blocks import UntestedBranchError  # helpers
from cntk.default_options import is_default_override
from cntk.default_options import get_default_override, default_override_or

def svd_subprojection(matrix, k):
    '''
    Calculate the SVD of the matrix and produce a rank-k subprojection.

    Args:
        matrix : an input matrix
        k (int): desired rank of the output matrices

    Returns:
        two matrices that, multiplied together, give a rank-k
        approximation of the original matrix.
    '''

    import numpy as np
    from numpy import dot, diag
    from numpy.linalg import svd

    # Decompose W into (U, s, V)
    U, s, V = svd(matrix, full_matrices=False)

    # Create two dense layers from this; one that takes U, one that takes
    # dot(s, V), but restrict them both to rank k, such that the result is a
    # rank-k subprojection
    W1 = np.ascontiguousarray(U[:, :k])
    W2 = dot(diag(s[:k]), V[:k, :])

    return W1, W2

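As a quick illustration (a sketch, not part of this commit): at full rank the two factors reconstruct the matrix exactly, and at lower rank the residual comes entirely from the dropped singular values (Eckart-Young):

    import numpy as np

    W = np.random.randn(6, 4)
    W1, W2 = svd_subprojection(W, k=4)    # full rank: exact reconstruction
    assert np.allclose(W1 @ W2, W)

    W1, W2 = svd_subprojection(W, k=2)    # rank 2: best rank-2 approximation
    print(np.linalg.norm(W - W1 @ W2))    # norm of the dropped singular values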

def factor_dense(model, projection_function=None, filter_function=None,
                 factor_function=None):
    '''
    Reduce the size of a dense model using the provided factor_function
    and projection_function. filter_function is used to select the dense
    layers to which the reduction is applied. If no factor_function is
    specified, SVD decomposition is used.

    Args:
        model : dense model.
        projection_function : determines the new size (target rank k) of the
                              dense model. It can be based on the shape of
                              the weight matrix or other heuristics.
                              factor_function can choose to ignore the value k.
        filter_function : selects the layers in the model to which the
                          factorization is applied.
        factor_function : factors the dense model (e.g. SVD).

    Returns:
        a model that is factored and reduced in size.
    '''
    if factor_function is None and projection_function is None:
        raise ValueError("Dense: default factor function (svd) requires a projection_function.")

    # The parentheses around the conditional keep the Dense checks in effect
    # when no filter_function is given; without them, the whole predicate
    # collapses to True for every node.
    dense_filter = (lambda x: type(x) == cntk.Function
                    and x.op_name == 'Dense'
                    and (filter_function(x) if filter_function else True))

    def dense_converter(model):
        W, b = model.W.value, model.b.value

        ht, wdth = W.shape
        # k is the rank of the output matrices. If a projection function is
        # provided, then use it, otherwise assign min of the two dimensions
        # of W to k.
        k = projection_function(W) if projection_function else min(ht, wdth)
        W1, W2 = factor_function(W, k) if factor_function else svd_subprojection(W, k)

        Ws = {'W1': W1, 'W2': W2}
        dfl = dense_factored((k, wdth),
                             init=Ws,
                             activation=None,
                             init_bias=b,
                             name='DenseFactored')(model.inputs[2])
        return dfl

    return cntk.misc.convert(model, dense_filter, dense_converter)

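For illustration only (the helper names below are not part of the commit; they mirror the _get_rank_reduced_size and _filter helpers in the test file further down), the two hooks compose like this:

    def halve_rank(W):                # projection: new rank is half the height
        return max(1, W.shape[0] // 2)

    def square_only(block):           # filter: only factor square weight matrices
        W = block.W.value
        return W.shape[0] == W.shape[1]

    # smaller_model = factor_dense(model, projection_function=halve_rank,
    #                              filter_function=square_only)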

def dense_factored(shapes,  # (shape1, shape2)
                   activation=default_override_or(identity),
                   init={'W1': None, 'W2': None},
                   input_rank=None,
                   map_rank=None,
                   bias=default_override_or(True),
                   init_bias=default_override_or(0),
                   name=''):
    '''
    Perform the new model creation using the factored inputs W1 and W2.
    The returned function represents the new model.

    Args:
        shapes : dimensions of the input matrices.
        activation : activation function used for the model.
        init : the two matrices corresponding to the factorization.
        input_rank : rank of the input tensor.
        map_rank : number of leading axes to map over, as in
                   cntk.layers.Dense (not supported here; must be None).
        bias : bias for the model.
        init_bias : initial bias value.
        name : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensors of rank > 1
    # or selective flattening of ranks
    assert(input_rank is None and
           map_rank is None and
           all(isinstance(s, int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # open question: how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # If input_rank is not given, pass a single _INFERRED;
    # map_rank, if given, will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    # init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1, init=init_weights['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init_weights['W2'], name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b is not None:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r
    return dense

# Reference for sklearn.tucker.hooi:
# https://hal.inria.fr/hal-01219316/document
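A minimal numpy sketch (again, not part of the commit) of the identity the block relies on: with full-rank SVD factors, times(times(x, W1), W2) reproduces the original dense product up to floating-point error:

    import numpy as np

    x = np.random.randn(1, 4).astype(np.float32)
    W = np.random.randn(4, 4).astype(np.float32)
    U, s, V = np.linalg.svd(W, full_matrices=False)
    W1, W2 = U, np.diag(s) @ V    # full-rank factors, as in svd_subprojection
    assert np.allclose(x @ W1 @ W2, x @ W, atol=1e-5)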
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""
Tests for network optimization algorithms.
"""
155 changes: 155 additions & 0 deletions
bindings/python/cntk/contrib/netopt/test/factorization_test.py
@@ -0,0 +1,155 @@
import numpy as np
import pytest
import cntk as C
import cntk.contrib.netopt.factorization as nc

C.cntk_py.set_fixed_random_seed(1)

# create a dense network for the tests
def _create_model_dense(features, num_hidden_layers, hidden_layers_dim, num_output_classes):
    with C.layers.default_options(init=C.layers.glorot_uniform(), activation=C.sigmoid):
        h = features
        for _ in range(num_hidden_layers):
            h = C.layers.Dense(hidden_layers_dim)(h)
        last_layer = C.layers.Dense(num_output_classes, activation=None)

        return last_layer(h)


# no size reduction, only the factorization.
def _get_rank_same_size(W):
    return int(len(W) * 1)


# reduce the rank to 80% of the original size.
def _get_rank_reduced_size(W):
    return int(len(W) * 0.8)


# filter for dense blocks whose weight matrices are square
# (same height and width).
def _filter(model):
    W = model.W.value
    return len(W) == len(W[0])

# Helper function to generate a random data sample
def _generate_random_data_sample(sample_size, feature_dim, num_classes):
    Y = np.random.randint(size=(sample_size, 1), low=0, high=num_classes)
    X = (np.random.randn(sample_size, feature_dim) + 3) * (Y + 1)
    X = X.astype(np.float32)
    class_ind = [Y == class_number for class_number in range(num_classes)]
    Y = np.asarray(np.hstack(class_ind), dtype=np.float32)
    return X, Y


def test_svd_factorization():
    # W and its svd factorization (U and sV). W has orthogonal rows, so U
    # is a permutation matrix ordering the singular values 3, sqrt(5), 2, 0.
    W = np.array([[1, 0, 0, 0, 2],
                  [0, 0, 3, 0, 0],
                  [0, 0, 0, 0, 0],
                  [0, 0, 0, 2, 0]])

    U = np.array([[0, 1, 0, 0],
                  [1, 0, 0, 0],
                  [0, 0, 0, 1],
                  [0, 0, 1, 0]])

    sV = np.array([[0, 0, 3, 0, 0],
                   [1, 0, 0, 0, 2],
                   [0, 0, 0, 2, 0],
                   [0, 0, 0, 0, 0]])

    # call svd factorization with W's height as the rank
    W1, W2 = nc.svd_subprojection(W, len(W))

    assert np.array_equal(W1, U)
    assert np.allclose(sV, W2)


def test_factor_dense():
    input_dim = 2
    num_output_classes = 2
    hidden_layer_dim = 50

    input = C.input_variable(input_dim)
    z = _create_model_dense(input, input_dim, hidden_layer_dim, num_output_classes)
    blocks = C.logging.graph.depth_first_search(
        z, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)

    newz = nc.factor_dense(z, projection_function=_get_rank_same_size, filter_function=_filter)
    newblocks = C.logging.graph.depth_first_search(
        newz, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)

    assert newblocks[1].op_name == "DenseFactored"
    block_root = C.as_composite(newblocks[1].block_root)
    # no reduction, same size but factored.
    assert block_root.W1.value.shape == (50, 50)

    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function=_filter)
    newblocks = C.logging.graph.depth_first_search(
        newz, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)
    assert newblocks[1].op_name == "DenseFactored"
    block_root = C.as_composite(newblocks[1].block_root)
    # the reduction has taken place now.
    assert block_root.W1.value.shape == (50, 40)


def _percentage_match(labels, predictions):
    match_count = 0
    for idx, lbl in enumerate(labels):
        if np.argmax(lbl) == np.argmax(predictions[idx]):
            match_count += 1
    return match_count / len(labels) * 100 if len(labels) != 0 else 0

def test_factor_dense_for_prediction():
    input_dim = 2
    num_output_classes = 2
    hidden_layer_dim = 50
    num_minibatches_to_train = 2000
    minibatch_size = 25
    learning_rate = 0.5

    input = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    z = _create_model_dense(input, input_dim, hidden_layer_dim, num_output_classes)

    loss = C.cross_entropy_with_softmax(z, label)
    eval_error = C.classification_error(z, label)

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = C.sgd(z.parameters, lr_schedule)
    trainer = C.Trainer(z, (loss, eval_error), [learner])

    # Run the trainer and perform model training
    for i in range(num_minibatches_to_train):
        features, labels = _generate_random_data_sample(minibatch_size, input_dim, num_output_classes)
        # Map the input variables in the model to actual minibatch data for training
        trainer.train_minibatch({input: features, label: labels})

    # generate some data to predict
    features, labels = _generate_random_data_sample(10, 2, 2)

    # factor the model.
    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function=_filter)
    original_out = C.softmax(z)
    factored_out = C.softmax(newz)

    original_label_probs = original_out.eval({input: features})
    predicted_label_probs = factored_out.eval({input: features})

    # The reduced model should match the labels at least 70% of the time.
    # For the test, we reduced the number of training minibatches, so the
    # match is lower than in full training.
    assert _percentage_match(labels, predicted_label_probs) >= 70
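Assuming a Python environment with this CNTK build on the path, the new tests would typically be run with pytest against the file added above:

    pytest bindings/python/cntk/contrib/netopt/test/factorization_test.py -v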