Skip to content


added dense factorization code
Browse files Browse the repository at this point in the history
  • Loading branch information
jaliyae committed Oct 27, 2017
1 parent 3415b9b commit 12f2ede
Show file tree
Hide file tree
Showing 4 changed files with 327 additions and 0 deletions.
8 changes: 8 additions & 0 deletions bindings/python/cntk/contrib/netopt/
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See file in the project root
# for full license information.
# ==============================================================================
"""Network optimization algorithms."""

156 changes: 156 additions & 0 deletions bindings/python/cntk/contrib/netopt/
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import cntk
from cntk.ops.functions import BlockFunction
from cntk.variables import Parameter
from cntk.ops import times
from cntk.internal import _as_tuple
from cntk.layers.blocks import _initializer_for, _INFERRED, identity
from cntk.layers.blocks import UntestedBranchError # helpers
from cntk.default_options import is_default_override
from cntk.default_options import get_default_override, default_override_or

def svd_subprojection(matrix, k):
    """
    Calculate the SVD of the matrix and produce a subprojection based on k.

    Args:
        matrix : an input matrix.
        k (int): desired rank of the output matrices.

    Returns:
        two matrices (W1, W2) representing the original matrix after SVD,
        reduced based on k: W1 holds the first k columns of U and W2 holds
        the first k rows of V, each scaled by its singular value.
    """
    import numpy as np
    from numpy.linalg import svd

    # Decompose W into (U, s, V)
    U, s, V = svd(matrix, full_matrices=False)

    # Create two dense layers from this; one that takes U, one that takes
    # dot(s, V), but restrict them all to rank k, such that the result is a
    # k-rank subprojection
    W1 = np.ascontiguousarray(U[:, :k])
    # Scaling row i of V by s[i] equals dot(diag(s[:k]), V[:k, :]) without
    # materializing the k x k diagonal matrix.
    W2 = s[:k, np.newaxis] * V[:k, :]

    return W1, W2

def factor_dense(model, projection_function = None, filter_function = None,
    factor_function = None):
    """
    Reduce the size of a dense model using the provided factor_function
    and the projection_function. filter_function is used to select dense
    layers to apply the reduction. If no factor_function is specified,
    use svd decomposition.

    Args:
        model                 : dense model.
        projection_function   : determine the new size of the dense model.
                                It is called with the weight matrix W and
                                returns the rank k; it can be based on the
                                shape of the weight matrix or other
                                heuristics. If omitted (only allowed when a
                                factor_function is given), k is the smaller
                                of the two dimensions of W.
                                factor_function can choose to ignore the
                                value k.
        filter_function       : filter layers in the model to apply the
                                factorization. If omitted, every Dense block
                                is selected.
        factor_function       : factor the dense model (e.g. svd). It is
                                called as factor_function(W, k) and returns
                                the pair (W1, W2).

    Returns:
        a model that is factored and reduced in size.

    Raises:
        ValueError: if neither factor_function nor projection_function is
                    given.
    """
    if factor_function is None and projection_function is None:
        raise ValueError("Dense: default factor function (svd) requires a projection_function.")

    def dense_filter(x):
        # The Dense check has to hold regardless of filter_function. The
        # previous single-expression form parsed as
        # `(... and filter_function(x)) if filter_function else True`,
        # which selected every node when no filter_function was supplied.
        if type(x) is not cntk.Function or x.op_name != 'Dense':
            return False
        return filter_function(x) if filter_function else True

    def dense_converter(model):
        W, b = model.W.value, model.b.value

        ht, wdth = W.shape
        # k is the rank of the output matrices. If a projection function is
        # provided, then use it, otherwise assign min of two dimensions of
        # W to k.
        k = projection_function(W) if projection_function else min(ht, wdth)
        W1, W2 = factor_function(W, k) if factor_function else svd_subprojection(W, k)

        Ws = {'W1': W1, 'W2': W2}
        # NOTE(review): SOURCE shows only `dense_factored((k, wdth),` followed
        # by `return dfl`; the keyword arguments below and the application to
        # the Dense block's input (assumed to be model.inputs[2]) were
        # reconstructed and must be confirmed against the repository.
        dfl = dense_factored((k, wdth),
                             init=Ws,
                             activation=None,
                             init_bias=b,
                             name='DenseFactored')(model.inputs[2])
        return dfl

    return cntk.misc.convert(model, dense_filter, dense_converter)

# NOTE(review): only `shapes` and `init` are visible in SOURCE's signature.
# The remaining parameters are reconstructed from the names the body uses;
# their defaults are assumed to mirror cntk.layers.Dense -- confirm against
# the repository.
def dense_factored(shapes, #(shape1, shape2)
                   activation=default_override_or(identity),
                   init={'W1':None, 'W2':None},
                   input_rank=None,
                   map_rank=None,
                   bias=default_override_or(True),
                   init_bias=default_override_or(0),
                   name=''):
    """
    Perform the new model creation using the factored inputs W1 and W2.
    The returned function represents the new model: it computes
    times(times(x, W1), W2), adds the bias if one is used, and applies the
    activation if one is set.

    Args:
        shapes     : dimensions of the input matrices, as a pair of ints
                     (shape1, shape2). W1 maps the inferred input shape to
                     shape1 and W2 maps shape1 to shape2.
        activation : activation function used for the model.
        init       : the two matrices corresponding to the factorization,
                     as a dict with keys 'W1' and 'W2'. The dict is only
                     read, never modified.
        input_rank : rank of the input tensor. Not supported yet; must be
                     None.
        map_rank   : not supported yet; must be None.
        bias       : whether the model has a bias.
        init_bias  : initial bias value.
        name       : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).

    Raises:
        ValueError: if input_rank or map_rank is given, or if shapes
                    contains anything other than ints.
    """
    # Explicit checks instead of `assert`: assertions are stripped under -O,
    # and the assert previously made the ValueError below unreachable.
    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    if input_rank is not None or map_rank is not None:
        raise ValueError("DenseFactored: input_rank and map_rank are not supported.")
    if not all(isinstance(s, int) for s in shapes):
        raise ValueError("DenseFactored: shapes must contain only ints.")

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # TODO: how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])

    # input_rank is not supported, so pass a single _INFERRED.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    init_weights = init
    W1 = Parameter(input_shape + output_shape1, init=init_weights['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init_weights['W2'], name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        # Compare against None rather than relying on Parameter truthiness.
        if b is not None:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r
    return dense

# Reference for sklearn.tucker.hooi:
8 changes: 8 additions & 0 deletions bindings/python/cntk/contrib/netopt/test/
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See file in the project root
# for full license information.
# ==============================================================================
"""Tests for network optimization algorithms."""

155 changes: 155 additions & 0 deletions bindings/python/cntk/contrib/netopt/test/
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import numpy as np
import pytest
import cntk as C
import cntk.contrib.netopt.factorization as nc

# create a dense network for the tests
def _create_model_dense(features, num_hidden_layers, hidden_layers_dim, num_output_classes):
    """
    Stack num_hidden_layers Dense layers of width hidden_layers_dim on top
    of features, followed by a Dense output layer without activation.

    Hidden layers pick up glorot_uniform initialization and sigmoid
    activation from the default options set here.
    """
    defaults = C.layers.default_options(init=C.layers.glorot_uniform(), activation=C.sigmoid)
    with defaults:
        hidden = features
        remaining = num_hidden_layers
        while remaining > 0:
            hidden = C.layers.Dense(hidden_layers_dim)(hidden)
            remaining -= 1
        output_layer = C.layers.Dense(num_output_classes, activation = None)

        return output_layer(hidden)

# no size reduction, only the factorization.
def _get_rank_same_size(W):
return int(len(W) * 1)

# reduce the rank to 0.8 * len(W)
def _get_rank_reduced_size(W):
return int(len(W) * 0.8)

# filter dense blocks that have the same height and width.
def _filter(model):
W = model.W.value
if (len(W) != len(W[0])):
return False
return True

# Helper function to generate a random data sample
def _generate_random_data_sample(sample_size, feature_dim, num_classes):
Y = np.random.randint(size=(sample_size, 1), low=0, high=num_classes)
X = (np.random.randn(sample_size, feature_dim)+3) * (Y+1)
X = X.astype(np.float32)
class_ind = [Y==class_number for class_number in range(num_classes)]
Y = np.asarray(np.hstack(class_ind), dtype=np.float32)
return X, Y

def test_svd_factorization():
    """Check svd_subprojection against a hand-computed factorization of W."""
    # W and its svd factorizations (U and sV)
    W = np.array([[1, 0, 0, 0, 2],
                  [0, 0, 3, 0, 0],
                  [0, 0, 0, 0, 0],
                  [0, 0, 0, 2, 0]])

    U = np.array([[0, 1, 0, 0],
                  [1, 0, 0, 0],
                  [0, 0, 0, 1],
                  [0, 0, 1, 0]])

    sV = np.array([[0, 0, 3, 0, 0],
                   [1, 0, 0, 0, 2],
                   [0, 0, 0, 2, 0],
                   [0, 0, 0, 0, 0]])

    # call svd factorization with W's length
    W1, W2 = nc.svd_subprojection(W, len(W))

    # The factors are floating point, so compare with a tolerance rather
    # than exact equality.
    assert np.allclose(W1, U)
    assert np.allclose(W2, sV)
    # With k equal to the smaller dimension of W the product of the two
    # factors reproduces W.
    assert np.allclose(W1.dot(W2), W)

def test_factor_dense():
    """
    Factor a small dense network twice -- once keeping the rank and once
    reducing it -- and check the op name and W1 shape of the factored block.
    """
    input_dim = 2
    # Previously input_dim was passed as the layer count; the value (2) is
    # unchanged but now has its own name.
    num_hidden_layers = 2
    num_output_classes = 2
    hidden_layer_dim = 50

    def find_blocks(model):
        # Collect the block functions of the model.
        return C.logging.graph.depth_first_search(
            model, lambda x : type(x) == C.Function and x.root_function.is_block, depth = 0)

    features = C.input_variable(input_dim)
    z = _create_model_dense(features, num_hidden_layers, hidden_layer_dim, num_output_classes)

    newz = nc.factor_dense(z, projection_function=_get_rank_same_size, filter_function = _filter)
    newblocks = find_blocks(newz)

    assert newblocks[1].op_name == "DenseFactored"
    block_root = C.as_composite(newblocks[1].block_root)
    # no reduction, same size but factored.
    assert block_root.W1.value.shape == (50, 50)

    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function = _filter)
    newblocks = find_blocks(newz)
    assert newblocks[1].op_name == "DenseFactored"
    block_root = C.as_composite(newblocks[1].block_root)
    # the reduction has taken place now.
    assert block_root.W1.value.shape == (50, 40)

def _percentage_match(labels, predictions):
match_count = 0
for idx, lbl in enumerate(labels):
if (np.argmax(lbl) == np.argmax(predictions[idx])):
match_count += 1
return match_count / len(labels) * 100 if len(labels) != 0 else 0

def test_factor_dense_for_prediction():
    """
    Train a small dense network on random data, factor it with a reduced
    rank, and check that the factored model still predicts at least 70% of
    the labels of a fresh sample.
    """
    input_dim = 2
    # Previously input_dim was passed as the layer count; the value (2) is
    # unchanged but now has its own name.
    num_hidden_layers = 2
    num_output_classes = 2
    hidden_layer_dim = 50
    num_minibatches_to_train = 2000
    minibatch_size = 25
    learning_rate = 0.5

    input_var = C.input_variable(input_dim)
    label_var = C.input_variable(num_output_classes)

    z = _create_model_dense(input_var, num_hidden_layers, hidden_layer_dim, num_output_classes)

    loss = C.cross_entropy_with_softmax(z, label_var)
    eval_error = C.classification_error(z, label_var)

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = C.sgd(z.parameters, lr_schedule)
    trainer = C.Trainer(z, (loss, eval_error), [learner])

    # Run the trainer and perform model training
    for _ in range(num_minibatches_to_train):
        features, labels = _generate_random_data_sample(minibatch_size, input_dim, num_output_classes)
        # Specify the input variables mapping in the model to actual minibatch data for training
        trainer.train_minibatch({input_var : features, label_var : labels})

    # generate some data to predict
    features, labels = _generate_random_data_sample(10, input_dim, num_output_classes)

    # factor the model.
    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function = _filter)
    original_out = C.softmax(z)
    factored_out = C.softmax(newz)

    # NOTE(review): the original model's output is evaluated but never
    # compared; the assertion below checks the factored model against the
    # generated labels. Confirm whether a comparison with the original
    # predictions was intended.
    original_labels_probs = original_out.eval({input_var : features})
    predicted_label_probs = factored_out.eval({input_var : features})

    # reduced model should have at least 70% match.
    # For the test, we reduced the training minibatches, thus the match is lower.
    assert _percentage_match(labels, predicted_label_probs) >= 70

0 comments on commit 12f2ede

Please sign in to comment.