Commit
added dense factorization code
jaliyae committed Oct 27, 2017
1 parent 3415b9b commit 12f2ede
Showing 4 changed files with 327 additions and 0 deletions.
8 changes: 8 additions & 0 deletions bindings/python/cntk/contrib/netopt/__init__.py
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""
Network optimization algorithms.
"""

156 changes: 156 additions & 0 deletions bindings/python/cntk/contrib/netopt/factorization.py
@@ -0,0 +1,156 @@
import cntk
from cntk.ops.functions import BlockFunction
from cntk.variables import Parameter
from cntk.ops import times
from cntk.internal import _as_tuple
from cntk.layers.blocks import _initializer_for, _INFERRED, identity
from cntk.layers.blocks import UntestedBranchError # helpers
from cntk.default_options import is_default_override
from cntk.default_options import get_default_override, default_override_or

def svd_subprojection(matrix, k):
    '''
    Compute the SVD of the matrix and produce a rank-k subprojection.

    Args:
        matrix : the input matrix.
        k (int): desired rank of the output matrices.

    Returns:
        two matrices, W1 and W2, whose product is the rank-k
        approximation of the input matrix.
    '''

    import numpy as np
    from numpy import dot, diag
    from numpy.linalg import svd

    # Decompose W into (U, s, V).
    U, s, V = svd(matrix, full_matrices=False)

    # Create two dense layers from this; one that takes U, one that takes
    # dot(s, V), but restrict both to rank k, such that the result is a
    # rank-k subprojection.
    W1 = np.ascontiguousarray(U[:, :k])
    W2 = dot(diag(s[:k]), V[:k, :])

    return W1, W2
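
# Usage sketch (illustrative only, assumes numpy is available; the shapes
# below are an example, not part of the API):
#
#     >>> import numpy as np
#     >>> W = np.random.randn(6, 4)
#     >>> W1, W2 = svd_subprojection(W, 2)
#     >>> W1.shape, W2.shape        # ((6, 2), (2, 4))
#     >>> approx = np.dot(W1, W2)   # best rank-2 approximation of W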


def factor_dense(model, projection_function=None, filter_function=None,
                 factor_function=None):
    '''
    Reduce the size of a dense model using the provided factor_function
    and projection_function. filter_function selects the dense layers
    to which the reduction is applied. If no factor_function is
    specified, SVD decomposition is used.

    Args:
        model : dense model.
        projection_function : determines the new size (the rank k) of the
                              dense model. It can be based on the shape of
                              the weight matrix or other heuristics. Note
                              that factor_function may choose to ignore k.
        filter_function : filters the layers in the model to which the
                          factorization is applied.
        factor_function : factors the dense model (e.g. SVD).

    Returns:
        a model that is factored and reduced in size.
    '''
    if factor_function is None and projection_function is None:
        raise ValueError("Dense: default factor function (svd) requires a projection_function.")

    dense_filter = (lambda x: type(x) == cntk.Function
                    and x.op_name == 'Dense'
                    and (filter_function(x) if filter_function else True))

    def dense_converter(model):
        W, b = model.W.value, model.b.value

        ht, wdth = W.shape
        # k is the rank of the output matrices. If a projection function is
        # provided, use it; otherwise assign the smaller of the two
        # dimensions of W to k.
        k = projection_function(W) if projection_function else min(ht, wdth)
        W1, W2 = factor_function(W, k) if factor_function else svd_subprojection(W, k)

        Ws = {'W1': W1, 'W2': W2}
        dfl = dense_factored((k, wdth),
                             init=Ws,
                             activation=None,
                             init_bias=b,
                             name='DenseFactored')(model.inputs[2])
        return dfl

    return cntk.misc.convert(model, dense_filter, dense_converter)
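
# Usage sketch (illustrative; `z` stands for an existing CNTK model with
# Dense layers, and the rank-halving heuristic is only an example):
#
#     >>> proj = lambda W: min(W.shape) // 2
#     >>> smaller_z = factor_dense(z, projection_function=proj)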


def dense_factored(shapes, #(shape1, shape2)
                   activation=default_override_or(identity),
                   init={'W1':None, 'W2':None},
                   input_rank=None,
                   map_rank=None,
                   bias=default_override_or(True),
                   init_bias=default_override_or(0),
                   name=''):
    '''
    Create the new model using the factored inputs W1 and W2.
    The returned function represents the new model.

    Args:
        shapes : tuple (shape1, shape2) with the output dimensions of the
                 two factors.
        activation : activation function used for the model.
        init : the two matrices corresponding to the factorization.
        input_rank : rank of the input tensor; must currently be None.
        map_rank : must currently be None; rank mapping is not supported.
        bias : bias for the model.
        init_bias : initial bias value.
        name : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert(input_rank is None and
           map_rank is None and
           all(isinstance(s, int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # If input_rank is not given, pass a single _INFERRED;
    # map_rank, if given, will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    # init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1, init=init_weights['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init_weights['W2'], name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r
    return dense
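
# Usage sketch (illustrative; k, n, W1, W2, b, and x stand for precomputed
# factor shapes, factor matrices, a bias, and an input variable, e.g. as
# produced by svd_subprojection):
#
#     >>> layer = dense_factored((k, n), init={'W1': W1, 'W2': W2},
#     ...                        init_bias=b)
#     >>> y = layer(x)   # evaluates x @ W1 @ W2 + b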

# Reference for sklearn.tucker.hooi:
# https://hal.inria.fr/hal-01219316/document
8 changes: 8 additions & 0 deletions bindings/python/cntk/contrib/netopt/test/__init__.py
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""
Tests for network optimization algorithms.
"""

155 changes: 155 additions & 0 deletions bindings/python/cntk/contrib/netopt/test/factorization_test.py
@@ -0,0 +1,155 @@
import numpy as np
import pytest
import cntk as C
import cntk.contrib.netopt.factorization as nc
C.cntk_py.set_fixed_random_seed(1)

# Create a dense network for the tests.
def _create_model_dense(features, num_hidden_layers, hidden_layers_dim, num_output_classes):
    with C.layers.default_options(init=C.layers.glorot_uniform(), activation=C.sigmoid):
        h = features
        for _ in range(num_hidden_layers):
            h = C.layers.Dense(hidden_layers_dim)(h)
        last_layer = C.layers.Dense(num_output_classes, activation=None)

        return last_layer(h)


# No size reduction, only the factorization.
def _get_rank_same_size(W):
    return int(len(W) * 1)


# Reduce the size to 80% of len(W).
def _get_rank_reduced_size(W):
    return int(len(W) * 0.8)


# Filter for dense blocks whose weight matrices have the same height and width.
def _filter(model):
    W = model.W.value
    return len(W) == len(W[0])


# Helper function to generate a random data sample.
def _generate_random_data_sample(sample_size, feature_dim, num_classes):
    Y = np.random.randint(size=(sample_size, 1), low=0, high=num_classes)
    X = (np.random.randn(sample_size, feature_dim) + 3) * (Y + 1)
    X = X.astype(np.float32)
    class_ind = [Y == class_number for class_number in range(num_classes)]
    Y = np.asarray(np.hstack(class_ind), dtype=np.float32)
    return X, Y


def test_svd_factorization():
    # W and its SVD factors (U and sV).
    W = np.array([[1, 0, 0, 0, 2],
                  [0, 0, 3, 0, 0],
                  [0, 0, 0, 0, 0],
                  [0, 0, 0, 2, 0]])

    U = np.array([[0, 1, 0, 0],
                  [1, 0, 0, 0],
                  [0, 0, 0, 1],
                  [0, 0, 1, 0]])

    sV = np.array([[0, 0, 3, 0, 0],
                   [1, 0, 0, 0, 2],
                   [0, 0, 0, 2, 0],
                   [0, 0, 0, 0, 0]])

    # Call svd_subprojection with W's full height, i.e. no rank reduction.
    W1, W2 = nc.svd_subprojection(W, len(W))

    assert np.array_equal(W1, U)
    assert np.allclose(sV, W2)


def test_factor_dense():

    input_dim = 2
    num_output_classes = 2
    hidden_layer_dim = 50

    input = C.input_variable(input_dim)
    z = _create_model_dense(input, input_dim, hidden_layer_dim, num_output_classes)
    blocks = C.logging.graph.depth_first_search(
        z, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)

    newz = nc.factor_dense(z, projection_function=_get_rank_same_size, filter_function=_filter)
    newblocks = C.logging.graph.depth_first_search(
        newz, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)

    assert newblocks[1].op_name == "DenseFactored"
    block_root = C.as_composite(newblocks[1].block_root)
    # No reduction: same size, but factored.
    assert block_root.W1.value.shape == (50, 50)

    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function=_filter)
    newblocks = C.logging.graph.depth_first_search(
        newz, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)
    assert newblocks[1].op_name == "DenseFactored"
    block_root = C.as_composite(newblocks[1].block_root)
    # The reduction has now taken place.
    assert block_root.W1.value.shape == (50, 40)


def _percentage_match(labels, predictions):
    match_count = 0
    for idx, lbl in enumerate(labels):
        if np.argmax(lbl) == np.argmax(predictions[idx]):
            match_count += 1
    return match_count / len(labels) * 100 if len(labels) != 0 else 0


def test_factor_dense_for_prediction():

    input_dim = 2
    num_output_classes = 2
    hidden_layer_dim = 50
    num_minibatches_to_train = 2000
    minibatch_size = 25
    learning_rate = 0.5

    input = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    z = _create_model_dense(input, input_dim, hidden_layer_dim, num_output_classes)

    loss = C.cross_entropy_with_softmax(z, label)
    eval_error = C.classification_error(z, label)

    # Instantiate the trainer object to drive the model training.
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = C.sgd(z.parameters, lr_schedule)
    trainer = C.Trainer(z, (loss, eval_error), [learner])

    # Run the trainer and perform model training.
    for _ in range(num_minibatches_to_train):
        features, labels = _generate_random_data_sample(minibatch_size, input_dim, num_output_classes)
        # Map the input variables in the model to actual minibatch data.
        trainer.train_minibatch({input: features, label: labels})

    # Generate some data to predict.
    features, labels = _generate_random_data_sample(10, 2, 2)

    # Factor the model.
    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function=_filter)
    original_out = C.softmax(z)
    factored_out = C.softmax(newz)

    original_labels_probs = original_out.eval({input: features})
    predicted_label_probs = factored_out.eval({input: features})

    # The reduced model should match the labels on at least 70% of the
    # samples. The test trains on few minibatches to stay fast, so the
    # bar is kept low.
    assert _percentage_match(labels, predicted_label_probs) >= 70
