forked from microsoft/CNTK
Commit
Showing 4 changed files with 327 additions and 0 deletions.
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""
Network optimization algorithms.
"""
@@ -0,0 +1,156 @@
import cntk
from cntk.ops.functions import BlockFunction
from cntk.variables import Parameter
from cntk.ops import times
from cntk.internal import _as_tuple
from cntk.layers.blocks import _initializer_for, _INFERRED, identity
from cntk.layers.blocks import UntestedBranchError  # helpers
from cntk.default_options import is_default_override
from cntk.default_options import get_default_override, default_override_or

def svd_subprojection(matrix, k):
    '''
    Calculate the SVD of the matrix and produce a rank-k subprojection.

    Args:
        matrix : an input matrix
        k (int): desired rank of the output matrices

    Returns:
        two matrices that, multiplied together, give a rank-k
        approximation of the original matrix.
    '''

    import numpy as np
    from numpy import dot, diag
    from numpy.linalg import svd

    # Decompose W into (U, s, V)
    U, s, V = svd(matrix, full_matrices=False)

    # Create two dense layers from this; one that takes U, one that takes
    # dot(s, V), but restrict them both to rank k, such that the result is a
    # rank-k subprojection
    W1 = np.ascontiguousarray(U[:, :k])
    W2 = dot(diag(s[:k]), V[:k, :])

    return W1, W2

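As a quick illustration (a sketch, not part of this commit): at full rank the two factors reconstruct the matrix exactly, and at lower rank the residual comes entirely from the dropped singular values (Eckart-Young):

    import numpy as np

    W = np.random.randn(6, 4)
    W1, W2 = svd_subprojection(W, k=4)    # full rank: exact reconstruction
    assert np.allclose(W1 @ W2, W)

    W1, W2 = svd_subprojection(W, k=2)    # rank 2: best rank-2 approximation
    print(np.linalg.norm(W - W1 @ W2))    # norm of the dropped singular values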

def factor_dense(model, projection_function=None, filter_function=None,
                 factor_function=None):
    '''
    Reduce the size of a dense model using the provided factor_function
    and projection_function. filter_function is used to select the dense
    layers to which the reduction is applied. If no factor_function is
    specified, SVD decomposition is used.

    Args:
        model : dense model.
        projection_function : determines the new size (target rank k) of the
                              dense model. It can be based on the shape of
                              the weight matrix or other heuristics.
                              factor_function can choose to ignore the value k.
        filter_function : selects the layers in the model to which the
                          factorization is applied.
        factor_function : factors the dense model (e.g. SVD).

    Returns:
        a model that is factored and reduced in size.
    '''
    if factor_function is None and projection_function is None:
        raise ValueError("Dense: default factor function (svd) requires a projection_function.")

    # The parentheses around the conditional keep the Dense checks in effect
    # when no filter_function is given; without them, the whole predicate
    # collapses to True for every node.
    dense_filter = (lambda x: type(x) == cntk.Function
                    and x.op_name == 'Dense'
                    and (filter_function(x) if filter_function else True))

    def dense_converter(model):
        W, b = model.W.value, model.b.value

        ht, wdth = W.shape
        # k is the rank of the output matrices. If a projection function is
        # provided, then use it, otherwise assign min of the two dimensions
        # of W to k.
        k = projection_function(W) if projection_function else min(ht, wdth)
        W1, W2 = factor_function(W, k) if factor_function else svd_subprojection(W, k)

        Ws = {'W1': W1, 'W2': W2}
        dfl = dense_factored((k, wdth),
                             init=Ws,
                             activation=None,
                             init_bias=b,
                             name='DenseFactored')(model.inputs[2])
        return dfl

    return cntk.misc.convert(model, dense_filter, dense_converter)

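For illustration only (the helper names below are not part of the commit; they mirror the _get_rank_reduced_size and _filter helpers in the test file further down), the two hooks compose like this:

    def halve_rank(W):                # projection: new rank is half the height
        return max(1, W.shape[0] // 2)

    def square_only(block):           # filter: only factor square weight matrices
        W = block.W.value
        return W.shape[0] == W.shape[1]

    # smaller_model = factor_dense(model, projection_function=halve_rank,
    #                              filter_function=square_only)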

def dense_factored(shapes,  # (shape1, shape2)
                   activation=default_override_or(identity),
                   init={'W1': None, 'W2': None},
                   input_rank=None,
                   map_rank=None,
                   bias=default_override_or(True),
                   init_bias=default_override_or(0),
                   name=''):
    '''
    Perform the new model creation using the factored inputs W1 and W2.
    The returned function represents the new model.

    Args:
        shapes : dimensions of the input matrices.
        activation : activation function used for the model.
        init : the two matrices corresponding to the factorization.
        input_rank : rank of the input tensor.
        map_rank : number of leading axes to map over, as in
                   cntk.layers.Dense (not supported here; must be None).
        bias : bias for the model.
        init_bias : initial bias value.
        name : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensors of rank > 1
    # or selective flattening of ranks
    assert(input_rank is None and
           map_rank is None and
           all(isinstance(s, int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # open question: how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # If input_rank is not given, pass a single _INFERRED;
    # map_rank, if given, will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    # init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1, init=init_weights['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init_weights['W2'], name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b is not None:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r
    return dense

# Reference for sklearn.tucker.hooi:
# https://hal.inria.fr/hal-01219316/document
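A minimal numpy sketch (again, not part of the commit) of the identity the block relies on: with full-rank SVD factors, times(times(x, W1), W2) reproduces the original dense product up to floating-point error:

    import numpy as np

    x = np.random.randn(1, 4).astype(np.float32)
    W = np.random.randn(4, 4).astype(np.float32)
    U, s, V = np.linalg.svd(W, full_matrices=False)
    W1, W2 = U, np.diag(s) @ V    # full-rank factors, as in svd_subprojection
    assert np.allclose(x @ W1 @ W2, x @ W, atol=1e-5)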
@@ -0,0 +1,8 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
"""
Tests for network optimization algorithms.
"""
155 changes: 155 additions & 0 deletions
bindings/python/cntk/contrib/netopt/test/factorization_test.py
@@ -0,0 +1,155 @@
import numpy as np
import pytest
import cntk as C
import cntk.contrib.netopt.factorization as nc

C.cntk_py.set_fixed_random_seed(1)

# create a dense network for the tests
def _create_model_dense(features, num_hidden_layers, hidden_layers_dim, num_output_classes):
    with C.layers.default_options(init=C.layers.glorot_uniform(), activation=C.sigmoid):
        h = features
        for _ in range(num_hidden_layers):
            h = C.layers.Dense(hidden_layers_dim)(h)
        last_layer = C.layers.Dense(num_output_classes, activation=None)

        return last_layer(h)


# no size reduction, only the factorization.
def _get_rank_same_size(W):
    return int(len(W) * 1)


# reduce the rank to 80% of the original size.
def _get_rank_reduced_size(W):
    return int(len(W) * 0.8)


# filter for dense blocks whose weight matrices are square
# (same height and width).
def _filter(model):
    W = model.W.value
    return len(W) == len(W[0])

# Helper function to generate a random data sample
def _generate_random_data_sample(sample_size, feature_dim, num_classes):
    Y = np.random.randint(size=(sample_size, 1), low=0, high=num_classes)
    X = (np.random.randn(sample_size, feature_dim) + 3) * (Y + 1)
    X = X.astype(np.float32)
    class_ind = [Y == class_number for class_number in range(num_classes)]
    Y = np.asarray(np.hstack(class_ind), dtype=np.float32)
    return X, Y


def test_svd_factorization():
    # W and its svd factorization (U and sV). W has orthogonal rows, so U
    # is a permutation matrix ordering the singular values 3, sqrt(5), 2, 0.
    W = np.array([[1, 0, 0, 0, 2],
                  [0, 0, 3, 0, 0],
                  [0, 0, 0, 0, 0],
                  [0, 0, 0, 2, 0]])

    U = np.array([[0, 1, 0, 0],
                  [1, 0, 0, 0],
                  [0, 0, 0, 1],
                  [0, 0, 1, 0]])

    sV = np.array([[0, 0, 3, 0, 0],
                   [1, 0, 0, 0, 2],
                   [0, 0, 0, 2, 0],
                   [0, 0, 0, 0, 0]])

    # call svd factorization with W's height as the rank
    W1, W2 = nc.svd_subprojection(W, len(W))

    assert np.array_equal(W1, U)
    assert np.allclose(sV, W2)


def test_factor_dense():
    input_dim = 2
    num_output_classes = 2
    hidden_layer_dim = 50

    input = C.input_variable(input_dim)
    z = _create_model_dense(input, input_dim, hidden_layer_dim, num_output_classes)
    blocks = C.logging.graph.depth_first_search(
        z, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)

    newz = nc.factor_dense(z, projection_function=_get_rank_same_size, filter_function=_filter)
    newblocks = C.logging.graph.depth_first_search(
        newz, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)

    assert newblocks[1].op_name == "DenseFactored"
    block_root = C.as_composite(newblocks[1].block_root)
    # no reduction, same size but factored.
    assert block_root.W1.value.shape == (50, 50)

    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function=_filter)
    newblocks = C.logging.graph.depth_first_search(
        newz, lambda x: type(x) == C.Function and x.root_function.is_block, depth=0)
    assert newblocks[1].op_name == "DenseFactored"
    block_root = C.as_composite(newblocks[1].block_root)
    # the reduction has taken place now.
    assert block_root.W1.value.shape == (50, 40)


def _percentage_match(labels, predictions):
    match_count = 0
    for idx, lbl in enumerate(labels):
        if np.argmax(lbl) == np.argmax(predictions[idx]):
            match_count += 1
    return match_count / len(labels) * 100 if len(labels) != 0 else 0

def test_factor_dense_for_prediction():
    input_dim = 2
    num_output_classes = 2
    hidden_layer_dim = 50
    num_minibatches_to_train = 2000
    minibatch_size = 25
    learning_rate = 0.5

    input = C.input_variable(input_dim)
    label = C.input_variable(num_output_classes)

    z = _create_model_dense(input, input_dim, hidden_layer_dim, num_output_classes)

    loss = C.cross_entropy_with_softmax(z, label)
    eval_error = C.classification_error(z, label)

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_rate_schedule(learning_rate, C.UnitType.minibatch)
    learner = C.sgd(z.parameters, lr_schedule)
    trainer = C.Trainer(z, (loss, eval_error), [learner])

    # Run the trainer and perform model training
    for i in range(num_minibatches_to_train):
        features, labels = _generate_random_data_sample(minibatch_size, input_dim, num_output_classes)
        # Map the input variables in the model to actual minibatch data for training
        trainer.train_minibatch({input: features, label: labels})

    # generate some data to predict
    features, labels = _generate_random_data_sample(10, 2, 2)

    # factor the model.
    newz = nc.factor_dense(z, projection_function=_get_rank_reduced_size, filter_function=_filter)
    original_out = C.softmax(z)
    factored_out = C.softmax(newz)

    original_label_probs = original_out.eval({input: features})
    predicted_label_probs = factored_out.eval({input: features})

    # The reduced model should match the labels at least 70% of the time.
    # For the test, we reduced the number of training minibatches, so the
    # match is lower than in full training.
    assert _percentage_match(labels, predicted_label_probs) >= 70
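Assuming a Python environment with this CNTK build on the path, the new tests would typically be run with pytest against the file added above:

    pytest bindings/python/cntk/contrib/netopt/test/factorization_test.py -v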