diff --git a/Math/CNTKMathTest/CNTKMathTest.vcxproj b/Math/CNTKMathTest/CNTKMathTest.vcxproj
deleted file mode 100644
index 748d162c1d0d..000000000000
--- a/Math/CNTKMathTest/CNTKMathTest.vcxproj
+++ /dev/null
@@ -1,137 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{6CEE834A-8104-46A8-8902-64C81BD7928F}</ProjectGuid>
-    <SccProjectName>
-    </SccProjectName>
-    <SccAuxPath>
-    </SccAuxPath>
-    <SccLocalPath>
-    </SccLocalPath>
-    <SccProvider>
-    </SccProvider>
-    <Keyword>Win32Proj</Keyword>
-    <RootNamespace>CNTKMathTest</RootNamespace>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-    <UseOfMfc>false</UseOfMfc>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-    <UseOfMfc>false</UseOfMfc>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.0.props" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>..\..\common\include\;$(IncludePath)</IncludePath>
-    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>..\..\common\include\;$(IncludePath);$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
-    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
-  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
-    <ClCompile>
-      <PrecompiledHeader>NotUsing</PrecompiledHeader>
-      <WarningLevel>Level4</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <AdditionalIncludeDirectories>$(VCInstallDir)UnitTest\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>WIN32;_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <UseFullPaths>true</UseFullPaths>
-      <OpenMPSupport>true</OpenMPSupport>
-      <TreatWarningAsError>true</TreatWarningAsError>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalLibraryDirectories>$(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-    </Link>
-    <CudaCompile>
-      <TargetMachinePlatform>64</TargetMachinePlatform>
-      <CodeGeneration>compute_20,sm_20;compute_30,sm_30;%(CodeGeneration)</CodeGeneration>
-    </CudaCompile>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
-    <ClCompile>
-      <WarningLevel>Level4</WarningLevel>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <AdditionalIncludeDirectories>$(VCInstallDir)UnitTest\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>WIN32;NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <UseFullPaths>true</UseFullPaths>
-      <MultiProcessorCompilation>true</MultiProcessorCompilation>
-      <TreatWarningAsError>true</TreatWarningAsError>
-      <OpenMPSupport>false</OpenMPSupport>
-      <AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalLibraryDirectories>$(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
-    </Link>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <ClInclude Include="stdafx.h" />
-    <ClInclude Include="targetver.h" />
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="GPUWatcherTests.cpp" />
-    <ClCompile Include="MatrixQuantizerTests.cpp" />
-    <ClCompile Include="stdafx.cpp">
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
-    </ClCompile>
-    <ClCompile Include="..\..\Common\DebugUtil.cpp">
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
-    </ClCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <ProjectReference Include="..\Math\Math.vcxproj">
-      <Project>{60bdb847-d0c4-4fd3-a947-0c15c08bcdb5}</Project>
-      <Private>true</Private>
-      <ReferenceOutputAssembly>true</ReferenceOutputAssembly>
-      <CopyLocalSatelliteAssemblies>false</CopyLocalSatelliteAssemblies>
-      <LinkLibraryDependencies>true</LinkLibraryDependencies>
-      <UseLibraryDependencyInputs>false</UseLibraryDependencyInputs>
-    </ProjectReference>
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.0.targets" />
-  </ImportGroup>
-</Project>
\ No newline at end of file
diff --git a/Math/CNTKMathTest/GPUWatcherTests.cpp b/Math/CNTKMathTest/GPUWatcherTests.cpp
deleted file mode 100644
index 223b3e7abbdf..000000000000
--- a/Math/CNTKMathTest/GPUWatcherTests.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-//
-// <copyright file="GPUWatcherTests.cpp" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-#include "stdafx.h"
-#include "CppUnitTest.h"
-#include "..\Math\GPUWatcher.h"
-
-#define epsilon 0.000001
-#define IDX2C(i,j,ld) (((j)*(ld))+(i)) // 0 based indexing
-
-using namespace Microsoft::VisualStudio::CppUnitTestFramework;
-
-#if 0   // BUGBUG: This somehow does not link, GetFreeMemoryOnCUDADevice is missing from CNTKMath.dll. Commening out so that we can at least get all other tests to run.
-namespace CNTKMathTest
-{    
-    TEST_CLASS(GPUWatcherTests)
-    {        
-
-    public:
-
-        //This test should fail if you don't have CUDA GPU (or working under remote desktop)
-        TEST_METHOD(GetFreeMemoryOnCUDADeviceTest)
-        {
-            size_t x = GPUWatcher::GetFreeMemoryOnCUDADevice(0);
-            Assert::IsTrue(x>0);
-        }
-    };
-}
-#endif
diff --git a/Math/CNTKMathTest/MatrixQuantizerTests.cpp b/Math/CNTKMathTest/MatrixQuantizerTests.cpp
deleted file mode 100644
index e3a812e6a2f1..000000000000
--- a/Math/CNTKMathTest/MatrixQuantizerTests.cpp
+++ /dev/null
@@ -1,505 +0,0 @@
-//
-// <copyright file="MatrixQuantizerTests.cpp" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-#include "stdafx.h"
-#include "CppUnitTest.h"
-#include "File.h"
-#include <memory>
-#include <io.h>
-#include <type_traits>
-
-#include "..\..\common\include\fileutil.h"
-#include "..\..\common\include\File.h"
-#include "..\..\common\File.cpp"
-#include "..\..\common\fileutil.cpp"
-#include <string>
-
-#include "..\Math\MatrixQuantizer.h"
-#include "..\Math\CUDAPageLockedMemAllocator.h"
-#include "..\Math\ValueQuantizer.h"
-
-#define DEBUG_FLAG 1
-using namespace Microsoft::MSR::CNTK;
-
-#pragma warning (disable: 4305)
-
-using namespace Microsoft::MSR::CNTK;
-using namespace Microsoft::VisualStudio::CppUnitTestFramework;
-
-//#define DEBUG_OUTPUT_PATH L"E:/temp/MatrixQuantizerTest.out.txt"
-
-#pragma warning (disable: 4996)
-
-void RedirectStdErrAndStdOut(wstring logpath)
-{
-    fprintf(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
-    auto f = make_shared<File>(logpath.c_str(), fileOptionsWrite | fileOptionsText);
-    if (dup2(fileno(*f), 1) == -1)
-        RuntimeError("unexpected failure to redirect stdout to log file");
-    if (dup2(fileno(*f), 2) == -1)
-        RuntimeError("unexpected failure to redirect stderr to log file");
-    setvbuf(stderr, NULL, _IONBF, 16384);   // unbuffer it
-    static auto fKept = f;                  // keep it around (until it gets changed)
-}
-
-namespace CNTKMathTest
-{
-    TEST_CLASS(MatrixQuantizerTests)
-    {
-    private:
-
-        static const float SINGLE_PRECISION_TOLERANCE;
-        static const double DOUBLE_PRECISION_TOLERANCE;
-        static const float SINGLE_PRECISION_GPU_QUANTIZATION_TOLERANCE;
-
-        template <typename ElemType>
-        static void ReferenceCPUQuantizer(
-            size_t numBits,
-            size_t numRows,
-            size_t numCols,
-            const ElemType* inMatrix,
-            const ElemType* prevResidualMatrix,
-            const ElemType* prevOutMatrix,
-            ElemType* outMatrix,
-            ElemType* outResidualMatrix,
-            bool zeroThresholdFor1Bit)
-        {
-            typedef typename QuantizedWordHelper<ElemType>::ValueType QWordVal;
-            typedef typename QuantizedWordHelper<ElemType>::ValueTypeSigned QWordValSigned;
-
-            // Just pass through the values if numBits is of the full size of the ElemType
-            if (numBits == (8 * sizeof(ElemType)))
-            {
-                for (size_t j = 0; j < numCols; j++)
-                {
-                    for (int i = 0; i < numRows; i++)
-                    {
-                        size_t flatIdx = (j * numRows) + i;
-                        ElemType val = inMatrix[flatIdx] + prevResidualMatrix[flatIdx];
-                        outMatrix[flatIdx] = prevOutMatrix[flatIdx] + val;
-                        outResidualMatrix[flatIdx] = 0;
-                    }
-                }
-
-                return;
-            }
-
-            for (size_t j = 0; j < numCols; j++)
-            {
-                ElemType mean = 0.0f;
-                if (!zeroThresholdFor1Bit || (numBits != 1))
-                {
-                    ElemType sum = (ElemType)0.0;
-                    for (int i = 0; i < numRows; i++)
-                    {
-                        size_t flatIdx = (j * numRows) + i;
-                        sum += inMatrix[flatIdx] + prevResidualMatrix[flatIdx];
-                    }
-                    mean = sum / numRows;
-                }
-
-                ElemType radius = 0.0f;
-                ElemType newMean = 0.0f;
-                ElemType quantiMin;
-                ElemType quantiMax;
-                if (numBits == 1)
-                {
-                    // Calculate the mean0 and mean1 for each column
-                    ElemType mean0Sum = 0.0f;
-                    ElemType mean1Sum = 0.0f;
-                    int num0 = 0;
-                    int num1 = 0;
-                    for (int i = 0; i < numRows; i++)
-                    {
-                        size_t flatIdx = (j * numRows) + i;
-                        ElemType val = inMatrix[flatIdx] + prevResidualMatrix[flatIdx];
-                        if (val < mean)
-                        {
-                            mean0Sum += val;
-                            num0++;
-                        }
-                        else
-                        {
-                            mean1Sum += val;
-                            num1++;
-                        }
-                    }
-
-                    if (!zeroThresholdFor1Bit)
-                    {
-                        // we minimize the error jointly across positive and negative numbers to make things
-                        // symmetrical around the mean (which may be non-zero) tying the two sides
-                        ElemType devacc0 = (num0 * mean) - mean0Sum;
-                        ElemType devacc1 = mean1Sum - (num1 * mean);
-
-                        // both deviations tied, to ensure consistent mean
-                        ElemType dev = (devacc0 + devacc1) / numRows;
-                        radius = (ElemType)2.0 * dev;
-                        newMean = mean;
-                    }
-                    else
-                    {
-                        // happens for all-zero columns which do exist (mean0 is 0 in that case)
-                        if (num0 == 0) num0 = 1;
-                        if (num1 == 0) num1 = 1;
-
-                        const ElemType mean0 = mean0Sum / num0;
-                        const ElemType mean1 = mean1Sum / num1;
-
-                        newMean = (ElemType)0.5 * (mean0 + mean1);
-                        radius = (ElemType)2.0 * (mean1 - newMean);
-                    }
-
-                    quantiMin = newMean - radius;
-                    quantiMax = newMean + radius;
-                }
-                else
-                {
-                    // >1 bit:
-                    // We linearly quantize between 'stddevs' standard deviations.
-                    ElemType stddevs = 5.0f;
-                    ElemType varacc = 0.0f;
-                    for (int i = 0; i < numRows; i++)
-                    {
-                        size_t flatIdx = (j * numRows) + i;
-                        ElemType val = inMatrix[flatIdx] + prevResidualMatrix[flatIdx];
-                        varacc += (val - mean) * (val - mean);
-                    }
-
-                    ElemType stddev = sqrt(varacc / numRows);
-                    quantiMin = mean - (stddevs * stddev);
-                    quantiMax = mean + (stddevs * stddev);
-                }
-
-                ElemType qFactor;
-                ElemType uFactor;
-                QWordVal rangeSize = ((QWordVal)1) << numBits;
-
-                // must protect against NaN: interval is 0 -> quantization is futile, just emit 0
-                if (((quantiMax - quantiMin) < 1e-36f) || (rangeSize == 0))
-                {
-                    qFactor = uFactor = (ElemType)0.0;
-                }
-                else
-                {
-                    qFactor = rangeSize / (quantiMax - quantiMin);
-                    uFactor = (quantiMax - quantiMin) / rangeSize;
-                }
-
-                for (int i = 0; i < numRows; i++)
-                {
-                    size_t flatIdx = (j * numRows) + i;
-                    ElemType val = inMatrix[flatIdx] + prevResidualMatrix[flatIdx];
-                    ElemType qVal;
-
-                    if (numBits == 1)
-                    {
-                        if (val < mean)
-                        {
-                            qVal = newMean - ((ElemType)0.5 * radius);
-                        }
-                        else
-                        {
-                            qVal = newMean + ((ElemType)0.5 * radius);
-                        }
-                    }
-                    else
-                    {
-                        QWordValSigned result;
-                        if (val <= quantiMin)
-                        {
-                            result = 0;
-                        }
-                        else if (val >= quantiMax)
-                        {
-                            result = (QWordValSigned)(rangeSize - 1);
-                        }
-                        else
-                        {
-                            result = (QWordValSigned)((val - quantiMin) * qFactor);
-                        }
-
-                        qVal = (((QWordVal)result + (ElemType)0.5) * uFactor) + quantiMin;
-                    }
-
-                    outMatrix[flatIdx] = prevOutMatrix[flatIdx] + qVal;
-                    outResidualMatrix[flatIdx] = val - qVal;
-                }
-            }
-        }
-
-        template <typename ElemType>
-        static void TestQuantization(
-            size_t numBits,
-            size_t numRows,
-            size_t numCols,
-            ElemType rangeLow,
-            ElemType rangeHigh,
-            int seed,
-            int numIterations,
-            short deviceId,
-            bool zeroThresholdFor1Bit)
-        {
-            auto verifyAllZerosFunc = [](const Matrix<ElemType>& matrix) {
-                ElemType* cpuMatrix = matrix.CopyToArray();
-                size_t numMatrixElems = matrix.GetNumElements();
-                for (size_t i = 0; i < numMatrixElems; ++i)
-                {
-                    Assert::IsTrue(cpuMatrix[i] == ((ElemType)0));
-                }
-
-                delete[] cpuMatrix;
-            };
-
-            MemAllocator* allocator = nullptr;
-            if (deviceId != CPUDEVICE)
-            {
-                allocator = new CUDAPageLockedMemAllocator(deviceId);
-            }
-
-            Matrix<ElemType> inMatrix(numRows, numCols, deviceId);
-            auto quantizer = MatrixQuantizer<ElemType>::CreateMatrixQuantizer(numRows, numCols, deviceId);
-
-            // Verify that the initial residue is comprised of all zeros
-            verifyAllZerosFunc(quantizer->GetResidualMatrix());
-
-            Matrix<ElemType> outMatrix(numRows, numCols, deviceId);
-            // Verify that the outMatrix is initialized with all zeros
-            verifyAllZerosFunc(outMatrix);
-
-            for (int iterNum = 0; iterNum < numIterations; ++iterNum)
-            {
-                inMatrix = Matrix<ElemType>::RandomUniform(numRows, numCols, rangeLow, rangeHigh, seed + iterNum, deviceId);
-
-                ElemType* gpuInMatrix = inMatrix.CopyToArray();
-                ElemType* gpuPrevResidualMatrix = quantizer->GetResidualMatrix().CopyToArray();
-                ElemType *gpuPrevOutMatrix = outMatrix.CopyToArray();
-
-#ifdef DEBUG_OUTPUT_PATH
-                bool peekOnly = true;
-                const size_t numRowsToPeek = 3;
-                const size_t numColsToPeek = 3;
-                size_t numRowsToPrint;
-                size_t numColsToPrint;
-                if (peekOnly)
-                {
-                    numRowsToPrint = (std::min)(numRowsToPeek, numRows);
-                    numColsToPrint = (std::min)(numColsToPeek, numCols);
-                }
-                else
-                {
-                    numRowsToPrint = numRows;
-                    numColsToPrint = numCols;
-                }
-
-                inMatrix.Print("Input Matrix", 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
-                quantizer->GetResidualMatrix().Print("Old Residual Matrix", 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
-                outMatrix.Print("Old Output Matrix", 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
-#endif
-
-                QuantizedMatrix<ElemType> tempCPUQuantizationBuffer(numRows, numCols, numBits, CPUDEVICE, allocator);
-                quantizer->QuantizeAsync(inMatrix, tempCPUQuantizationBuffer, zeroThresholdFor1Bit);
-                quantizer->WaitQuantizeAsyncDone();
-
-#ifdef DEBUG_OUTPUT_PATH
-                tempCPUQuantizationBuffer.Print("Quantized Matrix", 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
-                quantizer->GetResidualMatrix().Print("New residual Matrix", 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
-#endif
-
-                quantizer->UnquantizeAsync(tempCPUQuantizationBuffer, outMatrix, (iterNum > 0));
-                quantizer->WaitUnquantizeAsyncDone();
-
-#ifdef DEBUG_OUTPUT_PATH
-                outMatrix.Print("Unquantized Output Matrix", 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
-#endif
-
-                // Now verify the quantization results
-                ElemType* gpuNewResidualMatrix = quantizer->GetResidualMatrix().CopyToArray();
-                ElemType* gpuNewOutMatrix = outMatrix.CopyToArray();
-
-                ElemType PRECISION_TOLERANCE = (std::is_same<ElemType, double>::value) ? ((ElemType)DOUBLE_PRECISION_TOLERANCE) : SINGLE_PRECISION_TOLERANCE;
-                ElemType tolerance = 0.0f;
-                if (numBits != (8 * sizeof(ElemType)))
-                {
-                    tolerance = (rangeHigh - rangeLow) * PRECISION_TOLERANCE;
-                }
-                // First verify that (cpuInMatrix + cpuPrevResidualMatrix + cpuPrevOutMatrix == gpuNewResidualMatrix + gpuNewOutMatrix)
-                size_t numMatrixElems = inMatrix.GetNumElements();
-                for (size_t i = 0; i < numMatrixElems; ++i)
-                {
-                    Assert::IsTrue(fabs((gpuInMatrix[i] + gpuPrevResidualMatrix[i] + gpuPrevOutMatrix[i]) - (gpuNewResidualMatrix[i] + gpuNewOutMatrix[i])) <= tolerance);
-                }
-
-                size_t numIncorrectAllowed = 0;
-                if (std::is_same<ElemType, float>::value && (deviceId >= 0))
-                {
-                    // We allow a small number of incorrect results when computing on the GPU
-                    // for single precision since, in rare cases, the value of the CPU and GPU
-                    // may quantize to different integers resulting in difference larger than 
-                    // what is allowed by tolerance
-                    numIncorrectAllowed = (std::max)((size_t)1, (size_t)(numMatrixElems * SINGLE_PRECISION_GPU_QUANTIZATION_TOLERANCE));
-                }
-
-                // Now verify against the reference CPU quantizer
-                size_t numIncorrectOutValue = 0;
-                size_t numIncorrectResidualValue = 0;
-                ElemType* refNewOutMatrix = new ElemType[numMatrixElems];
-                ElemType* refNewResidualMatrix = new ElemType[numMatrixElems];
-                ReferenceCPUQuantizer(numBits, numRows, numCols, gpuInMatrix, gpuPrevResidualMatrix, gpuPrevOutMatrix, refNewOutMatrix, refNewResidualMatrix, zeroThresholdFor1Bit);
-                for (size_t i = 0; i < numMatrixElems; ++i)
-                {
-                    if (fabs(gpuNewOutMatrix[i] - refNewOutMatrix[i]) > tolerance)
-                    {
-                        numIncorrectOutValue++;
-                        if (numIncorrectOutValue > numIncorrectAllowed)
-                        {
-                            Assert::IsTrue(fabs(gpuNewOutMatrix[i] - refNewOutMatrix[i]) <= tolerance);
-                        }
-                    }
-
-                    if (fabs(gpuNewResidualMatrix[i] - refNewResidualMatrix[i]) > tolerance)
-                    {
-                        numIncorrectResidualValue++;
-                        if (numIncorrectResidualValue > numIncorrectAllowed)
-                        {
-                            Assert::IsTrue(fabs(gpuNewResidualMatrix[i] - refNewResidualMatrix[i]) <= tolerance);
-                        }
-                    }
-                }
-
-                delete[] gpuInMatrix;
-                delete[] gpuPrevResidualMatrix;
-                delete[] gpuPrevOutMatrix;
-                delete[] gpuNewResidualMatrix;
-                delete[] gpuNewOutMatrix;
-                delete[] refNewOutMatrix;
-                delete[] refNewResidualMatrix;
-            }
-
-            delete quantizer;
-            delete allocator;
-        }
-
-        template <typename ElemType>
-        static void TestQuantization(short deviceId)
-        {
-            // Test quantization for all power of 2 bit sizes
-            const size_t maxNumBits = 8 * sizeof(ElemType);
-            for (size_t numBits = 1; numBits <= maxNumBits; numBits = numBits * 2)
-            {
-                // Test 1 bit quantization both with and without zeroThresholdFor1Bit setting
-                for (int i = 0; i < 2; ++i)
-                {
-                    bool zeroThresholdFor1Bit = (i == 1);
-
-                    // zeroThresholdFor1Bit test applicable only for 1 bit
-                    if ((numBits != 1) && zeroThresholdFor1Bit)
-                    {
-                        continue;
-                    }
-
-                    // Test quantization on a matrix initialized with floating point numbers between -1 and + 1
-                    size_t numRows = 256;
-                    size_t numCols = 135;
-                    float rangeLow = -1.0f;
-                    float rangeHigh = 1.0f;
-                    int seed = 2015;
-                    int numIterations = 5;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test a matrix with smaller range of values
-                    seed += 100;
-                    rangeLow = -0.005f;
-                    rangeHigh = 0.005f;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test a matrix with a very small range of values
-                    seed += 100;
-                    rangeLow = -0.00001f;
-                    rangeHigh = 0.00001f;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test a matrix with larger range of values
-                    seed += 100;
-                    rangeLow = -10.0f;
-                    rangeHigh = 10.0f;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test a matrix with assymmetric range of values
-                    seed += 100;
-                    rangeLow = -1.0f;
-                    rangeHigh = 2.05f;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test a matrix with a single column
-                    seed += 100;
-                    rangeLow = -0.5f;
-                    rangeHigh = 0.5f;
-                    numRows = 489;
-                    numCols = 1;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test a matrix with a single row
-                    seed += 100;
-                    rangeLow = -0.5f;
-                    rangeHigh = 0.5f;
-                    numRows = 1;
-                    numCols = 135;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test a matrix with a number of rows that is not a multiple of the number of bits in a quantized word
-                    seed += 100;
-                    rangeLow = -0.5f;
-                    rangeHigh = 0.5f;
-                    numRows = 89;
-                    numCols = 23;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test a matrix with a number of rows less than number of bits in a quantized word
-                    seed += 100;
-                    rangeLow = -0.5f;
-                    rangeHigh = 0.5f;
-                    numRows = 15;
-                    numCols = 135;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-
-                    // Test with a large matrix
-                    seed += 100;
-                    rangeLow = -0.5f;
-                    rangeHigh = 0.5f;
-                    numRows = 737;
-                    numCols = 373;
-                    TestQuantization<ElemType>(numBits, numRows, numCols, rangeLow, rangeHigh, seed, numIterations, deviceId, zeroThresholdFor1Bit);
-                }
-            }
-        }
-
-    public:
-        //This test will fail without GPU
-        TEST_METHOD(Matrix1BitQuantize)
-        {
-#ifdef DEBUG_OUTPUT_PATH
-            RedirectStdErrAndStdOut(DEBUG_OUTPUT_PATH);
-#endif
-            const int GPUDEVICE = 0;
-
-            // Test single precision 1bit quantization on GPU
-            TestQuantization<float>(GPUDEVICE);
-
-            // Test double precision 1bit quantization on GPU
-            TestQuantization<double>(GPUDEVICE);
-
-            // Test single precision 1bit quantization on CPU
-            TestQuantization<float>(CPUDEVICE);
-
-            // Test double precision 1bit quantization on CPU
-            TestQuantization<double>(CPUDEVICE);
-        }
-    };
-
-    /*static*/ const float MatrixQuantizerTests::SINGLE_PRECISION_TOLERANCE = 0.00005f;
-    /*static*/ const double MatrixQuantizerTests::DOUBLE_PRECISION_TOLERANCE = 0.000000001;
-    /*static*/ const float MatrixQuantizerTests::SINGLE_PRECISION_GPU_QUANTIZATION_TOLERANCE = 0.0001f;
-}
diff --git a/Math/CNTKMathTest/stdafx.cpp b/Math/CNTKMathTest/stdafx.cpp
deleted file mode 100644
index c33c3d11bc0c..000000000000
--- a/Math/CNTKMathTest/stdafx.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-//
-// <copyright file="stdafx.cpp" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-// stdafx.cpp : source file that includes just the standard includes
-// CNTKMathTest.pch will be the pre-compiled header
-// stdafx.obj will contain the pre-compiled type information
-
-#include "stdafx.h"
-
-// TODO: reference any additional headers you need in STDAFX.H
-// and not in this file
diff --git a/Math/CNTKMathTest/stdafx.h b/Math/CNTKMathTest/stdafx.h
deleted file mode 100644
index bc30f65ba20c..000000000000
--- a/Math/CNTKMathTest/stdafx.h
+++ /dev/null
@@ -1,20 +0,0 @@
-//
-// <copyright file="stdafx.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-// stdafx.h : include file for standard system include files,
-// or project specific include files that are used frequently, but
-// are changed infrequently
-//
-
-#pragma once
-
-#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms
-#include "targetver.h"
-
-// Headers for CppUnitTest
-#pragma warning (disable: 4505) // 'Microsoft::VisualStudio::CppUnitTestFramework::ToString' : unreferenced local function has been removed
-#include "CppUnitTest.h"
-
-// TODO: reference additional headers your program requires here
diff --git a/Math/CNTKMathTest/targetver.h b/Math/CNTKMathTest/targetver.h
deleted file mode 100644
index e0f1e69ca9f1..000000000000
--- a/Math/CNTKMathTest/targetver.h
+++ /dev/null
@@ -1,13 +0,0 @@
-//
-// <copyright file="targetver.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-#pragma once
-
-// Including SDKDDKVer.h defines the highest available Windows platform.
-
-// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
-// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
-
-#include <SDKDDKVer.h>
diff --git a/Tests/UnitTests/MathTests/MathTests.vcxproj b/Tests/UnitTests/MathTests/MathTests.vcxproj
index f80fadeb9623..7040ed6cc54c 100644
--- a/Tests/UnitTests/MathTests/MathTests.vcxproj
+++ b/Tests/UnitTests/MathTests/MathTests.vcxproj
@@ -127,6 +127,7 @@
     <ClCompile Include="MatrixDataSynchronizationTests.cpp" />
     <ClCompile Include="MatrixFileWriteReadTests.cpp" />
     <ClCompile Include="MatrixQuantizerTests.cpp" />
+    <ClCompile Include="MatrixSparseDenseInteractionsTests.cpp" />
     <ClCompile Include="MatrixTests.cpp" />
     <ClCompile Include="stdafx.cpp">
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
diff --git a/Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp b/Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp
index e51767379b85..0b7e5b5e0644 100644
--- a/Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp
+++ b/Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp
@@ -45,9 +45,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
 #else
     static bool createDebugOut = false;
 #endif
-    static const float c_SinglePrecisionTolerance = 0.00005f;
-    static const double c_DoublePrecisionTolerance = 0.000000001;
-    static const float c_SinglePrecisionGpuQuantizationTolerance = 0.0001f;
+    //static const float c_SinglePrecisionTolerance = 0.00005f;
+    //static const double c_DoublePrecisionTolerance = 0.000000001;
+    //static const float c_SinglePrecisionGpuQuantizationTolerance = 0.0001f;
+    static const float c_SinglePrecisionTolerance = 0.0001f;
+    static const double c_DoublePrecisionTolerance = 0.00000001;
+    static const float c_SinglePrecisionGpuQuantizationTolerance = 0.001f;
 
     template <typename ElemType>
     static void ReferenceCPUQuantizer(