forked from microsoft/CNTK
Commit: Added an E2E Speech DNN parallel training test with no gradient quantization
Showing 7 changed files with 5,937 additions and 0 deletions.
Tests/Speech/DNN/ParallelNoQuantization/baseline.cpu.txt: 1,426 additions, 0 deletions (large diff not rendered by default)
Tests/Speech/DNN/ParallelNoQuantization/baseline.gpu.txt: 1,426 additions, 0 deletions (large diff not rendered by default)
Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.cpu.txt: 1,453 additions, 0 deletions (large diff not rendered by default)
Tests/Speech/DNN/ParallelNoQuantization/baseline.windows.gpu.txt: 1,453 additions, 0 deletions (large diff not rendered by default)
@@ -0,0 +1,40 @@
#!/bin/bash
if [ "$TEST_DEVICE" == "cpu" ]; then
    CNTK_DEVICE_ID=-1
elif [ "$TEST_DEVICE" == "gpu" ]; then
    CNTK_DEVICE_ID=0
else
    echo "Error: Unknown TEST_DEVICE specified!"
    exit 3
fi

configFile=$TEST_DIR/../cntk.config
RunDir=$TEST_RUN_DIR
DataDir=$TEST_DATA_DIR

if [ "$OS" == "Windows_NT" ]; then
    # When running under Cygwin, translate /cygdrive/xxx paths to proper Windows paths:
    configFile=$(cygpath -aw $configFile)
    RunDir=$(cygpath -aw $RunDir)
    DataDir=$(cygpath -aw $DataDir)
fi

# Since we use the MS MPI program on Windows, the CNTK binary path argument
# passed to mpiexec must be in the Windows format
CNTKBinaryPath=$TEST_CNTK_BINARY
if [ "$OS" == "Windows_NT" ]; then
    CNTKBinaryPath=$(cygpath -aw $CNTKBinaryPath)
fi

MPI_ARGS="-n 3"
CNTK_ARGS="configFile=$configFile RunDir=$RunDir DataDir=$DataDir DeviceId=$CNTK_DEVICE_ID stderr=$RunDir/stderr"
MODELS_DIR=$TEST_RUN_DIR/models
[ -d $MODELS_DIR ] && rm -rf $MODELS_DIR
mkdir -p $MODELS_DIR || exit $?
echo === Running "$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
"$MPI_BINARY" $MPI_ARGS $CNTKBinaryPath $CNTK_ARGS
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/stderr_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/stderr_speechTrain.logrank1
sed 's/^/MPI Rank 2: /' $TEST_RUN_DIR/stderr_speechTrain.logrank2
exit $ExitCode
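
For local debugging, the driver above could be exercised by exporting the environment variables it consumes before invoking it. The values below are a hypothetical sketch; the paths, the device choice, and the script's file name are assumptions, not part of this commit:

# Hypothetical manual invocation -- all values below are placeholders
export TEST_DEVICE=gpu                                    # or "cpu"; anything else exits with code 3
export TEST_DIR=Tests/Speech/DNN/ParallelNoQuantization   # directory holding this test
export TEST_RUN_DIR=/tmp/ParallelNoQuantization.run       # scratch directory for logs and models
export TEST_DATA_DIR=Tests/Speech/Data                    # assumed location of the speech data
export TEST_CNTK_BINARY=build/release/bin/cntk            # assumed CNTK binary path
export MPI_BINARY=mpiexec                                 # MPI launcher used to start the 3 ranks
bash $TEST_DIR/run-test                                   # assumed script name; not shown in the rendered diff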
@@ -0,0 +1,40 @@
dataDir: ../../Data
tags:
  # runs on every BVT job in the 'S' (Speech) leg, in the Debug-GPU and Release-CPU configurations:
  - bvt-s (flavor=='debug') ^ (device=='cpu')
  # runs unconditionally on every Nightly job in the 'S' leg
  - nightly-s

testCases:
  Must train epochs in exactly the same order and with the same parameters for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Starting Epoch {{integer}}
      - learning rate per sample = {{float}}
      - momentum = {{float}}

  Epochs must finish with the expected results for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Finished Epoch[{{integer}} of {{integer}}]
      - TrainLossPerSample = {{float,tolerance=0.01%}}
      - EvalErrPerSample = {{float,tolerance=0.01%}}
      - Ave LearnRatePerSample = {{float,tolerance=0%}}

  Per-minibatch training results must match for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}} of {{integer}}]
      - SamplesSeen = {{integer}}
      - TrainLossPerSample = {{float,tolerance=0.01%}}
      - EvalErr[0]PerSample = {{float,tolerance=0.01%}}

  DataParallelSGD training parameters must match for each MPI Rank:
    patterns:
      - ^MPI Rank {{integer}}
      - Starting minibatch loop
      - DataParallelSGD training
      - MyRank = {{integer}}
      - NumNodes = 3
      - NumGradientBits = 32
      - Distributed reading is ENABLED
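
As an illustration of how these patterns are meant to match, a log line of roughly the following shape would satisfy the per-minibatch test case above. The line and all numeric values are invented for this sketch and are not taken from the checked-in baselines:

MPI Rank 1: Epoch[1 of 3]-Minibatch[1-10 of 320]: SamplesSeen = 640; TrainLossPerSample = 4.45646; EvalErr[0]PerSample = 0.92188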
@@ -0,0 +1,99 @@
precision=float
command=speechTrain
deviceId=$DeviceId$

parallelTrain=true

speechTrain=[
    action=train
    modelPath=$RunDir$/models/cntkSpeech.dnn
    deviceId=$DeviceId$
    traceLevel=1
    SimpleNetworkBuilder=[
        layerSizes=363:512:512:132
        trainingCriterion=CrossEntropyWithSoftmax
        evalCriterion=ErrorPrediction
        layerTypes=Sigmoid
        initValueScale=1.0
        applyMeanVarNorm=true
        uniformInit=true
        needPrior=true
    ]

    ExperimentalNetworkBuilder=[    // the same as above but with BS
        layerSizes=363:512:512:132
        trainingCriterion='CE'
        evalCriterion='Err'

        applyMeanVarNorm=true

        L = Length(layerSizes)-1    // number of model layers
        features = Input(layerSizes[0], 1, tag='feature') ; labels = Input(layerSizes[Length(layerSizes)-1], 1, tag='label')
        featNorm = if applyMeanVarNorm
                   then MeanVarNorm(features)
                   else features
        layers[layer:1..L-1] = if layer > 1
                               then SBFF(layers[layer-1].Eh, layerSizes[layer], layerSizes[layer-1])
                               else SBFF(featNorm, layerSizes[layer], layerSizes[layer-1])
        outLayer = BFF(layers[L-1].Eh, layerSizes[L], layerSizes[L-1])
        outZ = outLayer.z    // + PastValue(layerSizes[L], 1, outLayer.z)
        CE = if trainingCriterion == 'CE'
             then CrossEntropyWithSoftmax(labels, outZ, tag='criterion')
             else Fail('unknown trainingCriterion ' + trainingCriterion)
        Err = if evalCriterion == 'Err' then
              ErrorPrediction(labels, outZ, tag='eval')
              else Fail('unknown evalCriterion ' + evalCriterion)
        logPrior = LogPrior(labels)
        // TODO: how to add a tag to an infix operation?
        ScaledLogLikelihood = Minus (outZ, logPrior, tag='output')
    ]

    SGD=[
        epochSize=20480
        minibatchSize=64:256:1024
        learningRatesPerMB=1.0:0.5:0.1
        numMBsToShowResult=10
        momentumPerMB=0.9:0.656119
        dropoutRate=0.0
        maxEpochs=3
        keepCheckPointFiles=true

        ParallelTrain=[
            parallelizationMethod=DataParallelSGD
            distributedMBReading=true
            DataParallelSGD=[
                gradientBits=32
            ]
        ]

        AutoAdjust=[
            reduceLearnRateIfImproveLessThan=0
            loadBestModel=true
            increaseLearnRateIfImproveMoreThan=1000000000
            learnRateDecreaseFactor=0.5
            learnRateIncreaseFactor=1.382
            autoAdjustLR=AdjustAfterEpoch
        ]
        clippingThresholdPerSample=1#INF
    ]
    reader=[
        readerType=HTKMLFReader
        readMethod=blockRandomize
        miniBatchMode=Partial
        randomize=Auto
        verbosity=0
        features=[
            dim=363
            type=Real
            scpFile=glob_0000.scp
        ]

        labels=[
            mlfFile=$DataDir$/glob_0000.mlf
            labelMappingFile=$DataDir$/state.list

            labelDim=132
            labelType=Category
        ]
    ]
]
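
The setting that gives this test its name is gradientBits=32 in the DataParallelSGD block: gradients are aggregated across the 3 ranks at full 32-bit precision, i.e. without quantization. For contrast, a hypothetical quantized variant of that block (not part of this commit) would reduce the bit width, for example:

        ParallelTrain=[
            parallelizationMethod=DataParallelSGD
            distributedMBReading=true
            DataParallelSGD=[
                # hypothetical variant: 1-bit quantized gradient aggregation (this test uses 32)
                gradientBits=1
            ]
        ]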