diff --git a/Common/Include/DataReader.h b/Common/Include/DataReader.h index 0ddaeaadc074..718adbba1610 100644 --- a/Common/Include/DataReader.h +++ b/Common/Include/DataReader.h @@ -100,9 +100,10 @@ class DATAREADER_API IDataReader void SetDoRandomize(bool b){ mDoRandomize = b; } - // Gets a copy of the minibatch for the forward computation. This can be - // useful if some of the computation has to happen in the reader. - // TODO: No, there should be no computation in the reader. + // Workaround for the two-forward-pass sequence and ctc training, which + // allows processing more utterances at the same time. Only used in + // Kaldi2Reader. + // TODO: move this out of the reader. virtual bool GetMinibatchCopy( std::vector>>& /*uttInfo*/, std::map*>& /*matrices*/, @@ -111,9 +112,10 @@ class DATAREADER_API IDataReader return false; } - // Sets the neural network output to the reader. This can be useful if some - // of the computation has to happen in the reader. - // TODO: No, there should be no computation in the reader. + // Workaround for the two-forward-pass sequence and ctc training, which + // allows processing more utterances at the same time. Only used in + // Kaldi2Reader. + // TODO: move this out of the reader. virtual bool SetNetOutput( const std::vector>>& /*uttInfo*/, const Matrix& /*outputs*/, diff --git a/DataReader/Kaldi2Reader/HTKMLFReader.cpp b/DataReader/Kaldi2Reader/HTKMLFReader.cpp index 899b06f70d4a..8e3c8b648f83 100644 --- a/DataReader/Kaldi2Reader/HTKMLFReader.cpp +++ b/DataReader/Kaldi2Reader/HTKMLFReader.cpp @@ -1105,16 +1105,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We initialize the sentence boundary information before we process // the utterances. 
- m_sentenceBegin.Resize(m_numberOfuttsPerMinibatch, m_currentMBSize); - m_minibatchPackingFlags.resize(m_currentMBSize); + m_pMBLayout->Init(m_numberOfuttsPerMinibatch, m_currentMBSize, !m_framemode); for (size_t i = 0; i < m_numberOfuttsPerMinibatch; i++) { for (size_t j = 0; j < m_currentMBSize; j++) { - m_sentenceBegin.SetValue(i, j, (ElemType) SEQUENCE_MIDDLE); + m_pMBLayout->SetWithoutOr(i, j, MinibatchPackingFlags::None); } } - std::fill(m_minibatchPackingFlags.begin(), m_minibatchPackingFlags.end(), MinibatchPackingFlags::None); // Iterates over utterances. m_numberOfuttsPerMinibatch = 1 is a // special case. @@ -1133,8 +1131,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Sets the utterance boundary. if (startFrame == 0) { - m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_START); - m_minibatchPackingFlags[0] |= MinibatchPackingFlags::SequenceStart; + m_pMBLayout->Set(i, 0, MinibatchPackingFlags::SequenceStart); } endFrame = startFrame + m_currentMBSize; @@ -1161,13 +1158,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (startFrame == 0) { - m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_START); - m_minibatchPackingFlags[0] |= MinibatchPackingFlags::SequenceStart; + m_pMBLayout->Set(i, 0, MinibatchPackingFlags::SequenceStart); } // We have to set the utterance end. 
- m_sentenceBegin.SetValue(i, m_sentenceBegin.GetNumCols() - 1, (ElemType)SEQUENCE_END); - m_minibatchPackingFlags[m_sentenceBegin.GetNumCols() - 1] |= MinibatchPackingFlags::SequenceEnd; + m_pMBLayout->Set(i, m_pMBLayout->GetNumTimeSteps() - 1, MinibatchPackingFlags::SequenceEnd); } // Now puts the utterance into the minibatch, and loads the @@ -1198,8 +1193,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { for (size_t k = 0; k < m_currentMBSize; k++) { - m_sentenceBegin.SetValue(i, k, (ElemType) NO_INPUT); - m_minibatchPackingFlags[k] |= MinibatchPackingFlags::NoInput; + m_pMBLayout->Set(i, k, MinibatchPackingFlags::NoInput); // Populates with real features, the // following implementation is not efficient... @@ -1224,14 +1218,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (startFrame == 0) { - m_sentenceBegin.SetValue(i, 0, (ElemType)SEQUENCE_START); - m_minibatchPackingFlags[0] |= MinibatchPackingFlags::SequenceStart; + m_pMBLayout->Set(i, 0, MinibatchPackingFlags::SequenceStart); } // We have to set the utterance end. - assert(m_toProcess[i] - startFrame - 1 < m_sentenceBegin.GetNumCols()); - m_sentenceBegin.SetValue(i, m_toProcess[i] - startFrame - 1, (ElemType)SEQUENCE_END); - m_minibatchPackingFlags[m_toProcess[i] - startFrame - 1] |= MinibatchPackingFlags::SequenceEnd; + assert(m_toProcess[i] - startFrame - 1 < m_pMBLayout->GetNumTimeSteps()); + m_pMBLayout->Set(i, m_toProcess[i] - startFrame - 1, MinibatchPackingFlags::SequenceEnd); } endFrame = m_toProcess[i]; size_t currentMBFilled = endFrame - startFrame; @@ -1249,11 +1241,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { while (reNewSucc && (currentMBFilled + m_toProcess[i] <= m_currentMBSize)) { // Sets the utterance boundary. 
- assert(currentMBFilled + m_toProcess[i] <= m_sentenceBegin.GetNumCols()); - m_sentenceBegin.SetValue(i, currentMBFilled, (ElemType)SEQUENCE_START); - m_minibatchPackingFlags[currentMBFilled] |= MinibatchPackingFlags::SequenceStart; - m_sentenceBegin.SetValue(i, currentMBFilled + m_toProcess[i] - 1, (ElemType)SEQUENCE_END); - m_minibatchPackingFlags[currentMBFilled + m_toProcess[i] - 1] |= MinibatchPackingFlags::SequenceEnd; + assert(currentMBFilled + m_toProcess[i] <= m_pMBLayout->GetNumTimeSteps()); + m_pMBLayout->Set(i, currentMBFilled, MinibatchPackingFlags::SequenceStart); + m_pMBLayout->Set(i, currentMBFilled + m_toProcess[i] - 1, MinibatchPackingFlags::SequenceEnd); populateSucc = PopulateUtteranceInMinibatch(matrices, i, 0, m_toProcess[i], m_currentMBSize, currentMBFilled); if (m_doMinibatchBuffering && populateSucc) { @@ -1279,16 +1269,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_processedFrame[i] += m_currentMBSize - currentMBFilled; if (currentMBFilled < m_currentMBSize) { - m_sentenceBegin.SetValue(i, currentMBFilled, (ElemType)SEQUENCE_START); - m_minibatchPackingFlags[currentMBFilled] |= MinibatchPackingFlags::SequenceStart; + m_pMBLayout->Set(i, currentMBFilled, MinibatchPackingFlags::SequenceStart); } } else { for (size_t k = currentMBFilled; k < m_currentMBSize; k++) { - m_sentenceBegin.SetValue(i, k, (ElemType) NO_INPUT); - m_minibatchPackingFlags[k] |= MinibatchPackingFlags::NoInput; + m_pMBLayout->Set(i, k, MinibatchPackingFlags::NoInput); // Populates with real features, the // following implementation is not efficient... @@ -1353,15 +1341,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { (startIndex + currentMBSize <= originalMBSize) ? currentMBSize : (originalMBSize - startIndex); - // Sets sentence boundary for the current minibatch. - currentMinibatch.sentenceBegin.SetValue( - m_sentenceBegin.ColumnSlice(startIndex, numFrames)); - - // Sets packing flag for the current minibatch. 
- currentMinibatch.minibatchPackingFlag.resize(numFrames); - currentMinibatch.minibatchPackingFlag.assign( - m_minibatchPackingFlags.begin() + startIndex, - m_minibatchPackingFlags.begin() + startIndex + numFrames); + // Sets MBLayout. + currentMinibatch.pMBLayout->CopyFromRange(m_pMBLayout, startIndex, numFrames); // Sets the minibatch size for the current minibatch. currentMinibatch.currentMBSize = numFrames; @@ -1424,8 +1405,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(m_minibatchBuffer.size() > index); // Restores the variables related to the minibatch. - m_sentenceBegin.SetValue(m_minibatchBuffer[index].sentenceBegin); - m_minibatchPackingFlags = m_minibatchBuffer[index].minibatchPackingFlag; + m_pMBLayout->CopyFrom(m_minibatchBuffer[index].pMBLayout); m_currentMBSize = m_minibatchBuffer[index].currentMBSize; m_minibatchUttInfo = m_minibatchBuffer[index].minibatchUttInfo; @@ -1470,9 +1450,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else { - m_uttDerivBuffer->GetDerivative( - m_minibatchUttInfo, m_sentenceBegin, - m_minibatchPackingFlags, matrices[iter->first]); + m_uttDerivBuffer->GetDerivative(m_minibatchUttInfo, + m_pMBLayout, + matrices[iter->first]); } } else if (m_nameToTypeMap[iter->first] == InputOutputTypes::readerObj) @@ -1661,15 +1641,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { const msra::dbn::matrix feat = m_fileEvalSource->ChunkOfFrames(id); if (first) { - m_sentenceBegin.Resize((size_t)1, (size_t)feat.cols()); - m_minibatchPackingFlags.resize(feat.cols()); - m_sentenceBegin.SetValue((ElemType) SEQUENCE_MIDDLE); - m_sentenceBegin.SetValue(0, 0, (ElemType) SEQUENCE_START); - m_sentenceBegin.SetValue(0, (size_t)feat.cols()-1, (ElemType) SEQUENCE_END); - - std::fill(m_minibatchPackingFlags.begin(), m_minibatchPackingFlags.end(), MinibatchPackingFlags::None); - m_minibatchPackingFlags[0] = MinibatchPackingFlags::SequenceStart; - m_minibatchPackingFlags[(size_t)feat.cols()-1] = 
MinibatchPackingFlags::SequenceEnd; + m_pMBLayout->Init(1, feat.cols(), true); + m_pMBLayout->Set(0, 0, MinibatchPackingFlags::SequenceStart); + m_pMBLayout->SetWithoutOr(0, feat.cols() - 1, MinibatchPackingFlags::SequenceEnd); first = false; } @@ -1944,8 +1918,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool HTKMLFReader::GetMinibatchCopy( std::vector>>& uttInfo, std::map*>& matrices, - Matrix& sentenceBegin, - std::vector& minibatchPackingFlag) + MBLayoutPtr pMBLayout) { // We need to get a "copy" of the minibatch to do the forward // computation for sequence training. @@ -1957,8 +1930,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_getMinibatchCopy = true; if (GetMinibatchToTrainOrTest(matrices)) { - sentenceBegin.SetValue(m_sentenceBegin); - minibatchPackingFlag = m_minibatchPackingFlags; + pMBLayout->CopyFrom(m_pMBLayout); uttInfo = m_minibatchUttInfo; m_getMinibatchCopy = false; return true; @@ -1974,8 +1946,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool HTKMLFReader::SetNetOutput( const std::vector>>& uttInfo, const Matrix& outputs, - const Matrix& sentenceBegin, - const std::vector& minibatchPackingFlag) + const MBLayoutPtr pMBLayout) { // Set the likelihoods for the utterance with which we can comput the // derivatives. 
Note that the minibatch may only contain partial output @@ -1984,9 +1955,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (m_doMinibatchBuffering) { assert(m_framemode == false); - return m_uttDerivBuffer->SetLikelihood(uttInfo, outputs, - sentenceBegin, - minibatchPackingFlag); + return m_uttDerivBuffer->SetLikelihood(uttInfo, outputs, pMBLayout); } return false; } @@ -2114,16 +2083,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } } - template - void HTKMLFReader::SetSentenceSegBatch(Matrix &sentenceBegin, vector& minibatchPackingFlag) - { - if (!m_framemode) - { - sentenceBegin.SetValue(m_sentenceBegin); - minibatchPackingFlag = m_minibatchPackingFlags; - } - } - // For Kaldi2Reader, we now make the following assumptions // 1. feature sections will always have a sub-field "scpFile" // 2. label sections will always have a sub-field "mlfFile" diff --git a/DataReader/Kaldi2Reader/HTKMLFReader.h b/DataReader/Kaldi2Reader/HTKMLFReader.h index 8bcb8dc7ac67..9550f270ddff 100644 --- a/DataReader/Kaldi2Reader/HTKMLFReader.h +++ b/DataReader/Kaldi2Reader/HTKMLFReader.h @@ -35,10 +35,10 @@ class HTKMLFReader : public IDataReader { std::vector> features; std::vector> labels; - Matrix sentenceBegin; - vector minibatchPackingFlag; + MBLayoutPtr pMBLayout; std::vector>> minibatchUttInfo; size_t currentMBSize; + MinibatchBufferUnit() : pMBLayout(make_shared()), currentMBSize(0) {} }; bool m_doMinibatchBuffering; bool m_getMinibatchCopy; @@ -151,35 +151,14 @@ class HTKMLFReader : public IDataReader public: + MBLayoutPtr m_pMBLayout; - /// a matrix of n_stream x n_length - /// n_stream is the number of streams - /// n_length is the maximum lenght of each stream - /// for example, two sentences used in parallel in one minibatch would be - /// [2 x 5] if the max length of one of the sentences is 5 - /// the elements of the matrix is 0, 1, or -1, defined as SEQUENCE_START, SEQUENCE_MIDDLE, NO_INPUT in cbasetype.h - /// 0 1 1 0 1 - /// 1 0 1 0 0 - /// for 
two parallel data streams. The first has two sentences, with 0 indicating begining of a sentence - /// the second data stream has two sentences, with 0 indicating begining of sentences - /// you may use 1 even if a sentence begins at that position, in this case, the trainer will carry over hidden states to the following - /// frame. - Matrix m_sentenceBegin; - - /// a matrix of 1 x n_length - /// 1 denotes the case that there exists sentnece begin or no_labels case in this frame - /// 0 denotes such case is not in this frame - - - vector m_minibatchPackingFlags; - /// by default it is false /// if true, reader will set to SEQUENCE_MIDDLE for time positions that are orignally correspond to SEQUENCE_START /// set to true so that a current minibatch can uses state activities from the previous minibatch. /// default will have truncated BPTT, which only does BPTT inside a minibatch - bool mIgnoreSentenceBeginTag; - HTKMLFReader() : m_sentenceBegin(CPUDEVICE) { + HTKMLFReader() : m_pMBLayout(make_shared()){ } virtual void Init(const ConfigParameters& config); @@ -195,19 +174,18 @@ class HTKMLFReader : public IDataReader virtual bool GetMinibatchCopy( std::vector>>& uttInfo, std::map*>& matrices, - Matrix& sentenceBegin, - vector& sentenceExistsBeginOrNoLabels); + MBLayoutPtr pMBLayout); virtual bool SetNetOutput( const std::vector>>& uttInfo, const Matrix& outputs, - const Matrix& sentenceBegin, - const vector& sentenceExistsBeginOrNoLabels); + const MBLayoutPtr pMBLayout); virtual bool DataEnd(EndDataType endDataType); void SetSentenceEndInBatch(vector &/*sentenceEnd*/); void SetSentenceEnd(int /*actualMbSize*/){}; - void SetSentenceSegBatch(Matrix &sentenceBegin, vector& sentenceExistsBeginOrNoLabels); + void CopyMBLayoutTo(MBLayoutPtr pMBLayout) { pMBLayout->CopyFrom(m_pMBLayout); } + bool RequireSentenceSeg() const override { return !m_framemode; }; }; }}} diff --git a/DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.cpp 
b/DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.cpp index c6dfdc4227b8..730a2b8d49a8 100644 --- a/DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.cpp +++ b/DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.cpp @@ -23,34 +23,32 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void UtteranceDerivativeBuffer::ProcessUttInfo( const std::vector>>& uttInfo, - const Matrix& sentenceBegin, - const std::vector& minibatchPackingFlags, + const MBLayoutPtr pMBLayout, std::vector>>>* uttInfoInMinibatch) const { assert(uttInfoInMinibatch != NULL); assert(uttInfo.size() == m_numUttsPerMinibatch); - assert(sentenceBegin.GetNumRows() == m_numUttsPerMinibatch); - assert(minibatchPackingFlags.size() == sentenceBegin.GetNumCols()); + assert(pMBLayout->GetNumParallelSequences() == m_numUttsPerMinibatch); uttInfoInMinibatch->clear(); uttInfoInMinibatch->resize(uttInfo.size()); for (size_t i = 0; i < uttInfo.size(); ++i) { size_t startFrameIndexInMinibatch = 0; size_t numFrames = 0; - for (size_t j = 0; j < sentenceBegin.GetNumCols(); ++j) + for (size_t j = 0; j < pMBLayout->GetNumTimeSteps(); ++j) { - if (((int)sentenceBegin(i, j) & NO_LABEL) == NO_LABEL) + if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoLabel)) { continue; } - if (((int)sentenceBegin(i, j) & NO_FEATURE) == NO_FEATURE) + if (pMBLayout->Is(i, j, MinibatchPackingFlags::NoFeature)) { continue; } numFrames += 1; - if ((((int)sentenceBegin(i, j) & SEQUENCE_END) == SEQUENCE_END) - || j == sentenceBegin.GetNumCols() - 1) + if (pMBLayout->Is(i, j, MinibatchPackingFlags::SequenceEnd) + || j == pMBLayout->GetNumTimeSteps() - 1) { size_t uttIndex = (*uttInfoInMinibatch)[i].size(); wstring uttID = uttInfo[i][uttIndex].first; @@ -74,8 +72,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool UtteranceDerivativeBuffer::SetLikelihood( const std::vector>>& uttInfo, const Matrix& logLikelihoodIn, - const Matrix& sentenceBegin, - const std::vector& minibatchPackingFlags) + const MBLayoutPtr pMBLayout) 
{ assert(m_needLikelihood == true); assert(m_epochEnd == false); @@ -88,8 +85,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { std::vector>>> uttInfoInMinibatch; - ProcessUttInfo(uttInfo, sentenceBegin, - minibatchPackingFlags, &uttInfoInMinibatch); + ProcessUttInfo(uttInfo, pMBLayout, &uttInfoInMinibatch); // Checks if we need to move data to CPU. Matrix logLikelihood(logLikelihoodIn); @@ -99,7 +95,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { logLikelihood.GetDeviceId(), CPUDEVICE, true, false, false); } - size_t currentMBSize = minibatchPackingFlags.size(); + size_t currentMBSize = pMBLayout->GetNumTimeSteps(); for (size_t i = 0; i < uttInfo.size(); ++i) { assert(uttInfo[i].size() == uttInfoInMinibatch[i].size()); @@ -173,21 +169,18 @@ namespace Microsoft { namespace MSR { namespace CNTK { template bool UtteranceDerivativeBuffer::GetDerivative( const std::vector>>& uttInfo, - const Matrix& sentenceBegin, - const std::vector& minibatchPackingFlags, + const MBLayoutPtr pMBLayout, Matrix* derivativesOut) { assert(derivativesOut != NULL); assert(m_needLikelihood == false); std::vector>>> uttInfoInMinibatch; - ProcessUttInfo(uttInfo, sentenceBegin, - minibatchPackingFlags, &uttInfoInMinibatch); + ProcessUttInfo(uttInfo, pMBLayout, &uttInfoInMinibatch); m_currentObj = 0; Matrix derivatives(CPUDEVICE); - derivatives.Resize(m_dimension, - sentenceBegin.GetNumCols() * sentenceBegin.GetNumRows()); + derivatives.Resize(m_dimension, pMBLayout->GetNumCols()); for (size_t i = 0; i < uttInfo.size(); ++i) { assert(uttInfo[i].size() == uttInfoInMinibatch[i].size()); diff --git a/DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.h b/DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.h index de2a6136e0b6..5f7d118a7fe2 100644 --- a/DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.h +++ b/DataReader/Kaldi2Reader/UtteranceDerivativeBuffer.h @@ -47,8 +47,7 @@ class UtteranceDerivativeBuffer // uttID startFrameIndexInMinibatch numFrames void ProcessUttInfo( 
const std::vector>>& uttInfo, - const Matrix& sentenceBegin, - const std::vector& minibatchPackingFlags, + const MBLayoutPtr pMBLayout, std::vector>>>* uttInfoInMinibatch) const; @@ -71,14 +70,12 @@ class UtteranceDerivativeBuffer bool SetLikelihood( const std::vector>>& uttInfo, const Matrix& outputs, - const Matrix& sentenceBegin, - const std::vector& minibatchPackingFlags); + const MBLayoutPtr pMBLayout); // Gets the computed derivatives for given utterance. bool GetDerivative( const std::vector>>& uttInfo, - const Matrix& sentenceBegin, - const std::vector& minibatchPackingFlags, + const MBLayoutPtr pMBLayout, Matrix* derivativesOut); // Gets the computed objectives for given utterance. diff --git a/DataReader/Kaldi2Reader/simple_checked_arrays.h b/DataReader/Kaldi2Reader/simple_checked_arrays.h deleted file mode 100644 index 19c2932a5692..000000000000 --- a/DataReader/Kaldi2Reader/simple_checked_arrays.h +++ /dev/null @@ -1,89 +0,0 @@ -// -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// -// simple_checked_arrays.h -- a simple wrapper around pointers used as arrays to allow bounds checking -// - -#pragma once - -#include // for size_t -#include - -// --------------------------------------------------------------------------- -// array_ref -- wraps a C pointer to an array together with its size. -// -// Called _ref because this is a reference to the array rather than the array -// itself (since it wraps a pointer). No need to pass an array_ref by reference. -// -// operator[] checks index bounds in Debug builds. size() is provided such -// that this class can be substituted for STL vector in many cases. 
-// --------------------------------------------------------------------------- - -template class array_ref -{ - _T * data; - size_t n; - inline void check_index (size_t i) const { i; assert (i < n); } - inline void check_ptr() const { n; data; assert (n == 0 || data != NULL); } -public: - inline array_ref (_T * ptr, size_t size) throw() : data (ptr), n (size) { } - inline array_ref() throw() : data (NULL), n (0) { } // in case we have a vector of this - inline _T & operator[] (size_t i) throw() { check_index (i); return data[i]; } - inline const _T & operator[] (size_t i) const throw() { check_index (i); return data[i]; } - inline size_t size() const throw() { return n; } - inline _T * begin() { return data; } - inline _T * end() { return data + n; } - inline void resize (size_t sz) { sz; assert (n == sz); } // allow compatibility with some functions - // construct from other vector types - template inline array_ref (_V & v) : data (v.size() > 0 ? &v[0] : NULL), n ((size_t) v.size()) { } -}; - - -// --------------------------------------------------------------------------- -// const_array_ref -- same as array_ref for 'const' (read-only) pointers -// --------------------------------------------------------------------------- - -template class const_array_ref -{ - const _T * data; - size_t n; - inline void check_index (size_t i) const { i; assert (i < n); } - inline void check_ptr() const { n; data; assert (n == 0 || data != NULL); } -public: - inline const_array_ref (const _T * ptr, size_t size) throw() : data (ptr), n (size) { } - inline const_array_ref() throw() : data (NULL), n (0) { } // in case we have a vector of this - inline const _T & operator[] (size_t i) const throw() { check_index (i); return data[i]; } - inline size_t size() const throw() { return n; } - inline const _T * begin() { return data; } - inline const _T * end() { return data + n; } - inline const _T & front() const throw() { check_index (0); return data[0];} - inline const _T & back() const 
throw() {check_index (0); return data[n-1];} - // construct from other vector types - template inline const_array_ref (const _V & v) : data (v.size() > 0 ? &v[0] : NULL), n ((size_t) v.size()) { } -}; - -// --------------------------------------------------------------------------- -// hardcoded_array -- wraps a fixed-size C array together with its size. -// -// operator[] checks index bounds in Debug builds. size() is provided such -// that this class can be substituted for STL vector in many cases. -// Can be constructed with a size parameter--it will be checked against the -// hard-coded size. -// Can also be constructed with an initialization parameter (typ. 0). -// --------------------------------------------------------------------------- - -template class hardcoded_array -{ - _T data[_N]; - inline void check_index (size_t i) const { i; assert (i < _N); } - inline void check_size (size_t n) const { n; assert (n == _N); } -public: - inline hardcoded_array() throw() {} - inline hardcoded_array (size_t n) throw() { check_size (n); } // we can instantiate with a size parameter--just checks the size - inline hardcoded_array (size_t n, const _T & val) throw() { check_size (n); for (size_t i = 0; i < n; i++) data[i] = val; } - inline _T & operator[] (size_t i) throw() { check_index (i); return data[i]; } - inline const _T & operator[] (size_t i) const throw() { check_index (i); return data[i]; } - inline size_t size() const throw() { return _N; } -}; diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp index aafe165e4ceb..b68e451a142c 100644 --- a/MachineLearning/CNTKSGDLib/SGD.cpp +++ b/MachineLearning/CNTKSGDLib/SGD.cpp @@ -1634,24 +1634,23 @@ template return lastTriedTrialMinibatchSize; } - // Tries to compute derivatives for the whole utterances, which will be - // fed to the neural network as features. 
- // This function currently does nothing in the Windows build (no version of GetMinibatchCopy() does anything), - // but it is currently used in the Kaldi reader to support parallelizing more utterances in sequence and CTC training. Only implemented inside Kaldi reader right now. + // Attempts to compute the error signal for the whole utterance, which will + // be fed to the neural network as features. Currently it is a workaround + // for the two-forward-pass sequence and ctc training, which allows + // processing more utterances at the same time. Only used in Kaldi2Reader. + // TODO: move the two-forward-pass support out of the reader. template void SGD::AttemptUtteranceDerivativeFeatures(ComputationNetwork& net, IDataReader* trainSetDataReader, const std::vector & featureNodes, std::map*>* inputMatrices) { - // Tries to read an utterance and run forward computation on the - // whole utterance. assert(trainSetDataReader != NULL); std::vector>> uttInfo; auto pMBLayout = make_shared(); + // TODO: use GetMinibatchIntoNetwork(). while (trainSetDataReader->GetMinibatchCopy(uttInfo, *inputMatrices, pMBLayout)) { - // TODO: should use GetMinibatchIntoNetwork(), but can't because GetMinibatchCopy() is not the same (whatever it is) ComputationNetwork::UpdateEvalTimeStamps(featureNodes); auto & outputNodes = net.OutputNodes(); @@ -1766,7 +1765,11 @@ template trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize); } - // TODO: what is this?? + // Attempts to compute the error signal for the whole utterance, which will + // be fed to the neural network as features. Currently it is a workaround + // for the two-forward-pass sequence and ctc training, which allows + // processing more utterances at the same time. Only used in Kaldi2Reader. + // TODO: move the two-forward-pass support out of the reader. 
AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices); fprintf(stderr, "\nStarting minibatch loop"); @@ -2038,7 +2041,11 @@ template // DataEnd does reader specific process if sentence ending is reached trainSetDataReader->DataEnd(EndDataType::endDataSentence); - // Tries to set up derivative features for the next utterance. + // Attempts to compute the error signal for the whole utterance, which will + // be fed to the neural network as features. Currently it is a workaround + // for the two-forward-pass sequence and ctc training, which allows + // processing more utterances at the same time. Only used in Kaldi2Reader. + // TODO: move the two-forward-pass support out of the reader. AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices); profiler.NextSample(); diff --git a/MachineLearning/CNTKSGDLib/SGD.h b/MachineLearning/CNTKSGDLib/SGD.h index 30fb707d8503..5ebe1037d708 100644 --- a/MachineLearning/CNTKSGDLib/SGD.h +++ b/MachineLearning/CNTKSGDLib/SGD.h @@ -253,8 +253,11 @@ class SGD std::list>& smoothedGradients, const size_t minMinibatchSize, const size_t maxMinibatchSize); - // Tries to compute derivatives for the whole utterances, which will be - // fed to the neural network as features. + // Attempts to compute the error signal for the whole utterance, which will + // be fed to the neural network as features. Currently it is a workaround + // for the two-forward-pass sequence and ctc training, which allows + // processing more utterances at the same time. Only used in Kaldi2Reader. + // TODO: move the two-forward-pass support out of the reader. 
void AttemptUtteranceDerivativeFeatures(ComputationNetwork& net, IDataReader* trainSetDataReader, const std::vector & featureNodes, diff --git a/Math/Math/Matrix.h b/Math/Math/Matrix.h index a873e93914d2..d5bcf6339b46 100644 --- a/Math/Math/Matrix.h +++ b/Math/Math/Matrix.h @@ -641,6 +641,22 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // for LSTMNode ony, which is deprecated, only to make it compile easily: also used in FindBestPathWithVariableLength() and FindBestPath() in a strange way Matrix & GetM() { LazyAlloc(); return m_sentenceBoundaryFlags; } + + // TODO: this function is only used in Kaldi2Reader for the moment, and + // we plan to remove it in the future. It copies the current + // MBLayout from an existing object but only copies + // steps starting from . + void CopyFromRange(const MBLayoutPtr & other, size_t startTimeStep, size_t numTimeSteps) + { + m_numParallelSequences = other->m_numParallelSequences; + m_numTimeSteps = numTimeSteps; + m_dataIsSequential = other->m_dataIsSequential; + m_sentenceBoundaryFlags.SetValue(other->m_sentenceBoundaryFlags.ColumnSlice(startTimeStep, numTimeSteps)); + m_minibatchPackingFlags.resize(numTimeSteps); + m_minibatchPackingFlags.assign( + other->m_minibatchPackingFlags.begin() + startTimeStep, + other->m_minibatchPackingFlags.begin() + startTimeStep + numTimeSteps); + } }; typedef MBLayout::MBLayoutPtr MBLayoutPtr;