diff --git a/.gitignore b/.gitignore index 7e21ba0b750dfc..65ba217de37c82 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,6 @@ build/ *.user .vscode -.idea \ No newline at end of file +.idea +.project +.pydevproject diff --git a/.travis.yml b/.travis.yml index d3dae9efd416bd..bf0e0b7bbddd4c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,17 @@ language: cpp cache: ccache sudo: required dist: trusty +os: + - linux + - osx env: - JOB=DOCS - JOB=BUILD_AND_TEST +matrix: + exclude: + - os: osx + env: JOB=DOCS # Only generate documentation in linux + addons: apt: packages: @@ -27,9 +35,11 @@ addons: - libgoogle-glog-dev - libgflags-dev - libgtest-dev + - graphviz before_install: + - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi + - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - pip install wheel protobuf sphinx breathe recommonmark - - sudo paddle/scripts/travis/before_install.sh script: - paddle/scripts/travis/main.sh notifications: diff --git a/CMakeLists.txt b/CMakeLists.txt index 44e93f22c0eaf4..4613155f7700b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8) project(paddle CXX C) set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MINOR_VERSION 8) -set(PADDLE_PATCH_VERSION 0b1) +set(PADDLE_PATCH_VERSION 0b2) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") @@ -104,7 +104,7 @@ else() endif(NOT WITH_GPU) if(WITH_DOUBLE) - add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE) + add_definitions(-DPADDLE_TYPE_DOUBLE) set(ACCURACY double) else(WITH_DOUBLE) set(ACCURACY float) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 529b4b9d15d097..57c32a54cd727e 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -17,10 +17,17 @@ ## Find MKL First. set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL") -find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include) -find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib) -find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib) -find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib) +find_path(MKL_INCLUDE_DIR mkl.h PATHS + ${MKL_ROOT}/include) +find_library(MKL_CORE_LIB NAMES mkl_core PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64) +find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64) +find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS + ${MKL_ROOT}/lib + ${MKL_ROOT}/lib/intel64) if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index cc59309ee7efab..dbad6be3f41b3f 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -64,7 +64,9 @@ set(COMMON_FLAGS -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-error=literal-suffix - -Wno-error=unused-local-typedefs) + -Wno-error=unused-local-typedefs + -Wno-error=unused-function # Warnings in Numpy Header. 
+) foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) diff --git a/cmake/util.cmake b/cmake/util.cmake index d776c3ae499526..0fa36f070cc11b 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME) cuda_add_library(${TARGET_NAME} STATIC ${ARGN}) set(CUDA_NVCC_FLAGS ${NVCC_FLAG}) endmacro() + + +# Creates C resources file from files in given resource file +function(create_resources res_file output) + # Create empty output file + file(WRITE ${output} "") + # Get short filename + string(REGEX MATCH "([^/]+)$" filename ${res_file}) + # Replace filename spaces & extension separator for C compatibility + string(REGEX REPLACE "\\.| |-" "_" filename ${filename}) + # Read hex data from file + file(READ ${res_file} filedata HEX) + # Convert hex data for C compatibility + string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata}) + # Append data to output file + file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n") +endfunction() diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore new file mode 100644 index 00000000000000..810910fd5ca56f --- /dev/null +++ b/demo/mnist/.gitignore @@ -0,0 +1,6 @@ +data/raw_data +data/*.list +mnist_vgg_model +plot.png +train.log +*pyc diff --git a/demo/mnist/data/generate_list.py b/demo/mnist/data/generate_list.py new file mode 100644 index 00000000000000..1b929048b4d82b --- /dev/null +++ b/demo/mnist/data/generate_list.py @@ -0,0 +1,21 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +o = open("./" + "train.list", "w") +o.write("./data/raw_data/train" +"\n") +o.close() + +o = open("./" + "test.list", "w") +o.write("./data/raw_data/t10k" +"\n") +o.close() \ No newline at end of file diff --git a/demo/mnist/data/get_mnist_data.sh b/demo/mnist/data/get_mnist_data.sh new file mode 100755 index 00000000000000..9099b5ab6fb85d --- /dev/null +++ b/demo/mnist/data/get_mnist_data.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env sh +# This scripts downloads the mnist data and unzips it. +set -e +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +rm -rf "$DIR/raw_data" +mkdir "$DIR/raw_data" +cd "$DIR/raw_data" + +echo "Downloading..." + +for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte +do + if [ ! -e $fname ]; then + wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz + gunzip ${fname}.gz + fi +done + +cd $DIR +rm -f *.list +python generate_list.py + diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py new file mode 100644 index 00000000000000..32af29730a7365 --- /dev/null +++ b/demo/mnist/mnist_provider.py @@ -0,0 +1,32 @@ +from paddle.trainer.PyDataProvider2 import * + + +# Define a py data provider +@provider(input_types={ + 'pixel': dense_vector(28 * 28), + 'label': integer_value(10) +}) +def process(settings, filename): # settings is not used currently. 
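+    # MNIST raw files use the IDX format: the image file starts with a 16-byte
+    # header (magic, count, rows, cols) and the label file with an 8-byte header
+    # (magic, count); pixel values then follow as unsigned bytes, row-major.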
+ imgf = filename + "-images-idx3-ubyte" + labelf = filename + "-labels-idx1-ubyte" + f = open(imgf, "rb") + l = open(labelf, "rb") + + f.read(16) + l.read(8) + + # Define number of samples for train/test + if "train" in filename: + n = 60000 + else: + n = 10000 + + for i in range(n): + label = ord(l.read(1)) + pixels = [] + for j in range(28 * 28): + pixels.append(float(ord(f.read(1))) / 255.0) + yield {"pixel": pixels, 'label': label} + + f.close() + l.close() diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh new file mode 100755 index 00000000000000..084b32ac390b84 --- /dev/null +++ b/demo/mnist/train.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +config=vgg_16_mnist.py +output=./mnist_vgg_model +log=train.log + +paddle train \ +--config=$config \ +--dot_period=10 \ +--log_period=100 \ +--test_all_data_in_one_period=1 \ +--use_gpu=0 \ +--trainer_count=1 \ +--num_passes=100 \ +--save_dir=$output \ +2>&1 | tee $log + +python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/demo/mnist/vgg_16_mnist.py b/demo/mnist/vgg_16_mnist.py new file mode 100644 index 00000000000000..45a45bb061aa78 --- /dev/null +++ b/demo/mnist/vgg_16_mnist.py @@ -0,0 +1,53 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
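+
+# A small VGG-style convolutional network for MNIST: 1-channel 28x28 input
+# images, 10 output classes (predicted with small_vgg from trainer_config_helpers).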
+ +from paddle.trainer_config_helpers import * + +is_predict = get_config_arg("is_predict", bool, False) + +####################Data Configuration ################## + + +if not is_predict: + data_dir='./data/' + define_py_data_sources2(train_list= data_dir + 'train.list', + test_list= data_dir + 'test.list', + module='mnist_provider', + obj='process') + +######################Algorithm Configuration ############# +settings( + batch_size = 128, + learning_rate = 0.1 / 128.0, + learning_method = MomentumOptimizer(0.9), + regularization = L2Regularization(0.0005 * 128) +) + +#######################Network Configuration ############# + +data_size=1*28*28 +label_size=10 +img = data_layer(name='pixel', size=data_size) + +# small_vgg is predined in trainer_config_helpers.network +predict = small_vgg(input_image=img, + num_channels=1, + num_classes=label_size) + +if not is_predict: + lbl = data_layer(name="label", size=label_size) + inputs(img, lbl) + outputs(classification_cost(input=predict, label=lbl)) +else: + outputs(predict) diff --git a/demo/quick_start/preprocess.sh b/demo/quick_start/preprocess.sh index fb2bee98beb268..fe2acbbd74898f 100755 --- a/demo/quick_start/preprocess.sh +++ b/demo/quick_start/preprocess.sh @@ -20,6 +20,8 @@ set -e +export LC_ALL=C + mkdir -p data/tmp python preprocess.py -i data/reviews_Electronics_5.json.gz # uniq and shuffle diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh index 1f0a137c8bd594..ea4e32249a3d01 100755 --- a/demo/quick_start/train.sh +++ b/demo/quick_start/train.sh @@ -18,6 +18,8 @@ cfg=trainer_config.lr.py #cfg=trainer_config.emb.py #cfg=trainer_config.cnn.py #cfg=trainer_config.lstm.py +#cfg=trainer_config.bidi-lstm.py +#cfg=trainer_config.db-lstm.py paddle train \ --config=$cfg \ --save_dir=./output \ diff --git a/demo/quick_start/trainer_config.bidi-lstm.py b/demo/quick_start/trainer_config.bidi-lstm.py new file mode 100644 index 00000000000000..3be3d373422714 --- /dev/null +++ b/demo/quick_start/trainer_config.bidi-lstm.py @@ -0,0 +1,62 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2(train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) + +bias_attr = ParamAttr(initial_std=0.,l2_rate=0.) 
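+# Output-layer bias: initialised to zero (initial_std=0.) and excluded from
+# L2 weight decay (l2_rate=0.).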
+data = data_layer(name="word", size=len(word_dict)) +emb = embedding_layer(input=data, size=128) + +bi_lstm = bidirectional_lstm(input=emb, size=128) +dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) + +output = fc_layer(input=dropout, size=2, + bias_attr=bias_attr, + act=SoftmaxActivation()) + +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/quick_start/trainer_config.db-lstm.py b/demo/quick_start/trainer_config.db-lstm.py new file mode 100644 index 00000000000000..b35bdf5a61b473 --- /dev/null +++ b/demo/quick_start/trainer_config.db-lstm.py @@ -0,0 +1,73 @@ +# edit-mode: -*- python -*- + +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +dict_file = "./data/dict.txt" +word_dict = dict() +with open(dict_file, 'r') as f: + for i, line in enumerate(f): + w = line.strip().split()[0] + word_dict[w] = i + +is_predict = get_config_arg('is_predict', bool, False) +trn = 'data/train.list' if not is_predict else None +tst = 'data/test.list' if not is_predict else 'data/pred.list' +process = 'process' if not is_predict else 'process_predict' +define_py_data_sources2(train_list=trn, + test_list=tst, + module="dataprovider_emb", + obj=process, + args={"dictionary": word_dict}) + +batch_size = 128 if not is_predict else 1 +settings( + batch_size=batch_size, + learning_rate=2e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(8e-4), + gradient_clipping_threshold=25 +) + +bias_attr = ParamAttr(initial_std=0.,l2_rate=0.) 
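+# "db-lstm": a deep stack of LSTM layers whose directions alternate from layer
+# to layer (see the loop over lstmemory below).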
+ +data = data_layer(name="word", size=len(word_dict)) +emb = embedding_layer(input=data, size=128) + +hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)]) +lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1)) + +input_layers = [hidden_0, lstm_0] + +for i in range(1,8): + fc = fc_layer(input=input_layers, size=128) + lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1), + reverse=(i % 2) == 1,) + input_layers = [fc, lstm] + +lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling()) + +output = fc_layer(input=lstm_last, size=2, + bias_attr=bias_attr, + act=SoftmaxActivation()) + +if is_predict: + maxid = maxid_layer(output) + outputs([maxid, output]) +else: + label = data_layer(name="label", size=2) + cls = classification_cost(input=output, label=label) + outputs(cls) diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index 2b0c3f34648b05..edd6ad3f739b6c 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf, encoded_vector = concat_layer(input=[src_forward, src_backward]) with mixed_layer(size=decoder_size) as encoded_proj: - encoded_proj += full_matrix_projection(encoded_vector) + encoded_proj += full_matrix_projection(input=encoded_vector) backward_first = first_seq(input=src_backward) with mixed_layer(size=decoder_size, act=TanhActivation(), ) as decoder_boot: - decoder_boot += full_matrix_projection(backward_first) + decoder_boot += full_matrix_projection(input=backward_first) def gru_decoder_with_attention(enc_vec, enc_proj, current_word): decoder_mem = memory(name='gru_decoder', @@ -113,8 +113,8 @@ def gru_decoder_with_attention(enc_vec, enc_proj, current_word): decoder_state=decoder_mem, ) with mixed_layer(size=decoder_size * 3) as decoder_inputs: - decoder_inputs += full_matrix_projection(context) - decoder_inputs += full_matrix_projection(current_word) + decoder_inputs += full_matrix_projection(input=context) + decoder_inputs += full_matrix_projection(input=current_word) gru_step = gru_step_layer(name='gru_decoder', input=decoder_inputs, diff --git a/demo/sequence_tagging/data/get_data.sh b/demo/sequence_tagging/data/get_data.sh new file mode 100755 index 00000000000000..e579d6c46ce5ed --- /dev/null +++ b/demo/sequence_tagging/data/get_data.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
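+
+# Download the CoNLL-2000 text chunking train/test sets next to this script.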
+set -e + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd $DIR + +wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz +wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz diff --git a/demo/sequence_tagging/data/test.list b/demo/sequence_tagging/data/test.list new file mode 100644 index 00000000000000..073c0a0c9063ac --- /dev/null +++ b/demo/sequence_tagging/data/test.list @@ -0,0 +1 @@ +data/test.txt.gz diff --git a/demo/sequence_tagging/data/train.list b/demo/sequence_tagging/data/train.list new file mode 100644 index 00000000000000..43c24d5f6484a9 --- /dev/null +++ b/demo/sequence_tagging/data/train.list @@ -0,0 +1 @@ +data/train.txt.gz diff --git a/demo/sequence_tagging/dataprovider.py b/demo/sequence_tagging/dataprovider.py new file mode 100644 index 00000000000000..6f412d6834be6d --- /dev/null +++ b/demo/sequence_tagging/dataprovider.py @@ -0,0 +1,258 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * +import gzip +import logging + +logging.basicConfig( + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', +) +logger = logging.getLogger('paddle') +logger.setLevel(logging.INFO) + +OOV_POLICY_IGNORE = 0 +OOV_POLICY_USE = 1 +OOV_POLICY_ERROR = 2 + +num_original_columns = 3 + +# Feature combination patterns. +# [[-1,0], [0,0]] means previous token at column 0 and current token at +# column 0 are combined as one feature. +patterns = [ + [[-2,0]], + [[-1,0]], + [[0,0]], + [[1,0]], + [[2,0]], + + [[-1,0], [0,0]], + [[0,0], [1,0]], + + [[-2,1]], + [[-1,1]], + [[0,1]], + [[1,1]], + [[2,1]], + [[-2,1], [-1,1]], + [[-1,1], [0,1]], + [[0,1], [1,1]], + [[1,1], [2,1]], + + [[-2,1], [-1,1], [0,1]], + [[-1,1], [0,1], [1,1]], + [[0,1], [1,1], [2,1]], +] + +dict_label = { + 'B-ADJP': 0, + 'I-ADJP': 1, + 'B-ADVP': 2, + 'I-ADVP': 3, + 'B-CONJP': 4, + 'I-CONJP': 5, + 'B-INTJ': 6, + 'I-INTJ': 7, + 'B-LST': 8, + 'I-LST': 9, + 'B-NP': 10, + 'I-NP': 11, + 'B-PP': 12, + 'I-PP': 13, + 'B-PRT': 14, + 'I-PRT': 15, + 'B-SBAR': 16, + 'I-SBAR': 17, + 'B-UCP': 18, + 'I-UCP': 19, + 'B-VP': 20, + 'I-VP': 21, + 'O': 22 +} + +def make_features(sequence): + length = len(sequence) + num_features = len(sequence[0]) + def get_features(pos): + if pos < 0: + return ['#B%s' % -pos] * num_features + if pos >= length: + return ['#E%s' % (pos - length + 1)] * num_features + return sequence[pos] + + for i in xrange(length): + for pattern in patterns: + fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern]) + sequence[i].append(fname) + +''' +Source file format: +Each line is for one timestep. The features are separated by space. +An empty line indicates end of a sequence. + +cutoff: a list of numbers. If count of a feature is smaller than this, + it will be ignored. +if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of +i-th column. 
+ +return a list of dict for each column +''' +def create_dictionaries(filename, cutoff, oov_policy): + def add_to_dict(sequence, dicts): + num_features = len(dicts) + for features in sequence: + l = len(features) + assert l == num_features, "Wrong number of features " + line + for i in xrange(l): + if features[i] in dicts[i]: + dicts[i][features[i]] += 1 + else: + dicts[i][features[i]] = 1 + + num_features = len(cutoff) + dicts = [] + for i in xrange(num_features): + dicts.append(dict()) + + f = gzip.open(filename, 'rb') + + sequence = [] + + for line in f: + line = line.strip() + if not line: + make_features(sequence) + add_to_dict(sequence, dicts) + sequence = [] + continue + features = line.split(' ') + sequence.append(features) + + + for i in xrange(num_features): + dct = dicts[i] + n = 1 if oov_policy[i] == OOV_POLICY_USE else 0 + todo = [] + for k, v in dct.iteritems(): + if v < cutoff[i]: + todo.append(k) + else: + dct[k] = n + n += 1 + + if oov_policy[i] == OOV_POLICY_USE: + # placeholder so that len(dct) will be the number of features + # including OOV + dct['#OOV#'] = 0 + + logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo))) + for k in todo: + del dct[k] + + f.close() + return dicts + + +def initializer(settings, **xargs): + cutoff = [3, 1, 0] + cutoff += [3] * len(patterns) + oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] + oov_policy += [OOV_POLICY_IGNORE] * len(patterns) + dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy) + dicts[2] = dict_label + settings.dicts = dicts + settings.oov_policy = oov_policy + input_types = [] + num_features = len(dicts) + for i in xrange(num_original_columns): + input_types.append(integer_sequence(len(dicts[i]))) + logger.info("slot %s size=%s" % (i, len(dicts[i]))) + if patterns: + dim = 0 + for i in xrange(num_original_columns, num_features): + dim += len(dicts[i]) + input_types.append(sparse_binary_vector_sequence(dim)) + logger.info("feature size=%s" % dim) + settings.input_types = input_types + +''' +if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not +existed in dicts[i] will be assigned to id 0. +if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist +in dicts[i]. 
+''' +@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM) +def process(settings, filename): + input_file = filename + dicts = settings.dicts + oov_policy = settings.oov_policy + + def gen_sample(sequence): + num_features = len(dicts) + sample = [list() for i in xrange(num_original_columns)] + if patterns: + sample.append([]) + for features in sequence: + assert len(features) == num_features, \ + "Wrong number of features: " + line + for i in xrange(num_original_columns): + id = dicts[i].get(features[i], -1) + if id != -1: + sample[i].append(id) + elif oov_policy[i] == OOV_POLICY_IGNORE: + sample[i].append(0xffffffff) + elif oov_policy[i] == OOV_POLICY_ERROR: + logger.fatal("Unknown token: %s" % features[i]) + else: + sample[i].append(0) + + if patterns: + dim = 0 + vec = [] + for i in xrange(num_original_columns, num_features): + id = dicts[i].get(features[i], -1) + if id != -1: + vec.append(dim + id) + elif oov_policy[i] == OOV_POLICY_IGNORE: + pass + elif oov_policy[i] == OOV_POLICY_ERROR: + logger.fatal("Unknown token: %s" % features[i]) + else: + vec.ids.append(dim + 0) + + dim += len(dicts[i]) + sample[-1].append(vec) + return sample + + num_features = len(dicts) + f = gzip.open(input_file, 'rb') + + num_sequences = 0 + sequence = [] + for line in f: + line = line.strip() + if not line: + make_features(sequence) + yield gen_sample(sequence) + sequence = [] + num_sequences += 1 + continue + features = line.split(' ') + sequence.append(features) + + f.close() + + logger.info("num_sequences=%s" % num_sequences) + diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py new file mode 100644 index 00000000000000..2bd1a20bc52fc5 --- /dev/null +++ b/demo/sequence_tagging/linear_crf.py @@ -0,0 +1,84 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +import math + +define_py_data_sources2(train_list="data/train.list", + test_list="data/test.list", + module="dataprovider", + obj="process") + + +batch_size = 1 +settings( + learning_method=MomentumOptimizer(), + batch_size=batch_size, + regularization=L2Regularization(batch_size * 1e-4), + average_window=0.5, + learning_rate=1e-1, + learning_rate_decay_a=1e-5, + learning_rate_decay_b=0.25, +) + +num_label_types=23 + +def get_simd_size(size): + return int(math.ceil(float(size) / 8)) * 8 + +# Currently, in order to use sparse_update=True, +# the size has to be aligned. 
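+# For example, the 23 chunk label types are padded up to 24, the next multiple of 8.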
+num_label_types = get_simd_size(num_label_types) + +features = data_layer(name="features", size=76328) +word = data_layer(name="word", size=6778) +pos = data_layer(name="pos", size=44) +chunk = data_layer(name="chunk", + size=num_label_types) + +crf_input = fc_layer( + input=features, + size=num_label_types, + act=LinearActivation(), + bias_attr=False, + param_attr=ParamAttr(initial_std=0, sparse_update=True)) + +crf=crf_layer( + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw", initial_std=0), +) + +crf_decoding=crf_decoding_layer( + size=num_label_types, + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw"), +) + +sum_evaluator( + name="error", + input=crf_decoding, +) + +chunk_evaluator( + name="chunk_f1", + input =[crf_decoding, chunk], + chunk_scheme="IOB", + num_chunk_types=11, +) + +inputs(word, pos, chunk, features) +outputs(crf) diff --git a/demo/sequence_tagging/readme.md b/demo/sequence_tagging/readme.md new file mode 100644 index 00000000000000..2e17fffb83c532 --- /dev/null +++ b/demo/sequence_tagging/readme.md @@ -0,0 +1,45 @@ +# Sequence Tagging + +This demo is a sequence model for assigning tags to each token in a sentence. The task is the CoNLL-2000 text chunking task. + +## Download data +```bash +cd demo/sequence_tagging +./data/get_data.sh +``` + +## Train model +```bash +cd demo/sequence_tagging +./train.sh +``` + +## Model description + +We provide two models. One is a linear CRF model (linear_crf.py) which is equivalent to the one at leon.bottou.org/projects/sgd. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py). +
+
+| Model name | Number of parameters | F1 score |
+| ---------- | -------------------- | -------- |
+| linear_crf | 1.8M                 | 0.937    |
+| rnn_crf    | 960K                 | 0.941    |
+
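+
+Both configs compute this score with `chunk_evaluator` using the IOB tagging scheme. As a rough, illustrative sketch of what chunk-level F1 measures (this is not PaddlePaddle's internal evaluator, just the standard CoNLL-style definition with hypothetical helper names):
+
+```python
+def extract_chunks(tags):
+    """Return a set of (start, end, type) chunks from IOB tags such as 'B-NP'."""
+    chunks, start, ctype = set(), None, None
+    for i, tag in enumerate(tags + ['O']):  # trailing 'O' flushes the last open chunk
+        boundary = (tag.startswith('B-') or tag == 'O'
+                    or (tag.startswith('I-') and tag[2:] != ctype))
+        if boundary:
+            if ctype is not None:
+                chunks.add((start, i, ctype))
+            start, ctype = (i, tag[2:]) if tag != 'O' else (None, None)
+    return chunks
+
+def chunk_f1(gold_tags, pred_tags):
+    gold, pred = extract_chunks(gold_tags), extract_chunks(pred_tags)
+    correct = len(gold & pred)
+    precision = correct / float(len(pred) or 1)
+    recall = correct / float(len(gold) or 1)
+    return 2 * precision * recall / ((precision + recall) or 1.0)
+
+# One of the two predicted chunks matches the gold chunks -> F1 = 0.5
+print(chunk_f1(['B-NP', 'I-NP', 'O', 'B-VP'], ['B-NP', 'I-NP', 'O', 'B-NP']))
+```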
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py new file mode 100644 index 00000000000000..fb157bf3ea7193 --- /dev/null +++ b/demo/sequence_tagging/rnn_crf.py @@ -0,0 +1,130 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +import math + +define_py_data_sources2(train_list="data/train.list", + test_list="data/test.list", + module="dataprovider", + obj="process") + +batch_size = 16 +settings( + learning_method=MomentumOptimizer(), + batch_size=batch_size, + regularization=L2Regularization(batch_size * 1e-5), + average_window=0.5, + learning_rate = 2e-3, + learning_rate_decay_a = 5e-7, + learning_rate_decay_b = 0.5, +) + +word_dim=128 +hidden_dim = 128 +with_rnn = True + +initial_std=1/math.sqrt(hidden_dim) +param_attr=ParamAttr(initial_std=initial_std) +cpu_layer_attr=ExtraLayerAttribute(device=-1) + +default_device(0) + +num_label_types=23 + +features = data_layer(name="features", size=76328) +word = data_layer(name="word", size=6778) +pos = data_layer(name="pos", size=44) +chunk = data_layer(name="chunk", + size=num_label_types, + layer_attr=cpu_layer_attr) + +emb = embedding_layer( + input=word, size=word_dim, param_attr=ParamAttr(initial_std=0)) + +hidden1 = mixed_layer( + size=hidden_dim, + act=STanhActivation(), + bias_attr=True, + input=[full_matrix_projection(emb), + table_projection(pos, param_attr=param_attr)] +) + +if with_rnn: + rnn1 = recurrent_layer( + act=ReluActivation(), + bias_attr=True, + input=hidden1, + param_attr=ParamAttr(initial_std=0), + ) + +hidden2 = mixed_layer( + size=hidden_dim, + act=STanhActivation(), + bias_attr=True, + input=[full_matrix_projection(hidden1) + ] + ([ + full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0)) + ] if with_rnn else []), +) + +if with_rnn: + rnn2=recurrent_layer( + reverse=True, + act=ReluActivation(), + bias_attr=True, + input=hidden2, + param_attr=ParamAttr(initial_std=0), + ) + +crf_input = mixed_layer( + size=num_label_types, + bias_attr=False, + input=[ + full_matrix_projection(hidden2), + ] + ([ + full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0)) + ] if with_rnn else []), +) + +crf = crf_layer( + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw", initial_std=0), + layer_attr=cpu_layer_attr, +) + +crf_decoding = crf_decoding_layer( + size=num_label_types, + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw"), + layer_attr=cpu_layer_attr, +) + +sum_evaluator( + name="error", + input=crf_decoding, +) + +chunk_evaluator( + name="chunk_f1", + input =[crf_decoding, chunk], + chunk_scheme="IOB", + num_chunk_types=11, +) + +inputs(word, pos, chunk, features) +outputs(crf) diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh new file mode 100755 index 00000000000000..9a706b98d86861 --- /dev/null +++ b/demo/sequence_tagging/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +paddle train \ + --config rnn_crf.py \ + --parallel_nn=1 
\ + --use_gpu=1 \ + --dot_period=10 \ + --log_period=1000 \ + --test_period=0 \ + --num_passes=10 diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh new file mode 100755 index 00000000000000..597b5afea9c63a --- /dev/null +++ b/demo/sequence_tagging/train_linear.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +paddle train \ + --config linear_crf.py \ + --use_gpu=0 \ + --dot_period=100 \ + --log_period=10000 \ + --test_period=0 \ + --num_passes=10 diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md index 06fcff61720755..bbdbb4d4227d0b 100644 --- a/doc/build/contribute_to_paddle.md +++ b/doc/build/contribute_to_paddle.md @@ -99,3 +99,7 @@ git pull --rebase upstream HEAD git push -f origin HEAD ``` Now your Pull Request is updated with the latest version. + +## Revise your pull request + +When you revise your pull request according to reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull requrest and the old pull request. diff --git a/doc/build/docker_install.rst b/doc/build/docker_install.rst index 542b9bac27afb8..e95de35f4da35f 100644 --- a/doc/build/docker_install.rst +++ b/doc/build/docker_install.rst @@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme .. code-block:: bash - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}" + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest diff --git a/doc/demo/quick_start/index_en.md b/doc/demo/quick_start/index_en.md index ee3fa2a2166f49..e7d74512292c89 100644 --- a/doc/demo/quick_start/index_en.md +++ b/doc/demo/quick_start/index_en.md @@ -134,7 +134,7 @@ def process(settings, file_name): You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies: - The path of the training and testing data (`data/train.list`, `data/test.list`). -- The location of the data provider file (`dataprovider_pow`). +- The location of the data provider file (`dataprovider_bow`). - The function to call to get data. (`process`). - Additional arguments or data. Here it passes the path of word dictionary. diff --git a/doc/ui/api/trainer_config_helpers/layers.rst b/doc/ui/api/trainer_config_helpers/layers.rst index 01443466105b5b..ab27c3bd6e8ad7 100644 --- a/doc/ui/api/trainer_config_helpers/layers.rst +++ b/doc/ui/api/trainer_config_helpers/layers.rst @@ -73,6 +73,12 @@ img_pool_layer :members: img_pool_layer :noindex: +maxout_layer +------------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: maxout_layer + :noindex: + Norm Layer ========== @@ -130,6 +136,12 @@ gru_step_layer Recurrent Layer Group ===================== +memory +------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: memory + :noindex: + recurrent_group --------------- .. automodule:: paddle.trainer_config_helpers.layers @@ -377,6 +389,12 @@ ctc_layer :members: ctc_layer :noindex: +nce_layer +----------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: nce_layer + :noindex: + hsigmoid --------- .. 
automodule:: paddle.trainer_config_helpers.layers diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.md b/doc_cn/algorithm/rnn/hierarchical-layer.md new file mode 100644 index 00000000000000..5282bbbcb82d00 --- /dev/null +++ b/doc_cn/algorithm/rnn/hierarchical-layer.md @@ -0,0 +1,66 @@ +# 支持双层序列作为输入的Layer + +## 概述 + +在自然语言处理任务中,序列是一种常见的数据类型。一个独立的词语,可以看作是一个非序列输入,或者,我们称之为一个0层的序列;由词语构成的句子,是一个单层序列;若干个句子构成一个段落,是一个双层的序列。 + +双层序列是一个嵌套的序列,它的每一个元素,又是一个单层的序列。这是一种非常灵活的数据组织方式,帮助我们构造一些复杂的输入信息。 + +我们可以按照如下层次定义非序列,单层序列,以及双层序列。 + ++ 0层序列:一个独立的元素,类型可以是PaddlePaddle支持的任意输入数据类型 ++ 单层序列:排成一列的多个元素,每个元素是一个0层序列,元素之间的顺序是重要的输入信息 ++ 双层序列:排成一列的多个元素,每个元素是一个单层序列,称之为双层序列的一个子序列(subseq),subseq的每个元素是一个0层序列 + + +在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 +## pooling_layer + +pooling_layer的使用示例如下,详细见配置API。 +```python +seq_pool = pooling_layer(input=layer, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) +``` +- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。 +- `agg_level=AggregateLevel.TIMESTEP`时(默认值): + - 作用:双层序列经过运算变成一个0层序列,或单层序列经过运算变成一个0层序列 + - 输入:一个双层序列,或一个单层序列 + - 输出:一个0层序列,即整个输入序列(单层或双层)的平均值(或最大值) +- `agg_level=AggregateLevel.EACH_SEQUENCE`时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,序列的每个元素是原来双层序列每个subseq元素的平均值(或最大值) + +## last_seq 和 first_seq + +last_seq的使用示例如下(first_seq类似),详细见配置API。 +```python +last = last_seq(input=layer, + agg_level=AggregateLevel.EACH_SEQUENCE) +``` +- `agg_level=AggregateLevel.TIMESTEP`时(默认值): + - 作用:一个双层序列经过运算变成一个0层序列,或一个单层序列经过运算变成一个0层序列 + - 输入:一个双层序列或一个单层序列 + - 输出:一个0层序列,即整个输入序列(双层或者单层)最后一个,或第一个元素。 +- `agg_level=AggregateLevel.EACH_SEQUENCE`时: + - 作用:一个双层序列经过运算变成一个单层序列 + - 输入:必须是一个双层序列 + - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 + +## expand_layer + +expand_layer的使用示例如下,详细见配置API。 +```python +expand = expand_layer(input=layer1, + expand_as=layer2, + expand_level=ExpandLevel.FROM_TIMESTEP) +``` +- `expand_level=ExpandLevel.FROM_TIMESTEP`时(默认值): + - 作用:一个0层序列经过运算扩展成一个单层序列,或者一个双层序列 + - 输入:layer1必须是一个0层序列,是待扩展的数据;layer2可以是一个单层序列,或者是一个双层序列,提供扩展的长度信息 + - 输出:一个单层序列,或一个双层序列,输出序列的类型(双层序列,或单层序列)和序列中含有元素的数目同 layer2一致。若输出是单层序列,单层序列的每个元素(0层序列),都是对layer1元素的拷贝;若输出是双层序列,双层序列每个subseq中每个元素(0层序列),都是对layer1元素的拷贝 +- `expand_level=ExpandLevel.FROM_SEQUENCE`时: + - 作用:一个单层序列经过运算扩展成一个双层序列 + - 输入:layer1必须是一个单层序列,是待扩展的数据;layer2必须是一个双层序列,提供扩展的长度信息 + - 输出:一个双层序列,序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目(0层序列),和双层序列含有subseq 的数目一致。单层序列第i个元素(0层序列),被扩展为一个单层序列,构成了输出双层序列的第i个subseq。 \ No newline at end of file diff --git a/doc_cn/algorithm/rnn/hierarchical-rnn.md b/doc_cn/algorithm/rnn/hierarchical-rnn.md new file mode 100644 index 00000000000000..4a85cf336146ef --- /dev/null +++ b/doc_cn/algorithm/rnn/hierarchical-rnn.md @@ -0,0 +1,403 @@ +# 双层RNN配置与示例 + +我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中,通过多组语义相同的单双层RNN配置,讲解如何使用双层RNN。 + +## 示例1:双进双出,subseq间无memory + +配置:单层RNN(`sequence_layer_group`)和双层RNN(`sequence_nest_layer_group`),语义完全相同。 + +### 读取双层序列的方法 + +首先,我们看一下单双层序列的不同数据组织形式(您也可以采用别的组织形式): + +- 单层序列的数据(`Sequence/tour_train_wdseg`)如下,一共有10个样本。每个样本由两部分组成,一个label(此处都为2)和一个已经分词后的句子。 + +```text +2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 +2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * +2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 +2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 +2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . 
+2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 +2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! +2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 +2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * +2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 +``` + +- 双层序列的数据(`Sequence/tour_train_wdseg.nest`)如下,一共有4个样本。样本间用空行分开,代表不同的双层序列,序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。 + +```text +2 酒店 有 很 舒适 的 床垫 子 , 床上用品 也 应该 是 一人 一 换 , 感觉 很 利落 对 卫生 很 放心 呀 。 +2 很 温馨 , 也 挺 干净 的 * 地段 不错 , 出来 就 有 全家 , 离 地铁站 也 近 , 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 , 就 第一天 给 了 一次性杯子 * + +2 位置 方便 , 强烈推荐 , 十一 出去玩 的 时候 选 的 , 对面 就是 华润万家 , 周围 吃饭 的 也 不少 。 +2 交通便利 , 吃 很 便利 , 乾 浄 、 安静 , 商务 房 有 电脑 、 上网 快 , 价格 可以 , 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。 +2 本来 准备 住 两 晚 , 第 2 天 一早 居然 停电 , 且 无 通知 , 只有 口头 道歉 。 总体来说 性价比 尚可 , 房间 较 新 , 还是 推荐 . + +2 这个 酒店 去过 很多 次 了 , 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店 +2 挺好 的 汉庭 , 前台 服务 很 热情 , 卫生 很 整洁 , 房间 安静 , 水温 适中 , 挺好 ! + +2 HowardJohnson 的 品质 , 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 , 简直 一 流 。 就 在 天一阁 、 月湖 旁边 , 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。 +2 酒店 很干净 , 很安静 , 很 温馨 , 服务员 服务 好 , 各方面 都 不错 * +2 挺好 的 , 就是 没 窗户 , 不过 对 得 起 这 价格 +``` + +其次,我们看一下单双层序列的不同dataprovider(见`sequenceGen.py`): + +- 单层序列的dataprovider如下: + - word_slot是integer_value_sequence类型,代表单层序列。 + - label是integer_value类型,代表一个向量。 + +```python +def hook(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sequence(len(settings.word_dict)), + integer_value(3)] + +@provider(init_hook=hook) +def process(settings, file_name): + with open(file_name, 'r') as fdata: + for line in fdata: + label, comment = line.strip().split('\t') + label = int(''.join(label.split())) + words = comment.split() + word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + yield word_slot, label +``` + +- 双层序列的dataprovider如下: + - word_slot是integer_value_sub_sequence类型,代表双层序列。 + - label是integer_value_sequence类型,代表单层序列,即一个子句一个label。注意:也可以为integer_value类型,代表一个向量,即一个句子一个label。通常根据任务需求进行不同设置。 + - 关于dataprovider中input_types的详细用法,参见PyDataProvider2。 + +```python +def hook2(settings, dict_file, **kwargs): + settings.word_dict = dict_file + settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), + integer_value_sequence(3)] + +@provider(init_hook=hook2) +def process2(settings, file_name): + with open(file_name) as fdata: + label_list = [] + word_slot_list = [] + for line in fdata: + if (len(line)) > 1: + label,comment = line.strip().split('\t') + label = int(''.join(label.split())) + words = comment.split() + word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] + label_list.append(label) + word_slot_list.append(word_slot) + else: + yield word_slot_list, label_list + label_list = [] + word_slot_list = [] +``` + +### 模型中的配置 + +首先,我们看一下单层序列的配置(见`sequence_layer_group.conf`)。注意:batchsize=5表示一次过5句单层序列,因此2个batch就可以完成1个pass。 + +```python +settings(batch_size=5) + +data = data_layer(name="word", size=dict_dim) + +emb = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +with mixed_layer(size=hidden_dim*4) as lstm_input: + lstm_input += full_matrix_projection(input=emb) + +lstm = lstmemory_group(input=lstm_input, + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + +lstm_last = last_seq(input=lstm) + +with mixed_layer(size=label_dim, + 
act=SoftmaxActivation(), + bias_attr=True) as output: + output += full_matrix_projection(input=lstm_last) + +outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) + +``` +其次,我们看一下语义相同的双层序列配置(见`sequence_nest_layer_group.conf`),并对其详细分析: + +- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知,2句双层序列和5句单层序列的数据完全一样。 +- data_layer和embedding_layer不关心数据是否是序列格式,因此两个配置在这两层上的输出是一样的。 +- lstmemory: + - 单层序列过了一个mixed_layer和lstmemory_group。 + - 双层序列在同样的mixed_layer和lstmemory_group外,直接加了一层group。由于这个外层group里面没有memory,表示subseq间不存在联系,即起到的作用仅仅是把双层seq拆成单层,因此双层序列过完lstmemory的输出和单层的一样。 +- last_seq: + - 单层序列直接取了最后一个元素 + - 双层序列首先(last_seq层)取了每个subseq的最后一个元素,将其拼接成一个新的单层序列;接着(expand_layer层)将其扩展成一个新的双层序列,其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量;最后(average_layer层)取了每个subseq的平均值。 + - 分析得出:第一个last_seq后,每个subseq的最后一个元素就等于单层序列的最后一个元素,而expand_layer和average_layer后,依然保持每个subseq最后一个元素的值不变(这两层仅是为了展示它们的用法,实际中并不需要)。因此单双层序列的输出是一样旳。 + +```python +settings(batch_size=2) + +data = data_layer(name="word", size=dict_dim) + +emb_group = embedding_layer(input=data, size=word_dim) + +# (lstm_input + lstm) is equal to lstmemory +def lstm_group(lstm_group_input): + with mixed_layer(size=hidden_dim*4) as group_input: + group_input += full_matrix_projection(input=lstm_group_input) + + lstm_output = lstmemory_group(input=group_input, + name="lstm_group", + size=hidden_dim, + act=TanhActivation(), + gate_act=SigmoidActivation(), + state_act=TanhActivation(), + lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + return lstm_output + +lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group), + step=lstm_group, + name="lstm_nest_group") +# hasSubseq ->(seqlastins) seq +lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE) + +# seq ->(expand) hasSubseq +lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE) + +# hasSubseq ->(average) seq +lstm_average = pooling_layer(input=lstm_expand, + pooling_type=AvgPooling(), + agg_level=AggregateLevel.EACH_SEQUENCE) + +with mixed_layer(size=label_dim, + act=SoftmaxActivation(), + bias_attr=True) as output: + output += full_matrix_projection(input=lstm_average) + +outputs(classification_cost(input=output, label=data_layer(name="label", size=1))) +``` +## 示例2:双进双出,subseq间有memory + +配置:单层RNN(`sequence_rnn.conf`),双层RNN(`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`),语义完全相同。 + +### 读取双层序列的方法 + +我们看一下单双层序列的不同数据组织形式和dataprovider(见`rnn_data_provider.py`) +```python +data = [ + [[[1, 3, 2], [4, 5, 2]], 0], + [[[0, 2], [2, 5], [0, 1, 2]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(3)]) +def process_subseq(settings, file_name): + for d in data: + yield d + +@provider(input_types=[integer_value_sequence(10), + integer_value(3)]) +def process_seq(settings, file_name): + for d in data: + seq = [] +``` +- 单层序列:有两句,分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。 +- 双层序列:有两句,分别为[[1,3,2],[4,5,2]](2个子句)和[[0,2],[2,5],[0,1,2]](3个子句)。 +- 单双层序列的label都分别是0和1 + +### 模型中的配置 + +我们选取单双层序列配置中的不同部分,来对比分析两者语义相同的原因。 + +- 单层序列:过了一个很简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全链接。 + +```python +def step(y): + mem = memory(name="rnn_state", size=hidden_dim) + return fc_layer(input=[y, mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="rnn_state") + +out = recurrent_group(step=step, input=emb) +``` +- 双层序列,外层memory是一个元素: + - 
内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem,表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中,outer_mem是一个子句的最后一个向量,即整个双层group是将前一个子句的最后一个向量,作为下一个子句memory的初始状态。 + - 从输入数据上看,单双层序列的句子是一样的,只是双层序列将其又做了子序列划分。因此双层序列的配置中,必须将前一个子句的最后一个元素,作为boot_layer传给下一个子句的memory,才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。 + +```python +def outer_step(x): + outer_mem = memory(name="outer_rnn_state", size=hidden_dim) + def inner_step(y): + inner_mem = memory(name="inner_rnn_state", + size=hidden_dim, + boot_layer=outer_mem) + return fc_layer(input=[y, inner_mem], + size=hidden_dim, + act=TanhActivation(), + bias_attr=True, + name="inner_rnn_state") + + inner_rnn_output = recurrent_group( + step=inner_step, + input=x) + last = last_seq(input=inner_rnn_output, name="outer_rnn_state") + + return inner_rnn_output + +out = recurrent_group(step=outer_step, input=SubsequenceInput(emb)) +``` +- 双层序列,外层memory是单层序列: + - 由于外层每个时间步返回的是一个子句,这些子句的长度往往不等长。因此当外层有is_seq=True的memory时,内层是**无法直接使用**它的,即内层memory的boot_layer不能链接外层的这个memory。 + - 如果内层memory想**间接使用**这个外层memory,只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下,外层memory必须有boot_layer,否则在第0个时间步时,由于外层memory没有任何seq信息,因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。 + +## 示例3:双进双出,输入不等长 + +**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input,用targetInlink表示。参考配置:单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`),双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`) + +### 读取双层序列的方法 + +我们看一下单双层序列的数据组织形式和dataprovider(见`rnn_data_provider.py`) +```python +data2 = [ + [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], + [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], +] + +@provider(input_types=[integer_value_sub_sequence(10), + integer_value_sub_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider + for d in data2: + yield d + + +@provider(input_types=[integer_value_sequence(10), + integer_value_sequence(10), + integer_value(2)], + should_shuffle=False) +def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider + for d in data2: + words1=reduce(lambda x,y: x+y, d[0]) + words2=reduce(lambda x,y: x+y, d[1]) + yield words1, words2, d[2] +``` + +data2 中有两个样本,每个样本有两个特征, 记fea1, fea2。 + +- 单层序列:两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]] +- 双层序列:两个样本分别为 + - **样本1**:[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句,fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]] + - **样本2**:[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句, fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。
+ - **注意**:每个样本中,各特征的子句数目需要相等。这里说的“双进双出,输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本,时刻i=2, fea1[2]=[4, 5, 2],fea2[2]=[3, 1],3≠2。 +- 单双层序列中,两个样本的label都分别是0和1 + +### 模型中的配置 + +单层RNN(`sequence_rnn_multi_unequalength_inputs.conf`)和双层RNN(`sequence_nest_rnn_multi_unequalength_inputs.conf`)两个模型配置达到的效果完全一样,区别只在于输入为单层还是双层序列,现在我们来看它们内部分别是如何实现的。 + +- 单层序列: + - 过了一个简单的recurrent_group。每一个时间步,当前的输入y和上一个时间步的输出rnn_state做了一个全连接,功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里,两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列,最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 + - 注意到这里recurrent_group输入的每个样本中,fea1和fea2的长度都分别相等,这并非偶然,而是因为recurrent_group要求输入为单层序列时,所有输入的长度都必须相等。 + +```python +def step(x1, x2): + def calrnn(y): + mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim) + out = fc_layer(input = [y, mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'rnn_state_' + y.name) + return out + + encoder1 = calrnn(x1) + encoder2 = calrnn(x2) + return [encoder1, encoder2] + +encoder1_rep, encoder2_rep = recurrent_group( + name="stepout", + step=step, + input=[emb1, emb2]) + +encoder1_last = last_seq(input = encoder1_rep) +encoder1_expandlast = expand_layer(input = encoder1_last, + expand_as = encoder2_rep) +context = mixed_layer(input = [identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep)], + size = hidden_dim) +``` +- 双层序列: + - 双层RNN中,对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2),其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是,此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。 + - 函数`outer_step`中可以分别处理这两个特征,但我们需要用targetInlink指定recurrent_group的输出的格式(各子句长度)只能和其中一个保持一致,如这里选择了和emb2的长度一致。 + - 最后,依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。 + +```python +def outer_step(x1, x2): + outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim) + outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim) + def inner_step1(y): + inner_mem = memory(name = 'inner_rnn_state_' + y.name, + size = hidden_dim, + boot_layer = outer_mem1) + out = fc_layer(input = [y, inner_mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'inner_rnn_state_' + y.name) + return out + + def inner_step2(y): + inner_mem = memory(name = 'inner_rnn_state_' + y.name, + size = hidden_dim, + boot_layer = outer_mem2) + out = fc_layer(input = [y, inner_mem], + size = hidden_dim, + act = TanhActivation(), + bias_attr = True, + name = 'inner_rnn_state_' + y.name) + return out + + encoder1 = recurrent_group( + step = inner_step1, + name = 'inner1', + input = x1) + + encoder2 = recurrent_group( + step = inner_step2, + name = 'inner2', + input = x2) + + sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1') + sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2') + + encoder1_expand = expand_layer(input = sentence_last_state1, + expand_as = encoder2) + + return [encoder1_expand, encoder2] + +encoder1_rep, encoder2_rep = recurrent_group( + name="outer", + step=outer_step, + input=[SubsequenceInput(emb1), SubsequenceInput(emb2)], + targetInlink=emb2) + +encoder1_last = last_seq(input = encoder1_rep) +encoder1_expandlast = expand_layer(input = encoder1_last, + expand_as = encoder2_rep) +context = mixed_layer(input = [identity_projection(encoder1_expandlast), + identity_projection(encoder2_rep)], + size = hidden_dim) +``` + +## 示例4:beam_search的生成 + +TBD \ No newline at end of file diff --git a/doc_cn/algorithm/rnn/rnn-tutorial.md 
b/doc_cn/algorithm/rnn/rnn-tutorial.md new file mode 100644 index 00000000000000..7a553054c80392 --- /dev/null +++ b/doc_cn/algorithm/rnn/rnn-tutorial.md @@ -0,0 +1,96 @@ +# Recurrent Group教程 + +## 概述 + +序列数据是自然语言处理任务面对的一种主要输入数据类型。 + +一句话是由词语构成的序列,多句话进一步构成了段落。因此,段落可以看作是一个嵌套的双层的序列,这个序列的每个元素又是一个序列。 + +双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式,帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入,我们可以设计搭建一个灵活的、层次化的RNN,分别从词语和句子级别编码输入数据,同时也能够引入更加复杂的记忆机制,更好地完成一些复杂的语言理解任务。 + +在PaddlePaddle中,`recurrent_group`是一种任意复杂的RNN单元,用户只需定义RNN在一个时间步内完成的计算,PaddlePaddle负责完成信息和误差在时间序列上的传播。 + +更进一步,`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算,最终实现一个层次化的复杂RNN。 + +目前,在PaddlePaddle中,能够对双向序列进行处理的有`recurrent_group`和部分Layer,具体可参考文档:支持双层序列作为输入的Layer。 + +## 相关概念 + +### 基本原理 +`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算,PaddlePaddle负责完成信息和梯度在时间序列上的传播。 + +PaddlePaddle中,`recurrent_group`的一个简单调用如下: + +``` python +recurrent_group(step, input, reverse) +``` +- step:一个可调用的函数,定义一个时间步之内RNN单元完成的计算 +- input:输入,必须是一个单层序列,或者一个双层序列 +- reverse:是否以逆序处理输入序列 + +使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer,完成任意的运算逻辑。`recurrent_group` 的输入(即input)会成为step函数的输入,由于step 函数只关注于RNN一个时间步之内的计算,在这里`recurrent_group`替我们完成了原始输入数据的拆分。 + +### 输入 +`recurrent_group`处理的输入序列主要分为以下三种类型: + +- **数据输入**:一个双层序列进入`recurrent_group`会被拆解为一个单层序列,一个单层序列进入`recurrent_group`会被拆解为非序列,然后交给step函数,这一过程对用户是完全透明的。可以有以下两种:1)通过data_layer拿到的用户输入;2)其它layer的输出。 + +- **只读Memory输入**:`StaticInput` 定义了一个只读的Memory,由`StaticInput`指定的输入不会被`recurrent_group`拆解,`recurrent_group` 循环展开的每个时间步总是能够引用所有输入,可以是一个非序列,或者一个单层序列。 + +- **序列生成任务的输入**:`GeneratedInput`只用于在序列生成任务中指定输入数据。 + +### 输入示例 + +序列生成任务大多遵循encoder-decoer架构,encoder和decoder可以是能够处理序列的任意神经网络单元,而RNN是最流行的选择。 + +给定encoder输出和当前词,decoder每次预测产生下一个最可能的词语。在这种结构中,decoder接受两个输入: + +- 要生成的目标序列:是decoder的数据输入,也是decoder循环展开的依据,`recurrent_group`会对这类输入进行拆解。 + +- encoder输出,可以是一个非序列,或者一个单层序列:是一个unbounded memory,decoder循环展开的每一个时间步会引用全部结果,不应该被拆解,这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turning Machine](https://arxiv.org/abs/1410.5401)。 + +在序列生成任务中,decoder RNN总是引用上一时刻预测出的词的词向量,作为当前时刻输入。`GeneratedInput`自动完成这一过程。 + +### 输出 +`step`函数必须返回一个或多个Layer的输出,这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中,`recurrent_group` 会将每个时间步的输出拼接,这个过程对用户也是透明的。 + +### memory +memory只能在`recurrent_group`中定义和使用。memory不能独立存在,必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出,因此,可以将memory理解为一个时延操作。 + +可以显示地指定一个layer的输出用于初始化memory。不指定时,memory默认初始化为0。 + +## 双层RNN介绍 +`recurrent_group`帮助我们完成对输入序列的拆分,对输出的合并,以及计算逻辑在序列上的循环展开。 + +利用这种特性,两个嵌套的`recurrent_group`能够处理双层序列,实现词语和句子两个级别的双层RNN结构。 + +- 单层(word-level)RNN:每个状态(state)对应一个词(word)。 +- 双层(sequence-level)RNN:一个双层RNN由多个单层RNN组成,每个单层RNN(即双层RNN的每个状态)对应一个子句(subseq)。 + +为了描述方便,下文以NLP任务为例,将含有子句(subseq)的段落定义为一个双层序列,将含有词语的句子定义为一个单层序列,那么0层序列即为一个词语。 + +## 双层RNN的使用 + +### 训练流程的使用方法 +使用 `recurrent_group`需要遵循以下约定: + +- **单进单出**:输入和输出都是单层序列。 + - 如果有多个输入,不同输入序列含有的词语数必须严格相等。 + - 输出一个单层序列,输出序列的词语数和输入序列一致。 + - memory:在step函数中定义 memory指向一个layer,通过引用memory得到这个layer上一个时刻输出,形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory,每个时间步之内的运算是独立的。 + - boot_layer:memory的初始状态,默认初始状为0,memory的is_seq参数必须为false。 + +- **双进双出**:输入和输出都是双层序列。 + - 如果有多个输入序列,不同输入含有的子句(subseq)数必须严格相等,但子句含有的词语数可以不相等。 + - 输出一个双层序列,子句(subseq)数、子句的单词数和指定的一个输入序列一致,默认为第一个输入。 + - memory:在step函数中定义memory,指向一个layer,通过引用memory得到这个layer上一个时刻的输出,形成recurrent连接。定义在外层`recurrent_group` step函数中的memory,能够记录上一个subseq 的状态,可以是一个单层序列(只作为read-only memory),也可以是一个词语。如果没有定义memory,那么 subseq 之间的运算是独立的。 + - boot_layer:memory 
初始状态,可以是一个单层序列(只作为read-only memory)或一个向量。默认不设置,即初始状态为0。 + +- **双进单出**:目前还未支持,会报错"In hierachical RNN, all out links should be from sequences now"。 + + +### 生成流程的使用方法 +使用`beam_search`需要遵循以下约定: + +- 单层RNN:从一个word生成下一个word。 +- 双层RNN:即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看,也不存在一个subseq直接生成下一个subseq的情况。 \ No newline at end of file diff --git a/doc_cn/build_and_install/install/docker_install.rst b/doc_cn/build_and_install/install/docker_install.rst index 44aa2a0983f4fd..a5f5fb117e11e8 100644 --- a/doc_cn/build_and_install/install/docker_install.rst +++ b/doc_cn/build_and_install/install/docker_install.rst @@ -23,9 +23,9 @@ PaddlePaddle提供的Docker镜像版本 +-----------------+------------------+------------------------+-----------------------+ | GPU | gpu-latest | gpu-devel-latest | gpu-demo-latest | +-----------------+------------------+------------------------+-----------------------+ -| CPU WITHOUT AVX | cpu-noavx-latest | cpu-devel-noavx-latest | cpu-demo-noavx-latest | +| CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest | +-----------------+------------------+------------------------+-----------------------+ -| GPU WITHOUT AVX | gpu-noavx-latest | gpu-devel-noavx-latest | gpu-demo-noavx-latest | +| GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest | +-----------------+------------------+------------------------+-----------------------+ 其中,横向包括三个版本,normal,devel和demo。 diff --git a/doc_cn/conf.py.in b/doc_cn/conf.py.in index 391f7981eab809..93242ace406000 100644 --- a/doc_cn/conf.py.in +++ b/doc_cn/conf.py.in @@ -47,6 +47,7 @@ extensions = [ 'sphinx.ext.autosummary', 'sphinx.ext.mathjax', 'sphinx.ext.napoleon', + 'sphinx.ext.graphviz' ] table_styling_embed_css = True diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst new file mode 100644 index 00000000000000..283607957ce630 --- /dev/null +++ b/doc_cn/faq/index.rst @@ -0,0 +1,169 @@ +#################### +PaddlePaddle常见问题 +#################### + +.. contents:: + +1. 如何减少PaddlePaddle的内存占用 +--------------------------------- + +神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。 +PaddlePaddle的内存占用主要分为如下几个方面\: + +* DataProvider缓冲池内存 (只针对内存) +* 神经元激活内存 (针对内存和显存) +* 参数内存 (针对内存和显存) +* 其他内存杂项 + +这其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等, +这些内存就不考虑如何缩减了。 + +其他的内存的减少方法依次为 + + +减少DataProvider缓冲池内存 +++++++++++++++++++++++++++ + +PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即 + +.. graphviz:: + + digraph { + rankdir=LR; + 数据文件 -> 内存池 -> PaddlePaddle训练 + } + +所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这 +个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的, +那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为 + +.. literalinclude:: reduce_min_pool_size.py + +这样做可以极大的减少内存占用,并且可能会加速训练过程。 详细文档参考 `这里 +<../ui/data_provider/pydataprovider2.html#provider>`_ 。 + +神经元激活内存 +++++++++++++++ + +神经网络在训练的时候,会对每一个激活暂存一些数据,包括激活,參差等等。 +在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系, +一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含 +的时间步信息成正比。 + +所以,做法可以有两种。他们是 + +* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。 +* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200, + 但是突然有一个10000长的序列,就很容易导致内存超限。特别是在LSTM等RNN中。 + +参数内存 +++++++++ + +PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。 +例如如果使用 :code:`adadelta` 算法,则需要使用参数规模大约5倍的内存。 如果参数保存下来的 +文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。 + +可以考虑使用一些优化算法,例如 :code:`momentum`。 + +2. 
+2. 如何加速PaddlePaddle的训练速度 +--------------------------------- + +PaddlePaddle是神经网络训练平台,加速PaddlePaddle训练有如下几个方面\: + +* 减少数据载入的耗时 +* 加速训练速度 +* 利用更多的计算资源 + +减少数据载入的耗时 +++++++++++++++++++ + +使用 :code:`pydataprovider`时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大地加速数据载入流程。 +:code:`DataProvider` 缓存池的减小,和之前通过减小缓存池来减小内存占用的原理一致。 + +.. literalinclude:: reduce_min_pool_size.py + +同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。 + + +加速训练速度 +++++++++++++ + +PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任意一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True`。 + +这里以简单的 :code:`word2vec` 训练语言模型为例,具体使用方法为\: + +使用一个词的前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\: + +.. literalinclude:: word2vec_dataprovider.py + +这个任务的配置为\: + +.. literalinclude:: word2vec_config.py + +更多关于sparse训练的内容请参考 `sparse训练的文档 `_ + +利用更多的计算资源 +++++++++++++++++++ + +利用更多的计算资源可以分为以下几个方式来进行\: + +* 单机CPU训练 + * 使用多线程训练。设置命令行参数 :code:`trainer_count`,即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4` +* 单机GPU训练 + * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true` + * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练,使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4` +* 多机训练 + * 使用多机训练的方法也比较简单,需要先在每个节点启动 :code:`paddle pserver`,再使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址 + * 具体的多机训练方法参考 `多机训练 `_ 文档。 + + +3. 遇到“非法指令”或者是“illegal instruction” +-------------------------------------------- + +paddle在进行计算的时候为了提升计算性能,使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说,执行 :code:`grep avx /proc/cpuinfo` 看看是否有输出即可知道是否支持。(另:用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉,请当成是不支持,看下面的解决方案) + +解决办法是\: + +* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_ +* 或者,使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。 + + +4. 如何选择SGD算法的学习率 +-------------------------- + +在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。 + +通常做法是从一个比较大的learning_rate开始试,如果不收敛,则将学习率缩小10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出,如果模型采用不变的输出,最小的cost0是多少。 + +如果训练过程的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。 + + +5. 如何初始化参数 +----------------- + +默认情况下,PaddlePaddle使用均值为0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\: + +* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)` +* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)` + +比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。 + +.. code-block:: python + + hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), + bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0)) + +上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[-1.0, 1.0]` 的均匀分布。 + +6. 如何共享参数 +--------------- + +PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是让想要共享的参数使用同样的 :code:`ParamAttr` 对象。 + +简单的全连接网络,参数共享的配置示例为\: + +.. 
literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py + +这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。 + + diff --git a/doc_cn/faq/reduce_min_pool_size.py b/doc_cn/faq/reduce_min_pool_size.py new file mode 100644 index 00000000000000..2811b134b66b1e --- /dev/null +++ b/doc_cn/faq/reduce_min_pool_size.py @@ -0,0 +1,6 @@ +@provider(min_pool_size=0, ...) +def process(settings, filename): + os.system('shuf %s > %s.shuf' % (filename, filename)) # shuffle before. + with open('%s.shuf' % filename, 'r') as f: + for line in f: + yield get_sample_from_line(line) \ No newline at end of file diff --git a/doc_cn/faq/word2vec_config.py b/doc_cn/faq/word2vec_config.py new file mode 100644 index 00000000000000..e347252476eab6 --- /dev/null +++ b/doc_cn/faq/word2vec_config.py @@ -0,0 +1,8 @@ +... # the settings and define data provider is omitted. +DICT_DIM=3000 # dictionary dimension. +word_ids=data_layer('word_ids', size=DICT_DIM) + +emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True)) +emb_sum = pooling_layer(input=emb, pooling_type=SumPooling()) +predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax()) +outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM))) \ No newline at end of file diff --git a/doc_cn/faq/word2vec_dataprovider.py b/doc_cn/faq/word2vec_dataprovider.py new file mode 100644 index 00000000000000..a0a39080cece90 --- /dev/null +++ b/doc_cn/faq/word2vec_dataprovider.py @@ -0,0 +1,8 @@ +DICT_DIM=3000 +@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)]) +def process(settings, filename): + with open(filename) as f: + # yield word ids to predict inner word id + # such as [28, 29, 10, 4], 4 + # It means the sentance is 28, 29, 4, 10, 4. + yield read_next_from_file(f) \ No newline at end of file diff --git a/doc_cn/index.rst b/doc_cn/index.rst index 6cf5588b5b34f5..d2d50fbdb47f27 100644 --- a/doc_cn/index.rst +++ b/doc_cn/index.rst @@ -3,6 +3,7 @@ PaddlePaddle文档 使用指南 -------- + * `快速入门 `_ * `编译与安装 `_ * `用户接口 `_ @@ -16,4 +17,13 @@ PaddlePaddle文档 算法教程 -------- -* `RNN配置 <../doc/algorithm/rnn/rnn.html>`_ + +* `Recurrent Group教程 `_ +* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_ +* `双层RNN示例 `_ +* `支持双层序列作为输入的Layer `_ + +常见问题 +-------- + +* `常见问题 `_ diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc_cn/ui/data_provider/mnist_provider.dict.py index 4eab5b1fd3b50a..bf13b56372b56a 100644 --- a/doc_cn/ui/data_provider/mnist_provider.dict.py +++ b/doc_cn/ui/data_provider/mnist_provider.dict.py @@ -2,10 +2,10 @@ # Define a py data provider -@provider(input_types=[ - dense_vector(28 * 28), - integer_value(10) -]) +@provider(input_types={ + 'pixel': dense_vector(28 * 28), + 'label': integer_value(10) +}) def process(settings, filename): # settings is not used currently. f = open(filename, 'r') # open one of training file @@ -20,6 +20,6 @@ def process(settings, filename): # settings is not used currently. pixels_float.append(float(each_pixel_str)) # give data to paddle. 
- yield { "pixel": pixels_float, 'label': int(label) } + yield {"pixel": pixels_float, 'label': int(label)} f.close() # close file diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc_cn/ui/data_provider/pydataprovider2.rst index 9e1d8c531f5ba2..80b40084d8f503 100644 --- a/doc_cn/ui/data_provider/pydataprovider2.rst +++ b/doc_cn/ui/data_provider/pydataprovider2.rst @@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数: 是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size * cache 是数据缓存的策略,参考 `cache`_ * init_hook 是初始化时调用的函数,参考 `init_hook`_ -* use_dynamic_order 如果是true的话,可以返回一个dict,key是data_layer的名字,value是特征值。同时,也可以 - 返回一个list或者tuple。如果是false的话,只能够返回list或者tuple * check 设置成true的话,会根据input_types检查数据的合法性。 * check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔到这条数据,继续训练。 如果 check是false的话,没有作用。 diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh index f7019b27f8f02a..657fdf65e92c9d 100755 --- a/paddle/.set_python_path.sh +++ b/paddle/.set_python_path.sh @@ -33,7 +33,7 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then esac done shift $(($OPTIND - 1)) - export PYTHONPATH=$PYPATH + export PYTHONPATH=$PYPATH:$PYTHONPATH $@ else echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment." diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt old mode 100644 new mode 100755 index e03a9a1baa0041..cdb730bb3cec7a --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -2,10 +2,17 @@ set(AVX_SOURCES src/hl_math.cc src/hl_avx_functions.cc ) -set(CUDA_SOURCES - src/hl_time.cc - src/hl_cpu_functions.cc - ${AVX_SOURCES}) + +if(WITH_AVX) + set(CUDA_SOURCES + src/hl_time.cc + src/hl_cpu_functions.cc + ${AVX_SOURCES}) +else() + set(CUDA_SOURCES + src/hl_time.cc + src/hl_cpu_functions.cc) +endif() set(CUDA_CXX_WITH_GPU_SOURCES src/hl_cuda_cublas.cc diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h index 77e2649b172144..1fe2774cc5a291 100644 --- a/paddle/cuda/include/hl_base.h +++ b/paddle/cuda/include/hl_base.h @@ -185,7 +185,7 @@ typedef struct { size_t nnz; } _hl_sparse_matrix_s, *hl_sparse_matrix_s; -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE /** * HPPL data type: real (float or double) * diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index aa4720f6ca749f..b5240da0f398c8 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -169,7 +169,7 @@ extern void hl_avgpool_forward( * @brief Maximum pool backward. * * @param[in] frameCnt batch size of input image. - * @param[in] outGrad input data. + * @param[in] outGrad output grad data. * @param[in] channels number of channel. * @param[in] height image height. * @param[in] width image width. @@ -296,4 +296,34 @@ extern void hl_bilinear_backward(real* inGrad, const size_t outputW, const size_t numChannels); +/** + * @brief MaxOut forward. + * + * @param[in] inData input data. + * @param[out] outData output data. + * @param[out] idData output maxId. + * @param[in] batchSize batchSize. + * @param[in] size number of channels * image height * image width. + * @param[in] featLen feature length = image height * image width. + * @param[in] groups number of groups. + */ +extern void hl_maxout_forward( + const real* inData, real* outData, int* idData, + size_t batchSize, size_t size, size_t featLen, size_t groups); + +/** + * @brief MaxOut backward. + * + * @param[out] inGrad input grad data. + * @param[in] outGrad output grad data. + * @param[in] idData output maxId. + * @param[in] batchSize batchSize. 
+ * @param[in] size number of channels * image height * image width. + * @param[in] featLen feature length = image height * image width. + * @param[in] groups number of groups. + */ +extern void hl_maxout_backward( + real* inGrad, const real* outGrad, const int* idData, + size_t batchSize, size_t size, size_t featLen, size_t groups); + #endif /* HL_CNN_H_ */ diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh index cba1c9f30da8d5..d39cf67448b4f2 100644 --- a/paddle/cuda/include/hl_cpu_gru.cuh +++ b/paddle/cuda/include/hl_cpu_gru.cuh @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/math/MathFunctions.h" -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE #define CBLAS_GEMM paddle::gemm #else #define CBLAS_GEMM paddle::gemm diff --git a/paddle/cuda/include/hl_gpu_functions.cuh b/paddle/cuda/include/hl_gpu_functions.cuh index 38df4eb8958f21..a2c5ebd18a4403 100644 --- a/paddle/cuda/include/hl_gpu_functions.cuh +++ b/paddle/cuda/include/hl_gpu_functions.cuh @@ -28,7 +28,7 @@ namespace hppl { const real min = SIGMOID_THRESHOLD_MIN; const real max = SIGMOID_THRESHOLD_MAX; real tmp = (a < min) ? min : ((a > max) ? max : a); -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE return __fdividef(1.0f, 1.0f + __expf(-tmp)); #else return 1.0 / (1.0 + exp(-tmp)); @@ -36,7 +36,7 @@ namespace hppl { } __device__ static real tanh(const real a) { -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f; #else return (2.0 / (1.0 + exp(-2.0*a))) - 1.0; diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh index 473d394c0c688d..a3645ef51e6ef7 100644 --- a/paddle/cuda/include/hl_matrix_base.cuh +++ b/paddle/cuda/include/hl_matrix_base.cuh @@ -30,7 +30,7 @@ limitations under the License. */ #define INLINE inline #endif -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE #define DEVICE_FMAX fmaxf #define DEVICE_FMIN fminf #else diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh index 6917f362901411..51e483d1fb2ff3 100644 --- a/paddle/cuda/include/hl_matrix_type.cuh +++ b/paddle/cuda/include/hl_matrix_type.cuh @@ -21,7 +21,7 @@ limitations under the License. */ #ifdef __CUDA_ARCH__ // typedef void* vecType; #include -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE typedef float4 vecType; #else typedef double2 vecType; @@ -30,7 +30,7 @@ typedef double2 vecType; #include #include #include -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE typedef __m128 vecType; #else typedef __m128d vecType; diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index 828c21beb2fbd4..46d86b2982f065 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad, */ extern void hl_sequence2batch_copy(real *batch, real *sequence, - int *batchIndex, + const int *batchIndex, int seqWidth, int batchCount, bool seq2batch); diff --git a/paddle/cuda/include/hl_sse_matrix_kernel.cuh b/paddle/cuda/include/hl_sse_matrix_kernel.cuh index c90d49e4adeb5e..45db2f313e0d6e 100644 --- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh +++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh @@ -20,7 +20,7 @@ limitations under the License. 
*/ #define VECTOR_SIZE 16 -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE /* number of float in vector */ #define VECTOR_LEN 4 #define VECTOR_SET _mm_set_ps1 @@ -41,7 +41,7 @@ inline bool hl_check_align(void *ptr) { return hl_check_align(reinterpret_cast(ptr)); } -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE template inline real hl_agg_op(Agg agg, vecType mm) { __m128 lo = _mm_unpacklo_ps(mm, mm); diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index aa9442fb80237e..cf79fad9004cd8 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -113,4 +113,12 @@ inline void hl_bilinear_backward(real* inGrad, const size_t outputW, const size_t numChannels) {} +inline void hl_maxout_forward( + const real* inData, real* outData, int* idData, + size_t batchSize, size_t size, size_t featLen, size_t group) {} + +inline void hl_maxout_backward( + real* inGrad, const real* outGrad, const int* idData, + size_t batchSize, size_t size, size_t featLen, size_t group) {} + #endif // HL_CNN_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index 417f40e0a69f6c..aabd956c37f7dc 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad, inline void hl_sequence2batch_copy(real *batch, real *sequence, - int *batchIndex, + const int *batchIndex, int seqWidth, int batchCount, bool seq2batch) {} diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index f965adc13575c1..499b61195af5e1 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -662,4 +662,63 @@ void hl_bilinear_backward(real* inGrad, threadNum, inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH, outImgW, outputH, outputW, numChannels, ratioH, ratioW); CHECK_SYNC("hl_bilinear_backward failed"); -} \ No newline at end of file +} + +__global__ void maxoutFpCompute(size_t nthreads, const real * inData, + real * outData, int* idData, + size_t size, size_t featLen, size_t groups) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < nthreads) { + size_t batch_idx = index / size; + size_t i = index % size; + size_t channel_idx = i / featLen; + size_t feat_idx = i % featLen; + size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx; + real max = inData[data_idx]; + int maxId = 0; + for (size_t g = 1; g < groups; ++g) { + real tmp = inData[data_idx + g * featLen]; + if (tmp > max) { + max = tmp; + maxId = g; + } + } + outData[index] = max; + idData[index] = maxId; + } +} + +void hl_maxout_forward(const real* inData, real* outData, + int* idData, size_t batchSize, size_t size, + size_t featLen, size_t groups) { + int num_kernels = size * batchSize; + int blocks = (num_kernels + 1024 - 1) / 1024; + maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>( + num_kernels, inData, outData, idData, size, featLen, groups); + CHECK_SYNC("hl_maxout_forward failed"); +} + +__global__ void maxoutBpCompute(size_t nthreads, real* inGrad, + const real* outGrad, const int* idData, + size_t size, size_t featLen, size_t groups) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if(index < nthreads) { + size_t batch_idx = index / size; + size_t i = index % size; + size_t channel_idx = i / featLen; + size_t feat_idx = i % featLen; + size_t newIndex = batch_idx * size; + size_t gradIdx = 
(channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx; + (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i]; + } +} + +void hl_maxout_backward(real* inGrad, const real* outGrad, + const int* idData, size_t batchSize, size_t size, + size_t featLen, size_t groups) { + int num_kernels = size * batchSize; + int blocks = (num_kernels + 1024 - 1) / 1024; + maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>( + num_kernels, inGrad, outGrad, idData, size, featLen, groups); + CHECK_SYNC("hl_maxout_backward failed"); +} diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index dc109487ded20f..b3c9001ba39736 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -84,7 +84,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) } /* namespace dynload */ -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE #define CUBLAS_GEAM dynload::cublasSgeam #define CUBLAS_GEMV dynload::cublasSgemv #define CUBLAS_GEMM dynload::cublasSgemm diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index c2dce1977bdf5d..b215c0f6e33a18 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -340,7 +340,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); CHECK_NOTNULL(hl_desc); -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; @@ -373,7 +373,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) { (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); CHECK_NOTNULL(hl_desc); -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; @@ -611,7 +611,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter, CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; @@ -921,7 +921,7 @@ void hl_softmax_forward(real *input, int height, int width) { -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; @@ -955,7 +955,7 @@ void hl_softmax_backward(real *output_value, int height, int width) { -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index f4c07367b485b8..e9fe9f1c117a05 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -626,7 +626,7 @@ void hl_specify_devices_start(int* device, int number) { void hl_rand(real *dest_d, size_t num) { pthread_mutex_lock(t_resource.gen_mutex); CHECK_EQ( -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE dynload::curandGenerateUniform(t_resource.gen, dest_d, num), #else dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 38e4f16217c2a4..067e68c41e1198 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -47,7 +47,7 @@ void hl_matrix_add(real *A_d, CHECK_SYNC("hl_matrix_add failed"); } -#ifdef 
HPPL_TYPE_DOUBLE +#ifdef PADDLE_TYPE_DOUBLE #define THRESHOLD 128 #else #define THRESHOLD 64 @@ -102,7 +102,7 @@ void subMaxAndExp(real* I, val = -THRESHOLD; } I[nextIdx] = val; -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE O[nextIdx] = __expf(val); #else O[nextIdx] = exp(val); diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index e028880156e5b1..63824eaa4c201c 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -374,7 +374,7 @@ template __global__ void KeSequence2Batch(real *batch, real *sequence, - int *batchIndex, + const int *batchIndex, int seqWidth, int batchCount) { int idx = threadIdx.x; @@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch, void hl_sequence2batch_copy(real *batch, real *sequence, - int *batchIndex, + const int *batchIndex, int seqWidth, int batchCount, bool seq2batch) { diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh index 13e89390d68c22..c3b98f4ebc38db 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cuh +++ b/paddle/cuda/src/hl_cuda_sparse.cuh @@ -355,7 +355,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d, } /* best perf */ -#ifndef HPPL_TYPE_DOUBLE +#ifndef PADDLE_TYPE_DOUBLE #define CU_CSCMM_THREAD_M_BEST 9 #else #define CU_CSCMM_THREAD_M_BEST 4 diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index c3b4769f7612b7..8cefbb30ada46d 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -57,7 +57,8 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) { } } -DoubleBuffer::DoubleBuffer(DataProvider* dataPool, bool useGpu, +DoubleBuffer::DoubleBuffer(DataProvider *dataPool, + bool useGpu, int64_t batchSize) { batchSize_ = batchSize; dataPool_ = dataPool; @@ -110,6 +111,9 @@ void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) { } void DoubleBuffer::insertOneBatch(DataBatch* batch) { + while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) { // time out + if (stopping_) return; + } BufferBatch* bufBatch = bufferQueue_->dequeue(); // clone and copy the data from an Threadlocal Variable bufBatch->clone(batch, useGpu_); @@ -138,7 +142,7 @@ void DoubleBuffer::asyncLoadBatch() { actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch); } insertOneBatch(&newBatch); - } while (actualSize > 0); + } while (actualSize > 0 && !stopping_); } } diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 534491d70d5467..112e45de1cb232 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -259,7 +259,9 @@ typedef Queue BufferBatchQueue; class DoubleBuffer { public: - DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0); + DoubleBuffer(DataProvider* dataPool, + bool useGpu, + int64_t batchSize = 0); virtual ~DoubleBuffer(); void removeOneBatch(DataBatch* dataBatch); @@ -308,7 +310,8 @@ class DataProvider { /** * @brief create only used for unittest. 
*/ - inline static DataProvider* create(const DataConfig &config, bool useGpu) { + inline static DataProvider* create(const DataConfig &config, + bool useGpu = FLAGS_use_gpu) { return create(config, ModelConfig(), useGpu); } @@ -348,7 +351,6 @@ class DataProvider { */ virtual void reset() { if (doubleBuffer_ != nullptr) { - LOG(INFO) << "the double-buffer is starting ..."; doubleBuffer_->startAsyncLoad(); } } diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index 2f9a1223c6e454..ca8b07af49ca07 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -14,13 +14,20 @@ limitations under the License. */ #ifndef PADDLE_NO_PYTHON +#include #include #include #include #include +#include +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#include #include "DataProvider.h" + #include "paddle/utils/PythonUtil.h" +#include "paddle/utils/Locks.h" +#include "paddle/utils/Stat.h" namespace paddle { @@ -202,7 +209,10 @@ class PyDataProvider2 : public DataProvider { PyDataProvider2(const DataConfig& config, const ModelConfig& modelConfig, bool useGpu) - :DataProvider(config, useGpu), callingContextCreated_(2) { + :DataProvider(config, useGpu), + callingContextCreated_(2) { + if (PyArray_API == NULL) + import_array(); auto& args = config.load_data_args(); PyObjectPtr kwargs = PyObjectPtr(PyDict_New()); if (!args.empty()) { @@ -246,8 +256,7 @@ class PyDataProvider2 : public DataProvider { PyObjectPtr && kwargs) { LOG(INFO) << "loading dataprovider " << model <<"::" << className; - PyObjectPtr module(PyImport_ImportModule(model.c_str())); - CHECK_PY(module) << "Cannot imort module " << model.c_str(); + PyObjectPtr module = py::import(model); PyObjectPtr moduleDict(PyModule_GetDict(module.get())); CHECK_PY(moduleDict) << "Invoke module.__dict__ error"; PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), @@ -455,6 +464,7 @@ class PyDataProvider2 : public DataProvider { std::condition_variable pushCV_; std::condition_variable pullCV_; std::mutex mtx_; + ThreadBarrier callingContextCreated_; std::unique_ptr cache_; @@ -497,8 +507,8 @@ class PyDataProvider2 : public DataProvider { * Resetting the PyDataProvider. May start reading thread here. */ virtual void reset() { - DataProvider::reset(); resetImpl(true); + DataProvider::reset(); } /** @@ -519,6 +529,7 @@ class PyDataProvider2 : public DataProvider { * Loading a batch of data. */ int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) { + REGISTER_TIMER("PyDP2.getNextBatchInternal") CHECK_GE(size_, 0); size_t size = (size_t) size_; if (loadThread_) { // loading from thread should wait for data pool ready. @@ -699,10 +710,22 @@ class DenseScanner: public IFieldScanner { */ virtual void fill(Argument &argument, PyObject *obj) { real* dat = argument.value->getData() + height_ * headerPtr_->dim; - py::SequenceHelper s(obj); - // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. 
- for (size_t i=0; i < headerPtr_->dim; ++i) { - dat[i] = (real) s.getDouble(i); + if (PyArray_Check(obj)) { + auto dtype = PyArray_DTYPE((PyArrayObject*)obj); + if (dtype->type == 'f' && dtype->elsize == sizeof(real)) { + real * data = (real*)PyArray_DATA((PyArrayObject*)obj); + auto sz = PyArray_SIZE((PyArrayObject*)obj); + std::copy(data, data + sz, dat); + } else { + LOG(FATAL) << "You should yield float" << sizeof(real) * 8 + << " array"; + } + } else { + py::SequenceHelper s(obj); + // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. + for (size_t i=0; i < headerPtr_->dim; ++i) { + dat[i] = (real) s.getDouble(i); + } } ++height_; } diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp index 273925ba55ee40..22579891f397af 100644 --- a/paddle/gserver/evaluators/ChunkEvaluator.cpp +++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp @@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator { public: virtual void init(const EvaluatorConfig& config) { - CHECK(!FLAGS_use_gpu) << "Not supported"; Evaluator::init(config); if (config.chunk_scheme() == "IOB") { numTagTypes_ = 2; @@ -137,6 +136,7 @@ class ChunkEvaluator : public Evaluator { CHECK_EQ(arguments.size(), (size_t)2); IVectorPtr& output = arguments[0].ids; IVectorPtr& label = arguments[1].ids; + CHECK(!output->useGpu() && !label->useGpu()) << "Not supported"; auto sequenceStartPositions = arguments[1].sequenceStartPositions->getVector(false); CHECK_EQ(output->getSize(), label->getSize()); diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index 787ce703a08aef..0ded30eeb44e95 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse( para->getMat(PARAMETER_GRADIENT).get()); std::vector& ids = mainMat->getIds(threadId_); - ids.clear(); for (auto slaveParams : slaveParameters) { SparseRowCpuMatrix* mat = dynamic_cast((*slaveParams)[pid] diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index fc38bca3c403b2..340cd1b9f8e927 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -544,6 +544,12 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, const std::vector inArgs; std::vector outArgs; frames_[i]->forward(inArgs, &outArgs, passType); + if (hasSubseq) { + for (auto& outFrameLine : outFrameLines_) { + CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions) + << "In hierachical RNN, all out links should be from sequences."; + } + } } if (evaluator_ && passType == PASS_TEST) { this->eval(evaluator_.get()); @@ -635,16 +641,15 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId, std::vector sequenceStartPositions; const int* subSequenceStartPositions = nullptr; - if (hasSubseq) { // for sequenceScatterAgentLayer - subSequenceStartPositions = - input.subSequenceStartPositions->getData(false); + if (hasSubseq) { // for sequenceScatterAgentLayer + subSequenceStartPositions = input.subSequenceStartPositions->getData(false); inlinkInfo->seqStartPosIndex.clear(); inlinkInfo->seqStartPosIndex.push_back(0); // first seqStartPosIndex = 0 } // maxSequenceLength_: max topLevelLength in allsamples for (int i = 0; i < maxSequenceLength_; ++i) { if (hasSubseq) { - 
sequenceStartPositions.push_back(0); // first element = 0 + sequenceStartPositions.push_back(0); // first element = 0 } int numSeqs = 0; for (size_t j = 0; j < numSequences; ++j) { @@ -676,9 +681,9 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId, } if (hasSubseq) { // inFrameLine create sequenceStartPositions one time - CHECK_EQ(sequenceStartPositions.size(), - static_cast(maxSequenceLength_ + - input.getNumSubSequences())); + CHECK_EQ( + sequenceStartPositions.size(), + static_cast(maxSequenceLength_ + input.getNumSubSequences())); CHECK_EQ(inlinkInfo->seqStartPosIndex.size(), static_cast(maxSequenceLength_ + 1)); createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions); @@ -1102,10 +1107,12 @@ size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, newPaths.end(), Path::greaterPath); newPaths.resize(totalExpandCount + minNewPathSize); - real minPathLogProb = std::min_element(newPaths.end() - minNewPathSize, - newPaths.end())->logProb; - real maxPathLogProb = std::max_element(newPaths.end() - minNewPathSize, - newPaths.end())->logProb; + real minPathLogProb = + std::min_element(newPaths.end() - minNewPathSize, newPaths.end()) + ->logProb; + real maxPathLogProb = + std::max_element(newPaths.end() - minNewPathSize, newPaths.end()) + ->logProb; // Remove the already formed paths that are relatively short finalPaths_[seqId].erase( diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 056e9568852ac9..5e07446c71ff62 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "AgentLayer.h" #include "paddle/utils/Logging.h" @@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) { // get Arguments from real layers if (numSamples_ > 0 && numSamples_ < realNumSequences) { - int numRows = realOutput.sequenceStartPositions-> - getData(false)[numSamples_]; + int numRows = + realOutput.sequenceStartPositions->getData(false)[numSamples_]; CHECK(!realOutput.ids) << "Not supported"; output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_, /* trans */ false, /* seqFlag */ true, @@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) { int width = this->getSize(); if (realOutArg_.value || realOutArg_.ids) { - output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, - width, useGpu_); + output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, + useGpu_); } else { // used in generation if (realLayer_->getOutput().ids) { IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); @@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) { if (realOutArg_.value || realOutArg_.ids) { CHECK(realOutArg_.sequenceStartPositions); - output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, - width, useGpu_, /* trans */ false, /* seqFlag */ true, + output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, + useGpu_, /* trans */ false, /* seqFlag */ true, /* seqStart */ seqStartPosIndex_, /* seqSize */ numSequences_); } else { @@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) { CHECK_NE(input.sequenceStartPositions.get(), output_.sequenceStartPositions.get()); ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, - numSequences + 1, false); + numSequences + 1, false); int* outStarts = output_.sequenceStartPositions->getMutableData(false); - IVector::resizeOrCreate(cpuInputStartPos_, height, false); - int* inStarts = cpuInputStartPos_->getData(); + ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false); + int* inStarts = inputStartPos_->getMutableData(false); + size_t offsetOut = 0; for (size_t i = 0; i < numSequences; ++i) { outStarts[i] = offsetOut; @@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) { } outStarts[numSequences] = offsetOut; - if (useGpu_) { - IVector::resizeOrCreate(inputStartPos_, height, true); - inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT); - } else { - inputStartPos_ = cpuInputStartPos_; - } - outputValue->copyByRowIndex(*input.value, *inputStartPos_); + outputValue->copyByRowIndex(*input.value, + *inputStartPos_->getVector(useGpu_)); } } diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h index d82078dd933294..3d7bf558340707 100644 --- a/paddle/gserver/layers/AgentLayer.h +++ b/paddle/gserver/layers/AgentLayer.h @@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer { protected: // use to store expanded cpuStartPositions or subSequenceStartPositions // of real layer. 
- IVectorPtr cpuInputStartPos_; - - // point to cpuInputStartPos_ when useGpu_ is false - // copy from cpuInputStartPos_ when useGpu_ is true - IVectorPtr inputStartPos_; + ICpuGpuVectorPtr inputStartPos_; public: explicit SequenceScatterAgentLayer(const LayerConfig& config) diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp index 374117b7659bbe..7401cdc9a516bb 100644 --- a/paddle/gserver/layers/AverageLayer.cpp +++ b/paddle/gserver/layers/AverageLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "AverageLayer.h" #include "paddle/utils/Logging.h" @@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer); bool AverageLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); + SequencePoolLayer::init(layerMap, parameterMap); - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_); outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_); // average strategy @@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap, } else { LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy(); } - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); return true; } void AverageLayer::forward(PassType passType) { - Layer::forward(passType); - - // average layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - size_t dim = getSize(); - const Argument& input = getInput(0); - int64_t newBatchSize = - type_ ? input.getNumSubSequences() : input.getNumSequences(); - ICpuGpuVectorPtr startPositions = - type_ ? input.subSequenceStartPositions - : input.sequenceStartPositions; - const int* starts = startPositions->getData(false); - size_t numSequences = startPositions->getSize() - 1; - - // check - CHECK_EQ(numSequences, (size_t)newBatchSize); - CHECK_EQ(starts[numSequences], input.getBatchSize()); - if (type_) { - // when trans_type = seq, input must hasSubseq - CHECK_EQ(input.hasSubseq(), 1UL); - } + SequencePoolLayer::forward(passType); - CHECK_EQ(dim, input.value->getWidth()); - - resetOutput(newBatchSize, dim); - auto startsPos = startPositions->getVector(useGpu_); MatrixPtr inputValue = getInputValue(0); - getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_); - - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no sequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new sequenceStartPositions. - */ - if (type_) { - output_.degradeSequence(input, useGpu_); - } + getOutputValue()->sequenceAvgForward( + *inputValue, *startPositions_->getVector(useGpu_), mode_); /* add the bias-vector AFTER average operation */ if (biases_.get() != NULL) { @@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) { } void AverageLayer::backward(const UpdateCallback& callback) { - const Argument& input = getInput(0); - ICpuGpuVectorPtr startPositions = - type_ ? 
input.subSequenceStartPositions - : input.sequenceStartPositions; - const int* starts = startPositions->getData(false); - /* Do derivation */ { backwardActivation(); } - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } + SequencePoolLayer::backward(callback); + const int* starts = startPositions_->getData(false); MatrixPtr grad = getInputGrad(0); + if (grad) { size_t dim = getSize(); real* gradientData = getInputGrad(0)->getData(); real* gradient = getOutputGrad()->getData(); - size_t numSequences = startPositions->getSize() - 1; + size_t numSequences = startPositions_->getSize() - 1; for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { // TODO(Dangqingqing) optimization for GPU int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h index ae910ddefad137..1edc2ace492c5b 100644 --- a/paddle/gserver/layers/AverageLayer.h +++ b/paddle/gserver/layers/AverageLayer.h @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once -#include "Layer.h" +#include "SequencePoolLayer.h" #include "paddle/math/Matrix.h" namespace paddle { @@ -23,20 +22,21 @@ namespace paddle { /** * A layer for "internal average" for sequence input. * Input: one or more sequences. Each sequence contains some instances. - * If AverageLevel = kNonSeq: + * If SequenceLevel = kNonSeq: * Output: output size is the number of input sequences (NOT input instances) * output[i] = average_{for each instance in this sequence}{input[i]} - * If AverageLevel = kSeq: + * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences * output[i] = average_{for each instance in this sub-sequence}{input[i]} + * + * The config file api is pooling_layer. */ - -class AverageLayer : public Layer { +class AverageLayer : public SequencePoolLayer { public: enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 }; - enum AverageLevel { kNonSeq = 0, kSeq = 1 }; - explicit AverageLayer(const LayerConfig& config) : Layer(config) {} + explicit AverageLayer(const LayerConfig& config) + : SequencePoolLayer(config) {} ~AverageLayer() {} @@ -46,11 +46,8 @@ class AverageLayer : public Layer { void backward(const UpdateCallback& callback = nullptr); protected: - std::unique_ptr biases_; MatrixPtr outMtx_; MatrixPtr dataMtx_; int mode_; - int type_; }; - } // namespace paddle diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/gserver/layers/ExpandLayer.cpp index bbd0b53273b430..9290ce4f6d46c1 100644 --- a/paddle/gserver/layers/ExpandLayer.cpp +++ b/paddle/gserver/layers/ExpandLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ExpandLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) { const Argument& shapeInput = getInput(1); const Argument& dataInput = getInput(0); size_t outputBatchSize = shapeInput.getBatchSize(); - auto startPositions = - type_ ? 
shapeInput.subSequenceStartPositions - : shapeInput.sequenceStartPositions; + auto startPositions = type_ ? shapeInput.subSequenceStartPositions + : shapeInput.sequenceStartPositions; size_t numSequences = startPositions->getSize() - 1; const int* starts = startPositions->getData(false); @@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) { // set output sequence info as shape sequence output_.sequenceStartPositions = shapeInput.sequenceStartPositions; if (shapeInput.hasSubseq()) { - output_.subSequenceStartPositions = - shapeInput.subSequenceStartPositions; + output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions; } // reserve output: Expand output to batchsize of sequence data. @@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) { MatrixPtr inputValue = getInputValue(0); MatrixPtr outputValue = getOutputValue(); - IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false); - int* expandStarts = cpuExpandStartsPos_->getData(); + ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false); + int* expandStarts = expandStartsPos_->getMutableData(false); for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) { int sequenceLength = starts[sequenceId + 1] - starts[sequenceId]; for (int j = 0; j < sequenceLength; j++) { @@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) { } } - if (useGpu_) { - // TODO(Dangqingqing) move copyFrom - IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true); - expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT); - } else { - expandStartsPos_ = cpuExpandStartsPos_; - } - - outputValue->copyByRowIndex(*inputValue, *expandStartsPos_); + outputValue->copyByRowIndex(*inputValue, + *expandStartsPos_->getVector(useGpu_)); if (biases_.get() != NULL) { outputValue->addBias(*(biases_->getW()), 1); @@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) { void ExpandLayer::backward(const UpdateCallback& callback) { if (biases_ && biases_->getWGrad()) { biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - /* Increasing the number of gradient */ + /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); } if (!getInputGrad(0)) return; MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); - auto cpuSeqStartPos = - type_ ? getInput(1).subSequenceStartPositions - : getInput(1).sequenceStartPositions; + auto cpuSeqStartPos = type_ ? 
getInput(1).subSequenceStartPositions + : getInput(1).sequenceStartPositions; size_t numSequences = cpuSeqStartPos->getSize() - 1; const int* starts = cpuSeqStartPos->getData(false); diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h index 8a3eb1c973a475..fbe0ced9b1754d 100644 --- a/paddle/gserver/layers/ExpandLayer.h +++ b/paddle/gserver/layers/ExpandLayer.h @@ -44,14 +44,9 @@ class ExpandLayer : public Layer { enum ExpandLevel { kNonSeq = 0, kSeq = 1 }; /// store the ExpandLevel int type_; - // TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_ - // and expandStartsPos_ /// expanded sequenceStartPositions or subSequenceStartPositions /// of input[1] - IVectorPtr cpuExpandStartsPos_; - /// point to cpuExpandStartsPos_ when useGpu_ is false, - /// copy from cpuExpandStartsPos_ when useGpu_ is true - IVectorPtr expandStartsPos_; + ICpuGpuVectorPtr expandStartsPos_; public: explicit ExpandLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp index 226e0ea87dbd4a..c4ffe894eccd61 100644 --- a/paddle/gserver/layers/MaxLayer.cpp +++ b/paddle/gserver/layers/MaxLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MaxLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -21,55 +20,11 @@ namespace paddle { REGISTER_LAYER(max, MaxLayer); -bool MaxLayer::init(const LayerMap& layerMap, - const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } - - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); - return true; -} - void MaxLayer::forward(PassType passType) { - Layer::forward(passType); - // max layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - size_t dim = getSize(); - const Argument& input = getInput(0); - int64_t newBatchSize = - type_ ? input.getNumSubSequences() : input.getNumSequences(); - ICpuGpuVectorPtr startPositions = - type_ ? input.subSequenceStartPositions - : input.sequenceStartPositions; - auto starts = startPositions->getVector(useGpu_); - size_t numSequences = startPositions->getSize() - 1; + SequencePoolLayer::forward(passType); - CHECK_EQ(dim, input.value->getWidth()); - CHECK_EQ(numSequences, (size_t)newBatchSize); - CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize()); - if (type_) { - // when trans_type = seq, input must hasSubseq - CHECK_EQ(input.hasSubseq(), 1UL); - } - - // reset output: resize to "num of sequences", not "batch size". 
- resetOutput(newBatchSize, dim); - - IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_)); + IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(), + useGpu(deviceId_)); maxIndex_->zeroMem(); MatrixPtr inputValue = getInputValue(0); @@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) { { REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str()); - outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_); - } - - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no cpuSequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new cpuSequenceStartPositions. - */ - if (type_) { - output_.degradeSequence(input, useGpu_); + outputValue->maxSequenceForward( + *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_); } if (config_.output_max_index()) { @@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) { void MaxLayer::backward(const UpdateCallback& callback) { CHECK(!config_.output_max_index()) << "backward is not available when output_max_index is set"; - /* Do derivation */ { backwardActivation(); } - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } + SequencePoolLayer::backward(callback); MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); if (inputGrad) { - ICpuGpuVectorPtr starts = - type_ ? getInput(0).subSequenceStartPositions - : getInput(0).sequenceStartPositions; REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str()); - inputGrad->maxSequenceBackward(*outputGrad, - *(starts->getVector(useGpu_)), *maxIndex_); + inputGrad->maxSequenceBackward( + *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_); } } diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h index b4c34e665d926d..e6dcfe9c6759d1 100644 --- a/paddle/gserver/layers/MaxLayer.h +++ b/paddle/gserver/layers/MaxLayer.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once -#include "Layer.h" +#include "SequencePoolLayer.h" #include "paddle/math/Matrix.h" #include "paddle/utils/ThreadLocal.h" @@ -24,29 +24,30 @@ namespace paddle { /** * A layer for "internal max" for sequence input. * Input: one or more sequences. Each sequence contains some instances. - * If MaxLevel = kNonSeq: + * If SequenceLevel = kNonSeq: * Output: output size is the number of input sequences (NOT input instances) * output[i] = max_{for each instance in this sequence}{input[i]} - * If MaxLevel = kSeq: + * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: output size is the number of input sub-sequences * output[i] = max_{for each instance in this sub-sequence}{input[i]} + * + * The config file api is pooling_layer. */ -class MaxLayer : public Layer { +class MaxLayer : public SequencePoolLayer { protected: - std::unique_ptr biases_; // maxIndex_[i][j] = k : the value at (i, j) is from input[k]. 
IVectorPtr maxIndex_; - int type_; public: - explicit MaxLayer(const LayerConfig& config) : Layer(config) {} - enum MaxLevel {kNonSeq = 0, kSeq = 1 }; + explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {} ~MaxLayer() {} - bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) { + return SequencePoolLayer::init(layerMap, parameterMap); + } void forward(PassType passType); void backward(const UpdateCallback& callback = nullptr); diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/gserver/layers/MaxOutLayer.cpp new file mode 100644 index 00000000000000..a3de069bf7a6c9 --- /dev/null +++ b/paddle/gserver/layers/MaxOutLayer.cpp @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MaxOutLayer.h" +#include "hl_gpu.h" +#include "hl_cnn.h" + +namespace paddle { + +REGISTER_LAYER(maxout, MaxOutLayer); + +size_t MaxOutLayer::getSize() { + const MaxOutConfig& maxoutConf = config_.inputs(0).maxout_conf(); + imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + imgSizeH_ = maxoutConf.img_size_y(); + } + if (imgSizeW_ == 0) { + imgSizeW_ = maxoutConf.img_size_x(); + } + + featLen_ = imgSizeH_ * imgSizeW_; + size_t layerSize = featLen_ * outputChannels_; + + getOutput().setFrameHeight(imgSizeH_); + getOutput().setFrameWidth(imgSizeW_); + + return layerSize; +} + +bool MaxOutLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + /* the size of inputs for maxout-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + const MaxOutConfig& conf = config_.inputs(0).maxout_conf(); + groups_ = conf.groups(); + channels_ = conf.channels(); + CHECK_EQ(channels_ % groups_, 0UL); + outputChannels_ = channels_ / groups_; + + return true; +} + +void MaxOutLayer::forward(PassType passType) { + Layer::forward(passType); + + /* malloc memory for the output_ if necessary */ + /* note: one sample correspond to one column */ + size_t batchSize = getInput(0).getBatchSize(); + size_t size = getSize(); + resetOutput(batchSize, size); + MatrixPtr inputV = getInputValue(0); + MatrixPtr outV = getOutputValue(); + + IVector::resizeOrCreate(maxoutId_, size * batchSize, useGpu_); + outV->maxoutForward(*inputV, *maxoutId_, outputChannels_, groups_); +} + +void MaxOutLayer::backward(const UpdateCallback& callback) { + (void)callback; + + /* Do derivation */ + MatrixPtr inputG = getInputGrad(0); + MatrixPtr outG = getOutputGrad(); + + if (inputG) { + inputG->maxoutBackward(*outG, *maxoutId_, outputChannels_, groups_); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h new file mode 100644 index 00000000000000..9011a5c332b17a --- /dev/null +++ b/paddle/gserver/layers/MaxOutLayer.h @@ -0,0 +1,54 @@ +/* 
Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * A layer to do max out on conv layer output. + * Input: output of a conv layer. + * Output: feature map size same as input. Channel is (input channel) / groups. + * So the num of channels should be able to devided by groups. + * + * The config file api is maxout_layer. + */ + +class MaxOutLayer : public Layer { +protected: + size_t groups_; + size_t imgSizeH_, imgSizeW_; + /// outputChannels_ = channels_ / groups_ + size_t channels_, outputChannels_; + /// feature length = imgSizeH_ * imgSizeW_ + size_t featLen_; + IVectorPtr maxoutId_; + +public: + /// return imgSizeH_ * imgSizeW_ * outputChannels_; + size_t getSize(); + + explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {} + virtual ~MaxOutLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp index a896e16a6027b3..4faebe5d2ad6f9 100644 --- a/paddle/gserver/layers/NCELayer.cpp +++ b/paddle/gserver/layers/NCELayer.cpp @@ -21,14 +21,18 @@ limitations under the License. */ namespace paddle { /** - * Noise-contrastive estimation + * Noise-contrastive estimation. * Implements the method in the following paper: - * A fast and simple algorithm for training neural probabilistic language models + * A fast and simple algorithm for training neural probabilistic language models. + * + * The config file api is nce_layer. */ class NCELayer : public Layer { int numClasses_; - int numInputs_; // number of input layer besides labelLayer and weightLayer + /// number of input layer besides labelLayer and weightLayer + int numInputs_; LayerPtr labelLayer_; + /// weight layer, can be None LayerPtr weightLayer_; WeightList weights_; std::unique_ptr biases_; @@ -43,7 +47,8 @@ class NCELayer : public Layer { real weight; }; std::vector samples_; - bool prepared_; // whether samples_ is prepared + /// whether samples_ is prepared + bool prepared_; Argument sampleOut_; IVectorPtr labelIds_; diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp index 12831e36688029..26d9536dd57aa3 100644 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/utils/Logging.h" -#include "Layer.h" +#include "SequencePoolLayer.h" #include "paddle/math/Matrix.h" #include "paddle/utils/Stat.h" @@ -29,20 +29,19 @@ namespace paddle { * If SequenceLevel = kSeq: * Check input sequence must has sub-sequence * Output: a sequence containing only the last instance of each sub-sequence - * of the input sequence + * of the input sequence + * + * The config file api is last_seq and first_seq. */ -class SequenceLastInstanceLayer : public Layer { +class SequenceLastInstanceLayer : public SequencePoolLayer { protected: - std::unique_ptr biases_; MatrixPtr tmpSrc_; MatrixPtr tmpDest_; - enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; - int type_; public: explicit SequenceLastInstanceLayer(const LayerConfig& config) - : Layer(config) {} + : SequencePoolLayer(config) {} ~SequenceLastInstanceLayer() {} @@ -56,55 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer); bool SequenceLastInstanceLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { - /* Initialize the basic parent class */ - Layer::init(layerMap, parameterMap); - - // seqlastins layer should have exactly 1 input - CHECK_EQ(1U, inputLayers_.size()); - - /* initialize biases_ */ - if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); - } + SequencePoolLayer::init(layerMap, parameterMap); tmpSrc_ = Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); tmpDest_ = Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_); - // transform to which sequence type - if (config_.trans_type() == "non-seq") { - type_ = kNonSeq; - } else if (config_.trans_type() == "seq") { - type_ = kSeq; - } else { - LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); - } - setNeedSequenceInfo(false); return true; } void SequenceLastInstanceLayer::forward(PassType passType) { - Layer::forward(passType); - - size_t dim = getSize(); - const Argument& input = getInput(0); - - // check - auto startPositions = - type_ ? input.subSequenceStartPositions->getVector(false) - : input.sequenceStartPositions->getVector(false); - size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences(); - CHECK_EQ(dim, input.value->getWidth()); - CHECK_EQ(startPositions->getData()[height], input.getBatchSize()); - CHECK_EQ(height, startPositions->getSize() - 1); - if (type_) { - // when trans_type = seq, input must hasSubseq - CHECK_EQ(input.hasSubseq(), 1UL); - } + SequencePoolLayer::forward(passType); - reserveOutput(height, dim); - const int* starts = startPositions->getData(); + const int* starts = startPositions_->getData(false); MatrixPtr inputValue = getInputValue(0); MatrixPtr outputValue = getOutputValue(); @@ -112,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) { AsyncGpuBlock asyncGpuBlock; REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str()); - for (size_t seqId = 0; seqId < height; ++seqId) { + for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) { int insId = config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1; outputValue->subMatrix(seqId, 1, tmpDest_) ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_))); } - /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, - * thus, in this case, output_ has no sequenceStartPositions. - * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this - * case, we should compute the new sequenceStartPositions. 
- */ - if (type_) { - output_.degradeSequence(input, useGpu_); - } } if (biases_.get() != NULL) { @@ -138,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) { } void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) { - /* activation, should set to 'linear' in most cases */ - backwardActivation(); - - if (biases_ && biases_->getWGrad()) { - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); - - // Increasing the number of gradient - biases_->getParameterPtr()->incUpdate(callback); - } + SequencePoolLayer::backward(callback); MatrixPtr inputGrad = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); - auto startPositions = - type_ ? getInput(0).subSequenceStartPositions->getVector(false) - : getInput(0).sequenceStartPositions->getVector(false); - const int* starts = startPositions->getData(); - size_t numSequences = startPositions->getSize() - 1; + const int* starts = startPositions_->getData(false); + size_t numSequences = startPositions_->getSize() - 1; if (inputGrad) { AsyncGpuBlock asyncGpuBlock; diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp new file mode 100644 index 00000000000000..55be73d363df19 --- /dev/null +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/utils/Logging.h" +#include "SequencePoolLayer.h" + +namespace paddle { + +bool SequencePoolLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + // seqlastins/max/average layer should have exactly 1 input + CHECK_EQ(1U, inputLayers_.size()); + + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + } + // transform to which sequence type + if (config_.trans_type() == "non-seq") { + type_ = kNonSeq; + } else if (config_.trans_type() == "seq") { + type_ = kSeq; + } else { + LOG(FATAL) << "Unknown trans_type: " << config_.trans_type(); + } + setNeedSequenceInfo(false); + return true; +} + +void SequencePoolLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& input = getInput(0); + newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences(); + size_t dim = getSize(); + // check + CHECK_EQ(dim, input.value->getWidth()); + startPositions_ = + type_ ? input.subSequenceStartPositions : input.sequenceStartPositions; + auto starts = startPositions_->getVector(false); + CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize()); + CHECK_EQ(newBatchSize_, starts->getSize() - 1); + + resetOutput(newBatchSize_, dim); + if (type_) { + CHECK(input.subSequenceStartPositions) + << "when trans_type = seq, input must hasSubseq"; + } + /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, + * thus, in this case, output_ has no sequenceStartPositions. 
+ * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this + * case, we should compute the new sequenceStartPositions. + */ + if (type_) { + output_.degradeSequence(input, useGpu_); + } +} + +void SequencePoolLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { backwardActivation(); } + + if (biases_ && biases_->getWGrad()) { + biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + + // Increasing the number of gradient + biases_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h new file mode 100644 index 00000000000000..669af80e1d447a --- /dev/null +++ b/paddle/gserver/layers/SequencePoolLayer.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" + +namespace paddle { +/** + * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer. + * + * Input: one or more sequences. Each sequence contains some instances. + * If SequenceLevel = kNonSeq: + * Output: output size is the number of input sequences (NOT input instances) + * output[i] = seqlastin/average/max_{for each instance in this + * sequence}{input[i]} + * If SequenceLevel = kSeq: + * Check input sequence must has sub-sequence + * Output: output size is the number of input sub-sequences + * output[i] = seqlastin/average/max_{for each instance in this + * sub-sequence}{input[i]} + * + * The config file api is pooling_layer. + */ + +class SequencePoolLayer : public Layer { +protected: + int type_; + std::unique_ptr biases_; + enum SequenceLevel { kNonSeq = 0, kSeq = 1 }; + size_t newBatchSize_; + ICpuGpuVectorPtr startPositions_; + +public: + explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {} + + virtual ~SequencePoolLayer() {} + + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + + void forward(PassType passType); + void backward(const UpdateCallback& callback = nullptr); +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py index 5c3b062309c51f..321c78cb1741bc 100644 --- a/paddle/gserver/tests/rnn_data_provider.py +++ b/paddle/gserver/tests/rnn_data_provider.py @@ -14,12 +14,15 @@ from paddle.trainer.PyDataProvider2 import * +# Note that each config should has an independent provider +# in current design of PyDataProvider2. 
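Stepping back to the new SequencePoolLayer before the RNN test providers: the start-position bookkeeping it centralizes can be pictured with a small NumPy sketch. The helper name is illustrative only; with trans_type="seq" the same loop would run over subSequenceStartPositions and the output would stay a sequence via degradeSequence.

    import numpy as np

    def sequence_pool(values, start_positions, op='last'):
        """values: (num_instances, dim); start_positions: e.g. [0, 3, 5, 9]."""
        out = []
        for i in range(len(start_positions) - 1):
            seq = values[start_positions[i]:start_positions[i + 1]]  # one (sub)sequence
            if op == 'last':        # SequenceLastInstanceLayer (last_seq / first_seq)
                out.append(seq[-1])
            elif op == 'max':       # MaxLayer
                out.append(seq.max(axis=0))
            elif op == 'average':   # AverageLayer
                out.append(seq.mean(axis=0))
        return np.stack(out)        # height is the number of (sub)sequences, i.e. newBatchSize_

    values = np.arange(18, dtype=float).reshape(9, 2)
    print(sequence_pool(values, [0, 3, 5, 9], op='last'))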
+####################################################### data = [ [[[1, 3, 2], [4, 5, 2]], 0], [[[0, 2], [2, 5], [0, 1, 2]], 1], ] - +# Used for sequence_nest_rnn.conf @provider(input_types=[integer_value_sub_sequence(10), integer_value(3)], should_shuffle=False) @@ -27,7 +30,7 @@ def process_subseq(settings, file_name): for d in data: yield d - +# Used for sequence_rnn.conf @provider(input_types=[integer_value_sequence(10), integer_value(3)], should_shuffle=False) @@ -38,11 +41,32 @@ def process_seq(settings, file_name): seq += subseq yield seq, d[1] +# Used for sequence_nest_rnn_multi_input.conf +@provider(input_types=[integer_value_sub_sequence(10), + integer_value(3)], + should_shuffle=False) +def process_subseq2(settings, file_name): + for d in data: + yield d + +# Used for sequence_rnn_multi_input.conf +@provider(input_types=[integer_value_sequence(10), + integer_value(3)], + should_shuffle=False) +def process_seq2(settings, file_name): + for d in data: + seq = [] + for subseq in d[0]: + seq += subseq + yield seq, d[1] + +########################################################### data2 = [ [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0], [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1], ] +# Used for sequence_nest_rnn_multi_unequalength_inputs.conf @provider(input_types=[integer_value_sub_sequence(10), integer_value_sub_sequence(10), integer_value(2)], @@ -52,6 +76,7 @@ def process_unequalength_subseq(settings, file_name): yield d +# Used for sequence_rnn_multi_unequalength_inputs.conf @provider(input_types=[integer_value_sequence(10), integer_value_sequence(10), integer_value(2)], diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py index cbed1f15fc4157..b166e778d7a33f 100644 --- a/paddle/gserver/tests/sequenceGen.py +++ b/paddle/gserver/tests/sequenceGen.py @@ -21,7 +21,7 @@ def hook(settings, dict_file, **kwargs): settings.word_dict = dict_file settings.input_types = [integer_value_sequence(len(settings.word_dict)), - integer_value_sequence(3)] + integer_value(3)] settings.logger.info('dict len : %d' % (len(settings.word_dict))) @@ -34,14 +34,14 @@ def process(settings, file_name): words = comment.split() word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] - yield word_slot, [label] + yield word_slot, label ## for hierarchical sequence network def hook2(settings, dict_file, **kwargs): settings.word_dict = dict_file settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)), - integer_value_sub_sequence(3)] + integer_value_sequence(3)] settings.logger.info('dict len : %d' % (len(settings.word_dict))) @@ -57,7 +57,7 @@ def process2(settings, file_name): words = comment.split() word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict] - label_list.append([label]) + label_list.append(label) word_slot_list.append(word_slot) else: yield word_slot_list, label_list diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf index 62b8c5d072d7b4..93b08eb2f8746d 100644 --- a/paddle/gserver/tests/sequence_nest_rnn.conf +++ b/paddle/gserver/tests/sequence_nest_rnn.conf @@ -56,9 +56,8 @@ def outer_step(x): last = last_seq(input=inner_rnn_output, name="outer_rnn_state") # "return last" should also work. But currently RecurrentGradientMachine - # does not handle it correctly. Current implementation requires that - # all the out links are from sequences. However, it does not report error - # when the out links are not sequences. 
+ # does not handle it, and will report error: In hierachical RNN, all out + # links should be from sequences now. return inner_rnn_output out = recurrent_group( diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf index e01b3f8e7aa5c4..0614958b4719dd 100644 --- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf +++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf @@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', test_list=None, module='rnn_data_provider', - obj='process_subseq') + obj='process_subseq2') settings(batch_size=2, learning_rate=0.01) @@ -57,9 +57,8 @@ def outer_step(wid, x): last = last_seq(input=inner_rnn_output, name="outer_rnn_state") # "return last" should also work. But currently RecurrentGradientMachine - # does not handle it correctly. Current implementation requires that - # all the out links are from sequences. However, it does not report error - # when the out links are not sequences. + # does not handle it, and will report error: In hierachical RNN, all out + # links should be from sequences now. return inner_rnn_output out = recurrent_group( diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf index 968621cab59be9..51881e21d971bb 100644 --- a/paddle/gserver/tests/sequence_rnn_multi_input.conf +++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf @@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import * define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list', test_list=None, module='rnn_data_provider', - obj='process_seq') + obj='process_seq2') settings(batch_size=2, learning_rate=0.01) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 425d669206cce3..db48cc47a4a638 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -327,6 +327,24 @@ TEST(Layer, blockExpandLayer) { } } +TEST(Layer, maxoutLayer) { + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("maxout"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + MaxOutConfig* maxout = input->mutable_maxout_conf(); + + maxout->set_img_size_x(32); + maxout->set_img_size_y(32); + maxout->set_channels(4); + maxout->set_groups(2); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "maxout", 10, false, useGpu); + } +} void testFcLayer(string format, size_t nnz) { TestConfig config; config.biasSize = 4096; diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index e75e53ab7f431a..6bf1e329251219 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) { } TEST(PyDataProvider2, init_hook) { - paddle::PyObjectPtr pickle(PyImport_ImportModule("pickle")); + paddle::PyObjectPtr pickle = paddle::py::import("pickle"); paddle::PyObjectPtr globals( PyModule_GetDict(PyImport_AddModule("__main__"))); PyDict_SetItemString(globals.get(), "pickle", pickle.get()); diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py index 145fe85cff7d88..71c3335231e521 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.py +++ b/paddle/gserver/tests/test_PyDataProvider2.py @@ 
-86,7 +86,7 @@ def test_can_over_batch_size(setting, filename): yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)] -@provider(input_types=[index_slot(10), index_slot(10)]) +@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)}) def test_input_order(setting, filename): for _ in xrange(1000): yield { diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp index ae7f617371ca5f..d104db3e5b32d5 100644 --- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp +++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -24,7 +23,7 @@ limitations under the License. */ P_DECLARE_int32(seed); using namespace paddle; // NOLINT -using namespace std; // NOLINT +using namespace std; // NOLINT class TrainerForTest : public paddle::Trainer { public: void startTrain() { @@ -44,11 +43,10 @@ class TrainerForTest : public paddle::Trainer { */ size_t getTotalParameterSize() const { auto p = const_cast(this); - auto & params = p->getGradientMachine()->getParameters(); - return std::accumulate(params.begin(), params.end(), 0UL, - [](size_t a, const ParameterPtr& p){ - return a+p->getSize(); - }); + auto& params = p->getGradientMachine()->getParameters(); + return std::accumulate( + params.begin(), params.end(), 0UL, + [](size_t a, const ParameterPtr& p) { return a + p->getSize(); }); } }; diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 469255719701a0..602d7db035deb5 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -283,13 +283,13 @@ void GpuMatrix::copyFrom(const IVector& src) { copyFrom(matrix); } -void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) { +void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { size_t height = getHeight(); size_t width = getWidth(); CHECK_EQ(b.getWidth(), width); real* dst = getData(); real* src = b.getData(); - int* index = rowIndex.getData(); + const int* index = rowIndex.getData(); hl_sequence2batch_copy(dst, src, index, width, height, true); } @@ -584,6 +584,42 @@ void GpuMatrix::colMax(Matrix& max) { max.maxCols(*this); } +void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "Is not supported"; +} + +void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + const real* input = a.getData(); + real* output = getData(); + int* idForGpu = id.getData(); + + hl_maxout_forward(input, output, idForGpu, batchSize, size, + size / channels, groups); +} + +void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + real* input = getData(); + const real* output = a.getData(); + const int* idForGpu = id.getData(); + + hl_maxout_backward(input, output, idForGpu, batchSize, size, + size / channels, groups); +} + /*calulate the error of classification */ void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { GpuMatrixPtr output_ptr = std::dynamic_pointer_cast(output); @@ -1329,11 +1365,11 @@ 
void CpuMatrix::copyFrom(const IVector& src) { } } -void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) { +void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) { size_t height = getHeight(); size_t width = getWidth(); CHECK_EQ(b.getWidth(), width); - int* index = rowIndex.getData(); + const int* index = rowIndex.getData(); for (size_t i = 0; i < height; i++) { CHECK_LT(static_cast(index[i]), b.getHeight()); real* src = b.getData() + index[i] * width; @@ -2799,6 +2835,95 @@ void CpuMatrix::colMax(Matrix& max) { max.maxCols(*this); } +void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { + CHECK(isContiguous()); + CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; + size_t numSamples = getWidth(); + size_t beam = maxVal.getHeight(); + CHECK_EQ(maxIds.getSize(), numSamples * beam); + CHECK_EQ(maxVal.getWidth(), numSamples); + + real* a = getData(); + int* s = maxIds.getData(); + real* t = maxVal.getData(); + size_t dim = getHeight(); + for (size_t i = 0; i < numSamples; i++) { + std::vector> vec; + for (size_t j = 0; j < dim; j++) { + vec.push_back(std::pair(a[i + j * numSamples], j)); + } + + std::partial_sort( + vec.begin(), vec.begin() + beam, vec.end(), + [](const std::pair& l, const std::pair& r) { + return l.first > r.first; + }); + for (size_t j = 0; j < beam; j++) { + t[i + j * numSamples] = vec[j].first; + s[i + j * numSamples] = vec[j].second; + } + } +} + +void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + const real* input = a.getData(); + int* idForCpu = id.getData(); + + MatrixPtr maxInMat, maxOutMat; + Matrix::resizeOrCreate(maxInMat, groups, size, false, false); + Matrix::resizeOrCreate(maxOutMat, 1, size, false, false); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false); + + for (size_t i = 0; i < channels; ++i) { + size_t newFeatLen = i * featLen; + for (size_t j = 0; j < groups; ++j) { + maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen) + ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen, + featLen); + } + } + maxInMat->colMax(*tmpId, *maxOutMat); + this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat); + } +} + +void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + CHECK(dynamic_cast(&a)); + CHECK(dynamic_cast(&id)); + CHECK_EQ(a.getHeight(), getHeight()); + + size_t size = a.getWidth(); + size_t batchSize = getHeight(); + size_t featLen = size / channels; + size_t newFeatLen = groups * featLen; + real* inputG = getData(); + const real* outG = a.getData(); + int* idForCpu = id.getData(); + + for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) { + size_t newIndex = batch_idx * size; + int* idData = idForCpu + newIndex; + + for (size_t i = 0; i < size; ++i) { + int gradIdx = + idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen; + (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i]; + } + } +} + void CpuMatrix::rowNormalizeL1(Matrix& out) { CHECK(!out.useGpu()); diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index b4922d7e6f5469..9b16ceacbfe98a 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -253,7 +253,7 @@ class Matrix : public 
BaseMatrix { LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; } - virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) { + virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { LOG(FATAL) << "Not implemented"; } @@ -493,16 +493,40 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemeted"; } + /** + * set the max of each column of this to mat + */ virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } + /** + * @brief Get the top k elements of each column of this matrix. + * + * The row ids and values of these elements are stored in + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. + */ + virtual void colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutForward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutBackward(Matrix& a, IVector& id, size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } /** * @brief Get the top k elements of each row of this matrix. * * The column ids and values of these elements are stored in - * maxIds and max respectively. Note that the top k - * elements are not sorted. + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. */ virtual void rowMax(IVector& maxIds, Matrix& max) { LOG(FATAL) << "Not implemented"; @@ -995,7 +1019,7 @@ class GpuMatrix : public Matrix { void copyFrom(const IVector& src); - void copyByRowIndex(Matrix& b, IVector& rowIndex); + void copyByRowIndex(Matrix& b, const IVector& rowIndex); MatrixPtr clone(size_t height, size_t width, bool useGpu = false); @@ -1101,6 +1125,9 @@ class GpuMatrix : public Matrix { void rowMax(Matrix& max); void rowMax(IVector& maxIds, Matrix& max); void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& max); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); void oneHotCrossEntropy(Matrix& output, IVector& label); void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); @@ -1271,7 +1298,7 @@ class CpuMatrix : public Matrix { void copyFrom(CpuSparseMatrix& src); - void copyByRowIndex(Matrix& b, IVector& rowIndex); + void copyByRowIndex(Matrix& b, const IVector& rowIndex); MatrixPtr clone(size_t height, size_t width, bool useGpu = false); @@ -1425,6 +1452,9 @@ class CpuMatrix : public Matrix { void rowMax(Matrix& max); void rowMax(IVector& maxIds, Matrix& maxVal); void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& maxVal); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); void rowNormalizeL1(Matrix& out); void oneHotCrossEntropy(Matrix& output, IVector& label); diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp index 0b5de252258a96..6986624d25c7a4 100644 --- a/paddle/math/SparseRowMatrix.cpp +++ b/paddle/math/SparseRowMatrix.cpp @@ -227,12 +227,18 @@ void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) { std::vector& localIndices = indexDictHandle_->localIndices; + for (size_t i = 0; i < len; i ++) { + CHECK_LT(*(ids + i), this->getHeight()) + 
<< "id:" << *(ids + i) << "Height:" << this->getHeight() + << "sparse id value exceeds the max input dimension, " + << "it could be caused invalid input data samples"; + } localIndices.insert(localIndices.end(), ids, ids + len); } void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) { CpuSparseMatrix* mat = dynamic_cast(input.get()); - CHECK(mat) << "only support non value sparse matrix"; + CHECK(mat) << "only support sparse matrix"; addRows(reinterpret_cast(mat->getCols()), mat->getElementCnt()); } @@ -243,7 +249,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) { int* index = ids->getData(); for (size_t i = 0; i < numSamples; ++i) { if (index[i] == -1) continue; - localIndices.push_back((unsigned int)index[i]); + + unsigned int id = (unsigned int)index[i]; + CHECK_LT(id, this->getHeight()) + << "id:" << id << "Height:" << this->getHeight() + << "sparse id value exceeds the max input dimension, " + << "it could be caused invalid input data samples"; + localIndices.push_back(id); } } diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 2ff19e7b3f87ca..2cc38b82306e2b 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -2065,6 +2065,78 @@ TEST(Matrix, PoolFwdBwd) { } } +void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW, + int channels, int groups) { + int inWidth = imgSizeH * imgSizeW * channels; + int outChannels = channels / groups; + int outWidth = imgSizeH * imgSizeW * outChannels; + + // forward + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + + IVectorPtr id = CpuIVector::create(numSamples * outWidth, false); + IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true); + IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false); + + input->randomizeUniform(); + inputGpu->copyFrom(*input); + + target->maxoutForward(*input, *id, outChannels, groups); + targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups); + + // check + targetCheck->copyFrom(*targetGpu); + MatrixCheckErr(*target, *targetCheck); + idCheck->copyFrom(*idGpu); + VectorCheckEqual(*id, *idCheck); + + // backward + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false, + true); + MatrixPtr targetCheckGrad = CpuMatrix::create(numSamples, inWidth, false, + false); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups); + inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups); + + // check + targetCheckGrad->copyFrom(*inputGpuGrad); + MatrixCheckErr(*inputGrad, *targetCheckGrad); +} + +TEST(Matrix, MaxOutFwdBwd) { + for (auto numSamples : {5, 10}) { + for (auto channels : {8, 16}) { + for (auto imgSizeH : {14, 28}) { + for (auto imgSizeW : {16, 30}) { + for (auto groups : {2, 4}) { + VLOG(3) << " 
numSamples=" << numSamples + << " channels=" << channels + << " imgSizeH=" << imgSizeH + << " imgSizeW=" << imgSizeW + << " groups=" << groups; + testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups); + } + } + } + } + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index 2f9606dc680265..ff251fe89f9f88 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -146,6 +146,12 @@ class Parameter { } } + void enableBufType(ParameterType type) { + if (bufs_[type]) return; + bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); + bufs_[type]->zeroMem(); + } + void enableIntType(ParameterType type, size_t intStoreSize = 0) { if (!intBufs_[type]) { SetDevice device(deviceId_); diff --git a/paddle/pserver/PserverForPython.h b/paddle/pserver/PserverForPython.h deleted file mode 100644 index 5bbeae8bd8b973..00000000000000 --- a/paddle/pserver/PserverForPython.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/pserver/ParameterClient.h" -#include "paddle/pserver/ParameterServer.h" -#include "paddle/parameter/Parameter.h" -#include - -namespace paddle { - -struct PyObjectDeleter { - void operator()(PyObject* obj) { - if (obj) { - Py_DECREF(obj); - } - } -}; - -class ParameterClientPy : public ParameterClient { -protected: - typedef std::unique_ptr PyObjectPtr; - - std::vector parameter_; - int initArgc_; - char** initArgv_; - -public: - ParameterClientPy(std::vector configs, int argc, - std::vector argv, bool useGpu) { - initArgc_ = argc; - initArgv_ = new char* [argc]; - for (int i = 0; i < argc; i++) { - initArgv_[i] = new char[argv[i].size()]; - strcpy(initArgv_[i], // NOLINT - argv[i].c_str()); // NOLINT TODO(yuyang18): use snprintf instead. 
- } - ParameterConfig pyConfig; - ParameterPtr param; - for (auto& config : configs) { - pyConfig.ParseFromString(config); - param.reset(new Parameter(pyConfig, useGpu)); - parameter_.push_back(param); - } - Py_Initialize(); - CHECK(Py_IsInitialized()); - } - - ~ParameterClientPy() { - delete initArgv_; - Py_Finalize(); - } - - Parameter getParameter(int idx) { return *(parameter_[idx].get()); } - - void initClientPy() { - initMain(initArgc_, initArgv_); - CHECK(init(parameter_)) << "Init Client Failed."; - } - - void setConfigPy(std::string config) { - OptimizationConfig optConfig; - optConfig.ParseFromString(config); - setConfig(optConfig); - } - - bool inStatusPy(int status) { return inStatus(PServerStatus(status)); } - - void setStatusPy(int status) { setStatus(PServerStatus(status)); } - - void waitForStatusPy(int status) { waitForStatus(PServerStatus(status)); } - - void sendParameterPy(int updateMode, int parameterType, int numSamples, - real cost, bool sendBackParameter) { - sendParameter(ParameterUpdateMode(updateMode), ParameterType(parameterType), - int64_t(numSamples), real(cost), sendBackParameter); - } - - template - std::string asyncCallPy(const char* serviceName, const char* funcName, - const std::string in) { - ProtoIn protoIn; - ProtoOut protoOut; - std::mutex waitLock; - std::string data; - protoIn.ParseFromString(in); - waitLock.lock(); - auto callback = [&](ProtoOut* pOut, bool isSuccessful) { - if (isSuccessful) { - pOut->SerializeToString(&data); - } else { - LOG(INFO) << "Async Talk Failed."; - } - waitLock.unlock(); - }; - - ubClient_.asyncCall(serviceName, funcName, protoIn, - &protoOut, callback); - waitLock.lock(); - protoOut.SerializeToString(&data); - return data; - } -}; - -} // namespace paddle diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 0366bb636c704a..6d8f5da3e298fa 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -63,7 +63,8 @@ def __init__(self, input_type, pos): def scan(self, dat): self.extend_cols(dat) - self.__rows__.append(len(dat)) + self.__rows__.append(len(dat) + self.__rows__[-1]) + self.__height__ += 1 def extend_cols(self, dat): self.__cols__.extend(dat) diff --git a/paddle/scripts/travis/before_install.sh b/paddle/scripts/travis/before_install.linux.sh similarity index 100% rename from paddle/scripts/travis/before_install.sh rename to paddle/scripts/travis/before_install.linux.sh diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh new file mode 100755 index 00000000000000..f438e69b822aa4 --- /dev/null +++ b/paddle/scripts/travis/before_install.osx.sh @@ -0,0 +1,13 @@ +#!/bin/bash +brew update +brew tap homebrew/science +brew install python +sudo pip install --upgrade protobuf==2.6.0 +brew install homebrew/versions/protobuf260 --without-python +brew install cmake python glog gflags openblas wget md5sha1sum + +wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz +tar xf gtest.tar.gz +cd googletest-release-1.8.0/ +cmake . +make install diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 3ea633be327027..a73c32344c8abe 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -1,7 +1,22 @@ #!/bin/bash source ./common.sh -cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -make -j `nproc` -env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`" +CMAKE_EXTRA="" +if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then + CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib" +fi + + +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA} + +NPROC=1 +if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then + NRPOC=`nproc` +elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then + NPROC=`sysctl -n hw.ncpu` +fi + + +make -j $NPROC +env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC" sudo make install sudo paddle version diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp index 91f7f4d29df938..d0fda1b6253e3e 100644 --- a/paddle/trainer/ThreadParameterUpdater.cpp +++ b/paddle/trainer/ThreadParameterUpdater.cpp @@ -20,6 +20,8 @@ limitations under the License. */ #include "paddle/math/SparseRowMatrix.h" #include "paddle/utils/Thread.h" +P_DECLARE_int32(trainer_count); + namespace paddle { SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig) @@ -48,6 +50,13 @@ void SgdThreadUpdater::init(std::vector& parameters) { false /*inPserver*/)); size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0; optimizers_[pid]->init(numRows, ¶->getConfig()); + if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) { + // For trainer_count=1, the gradient machine is NeuralNetwork, which does + // not create parameter buf for PARAMETER_GRADIENT for sparse update in + // Parameter::enableType(). But gradient parameter buf is still used + // in SgdThreadUpdater. We need to explicitly create it. + para->enableBufType(PARAMETER_GRADIENT); + } } } @@ -211,7 +220,7 @@ void SgdThreadUpdater::threadUpdateSparse( // From MultiGradientMachine SparseRowIdsCpuMatrix* mainMat = dynamic_cast( para->getMat(PARAMETER_GRADIENT).get()); - const std::vector& sparseIds = mainMat->getIds(tid); + std::vector& sparseIds = mainMat->getIds(tid); for (auto id : sparseIds) { // setup sub bufs @@ -221,6 +230,7 @@ void SgdThreadUpdater::threadUpdateSparse( optimizer->update(vecs, para->getConfig(), id); vecs[PARAMETER_GRADIENT]->zeroMem(); } + sparseIds.clear(); } else if (dynamic_cast( para->getMat(PARAMETER_GRADIENT).get())) { // From NeuralNetwork @@ -246,6 +256,10 @@ void SgdThreadUpdater::threadUpdateSparse( optimizer->update(vecs, para->getConfig(), id); vecs[PARAMETER_GRADIENT]->zeroMem(); } + // For numThreads > 1, MultiGradientMachine is used, which goes + // to the above branch. + CHECK_EQ(numThreads, 1UL); + mainMat->clearIndices(); } else { auto & m = *para->getMat(PARAMETER_GRADIENT).get(); LOG(FATAL) << "Internal error: " << para->getName() << " " diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf index 5d2e2ba9df5c71..664e18cb986811 100644 --- a/paddle/trainer/tests/test_config.conf +++ b/paddle/trainer/tests/test_config.conf @@ -13,157 +13,71 @@ # See the License for the specific language governing permissions and # limitations under the License. -#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later. 
- -default_initial_std(0.5) - -model_type("nn") - -DataLayer( - name = "input", - size = 3, -) - -DataLayer( - name = "weight", - size = 1, -) - -Layer( - name = "layer1_1", - type = "fc", - size = 5, - active_type = "sigmoid", - inputs = "input", -) - -Layer( - name = "layer1_2", - type = "fc", - size = 12, - active_type = "linear", - inputs = Input("input", parameter_name='sharew'), -) - -Layer( - name = "layer1_3", - type = "fc", - size = 3, - active_type = "tanh", - inputs = "input", -) - -Layer( - name = "layer1_5", - type = "fc", - size = 3, - active_type = "tanh", - inputs = Input("input", - learning_rate=0.01, - momentum=0.9, - decay_rate=0.05, - initial_mean=0.0, - initial_std=0.01, - format = "csc", - nnz = 4) -) - -FCLayer( - name = "layer1_4", - size = 5, - active_type = "square", - inputs = "input", - drop_rate = 0.5, -) - -Layer( - name = "pool", - type = "pool", - inputs = Input("layer1_2", - pool = Pool(pool_type="cudnn-avg-pool", - channels = 1, - size_x = 2, - size_y = 3, - img_width = 3, - padding = 1, - padding_y = 2, - stride = 2, - stride_y = 3)) -) - -Layer( - name = "concat", - type = "concat", - inputs = ["layer1_3", "layer1_4"], -) - -MixedLayer( - name = "output", - size = 3, - active_type = "softmax", - inputs = [ - FullMatrixProjection("layer1_1", - learning_rate=0.1), - TransposedFullMatrixProjection("layer1_2", parameter_name='sharew'), - FullMatrixProjection("concat"), - IdentityProjection("layer1_3"), - ], -) - -Layer( - name = "label", - type = "data", - size = 1, -) - -Layer( - name = "cost", - type = "multi-class-cross-entropy", - inputs = ["output", "label", "weight"], -) - -Layer( - name = "cost2", - type = "nce", - num_classes = 3, - active_type = "sigmoid", - neg_sampling_dist = [0.1, 0.3, 0.6], - inputs = ["layer1_2", "label", "weight"], -) - -Evaluator( - name = "error", - type = "classification_error", - inputs = ["output", "label", "weight"] -) - -Inputs("input", "label", "weight") -Outputs("cost", "cost2") - -TrainData( - ProtoData( - files = "dummy_list", - constant_slots = [1.0], - async_load_data = True, - ) -) - -TestData( - SimpleData( - files = "trainer/tests/sample_filelist.txt", - feat_dim = 3, - context_len = 0, - buffer_capacity = 1000000, - async_load_data = False, - ), -) - -Settings( - algorithm = "sgd", - num_batches_per_send_parameter = 1, - num_batches_per_get_parameter = 1, - batch_size = 100, - learning_rate = 0.001, - learning_rate_decay_a = 1e-5, - learning_rate_decay_b = 0.5, -) +from paddle.trainer_config_helpers import * + +TrainData(ProtoData( + files = "dummy_list", + constant_slots = [1.0], + async_load_data = True)) + +TestData(SimpleData( + files = "trainer/tests/sample_filelist.txt", + feat_dim = 3, + context_len = 0, + buffer_capacity = 1000000, + async_load_data = False)) + +settings(batch_size = 100) + +data = data_layer(name='input', size=3) + +wt = data_layer(name='weight', size=1) + +fc1 = fc_layer(input=data, size=5, + bias_attr=True, + act=SigmoidActivation()) + +fc2 = fc_layer(input=data, size=12, + bias_attr=True, + param_attr=ParamAttr(name='sharew'), + act=LinearActivation()) + +fc3 = fc_layer(input=data, size=3, + bias_attr=True, + act=TanhActivation()) + +fc4 = fc_layer(input=data, size=5, + bias_attr=True, + layer_attr=ExtraAttr(drop_rate=0.5), + act=SquareActivation()) + +pool = img_pool_layer(input=fc2, + pool_size=2, + pool_size_y=3, + num_channels=1, + padding=1, + padding_y=2, + stride=2, + stride_y=3, + img_width=3, + pool_type=CudnnAvgPooling()) + +concat = concat_layer(input=[fc3, fc4]) + +with 
mixed_layer(size=3, act=SoftmaxActivation()) as output: + output += full_matrix_projection(input=fc1) + output += trans_full_matrix_projection(input=fc2, + param_attr=ParamAttr(name='sharew')) + output += full_matrix_projection(input=concat) + output += identity_projection(input=fc3) + +lbl = data_layer(name='label', size=1) + +cost = classification_cost(input=output, label=lbl, weight=wt, + layer_attr=ExtraAttr(device=-1)) + +nce = nce_layer(input=fc2, label=lbl, weight=wt, + num_classes=3, + neg_distribution=[0.1, 0.3, 0.6]) + +outputs(cost, nce) diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore new file mode 100644 index 00000000000000..f2cfd7409412de --- /dev/null +++ b/paddle/utils/.gitignore @@ -0,0 +1 @@ +enable_virtualenv.c diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 0557b01e36f078..45240b5002aa18 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -2,6 +2,9 @@ file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_SOURCES . *.cpp) +create_resources(enable_virtualenv.py enable_virtualenv.c) +set(UTIL_RES enable_virtualenv.c) + if(APPLE) file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp) else() @@ -9,7 +12,8 @@ else() endif() add_library(paddle_utils STATIC ${UTIL_SOURCES} - ${UTIL_ARCH_SOURCES}) + ${UTIL_ARCH_SOURCES} + ${UTIL_RES}) add_style_check_target(paddle_utils ${UTIL_HEADERS}) add_style_check_target(paddle_utils ${UTIL_SOURCES} ${UTIL_ARCH_SOURCES}) diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h index b3f439804686fa..7fdfa3240c1de7 100644 --- a/paddle/utils/Logging.h +++ b/paddle/utils/Logging.h @@ -191,7 +191,7 @@ void installFailureWriter(void(*callback)(const char*, int)); } #endif // PADDLE_USE_GLOG -#ifdef NDEBUG +#ifndef NDEBUG #define DEBUG_LEVEL 5 #define DBG VLOG(DEBUG_LEVEL) #else diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp index 78c3a80674f9c1..90e5093f96ea4e 100644 --- a/paddle/utils/PythonUtil.cpp +++ b/paddle/utils/PythonUtil.cpp @@ -77,11 +77,18 @@ static std::recursive_mutex g_pyMutex; PyGuard::PyGuard() : guard_(g_pyMutex) {} -static void printPyErrorStack(std::ostream& os, bool withEndl = false) { +static void printPyErrorStack(std::ostream& os, bool withEndl = false, + bool withPyPath = true) { PyObject * ptype, *pvalue, *ptraceback; PyErr_Fetch(&ptype, &pvalue, &ptraceback); PyErr_NormalizeException(&ptype, &pvalue, &ptraceback); PyErr_Clear(); + if (withPyPath) { + os << "Current PYTHONPATH: " << py::repr(PySys_GetObject(strdup("path"))); + if (withEndl) { + os << std::endl; + } + } PyTracebackObject* obj = (PyTracebackObject*)ptraceback; os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) @@ -114,10 +121,7 @@ PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName, const std::string& funcName, const std::vector& args) { PyGuard guard; - PyObjectPtr pyModuleName(PyString_FromString(moduleName.c_str())); - CHECK_PY(pyModuleName) << "Import PyModule failed" << moduleName; - PyObjectPtr pyModule(PyImport_Import(pyModuleName.get())); - CHECK_PY(pyModule) << "Import Python Module"<< moduleName << " failed."; + PyObjectPtr pyModule = py::import(moduleName); PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str())); CHECK_PY(pyFunc) << "GetAttrString failed."; PyObjectPtr pyArgs(PyTuple_New(args.size())); @@ -143,7 +147,7 @@ PyObjectPtr createPythonClass( const std::vector& args, const std::map& kwargs) { PyGuard guard; - PyObjectPtr pyModule(PyImport_ImportModule(moduleName.c_str())); + PyObjectPtr pyModule = 
py::import(moduleName); LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str(); CHECK_PY(pyModule) << "Import module " << moduleName << " failed."; PyObjectPtr pyDict(PyModule_GetDict(pyModule.get())); @@ -181,18 +185,29 @@ std::string getPyCallStack() { printPyErrorStack(os, true); return os.str(); } + +PyObjectPtr import(const std::string &moduleName) { + auto module = PyImport_ImportModule(moduleName.c_str()); + CHECK_PY(module) << "Import " << moduleName << "Error"; + return PyObjectPtr(module); +} + } // namespace py #endif - +extern "C" { +extern const char enable_virtualenv_py[]; +} void initPython(int argc, char** argv) { #ifndef PADDLE_NO_PYTHON Py_SetProgramName(argv[0]); Py_Initialize(); PySys_SetArgv(argc, argv); - // python blocks SIGINT. Need to enable it. signal(SIGINT, SIG_DFL); + + // Manually activate virtualenv when user is using virtualenv + PyRun_SimpleString(enable_virtualenv_py); #endif } diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index db02d1252b4057..00fc177022ac34 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -87,6 +87,8 @@ PyObjectPtr createPythonClass(const std::string& moduleName, CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() namespace py { +PyObjectPtr import(const std::string& moduleName); + /** * Cast a PyLong or PyInt to int type T. * @tparam T return type. diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h index d73f27d7fafd6c..f952cf58778dee 100644 --- a/paddle/utils/Queue.h +++ b/paddle/utils/Queue.h @@ -135,6 +135,21 @@ class Queue { queueCV_.wait(lock, [this]() { return numElements_ == 0; }); } + /** + * @brief wait queue is not empty at most for some seconds. + * @param seconds wait time limit. + * @return true if queue is not empty. false if timeout. + */ + bool waitNotEmptyFor(int seconds) { + std::unique_lock lock(queueLock_); + return queueCV_.wait_for( + lock, + std::chrono::seconds(seconds), + [this] { + return numElements_ != 0; + }); + } + private: std::deque elements_; int numElements_; diff --git a/paddle/utils/enable_virtualenv.py b/paddle/utils/enable_virtualenv.py new file mode 100644 index 00000000000000..99d822a4145cca --- /dev/null +++ b/paddle/utils/enable_virtualenv.py @@ -0,0 +1,10 @@ +import os + +def __activate_virtual_env__(): + __path__ = os.getenv('VIRTUAL_ENV') + if __path__ is None: + return + __script__ = os.path.join(__path__, 'bin', 'activate_this.py') + execfile(__script__, {'__file__': __script__}) + +__activate_virtual_env__() diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 8bdcd70a417b84..753fd0cac42233 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -170,6 +170,15 @@ message BlockExpandConfig { required uint32 img_size_y = 11; } +message MaxOutConfig { + required uint32 channels = 1; + required uint32 groups = 2; + + // The size of input feature map. + required uint32 img_size_x = 3; + required uint32 img_size_y = 4; +} + message ProjectionConfig { required string type = 1; required string name = 2; @@ -235,6 +244,7 @@ message LayerInputConfig { // Set the argument name. 
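To make the new MaxOutConfig fields concrete, a tiny hypothetical helper mirroring MaxOutLayer::getSize(); the numbers reproduce the test_LayerGrad case above.

    def maxout_output_size(img_size_x, img_size_y, channels, groups):
        # channels must divide evenly by groups, as CHECK_EQ(channels_ % groups_, 0UL) enforces
        assert channels % groups == 0
        feat_len = img_size_x * img_size_y              # featLen_
        return feat_len * (channels // groups)          # featLen_ * outputChannels_

    # 32 x 32 feature maps, 4 channels, groups=2: input width 4096 -> output width 2048
    assert maxout_output_size(32, 32, 4, 2) == 2048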
optional string input_layer_argument = 9; optional BilinearInterpConfig bilinear_interp_conf = 10; + optional MaxOutConfig maxout_conf = 11; } message LayerConfig { diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py index 34f5dd41b7e683..53409b746d811a 100644 --- a/python/paddle/trainer/PyDataProvider2.py +++ b/python/paddle/trainer/PyDataProvider2.py @@ -208,7 +208,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1, calc_batch_size=None, cache=CacheType.NO_CACHE, check=False, check_fail_continue=False, - use_dynamic_order=True, init_hook=None, **kwargs): """ Provider decorator. Use it to make a function into PyDataProvider2 object. @@ -228,9 +227,15 @@ def process(settings, file_name): The configuration of data provider should be setup by\: :param input_types: Specify the input types, can also be set in init_hook. - It is a list of InputType object. For example, input_types= \ - [dense_vector(9), integer_value(2)]. - :type input_types: list|tuple + It could be a list of InputType object. For example, + input_types=[dense_vector(9), integer_value(2)]. Or user + can set a dict of InputType object, which key is + data_layer's name. For example, input_types=\ + {'img': img_features, 'label': label}. when using dict of + InputType, user could yield a dict of feature values, which + key is also data_layer's name. + + :type input_types: list|tuple|dict :param should_shuffle: True if data should shuffle. Pass None means shuffle when is training and not to shuffle when is testing. @@ -281,12 +286,6 @@ def process(settings, file_name): drop the wrong format data when it is True. Has no effect when check set to False. :type check_fail_continue: bool - - :param use_dynamic_order: Allow provider to yield a dictionary object, whose - key is a input data layer name, and value is the - feature value. The tuples are still allowed when - use_dynmaic_order is True. - :type use_dynamic_order: bool """ def __wrapper__(generator): @@ -340,6 +339,11 @@ def __init__(self, file_list, **kwargs): assert self.slots is not None assert self.generator is not None + use_dynamic_order = False + if isinstance(self.slots, dict): # reorder input_types + self.slots = [self.slots[ipt] for ipt in self.input_order] + use_dynamic_order = True + if len(self.slots) == 1: self.generator = SingleSlotWrapper(self.generator) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 82446e980d81cc..c6cd4f62b91c9a 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -216,6 +216,10 @@ def Inputs(*args): if g_current_submodel is g_root_submodel: g_config.model_config.input_layer_names.append(name) +@config_func +def HasInputsSet(): + return len(g_config.model_config.input_layer_names) != 0 + # Define the name of the output layers of the NeuralNetwork. # Usually the output is simply the cost layer. 
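A minimal sketch of the dict form of input_types described above; the layer names and sizes here are arbitrary, they only need to match the data_layer names in the network config, and the list form keeps working as before.

    from paddle.trainer.PyDataProvider2 import *

    # keys must match the data_layer names used in the network config
    @provider(input_types={'img': dense_vector(9), 'label': integer_value(2)})
    def process(settings, file_name):      # file_name is unused in this toy provider
        for lbl in range(2):
            # yield a dict keyed by layer name; a plain tuple in input order also works
            yield {'img': [float(lbl)] * 9, 'label': lbl}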
@@ -466,6 +470,7 @@ def __init__( pool=None, image=None, block_expand=None, + maxout=None, format=None, nnz=None, is_static=None, @@ -794,6 +799,16 @@ def __init__( output_y = 0): self.add_keys(locals()) +@config_class +class MaxOut(Cfg): + def __init__( + self, + channels, + groups, + img_size_x = 0, + img_size_y = 0): + self.add_keys(locals()) + def DataBase(async_load_data=False, constant_slots=None, data_ratio=1, @@ -1098,6 +1113,12 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf): int(math.ceil((2 * block_expand.padding_y + block_expand.img_size_y \ - block_expand.block_y) / float(block_expand.stride_y))) +def parse_maxout(maxout, input_layer_name, maxout_conf): + maxout_conf.channels = maxout.channels + maxout_conf.groups = maxout.groups + maxout_conf.img_size_x = maxout.img_size_x + maxout_conf.img_size_y = maxout.img_size_y + # Define an evaluator @config_func def Evaluator( @@ -1721,6 +1742,21 @@ def __init__( self.set_layer_size(block_expand_conf.block_x * block_expand_conf.block_y * block_expand_conf.channels) +@config_layer('maxout') +class MaxOutLayer(LayerBase): + def __init__( + self, + name, + inputs, + **xargs): + super(MaxOutLayer, self).__init__(name, 'maxout', 0, inputs=inputs, **xargs) + input_layer = self.get_input_layer(0) + parse_maxout(self.inputs[0].maxout, + input_layer.name, + self.config.inputs[0].maxout_conf) + maxout_conf = self.config.inputs[0].maxout_conf + self.set_layer_size(g_layer_map[input_layer.name].size / maxout_conf.groups) + # key: cost type # value: cost class g_cost_map = {} @@ -1735,7 +1771,6 @@ def init(cls, name, inputs, device=None, coeff=1.): g_cost_map[cost_type] = cls define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy') -define_cost('ClassificationErrorLayer', 'classification_error') define_cost('RankingCost', 'rank-cost') define_cost('AucValidation', 'auc-validation') define_cost('PnpairValidation', 'pnpair-validation') diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py index 8ada3903dc06be..f51140656d0dcf 100644 --- a/python/paddle/trainer_config_helpers/data_sources.py +++ b/python/paddle/trainer_config_helpers/data_sources.py @@ -68,7 +68,7 @@ def define_py_data_source(file_list, cls, module, file_list_name = 'train.list' if isinstance(cls, TestData): file_list_name = 'test.list' - with open(file_list_name, 'r') as f: + with open(file_list_name, 'w') as f: f.writelines(file_list) file_list = file_list_name @@ -84,6 +84,7 @@ def py_data2(files, load_data_module, load_data_object, load_data_args, data.load_data_module = load_data_module data.load_data_object = load_data_object data.load_data_args = load_data_args + data.async_load_data = True return data data_cls = py_data2 diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 59df4646faae98..8d249b140e8cde 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -50,11 +50,12 @@ 'slope_intercept_layer', 'trans_full_matrix_projection', 'linear_comb_layer', 'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer', + 'nce_layer', 'cross_entropy_with_selfnorm', 'cross_entropy', 'multi_binary_label_cross_entropy', 'rank_cost', 'lambda_cost', 'huber_cost', # 'block_expand_layer', # TODO(yuyang18): this layer is not correct - 'out_prod_layer', 'print_layer' + 'maxout_layer', 'out_prod_layer', 'print_layer' ] @@ -110,12 +111,14 @@ class LayerType(object): 
SLOPE_INTERCEPT_LAYER = "slope_intercept" LINEAR_COMBINATION_LAYER = "convex_comb" BLOCK_EXPAND = "blockexpand" + MAXOUT = "maxout" PRINT_LAYER = "print" CTC_LAYER = "ctc" CRF_LAYER = "crf" CRF_DECODING_LAYER = "crf_decoding" + NCE_LAYER = 'nce' RANK_COST = "rank-cost" LAMBDA_COST = "lambda_cost" @@ -169,7 +172,7 @@ class LayerOutput(object): :param activation: Layer Activation. :type activation: BaseActivation. :param parents: Layer's parents. - :type parents: list|tuple|collection.Sequence + :type parents: list|tuple|collections.Sequence """ def __init__(self, name, layer_type, parents=None, activation=None, @@ -1692,7 +1695,7 @@ def img_conv_layer(input, filter_size, num_filters, @layer_support() def img_pool_layer(input, pool_size, name=None, num_channels=None, pool_type=None, - stride=1, start=None, padding=0, layer_attr=None, + stride=1, padding=0, layer_attr=None, pool_size_y=None, stride_y=None, padding_y=None, img_width=None): """ @@ -1723,8 +1726,6 @@ def img_pool_layer(input, pool_size, name=None, :type stride: int :param stride_y: stride height of pooling. It is equal to stride by default. :type stride_y: int|None - :param start: start position of pooling operation. Note it is deprecated now. - :type start: int|None :param layer_attr: Extra Layer attribute. :type layer_attr: ExtraLayerAttribute :param img_width: the width of input feature map. If it is None, the input feature @@ -1758,7 +1759,7 @@ def img_pool_layer(input, pool_size, name=None, pool_type=type_name, channels=num_channels, size_x=pool_size, - start=start, + start=None, stride=stride, padding=padding, size_y=pool_size_y, @@ -2053,10 +2054,16 @@ def concat_layer(input, act=None, name=None, layer_attr=None): Concat all input vector into one huge vector. Inputs can be list of LayerOutput or list of projection. + The example usage is: + + .. code-block:: python + + concat = concat_layer(input=[layer1, layer2]) + :param name: Layer name. :type name: basestring :param input: input layers or projections - :type input: list|tuple|collection.Sequence + :type input: list|tuple|collections.Sequence :param act: Activation type. :type act: BaseActivation :param layer_attr: Extra Layer Attribute. @@ -2842,30 +2849,52 @@ def __real_step__(*args): return tmp +def __cost_input__(input, label, weight=None): + """ + inputs and parents for cost layers. + """ + ipts = [Input(input.name), Input(label.name)] + parents = [input, label] + if weight is not None: + assert weight.layer_type == LayerType.DATA + ipts.append(Input(weight.name)) + parents.append(weight) + return ipts, parents + @wrap_name_default() -def regression_cost(input, label, cost='square_error', name=None): +@layer_support() +def regression_cost(input, label, weight=None, name=None, + layer_attr=None): """ Regression Layer. TODO(yuyang18): Complete this method. :param name: layer name. + :type name: basestring :param input: Network prediction. + :type input: LayerOutput :param label: Data label. - :param cost: Cost method. + :type label: LayerOutput + :param weight: The weight affects the cost, namely the scale of cost. + It is an optional argument. + :type weight: LayerOutput + :param layer_attr: layer's extra attribute. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. 
+ :rtype: LayerOutput """ - Layer(inputs=[Input(input.name), Input(label.name)], type=cost, name=name) - return LayerOutput( - name, LayerType.COST, parents=[input, label] - ) + ipts, parents = __cost_input__(input, label, weight) + + Layer(inputs=ipts, type="square_error", name=name, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput(name, LayerType.COST, parents=parents) @wrap_name_default("cost") @layer_support() -def classification_cost(input, label, name=None, - cost="multi-class-cross-entropy", +def classification_cost(input, label, weight=None, name=None, evaluator=classification_error_evaluator, layer_attr=None): """ @@ -2877,8 +2906,9 @@ def classification_cost(input, label, name=None, :type input: LayerOutput :param label: label layer name. data_layer often. :type label: LayerOutput - :param cost: cost method. - :type cost: basestring + :param weight: The weight affects the cost, namely the scale of cost. + It is an optional argument. + :type weight: LayerOutput :param evaluator: Evaluator method. :param layer_attr: layer's extra attribute. :type layer_attr: ExtraLayerAttribute @@ -2888,7 +2918,10 @@ def classification_cost(input, label, name=None, assert input.layer_type != LayerType.DATA assert isinstance(input.activation, SoftmaxActivation) assert label.layer_type == LayerType.DATA - Layer(name=name, type=cost, inputs=[Input(input.name), Input(label.name)], + + ipts, parents = __cost_input__(input, label, weight) + + Layer(name=name, type="multi-class-cross-entropy", inputs=ipts, **ExtraLayerAttribute.to_kwargs(layer_attr)) def __add_evaluator__(e): @@ -2900,7 +2933,7 @@ def __add_evaluator__(e): assert isinstance(e.for_classification, bool) assert e.for_classification - e(name=e.__name__, input=input, label=label) + e(name=e.__name__, input=input, label=label, weight=weight) if not isinstance(evaluator, collections.Sequence): evaluator = [evaluator] @@ -2908,7 +2941,7 @@ def __add_evaluator__(e): for each_evaluator in evaluator: __add_evaluator__(each_evaluator) - return LayerOutput(name, LayerType.COST, parents=[input, label]) + return LayerOutput(name, LayerType.COST, parents=parents) def conv_operator(img, filter, filter_size, num_filters, @@ -2984,7 +3017,8 @@ def conv_operator(img, filter, filter_size, num_filters, @wrap_name_default() -def conv_shift_layer(a, b, name=None): +@layer_support() +def conv_shift_layer(a, b, name=None, layer_attr=None): """ This layer performs cyclic convolution for two input. For example: - a[in]: contains M elements. @@ -3013,6 +3047,8 @@ def conv_shift_layer(a, b, name=None): :type a: LayerOutput :param b: input layer b :type b: LayerOutput + :param layer_attr: layer's extra attribute. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. 
:rtype: LayerOutput """ @@ -3022,6 +3058,7 @@ def conv_shift_layer(a, b, name=None): name=name, type=LayerType.CONV_SHIFT_LAYER, inputs=[a.name, b.name], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=[a, b], @@ -3095,6 +3132,7 @@ def tensor_layer(a, b, size, act=None, name=None, @wrap_param_attr_default() @wrap_bias_attr_default() @wrap_act_default() +@layer_support() def selective_fc_layer(input, select, size, act=None, name=None, pass_generation=False, has_selected_colums=True, @@ -3167,7 +3205,8 @@ def selective_fc_layer(input, select, size, act=None, name=None, @wrap_name_default() -def sampling_id_layer(input, name=None): +@layer_support() +def sampling_id_layer(input, name=None, layer_attr=None): """ A layer for sampling id from multinomial distribution from the input layer. Sampling one id for one sample. @@ -3182,6 +3221,8 @@ def sampling_id_layer(input, name=None): :type input: LayerOutput :param name: The Layer Name. :type name: basestring + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3189,12 +3230,15 @@ def sampling_id_layer(input, name=None): name=name, type=LayerType.SAMPLING_ID_LAYER, inputs=[Input(input.name)], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input) @wrap_name_default() -def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): +@layer_support() +def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0, + layer_attr=None): """ This layer for applying a slope and an intercept to the input element-wise. There is no activation and weight. @@ -3216,6 +3260,8 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): :type slope: float. :param intercept: the offset. :type intercept: float. + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3225,12 +3271,15 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0): slope=slope, intercept=intercept, inputs=[Input(input.name)], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input) @wrap_name_default() -def linear_comb_layer(weights, vectors, size=None, name=None): +@layer_support() +def linear_comb_layer(weights, vectors, size=None, name=None, + layer_attr=None): """ A layer for weighted sum of vectors takes two inputs. - Input: size of weights is M @@ -3271,6 +3320,8 @@ def linear_comb_layer(weights, vectors, size=None, name=None): :type size: int :param name: The Layer Name. :type name: basestring + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. 
:rtype: LayerOutput """ @@ -3286,6 +3337,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None): type=LayerType.LINEAR_COMBINATION_LAYER, size=size, inputs=[Input(weights.name), Input(vectors.name)], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER, [weights, vectors], size=size) @@ -3295,6 +3347,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None): @wrap_name_default() +@layer_support() def block_expand_layer(input, channel=0, block_x=0, @@ -3303,7 +3356,8 @@ def block_expand_layer(input, stride_y=0, padding_x=0, padding_y=0, - name=None): + name=None, + layer_attr=None): """ Expand feature map to minibatch matrix. - matrix width is: block_y * block_x * channel @@ -3350,6 +3404,8 @@ def block_expand_layer(input, :type padding_y: int :param name: The name of this layer, which can not specify. :type name: None|basestring. + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3364,13 +3420,83 @@ def block_expand_layer(input, padding_y=padding_y) ), type=LayerType.BLOCK_EXPAND, + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input]) @wrap_name_default() -def ctc_layer(input, label, size=None, name=None, norm_by_times=False): +@layer_support() +def maxout_layer(input, + groups, + num_channels=None, + size_x=None, + size_y=None, + name=None, + layer_attr=None): + """ + A layer to do max out on conv layer output. + - Input: output of a conv layer. + - Output: feature map size same as input. Channel is (input channel) / groups. + + So groups should be larger than 1, and the num of channels should be able + to devided by groups. + + Please refer to Paper: + - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf + - Multi-digit Number Recognition from Street View \ + Imagery using Deep Convolutional Neural Networks: \ + https://arxiv.org/pdf/1312.6082v4.pdf + + The simple usage is: + + .. code-block:: python + + maxout = maxout_layer(input, + num_channels=128, + groups=4) + + :param input: The input layer. + :type input: LayerOutput + :param num_channels: The channel number of input layer. If None will be set + automatically from previous output. + :type num_channels: int|None + :param groups: The group number of input layer. + :type groups: int + :param size_x: conv output width. If None will be set + automatically from previous output. + :type size_x: int|None + :param size_y: conv output height. If None will be set + automatically from previous output. + :type size_y: int|None + :param name: The name of this layer, which can not specify. + :type name: None|basestring. + :param layer_attr: Extra Layer attribute. + :type layer_attr: ExtraLayerAttribute + :return: LayerOutput object. 
+ :rtype: LayerOutput + """ + assert input.layer_type == LayerType.CONV_LAYER + assert isinstance(input.activation, LinearActivation) + assert groups > 1 + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters + assert num_channels % groups == 0 + Layer(name=name, + inputs=Input(input.name, + maxout=MaxOut(channels=num_channels, + groups=groups)), + type=LayerType.MAXOUT, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput(name, LayerType.MAXOUT, parents=[input]) + + +@wrap_name_default() +@layer_support() +def ctc_layer(input, label, size=None, name=None, norm_by_times=False, + layer_attr=None): """ Connectionist Temporal Classification (CTC) is designed for temporal classication task. That is, for sequence labeling problems where the @@ -3407,6 +3533,8 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False): :type name: basestring|None :param norm_by_times: Whether to normalization by times. False by default. :type norm_by_times: bool + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3422,14 +3550,17 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False): type=LayerType.CTC_LAYER, size=size, norm_by_times=norm_by_times, - inputs=[input.name, label.name] + inputs=[input.name, label.name], + **ExtraLayerAttribute.to_kwargs(layer_attr) ) return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) @wrap_name_default() @wrap_param_attr_default() -def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): +@layer_support() +def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None, + layer_attr=None): """ A layer for calculating the cost of sequential conditional random field model. @@ -3455,6 +3586,8 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): :type param_attr: ParameterAttribute :param name: The name of this layers. It is not necessary. :type name: None|basestring + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. :rtype: LayerOutput """ @@ -3478,6 +3611,7 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): type=LayerType.CRF_LAYER, size=size, inputs=ipts, + **ExtraLayerAttribute.to_kwargs(layer_attr) ) parents = [input, label] if weight is not None: @@ -3487,7 +3621,9 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None): @wrap_name_default() @wrap_param_attr_default() -def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): +@layer_support() +def crf_decoding_layer(input, size, label=None, param_attr=None, name=None, + layer_attr=None): """ A layer for calculating the decoding sequence of sequential conditional random field model. The decoding sequence is stored in output.ids. @@ -3505,6 +3641,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): :type param_attr: ParameterAttribute :param name: The name of this layers. It is not necessary. :type name: None|basestring + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None :return: LayerOutput object. 
:rtype: LayerOutput """ @@ -3521,12 +3659,90 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): type=LayerType.CRF_DECODING_LAYER, size=size, inputs=ipts, + **ExtraLayerAttribute.to_kwargs(layer_attr) ) parents = [input] if label is not None: parents.append(label) return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size) +@wrap_bias_attr_default(has_bias=True) +@wrap_name_default() +@layer_support() +def nce_layer(input, label, num_classes, weight=None, + num_neg_samples=10, neg_distribution=None, + name=None, bias_attr=None, layer_attr=None): + """ + Noise-contrastive estimation. + Implements the method in the following paper: + A fast and simple algorithm for training neural probabilistic language models. + + The example usage is: + + .. code-block:: python + + cost = nce_layer(input=layer1, label=layer2, weight=layer3, + num_classes=3, neg_distribution=[0.1,0.3,0.6]) + + :param name: layer name + :type name: basestring + :param input: input layers. It could be a LayerOutput of list/tuple of LayerOutput. + :type input: LayerOutput|list|tuple|collections.Sequence + :param label: label layer + :type label: LayerOutput + :param weight: weight layer, can be None(default) + :type weight: LayerOutput + :param num_classes: number of classes. + :type num_classes: int + :param num_neg_samples: number of negative samples. Default is 10. + :type num_neg_samples: int + :param neg_distribution: The distribution for generating the random negative labels. + A uniform distribution will be used if not provided. + If not None, its length must be equal to num_classes. + :type neg_distribution: list|tuple|collections.Sequence|None + :param bias_attr: Bias parameter attribute. True if no bias. + :type bias_attr: ParameterAttribute|None|False + :param layer_attr: Extra Layer Attribute. + :type layer_attr: ExtraLayerAttribute + :return: layer name. + :rtype: LayerOutput + """ + if isinstance(input, LayerOutput): + input = [input] + assert isinstance(input, collections.Sequence) + assert isinstance(label, LayerOutput) + assert label.layer_type == LayerType.DATA + if neg_distribution is not None: + assert isinstance(neg_distribution, collections.Sequence) + assert len(neg_distribution) == num_classes + assert sum(neg_distribution) == 1 + + ipts_for_layer = [] + parents = [] + for each_input in input: + assert isinstance(each_input, LayerOutput) + ipts_for_layer.append(each_input.name) + parents.append(each_input) + ipts_for_layer.append(label.name) + parents.append(label) + + if weight is not None: + assert isinstance(weight, LayerOutput) + assert weight.layer_type == LayerType.DATA + ipts_for_layer.append(weight.name) + parents.append(weight) + + Layer( + name=name, + type=LayerType.NCE_LAYER, + num_classes=num_classes, + neg_sampling_dist=neg_distribution, + num_neg_samples=num_neg_samples, + inputs=ipts_for_layer, + bias=ParamAttr.to_bias(bias_attr), + **ExtraLayerAttribute.to_kwargs(layer_attr) + ) + return LayerOutput(name, LayerType.NCE_LAYER, parents=parents) """ following are cost Layers. @@ -3534,7 +3750,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None): @wrap_name_default() -def rank_cost(left, right, label, weight=None, name=None, coeff=1.0): +@layer_support() +def rank_cost(left, right, label, weight=None, name=None, coeff=1.0, layer_attr=None): """ A cost Layer for learning to rank using gradient descent. 
Details can refer to `papers 0 + + if HasInputsSet(): # input already set + Outputs(*[l.name for l in layers]) + return # just return outputs. + if len(layers) != 1: logger.warning("`outputs` routine try to calculate network's" " inputs and outputs order. It might not work well." diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index 4660a6b5003daf..d4b947517b7d04 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -362,6 +362,13 @@ def __extends__(dict1, dict2): default_factory=lambda _: BaseRegularization()) def settings(batch_size, learning_rate=1e-3, + learning_rate_decay_a=0., + learning_rate_decay_b=0., + learning_rate_schedule='poly', + learning_rate_args='', + average_window=0, + do_average_in_cpu=False, + max_average_window=None, learning_method=None, regularization=None, is_async=False, @@ -408,10 +415,14 @@ def settings(batch_size, else: algorithm = 'owlqn' + args=['batch_size', 'learning_rate', 'learning_rate_decay_a', + 'learning_rate_decay_b', 'learning_rate_schedule', + 'learning_rate_args', 'average_window', 'do_average_in_cpu', + 'max_average_window'] kwargs = dict() - kwargs['batch_size'] = batch_size - kwargs['learning_rate'] = learning_rate kwargs['algorithm'] = algorithm + for arg in args: + kwargs[arg] = locals()[arg] kwargs = __extends__(kwargs, learning_method.to_setting_kwargs()) learning_method.extra_settings() diff --git a/python/paddle/trainer_config_helpers/tests/configs/check.md5 b/python/paddle/trainer_config_helpers/tests/configs/check.md5 index 359652f3d09c7f..88ce5c129e552e 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/check.md5 +++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5 @@ -2,13 +2,17 @@ a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr 9c038249ec8ff719753a746cdb04c026 layer_activations.protostr 5913f87b39cee3b2701fa158270aca26 projections.protostr +7334ba0a4544f0623231330fc51d390d shared_fc.protostr +8b8b6bb128a7dfcc937be86145f53e2f shared_lstm.protostr 6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr 0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr +6cd5f28a3416344f20120698470e0a4c test_cost_layers_with_weight.protostr 144bc6d3a509de74115fa623741797ed test_expand_layer.protostr 2378518bdb71e8c6e888b1842923df58 test_fc.protostr 8bb44e1e5072d0c261572307e7672bda test_grumemory_layer.protostr 1f3510672dce7a9ed25317fc58579ac7 test_hsigmoid.protostr d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr +6fa59551808ee7012bbd24f757e782d2 test_maxout.protostr 251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr 2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh index e8be0023e70134..15c66a9754604c 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh @@ -8,8 +8,8 @@ configs=(test_fc layer_activations projections test_print_layer test_sequence_pooling test_lstmemory_layer test_grumemory_layer last_first_seq test_expand_layer test_ntm_layers test_hsigmoid img_layers util_layers simple_rnn_layers unused_layers test_cost_layers -test_rnn_group test_bilinear_interp) - +test_rnn_group shared_fc shared_lstm 
test_cost_layers_with_weight +test_bilinear_interp test_maxout) for conf in ${configs[*]} do diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py new file mode 100644 index 00000000000000..202cf367fc7f28 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py @@ -0,0 +1,22 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +a = data_layer(name='feature_a', size=200) +b = data_layer(name='feature_b', size=200) + +fc_param = ParamAttr(name='fc_param', initial_max=1.0, initial_min=-1.0) +bias_param = ParamAttr(name='bias_param', initial_mean=0.0, initial_std=0.0) + +softmax_param = ParamAttr(name='softmax_param', initial_max=1.0, initial_min=-1.0) + +hidden_a = fc_layer(input=a, size=200, param_attr=fc_param, bias_attr=bias_param) +hidden_b = fc_layer(input=b, size=200, param_attr=fc_param, bias_attr=bias_param) + +predict = fc_layer(input=[hidden_a, hidden_b], param_attr=[softmax_param, softmax_param], + bias_attr=False, size=10, act=SoftmaxActivation()) + +outputs(classification_cost(input=predict, label=data_layer(name='label', size=10))) diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py new file mode 100644 index 00000000000000..8557e9daaf66ad --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py @@ -0,0 +1,29 @@ +from paddle.trainer_config_helpers import * + +settings(learning_rate=1e-4, batch_size=1000) + +data_1 = data_layer(name='data_a', size=100) +data_2 = data_layer(name='data_b', size=100) + +mixed_param = ParamAttr(name='mixed_param') + +with mixed_layer(size=400, bias_attr=False) as m1: + m1 += full_matrix_projection(input=data_1, param_attr=mixed_param) + +with mixed_layer(size=400, bias_attr=False) as m2: + m2 += full_matrix_projection(input=data_2, param_attr=mixed_param) + +lstm_param = ParamAttr(name='lstm_param') +lstm_bias = ParamAttr(name='lstm_bias', initial_mean=0., initial_std=0.) 
+ +lstm1 = lstmemory_group(input=m1, param_attr=lstm_param, lstm_bias_attr=lstm_bias, mixed_bias_attr=False) +lstm2 = lstmemory_group(input=m2, param_attr=lstm_param, lstm_bias_attr=lstm_bias, mixed_bias_attr=False) + +softmax_param = ParamAttr(name='softmax_param') + +predict = fc_layer(input=[last_seq(input=lstm1), last_seq(input=lstm2)], + size=10, + param_attr=[softmax_param, softmax_param], + bias_attr=False, + act=SoftmaxActivation()) +outputs(classification_cost(input=predict, label=data_layer(name='label', size=10))) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py new file mode 100644 index 00000000000000..29749cbb666379 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py @@ -0,0 +1,14 @@ +from paddle.trainer_config_helpers import * + +settings( + learning_rate=1e-4, + batch_size=1000 +) + +data = data_layer(name='input', size=300) +lbl = data_layer(name='label', size=1) +wt = data_layer(name='weight', size=1) +fc = fc_layer(input=data, size=10, act=SoftmaxActivation()) + +outputs(classification_cost(input=fc, label=lbl, weight=wt), + regression_cost(input=fc, label=lbl, weight=wt)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py new file mode 100644 index 00000000000000..079e2cf4c43206 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py @@ -0,0 +1,30 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-5 +) + +data = data_layer(name='data', size=2304) + +conv = img_conv_layer(input=data, + filter_size = 3, + num_channels=1, + num_filters=16, + padding=1, + act=LinearActivation(), + bias_attr=True) + +maxout = maxout_layer(input=conv, + num_channels=16, + groups=2) + +pool = img_pool_layer(input=maxout, + num_channels=8, + pool_size=2, + stride=2, + pool_type=MaxPooling()) + +fc = fc_layer(input=pool, size=384, bias_attr=False) + +outputs(fc)
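The maxout_layer introduced in this patch keeps the spatial size of the conv output and divides the channel count by ``groups`` (test_maxout.py above: 16 conv filters, groups=2, 8 channels into the pool layer). A minimal NumPy sketch of that reduction, assuming consecutive channels form one group; the exact grouping convention is not spelled out in the patch, so treat this as an illustration only, not the Paddle kernel:

.. code-block:: python

    import numpy as np

    def maxout_forward(feature_map, groups):
        # feature_map: (channels, height, width); channels must divide evenly.
        channels, height, width = feature_map.shape
        assert groups > 1 and channels % groups == 0
        grouped = feature_map.reshape(channels // groups, groups, height, width)
        # One output channel per group: element-wise max across the group.
        return grouped.max(axis=1)

    x = np.random.randn(16, 48, 48).astype(np.float32)
    y = maxout_forward(x, groups=2)
    assert y.shape == (8, 48, 48)   # channels / groups, spatial size unchanged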
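classification_cost and regression_cost now accept an optional ``weight`` data layer, exercised by test_cost_layers_with_weight.py above. A hedged NumPy sketch of the intended effect, each sample's loss scaled by its weight before reduction; the exact reduction Paddle applies is not shown in this patch, so the mean below is an assumption for illustration:

.. code-block:: python

    import numpy as np

    def weighted_cross_entropy(probs, labels, weights):
        # probs: (N, num_classes) softmax output; labels, weights: (N,)
        per_sample = -np.log(probs[np.arange(len(labels)), labels])
        return np.mean(weights * per_sample)

    def weighted_square_error(pred, target, weights):
        # pred, target: (N, dim); weights: (N,)
        per_sample = np.sum((pred - target) ** 2, axis=1)
        return np.mean(weights * per_sample)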
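For the new nce_layer, ``neg_distribution`` must have ``num_classes`` entries and sum to one when given; otherwise negative labels are drawn uniformly. A small sketch of that sampling contract, using the distribution from the docstring example; Paddle's actual sampler lives in C++ and is not part of this diff:

.. code-block:: python

    import numpy as np

    def sample_negatives(num_classes, num_neg_samples=10, neg_distribution=None):
        if neg_distribution is not None:
            assert len(neg_distribution) == num_classes
            assert abs(sum(neg_distribution) - 1.0) < 1e-6
        # p=None falls back to a uniform distribution over num_classes.
        return np.random.choice(num_classes, size=num_neg_samples,
                                p=neg_distribution)

    print(sample_negatives(3, num_neg_samples=5, neg_distribution=[0.1, 0.3, 0.6]))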
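settings() now forwards the learning-rate schedule and parameter-averaging options directly into the trainer configuration. A usage sketch with the newly exposed arguments; all numeric values below are placeholders chosen for illustration, not recommendations:

.. code-block:: python

    from paddle.trainer_config_helpers import *

    settings(
        batch_size=128,
        learning_rate=1e-3,
        learning_rate_schedule='poly',
        learning_rate_decay_a=0.1,     # placeholder decay constants
        learning_rate_decay_b=0.75,
        average_window=0.5,            # placeholder averaging options
        max_average_window=10000,
        do_average_in_cpu=False,
    )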